Compare commits

..

1 Commits

Author SHA1 Message Date
Georgi Gerganov
492eaad571 ci : change python3 -> python
ggml-ci
2025-01-15 16:18:56 +02:00
123 changed files with 2086 additions and 10340 deletions

View File

@@ -56,7 +56,6 @@ jobs:
mkdir build
cd build
cmake .. \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL_USE_BF16=ON \
@@ -88,7 +87,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
- name: Upload artifacts
@@ -121,7 +119,6 @@ jobs:
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
cmake -B build \
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_METAL=OFF \
@@ -152,7 +149,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -162,8 +158,8 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
name: llama-bin-macos-x64.zip
ubuntu-cpu-cmake:
runs-on: ubuntu-22.04
ubuntu-latest-cmake:
runs-on: ubuntu-latest
steps:
- name: Clone
@@ -183,10 +179,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_CURL=ON \
-DGGML_RPC=ON
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -224,7 +217,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
cp LICENSE ./build/bin/
cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -242,7 +234,7 @@ jobs:
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug]
build_type: [Debug, Release]
steps:
- name: Clone
@@ -261,10 +253,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)
@@ -273,11 +262,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DLLAMA_FATAL_WARNINGS=ON \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DGGML_OPENMP=OFF
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
@@ -307,8 +292,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DGGML_RPC=ON
cmake -DGGML_RPC=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -338,8 +322,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. \
-DGGML_VULKAN=ON
cmake -DGGML_VULKAN=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
@@ -366,18 +349,13 @@ jobs:
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . \
-DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
-DGGML_HIP=ON
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
cmake --build build --config Release -j $(nproc)
- name: Build with legacy HIP support
id: cmake_build_legacy_hip
run: |
cmake -B build2 -S . \
-DCMAKE_C_COMPILER=hipcc \
-DCMAKE_CXX_COMPILER=hipcc \
-DGGML_HIP=ON
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-musa:
@@ -398,8 +376,7 @@ jobs:
- name: Build with native CMake MUSA support
id: cmake_build
run: |
cmake -B build -S . \
-DGGML_MUSA=ON
cmake -B build -S . -DGGML_MUSA=ON
cmake --build build --config Release -j $(nproc)
ubuntu-22-cmake-sycl:
@@ -440,10 +417,7 @@ jobs:
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake .. \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-sycl-fp16:
@@ -484,13 +458,42 @@ jobs:
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake .. \
-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DGGML_SYCL_F16=ON
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
cmake --build . --config Release -j $(nproc)
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
# how to debug it.
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
# would be great if we fix these
macOS-latest-cmake:
runs-on: macos-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
continue-on-error: true
run: |
brew update
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
macOS-latest-cmake-ios:
runs-on: macos-latest
@@ -793,7 +796,6 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts
@@ -821,13 +823,7 @@ jobs:
- name: Build with CMake
run: |
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CUDA_ARCHITECTURES=89-real \
-DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_NATIVE=OFF \
-DGGML_CUDA=ON
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
cmake --build build
windows-2019-cmake-cuda:
@@ -916,11 +912,7 @@ jobs:
shell: cmd
run: |
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -S . -B build -G "Ninja Multi-Config" \
-DLLAMA_BUILD_SERVER=ON \
-DGGML_NATIVE=OFF \
-DGGML_CUDA=ON \
-DGGML_RPC=ON
cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
cmake --build build --config Release -j %NINJA_JOBS% -t ggml
cmake --build build --config Release
@@ -1205,7 +1197,8 @@ jobs:
runs-on: ubuntu-latest
needs:
- ubuntu-cpu-cmake
- ubuntu-latest-cmake
- macOS-latest-cmake
- windows-latest-cmake
- windows-2019-cmake-cuda
- windows-latest-cmake-hip-release

View File

@@ -112,9 +112,9 @@ jobs:
-DGGML_OPENMP=OFF ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build_sanitizers
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
@@ -124,31 +124,12 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Build (sanitizers)
id: cmake_build
if: ${{ matrix.sanitizer == '' }}
run: |
cmake -B build \
-DGGML_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
- name: Tests
id: server_integration_tests
if: ${{ matrix.sanitizer == '' }}
run: |
cd examples/server/tests
./tests.sh
- name: Tests (sanitizers)
id: server_integration_tests_sanitizers
if: ${{ matrix.sanitizer != '' }}
run: |
cd examples/server/tests
LLAMA_SANITIZE=1 ./tests.sh
- name: Slow tests
id: server_integration_tests_slow
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}

1
.gitignore vendored
View File

@@ -18,7 +18,6 @@
*.metallib
*.o
*.so
*.swp
*.tmp
# IDE / OS

View File

@@ -16,7 +16,6 @@ endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON)
@@ -84,8 +83,11 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)
# override ggml options
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)
@@ -115,62 +117,16 @@ llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
message(STATUS "Using -fsanitize=thread")
add_compile_options(-fsanitize=thread)
link_libraries (-fsanitize=thread)
endif()
if (LLAMA_SANITIZE_ADDRESS)
message(STATUS "Using -fsanitize=address")
add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
link_libraries (-fsanitize=address)
endif()
if (LLAMA_SANITIZE_UNDEFINED)
message(STATUS "Using -fsanitize=undefined")
add_compile_options(-fsanitize=undefined)
link_libraries (-fsanitize=undefined)
endif()
endif()
#
# 3rd-party
# build the library
#
if (NOT TARGET ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
#
# build the library
#
add_subdirectory(src)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()
#
# install
#
@@ -244,3 +200,21 @@ configure_file(cmake/llama.pc.in
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION lib/pkgconfig)
#
# utils, programs, examples and tests
#
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)
endif()

View File

@@ -1361,9 +1361,7 @@ llama-server: \
examples/server/httplib.h \
examples/server/index.html.hpp \
examples/server/loading.html.hpp \
common/chat-template.hpp \
common/json.hpp \
common/minja.hpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

View File

@@ -16,9 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
## Hot topics
- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
@@ -206,7 +204,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
- [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
</details>

View File

@@ -299,7 +299,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -433,7 +433,7 @@ function gg_run_pythia_1_4b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -564,7 +564,7 @@ function gg_run_pythia_2_8b {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -699,7 +699,7 @@ function gg_run_embd_bge_small {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -747,7 +747,7 @@ function gg_run_rerank_tiny {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
@@ -814,8 +814,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
mkdir -p ${mnt_models}
ln -sfn ${mnt_models} ${SRC}/models-mnt
# Create a fresh python3 venv and enter it
if ! python3 -m venv "$MNT/venv"; then
# Create a fresh python venv and enter it
if ! python -m venv "$MNT/venv"; then
echo "Error: Failed to create Python virtual environment at $MNT/venv."
exit 1
fi

View File

@@ -44,7 +44,7 @@ if(MSVC)
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
execute_process(
COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)

View File

@@ -56,7 +56,6 @@ add_library(${TARGET} STATIC
arg.cpp
arg.h
base64.hpp
chat-template.hpp
common.cpp
common.h
console.cpp
@@ -65,7 +64,6 @@ add_library(${TARGET} STATIC
json.hpp
log.cpp
log.h
minja.hpp
ngram-cache.cpp
ngram-cache.h
sampling.cpp

View File

@@ -133,8 +133,7 @@ static void common_params_handle_model_default(
const std::string & model_url,
std::string & hf_repo,
std::string & hf_file,
const std::string & hf_token,
const std::string & model_default) {
const std::string & hf_token) {
if (!hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (hf_file.empty()) {
@@ -164,7 +163,7 @@ static void common_params_handle_model_default(
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
}
} else if (model.empty()) {
model = model_default;
model = DEFAULT_MODEL_PATH;
}
}
@@ -300,9 +299,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
}
// TODO: refactor model params in a common struct
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token);
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token);
if (params.escape) {
string_process_escapes(params.prompt);
@@ -325,14 +323,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
}
if (!params.chat_template.empty() && !common_chat_verify_template(params.chat_template, params.use_jinja)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s%s\n",
params.chat_template.c_str(),
params.use_jinja ? "" : "\nnote: llama.cpp was started without --jinja, we only support commonly used templates"
));
}
return true;
}
@@ -386,30 +376,6 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
return devices;
}
static void add_rpc_devices(std::string servers) {
auto rpc_servers = string_split<std::string>(servers, ',');
if (rpc_servers.empty()) {
throw std::invalid_argument("no RPC servers specified");
}
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (!rpc_reg) {
throw std::invalid_argument("failed to find RPC backend");
}
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
if (!ggml_backend_rpc_add_device_fn) {
throw std::invalid_argument("failed to find RPC device add function");
}
for (const auto & server : rpc_servers) {
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
if (dev) {
ggml_backend_device_register(dev);
} else {
throw std::invalid_argument("failed to register RPC device");
}
}
}
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
const common_params params_org = ctx_arg.params; // the example can modify the default params
@@ -1419,8 +1385,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--rpc"}, "SERVERS",
"comma separated list of RPC servers",
[](common_params & params, const std::string & value) {
add_rpc_devices(value);
GGML_UNUSED(params);
params.rpc_servers = value;
}
).set_env("LLAMA_ARG_RPC"));
}
@@ -1639,13 +1604,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.hf_repo = value;
}
).set_env("LLAMA_ARG_HF_REPO"));
add_opt(common_arg(
{"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
"Same as --hf-repo, but for the draft model (default: unused)",
[](common_params & params, const std::string & value) {
params.speculative.hf_repo = value;
}
).set_env("LLAMA_ARG_HFD_REPO"));
add_opt(common_arg(
{"-hff", "--hf-file"}, "FILE",
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
@@ -1955,44 +1913,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_examples({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--jinja"},
"use jinja template for chat (default: disabled)",
[](common_params & params) {
params.use_jinja = true;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
"set custom jinja chat template (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) {
if (!common_chat_verify_template(value)) {
throw std::runtime_error(string_format(
"error: the supplied chat template is not supported: %s\n"
"note: llama.cpp does not use jinja parser, we only support commonly used templates\n",
value.c_str()
));
}
params.chat_template = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
add_opt(common_arg(
{"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
string_format(
"set custom jinja chat template file (default: template taken from model's metadata)\n"
"if suffix/prefix are specified, template will be disabled\n"
"only commonly used templates are accepted (unless --jinja is set before this flag):\n"
"list of built-in templates:\n%s", list_builtin_chat_templates().c_str()
),
[](common_params & params, const std::string & value) {
std::ifstream file(value);
if (!file) {
throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str()));
}
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(params.chat_template));
}
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
@@ -2291,13 +2229,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.vocoder.model = value;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"--tts-use-guide-tokens"},
"Use guide tokens to improve TTS word recall",
[](common_params & params) {
params.vocoder.use_guide_tokens = true;
}
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
// model-specific
add_opt(common_arg(

View File

@@ -1,268 +0,0 @@
/*
Copyright 2024 Google LLC
Use of this source code is governed by an MIT-style
license that can be found in the LICENSE file or at
https://opensource.org/licenses/MIT.
*/
// SPDX-License-Identifier: MIT
#pragma once
#include "minja.hpp"
#include <json.hpp>
#include <string>
#include <vector>
using json = nlohmann::ordered_json;
namespace minja {
class chat_template {
public:
private:
bool supports_tools_ = true;
// Meta-Llama-3.1-8B-Instruct's template expects arguments to be an object.
// Most other templates (and OpenAI's API) expect the arguments object to be stringified.
bool requires_object_arguments_ = false;
bool requires_typed_content_ = false;
bool supports_system_role_ = true;
bool supports_parallel_tool_calls_ = false;
std::string source_;
std::string bos_token_;
std::string eos_token_;
std::shared_ptr<minja::TemplateNode> template_root_;
std::string try_raw_render(
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json()) const
{
try {
auto prompt = apply(messages, tools, add_generation_prompt, extra_context, /* adjust_inputs= */ false);
// fprintf(stderr, "Prompt: %s\n", prompt.c_str());
return prompt;
} catch (const std::exception & e) {
// fprintf(stderr, "Error: %s\n", e.what());
return "";
}
}
public:
chat_template(const std::string & source, const std::string & bos_token, const std::string & eos_token)
: source_(source), bos_token_(bos_token), eos_token_(eos_token)
{
template_root_ = minja::Parser::parse(source_, {
/* .trim_blocks = */ true,
/* .lstrip_blocks = */ true,
/* .keep_trailing_newline = */ false,
});
supports_tools_ = source.find("tools") != std::string::npos;
auto renders_string_arguments =
try_raw_render({
{
{"role", "user"},
{"content", "Hey"}
},
{
{"role", "assistant"},
{"tool_calls", json::array({
{
{"id", "call_1___"},
{"type", "function"},
{"function", {
{"arguments", "{\"code\": \"print('Hello, World!')\"}"},
{"name", "ipython"},
}},
},
})},
}
}, {}, false).find("{\"code\": \"print") != std::string::npos;
if (!renders_string_arguments) {
auto renders_object_arguments =
try_raw_render({
{
{"role", "user"},
{"content", "Hey"}
},
{
{"role", "assistant"},
{"tool_calls", json::array({
{
{"id", "call_1___"},
{"type", "function"},
{"function", {
{"arguments", {
{"code", "print('Hello, World!')"},
}},
{"name", "ipython"},
}},
},
})},
}
}, {}, false).find("{\"code\": \"print") != std::string::npos;
requires_object_arguments_ = renders_object_arguments;
}
supports_parallel_tool_calls_ = source.find("tool_call_id") != std::string::npos;
supports_system_role_ = try_raw_render({
{{"role", "system"}, {"content", "<System Needle>"}},
{{"role", "user"}, {"content", "Hey"}}
}, {}, false).find("<System Needle>") != std::string::npos;
requires_typed_content_ = try_raw_render({{{"role", "user"}, {"content", "Hey"}}}, {}, false).find("Hey") == std::string::npos
&& try_raw_render({{{"role", "user"}, {"content", {{{"type", "text"}, {"text", "Hey"}}}}}}, {}, false).find("Hey") != std::string::npos;
}
const std::string & source() const { return source_; }
const std::string & bos_token() const { return bos_token_; }
const std::string & eos_token() const { return eos_token_; }
bool supports_tools() const { return supports_tools_; }
bool supports_parallel_tool_calls() const { return supports_parallel_tool_calls_; }
std::string apply(
const nlohmann::ordered_json & messages,
const nlohmann::ordered_json & tools,
bool add_generation_prompt,
const nlohmann::ordered_json & extra_context = nlohmann::ordered_json(),
bool adjust_inputs = true) const
{
json actual_messages;
// First, "fix" messages so they have a chance to be rendered correctly by the template
if (adjust_inputs && (requires_object_arguments_ || !supports_system_role_ || !supports_tools_ || requires_typed_content_)) {
actual_messages = json::array();
auto add_message = [&](const json & msg) {
if (requires_typed_content_ && msg.contains("content") && !msg.at("content").is_null() && msg.at("content").is_string()) {
actual_messages.push_back({
{"role", msg.at("role")},
{"content", {{
{"type", "text"},
{"text", msg.at("content")},
}}},
});
} else {
actual_messages.push_back(msg);
}
};
std::string pending_system;
auto flush_sys = [&]() {
if (!pending_system.empty()) {
add_message({
{"role", "user"},
{"content", pending_system},
});
pending_system.clear();
}
};
for (const auto & message_ : messages) {
auto message = message_;
if (!message.contains("role") || !message.contains("content")) {
throw std::runtime_error("message must have 'role' and 'content' fields: " + message.dump());
}
std::string role = message.at("role");
if (message.contains("tool_calls")) {
if (requires_object_arguments_ || !supports_tools_) {
for (auto & tool_call : message.at("tool_calls")) {
if (tool_call["type"] == "function") {
auto & function = tool_call.at("function");
std::string arguments = function.at("arguments");
function["arguments"] = json::parse(arguments);
}
}
}
if (!supports_tools_) {
auto content = message.at("content");
auto tool_calls = json::array();
for (const auto & tool_call : message.at("tool_calls")) {
if (tool_call.at("type") != "function") {
continue;
}
const auto & function = tool_call.at("function");
auto tc = json {
{"name", function.at("name")},
{"arguments", function.at("arguments")},
};
if (tool_call.contains("id")) {
tc["id"] = tool_call["id"];
}
tool_calls.push_back(tc);
}
auto obj = json {
{"tool_calls", tool_calls},
};
if (!content.is_null() && content != "") {
obj["content"] = content;
}
message["content"] = obj.dump(2);
message.erase("tool_calls");
}
}
if (!supports_tools_ && role == "tool") {
message["role"] = "user";
auto obj = json {
{"tool_response", {
{"tool", message.at("name")},
{"content", message.at("content")},
}},
};
if (message.contains("tool_call_id")) {
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
}
message["content"] = obj.dump(2);
message.erase("name");
}
if (!message["content"].is_null() && !supports_system_role_) {
std::string content = message.at("content");
if (role == "system") {
if (!pending_system.empty()) pending_system += "\n";
pending_system += content;
continue;
} else {
if (role == "user") {
if (!pending_system.empty()) {
message["content"] = pending_system + (content.empty() ? "" : "\n" + content);
pending_system.clear();
}
} else {
flush_sys();
}
}
}
add_message(message);
}
flush_sys();
} else {
actual_messages = messages;
}
auto context = minja::Context::make(json({
{"messages", actual_messages},
{"add_generation_prompt", add_generation_prompt},
{"bos_token", bos_token_},
{"eos_token", eos_token_},
}));
if (!tools.is_null()) {
auto tools_val = minja::Value(tools);
context->set("tools", tools_val);
}
if (!extra_context.is_null()) {
for (auto & kv : extra_context.items()) {
minja::Value val(kv.value());
context->set(kv.key(), val);
}
}
return template_root_->render(context);
}
};
} // namespace minja

View File

@@ -12,7 +12,6 @@
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "chat-template.hpp"
#include <algorithm>
#include <cinttypes>
@@ -484,48 +483,6 @@ void string_replace_all(std::string & s, const std::string & search, const std::
s = std::move(builder);
}
std::string string_join(const std::vector<std::string> & values, const std::string & separator) {
std::ostringstream result;
for (size_t i = 0; i < values.size(); ++i) {
if (i > 0) {
result << separator;
}
result << values[i];
}
return result.str();
}
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> parts;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
parts.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}
parts.push_back(str.substr(start));
return parts;
}
std::string string_repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}
std::string result;
result.reserve(str.length() * n);
for (size_t i = 0; i < n; ++i) {
result += str;
}
return result;
}
std::string string_from(bool value) {
return value ? "true" : "false";
}
@@ -1086,6 +1043,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
@@ -1771,75 +1729,67 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
// Chat template utils
//
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
if (use_jinja) {
try {
auto chat_template = minja::chat_template(tmpl, "<s>", "</s>");
chat_template.apply({{
{"role", "user"},
{"content", "test"},
}}, json(), true);
return true;
} catch (const std::exception & e) {
LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
return false;
}
}
std::string common_get_builtin_chat_template(const struct llama_model * model) {
const char * ptr_tmpl = llama_model_chat_template(model);
return ptr_tmpl == nullptr ? "" : ptr_tmpl;
}
bool common_chat_verify_template(const std::string & tmpl) {
llama_chat_message chat[] = {{"user", "test"}};
const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
return res >= 0;
}
std::string common_chat_apply_template(
const common_chat_template & tmpl,
std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & msgs,
bool add_ass,
bool use_jinja) {
if (use_jinja) {
auto messages = json::array();
for (const auto & msg : msgs) {
messages.push_back({{"role", msg.role}, {"content", msg.content}});
}
return tmpl.apply(messages, /* tools= */ json(), add_ass);
}
bool add_ass) {
int alloc_size = 0;
bool fallback = false; // indicate if we must fallback to default chatml
std::vector<llama_chat_message> chat;
for (const auto & msg : msgs) {
chat.push_back({msg.role.c_str(), msg.content.c_str()});
alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
}
const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
std::vector<char> buf(alloc_size);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
// error: chat template is not supported
if (res < 0) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
if (ptr_tmpl != nullptr) {
// if the custom "tmpl" is not supported, we throw an error
// this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
throw std::runtime_error("this custom template is not supported");
}
// If the built-in template is not supported, we default to chatml
res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
fallback = true;
}
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
res = llama_chat_apply_template(
fallback ? "chatml" : ptr_tmpl,
chat.data(), chat.size(), add_ass, buf.data(), buf.size());
}
std::string formatted_chat(buf.data(), res);
return formatted_chat;
}
std::string common_chat_format_single(
const common_chat_template & tmpl,
std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass,
bool use_jinja) {
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
std::vector<common_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
@@ -1847,74 +1797,21 @@ std::string common_chat_format_single(
};
// format chat with new_msg
chat_new.push_back(new_msg);
auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
// get the diff part
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str();
}
std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl) {
std::vector<common_chat_msg> msgs = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "How are you?"},
};
return common_chat_apply_template(tmpl, msgs, true, use_jinja);
}
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
{
auto vocab = llama_model_get_vocab(model);
std::string default_template_src = chat_template_override;
std::string template_tool_use_src = chat_template_override;
bool has_explicit_template = !chat_template_override.empty();
if (chat_template_override.empty()) {
auto str = llama_model_chat_template(model, /* name */ nullptr);
if (str) {
default_template_src = str;
has_explicit_template = true;
}
str = llama_model_chat_template(model, /* name */ "tool_use");
if (str) {
template_tool_use_src = str;
has_explicit_template = true;
}
}
if (default_template_src.empty() || default_template_src == "chatml") {
if (!template_tool_use_src.empty()) {
default_template_src = template_tool_use_src;
} else {
default_template_src = R"(
{%- for message in messages -%}
{{- "<|im_start|>" + message.role + "\n" + message.content + "<|im_end|>\n" -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- "<|im_start|>assistant\n" -}}
{%- endif -%}
)";
}
}
const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
if (token == LLAMA_TOKEN_NULL) {
if (default_template_src.find(jinja_variable_name) != std::string::npos
|| template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
}
return std::string();
} else {
return common_token_to_piece(vocab, token, true);
}
};
auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
return {
has_explicit_template,
std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
template_tool_use_src.empty()
? nullptr
: std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos)
};
return common_chat_apply_template(model, tmpl, msgs, true);
}
//

View File

@@ -175,11 +175,7 @@ struct common_params_speculative {
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string model = ""; // draft model for speculative decoding // NOLINT
std::string model_url = ""; // model url to download // NOLINT
std::string model = ""; // draft model for speculative decoding // NOLINT
};
struct common_params_vocoder {
@@ -188,8 +184,6 @@ struct common_params_vocoder {
std::string model = ""; // model path // NOLINT
std::string model_url = ""; // model url to download // NOLINT
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
};
struct common_params {
@@ -252,6 +246,7 @@ struct common_params {
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -334,7 +329,6 @@ struct common_params {
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
std::string chat_template = ""; // NOLINT
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
std::vector<std::string> api_keys;
@@ -429,10 +423,6 @@ std::string string_format(const char * fmt, ...);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
std::string string_join(const std::vector<std::string> & values, const std::string & separator);
std::vector<std::string> string_split(const std::string & str, const std::string & delimiter);
std::string string_repeat(const std::string & str, size_t n);
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
template<class T>
@@ -517,14 +507,12 @@ struct llama_model * common_load_model_from_url(
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
struct llama_model * common_load_model_from_hf(
const std::string & repo,
const std::string & remote_path,
const std::string & local_path,
const std::string & hf_token,
const struct llama_model_params & params);
std::pair<std::string, std::string> common_get_hf_file(
const std::string & hf_repo_with_tag,
const std::string & hf_token);
@@ -608,43 +596,30 @@ struct common_chat_msg {
std::string content;
};
// Get the built-in chat template for the model. Return empty string if not present.
std::string common_get_builtin_chat_template(const struct llama_model * model);
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
namespace minja {
class chat_template;
}
typedef minja::chat_template common_chat_template;
struct common_chat_templates {
bool has_explicit_template; // Model had builtin template or template overridde was specified.
std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
std::unique_ptr<common_chat_template> template_tool_use;
};
bool common_chat_verify_template(const std::string & tmpl);
// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
std::string common_chat_apply_template(
const common_chat_template & tmpl,
std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & chat,
bool add_ass,
bool use_jinja);
bool add_ass);
// Format single message, while taking into account the position of that message in chat history
std::string common_chat_format_single(
const common_chat_template & tmpl,
std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
const std::vector<common_chat_msg> & past_msg,
const common_chat_msg & new_msg,
bool add_ass,
bool use_jinja);
bool add_ass);
// Returns an example of formatted chat
std::string common_chat_format_example(
const common_chat_template & tmpl, bool use_jinja);
common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);
std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl);
//
// KV cache utils

View File

@@ -1,6 +1,4 @@
#include "json-schema-to-grammar.h"
#include "common.h"
#include <algorithm>
#include <fstream>
#include <map>
@@ -13,6 +11,11 @@
using json = nlohmann::ordered_json;
template <typename Iterator>
static std::string join(Iterator begin, Iterator end, const std::string & separator);
static std::string repeat(const std::string & str, size_t n);
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
@@ -125,8 +128,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
if (sub_len > 0) {
auto from_sub = from.substr(i + 1);
auto to_sub = to.substr(i + 1);
auto sub_zeros = string_repeat("0", sub_len);
auto sub_nines = string_repeat("9", sub_len);
auto sub_zeros = repeat("0", sub_len);
auto sub_nines = repeat("9", sub_len);
auto to_reached = false;
out << "(";
@@ -185,8 +188,8 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream &
auto max_digits = max_s.length();
for (auto digits = min_digits; digits < max_digits; digits++) {
uniform_range(min_s, string_repeat("9", digits));
min_s = "1" + string_repeat("0", digits);
uniform_range(min_s, repeat("9", digits));
min_s = "1" + repeat("0", digits);
out << " | ";
}
uniform_range(min_s, max_s);
@@ -315,6 +318,49 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
template <typename Iterator>
std::string join(Iterator begin, Iterator end, const std::string & separator) {
std::ostringstream result;
if (begin != end) {
result << *begin;
for (Iterator it = begin + 1; it != end; ++it) {
result << separator << *it;
}
}
return result.str();
}
static std::vector<std::string> split(const std::string & str, const std::string & delimiter) {
std::vector<std::string> tokens;
size_t start = 0;
size_t end = str.find(delimiter);
while (end != std::string::npos) {
tokens.push_back(str.substr(start, end - start));
start = end + delimiter.length();
end = str.find(delimiter, start);
}
tokens.push_back(str.substr(start));
return tokens;
}
static std::string repeat(const std::string & str, size_t n) {
if (n == 0) {
return "";
}
std::string result;
result.reserve(str.length() * n);
for (size_t i = 0; i < n; ++i) {
result += str;
}
return result;
}
static std::string replacePattern(const std::string & input, const std::regex & regex, const std::function<std::string(const std::smatch &)> & replacement) {
std::smatch match;
std::string result;
@@ -343,7 +389,6 @@ static std::string format_literal(const std::string & literal) {
class SchemaConverter {
private:
friend std::string build_grammar(const std::function<void(const llama_grammar_builder &)> & cb);
std::function<json(const std::string &)> _fetch_json;
bool _dotall;
std::map<std::string, std::string> _rules;
@@ -373,7 +418,7 @@ private:
for (size_t i = 0; i < alt_schemas.size(); i++) {
rules.push_back(visit(alt_schemas[i], name + (name.empty() ? "alternative-" : "-") + std::to_string(i)));
}
return string_join(rules, " | ");
return join(rules.begin(), rules.end(), " | ");
}
std::string _visit_pattern(const std::string & pattern, const std::string & name) {
@@ -436,7 +481,7 @@ private:
for (const auto & item : ret) {
results.push_back(to_rule(item));
}
return std::make_pair(string_join(results, " "), false);
return std::make_pair(join(results.begin(), results.end(), " "), false);
};
while (i < length) {
@@ -494,7 +539,7 @@ private:
}
curly_brackets += '}';
i++;
auto nums = string_split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
auto nums = split(curly_brackets.substr(1, curly_brackets.length() - 2), ",");
int min_times = 0;
int max_times = std::numeric_limits<int>::max();
try {
@@ -809,7 +854,7 @@ public:
return;
}
std::string pointer = ref.substr(ref.find('#') + 1);
std::vector<std::string> tokens = string_split(pointer, "/");
std::vector<std::string> tokens = split(pointer, "/");
for (size_t i = 1; i < tokens.size(); ++i) {
std::string sel = tokens[i];
if (target.is_null() || !target.contains(sel)) {
@@ -860,7 +905,7 @@ public:
for (const auto & v : schema["enum"]) {
enum_values.push_back(_generate_constant_rule(v));
}
return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space");
return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
} else if ((schema_type.is_null() || schema_type == "object")
&& (schema.contains("properties") ||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
@@ -974,10 +1019,10 @@ public:
void check_errors() {
if (!_errors.empty()) {
throw std::runtime_error("JSON schema conversion failed:\n" + string_join(_errors, "\n"));
throw std::runtime_error("JSON schema conversion failed:\n" + join(_errors.begin(), _errors.end(), "\n"));
}
if (!_warnings.empty()) {
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", string_join(_warnings, "; ").c_str());
fprintf(stderr, "WARNING: JSON schema conversion was incomplete: %s\n", join(_warnings.begin(), _warnings.end(), "; ").c_str());
}
}
@@ -991,27 +1036,10 @@ public:
};
std::string json_schema_to_grammar(const json & schema) {
return build_grammar([&](const llama_grammar_builder & callbacks) {
auto copy = schema;
callbacks.resolve_refs(copy);
callbacks.add_schema("", copy);
});
}
std::string build_grammar(const std::function<void(const llama_grammar_builder &)> & cb) {
SchemaConverter converter([&](const std::string &) { return json(); }, /* dotall= */ false);
llama_grammar_builder builder {
/* .add_rule = */ [&](const std::string & name, const std::string & rule) {
return converter._add_rule(name, rule);
},
/* .add_schema = */ [&](const std::string & name, const nlohmann::ordered_json & schema) {
return converter.visit(schema, name == "root" ? "" : name);
},
/* .resolve_refs = */ [&](nlohmann::ordered_json & schema) {
converter.resolve_refs(schema, "");
}
};
cb(builder);
SchemaConverter converter([](const std::string &) { return json::object(); }, /* dotall= */ false);
auto copy = schema;
converter.resolve_refs(copy, "input");
converter.visit(copy, "");
converter.check_errors();
return converter.format_grammar();
}

View File

@@ -5,12 +5,4 @@
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::ordered_json & schema);
struct llama_grammar_builder {
std::function<std::string(const std::string &, const std::string &)> add_rule;
std::function<std::string(const std::string &, const nlohmann::ordered_json &)> add_schema;
std::function<void(nlohmann::ordered_json &)> resolve_refs;
};
std::string build_grammar(const std::function<void(const llama_grammar_builder &)> & cb);
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);

File diff suppressed because it is too large Load Diff

View File

@@ -696,9 +696,6 @@ class Model:
if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
# ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
res = "deepseek-v3"
if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
# ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
res = "deepseek-r1-qwen"
if res is None:
logger.warning("\n")
@@ -2885,66 +2882,6 @@ class InternLM2Model(Model):
return [(self.map_tensor_name(name), data_torch)]
@Model.register("InternLM3ForCausalLM")
class InternLM3Model(Model):
model_arch = gguf.MODEL_ARCH.LLAMA
def set_vocab(self):
tokens, scores, toktypes = self._create_vocab_sentencepiece()
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "add_prefix_space" in tokenizer_config_json:
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
if "added_tokens_decoder" in tokenizer_config_json:
for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
if token_data.get("special"):
token_id = int(token_id)
token = token_data["content"]
special_vocab._set_special_token(token, token_id)
# update eos token
if token == '<|im_end|>' and "eos" in special_vocab.special_token_ids:
special_vocab.special_token_ids["eos"] = token_id
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "head_dim" in hparams:
rope_dim = hparams["head_dim"]
else:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
self.gguf_writer.add_rope_dimension_count(rope_dim)
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "linear" or self.hparams["rope_scaling"].get("rope_type") == "linear":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)]
@Model.register("BertModel", "BertForMaskedLM", "CamembertModel")
class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT

View File

@@ -65,50 +65,49 @@ else:
# TODO: add models here, base models preferred
models = [
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", },
{"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
{"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en", },
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
{"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small", },
{"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
{"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
{"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
{'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", },
{'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", },
{"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", },
{"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", },
{"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", },
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", },
{"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
{"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
{"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
]

View File

@@ -133,7 +133,7 @@ The docker build option is currently limited to *intel GPU* targets.
### Build image
```sh
# Using FP16
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
```
*Notes*:

View File

@@ -286,7 +286,7 @@ You don't need to install Vulkan SDK. It will be installed inside the container.
```sh
# Build the image
docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .
docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
# Then, use it:
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33

View File

@@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
## Building Docker locally
```bash
docker build -t local/llama.cpp:full-cuda --target full -f .devops/cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda --target light -f .devops/cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile .
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
## Building Docker locally
```bash
docker build -t local/llama.cpp:full-musa --target full -f .devops/musa.Dockerfile .
docker build -t local/llama.cpp:light-musa --target light -f .devops/musa.Dockerfile .
docker build -t local/llama.cpp:server-musa --target server -f .devops/musa.Dockerfile .
docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
```
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.

View File

@@ -345,18 +345,8 @@ struct lora_merge_ctx {
gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur = inp_base;
for (size_t i = 0; i < adapters.size(); ++i) {
struct ggml_tensor * delta;
bool is_tok_embd = string_starts_with(name_base, "token_embd");
if (is_tok_embd) {
printf("%s : detected token embeddings tensor\n", __func__);
delta = ggml_mul_mat(ctx0,
ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32),
ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32));
} else {
delta = ggml_mul_mat(ctx0,
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32))),
ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
}
struct ggml_tensor * a_T = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_cast(ctx0, inp_a[i], GGML_TYPE_F32)));
struct ggml_tensor * delta = ggml_mul_mat(ctx0, a_T, ggml_cast(ctx0, inp_b[i], GGML_TYPE_F32));
// scale
const float alpha = adapters[i]->alpha;
const float rank = (float) inp_b[i]->ne[0];

View File

@@ -41,7 +41,7 @@ echo PASS
echo
# 2b. Test the sharded model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@@ -51,7 +51,7 @@ echo PASS
echo
# 3b. Test the merged model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS
echo
@@ -61,7 +61,7 @@ echo PASS
echo
# 4b. Test the sharded model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS
echo
@@ -71,7 +71,7 @@ echo
#echo
# 5b. Test the merged model is loading properly
#$MAIN -no-cnv --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS
#echo
@@ -81,7 +81,7 @@ echo PASS
echo
# 6b. Test the sharded model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS
echo

View File

@@ -683,7 +683,7 @@ struct cmd_params_instance {
bool cpu_strict;
int poll;
int n_gpu_layers;
std::string rpc_servers_str;
std::string rpc_servers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
@@ -696,37 +696,8 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
if (!rpc_servers_str.empty()) {
auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
// add RPC devices
if (!rpc_servers.empty()) {
ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
if (!rpc_reg) {
fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
exit(1);
}
typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
if (!ggml_backend_rpc_add_device_fn) {
fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
exit(1);
}
static std::vector<ggml_backend_dev_t> devices;
devices.clear();
for (const std::string & server : rpc_servers) {
ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
if (dev) {
devices.push_back(dev);
} else {
fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
exit(1);
}
}
devices.push_back(nullptr);
mparams.devices = devices.data();
}
if (!rpc_servers.empty()) {
mparams.rpc_servers = rpc_servers.c_str();
}
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
@@ -737,7 +708,7 @@ struct cmd_params_instance {
}
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
tensor_split == other.tensor_split;
}

View File

@@ -347,7 +347,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
jlong context_pointer,
jlong batch_pointer,
jstring jtext,
jboolean format_chat,
jint n_len
) {
@@ -357,8 +356,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
bool parse_special = (format_chat == JNI_TRUE);
const auto tokens_list = common_tokenize(context, text, true, parse_special);
const auto tokens_list = common_tokenize(context, text, 1);
auto n_ctx = llama_n_ctx(context);
auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -370,7 +368,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
}
for (auto id : tokens_list) {
LOGi("token: `%s`-> %d ", common_token_to_piece(context, id).c_str(), id);
LOGi("%s", common_token_to_piece(context, id).c_str());
}
common_batch_clear(*batch);

View File

@@ -65,7 +65,6 @@ class LLamaAndroid {
context: Long,
batch: Long,
text: String,
formatChat: Boolean,
nLen: Int
): Int
@@ -116,10 +115,10 @@ class LLamaAndroid {
}
}
fun send(message: String, formatChat: Boolean = false): Flow<String> = flow {
fun send(message: String): Flow<String> = flow {
when (val state = threadLocalState.get()) {
is State.Loaded -> {
val ncur = IntVar(completion_init(state.context, state.batch, message, formatChat, nlen))
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, state.sampler, nlen, ncur)
if (str == null) {

View File

@@ -1,46 +0,0 @@
## MiniCPM-o 2.6
Currently, this readme only supports minicpm-omni's image capabilities, and we will update the full-mode support as soon as possible.
### Prepare models and code
Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder.
Clone llama.cpp:
```bash
git clone git@github.com:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpm-omni
```
### Usage of MiniCPM-o 2.6
Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
# quantize int4 version
./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Build llama.cpp using `CMake`:
https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md
```bash
cmake -B build
cmake --build build --config Release
```
Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```

View File

@@ -718,9 +718,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 4) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
}
@@ -1056,11 +1053,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
n_head = hidden_size/d_head;
num_query = 64;
}
else if (ctx->minicpmv_version == 4) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
@@ -2049,7 +2041,6 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
images[images.size()-1].push_back(patch);
}
}
clip_image_u8_free(refine_image);
}
return images;
}
@@ -2088,13 +2079,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
clip_image_f32_free(res);
}
}
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t j = 0; j < imgs[i].size(); ++j) {
if (imgs[i][j] != nullptr) {
clip_image_u8_free(imgs[i][j]);
}
}
}
return true;
}
else if (ctx->has_qwen2vl_merger) {
@@ -2351,9 +2335,6 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
else if (ctx->minicpmv_version == 3) {
n_patches = 64;
}
else if (ctx->minicpmv_version == 4) {
n_patches = 64;
}
} else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
int patch_size = params.patch_size * 2;
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
@@ -2533,8 +2514,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
int bucket_coords_h[1024];
int bucket_coords_w[1024];
int bucket_coords_h[70];
int bucket_coords_w[70];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
@@ -2562,9 +2543,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
else if (ctx->minicpmv_version == 4) {
embed_dim = 3584;
}
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
@@ -2808,9 +2786,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
else if (ctx->minicpmv_version == 3) {
return 3584;
}
else if (ctx->minicpmv_version == 4) {
return 3584;
}
}
if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
return ctx->vision_model.mm_1_b->ne[0];

View File

@@ -216,7 +216,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
return true;
}
static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size) {
static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
int width = image->nx;
int height = image->ny;
int num_patches = (height / patch_size) * (width / patch_size);
@@ -277,7 +277,13 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
else {
encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
}
if (!encoded) {
@@ -307,9 +313,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding

View File

@@ -140,9 +140,6 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
else if (has_minicpmv_projector == 4) {
system_prompt = "<|im_start|>user\n";
}
LOG_INF("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
@@ -230,9 +227,6 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
else if (has_minicpmv_projector == 4) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -242,9 +236,6 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 4) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
@@ -317,6 +308,7 @@ int main(int argc, char ** argv) {
const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);// mistral llava-1.6
if (strstr(response.c_str(), "<user>")) break; // minicpm-v
fflush(stdout);

View File

@@ -501,7 +501,7 @@ default_image_mean = [0.48145466, 0.4578275, 0.40821073]
default_image_std = [0.26862954, 0.26130258, 0.27577711]
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)
ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3', default=2)
# with proper
args = ap.parse_args()
@@ -545,19 +545,12 @@ if args.use_f32:
minicpmv_version = args.minicpmv_version
emb_dim = 4096
block_count = 26
if minicpmv_version == 1:
emb_dim = 2304
block_count = 26
elif minicpmv_version == 2:
emb_dim = 4096
block_count = 27
elif minicpmv_version == 3:
emb_dim = 3584
block_count = 27
elif minicpmv_version == 4:
emb_dim = 3584
block_count = 27
default_vision_config = {
"hidden_size": 1152,
@@ -574,9 +567,6 @@ model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 4:
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
processor = None
# if model.attn_pool is not None:
@@ -597,7 +587,7 @@ elif args.minicpmv_projector is not None:
fname_middle = "mmproj-"
has_text_encoder = False
has_minicpmv_projector = True
minicpmv_version = 4
minicpmv_version = 3
elif args.vision_only:
fname_middle = "vision-"
has_text_encoder = False
@@ -635,6 +625,7 @@ if has_vision_encoder:
fout.add_uint32("clip.vision.projection_dim", 0)
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
block_count = 26
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
if processor is not None:

View File

@@ -8,7 +8,7 @@ ap.add_argument("-m", "--model", help="Path to MiniCPM-V model")
args = ap.parse_args()
# find the model part that includes the the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True, torch_dtype=torch.bfloat16)
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
checkpoint = model.state_dict()
# get a list of mm tensor names

View File

@@ -310,9 +310,9 @@ These options help improve the performance and memory usage of the LLaMA models.
### Batch Size
- `-ub N`, `--ubatch-size N`: Physical batch size. This is the maximum number of tokens that may be processed at a time. Increasing this value may improve performance during prompt processing, at the expense of higher memory usage. Default: `512`.
- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
- `-b N`, `--batch-size N`: Logical batch size. Increasing this value above the value of the physical batch size may improve prompt processing performance when using multiple GPUs with pipeline parallelism. Default: `2048`.
- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`.
### Prompt Caching

View File

@@ -4,7 +4,6 @@
#include "log.h"
#include "sampling.h"
#include "llama.h"
#include "chat-template.hpp"
#include <cstdio>
#include <cstring>
@@ -85,6 +84,14 @@ static void sigint_handler(int signo) {
}
#endif
static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
common_chat_msg new_msg{role, content};
auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
chat_msgs.push_back({role, content});
LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted;
}
int main(int argc, char ** argv) {
common_params params;
g_params = &params;
@@ -158,7 +165,6 @@ int main(int argc, char ** argv) {
}
const llama_vocab * vocab = llama_model_get_vocab(model);
auto chat_templates = common_chat_templates_from_model(model, params.chat_template);
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
@@ -201,7 +207,7 @@ int main(int argc, char ** argv) {
}
// auto enable conversation mode if chat template is available
const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
if (has_chat_template) {
LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
@@ -219,7 +225,7 @@ int main(int argc, char ** argv) {
// print chat template example in conversation mode
if (params.conversation_mode) {
if (params.enable_chat_template) {
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
} else {
LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
}
@@ -263,18 +269,10 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
common_chat_msg new_msg{role, content};
auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
chat_msgs.push_back({role, content});
LOG_DBG("formatted: '%s'\n", formatted.c_str());
return formatted;
};
{
auto prompt = (params.conversation_mode && params.enable_chat_template)
// format the system prompt in conversation mode (fallback to default if empty)
? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
// otherwise use the prompt as is
: params.prompt;
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@@ -781,7 +779,7 @@ int main(int argc, char ** argv) {
}
if (params.enable_chat_template) {
chat_add_and_format("assistant", assistant_ss.str());
chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
}
is_interacting = true;
LOG("\n");
@@ -846,7 +844,7 @@ int main(int argc, char ** argv) {
bool format_chat = params.conversation_mode && params.enable_chat_template;
std::string user_inp = format_chat
? chat_add_and_format("user", std::move(buffer))
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
: std::move(buffer);
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);

View File

@@ -47,7 +47,7 @@ echo PASS
echo
# 3a. Test the requanted model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@@ -57,7 +57,7 @@ echo PASS
echo
# 4b. Test the requanted model is loading properly
$MAIN -no-cnv --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32
echo PASS
echo

View File

@@ -1,5 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@@ -3,10 +3,11 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.
```bash
llama-run granite3-moe
llama-run granite-code
```
```bash
llama-run -h
Description:
Runs a llm
@@ -16,7 +17,7 @@ Usage:
Options:
-c, --context-size <value>
Context size (default: 2048)
-n, -ngl, --ngl <value>
-n, --ngl <value>
Number of GPU layers (default: 0)
--temp <value>
Temperature (default: 0.8)

View File

@@ -1,26 +0,0 @@
Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

File diff suppressed because it is too large Load Diff

View File

@@ -1,128 +0,0 @@
/* linenoise.h -- VERSION 1.0
*
* Guerrilla line editing library against the idea that a line editing lib
* needs to be 20,000 lines of C++ code.
*
* See linenoise.cpp for more information.
*
* ------------------------------------------------------------------------
*
* Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
* Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
* Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __LINENOISE_H
#define __LINENOISE_H
#ifdef __cplusplus
extern "C" {
#endif
#include <stddef.h> /* For size_t. */
#include <stdlib.h>
extern const char *linenoiseEditMore;
/* The linenoiseState structure represents the state during line editing.
* We pass this state to functions implementing specific editing
* functionalities. */
struct linenoiseState {
int in_completion; /* The user pressed TAB and we are now in completion
* mode, so input is handled by completeLine(). */
size_t completion_idx; /* Index of next completion to propose. */
int ifd; /* Terminal stdin file descriptor. */
int ofd; /* Terminal stdout file descriptor. */
char *buf; /* Edited line buffer. */
size_t buflen; /* Edited line buffer size. */
const char *prompt; /* Prompt to display. */
size_t plen; /* Prompt length. */
size_t pos; /* Current cursor position. */
size_t oldpos; /* Previous refresh cursor position. */
size_t len; /* Current edited line length. */
size_t cols; /* Number of columns in terminal. */
size_t oldrows; /* Rows used by last refrehsed line (multiline mode) */
int history_index; /* The history index we are currently editing. */
};
struct linenoiseCompletions {
size_t len = 0;
char ** cvec = nullptr;
bool to_free = true;
~linenoiseCompletions() {
if (!to_free) {
return;
}
for (size_t i = 0; i < len; ++i) {
free(cvec[i]);
}
free(cvec);
}
};
/* Non blocking API. */
int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
const char *linenoiseEditFeed(struct linenoiseState *l);
void linenoiseEditStop(struct linenoiseState *l);
void linenoiseHide(struct linenoiseState *l);
void linenoiseShow(struct linenoiseState *l);
/* Blocking API. */
const char *linenoise(const char *prompt);
void linenoiseFree(void *ptr);
/* Completion API. */
typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
typedef void(linenoiseFreeHintsCallback)(const char *);
void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
void linenoiseSetHintsCallback(linenoiseHintsCallback *);
void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
void linenoiseAddCompletion(linenoiseCompletions *, const char *);
/* History API. */
int linenoiseHistoryAdd(const char *line);
int linenoiseHistorySetMaxLen(int len);
int linenoiseHistorySave(const char *filename);
int linenoiseHistoryLoad(const char *filename);
/* Other utilities. */
void linenoiseClearScreen(void);
void linenoiseSetMultiLine(int ml);
void linenoisePrintKeyCodes(void);
void linenoiseMaskModeEnable(void);
void linenoiseMaskModeDisable(void);
#ifdef __cplusplus
}
#endif
#endif /* __LINENOISE_H */

View File

@@ -19,16 +19,13 @@
#include <cstring>
#include <filesystem>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include <vector>
#include "common.h"
#include "json.hpp"
#include "linenoise.cpp/linenoise.h"
#include "llama-cpp.h"
#include "chat-template.hpp"
#if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
[[noreturn]] static void sigint_handler(int) {
@@ -106,7 +103,6 @@ class Opt {
llama_model_params model_params;
std::string model_;
std::string user;
bool use_jinja = false;
int context_size = -1, ngl = -1;
float temperature = -1;
bool verbose = false;
@@ -147,8 +143,7 @@ class Opt {
if (handle_option_with_value(argc, argv, i, context_size) == 1) {
return 1;
}
} else if (options_parsing &&
(strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) {
} else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
if (handle_option_with_value(argc, argv, i, ngl) == 1) {
return 1;
}
@@ -159,8 +154,6 @@ class Opt {
} else if (options_parsing &&
(parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
verbose = true;
} else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
use_jinja = true;
} else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
help = true;
return 0;
@@ -195,7 +188,7 @@ class Opt {
"Options:\n"
" -c, --context-size <value>\n"
" Context size (default: %d)\n"
" -n, -ngl, --ngl <value>\n"
" -n, --ngl <value>\n"
" Number of GPU layers (default: %d)\n"
" --temp <value>\n"
" Temperature (default: %.1f)\n"
@@ -543,7 +536,7 @@ class LlamaData {
llama_sampler_ptr sampler;
llama_context_ptr context;
std::vector<llama_chat_message> messages;
std::list<std::string> msg_strs;
std::vector<std::string> msg_strs;
std::vector<char> fmtted;
int init(Opt & opt) {
@@ -635,20 +628,20 @@ class LlamaData {
return path.substr(pos + 1);
}
int rm_until_substring(std::string & model_, const std::string & substring) {
const std::string::size_type pos = model_.find(substring);
int remove_proto(std::string & model_) {
const std::string::size_type pos = model_.find("://");
if (pos == std::string::npos) {
return 1;
}
model_ = model_.substr(pos + substring.size()); // Skip past the substring
model_ = model_.substr(pos + 3); // Skip past "://"
return 0;
}
int resolve_model(std::string & model_) {
int ret = 0;
if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) {
rm_until_substring(model_, "://");
remove_proto(model_);
return ret;
}
@@ -657,16 +650,13 @@ class LlamaData {
const std::vector<std::string> headers = { "--header",
"Accept: application/vnd.docker.distribution.manifest.v2+json" };
if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
rm_until_substring(model_, "://");
ret = huggingface_dl(model_, headers, bn);
} else if (string_starts_with(model_, "hf.co/")) {
rm_until_substring(model_, "hf.co/");
remove_proto(model_);
ret = huggingface_dl(model_, headers, bn);
} else if (string_starts_with(model_, "ollama://")) {
rm_until_substring(model_, "://");
remove_proto(model_);
ret = ollama_dl(model_, headers, bn);
} else if (string_starts_with(model_, "https://")) {
ret = download(model_, headers, bn, true);
download(model_, headers, bn, true);
} else {
ret = ollama_dl(model_, headers, bn);
}
@@ -721,31 +711,13 @@ static void add_message(const char * role, const std::string & text, LlamaData &
}
// Function to apply the chat template and resize `formatted` if needed
static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
if (use_jinja) {
json messages = json::array();
for (const auto & msg : llama_data.messages) {
messages.push_back({
{"role", msg.role},
{"content", msg.content},
});
}
try {
auto result = tmpl.apply(messages, /* tools= */ json(), append);
llama_data.fmtted.resize(result.size() + 1);
memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
return result.size();
} catch (const std::exception & e) {
printe("failed to render the chat template: %s\n", e.what());
return -1;
}
}
static int apply_chat_template(LlamaData & llama_data, const bool append) {
int result = llama_chat_apply_template(
tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append,
llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(), llama_data.messages.size(), append,
append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
if (append && result > static_cast<int>(llama_data.fmtted.size())) {
llama_data.fmtted.resize(result);
result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(),
result = llama_chat_apply_template(llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(),
llama_data.messages.size(), append, llama_data.fmtted.data(),
llama_data.fmtted.size());
}
@@ -755,12 +727,10 @@ static int apply_chat_template(const common_chat_template & tmpl, LlamaData & ll
// Function to tokenize the prompt
static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
std::vector<llama_token> & prompt_tokens) {
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
prompt_tokens.resize(n_prompt_tokens);
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first,
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
true) < 0) {
printe("failed to tokenize the prompt\n");
return -1;
@@ -806,7 +776,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
const llama_vocab * vocab = llama_model_get_vocab(llama_data.model.get());
std::vector<llama_token> tokens;
if (tokenize_prompt(vocab, prompt, tokens, llama_data) < 0) {
if (tokenize_prompt(vocab, prompt, tokens) < 0) {
return 1;
}
@@ -837,44 +807,24 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
batch = llama_batch_get_one(&new_token_id, 1);
}
printf("\033[0m");
return 0;
}
static int read_user_input(std::string & user_input) {
static const char * prompt_prefix = "> ";
#ifdef WIN32
printf(
"\r%*s"
"\r\033[0m%s",
get_terminal_width(), " ", prompt_prefix);
std::getline(std::cin, user_input);
static int read_user_input(std::string & user) {
std::getline(std::cin, user);
if (std::cin.eof()) {
printf("\n");
return 1;
}
#else
std::unique_ptr<char, decltype(&std::free)> line(const_cast<char *>(linenoise(prompt_prefix)), free);
if (!line) {
if (user == "/bye") {
return 1;
}
user_input = line.get();
#endif
if (user_input == "/bye") {
return 1;
}
if (user_input.empty()) {
if (user.empty()) {
return 2;
}
#ifndef WIN32
linenoiseHistoryAdd(line.get());
#endif
return 0; // Should have data in happy path
}
@@ -897,8 +847,8 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
}
// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja);
static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
const int new_len = apply_chat_template(llama_data, append);
if (new_len < 0) {
printe("failed to apply the chat template\n");
return -1;
@@ -915,6 +865,10 @@ static int handle_user_input(std::string & user_input, const std::string & user)
return 0; // No need for interactive input
}
printf(
"\r%*s"
"\r\033[32m> \033[0m",
get_terminal_width(), " ");
return read_user_input(user_input); // Returns true if input ends the loop
}
@@ -957,11 +911,9 @@ static int get_user_input(std::string & user_input, const std::string & user) {
}
// Main chat loop function
static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) {
static int chat_loop(LlamaData & llama_data, const std::string & user) {
int prev_len = 0;
llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), "");
GGML_ASSERT(chat_templates.template_default);
static const bool stdout_a_terminal = is_stdout_a_terminal();
while (true) {
// Get user input
@@ -972,7 +924,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
add_message("user", user.empty() ? user_input : user, llama_data);
int new_len;
if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) {
if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
return 1;
}
@@ -987,7 +939,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
}
add_message("assistant", response, llama_data);
if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) {
if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
return 1;
}
}
@@ -1047,7 +999,7 @@ int main(int argc, const char ** argv) {
return 1;
}
if (chat_loop(llama_data, opt.user, opt.use_jinja)) {
if (chat_loop(llama_data, opt.user)) {
return 1;
}

View File

@@ -126,7 +126,7 @@ The project is under active development, and we are [looking for feedback and co
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
| `--jinja` | Enable experimental Jinja templating engine (needed for tool use) |
**Example-specific params**

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -19,7 +19,6 @@
#include "loading.html.hpp"
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cinttypes>
@@ -33,8 +32,6 @@
using json = nlohmann::ordered_json;
constexpr int HTTP_POLLING_SECONDS = 1;
enum stop_type {
STOP_TYPE_NONE,
STOP_TYPE_EOS,
@@ -267,11 +264,6 @@ struct server_task {
params.speculative.n_min = std::max(params.speculative.n_min, 2);
params.speculative.n_max = std::max(params.speculative.n_max, 0);
// Use OpenAI API logprobs only if n_probs wasn't provided
if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
}
if (data.contains("lora")) {
if (data.at("lora").is_array()) {
params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
@@ -1433,10 +1425,6 @@ struct server_queue {
} else {
queue_tasks.push_back(std::move(task));
}
// if this is cancel task make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
condition_tasks.notify_one();
return task.id;
}
@@ -1454,10 +1442,6 @@ struct server_queue {
} else {
queue_tasks.push_back(std::move(task));
}
// if this is cancel task make sure to clean up pending tasks
if (task.type == SERVER_TASK_TYPE_CANCEL) {
cleanup_pending_task(task.id_target);
}
}
condition_tasks.notify_one();
return 0;
@@ -1552,20 +1536,6 @@ struct server_queue {
}
}
}
private:
void cleanup_pending_task(int id_task) {
// no need lock because this is called exclusively by post()
auto rm_func = [id_task](const server_task & task) {
return task.id_target == id_task;
};
queue_tasks.erase(
std::remove_if(queue_tasks.begin(), queue_tasks.end(), rm_func),
queue_tasks.end());
queue_tasks_deferred.erase(
std::remove_if(queue_tasks_deferred.begin(), queue_tasks_deferred.end(), rm_func),
queue_tasks_deferred.end());
}
};
struct server_response {
@@ -1601,12 +1571,6 @@ struct server_response {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(id_task);
// make sure to clean up all pending results
queue_results.erase(
std::remove_if(queue_results.begin(), queue_results.end(), [id_task](const server_task_result_ptr & res) {
return res->id == id_task;
}),
queue_results.end());
}
void remove_waiting_task_ids(const std::unordered_set<int> & id_tasks) {
@@ -1626,24 +1590,6 @@ struct server_response {
return !queue_results.empty();
});
for (size_t i = 0; i < queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
server_task_result_ptr res = std::move(queue_results[i]);
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// same as recv(), but have timeout in seconds
// if timeout is reached, nullptr is returned
server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
while (true) {
std::unique_lock<std::mutex> lock(mutex_results);
for (int i = 0; i < (int) queue_results.size(); i++) {
if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
server_task_result_ptr res = std::move(queue_results[i]);
@@ -1651,11 +1597,6 @@ struct server_response {
return res;
}
}
std::cv_status cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout));
if (cr_res == std::cv_status::timeout) {
return nullptr;
}
}
// should never reach here
@@ -1720,8 +1661,6 @@ struct server_context {
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
common_chat_templates chat_templates;
~server_context() {
// Clear any sampling context
for (server_slot & slot : slots) {
@@ -1762,16 +1701,13 @@ struct server_context {
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
if (!params_base.speculative.model.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
auto params_dft = params_base;
params_dft.devices = params_base.speculative.devices;
params_dft.hf_file = params_base.speculative.hf_file;
params_dft.hf_repo = params_base.speculative.hf_repo;
params_dft.model = params_base.speculative.model;
params_dft.model_url = params_base.speculative.model_url;
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
@@ -1799,44 +1735,16 @@ struct server_context {
// force F16 KV cache for the draft model for extra performance
cparams_dft.type_k = GGML_TYPE_F16;
cparams_dft.type_v = GGML_TYPE_F16;
// the context is not needed - we will create one for each slot
llama_init_dft.context.reset();
}
chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
GGML_ASSERT(chat_templates.template_default.get() != nullptr);
return true;
}
bool validate_builtin_chat_template(bool use_jinja) const {
bool validate_builtin_chat_template() const {
llama_chat_message chat[] = {{"user", "test"}};
if (use_jinja) {
auto templates = common_chat_templates_from_model(model, "");
GGML_ASSERT(templates.template_default);
try {
templates.template_default->apply({{
{"role", "user"},
{"content", "test"},
}}, json(), true);
if (templates.template_tool_use) {
templates.template_tool_use->apply({{
{"role", "user"},
{"content", "test"},
}}, json(), true);
}
return true;
} catch (const std::exception & e) {
SRV_ERR("failed to apply template: %s\n", e.what());
return false;
}
} else {
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
}
const char * tmpl = llama_model_chat_template(model);
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
}
void init() {
@@ -2403,8 +2311,8 @@ struct server_context {
server_task task(SERVER_TASK_TYPE_CANCEL);
task.id_target = id_task;
queue_results.remove_waiting_task_id(id_task);
cancel_tasks.push_back(task);
queue_results.remove_waiting_task_id(id_task);
}
// push to beginning of the queue, so it has highest priority
queue_tasks.post(cancel_tasks, true);
@@ -2414,21 +2322,10 @@ struct server_context {
void receive_multi_results(
const std::unordered_set<int> & id_tasks,
const std::function<void(std::vector<server_task_result_ptr>&)> & result_handler,
const std::function<void(json)> & error_handler,
const std::function<bool()> & is_connection_closed) {
const std::function<void(json)> & error_handler) {
std::vector<server_task_result_ptr> results(id_tasks.size());
for (int i = 0; i < (int)id_tasks.size(); i++) {
server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
if (is_connection_closed()) {
cancel_tasks(id_tasks);
return;
}
if (result == nullptr) {
i--; // retry
continue;
}
for (size_t i = 0; i < id_tasks.size(); i++) {
server_task_result_ptr result = queue_results.recv(id_tasks);
if (result->is_error()) {
error_handler(result->to_json());
@@ -2452,20 +2349,10 @@ struct server_context {
void receive_cmpl_results_stream(
const std::unordered_set<int> & id_tasks,
const std::function<bool(server_task_result_ptr&)> & result_handler,
const std::function<void(json)> & error_handler,
const std::function<bool()> & is_connection_closed) {
const std::function<void(json)> & error_handler) {
size_t n_finished = 0;
while (true) {
server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
if (is_connection_closed()) {
cancel_tasks(id_tasks);
return;
}
if (result == nullptr) {
continue; // retry
}
server_task_result_ptr result = queue_results.recv(id_tasks);
if (result->is_error()) {
error_handler(result->to_json());
@@ -3721,12 +3608,9 @@ int main(int argc, char ** argv) {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model },
{ "chat_template", ctx_server.chat_templates.template_default->source() },
{ "chat_template", common_get_builtin_chat_template(ctx_server.model) },
{ "build_info", build_info },
};
if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
}
res_ok(res, data);
};
@@ -3749,7 +3633,6 @@ int main(int argc, char ** argv) {
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
server_task_type type,
json & data,
std::function<bool()> is_connection_closed,
httplib::Response & res,
oaicompat_type oaicompat) {
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
@@ -3811,7 +3694,7 @@ int main(int argc, char ** argv) {
}
}, [&](const json & error_data) {
res_error(res, error_data);
}, is_connection_closed);
});
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
} else {
@@ -3821,7 +3704,6 @@ int main(int argc, char ** argv) {
if (res_json.is_array()) {
for (const auto & res : res_json) {
if (!server_sent_event(sink, "data", res)) {
// sending failed (HTTP connection closed), cancel the generation
return false;
}
}
@@ -3831,9 +3713,6 @@ int main(int argc, char ** argv) {
}
}, [&](const json & error_data) {
server_sent_event(sink, "error", error_data);
}, [&sink]() {
// note: do not use req.is_connection_closed here because req is already destroyed
return !sink.is_writable();
});
if (oaicompat != OAICOMPAT_TYPE_NONE) {
static const std::string ev_done = "data: [DONE]\n\n";
@@ -3856,7 +3735,6 @@ int main(int argc, char ** argv) {
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_NONE);
};
@@ -3866,7 +3744,6 @@ int main(int argc, char ** argv) {
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_COMPLETION);
};
@@ -3943,7 +3820,6 @@ int main(int argc, char ** argv) {
return handle_completions_impl(
SERVER_TASK_TYPE_INFILL,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
};
@@ -3954,14 +3830,10 @@ int main(int argc, char ** argv) {
return;
}
auto body = json::parse(req.body);
const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_CHAT);
};
@@ -4108,7 +3980,7 @@ int main(int argc, char ** argv) {
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
}, req.is_connection_closed);
});
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
}
@@ -4198,7 +4070,7 @@ int main(int argc, char ** argv) {
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
}, req.is_connection_closed);
});
}
if (error) {
@@ -4367,7 +4239,7 @@ int main(int argc, char ** argv) {
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) {
if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) {
if (!ctx_server.validate_builtin_chat_template()) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
params.chat_template = "chatml";
}
@@ -4375,8 +4247,8 @@ int main(int argc, char ** argv) {
// print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
ctx_server.chat_templates.template_default->source().c_str(),
common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
common_chat_format_example(ctx_server.model, params.chat_template).c_str());
ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1));

View File

@@ -4,26 +4,22 @@ from utils import *
server = ServerPreset.tinyllama2()
@pytest.fixture(autouse=True)
@pytest.fixture(scope="module", autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
@pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
[
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", False, None),
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", True, None),
(None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
]
)
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
global server
server.jinja = jinja
server.chat_template = chat_template
server.start()
res = server.make_request("POST", "/chat/completions", data={
"model": model,

View File

@@ -1,5 +1,4 @@
import pytest
import requests
import time
from openai import OpenAI
from utils import *
@@ -406,23 +405,3 @@ def test_n_probs_post_sampling():
assert "bytes" in prob and type(prob["bytes"]) == list
# because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs
assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
def test_cancel_request():
global server
server.n_ctx = 4096
server.n_predict = -1
server.n_slots = 1
server.server_slots = True
server.start()
# send a request that will take a long time, but cancel it before it finishes
try:
server.make_request("POST", "/completion", data={
"prompt": "I believe the meaning of life is",
}, timeout=0.1)
except requests.exceptions.ReadTimeout:
pass # expected
# make sure the slot is free
time.sleep(1) # wait for HTTP_POLLING_SECONDS
res = server.make_request("GET", "/slots")
assert res.body[0]["is_processing"] == False

View File

@@ -26,9 +26,6 @@ from re import RegexFlag
import wget
DEFAULT_HTTP_TIMEOUT = 10 if "LLAMA_SANITIZE" not in os.environ else 30
class ServerResponse:
headers: dict
status_code: int
@@ -72,14 +69,13 @@ class ServerProcess:
pooling: str | None = None
draft: int | None = None
api_key: str | None = None
response_format: str | None = None
lora_files: List[str] | None = None
disable_ctx_shift: int | None = False
draft_min: int | None = None
draft_max: int | None = None
no_webui: bool | None = None
jinja: bool | None = None
chat_template: str | None = None
chat_template_file: str | None = None
# session variables
process: subprocess.Popen | None = None
@@ -92,7 +88,7 @@ class ServerProcess:
if "PORT" in os.environ:
self.server_port = int(os.environ["PORT"])
def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
def start(self, timeout_seconds: int = 10) -> None:
if "LLAMA_SERVER_BIN_PATH" in os.environ:
server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
elif os.name == "nt":
@@ -170,12 +166,8 @@ class ServerProcess:
server_args.extend(["--draft-min", self.draft_min])
if self.no_webui:
server_args.append("--no-webui")
if self.jinja:
server_args.append("--jinja")
if self.chat_template:
server_args.extend(["--chat-template", self.chat_template])
if self.chat_template_file:
server_args.extend(["--chat-template-file", self.chat_template_file])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
@@ -227,18 +219,17 @@ class ServerProcess:
path: str,
data: dict | Any | None = None,
headers: dict | None = None,
timeout: float | None = None,
) -> ServerResponse:
url = f"http://{self.server_host}:{self.server_port}{path}"
parse_body = False
if method == "GET":
response = requests.get(url, headers=headers, timeout=timeout)
response = requests.get(url, headers=headers)
parse_body = True
elif method == "POST":
response = requests.post(url, headers=headers, json=data, timeout=timeout)
response = requests.post(url, headers=headers, json=data)
parse_body = True
elif method == "OPTIONS":
response = requests.options(url, headers=headers, timeout=timeout)
response = requests.options(url, headers=headers)
else:
raise ValueError(f"Unimplemented method: {method}")
result = ServerResponse()

View File

@@ -16,8 +16,6 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "minja.hpp"
#include "chat-template.hpp"
#include <random>
#include <sstream>
@@ -351,7 +349,7 @@ static llama_tokens format_infill(
}
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
std::vector<common_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i) {
@@ -379,7 +377,7 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
chat.push_back({role, content});
}
const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
return formatted_chat;
@@ -578,23 +576,14 @@ static json oaicompat_completion_params_parse(const json & body) {
return llama_params;
}
static json oaicompat_completion_params_parse(
const json & body, /* openai api json semantics */
const common_chat_template & tmpl,
bool use_jinja)
{
static json oaicompat_chat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
const std::string & chat_template) {
json llama_params;
auto tools = json_value(body, "tools", json());
auto has_tools = tools.is_array() && !tools.empty();
if (has_tools) {
if (use_jinja) {
LOG_WRN("tools param is not fully supported yet\n");
} else {
throw std::runtime_error("tools param requires --jinja flag");
}
}
// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {
@@ -617,13 +606,6 @@ static json oaicompat_completion_params_parse(
}
}
// Apply chat template to the list of messages
if (use_jinja) {
llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
} else {
llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
}
// Handle "n" field
int n_choices = json_value(body, "n", 1);
if (n_choices != 1) {
@@ -639,7 +621,7 @@ static json oaicompat_completion_params_parse(
}
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "tool_choice" };
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
for (const auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);

View File

@@ -141,7 +141,6 @@
:msg="pendingMsg"
:key="pendingMsg.id"
:is-generating="isGenerating"
:show-thought-in-progress="config.showThoughtInProgress"
:edit-user-msg-and-regenerate="() => {}"
:regenerate-msg="() => {}"></message-bubble>
</div>
@@ -203,20 +202,6 @@
</template>
</div>
</details>
<!-- Section: Reasoning models -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Reasoning models</summary>
<div class="collapse-content">
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.showThoughtInProgress" />
<span class="ml-4">Expand though process by default for generating message</span>
</div>
<div class="flex flex-row items-center mb-2">
<input type="checkbox" class="checkbox" v-model="config.excludeThoughtOnReq" />
<span class="ml-4">Exclude thought process when sending request to API (Recommended for DeepSeek-R1)</span>
</div>
</div>
</details>
<!-- Section: Advanced config -->
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
<summary class="collapse-title font-bold">Advanced config</summary>
@@ -276,17 +261,7 @@
<span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
<!-- render message as markdown -->
<div v-else dir="auto">
<details v-if="msg.role === 'assistant' && splitMsgContent.cot" class="collapse bg-base-200 collapse-arrow mb-4" :open="splitMsgContent.isThinking && showThoughtInProgress">
<summary class="collapse-title">
<span v-if="splitMsgContent.isThinking">
<span v-if="isGenerating" class="loading loading-spinner loading-md mr-2" style="vertical-align: middle;"></span>
<b>Thinking</b>
</span>
<b v-else>Thought Process</b>
</summary>
<vue-markdown :source="splitMsgContent.cot" dir="auto" class="collapse-content"></vue-markdown>
</details>
<vue-markdown :source="splitMsgContent.content"></vue-markdown>
<vue-markdown :source="msg.content"></vue-markdown>
</div>
<!-- render timings if enabled -->
<div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">

View File

@@ -17,11 +17,6 @@ import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator';
const isDev = import.meta.env.MODE === 'development';
// types
/** @typedef {{ id: number, role: 'user' | 'assistant', content: string, timings: any }} Message */
/** @typedef {{ role: 'user' | 'assistant', content: string }} APIMessage */
/** @typedef {{ id: string, lastModified: number, messages: Array<Message> }} Conversation */
// utility functions
const isString = (x) => !!x.toLowerCase;
const isBoolean = (x) => x === true || x === false;
@@ -55,8 +50,6 @@ const CONFIG_DEFAULT = {
apiKey: '',
systemMessage: 'You are a helpful assistant.',
showTokensPerSecond: false,
showThoughtInProgress: false,
excludeThoughtOnReq: true,
// make sure these default values are in sync with `common.h`
samplers: 'edkypmxt',
temperature: 0.8,
@@ -179,7 +172,6 @@ const MessageBubble = defineComponent({
config: Object,
msg: Object,
isGenerating: Boolean,
showThoughtInProgress: Boolean,
editUserMsgAndRegenerate: Function,
regenerateMsg: Function,
},
@@ -196,31 +188,7 @@ const MessageBubble = defineComponent({
prompt_per_second: this.msg.timings.prompt_n / (this.msg.timings.prompt_ms / 1000),
predicted_per_second: this.msg.timings.predicted_n / (this.msg.timings.predicted_ms / 1000),
};
},
splitMsgContent() {
const content = this.msg.content;
if (this.msg.role !== 'assistant') {
return { content };
}
let actualContent = '';
let cot = '';
let isThinking = false;
let thinkSplit = content.split('<think>', 2);
actualContent += thinkSplit[0];
while (thinkSplit[1] !== undefined) {
// <think> tag found
thinkSplit = thinkSplit[1].split('</think>', 2);
cot += thinkSplit[0];
isThinking = true;
if (thinkSplit[1] !== undefined) {
// </think> closing tag found
isThinking = false;
thinkSplit = thinkSplit[1].split('<think>', 2);
actualContent += thinkSplit[0];
}
}
return { content: actualContent, cot, isThinking };
},
}
},
methods: {
copyMsg() {
@@ -240,10 +208,7 @@ const MessageBubble = defineComponent({
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
const StorageUtils = {
/**
* manage conversations
* @returns {Array<Conversation>}
*/
// manage conversations
getAllConversations() {
const res = [];
for (const key in localStorage) {
@@ -254,19 +219,11 @@ const StorageUtils = {
res.sort((a, b) => b.lastModified - a.lastModified);
return res;
},
/**
* can return null if convId does not exist
* @param {string} convId
* @returns {Conversation | null}
*/
// can return null if convId does not exist
getOneConversation(convId) {
return JSON.parse(localStorage.getItem(convId) || 'null');
},
/**
* if convId does not exist, create one
* @param {string} convId
* @param {Message} msg
*/
// if convId does not exist, create one
appendMsg(convId, msg) {
if (msg.content === null) return;
const conv = StorageUtils.getOneConversation(convId) || {
@@ -278,24 +235,12 @@ const StorageUtils = {
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* Get new conversation id
* @returns {string}
*/
getNewConvId() {
return `conv-${Date.now()}`;
},
/**
* remove conversation by id
* @param {string} convId
*/
remove(convId) {
localStorage.removeItem(convId);
},
/**
* remove all conversations
* @param {string} convId
*/
filterAndKeepMsgs(convId, predicate) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
@@ -303,11 +248,6 @@ const StorageUtils = {
conv.lastModified = Date.now();
localStorage.setItem(convId, JSON.stringify(conv));
},
/**
* remove last message from conversation
* @param {string} convId
* @returns {Message | undefined}
*/
popMsg(convId) {
const conv = StorageUtils.getOneConversation(convId);
if (!conv) return;
@@ -382,12 +322,10 @@ const mainApp = createApp({
data() {
return {
conversations: StorageUtils.getAllConversations(),
/** @type {Array<Message>} */
messages: [],
messages: [], // { id: number, role: 'user' | 'assistant', content: string }
viewingConvId: StorageUtils.getNewConvId(),
inputMsg: '',
isGenerating: false,
/** @type {Array<Message> | null} */
pendingMsg: null, // the on-going message from assistant
stopGeneration: () => {},
selectedTheme: StorageUtils.getTheme(),
@@ -395,7 +333,6 @@ const mainApp = createApp({
showConfigDialog: false,
// const
themes: THEMES,
/** @type {CONFIG_DEFAULT} */
configDefault: {...CONFIG_DEFAULT},
configInfo: {...CONFIG_INFO},
isDev,
@@ -488,50 +425,42 @@ const mainApp = createApp({
this.isGenerating = true;
try {
/** @type {CONFIG_DEFAULT} */
const config = this.config;
const abortController = new AbortController();
this.stopGeneration = () => abortController.abort();
/** @type {Array<APIMessage>} */
let messages = [
{ role: 'system', content: config.systemMessage },
...normalizeMsgsForAPI(this.messages),
];
if (config.excludeThoughtOnReq) {
messages = filterThoughtFromMsgs(messages);
}
if (isDev) console.log({messages});
const params = {
messages,
messages: [
{ role: 'system', content: this.config.systemMessage },
...this.messages,
],
stream: true,
cache_prompt: true,
samplers: config.samplers,
temperature: config.temperature,
dynatemp_range: config.dynatemp_range,
dynatemp_exponent: config.dynatemp_exponent,
top_k: config.top_k,
top_p: config.top_p,
min_p: config.min_p,
typical_p: config.typical_p,
xtc_probability: config.xtc_probability,
xtc_threshold: config.xtc_threshold,
repeat_last_n: config.repeat_last_n,
repeat_penalty: config.repeat_penalty,
presence_penalty: config.presence_penalty,
frequency_penalty: config.frequency_penalty,
dry_multiplier: config.dry_multiplier,
dry_base: config.dry_base,
dry_allowed_length: config.dry_allowed_length,
dry_penalty_last_n: config.dry_penalty_last_n,
max_tokens: config.max_tokens,
timings_per_token: !!config.showTokensPerSecond,
...(config.custom.length ? JSON.parse(config.custom) : {}),
samplers: this.config.samplers,
temperature: this.config.temperature,
dynatemp_range: this.config.dynatemp_range,
dynatemp_exponent: this.config.dynatemp_exponent,
top_k: this.config.top_k,
top_p: this.config.top_p,
min_p: this.config.min_p,
typical_p: this.config.typical_p,
xtc_probability: this.config.xtc_probability,
xtc_threshold: this.config.xtc_threshold,
repeat_last_n: this.config.repeat_last_n,
repeat_penalty: this.config.repeat_penalty,
presence_penalty: this.config.presence_penalty,
frequency_penalty: this.config.frequency_penalty,
dry_multiplier: this.config.dry_multiplier,
dry_base: this.config.dry_base,
dry_allowed_length: this.config.dry_allowed_length,
dry_penalty_last_n: this.config.dry_penalty_last_n,
max_tokens: this.config.max_tokens,
timings_per_token: !!this.config.showTokensPerSecond,
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
};
const chunks = sendSSEPostRequest(`${BASE_URL}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
...(config.apiKey ? {'Authorization': `Bearer ${config.apiKey}`} : {})
...(this.config.apiKey ? {'Authorization': `Bearer ${this.config.apiKey}`} : {})
},
body: JSON.stringify(params),
signal: abortController.signal,
@@ -548,7 +477,7 @@ const mainApp = createApp({
};
}
const timings = chunk.timings;
if (timings && config.showTokensPerSecond) {
if (timings && this.config.showTokensPerSecond) {
// only extract what's really needed, to save some space
this.pendingMsg.timings = {
prompt_n: timings.prompt_n,
@@ -669,33 +598,3 @@ try {
<button class="btn" onClick="localStorage.clear(); window.location.reload();">Clear localStorage</button>
</div>`;
}
/**
* filter out redundant fields upon sending to API
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function normalizeMsgsForAPI(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.content,
};
});
}
/**
* recommended for DeepsSeek-R1, filter out content between <think> and </think> tags
* @param {Array<APIMessage>} messages
* @returns {Array<APIMessage>}
*/
function filterThoughtFromMsgs(messages) {
return messages.map((msg) => {
return {
role: msg.role,
content: msg.role === 'assistant'
? msg.content.split('</think>').at(-1).trim()
: msg.content,
};
});
}

View File

@@ -98,12 +98,10 @@ int main(int argc, char ** argv) {
auto generate = [&](const std::string & prompt) {
std::string response;
const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
// tokenize the prompt
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
std::vector<llama_token> prompt_tokens(n_prompt_tokens);
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), is_first, true) < 0) {
if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
GGML_ABORT("failed to tokenize the prompt\n");
}
@@ -163,7 +161,7 @@ int main(int argc, char ** argv) {
break;
}
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
const char * tmpl = llama_model_chat_template(model);
// add the user input to the message list and format it
messages.push_back({"user", strdup(user.c_str())});

View File

@@ -425,33 +425,6 @@ static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) {
prompt_add(prompt, vocab, "<|im_start|>\n", true, true);
}
static std::vector<llama_token> prepare_guide_tokens(const llama_vocab * vocab, const std::string & str) {
const std::string& delimiter = "<|text_sep|>";
std::vector<llama_token> result;
size_t start = 0;
size_t end = str.find(delimiter);
//first token is always a newline, as it was not previously added
result.push_back(common_tokenize(vocab, "\n", false, true)[0]);
while (end != std::string::npos) {
std::string current_word = str.substr(start, end - start);
auto tmp = common_tokenize(vocab, current_word, false, true);
result.push_back(tmp[0]);
start = end + delimiter.length();
end = str.find(delimiter, start);
}
// Add the last part
std::string current_word = str.substr(start);
auto tmp = common_tokenize(vocab, current_word, false, true);
if (tmp.size() > 0) {
result.push_back(tmp[0]);
}
return result;
}
int main(int argc, char ** argv) {
common_params params;
@@ -521,7 +494,6 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us();
std::vector<llama_token> codes;
std::vector<llama_token> guide_tokens;
// process prompt and generate voice codes
{
@@ -536,9 +508,6 @@ int main(int argc, char ** argv) {
// convert the input text into the necessary format expected by OuteTTS
{
std::string prompt_clean = process_text(params.prompt);
if (params.vocoder.use_guide_tokens) {
guide_tokens = prepare_guide_tokens(vocab, prompt_clean);
}
LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());
@@ -748,8 +717,6 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
int n_past = batch.n_tokens;
int n_decode = 0;
bool next_token_uses_guide_token = true;
while (n_decode <= n_predict) {
// prepare the next batch
common_batch_clear(batch);
@@ -761,17 +728,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
continue;
}
llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
//guide tokens help prevent hallucinations by forcing the TTS to use the correct word
if (!guide_tokens.empty() && next_token_uses_guide_token && !llama_vocab_is_control(vocab, new_token_id) && !llama_vocab_is_eog(vocab, new_token_id)) {
llama_token guide_token = guide_tokens[0];
guide_tokens.erase(guide_tokens.begin());
new_token_id = guide_token; //ensure correct word fragment is used
}
//this is the token id that always precedes a new word
next_token_uses_guide_token = (new_token_id == 198);
const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
common_sampler_accept(smpl[i], new_token_id, true);

View File

@@ -58,8 +58,7 @@ else()
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif()
if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF")
if (CMAKE_CROSSCOMPILING)
set(GGML_NATIVE_DEFAULT OFF)
else()
set(GGML_NATIVE_DEFAULT ON)
@@ -154,7 +153,6 @@ option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashA
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)

View File

@@ -203,8 +203,6 @@ extern "C" {
// Backend registry
//
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// Backend (reg) enumeration
GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);

View File

@@ -1384,20 +1384,16 @@ extern "C" {
float scale,
float max_bias);
GGML_API struct ggml_tensor * ggml_soft_max_ext_back(
GGML_API struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
float scale,
float max_bias);
struct ggml_tensor * b);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace(
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
float scale,
float max_bias);
struct ggml_tensor * b);
// rotary position embedding
// if (mode & 1) - skip n_past elements (NOT SUPPORTED)

View File

@@ -37,7 +37,6 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
return true;
}
// ops that return true for this function must not use restrict pointers for their backend implementations
static bool ggml_op_can_inplace(enum ggml_op op) {
switch (op) {
case GGML_OP_SCALE:
@@ -53,12 +52,8 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
case GGML_OP_LOG:
case GGML_OP_UNARY:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
case GGML_OP_SILU_BACK:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK:
return true;
default:

View File

@@ -208,6 +208,7 @@ extern "C" {
// Internal backend registry API
GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// Add backend dynamic loading support to the backend

View File

@@ -5573,88 +5573,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
uint32_t utmp[4];
#ifdef __ARM_FEATURE_SVE
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
memcpy(utmp, x[i].scales, K_SCALE_SIZE);
uint32x2_t mins8 = { 0 };
mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[0] &= kmask1;
const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
sumf -= dmin * vaddvq_s32(prod);
const uint8_t * scales = (const uint8_t *)utmp;
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
const int vector_length = ggml_cpu_get_sve_cnt()*8;
const svuint8_t m4b = svdup_n_u8(0xf);
const svint32_t mzero = svdup_n_s32(0);
svint32_t sumi1 = svdup_n_s32(0);
svint32_t sumi1_1 = svdup_n_s32(0);
svint32_t sumi1_2 = svdup_n_s32(0);
svint32_t sumi2 = svdup_n_s32(0);
svint32_t sumi2_1 = svdup_n_s32(0);
svint32_t sumi2_2 = svdup_n_s32(0);
switch (vector_length) {
case 128:
{
for (int j = 0; j < QK_K/64; ++j) {
svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), m4b));
svint8_t q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
sumi1_1 = svmla_n_s32_x(svptrue_b32(), sumi1_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), m4b));
q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
sumi1_2 = svmla_n_s32_x(svptrue_b32(), sumi1_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4), 4));
q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
sumi2_1 = svmla_n_s32_x(svptrue_b32(), sumi2_1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_b8(), svld1_u8(svptrue_b8(), q4+16), 4));
q8bytes = svld1_s8(svptrue_b8(), q8); q8 += 16;
sumi2_2 = svmla_n_s32_x(svptrue_b32(), sumi2_2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
q4 += 32;
}
sumi1 = svadd_s32_x(svptrue_b32(), sumi1_1, sumi1_2);
sumi2 = svadd_s32_x(svptrue_b32(), sumi2_1, sumi2_2);
sumf += d * (svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sumi1, sumi2)));
} break;
case 256:
case 512:
{
for (int j = 0; j < QK_K/64; ++j) {
const svuint8_t q4bits = svld1_u8(svptrue_pat_b8(SV_VL32), q4); q4 += 32;
svint8_t q4bytes = svreinterpret_s8_u8(svand_u8_x(svptrue_pat_b8(SV_VL32), q4bits, m4b));
svint8_t q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
sumi1 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi1, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+0]);
q4bytes = svreinterpret_s8_u8(svlsr_n_u8_x(svptrue_pat_b8(SV_VL32), q4bits, 4));
q8bytes = svld1_s8(svptrue_pat_b8(SV_VL32), q8); q8 += 32;
sumi2 = svmla_n_s32_x(svptrue_pat_b32(SV_VL8), sumi2, svdot_s32(mzero, q4bytes, q8bytes), scales[2*j+1]);
}
sumf += d * (svaddv_s32(svptrue_pat_b32(SV_VL8), svadd_s32_x(svptrue_pat_b32(SV_VL8), sumi1, sumi2)));
} break;
default:
assert(false && "Unsupported vector length");
break;
}
}
*s = sumf;
#elif __ARM_NEON
#ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf);
const int32x4_t mzero = vdupq_n_s32(0);

View File

@@ -3967,57 +3967,6 @@ static void ggml_compute_forward_dup_bytes(
}
}
static void ggml_compute_forward_dup_q(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_TENSOR_BINARY_OP_LOCALS
const enum ggml_type type = src0->type;
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
size_t qk = ggml_blck_size(type);
const int64_t nr = ggml_nelements(src1) / qk;
// destination must be contiguous in the first dimension
GGML_ASSERT(nb10 == ggml_type_size(dst->type));
// must either have first dimension large enough to hold a row, or fully contiguous
GGML_ASSERT((ne10 % qk) == 0 || ggml_is_contiguous(dst));
const int ith = params->ith;
const int nth = params->nth;
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
for (int64_t ir = ir0; ir < ir1; ++ir) {
uint32_t i = ir * qk;
const int64_t i03 = i/(ne00 * ne01 * ne02);
const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
const int64_t i13 = i/(ne10 * ne11 * ne12);
const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
dequantize_row_q(
(const void *) ((char *) src0->data + x_offset),
(float *) ((char *) dst->data + dst_offset), qk);
}
}
static void ggml_compute_forward_dup(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -4044,10 +3993,6 @@ static void ggml_compute_forward_dup(
} break;
default:
{
if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
ggml_compute_forward_dup_q(params, dst);
break;
}
GGML_ABORT("fatal error");
}
}
@@ -6746,20 +6691,20 @@ static void ggml_compute_forward_silu_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * grad = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * grad = dst->src[1];
assert(ggml_is_contiguous_1(grad));
assert(ggml_is_contiguous_1(src1));
assert(ggml_is_contiguous_1(src0));
assert(ggml_is_contiguous_1(dst));
assert(ggml_are_same_shape(src1, dst));
assert(ggml_are_same_shape(src1, grad));
assert(ggml_are_same_shape(src0, dst));
assert(ggml_are_same_shape(src0, grad));
const int ith = params->ith;
const int nth = params->nth;
const int nc = src1->ne[0];
const int nr = ggml_nrows(src1);
const int nc = src0->ne[0];
const int nr = ggml_nrows(src0);
// rows per thread
const int dr = (nr + nth - 1)/nth;
@@ -6771,7 +6716,7 @@ static void ggml_compute_forward_silu_back_f32(
for (int i1 = ir0; i1 < ir1; i1++) {
ggml_vec_silu_backward_f32(nc,
(float *) ((char *) dst->data + i1*( dst->nb[1])),
(float *) ((char *) src1->data + i1*(src1->nb[1])),
(float *) ((char *) src0->data + i1*(src0->nb[1])),
(float *) ((char *) grad->data + i1*(grad->nb[1])));
#ifndef NDEBUG
@@ -6950,7 +6895,7 @@ static void ggml_compute_forward_norm_f32(
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
GGML_ASSERT(eps > 0.0f);
// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7021,7 +6966,7 @@ static void ggml_compute_forward_rms_norm_f32(
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
GGML_ASSERT(eps > 0.0f);
// TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7073,13 +7018,12 @@ static void ggml_compute_forward_rms_norm_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; // gradients from forward pass output
const struct ggml_tensor * src1 = dst->src[1]; // src1 from forward pass
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1));
GGML_ASSERT(src0->nb[0] == sizeof(float));
GGML_ASSERT(src1->nb[0] == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
@@ -7098,8 +7042,8 @@ static void ggml_compute_forward_rms_norm_back_f32(
const int64_t i12 = i02;
const int64_t i13 = i03;
const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
const float * x = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
ggml_float sum_xx = 0.0;
ggml_float sum_xdz = 0.0;
@@ -7122,9 +7066,9 @@ static void ggml_compute_forward_rms_norm_back_f32(
{
// z = rms_norm(x)
//
// rms_norm(src1) =
// rms_norm(src0) =
// scale(
// src1,
// src0,
// div(
// 1,
// sqrt(
@@ -7132,13 +7076,13 @@ static void ggml_compute_forward_rms_norm_back_f32(
// scale(
// sum(
// sqr(
// src1)),
// src0)),
// (1.0/N)),
// eps))));
// postorder:
// ## op args grad
// 00 param src1 grad[#00]
// 00 param src0 grad[#00]
// 01 const 1
// 02 sqr (#00) grad[#02]
// 03 sum (#02) grad[#03]
@@ -7215,7 +7159,6 @@ static void ggml_compute_forward_rms_norm_back_f32(
// dx := scale(dx, rrms)
float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
// dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps)
ggml_vec_cpy_f32 (ne00, dx, x);
// ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
@@ -7807,13 +7750,12 @@ static void ggml_compute_forward_out_prod_f32(
const int ith = params->ith;
const int nth = params->nth;
GGML_ASSERT(ne0 == ne00);
GGML_ASSERT(ne1 == ne10);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
GGML_ASSERT(ne2 % ne02 == 0);
GGML_ASSERT(ne3 % ne03 == 0);
GGML_ASSERT(ne0 == ne00);
GGML_ASSERT(ne1 == ne10);
GGML_ASSERT(ne2 == ne02);
GGML_ASSERT(ne02 == ne12);
GGML_ASSERT(ne3 == ne13);
GGML_ASSERT(ne03 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == sizeof(float));
@@ -7855,10 +7797,6 @@ static void ggml_compute_forward_out_prod_f32(
const int64_t blck_0 = MAX(GGML_VEC_MAD_UNROLL, 32);
const int64_t blck_1 = 16;
// dps == dst per src0, used for group query attention
const int64_t dps2 = ne2 / ne02;
const int64_t dps3 = ne3 / ne03;
for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
const int64_t bir1 = MIN(bir + blck_1, ir1);
for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
@@ -7869,8 +7807,8 @@ static void ggml_compute_forward_out_prod_f32(
const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
const int64_t i02 = i2 / dps2;
const int64_t i03 = i3 / dps3;
const int64_t i02 = i2;
const int64_t i03 = i3;
//const int64_t i10 = i1;
const int64_t i12 = i2;
@@ -7883,7 +7821,7 @@ static void ggml_compute_forward_out_prod_f32(
float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
}
@@ -7892,7 +7830,7 @@ static void ggml_compute_forward_out_prod_f32(
float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
ggml_vec_mad_f32(ne0, d, s0, *s1);
}
@@ -8968,9 +8906,9 @@ static void ggml_compute_forward_soft_max(
}
// ggml_compute_forward_soft_max_ext_back
// ggml_compute_forward_soft_max_back
static void ggml_compute_forward_soft_max_ext_back_f32(
static void ggml_compute_forward_soft_max_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -8983,14 +8921,6 @@ static void ggml_compute_forward_soft_max_ext_back_f32(
GGML_ASSERT(ggml_are_same_shape(src0, dst));
GGML_ASSERT(ggml_are_same_shape(src1, dst));
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
GGML_ASSERT(max_bias == 0.0f);
// TODO: handle transposed/permuted matrices
const int ith = params->ith;
@@ -9039,11 +8969,10 @@ static void ggml_compute_forward_soft_max_ext_back_f32(
// linear runtime, no additional memory
float dot_y_dy = 0;
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
ggml_vec_cpy_f32 (nc, dx, dy);
ggml_vec_acc1_f32 (nc, dx, -dot_y_dy);
ggml_vec_mul_f32 (nc, dx, dx, y);
ggml_vec_scale_f32(nc, dx, scale);
ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
ggml_vec_cpy_f32 (nc, dx, dy);
ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
ggml_vec_mul_f32 (nc, dx, dx, y);
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
@@ -9054,7 +8983,7 @@ static void ggml_compute_forward_soft_max_ext_back_f32(
}
}
static void ggml_compute_forward_soft_max_ext_back(
static void ggml_compute_forward_soft_max_back(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
@@ -9063,7 +8992,7 @@ static void ggml_compute_forward_soft_max_ext_back(
switch (src0->type) {
case GGML_TYPE_F32:
{
ggml_compute_forward_soft_max_ext_back_f32(params, dst);
ggml_compute_forward_soft_max_back_f32(params, dst);
} break;
default:
{
@@ -10056,10 +9985,9 @@ static void ggml_compute_forward_im2col_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
const struct ggml_tensor * src1 = dst->src[1]; // convolution kernel
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -10081,11 +10009,11 @@ static void ggml_compute_forward_im2col_back_f32(
const int64_t IH = is_2D ? ne1 : 1;
const int64_t IW = ne0;
const int64_t KH = is_2D ? ne11 : 1;
const int64_t KW = ne10;
const int64_t KH = is_2D ? ne01 : 1;
const int64_t KW = ne00;
const int64_t OH = is_2D ? ne02 : 1;
const int64_t OW = ne01;
const int64_t OH = is_2D ? ne12 : 1;
const int64_t OW = ne11;
int ofs0 = is_2D ? nb3 : nb2;
int ofs1 = is_2D ? nb2 : nb1;
@@ -10131,9 +10059,9 @@ static void ggml_compute_forward_im2col_back_f32(
continue;
}
const float * const grad_in = (const float *) src0->data
const float * const src_data = (const float *) src1->data
+ (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
grad += grad_in[iic*(KH*KW) + ikh*KW + ikw];
grad += src_data[iic*(KH*KW) + ikh*KW + ikw];
}
}
float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW]
@@ -12556,22 +12484,22 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * grad = dst->src[0]; // gradient of forward pass output
const struct ggml_tensor * src0f = dst->src[1]; // src0 of forward pass
const struct ggml_tensor * src1f = dst->src[2]; // src1 of forward pass
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * opt0 = dst->src[2];
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_is_contiguous(src0f));
GGML_ASSERT(ggml_is_contiguous(src1f));
GGML_ASSERT(ggml_is_contiguous(grad));
GGML_ASSERT(ggml_are_same_shape(src0f, src1f) && ggml_are_same_shape(src0f, dst));
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_contiguous(opt0));
GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
const int64_t ith = params->ith;
const int64_t nth = params->nth;
// TODO: handle transposed/permuted matrices
const int64_t nc = src0f->ne[0];
const int64_t nr = ggml_nrows(src0f);
const int64_t nc = src0->ne[0];
const int64_t nr = ggml_nrows(src0);
// rows per thread
const int64_t dr = (nr + nth - 1)/nth;
@@ -12580,12 +12508,12 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
const int64_t ir0 = dr*ith;
const int64_t ir1 = MIN(ir0 + dr, nr);
const float d_by_nr = ((const float *) grad->data)[0] / (float) nr;
const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr;
for (int64_t i1 = ir0; i1 < ir1; i1++) {
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
const float * s0 = (const float *)((const char *) src0f->data + i1*src0f->nb[1]);
const float * s1 = (const float *)((const char *) src1f->data + i1*src1f->nb[1]);
float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
#ifndef NDEBUG
for (int64_t i = 0; i < nc; ++i) {
@@ -12598,11 +12526,11 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
// soft_max
float max = -INFINITY;
ggml_vec_max_f32(nc, &max, s0);
const ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
ggml_float sum = ggml_vec_soft_max_f32(nc, ds0, s0, max);
assert(sum > 0.0);
ggml_vec_scale_f32(nc, ds0, 1.0/sum);
// grad(src0f) = (softmax(src0f) - src1f) * grad(cross_entropy_loss(src0f, src1f)) / nr
// grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
ggml_vec_sub_f32(nc, ds0, ds0, s1);
ggml_vec_scale_f32(nc, ds0, d_by_nr);
@@ -12899,7 +12827,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
} break;
case GGML_OP_SOFT_MAX_BACK:
{
ggml_compute_forward_soft_max_ext_back(params, tensor);
ggml_compute_forward_soft_max_back(params, tensor);
} break;
case GGML_OP_ROPE:
{

View File

@@ -403,21 +403,10 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
case GGML_OP_SOFT_MAX_BACK: {
if (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32) {
return false;
}
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
return max_bias == 0.0f;
}
case GGML_OP_IM2COL_BACK:
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD:
return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
default:
return true;
}

View File

@@ -93,31 +93,26 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
template <typename T>
static __global__ void k_repeat_back(
const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const size_t s00, const size_t s01, const size_t s02, const size_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3) {
const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t ne0, const int64_t ne1, const int64_t ne2) {
const int64_t tid0 = int64_t(blockIdx.x)*blockDim.x + threadIdx.x;
const int64_t tid1 = int64_t(blockIdx.y)*blockDim.y + threadIdx.y;
const int64_t tid23 = int64_t(blockIdx.z)*blockDim.z + threadIdx.z;
const int64_t tid2 = tid23 % ne2;
const int64_t tid3 = tid23 / ne2;
const int64_t tid0 = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
const int64_t tid1 = (int64_t) blockIdx.y*blockDim.y + threadIdx.y;
const int64_t tid2 = (int64_t) blockIdx.z*blockDim.z + threadIdx.z;
if (tid0 >= ne0) {
return;
}
T sum = 0;
for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) {
for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
sum += src[i3*s03 + i2*s02 + i1*s01 + i0*s00];
}
for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) {
for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) {
for (int64_t i0 = tid0; i0 < ne00; i0 += ne0) {
sum += src[i2*ne01*ne00 + i1*ne00 + i0];
}
}
}
dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
dst[tid2*ne1*ne0 + tid1*ne0 + tid0] = sum;
}
template<float (*bin_op)(const float, const float)>
@@ -279,14 +274,12 @@ struct bin_bcast_cuda {
template <typename T>
static void repeat_back_cuda(
const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const size_t s00, const size_t s01, const size_t s02, const size_t s03,
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
const int64_t ne0, const int64_t ne1, const int64_t ne2, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2*ne3);
k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>
(src, dst, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3);
const dim3 block_nums((ne0 + WARP_SIZE - 1) / WARP_SIZE, ne1, ne2);
k_repeat_back<T><<<block_nums, block_dims, 0, stream>>>(src, dst, ne00, ne01, ne02, ne0, ne1, ne2);
}
template<class op>
@@ -333,26 +326,27 @@ void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(src0->type == dst->type);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_can_repeat(dst, src0));
cudaStream_t stream = ctx.stream();
GGML_TENSOR_UNARY_OP_LOCALS;
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
GGML_ASSERT(src0->ne[3] == 1);
GGML_ASSERT(ne2*ne3 <= (1 << 15));
const size_t ts = ggml_type_size(src0->type);
const size_t s00 = nb00 / ts;
const size_t s01 = nb01 / ts;
const size_t s02 = nb02 / ts;
const size_t s03 = nb03 / ts;
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
const int64_t ne2 = dst->ne[2];
GGML_ASSERT(dst->ne[3] == 1);
switch (dst->type) {
case GGML_TYPE_F32: {
const float * src0_d = (const float *) src0->data;
float * dst_d = (float *) dst->data;
repeat_back_cuda(src0_d, dst_d, ne00, ne01, ne02, ne03, s00, s01, s02, s03, ne0, ne1, ne2, ne3, stream);
repeat_back_cuda<float>(src0_d, dst_d, ne00, ne01, ne02, ne0, ne1, ne2, stream);
} break;
default: {
GGML_ASSERT(false);

View File

@@ -588,7 +588,7 @@ struct ggml_tensor_extra_gpu {
};
#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
#define USE_CUDA_GRAPH
#endif

View File

@@ -5,89 +5,95 @@
#include <cmath>
#include <cstdint>
template <bool use_shared>
static __global__ void cross_entropy_loss_f32(
const float * __restrict__ logits, const float * __restrict__ labels, float * __restrict__ dst, const int nclasses, const int k) {
extern __shared__ float tmp[];
static __global__ void cross_entropy_loss_f32(const float * logits, const float * labels, float * dst, const int nclasses, const int k) {
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
const int i0 = blockDim.x*blockIdx.x + warp_id*WARP_SIZE;
logits += int64_t(blockIdx.x)*nclasses;
labels += int64_t(blockIdx.x)*nclasses;
const int ne_tmp = WARP_SIZE*nclasses;
extern __shared__ float tmp_all[];
float * tmp_logits = tmp_all + (2*warp_id + 0)*ne_tmp;
float * tmp_labels = tmp_all + (2*warp_id + 1)*ne_tmp;
// Each warp first loads ne_tmp logits/labels into shared memory:
for (int i = lane_id; i < ne_tmp; i += WARP_SIZE) {
const int ig = i0*nclasses + i; // ig == i global
tmp_logits[i] = ig < k*nclasses ? logits[ig] : 0.0f;
tmp_labels[i] = ig < k*nclasses ? labels[ig] : 0.0f;
}
// Each thread in the warp then calculates the cross entropy loss for a single row.
// TODO: pad in order to avoid shared memory bank conflicts.
// Find maximum for softmax:
float max_logit = -INFINITY;
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
const float val = logits[i];
max_logit = fmaxf(max_logit, val);
if (use_shared) {
tmp[i] = val;
}
float max = -INFINITY;
for (int i = 0; i < nclasses; ++i) {
max = fmaxf(max, tmp_logits[lane_id*nclasses + i]);
}
max_logit = warp_reduce_max(max_logit);
// Calculate log(softmax(logits)) which is just logits - max:
float sum = 0.0f;
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
const float logit_i = use_shared ? tmp[i] : logits[i];
sum += expf(logit_i - max_logit);
for (int i = 0; i < nclasses; ++i) {
float val = tmp_logits[lane_id*nclasses + i] - max;
sum += expf(val);
tmp_logits[lane_id*nclasses + i] = val;
}
sum = warp_reduce_sum(sum);
sum = logf(sum);
// log(exp(logits - max) / sum) = (logits - max) - log(sum)
float loss = 0.0f;
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
const float logit_i = use_shared ? tmp[i] : logits[i];
loss += (logit_i - max_logit - sum) * labels[i];
for (int i = 0; i < nclasses; ++i) {
loss += (tmp_logits[lane_id*nclasses + i] - sum) * tmp_labels[lane_id*nclasses + i];
}
loss = -warp_reduce_sum(loss) / (float)k;
if (threadIdx.x != 0) {
__syncthreads();
if (lane_id == 0) {
tmp_all[warp_id] = loss;
}
__syncthreads();
if (warp_id != 0) {
return;
}
loss = lane_id < CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE/WARP_SIZE ? tmp_all[lane_id] : 0.0f;
loss = warp_reduce_sum(loss);
if (lane_id != 0) {
return;
}
dst[blockIdx.x] = loss;
}
template <bool use_shared>
static __global__ void cross_entropy_loss_back_f32(
const float * __restrict__ grad, const float * __restrict__ logits, const float * __restrict__ labels,
float * __restrict__ dst, const int nclasses) {
static __global__ void cross_entropy_loss_back_f32(const float * logits, const float * labels, const float * loss, float * dst, const int nclasses) {
extern __shared__ float tmp[];
logits += int64_t(blockIdx.x)*nclasses;
labels += int64_t(blockIdx.x)*nclasses;
dst += int64_t(blockIdx.x)*nclasses;
float maxval = -INFINITY;
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
const float val = logits[i];
const float val = logits[blockIdx.x*nclasses + i];
maxval = fmaxf(maxval, val);
if (use_shared) {
tmp[i] = val;
}
tmp[i] = val;
}
maxval = warp_reduce_max(maxval);
float sum = 0.0f;
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
const float val = expf((use_shared ? tmp[i] : logits[i]) - maxval);
const float val = expf(tmp[i] - maxval);
sum += val;
if (use_shared) {
tmp[i] = val;
} else {
dst[i] = val;
}
tmp[i] = val;
}
sum = warp_reduce_sum(sum);
const float sm_scale = 1.0f/sum;
const float d_by_nrows = *grad/gridDim.x;
const float d_by_nrows = *loss/gridDim.x;
for (int i = threadIdx.x; i < nclasses; i += WARP_SIZE) {
const float val = use_shared ? tmp[i] : dst[i];
dst[i] = (val*sm_scale - labels[i])*d_by_nrows;
dst[blockIdx.x*nclasses + i] = (tmp[i]*sm_scale - labels[blockIdx.x*nclasses + i])*d_by_nrows;
}
}
@@ -113,77 +119,48 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
ggml_cuda_pool & pool = ctx.pool();
cudaStream_t stream = ctx.stream();
const dim3 blocks_dim(WARP_SIZE, 1, 1);
const dim3 blocks_num(nrows, 1, 1);
const size_t nbytes_shared = ne00*sizeof(float);
const int id = ggml_cuda_get_device();
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
const dim3 blocks_dim(CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
const dim3 blocks_num((nrows + CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE - 1) / CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE, 1, 1);
const int shmem = 2*CUDA_CROSS_ENTROPY_LOSS_BLOCK_SIZE*ne00*sizeof(float);
ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
if (nbytes_shared <= smpbo) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
shared_memory_limit_raised[id] = true;
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
} else {
cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
}
CUDA_CHECK(cudaGetLastError());
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
// Combine results from individual blocks:
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
}
void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * grad = dst->src[0];
const ggml_tensor * src0f = dst->src[1];
const ggml_tensor * src1f = dst->src[2];
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * opt0 = dst->src[2];
GGML_ASSERT(src0f->type == GGML_TYPE_F32);
GGML_ASSERT(src1f->type == GGML_TYPE_F32);
GGML_ASSERT( grad->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(opt0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_scalar(grad));
GGML_ASSERT(ggml_is_contiguous(src0f));
GGML_ASSERT(ggml_is_contiguous(src1f));
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_contiguous(opt0));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_are_same_shape(src0f, src1f));
GGML_ASSERT(ggml_are_same_shape(src0f, dst));
GGML_ASSERT(ggml_are_same_shape(src0, src1));
GGML_ASSERT(ggml_are_same_shape(src0, dst));
const int64_t ne00 = src0f->ne[0];
const int64_t nrows = ggml_nrows(src0f);
const int64_t ne00 = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
const float * grad_d = (const float *) grad->data;
const float * src0f_d = (const float *) src0f->data;
const float * src1f_d = (const float *) src1f->data;
float * dst_d = (float *) dst->data;
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
const float * opt0_d = (const float *) opt0->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
const dim3 blocks_dim(WARP_SIZE, 1, 1);
const dim3 blocks_num(nrows, 1, 1);
const size_t nbytes_shared = ne00*sizeof(float);
const int shmem = ne00*sizeof(float);
const int id = ggml_cuda_get_device();
const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
if (nbytes_shared <= smpbo) {
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
if (!shared_memory_limit_raised[id]) {
CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
shared_memory_limit_raised[id] = true;
}
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
} else {
cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
}
cross_entropy_loss_back_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, opt0_d, dst_d, ne00);
}

View File

@@ -3,15 +3,15 @@
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
const void * src0, const int32_t * src1, dst_t * dst,
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
@@ -22,10 +22,10 @@ static __global__ void k_get_rows(
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03;
const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
const int ib = i00/qk; // block index
const int iqs = (i00%qk)/qr; // quant index
const int ib = i00/qk; // block index
const int iqs = (i00%qk)/qr; // quant index
const int iybs = i00 - i00%qk; // dst block start index
const int y_offset = qr == 1 ? 1 : qk/2;
@@ -39,15 +39,15 @@ static __global__ void k_get_rows(
template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
const src0_t * src0, const int32_t * src1, dst_t * dst,
int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
/*size_t s0,*/ size_t s1, size_t s2, size_t s3,
/*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
size_t s10, size_t s11, size_t s12/*, size_t s13*/) {
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
const int i00 = blockIdx.x*blockDim.x + threadIdx.x;
const int i10 = blockDim.y*blockIdx.y + threadIdx.y;
const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
@@ -58,38 +58,14 @@ static __global__ void k_get_rows_float(
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
dst_row[i00] = src0_row[i00];
}
template<typename grad_t, typename dst_t>
static __global__ void k_get_rows_back_float(
const grad_t * __restrict__ grad, const int32_t * __restrict__ rows, dst_t * __restrict__ dst, const int64_t ncols, const int64_t nrows_grad) {
const int col = blockIdx.x*blockDim.x + threadIdx.x;
if (col >= ncols) {
return;
}
const int dst_row = blockIdx.y*blockDim.y + threadIdx.y;
float sum = 0.0f;
for (int64_t i = 0; i < nrows_grad; ++i) {
if (rows[i] != dst_row) {
continue;
}
sum += grad[i*ncols + col];
}
dst[dst_row*ncols + col] = sum;
}
template<int qk, int qr, dequantize_kernel_t dq>
static void get_rows_cuda(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS
@@ -111,25 +87,22 @@ static void get_rows_cuda(
GGML_ASSERT(ne00 % 2 == 0);
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd, dst_dd,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
src0_dd, src1_dd, dst_dd,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
GGML_UNUSED(dst);
}
template<typename src0_t>
static void get_rows_cuda_float(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(ne13 == 1);
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne10, ne11*ne12);
@@ -146,12 +119,12 @@ static void get_rows_cuda_float(
//const size_t s13 = nb13 / ggml_element_size(src1);
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd, dst_dd,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
src0_dd, src1_dd, dst_dd,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
GGML_UNUSED(dst);
}
@@ -159,41 +132,42 @@ static void get_rows_cuda_float(
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const void * src0_d = (const void *) src0->data;
const int32_t * src1_d = (const int32_t *) src1->data;
float * dst_d = (float *) dst->data;
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
const int32_t * src1_i32 = (const int32_t *) src1_d;
switch (src0->type) {
case GGML_TYPE_F16:
get_rows_cuda_float(src0, src1, dst, (const half *) src0_d, src1_d, dst_d, stream);
get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream);
break;
case GGML_TYPE_F32:
get_rows_cuda_float(src0, src1, dst, (const float *) src0_d, src1_d, dst_d, stream);
get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
break;
case GGML_TYPE_Q4_0:
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
break;
case GGML_TYPE_Q4_1:
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
break;
case GGML_TYPE_Q5_0:
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
break;
case GGML_TYPE_Q5_1:
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
break;
case GGML_TYPE_Q8_0:
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_d, dst_d, stream);
get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
break;
default:
// TODO: k-quants
@@ -201,34 +175,3 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
break;
}
}
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
const ggml_tensor * src1 = dst->src[1]; // src1 in forward pass
GGML_TENSOR_BINARY_OP_LOCALS
const float * src0_d = (const float *) src0->data;
const int32_t * src1_d = (const int32_t *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ne02*ne03 == 1);
GGML_ASSERT(ne12*ne13 == 1);
GGML_ASSERT(ne2*ne3 == 1);
const dim3 block_dims(CUDA_GET_ROWS_BACK_BLOCK_SIZE, 1, 1);
const int block_num_x = (ne00 + CUDA_GET_ROWS_BACK_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BACK_BLOCK_SIZE;
const dim3 block_nums(block_num_x, ne1, 1);
k_get_rows_back_float<<<block_nums, block_dims, 0, stream>>>(src0_d, src1_d, dst_d, ne00, ne10);
}

View File

@@ -1,8 +1,5 @@
#include "common.cuh"
#define CUDA_GET_ROWS_BLOCK_SIZE 256
#define CUDA_GET_ROWS_BACK_BLOCK_SIZE 256
void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_get_rows_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -62,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails
(void)cudaGetDevice(&id);
cudaGetDevice(&id);
GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
@@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
#if !defined(GGML_CUDA_NO_VMM)
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
}
#endif // !defined(GGML_CUDA_NO_VMM)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
info.devices[id].vmm = !!device_vmm;
cudaDeviceProp prop;
@@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
};
// pool with virtual memory
#if !defined(GGML_CUDA_NO_VMM)
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@@ -309,9 +309,6 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
size_t pool_used = 0;
size_t pool_size = 0;
size_t granularity;
#if defined(GGML_USE_HIP)
std::vector<std::pair<CUdeviceptr, size_t>> mappings;
#endif
explicit ggml_cuda_pool_vmm(int device) :
device(device),
@@ -320,14 +317,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
~ggml_cuda_pool_vmm() {
if (pool_addr != 0) {
#if defined(GGML_USE_HIP)
// Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
}
#else
CU_CHECK(cuMemUnmap(pool_addr, pool_size));
#endif
CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
}
}
@@ -360,11 +350,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
}
// map at the end of the pool
CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
#if defined(GGML_USE_HIP)
mappings.push_back({start_ptr, reserve_size});
#endif
CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
// the memory allocation handle is no longer needed after mapping
CU_CHECK(cuMemRelease(handle));
@@ -374,7 +360,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
access.location.id = device;
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
// add to the pool
pool_size += reserve_size;
@@ -386,7 +372,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(pool_addr != 0);
void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
void * ptr = (void *) (pool_addr + pool_used);
*actual_size = size;
pool_used += size;
@@ -405,17 +391,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
pool_used -= size;
// all deallocations must be in reverse order of the allocations
GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
}
};
#endif // !defined(GGML_CUDA_NO_VMM)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
#if !defined(GGML_CUDA_NO_VMM)
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
}
#endif // !defined(GGML_CUDA_NO_VMM)
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
}
@@ -561,7 +547,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
if (err != cudaSuccess) {
// clear the error
(void)cudaGetLastError();
cudaGetLastError();
GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
return nullptr;
}
@@ -976,7 +962,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
cudaError_t err = cudaMallocHost((void **) &ptr, size);
if (err != cudaSuccess) {
// clear the error
(void)cudaGetLastError();
cudaGetLastError();
GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return nullptr;
@@ -1096,9 +1082,7 @@ static void ggml_cuda_op_mul_mat_cublas(
const int compute_capability = ggml_cuda_info().devices[id].cc;
const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
if (compute_capability >= GGML_CUDA_CC_VOLTA && use_fp16) {
if (compute_capability >= GGML_CUDA_CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
if (src0->type != GGML_TYPE_F16) {
@@ -1119,38 +1103,28 @@ static void ggml_cuda_op_mul_mat_cublas(
to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
}
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F;
}
CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
CUBLAS_CHECK(
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
row_diff, src1_ncols, ne10,
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
src1_ptr, CUDA_R_16F, ne10,
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (compute_capability == GGML_CUDA_CC_CDNA) {
const float alpha = 1.0f;
const float beta = 0.0f;
CUBLAS_CHECK(
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
row_diff, src1_ncols, ne10,
&alpha, src0_ptr, CUDA_R_16F, ne00,
src1_ptr, CUDA_R_16F, ne10,
&beta, dst_dd_i, CUDA_R_32F, ldc,
CUBLAS_COMPUTE_32F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
ggml_cuda_pool_alloc<half> dst_f16(ctx.pool(id), row_diff*src1_ncols);
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
CUBLAS_CHECK(
cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
row_diff, src1_ncols, ne10,
&alpha_f16, src0_ptr, CUDA_R_16F, ne00,
src1_ptr, CUDA_R_16F, ne10,
&beta_f16, dst_f16.get(), CUDA_R_16F, ldc,
CUBLAS_COMPUTE_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
}
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
} else {
ggml_cuda_pool_alloc<float> src0_ddq_as_f32(ctx.pool(id));
ggml_cuda_pool_alloc<float> src1_ddq_as_f32(ctx.pool(id));
@@ -1223,7 +1197,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
cudaGetLastError();
}
} else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
@@ -1231,7 +1205,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
CUDA_CHECK(err);
} else {
// reset the error
(void)cudaGetLastError();
cudaGetLastError();
}
}
}
@@ -1639,6 +1613,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
cudaDataType_t cu_data_type = CUDA_R_16F;
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F;
}
// dst strides
size_t nbd2 = dst->nb[2];
size_t nbd3 = dst->nb[3];
@@ -1667,12 +1645,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
beta = &beta_f32;
}
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
cu_compute_type = CUBLAS_COMPUTE_32F;
alpha = &alpha_f32;
beta = &beta_f32;
}
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
@@ -2031,9 +2003,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_GET_ROWS:
ggml_cuda_op_get_rows(ctx, dst);
break;
case GGML_OP_GET_ROWS_BACK:
ggml_cuda_op_get_rows_back(ctx, dst);
break;
case GGML_OP_DUP:
ggml_cuda_dup(ctx, dst);
break;
@@ -2122,15 +2091,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_LEAKY_RELU:
ggml_cuda_op_leaky_relu(ctx, dst);
break;
case GGML_OP_SILU_BACK:
ggml_cuda_op_silu_back(ctx, dst);
break;
case GGML_OP_RMS_NORM:
ggml_cuda_op_rms_norm(ctx, dst);
break;
case GGML_OP_RMS_NORM_BACK:
ggml_cuda_op_rms_norm_back(ctx, dst);
break;
case GGML_OP_MUL_MAT:
if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) {
GGML_LOG_ERROR("%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]);
@@ -2175,9 +2138,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SOFT_MAX:
ggml_cuda_op_soft_max(ctx, dst);
break;
case GGML_OP_SOFT_MAX_BACK:
ggml_cuda_op_soft_max_back(ctx, dst);
break;
case GGML_OP_ROPE:
ggml_cuda_op_rope(ctx, dst);
break;
@@ -2466,7 +2426,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
if (stat == cudaErrorInvalidDeviceFunction) {
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
// We don't need to update blas nodes, so clear error and move on.
(void)cudaGetLastError();
cudaGetLastError();
} else {
GGML_ASSERT(stat == cudaSuccess);
}
@@ -2521,20 +2481,14 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,
static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
cudaGraphExecUpdateResultInfo result_info;
#ifdef __HIP_PLATFORM_AMD__
hipGraphNode_t errorNode;
hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
#else
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
#endif
if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
#endif
// The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate
(void)cudaGetLastError();
cudaGetLastError();
CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
cuda_ctx->cuda_graph->instance = nullptr;
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
@@ -2762,7 +2716,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error
(void)cudaGetLastError();
cudaGetLastError();
GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
@@ -2782,7 +2736,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
cudaError_t err = cudaHostUnregister(buffer);
if (err != cudaSuccess) {
// clear the error
(void)cudaGetLastError();
cudaGetLastError();
}
}
@@ -2958,7 +2912,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
} break;
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
case GGML_OP_GET_ROWS:
{
switch (op->src[0]->type) {
@@ -2974,10 +2928,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return false;
}
} break;
case GGML_OP_GET_ROWS_BACK:
{
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
} break;
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;
@@ -3036,7 +2986,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
case GGML_OP_REPEAT_BACK:
return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15);
return op->type == GGML_TYPE_F32 && op->src[0]->ne[3] == 1;
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
@@ -3051,12 +3001,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
return false;
} break;
case GGML_OP_SILU_BACK:
return ggml_is_contiguous(op->src[0]);
break;
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
case GGML_OP_RMS_NORM_BACK:
return ggml_is_contiguous(op->src[0]) && op->ne[0] % WARP_SIZE == 0;
break;
case GGML_OP_NONE:
@@ -3081,11 +3027,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
return true;
case GGML_OP_SOFT_MAX_BACK: {
float max_bias = 0.0f;
memcpy(&max_bias, (const float *) op->op_params + 1, sizeof(float));
return max_bias == 0.0f;
}
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK: {
const size_t ts = ggml_type_size(op->src[0]->type);

View File

@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
int64_t nwarps = 1;
int64_t rows_per_cuda_block = 1;
if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_RDNA2) { // NVIDIA and AMD older than RDNA2
if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
switch(ncols_y) {
case 1:
nwarps = 4;
@@ -166,7 +166,6 @@ static void mul_mat_vec_q_cuda(
break;
}
}
const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block;
const dim3 block_nums(nblocks, 1, 1);
const dim3 block_dims(WARP_SIZE, nwarps, 1);

View File

@@ -5,24 +5,20 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
x += int64_t(row)*ncols;
dst += int64_t(row)*ncols;
float2 mean_var = make_float2(0.0f, 0.0f);
float2 mean_var = make_float2(0.f, 0.f);
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[col];
const float xi = x[row*ncols + col];
mean_var.x += xi;
mean_var.y += xi * xi;
}
// sum up partial sums
mean_var = warp_reduce_sum(mean_var);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
if (block_size > WARP_SIZE) {
__shared__ float2 s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
int warp_id = threadIdx.x / WARP_SIZE;
int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = mean_var;
}
@@ -36,7 +32,7 @@ static __global__ void norm_f32(const float * x, float * dst, const int ncols, c
const float inv_std = rsqrtf(var + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[col] = (x[col] - mean) * inv_std;
dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
}
}
@@ -44,8 +40,14 @@ template <int block_size>
static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) {
// blockIdx.x: num_groups idx
// threadIdx.x: block_size idx
const int start = blockIdx.x*group_size + threadIdx.x;
const int end = min(blockIdx.x*group_size + group_size, ne_elements);
int start = blockIdx.x * group_size;
int end = start + group_size;
start += threadIdx.x;
if (end >= ne_elements) {
end = ne_elements;
}
float tmp = 0.0f; // partial sum for thread in warp
@@ -54,11 +56,10 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
}
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
if (block_size > WARP_SIZE) {
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
int warp_id = threadIdx.x / WARP_SIZE;
int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
@@ -67,11 +68,11 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp = warp_reduce_sum(tmp);
}
const float mean = tmp / group_size;
float mean = tmp / group_size;
tmp = 0.0f;
for (int j = start; j < end; j += block_size) {
const float xi = x[j] - mean;
float xi = x[j] - mean;
dst[j] = xi;
tmp += xi * xi;
}
@@ -79,8 +80,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp = warp_reduce_sum(tmp);
if (block_size > WARP_SIZE) {
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
int warp_id = threadIdx.x / WARP_SIZE;
int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
@@ -89,8 +90,8 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
tmp = warp_reduce_sum(tmp);
}
const float variance = tmp / group_size;
const float scale = rsqrtf(variance + eps);
float variance = tmp / group_size;
float scale = rsqrtf(variance + eps);
for (int j = start; j < end; j += block_size) {
dst[j] *= scale;
}
@@ -101,23 +102,19 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
x += int64_t(row)*ncols;
dst += int64_t(row)*ncols;
float tmp = 0.0f; // partial sum for thread in warp
for (int col = tid; col < ncols; col += block_size) {
const float xi = x[col];
const float xi = x[row*ncols + col];
tmp += xi * xi;
}
// sum up partial sums
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
if (block_size > WARP_SIZE) {
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
int warp_id = threadIdx.x / WARP_SIZE;
int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
@@ -130,63 +127,12 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
const float scale = rsqrtf(mean + eps);
for (int col = tid; col < ncols; col += block_size) {
dst[col] = scale * x[col];
}
}
template <int block_size>
static __global__ void rms_norm_back_f32(
const float * grad, const float * xf, float * dst, const int ncols, const float eps) {
const int row = blockIdx.x*blockDim.y + threadIdx.y;
const int tid = threadIdx.x;
grad += int64_t(row)*ncols;
xf += int64_t(row)*ncols;
dst += int64_t(row)*ncols;
float sum_xx = 0.0f; // sum for squares of x, equivalent to forward pass
float sum_xg = 0.0f; // sum for x * gradient, needed because RMS norm mixes inputs
for (int col = tid; col < ncols; col += block_size) {
const float xfi = xf[col];
sum_xx += xfi * xfi;
sum_xg += xfi * grad[col];
}
// sum up partial sums
sum_xx = warp_reduce_sum(sum_xx);
sum_xg = warp_reduce_sum(sum_xg);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
__shared__ float s_sum_xx[32];
__shared__ float s_sum_xg[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
if (lane_id == 0) {
s_sum_xx[warp_id] = sum_xx;
s_sum_xg[warp_id] = sum_xg;
}
__syncthreads();
sum_xx = s_sum_xx[lane_id];
sum_xx = warp_reduce_sum(sum_xx);
sum_xg = s_sum_xg[lane_id];
sum_xg = warp_reduce_sum(sum_xg);
}
const float mean_eps = sum_xx / ncols + eps;
const float sum_eps = sum_xx + ncols*eps;
const float scale_grad = rsqrtf(mean_eps);
const float scale_x = -scale_grad * sum_xg/sum_eps;
for (int col = tid; col < ncols; col += block_size) {
dst[col] = scale_grad*grad[col] + scale_x*xf[col];
dst[row*ncols + col] = scale * x[row*ncols + col];
}
}
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
@@ -196,8 +142,7 @@ static void norm_f32_cuda(const float * x, float * dst, const int ncols, const i
}
}
static void group_norm_f32_cuda(
const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const float eps, const int group_size, const int ne_elements, cudaStream_t stream) {
if (group_size < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
group_norm_f32<WARP_SIZE><<<num_groups, block_dims, 0, stream>>>(x, dst, group_size, ne_elements, eps);
@@ -208,6 +153,7 @@ static void group_norm_f32_cuda(
}
static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(x, dst, ncols, eps);
@@ -217,16 +163,6 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
}
}
static void rms_norm_back_f32_cuda(const float * grad, const float * xf, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) {
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_back_f32<WARP_SIZE><<<nrows, block_dims, 0, stream>>>(grad, xf, dst, ncols, eps);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_back_f32<1024><<<nrows, block_dims, 0, stream>>>(grad, xf, dst, ncols, eps);
}
}
void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
@@ -243,7 +179,6 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
}
@@ -263,7 +198,6 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
float eps;
memcpy(&eps, dst->op_params + 1, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], eps, group_size, ggml_nelements(src0), stream);
@@ -285,33 +219,6 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream);
}
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * grad = dst->src[0]; // gradients
const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass
const float * grad_d = (const float *) grad->data;
const float * src0f_d = (const float *) src0f->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(grad));
GGML_ASSERT( grad->type == GGML_TYPE_F32);
GGML_ASSERT(src0f->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0f->ne[0];
const int64_t nrows = ggml_nrows(src0f);
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
GGML_ASSERT(eps >= 0.0f);
rms_norm_back_f32_cuda(grad_d, src0f_d, dst_d, ne00, nrows, eps, stream);
}

View File

@@ -5,5 +5,3 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -11,15 +11,16 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ne01 == ne11);
GGML_ASSERT(ne0 == ne00);
GGML_ASSERT(ne1 == ne10);
GGML_ASSERT(ne2 % src0->ne[2] == 0);
GGML_ASSERT(ne3 % src0->ne[3] == 0);
GGML_ASSERT(ne2 == src0->ne[2]);
GGML_ASSERT(ne2 == src1->ne[2]);
GGML_ASSERT(ne3 == src0->ne[3]);
GGML_ASSERT(ne3 == src1->ne[3]);
const float * src0_d = (const float *) src0->data;
@@ -32,37 +33,19 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const float alpha = 1.0f;
const float beta = 0.0f;
GGML_ASSERT(ne2 == 1);
GGML_ASSERT(ne3 == 1);
CUBLAS_CHECK(cublasSetStream(handle, stream));
const int64_t lda = nb01 / sizeof(float);
const int64_t ldc = nb1 / sizeof(float);
const bool src1_T = ggml_is_transposed(src1);
const cublasOperation_t src1_cublas_op = src1_T ? CUBLAS_OP_N : CUBLAS_OP_T;
const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float);
GGML_ASSERT( (src1_T ? nb11 : nb10) == sizeof(float));
// data strides in dimensions 2/3
const size_t s02 = nb02 / sizeof(float);
const size_t s03 = nb03 / sizeof(float);
const size_t s12 = nb12 / sizeof(float);
const size_t s13 = nb13 / sizeof(float);
const size_t s2 = nb2 / sizeof(float);
const size_t s3 = nb3 / sizeof(float);
// dps == dst per src0, used for group query attention
const int64_t dps2 = ne2 / ne02;
const int64_t dps3 = ne3 / ne03;
// TODO batched matrix multiplication
for (int64_t i3 = 0; i3 < ne3; ++i3) {
for (int64_t i2 = 0; i2 < ne2; ++i2) {
CUBLAS_CHECK(
cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
ne0, ne1, ne01,
&alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda,
src1_d + i3 *s13 + i2 *s12, ldb,
&beta, dst_d + i3 *s3 + i2 *s2, ldc));
}
}
CUBLAS_CHECK(
cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op,
ne0, ne1, ne01,
&alpha, src0_d, ne00,
src1_d, ldb,
&beta, dst_d, ne0));
}

View File

@@ -39,9 +39,9 @@ static __device__ void rope_yarn(
template<bool forward, bool has_ff, typename T>
static __global__ void rope_norm(
const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
@@ -83,9 +83,9 @@ static __global__ void rope_norm(
template<bool forward, bool has_ff, typename T>
static __global__ void rope_neox(
const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims,
const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
@@ -127,9 +127,9 @@ static __global__ void rope_neox(
template<bool forward, bool has_ff, typename T>
static __global__ void rope_multi(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
const int n_dims, const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
@@ -187,9 +187,9 @@ static __global__ void rope_multi(
template<bool forward, bool has_ff, typename T>
static __global__ void rope_vision(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
const float theta_scale, const float * freq_factors, const mrope_sections sections) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims,
const int32_t * __restrict__ pos, const float freq_scale, const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
const float theta_scale, const float * __restrict__ freq_factors, const mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
if (i0 >= ne0) {
@@ -234,9 +234,9 @@ static __global__ void rope_vision(
template<bool forward, typename T>
static void rope_norm_cuda(
const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@@ -257,9 +257,9 @@ static void rope_norm_cuda(
template<bool forward, typename T>
static void rope_neox_cuda(
const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@@ -280,9 +280,9 @@ static void rope_neox_cuda(
template<bool forward, typename T>
static void rope_multi_cuda(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@@ -303,9 +303,9 @@ static void rope_multi_cuda(
template<bool forward, typename T>
static void rope_vision_cuda(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
const T * __restrict__ x, T * __restrict__ dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * __restrict__ pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * __restrict__ freq_factors, const mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);

View File

@@ -1,7 +1,5 @@
#include "common.cuh"
#include "ggml.h"
#include "softmax.cuh"
#include <cstdint>
template <typename T>
static __device__ __forceinline__ float t2f32(T val) {
@@ -13,20 +11,14 @@ __device__ float __forceinline__ t2f32<half>(half val) {
return __half2float(val);
}
template <bool use_shared, int ncols_template, int block_size_template, typename T>
static __global__ void soft_max_f32(
const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
template <bool vals_smem, int ncols_template, int block_size_template, typename T>
static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
const int tid = threadIdx.x;
const int rowx = blockIdx.x;
const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension
x += int64_t(rowx)*ncols;
mask += int64_t(rowy)*ncols * (mask != nullptr);
dst += int64_t(rowx)*ncols;
const int block_size = block_size_template == 0 ? blockDim.x : block_size_template;
const int warp_id = threadIdx.x / WARP_SIZE;
@@ -37,7 +29,7 @@ static __global__ void soft_max_f32(
extern __shared__ float data_soft_max_f32[];
float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
// shared memory buffer to cache values between iterations:
float * vals = use_shared ? buf_iw + WARP_SIZE : dst;
float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;
float max_val = -INFINITY;
@@ -49,7 +41,10 @@ static __global__ void soft_max_f32(
break;
}
const float val = x[col]*scale + (mask ? slope*t2f32(mask[col]) : 0.0f);
const int64_t ix = (int64_t)rowx*ncols + col;
const int64_t iy = (int64_t)rowy*ncols + col;
const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f);
vals[col] = val;
max_val = max(max_val, val);
@@ -115,29 +110,8 @@ static __global__ void soft_max_f32(
return;
}
dst[col] = vals[col] * inv_sum;
}
}
static __global__ void soft_max_back_f32(
const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {
const int tid = threadIdx.x;
const int rowx = blockIdx.x;
grad += int64_t(rowx)*ncols;
dstf += int64_t(rowx)*ncols;
dst += int64_t(rowx)*ncols;
float dgf_dot = 0.0f; // dot product of dst from forward pass and gradients
for (int col = tid; col < ncols; col += WARP_SIZE) {
dgf_dot += dstf[col]*grad[col];
}
dgf_dot = warp_reduce_sum(dgf_dot);
for (int col = tid; col < ncols; col += WARP_SIZE) {
dst[col] = scale * (grad[col] - dgf_dot) * dstf[col];
const int64_t idst = (int64_t)rowx*ncols + col;
dst[idst] = vals[col] * inv_sum;
}
}
@@ -147,7 +121,7 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const dim3 block_dims(nth, 1, 1);
const dim3 block_nums(nrows_x, 1, 1);
const size_t nbytes_shared = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
const uint32_t n_head = nrows_x/nrows_y;
@@ -157,68 +131,50 @@ static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, cons
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// FIXME: this limit could be raised by ~2-4x on Ampere or newer
if (nbytes_shared < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
switch (ncols_x) {
case 32:
soft_max_f32<true, 32, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 64:
soft_max_f32<true, 64, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 128:
soft_max_f32<true, 128, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 256:
soft_max_f32<true, 256, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 512:
soft_max_f32<true, 512, 512><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 1024:
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 2048:
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 4096:
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
default:
soft_max_f32<true, 0, 0><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
}
} else {
const size_t nbytes_shared_low = WARP_SIZE*sizeof(float);
soft_max_f32<false, 0, 0><<<block_nums, block_dims, nbytes_shared_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
const size_t shmem_low = WARP_SIZE*sizeof(float);
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
}
}
static void soft_max_back_f32_cuda(
const float * grad, const float * dstf, float * dst,
const int ncols, const int nrows, const float scale, cudaStream_t stream) {
const dim3 block_dims(WARP_SIZE, 1, 1);
const dim3 block_nums(nrows, 1, 1);
soft_max_back_f32<<<block_nums, block_dims, 0, stream>>>(grad, dstf, dst, ncols, scale);
}
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const float * src0_d = (const float *) src0->data;
const void * src1_d = src1 ? (const void *) src1->data : nullptr;
float * dst_d = (float *) dst->data;
const float * src0_d = (const float *)src0->data;
const void * src1_d = src1 ? (const void *)src1->data : nullptr;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
@@ -233,42 +189,18 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
if (use_f16) {
soft_max_f32_cuda(src0_d, (const half *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
const half * src1_dd = (const half *)src1_d;
soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
} else {
soft_max_f32_cuda(src0_d, (const float *) src1_d, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
const float * src1_dd = (const float *)src1_d;
soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
}
}
void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // grad
const ggml_tensor * src1 = dst->src[1]; // forward pass output
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ncols = src0->ne[0];
const int64_t nrows = ggml_nrows(src0);
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
GGML_ASSERT(max_bias == 0.0f);
soft_max_back_f32_cuda(src0_d, src1_d, dst_d, ncols, nrows, scale, stream);
}

View File

@@ -3,5 +3,3 @@
#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_soft_max_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -51,19 +51,6 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
dst[i] = x[i] / (1.0f + expf(-x[i]));
}
static __global__ void silu_back_f32(
const float * grad, const float * xf, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
const float xfi = xf[i];
const float s = 1.0f / (1.0f + expf(-xfi));
dst[i] = grad[i] * s * (1.0f + xfi * (1.0f - s));
}
static __global__ void tanh_f32(const float * x, float * dst, int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
@@ -186,11 +173,6 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void silu_back_f32_cuda(const float * grad, const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
silu_back_f32<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
}
static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -302,24 +284,6 @@ void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; // input from forward pass
const ggml_tensor * src1 = dst->src[1]; // grads of forward pass output
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;
float * dst_d = (float *) dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
silu_back_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(src0), stream);
}
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;

View File

@@ -4,7 +4,6 @@
#define CUDA_STEP_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_SILU_BACK_BLOCK_SIZE 256
#define CUDA_TANH_BLOCK_SIZE 256
#define CUDA_RELU_BLOCK_SIZE 256
#define CUDA_SIGMOID_BLOCK_SIZE 256
@@ -24,8 +23,6 @@ void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -19,12 +19,6 @@
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
#define cublasCreate hipblasCreate
@@ -80,21 +74,6 @@
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cuDeviceGet hipDeviceGet
#define CUdevice hipDevice_t
#define CUdeviceptr hipDeviceptr_t
#define cuMemUnmap hipMemUnmap
#define CUmemAccessDesc hipMemAccessDesc
#define cuMemAddressFree hipMemAddressFree
#define cuMemRelease hipMemRelease
#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
#define cuMemCreate hipMemCreate
#define cuMemAddressReserve hipMemAddressReserve
#define cuMemMap hipMemMap
#define cuMemSetAccess hipMemSetAccess
#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
#define CUmemAllocationProp hipMemAllocationProp
#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
@@ -102,28 +81,6 @@
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
#define cudaGraphExec_t hipGraphExec_t
#define cudaGraphNode_t hipGraphNode_t
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaKernelNodeParams hipKernelNodeParams
#define cudaGraphExecDestroy hipGraphExecDestroy
#define cudaGraphLaunch hipGraphLaunch
#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
#define cudaGraphNodeType hipGraphNodeType
#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
#define cudaGraphInstantiate hipGraphInstantiate
#define cudaStreamEndCapture hipStreamEndCapture
#define cudaGraphDestroy hipGraphDestroy
#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
#define cudaGraphNodeGetType hipGraphNodeGetType
#define cudaGraphGetNodes hipGraphGetNodes
#define cudaGraphExecUpdate hipGraphExecUpdate
#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
#define cudaStreamBeginCapture hipStreamBeginCapture
#define cudaGraph_t hipGraph_t
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap() do { abort(); __builtin_unreachable(); } while(0)

View File

@@ -92,14 +92,6 @@ if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (GGML_HIP_GRAPHS)
add_compile_definitions(GGML_HIP_GRAPHS)
endif()
if (GGML_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()
if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-hip PRIVATE hip::device)

View File

@@ -4416,6 +4416,7 @@ void kernel_mul_mv_q2_K_f32_impl(
device const half * dh = &x[ib].d;
for (int row = 0; row < N_DST; row++) {
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
for (int i = 0; i < 8; i += 2) {
@@ -4446,7 +4447,7 @@ void kernel_mul_mv_q2_K_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -4612,7 +4613,7 @@ void kernel_mul_mv_q3_K_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
if (tiisg == 0) {
for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < 2; ++row) {
dst_f32[first_row + row] = sumf1[row];
}
}
@@ -4728,7 +4729,7 @@ void kernel_mul_mv_q4_K_f32_impl(
device float * dst_f32 = (device float *) dst + (int64_t)im*args.ne0*args.ne1 + (int64_t)r1*args.ne0;
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -4860,7 +4861,7 @@ void kernel_mul_mv_q5_K_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < 2; ++row) {
const float tot = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = tot;
@@ -4905,10 +4906,6 @@ void kernel_mul_mv_q6_K_f32_impl(
const int row = 2*r0 + sgitg;
if (row >= args.ne0) {
return;
}
const uint i12 = im%args.ne12;
const uint i13 = im/args.ne12;
@@ -5064,7 +5061,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f;
@@ -5182,7 +5179,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f;
@@ -5292,7 +5289,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.5f;
@@ -5404,7 +5401,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -5517,7 +5514,7 @@ void kernel_mul_mv_iq2_s_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum * 0.25f;
@@ -5617,7 +5614,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -5712,7 +5709,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < N_DST && first_row + row < args.ne0; ++row) {
for (int row = 0; row < N_DST; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -5802,7 +5799,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < 2 && first_row + row < args.ne01; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;
@@ -5891,7 +5888,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;
for (int row = 0; row < 2 && first_row + row < args.ne0; ++row) {
for (int row = 0; row < 2; ++row) {
all_sum = simd_sum(sumf[row]);
if (tiisg == 0) {
dst_f32[first_row + row] = all_sum;

View File

@@ -181,7 +181,7 @@ struct ggml_backend_rpc_context {
struct ggml_backend_rpc_buffer_context {
std::shared_ptr<socket_t> sock;
void * base_ptr;
std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
uint64_t remote_ptr;
};
@@ -423,15 +423,16 @@ static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
if (ctx->base_ptr != nullptr) {
return ctx->base_ptr;
if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
return ctx->base_cache[buffer];
}
rpc_msg_buffer_get_base_req request = {ctx->remote_ptr};
rpc_msg_buffer_get_base_rsp response;
bool status = send_rpc_cmd(ctx->sock, RPC_CMD_BUFFER_GET_BASE, &request, sizeof(request), &response, sizeof(response));
GGML_ASSERT(status);
ctx->base_ptr = reinterpret_cast<void *>(response.base_ptr);
return ctx->base_ptr;
void * base_ptr = reinterpret_cast<void *>(response.base_ptr);
ctx->base_cache[buffer] = base_ptr;
return base_ptr;
}
static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
@@ -556,7 +557,7 @@ static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_back
if (response.remote_ptr != 0) {
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
ggml_backend_rpc_buffer_interface,
new ggml_backend_rpc_buffer_context{sock, nullptr, response.remote_ptr},
new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
response.remote_size);
return buffer;
} else {

View File

@@ -333,12 +333,8 @@ struct ggml_backend_sycl_context {
// pool
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);
static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
ggml_sycl_pool & pool(int device) {
if (pools[device] == nullptr) {
pools[device] = new_pool_for_device(stream(device,0), device);
@@ -349,15 +345,6 @@ struct ggml_backend_sycl_context {
ggml_sycl_pool & pool() {
return pool(device);
}
ggml_sycl_pool & host_pool(int device) {
if (host_pools[device] == nullptr) {
host_pools[device] = new_pool_for_host(stream(device, 0), device);
}
return *host_pools[device];
}
ggml_sycl_pool & host_pool() { return host_pool(device); }
};
// common device functions

View File

@@ -82,14 +82,6 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
return device_type.str();
}
template <typename Ts> struct matrix_info_t {
oneapi::mkl::transpose transpose_info[2];
Ts value_info[2];
std::int64_t size_info[3];
std::int64_t ld_info[3];
std::int64_t groupsize_info;
};
namespace dpct
{
typedef sycl::queue *queue_ptr;
@@ -1735,13 +1727,26 @@ namespace dpct
};
template <class Ta, class Tb, class Tc, class Ts>
inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
int ldb, const void * beta, void ** c, int ldc, int batch_size,
matrix_info_t<float> * matrix_info) {
inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
oneapi::mkl::transpose b_trans, int m, int n, int k,
const void *alpha, const void **a, int lda,
const void **b, int ldb, const void *beta, void **c,
int ldc, int batch_size)
{
struct matrix_info_t
{
oneapi::mkl::transpose transpose_info[2];
Ts value_info[2];
std::int64_t size_info[3];
std::int64_t ld_info[3];
std::int64_t groupsize_info;
};
Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
matrix_info_t *matrix_info =
(matrix_info_t *)std::malloc(sizeof(matrix_info_t));
matrix_info->transpose_info[0] = a_trans;
matrix_info->transpose_info[1] = b_trans;
matrix_info->value_info[0] = alpha_value;
@@ -1758,18 +1763,23 @@ namespace dpct
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast<const Ta **>(a),
matrix_info->ld_info, reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1,
&(matrix_info->groupsize_info));
#else
sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info,
reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
#endif
q.submit([&](sycl::handler &cgh)
{
cgh.depends_on(e);
cgh.host_task([=] { std::free(matrix_info); }); });
}
template <class Ta, class Tb, class Tc, class Ts>
@@ -2412,11 +2422,25 @@ namespace dpct
/// \param [in] ldc Leading dimension of C.
/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
/// \param [in] scaling_type Data type of the scaling factors.
inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
matrix_info_t<float> * matrix_info) {
inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
oneapi::mkl::transpose b_trans, int m, int n, int k,
const void *alpha, const void *a[],
library_data_t a_type, int lda, const void *b[],
library_data_t b_type, int ldb, const void *beta,
void *c[], library_data_t c_type, int ldc,
int batch_size, library_data_t scaling_type)
{
if (scaling_type == library_data_t::real_float &&
c_type == library_data_t::complex_float)
{
scaling_type = library_data_t::complex_float;
}
else if (scaling_type == library_data_t::real_double &&
c_type == library_data_t::complex_double)
{
scaling_type = library_data_t::complex_double;
}
std::uint64_t key =
detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
switch (key)
@@ -2425,24 +2449,48 @@ namespace dpct
library_data_t::real_float, library_data_t::real_float,
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<float, float, float, float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<float, float, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::real_double, library_data_t::real_double,
library_data_t::real_double, library_data_t::real_double):
{
detail::gemm_batch_impl<double, double, double, double>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<double, double, double, double>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::complex_float, library_data_t::complex_float,
library_data_t::complex_float, library_data_t::complex_float):
{
detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
std::complex<float>, std::complex<float>>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::complex_double, library_data_t::complex_double,
library_data_t::complex_double, library_data_t::complex_double):
{
detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
std::complex<double>, std::complex<double>>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::real_half, library_data_t::real_half,
library_data_t::real_half, library_data_t::real_half):
{
detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
#ifdef __INTEL_MKL__
@@ -2450,16 +2498,19 @@ namespace dpct
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_bfloat16, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
oneapi::mkl::bfloat16, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
library_data_t::real_bfloat16, library_data_t::real_bfloat16,
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
b, ldb, beta, c, ldc, batch_size);
break;
}
#endif
@@ -2471,9 +2522,10 @@ namespace dpct
dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
float beta_float =
dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, float>(
q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size,
matrix_info);
detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
float>(q, a_trans, b_trans, m, n, k, &alpha_float,
a, lda, b, ldb, &beta_float, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
@@ -2481,7 +2533,8 @@ namespace dpct
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
@@ -2489,7 +2542,8 @@ namespace dpct
library_data_t::real_float, library_data_t::real_float):
{
detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
batch_size);
break;
}
case detail::get_type_combination_id(
@@ -2503,7 +2557,8 @@ namespace dpct
sycl::half alpha_half(alpha_value);
sycl::half beta_half(beta_value);
detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info);
q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
batch_size);
break;
}
default:

View File

@@ -1173,85 +1173,6 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
}
};
struct ggml_sycl_pool_host : public ggml_sycl_pool {
queue_ptr qptr;
int device;
inline static int counter{ 0 };
struct ggml_sycl_buffer {
void * ptr = nullptr;
size_t size = 0;
};
// Set arbitrarly to 64
static constexpr int MAX_POOL_SIZE{ 64 };
std::vector<ggml_sycl_buffer> buffer_pool = std::vector<ggml_sycl_buffer>(MAX_POOL_SIZE);
size_t pool_size = 0;
explicit ggml_sycl_pool_host(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
~ggml_sycl_pool_host() {
for (int i = 0; i < MAX_POOL_SIZE; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
b.ptr = nullptr;
pool_size -= b.size;
b.size = 0;
}
}
counter = 0;
}
void * alloc(size_t size, size_t * actual_size) override {
if (counter == MAX_POOL_SIZE) {
ggml_sycl_buffer b = buffer_pool[0];
void * ptr = b.ptr;
*actual_size = b.size;
counter = 1;
return ptr;
}
ggml_sycl_buffer & b = buffer_pool[counter];
if (b.ptr == nullptr) {
void * ptr;
SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *) sycl::malloc_host(size, *qptr)));
if (!ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
return nullptr;
}
pool_size += size;
*actual_size = size;
counter = counter + 1;
return ptr;
} else {
++counter;
b.size = size;
return b.ptr;
}
}
void free(void * ptr, size_t size) override {
// if the pool is not completed add the pointer to it in place of the first nullptr found.
// Otherwise do nothing, pointers will be freed once the pool is deallocated.
for (int i = 0; i < MAX_POOL_SIZE; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr == nullptr) {
b.ptr = ptr;
b.size = size;
return;
}
}
}
};
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(queue_ptr qptr, int device) {
// return pool for the host to speed up memory management
return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_host(qptr, device));
}
std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
// TBD: NO VMM support
// if (ggml_sycl_info().devices[device].vmm) {
@@ -3442,7 +3363,6 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
ggml_sycl_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23);
ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);
sycl::range<3> block_dims(1, ne12, ne13);
/*
@@ -3471,10 +3391,14 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
});
}
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
*main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
(const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
(void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
*main_stream, oneapi::mkl::transpose::trans,
oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
(const void **)(ptrs_src.get() + 0 * ne23),
dpct::library_data_t::real_half, nb01 / nb00,
(const void **)(ptrs_src.get() + 1 * ne23),
dpct::library_data_t::real_half, nb11 / nb10, beta,
(void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
cu_compute_type)));
}
}
catch (sycl::exception const &exc) {

View File

@@ -29,6 +29,8 @@
#include "ggml-vulkan-shaders.hpp"
#define VK_API_VERSION VK_API_VERSION_1_2
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
#define VK_VENDOR_ID_AMD 0x1002
@@ -226,8 +228,6 @@ struct vk_device_struct {
vk_pipeline pipeline_repeat_f32;
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16;
vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT];
vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT];
vk_pipeline pipeline_norm_f32;
vk_pipeline pipeline_group_norm_f32;
vk_pipeline pipeline_rms_norm_f32;
@@ -384,13 +384,10 @@ struct vk_flash_attn_push_constants {
uint32_t nev3;
uint32_t nem1;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t nb21;
uint32_t nb22;
uint32_t nb23;
uint32_t nb31;
@@ -1612,7 +1609,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM(PIPELINE_NAME . f16acc, NAMELC, _f16acc, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
CREATE_MM(PIPELINE_NAME . f32acc, NAMELC, , WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
CREATE_MM(pipeline_matmul_f32, matmul_f32_f32, , wg_denoms, warptile, vk_mat_mat_push_constants, 3)
CREATE_MM(pipeline_matmul_f32_f16, matmul_f32_f16, , wg_denoms, warptile, vk_mat_mat_push_constants, 3)
CREATE_MM2(pipeline_matmul_f16, matmul_f16, wg_denoms, warptile, vk_mat_mat_push_constants, 3)
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3)
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
@@ -1625,18 +1626,21 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f16, _f16acc, mmq_wg_denoms_k, warptile_mmq_k, vk_mat_mat_push_constants, 3)
CREATE_MM(pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f16, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3)
CREATE_MM(pipeline_matmul_id_f32, matmul_id_f32_f32, , wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
CREATE_MM2(pipeline_matmul_id_f16, matmul_id_f16, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f16, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmqid_wg_denoms, warptile_mmqid, vk_mat_mat_id_push_constants, 4)
#undef CREATE_MM
#undef CREATE_MM2
} else
@@ -1673,31 +1677,31 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM2(pipeline_matmul_f16_f32, matmul_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 3, );
if (device->coopmat_acc_f16_support) {
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
} else {
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0].f16acc, matmul_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1].f16acc, matmul_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0].f16acc, matmul_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1].f16acc, matmul_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0].f16acc, matmul_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K].f16acc, matmul_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K].f16acc, matmul_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K].f16acc, matmul_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K].f16acc, matmul_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K].f16acc, matmul_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
CREATE_MM(pipeline_dequant_mul_mat_mat[GGML_TYPE_IQ4_NL].f16acc, matmul_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_push_constants, 3, );
}
// If there's not enough shared memory for row_ids and the result tile, don't create these pipelines.
@@ -1707,31 +1711,31 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_MM2(pipeline_matmul_id_f16_f32, matmul_id_f16_f32, wg_denoms, warptile, vk_mat_mat_push_constants, 4, _id);
if (device->coopmat_acc_f16_support) {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, _f16acc, wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
} else {
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0].f16acc, matmul_id_q4_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1].f16acc, matmul_id_q4_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0].f16acc, matmul_id_q5_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1].f16acc, matmul_id_q5_1_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0].f16acc, matmul_id_q8_0_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , mmq_wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K].f16acc, matmul_id_q2_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K].f16acc, matmul_id_q3_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K].f16acc, matmul_id_q4_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K].f16acc, matmul_id_q5_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K].f16acc, matmul_id_q6_k_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
CREATE_MM(pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL].f16acc, matmul_id_iq4_nl_f32, , wg_denoms, warptile_mmq, vk_mat_mat_id_push_constants, 4, _id);
}
}
#undef CREATE_MM2
@@ -1961,20 +1965,6 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_0], "cpy_f32_q4_0", cpy_f32_q4_0_len, cpy_f32_q4_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q4_1], "cpy_f32_q4_1", cpy_f32_q4_1_len, cpy_f32_q4_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_0], "cpy_f32_q5_0", cpy_f32_q5_0_len, cpy_f32_q5_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q5_1], "cpy_f32_q5_1", cpy_f32_q5_1_len, cpy_f32_q5_1_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_Q8_0], "cpy_f32_q8_0", cpy_f32_q8_0_len, cpy_f32_q8_0_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_quant[GGML_TYPE_IQ4_NL], "cpy_f32_iq4_nl", cpy_f32_iq4_nl_len, cpy_f32_iq4_nl_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_0], "cpy_q4_0_f32", cpy_q4_0_f32_len, cpy_q4_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q4_1], "cpy_q4_1_f32", cpy_q4_1_f32_len, cpy_q4_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q4_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_0], "cpy_q5_0_f32", cpy_q5_0_f32_len, cpy_q5_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q5_1], "cpy_q5_1_f32", cpy_q5_1_f32_len, cpy_q5_1_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q5_1), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_Q8_0], "cpy_q8_0_f32", cpy_q8_0_f32_len, cpy_q8_0_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_Q8_0), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_cpy_quant_f32[GGML_TYPE_IQ4_NL], "cpy_iq4_nl_f32", cpy_iq4_nl_f32_len, cpy_iq4_nl_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {(uint32_t)ggml_blck_size(GGML_TYPE_IQ4_NL), 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f32_norepeat, "add_f32_norepeat", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {1}, 1);
ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0}, 1);
@@ -2012,7 +2002,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_tanh_f32, "tanh_f32", tanh_f32_len, tanh_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true);
ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);
ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_wg512, "soft_max_f32_wg512", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
@@ -2278,14 +2268,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
}
#endif
VkPhysicalDeviceMaintenance4Features maint4_features {};
maint4_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_4_FEATURES;
if (maintenance4_support) {
last_struct->pNext = (VkBaseOutStructure *)&maint4_features;
last_struct = (VkBaseOutStructure *)&maint4_features;
device_extensions.push_back("VK_KHR_maintenance4");
}
vkGetPhysicalDeviceFeatures2(device->physical_device, &device_features2);
device->fp16 = device->fp16 && vk12_features.shaderFloat16;
@@ -2661,14 +2643,7 @@ void ggml_vk_instance_init() {
vk_instance_initialized = true;
uint32_t api_version = vk::enumerateInstanceVersion();
if (api_version < VK_API_VERSION_1_2) {
std::cerr << "ggml_vulkan: Error: Vulkan 1.2 required." << std::endl;
GGML_ABORT("fatal error");
}
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, api_version };
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, VK_API_VERSION };
const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
const bool validation_ext = ggml_vk_instance_validation_ext_available(instance_extensions);
@@ -2978,7 +2953,7 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_co
}
}
GGML_ASSERT(src1_type == GGML_TYPE_F32 || (ctx->device->coopmat2 && src1_type == GGML_TYPE_F16));
GGML_ASSERT(src1_type == GGML_TYPE_F32);
switch (src0_type) {
case GGML_TYPE_Q4_0:
@@ -3714,33 +3689,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_cpy_f16_f16;
}
}
if (src->type == GGML_TYPE_F32) {
switch (to) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return ctx->device->pipeline_cpy_f32_quant[to];
default:
break;
}
}
if (to == GGML_TYPE_F32) {
switch (src->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return ctx->device->pipeline_cpy_quant_f32[src->type];
default:
break;
}
}
std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
GGML_ABORT("fatal error");
@@ -3818,9 +3766,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
src1_uma = d_Qy != nullptr;
}
// Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf
const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) ||
!ggml_vk_dim01_contiguous(src0);
const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
// Reformat and convert to fp16 if src1 is non-contiguous, or for coopmat2 for better perf
const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) ||
!ggml_vk_dim01_contiguous(src1);
@@ -4400,11 +4347,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
ids_uma = d_ids != nullptr;
}
// Reformat and convert to fp16 if non-contiguous, or for coopmat2 for better perf
const bool x_non_contig = (ctx->device->coopmat2 && src0->type == GGML_TYPE_F32) ||
!ggml_vk_dim01_contiguous(src0);
const bool y_non_contig = (ctx->device->coopmat2 && src1->type == GGML_TYPE_F32) ||
!ggml_vk_dim01_contiguous(src1);
const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
@@ -4414,8 +4358,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
if (qx_needs_dequant) {
// Fall back to dequant + f16 mulmat
mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, GGML_TYPE_F16, y_f32_kernel ? GGML_TYPE_F32 : GGML_TYPE_F16, (ggml_prec)dst->op_params[0]);
GGML_ABORT("fatal error");
}
// Not implemented
@@ -4823,14 +4766,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
assert(pipelines);
const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type));
const uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type));
const uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type));
bool aligned = (KV % pipelines[1]->align) == 0 &&
// the "aligned" shader variant will forcibly align strides, for performance
(q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;
bool aligned = (KV % pipelines[1]->align) == 0;
vk_pipeline pipeline = pipelines[aligned];
assert(pipeline);
@@ -4866,15 +4802,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
if (ctx->device->uma) {
ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset);
ggml_vk_host_get(ctx->device, k->data, d_K, k_buf_offset);
ggml_vk_host_get(ctx->device, v->data, d_V, v_buf_offset);
ggml_vk_host_get(ctx->device, dst->data, d_D, d_buf_offset);
ggml_vk_host_get(ctx->device, k->data, d_K, q_buf_offset);
ggml_vk_host_get(ctx->device, v->data, d_V, q_buf_offset);
ggml_vk_host_get(ctx->device, dst->data, d_D, q_buf_offset);
Q_uma = d_Q != nullptr;
K_uma = d_K != nullptr;
V_uma = d_V != nullptr;
D_uma = d_D != nullptr;
if (mask) {
ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset);
ggml_vk_host_get(ctx->device, mask->data, d_M, q_buf_offset);
M_uma = d_M != nullptr;
}
}
@@ -4912,18 +4848,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
}
const vk_flash_attn_push_constants pc = { N, KV,
(uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
(uint32_t)neq2, (uint32_t)neq3,
(uint32_t)nek2, (uint32_t)nek3,
(uint32_t)nev2, (uint32_t)nev3,
nem1,
q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
nbm1,
scale, max_bias, logit_softcap,
mask != nullptr, n_head_log2, m0, m1 };
const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, nem1, (uint32_t)nbq2, (uint32_t)nbq3, (uint32_t)nbk2, (uint32_t)nbk3, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, mask != nullptr, n_head_log2, m0, m1 };
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
{
vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
@@ -5235,7 +5160,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
}
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3];
std::cerr << "), " << ggml_op_name(op) << ", " << (dryrun ? "dryrun" : "") << ")");
GGML_ASSERT(op == GGML_OP_GET_ROWS || op == GGML_OP_CPY || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
GGML_ASSERT(ggml_vk_op_supports_incontiguous(op) || ggml_vk_dim01_contiguous(src0)); // NOLINT
GGML_ASSERT(dst->buffer != nullptr);
const uint64_t ne00 = src0->ne[0];
@@ -7980,36 +7905,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
{
ggml_type src0_type = op->src[0]->type;
ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
if (src0_type == GGML_TYPE_F32) {
switch (src1_type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return true;
default:
break;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return true;
}
if (src1_type == GGML_TYPE_F32) {
switch (src0_type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_IQ4_NL:
return true;
default:
break;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
return true;
}
if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
return true;
}
@@ -8700,7 +8601,6 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
ggml_tensor * src0 = tensor->src[0];
ggml_tensor * src1 = tensor->src[1];
ggml_tensor * src2 = tensor->src[2];
ggml_tensor * src3 = tensor->src[3];
void * tensor_data = tensor->data;
@@ -8763,9 +8663,6 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
if (src2 != nullptr) {
std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
}
if (src3 != nullptr) {
std::cerr << "src3=" << src3 << " src3->name=" << src3->name << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
}
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
std::cerr << std::endl << "Result:" << std::endl;
ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3);
@@ -8810,9 +8707,6 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
if (src2 != nullptr) {
std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
}
if (src3 != nullptr) {
std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
}
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
std::cerr << std::endl << "Result:" << std::endl;
ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
@@ -8835,9 +8729,6 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
if (src2 != nullptr) {
std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
}
if (src3 != nullptr) {
std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
}
std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
std::cerr << std::endl << "Result:" << std::endl;
ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]);

View File

@@ -1,51 +0,0 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
#include "dequant_funcs.comp"
#if defined(DATA_A_IQ4_NL)
// 16 invocations needed for init_iq4nl_shmem
layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
#else
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
#endif
void main() {
#if defined(DATA_A_IQ4_NL)
init_iq4nl_shmem();
if (gl_LocalInvocationIndex.x != 0) {
return;
}
#endif
const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K;
if (idx >= p.ne) {
return;
}
uint dst_idx = get_doffset() + dst_idx(idx);
uint src_idx = src0_idx_quant(idx, QUANT_K);
const uint a_offset = 0;
const uint ib = src_idx;
const vec2 dm = get_dm(ib, a_offset);
[[unroll]] for (int j = 0; j < QUANT_K; j += 4) {
vec4 v = dequantize4(ib, j / QUANT_R, a_offset);
v = v * dm.x + vec4(dm.y);
#if QUANT_R == 2
data_d[dst_idx + j/2 + 0] = v[0];
data_d[dst_idx + j/2 + QUANT_K/2 + 0] = v[1];
data_d[dst_idx + j/2 + 1] = v[2];
data_d[dst_idx + j/2 + QUANT_K/2 + 1] = v[3];
#else
data_d[dst_idx + j + 0] = v[0];
data_d[dst_idx + j + 1] = v[1];
data_d[dst_idx + j + 2] = v[2];
data_d[dst_idx + j + 3] = v[3];
#endif
}
}

View File

@@ -1,237 +0,0 @@
#version 450
#include "types.comp"
#include "generic_unary_head.comp"
#if defined(DATA_A_IQ4_NL)
// 16 invocations needed for init_iq4nl_shmem
layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
#else
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
#endif
layout (binding = 0) readonly buffer S {float data_s[];};
layout (binding = 1) writeonly buffer Q {A_TYPE data_q[];};
#if defined(DATA_A_Q4_0)
void quantize(uint dst_idx, uint src_idx)
{
float amax = 0.0;
float vmax = 0.0;
[[unroll]] for (int j = 0; j < QUANT_K_Q4_0; ++j) {
const float v = data_s[src_idx + j];
if (amax < abs(v)) {
amax = abs(v);
vmax = v;
}
}
const float d = vmax / -8;
const float id = (d != 0.0) ? 1.0/d : 0.0;
data_q[dst_idx].d = float16_t(d);
[[unroll]] for (int j = 0; j < QUANT_K_Q4_0/2; ++j) {
const float x0 = data_s[src_idx + 0 + j]*id;
const float x1 = data_s[src_idx + QUANT_K_Q4_0/2 + j]*id;
const uint xi0 = min(15, int(x0 + 8.5));
const uint xi1 = min(15, int(x1 + 8.5));
data_q[dst_idx].qs[j] = uint8_t(xi0 | (xi1 << 4));
}
}
#endif
#if defined(DATA_A_Q4_1)
void quantize(uint dst_idx, uint src_idx)
{
float vmin = 1.0/0.0;
float vmax = -vmin;
[[unroll]] for (int j = 0; j < QUANT_K_Q4_1; ++j) {
const float v = data_s[src_idx + j];
if (v < vmin) vmin = v;
if (v > vmax) vmax = v;
}
const float d = (vmax - vmin) / ((1 << 4) - 1);
const float id = (d != 0.0) ? 1.0/d : 0.0;
data_q[dst_idx].d = float16_t(d);
data_q[dst_idx].m = float16_t(vmin);
[[unroll]] for (int j = 0; j < QUANT_K_Q4_1/2; ++j) {
const float x0 = (data_s[src_idx + 0 + j] - vmin)*id;
const float x1 = (data_s[src_idx + QUANT_K_Q4_1/2 + j] - vmin)*id;
const uint xi0 = min(15, int(x0 + 0.5));
const uint xi1 = min(15, int(x1 + 0.5));
data_q[dst_idx].qs[j] = uint8_t(xi0 | (xi1 << 4));
}
}
#endif
#if defined(DATA_A_Q5_0)
void quantize(uint dst_idx, uint src_idx)
{
float amax = 0.0;
float vmax = 0.0;
[[unroll]] for (int j = 0; j < QUANT_K_Q5_0; ++j) {
const float v = data_s[src_idx + j];
if (amax < abs(v)) {
amax = abs(v);
vmax = v;
}
}
const float d = vmax / -16;
const float id = (d != 0.0) ? 1.0/d : 0.0;
data_q[dst_idx].d = float16_t(d);
uint32_t qh = 0;
[[unroll]] for (int j = 0; j < QUANT_K_Q5_0/2; ++j) {
const float x0 = data_s[src_idx + 0 + j]*id;
const float x1 = data_s[src_idx + QUANT_K_Q5_0/2 + j]*id;
const uint xi0 = min(31, int(x0 + 16.5));
const uint xi1 = min(31, int(x1 + 16.5));
data_q[dst_idx].qs[j] = uint8_t((xi0 & 0xf) | ((xi1 & 0xf) << 4));
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
qh |= ((xi1 & 0x10u) >> 4) << (j + QUANT_K_Q5_0/2);
}
data_q[dst_idx].qh[0] = uint16_t(qh & 0xFFFF);
data_q[dst_idx].qh[1] = uint16_t(qh >> 16);
}
#endif
#if defined(DATA_A_Q5_1)
void quantize(uint dst_idx, uint src_idx)
{
float min = data_s[src_idx + 0];
float max = min;
[[unroll]] for (int j = 1; j < QUANT_K_Q5_1; ++j) {
const float v = data_s[src_idx + j];
min = v < min ? v : min;
max = v > max ? v : max;
}
const float d = (max - min) / 31;
const float id = (d != 0) ? 1.0/d : 0.0;
data_q[dst_idx].d = float16_t(d);
data_q[dst_idx].m = float16_t(min);
uint32_t qh = 0;
[[unroll]] for (int j = 0; j < QUANT_K_Q5_1/2; ++j) {
const float x0 = (data_s[src_idx + 0 + j] - min)*id;
const float x1 = (data_s[src_idx + QUANT_K_Q5_1/2 + j] - min)*id;
const uint xi0 = uint(x0 + 0.5);
const uint xi1 = uint(x1 + 0.5);
data_q[dst_idx].qs[j] = uint8_t((xi0 & 0xf) | ((xi1 & 0xf) << 4));
qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
qh |= ((xi1 & 0x10u) >> 4) << (j + QUANT_K_Q5_1/2);
}
data_q[dst_idx].qh = qh;
}
#endif
#if defined(DATA_A_Q8_0)
void quantize(uint dst_idx, uint src_idx)
{
float amax = 0.0; // absolute max
[[unroll]] for (int j = 0; j < QUANT_K_Q8_0; j++) {
const float v = data_s[src_idx + j];
amax = max(amax, abs(v));
}
const float d = amax / ((1 << 7) - 1);
const float id = (d != 0.0) ? 1.0/d : 0.0;
data_q[dst_idx].d = float16_t(d);
[[unroll]] for (int j = 0; j < QUANT_K_Q8_0; ++j) {
const float x0 = data_s[src_idx + j]*id;
data_q[dst_idx].qs[j] = int8_t(round(x0));
}
}
#endif
#if defined(DATA_A_IQ4_NL)
uint best_index(float x) {
if (x <= kvalues_iq4nl[0]) return 0;
if (x >= kvalues_iq4nl[15]) return 15;
int ml = 0, mu = 15;
while (mu-ml > 1) {
int mav = (ml+mu)/2;
if (x < kvalues_iq4nl[mav]) mu = mav; else ml = mav;
}
return x - kvalues_iq4nl[mu-1] < kvalues_iq4nl[mu] - x ? mu-1 : mu;
}
void quantize(uint dst_idx, uint src_idx)
{
float amax = 0.0;
float vmax = 0.0;
[[unroll]] for (int j = 0; j < QUANT_K_IQ4_NL; ++j) {
const float v = data_s[src_idx + j];
if (amax < abs(v)) {
amax = abs(v);
vmax = v;
}
}
float d = vmax / kvalues_iq4nl[0];
const float id = (d != 0.0) ? 1.0/d : 0.0;
float sumqx = 0, sumq2 = 0;
[[unroll]] for (int j = 0; j < QUANT_K_IQ4_NL/2; ++j) {
const float x0 = data_s[src_idx + 0 + j]*id;
const float x1 = data_s[src_idx + QUANT_K_IQ4_NL/2 + j]*id;
const uint xi0 = best_index(x0);
const uint xi1 = best_index(x1);
data_q[dst_idx].qs[j] = uint8_t(xi0 | (xi1 << 4));
const float v0 = kvalues_iq4nl[xi0];
const float v1 = kvalues_iq4nl[xi1];
const float w0 = data_s[src_idx + 0 + j]*data_s[src_idx + 0 + j];
const float w1 = data_s[src_idx + QUANT_K_IQ4_NL/2 + j]*data_s[src_idx + QUANT_K_IQ4_NL/2 + j];
sumqx += w0*v0*data_s[src_idx + j] + w1*v1*data_s[src_idx + QUANT_K_IQ4_NL/2 + j];
sumq2 += w0*v0*v0 + w1*v1*v1;
}
data_q[dst_idx].d = float16_t(sumq2 > 0 ? sumqx/sumq2 : d);
}
#endif
void main() {
#if defined(DATA_A_IQ4_NL)
init_iq4nl_shmem();
if (gl_LocalInvocationIndex.x != 0) {
return;
}
#endif
const uint idx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x * QUANT_K;
if (idx >= p.ne) {
return;
}
uint dst_idx = dst_idx_quant(idx, QUANT_K);
uint src_idx = get_aoffset() + src0_idx(idx);
quantize(dst_idx, src_idx);
}

View File

@@ -101,25 +101,19 @@ layout(buffer_reference, std430, buffer_reference_align = 4) buffer decodeBufQ2_
block_q2_K block;
};
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2_K_packed16 {
block_q2_K_packed16 block;
};
float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl);
const f16vec2 d = bl.block.d;
const uint idx = coordInBlock[1];
const uint iqs = idx;
const uint scalesi = (idx & 0xF0) >> 4; // 0..15
const uint qsshift = (idx & 0x60) >> 4; // 0,2,4,6
uint qs = uint32_t(bl16.block.qs[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]);
qs = (qs >> qsshift) & 0x0303;
qs = unpack8(qs)[idx & 1];
const uint qsi = (iqs / 128) * 32 + (iqs % 32); // 0..31
const uint scalesi = iqs / 16; // 0..15
const uint qsshift = ((iqs % 128) / 32) * 2; // 0,2,4,6
uint32_t qs = bl.block.qs[qsi];
const uint scales = bl.block.scales[scalesi];
float16_t ret = d.x * float16_t(scales & 0xF) * float16_t(qs) - d.y * float16_t(scales >> 4);
float16_t ret = d.x * float16_t(scales & 0xF) * float16_t((qs >> qsshift) & 3) - d.y * float16_t(scales >> 4);
return ret;
}
@@ -163,47 +157,39 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4
block_q4_K_packed16 block;
};
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed128 {
block_q4_K_packed128 block;
};
float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl);
decodeBufQ4_K_packed128 bl128 = decodeBufQ4_K_packed128(bl);
const uint idx = coordInBlock[1];
const uint b = (idx & 0x20) >> 5; // 0,1
const uint is = (idx & 0xE0) >> 5; // 0..7
uvec4 v = bl128.block.q4k[0];
const f16vec2 loadd = unpackFloat2x16(v.x);
const f16vec2 loadd = bl.block.d;
uint32_t sc;
uint32_t mbyte;
uint32_t scale0 = v.y;
uint32_t scale4 = v.z;
uint32_t scale8 = v.w;
uint32_t scidx0 = (is < 4) ? is : (is + 4);
uint32_t scidx1 = (is < 4) ? is : (is - 4);
uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint32_t scidxshift1 = (is < 4) ? 0 : 2;
uint32_t mbidx0 = is + 4;
uint32_t mbidx1 = (is < 4) ? is + 4 : is;
uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0;
uint32_t mbidxshift0 = (is < 4) ? 0 : 4;
uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint32_t mbidxshift1 = (is < 4) ? 0 : 2;
uint32_t sc_lo = scale0;
uint32_t mb_lo = scale4;
uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
sc = is < 4 ? sc_lo : sc_hi;
mbyte = is < 4 ? mb_lo : mb_hi;
sc = sc >> (8 * (is & 3));
mbyte = mbyte >> (8 * (is & 3));
sc &= 0x3F;
mbyte &= 0x3F;
sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1));
mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1));
const float16_t d = loadd.x * float16_t(sc);
const float16_t m = loadd.y * float16_t(mbyte);
uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
qs = (qs >> (b * 4 + 8 * (idx & 1))) & 0xF;
qs = (qs >> (b * 4)) & 0x0F0F;
qs = unpack8(qs)[idx & 1];
float16_t ret = d * float16_t(qs) - m;
@@ -218,53 +204,47 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5
block_q5_K_packed16 block;
};
layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed128 {
block_q5_K_packed128 block;
};
float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl);
decodeBufQ5_K_packed128 bl128 = decodeBufQ5_K_packed128(bl);
const uint idx = coordInBlock[1];
const uint b = (idx & 0x20) >> 5; // 0,1
const uint is = (idx & 0xE0) >> 5; // 0..7
uvec4 v = bl128.block.q5k[0];
const uint32_t hm = 0x0101 << is;
const f16vec2 loadd = unpackFloat2x16(v.x);
const f16vec2 loadd = bl.block.d;
uint32_t sc;
uint32_t mbyte;
uint32_t scale0 = v.y;
uint32_t scale4 = v.z;
uint32_t scale8 = v.w;
uint32_t scidx0 = (is < 4) ? is : (is + 4);
uint32_t scidx1 = (is < 4) ? is : (is - 4);
uint32_t scidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint32_t scidxshift1 = (is < 4) ? 0 : 2;
uint32_t mbidx0 = is + 4;
uint32_t mbidx1 = (is < 4) ? is + 4 : is;
uint32_t mbidxmask0 = (is < 4) ? 0xF : 0xF0;
uint32_t mbidxshift0 = (is < 4) ? 0 : 4;
uint32_t mbidxmask1 = (is < 4) ? 0x30 : 0xC0;
uint32_t mbidxshift1 = (is < 4) ? 0 : 2;
uint32_t sc_lo = scale0;
uint32_t mb_lo = scale4;
uint32_t sc_hi = (scale8 & 0x0F0F0F0F) | ((scale0 & 0xC0C0C0C0) >> 2);
uint32_t mb_hi = ((scale8 & 0xF0F0F0F0) >> 4) | ((scale4 & 0xC0C0C0C0) >> 2);
sc = is < 4 ? sc_lo : sc_hi;
mbyte = is < 4 ? mb_lo : mb_hi;
sc = sc >> (8 * (is & 3));
mbyte = mbyte >> (8 * (is & 3));
sc &= 0x3F;
mbyte &= 0x3F;
sc = uint8_t((bl.block.scales[scidx0] & 0xF) | ((bl.block.scales[scidx1] & scidxmask1) >> scidxshift1));
mbyte = uint8_t(((bl.block.scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((bl.block.scales[mbidx1] & mbidxmask1) >> mbidxshift1));
const float16_t d = loadd.x * float16_t(sc);
const float16_t m = loadd.y * float16_t(mbyte);
uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]);
qh = ((qh >> is) & 0x101) << 4;
qh = qh & hm;
qh = unpack8(qh)[idx & 1];
uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]);
qs = (qs >> (b * 4)) & 0x0F0F;
qs = unpack8(qs | qh)[idx & 1];
qs = unpack8(qs)[idx & 1];
float16_t ret = d * (float16_t(qs)) - m;
float16_t ret = d * (float16_t(qs) + (qh != 0 ? float16_t(16) : float16_t(0))) - m;
return ret;
}

View File

@@ -12,7 +12,7 @@ layout (push_constant) uniform parameter
#include "types.comp"
layout(local_size_x = 1, local_size_y = 512, local_size_z = 1) in;
layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};

View File

@@ -42,13 +42,10 @@ layout (push_constant) uniform parameter {
uint32_t nev3;
uint32_t nem1;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t nb21;
uint32_t nb22;
uint32_t nb23;
uint32_t nb31;
@@ -149,24 +146,7 @@ void main() {
tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D);
tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D);
// nb?1 are already divided by the type size and are in units of elements
uint32_t q_stride = p.nb01;
uint32_t k_stride = p.nb11;
uint32_t v_stride = p.nb21;
// hint to the compiler that strides are aligned for the aligned variant of the shader
if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
{
q_stride &= ~7;
#if !defined(BLOCK_SIZE)
k_stride &= ~7;
v_stride &= ~7;
#endif
}
tensorLayoutQ = setTensorLayoutStrideNV(tensorLayoutQ, q_stride, 1);
tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Q;
coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Q;
coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Qf16;
uint32_t q_offset = iq2*p.nb02+iq3*p.nb03;

View File

@@ -54,23 +54,3 @@ uint dst_idx(uint idx) {
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
}
uint src0_idx_quant(uint idx, uint qk) {
const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
const uint i02_offset = i02*p.ne01*p.ne00;
const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);
const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;
return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + (i00/qk)*p.nb00;
}
uint dst_idx_quant(uint idx, uint qk) {
const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;
const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
const uint i12_offset = i12*p.ne11*p.ne10;
const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);
const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;
return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + (i10/qk)*p.nb10;
}

View File

@@ -5,80 +5,6 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sccache1[BLOCK_SIZE/16][16];
shared FLOAT_TYPE sccache2[BLOCK_SIZE/16][16];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint v_im, const uint ix, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
barrier();
if (!all_threads) { // when we don't have enough blocks to use all threads
if (i < num_blocks_per_row) {
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
}
barrier();
if (i >= num_blocks_per_row)
continue;
} else {
const uint32_t scale = uint32_t(data_a[ib0 + i].scales[itid]);
sccache1[ix][itid] = FLOAT_TYPE(scale & 0xF);
sccache2[ix][itid] = FLOAT_TYPE((scale >> 4) & 0xF);
barrier();
}
const uint32_t qs_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
const vec4 qs_u32_0 = vec4(unpack8(qs_u32 & 0x03030303));
const vec4 qs_u32_2 = vec4(unpack8((qs_u32 >> 2) & 0x03030303));
const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303));
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]);
vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]);
vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]);
vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]);
vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]);
vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]);
vec2 b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]);
vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]);
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) {
sum1 = fma(FLOAT_TYPE(b0[l]), sccache1[ix][ 8*v_im] * qs_u32_0[l ],
fma(FLOAT_TYPE(b16[l]), sccache1[ix][1 + 8*v_im] * qs_u32_0[l+2],
fma(FLOAT_TYPE(b32[l]), sccache1[ix][2 + 8*v_im] * qs_u32_2[l ],
fma(FLOAT_TYPE(b48[l]), sccache1[ix][3 + 8*v_im] * qs_u32_2[l+2],
fma(FLOAT_TYPE(b64[l]), sccache1[ix][4 + 8*v_im] * qs_u32_4[l ],
fma(FLOAT_TYPE(b80[l]), sccache1[ix][5 + 8*v_im] * qs_u32_4[l+2],
fma(FLOAT_TYPE(b96[l]), sccache1[ix][6 + 8*v_im] * qs_u32_6[l ],
fma(FLOAT_TYPE(b112[l]), sccache1[ix][7 + 8*v_im] * qs_u32_6[l+2], sum1))))))));
sum2 = fma(FLOAT_TYPE(b0[l]), sccache2[ix][ 8*v_im],
fma(FLOAT_TYPE(b16[l]), sccache2[ix][1 + 8*v_im],
fma(FLOAT_TYPE(b32[l]), sccache2[ix][2 + 8*v_im],
fma(FLOAT_TYPE(b48[l]), sccache2[ix][3 + 8*v_im],
fma(FLOAT_TYPE(b64[l]), sccache2[ix][4 + 8*v_im],
fma(FLOAT_TYPE(b80[l]), sccache2[ix][5 + 8*v_im],
fma(FLOAT_TYPE(b96[l]), sccache2[ix][6 + 8*v_im],
fma(FLOAT_TYPE(b112[l]), sccache2[ix][7 + 8*v_im], sum2))))))));
}
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
}
}
}
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
@@ -88,28 +14,88 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - 8*v_im; // 0...7
const uint step = 8;
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - step*v_im; // 0...15 or 0...7
const uint l0 = 2*v_in; // 0...15
const uint q_offset = 32*v_im + l0;
const uint s_offset = 8*v_im;
const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
const uint nbr_par_th = num_blocks_per_row%it_size;
const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
uint i0 = 0;
[[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
calc_superblock(a_offset, b_offset, itid, v_im, ix, q_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
calc_superblock(a_offset, b_offset, itid, v_im, ix, q_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0];
uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1];
uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F;
uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F;
uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F;
uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F;
uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32));
uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32));
uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32));
uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32));
uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0];
uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8];
uvec2 qs0 = uvec2(unpack8(qs0_u16));
uvec2 qs16 = uvec2(unpack8(qs16_u16));
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]);
vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]);
vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]);
vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]);
vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]);
vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]);
vec2 b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]);
vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]);
FLOAT_TYPE sum1 = FLOAT_TYPE(0.0);
FLOAT_TYPE sum2 = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) {
sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3),
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3),
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3),
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3),
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l] >> 4) & 3),
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3),
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l] >> 6) & 3),
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1))))))));
sum2 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_hi4[0]),
fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_hi4[1]),
fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_hi4[2]),
fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_hi4[3]),
fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_hi4[0]),
fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_hi4[1]),
fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]),
fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2))))))));
}
temp[j][n] = fma(dall, sum1, fma(-dmin, sum2, temp[j][n]));
}
}
}
reduce_result(temp, d_offset, first_row, num_rows, tid);
}

View File

@@ -5,74 +5,6 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][2][8];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
void calc_superblock(const uint a_offset, const uint b_offset, const uint ix, const uint itid8, const uint v_im, const uint v_im4, const uint v_in, const uint32_t hm_m[4], const uint q_offset, const uint y_offset, const uint s_shift, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
if (!all_threads) { // when we don't have enough blocks to use all threads
barrier();
if (i < num_blocks_per_row)
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
barrier();
if (i >= num_blocks_per_row)
continue;
}
const uint32_t hmk = ~(uint32_t(data_a_packed16[ib0 + i].hmask[v_in]) | (uint32_t(data_a_packed16[ib0 + i].hmask[v_in + 8]) << 16));
const vec4 hmk_0 = vec4(unpack8(((hmk & hm_m[0]) >> ( v_im4)) << 2));
const vec4 hmk_1 = vec4(unpack8(((hmk & hm_m[1]) >> (1 + v_im4)) << 2));
const vec4 hmk_2 = vec4(unpack8(((hmk & hm_m[2]) >> (2 + v_im4)) << 2));
const vec4 hmk_3 = vec4(unpack8(((hmk & hm_m[3]) >> (3 + v_im4)) << 2));
// 0, 1, 16, 17
uint32_t qs_u32 = uint32_t(data_a[ib0 + i].qs[q_offset]) | (uint32_t(data_a[ib0 + i].qs[q_offset + 1]) << 8);
qs_u32 |= (uint32_t(data_a[ib0 + i].qs[q_offset + 16]) | (uint32_t(data_a[ib0 + i].qs[q_offset + 17]) << 8)) << 16;
const vec4 qs_u32_0 = vec4(unpack8(qs_u32 & 0x03030303));
const vec4 qs_u32_2 = vec4(unpack8((qs_u32 >> 2) & 0x03030303));
const vec4 qs_u32_4 = vec4(unpack8((qs_u32 >> 4) & 0x03030303));
const vec4 qs_u32_6 = vec4(unpack8((qs_u32 >> 6) & 0x03030303));
if (all_threads) {
barrier();
sccache[ix][v_im][itid8] = FLOAT_TYPE(int8_t(((data_a[ib0+i].scales[itid8] >> v_im4) & 0xF) | (((data_a[ib0+i].scales[itid8%4+8] >> s_shift) & 3) << 4)) - 32);
barrier();
}
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]);
vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]);
vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]);
vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]);
vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]);
vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]);
vec2 b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]);
vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]);
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) {
sum = fma(FLOAT_TYPE( b0[l]) * sccache[ix][v_im][0], qs_u32_0[l ] - hmk_0[l ],
fma(FLOAT_TYPE( b16[l]) * sccache[ix][v_im][1], qs_u32_0[l+2] - hmk_0[l+2],
fma(FLOAT_TYPE( b32[l]) * sccache[ix][v_im][2], qs_u32_2[l ] - hmk_1[l ],
fma(FLOAT_TYPE( b48[l]) * sccache[ix][v_im][3], qs_u32_2[l+2] - hmk_1[l+2],
fma(FLOAT_TYPE( b64[l]) * sccache[ix][v_im][4], qs_u32_4[l ] - hmk_2[l ],
fma(FLOAT_TYPE( b80[l]) * sccache[ix][v_im][5], qs_u32_4[l+2] - hmk_2[l+2],
fma(FLOAT_TYPE( b96[l]) * sccache[ix][v_im][6], qs_u32_6[l ] - hmk_3[l ],
fma(FLOAT_TYPE(b112[l]) * sccache[ix][v_im][7], qs_u32_6[l+2] - hmk_3[l+2], sum))))))));
}
temp[j][n] = fma(d, sum, temp[j][n]);
}
}
}
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
@@ -82,37 +14,76 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint itid8 = itid%8;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_im4 = v_im*4;
const uint v_in = itid - 8*v_im; // 0...7
const uint step = 8;
const uint32_t m = 0x01010101 << (4 * v_im);
uint32_t hm_m[4];
[[unroll]] for (uint j = 0; j < 4; ++j)
hm_m[j] = m << j;
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - step*v_im; // 0...15 or 0...7
const uint8_t m = uint8_t(1 << (4 * v_im));
const uint l0 = 2*v_in; // 0...15
const uint q_offset = 32*v_im + l0;
const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
const uint s_shift = v_im4 + 2*(itid8/4);
const uint s_shift = 4 * v_im;
const uint nbr_par_th = num_blocks_per_row%it_size;
const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
uint i0 = 0;
[[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
calc_superblock(a_offset, b_offset, ix, itid8, v_im, v_im4, v_in, hm_m, q_offset, y_offset, s_shift, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
calc_superblock(a_offset, b_offset, ix, itid8, v_im, v_im4, v_in, hm_m, q_offset, y_offset, s_shift, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
u8vec2 s0 = unpack8(s0_16);
u8vec2 s2 = unpack8(s2_16);
u8vec2 s4 = unpack8(s4_16);
u8vec2 s6 = unpack8(s6_16);
u8vec2 s8 = unpack8(s8_16);
u8vec2 s10 = unpack8(s10_16);
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 b0 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 0]);
vec2 b16 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 8]);
vec2 b32 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 16]);
vec2 b48 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 24]);
vec2 b64 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 32]);
vec2 b80 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 40]);
vec2 b96 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 48]);
vec2 b112 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y_idx) / 2 + 56]);
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 2; ++l) {
sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
}
temp[j][n] = fma(d, sum, temp[j][n]);
}
}
}
reduce_result(temp, d_offset, first_row, num_rows, tid);
}

View File

@@ -6,86 +6,6 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
const uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2;
const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F));
const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h));
const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
const FLOAT_TYPE sc2 = scale_0_4_l_f.z;
const FLOAT_TYPE sc3 = scale_0_4_l_f.w;
const FLOAT_TYPE sc4 = scale8_f.x;
const FLOAT_TYPE sc5 = scale8_f.y;
const FLOAT_TYPE sc6 = scale8_f.z;
const FLOAT_TYPE sc7 = scale8_f.w;
const uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
const uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
const uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
const uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
const uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
const uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
const vec4 qs0_lo4 = vec4(unpack8(qs0_u32_lo4));
const vec4 qs64_lo4 = vec4(unpack8(qs64_u32_lo4));
const vec4 qs0_hi4 = vec4(unpack8(qs0_u32_hi4));
const vec4 qs64_hi4 = vec4(unpack8(qs64_u32_hi4));
const FLOAT_TYPE q4_0 = qs0_lo4.x;
const FLOAT_TYPE q4_1 = qs0_lo4.y;
const FLOAT_TYPE q4_2 = qs0_lo4.z;
const FLOAT_TYPE q4_3 = qs0_lo4.w;
const FLOAT_TYPE q4_4 = qs0_hi4.x;
const FLOAT_TYPE q4_5 = qs0_hi4.y;
const FLOAT_TYPE q4_6 = qs0_hi4.z;
const FLOAT_TYPE q4_7 = qs0_hi4.w;
const FLOAT_TYPE q4_8 = qs64_lo4.x;
const FLOAT_TYPE q4_9 = qs64_lo4.y;
const FLOAT_TYPE q4_10 = qs64_lo4.z;
const FLOAT_TYPE q4_11 = qs64_lo4.w;
const FLOAT_TYPE q4_12 = qs64_hi4.x;
const FLOAT_TYPE q4_13 = qs64_hi4.y;
const FLOAT_TYPE q4_14 = qs64_hi4.z;
const FLOAT_TYPE q4_15 = qs64_hi4.w;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec4 by10 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 ]);
vec4 by132 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]);
vec4 by20 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 ]);
vec4 by232 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8]);
const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3)));
const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7)));
const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11)));
const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x), q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15)));
const FLOAT_TYPE smin =
fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7,
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
}
}
}
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
@@ -95,11 +15,13 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint il = itid/4; // 0...3
const uint ir = itid - 4*il; // 0...3
const uint step = 4;
const uint il = itid/step; // 0...3
const uint ir = itid - step*il; // 0...7 or 0...3
const uint n = 4;
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
@@ -109,14 +31,89 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size)
calc_superblock(a_offset, b_offset, v_im, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows);
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
uvec4 scale0 = uvec4(unpack8(scale0_u32));
uvec4 scale4 = uvec4(unpack8(scale4_u32));
uvec4 scale8 = uvec4(unpack8(scale8_u32));
const uint32_t sc0 = ( scale0.x & 0x3f);
const uint32_t sc1 = ( scale0.y & 0x3f);
const uint32_t sc2 = ( scale4.x & 0x3f);
const uint32_t sc3 = ( scale4.y & 0x3f);
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4];
uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16];
uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F;
uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F;
uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F;
uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F;
uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4));
uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4));
uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4));
uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4));
const uint32_t q4_0 = qs0_lo4.x;
const uint32_t q4_1 = qs0_lo4.y;
const uint32_t q4_2 = qs0_lo4.z;
const uint32_t q4_3 = qs0_lo4.w;
const uint32_t q4_4 = qs0_hi4.x;
const uint32_t q4_5 = qs0_hi4.y;
const uint32_t q4_6 = qs0_hi4.z;
const uint32_t q4_7 = qs0_hi4.w;
const uint32_t q4_8 = qs64_lo4.x;
const uint32_t q4_9 = qs64_lo4.y;
const uint32_t q4_10 = qs64_lo4.z;
const uint32_t q4_11 = qs64_lo4.w;
const uint32_t q4_12 = qs64_hi4.x;
const uint32_t q4_13 = qs64_hi4.y;
const uint32_t q4_14 = qs64_hi4.z;
const uint32_t q4_15 = qs64_hi4.w;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec4 by10 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 ]);
vec4 by132 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y1_idx) / 4 + 8]);
vec4 by20 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 ]);
vec4 by232 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y2_idx) / 4 + 8]);
const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3)));
const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7)));
const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11)));
const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x), q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15)));
const FLOAT_TYPE smin =
fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7,
fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7,
fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7,
fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7)))))))))))))));
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
}
}
}
reduce_result(temp, d_offset, first_row, num_rows, tid);
}

View File

@@ -6,118 +6,6 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
void calc_superblock(const uint a_offset, const uint b_offset, const uint v_im, const uint l0, const uint q_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
const uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
const uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
const uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
const uint32_t scale_0_4_l = (scale4_u32 << 16) | scale0_u32;
const uint32_t scale_0_4_h = (scale_0_4_l & 0xC0C0C0C0) >> 2;
const vec4 scale_0_4_l_f = vec4(unpack8(scale_0_4_l & 0x3F3F3F3F));
const vec4 scale8_f = vec4(unpack8((((scale8_u32 << 12) | scale8_u32) & 0x0F0F0F0F) | scale_0_4_h));
const FLOAT_TYPE sc0 = scale_0_4_l_f.x;
const FLOAT_TYPE sc1 = scale_0_4_l_f.y;
const FLOAT_TYPE sc2 = scale_0_4_l_f.z;
const FLOAT_TYPE sc3 = scale_0_4_l_f.w;
const FLOAT_TYPE sc4 = scale8_f.x;
const FLOAT_TYPE sc5 = scale8_f.y;
const FLOAT_TYPE sc6 = scale8_f.z;
const FLOAT_TYPE sc7 = scale8_f.w;
const uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
const uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
const uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
const uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
const uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
const uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010);
const uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
qs0_16_u32_lo4 += qs0_16_lo4_offset16;
qs0_16_u32_hi4 += qs0_16_hi4_offset16;
qs64_80_u32_lo4 += qs64_80_lo4_offset16;
qs64_80_u32_hi4 += qs64_80_hi4_offset16;
const vec4 qs0_16_lo4 = vec4(unpack8(qs0_16_u32_lo4));
const vec4 qs64_80_lo4 = vec4(unpack8(qs64_80_u32_lo4));
const vec4 qs0_16_hi4 = vec4(unpack8(qs0_16_u32_hi4));
const vec4 qs64_80_hi4 = vec4(unpack8(qs64_80_u32_hi4));
const FLOAT_TYPE q4_0 = qs0_16_lo4.x;
const FLOAT_TYPE q4_1 = qs0_16_lo4.y;
const FLOAT_TYPE q4_2 = qs0_16_lo4.z;
const FLOAT_TYPE q4_3 = qs0_16_lo4.w;
const FLOAT_TYPE q4_4 = qs0_16_hi4.x;
const FLOAT_TYPE q4_5 = qs0_16_hi4.y;
const FLOAT_TYPE q4_6 = qs0_16_hi4.z;
const FLOAT_TYPE q4_7 = qs0_16_hi4.w;
const FLOAT_TYPE q4_8 = qs64_80_lo4.x;
const FLOAT_TYPE q4_9 = qs64_80_lo4.y;
const FLOAT_TYPE q4_10 = qs64_80_lo4.z;
const FLOAT_TYPE q4_11 = qs64_80_lo4.w;
const FLOAT_TYPE q4_12 = qs64_80_hi4.x;
const FLOAT_TYPE q4_13 = qs64_80_hi4.y;
const FLOAT_TYPE q4_14 = qs64_80_hi4.z;
const FLOAT_TYPE q4_15 = qs64_80_hi4.w;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 by10 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 ]);
vec2 by116 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 8]);
vec2 by132 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16]);
vec2 by148 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24]);
vec2 by20 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 ]);
vec2 by216 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 8]);
vec2 by232 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16]);
vec2 by248 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24]);
const FLOAT_TYPE sx =
fma(FLOAT_TYPE(by10.x), q4_0,
fma(FLOAT_TYPE(by10.y), q4_1,
fma(FLOAT_TYPE(by116.x), q4_2,
FLOAT_TYPE(by116.y) * q4_3)));
const FLOAT_TYPE sy =
fma(FLOAT_TYPE(by132.x), q4_4,
fma(FLOAT_TYPE(by132.y), q4_5,
fma(FLOAT_TYPE(by148.x), q4_6,
FLOAT_TYPE(by148.y) * q4_7)));
const FLOAT_TYPE sz =
fma(FLOAT_TYPE(by20.x), q4_8,
fma(FLOAT_TYPE(by20.y), q4_9,
fma(FLOAT_TYPE(by216.x), q4_10,
FLOAT_TYPE(by216.y) * q4_11)));
const FLOAT_TYPE sw =
fma(FLOAT_TYPE(by232.x), q4_12,
fma(FLOAT_TYPE(by232.y), q4_13,
fma(FLOAT_TYPE(by248.x), q4_14,
FLOAT_TYPE(by248.y) * q4_15)));
const FLOAT_TYPE smin =
fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
}
}
}
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
@@ -127,11 +15,11 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint il = itid/4; // 0...3
const uint ir = itid - 4*il; // 0...3
const uint ir = itid - 4*il; // 0...7 or 0...3
const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
const uint v_in = il % 2;
@@ -140,14 +28,121 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
const uint q_offset = 32*v_im + l0;
const uint y_offset = 64*v_im + l0;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size)
calc_superblock(a_offset, b_offset, v_im, l0, q_offset, y_offset, i, num_blocks_per_row, first_row, num_rows);
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y1_idx = i * QUANT_K + y_offset;
const uint y2_idx = y1_idx + 128;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
vec2 d = vec2(data_a[ib0 + i].d);
const FLOAT_TYPE dall = FLOAT_TYPE(d.x);
const FLOAT_TYPE dmin = FLOAT_TYPE(d.y);
uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ];
uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2];
uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4];
uvec4 scale0 = uvec4(unpack8(scale0_u32));
uvec4 scale4 = uvec4(unpack8(scale4_u32));
uvec4 scale8 = uvec4(unpack8(scale8_u32));
const uint32_t sc0 = ( scale0.x & 0x3f);
const uint32_t sc1 = ( scale0.y & 0x3f);
const uint32_t sc2 = ( scale4.x & 0x3f);
const uint32_t sc3 = ( scale4.y & 0x3f);
const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2));
const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2));
const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2));
const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2));
uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16);
uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16);
uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F;
uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F;
uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F;
uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F;
uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8]));
uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4;
uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3;
uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0;
uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1;
qs0_16_u32_lo4 += qs0_16_lo4_offset16;
qs0_16_u32_hi4 += qs0_16_hi4_offset16;
qs64_80_u32_lo4 += qs64_80_lo4_offset16;
qs64_80_u32_hi4 += qs64_80_hi4_offset16;
uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4));
uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4));
uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4));
uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4));
const uint32_t q4_0 = qs0_16_lo4.x;
const uint32_t q4_1 = qs0_16_lo4.y;
const uint32_t q4_2 = qs0_16_lo4.z;
const uint32_t q4_3 = qs0_16_lo4.w;
const uint32_t q4_4 = qs0_16_hi4.x;
const uint32_t q4_5 = qs0_16_hi4.y;
const uint32_t q4_6 = qs0_16_hi4.z;
const uint32_t q4_7 = qs0_16_hi4.w;
const uint32_t q4_8 = qs64_80_lo4.x;
const uint32_t q4_9 = qs64_80_lo4.y;
const uint32_t q4_10 = qs64_80_lo4.z;
const uint32_t q4_11 = qs64_80_lo4.w;
const uint32_t q4_12 = qs64_80_hi4.x;
const uint32_t q4_13 = qs64_80_hi4.y;
const uint32_t q4_14 = qs64_80_hi4.z;
const uint32_t q4_15 = qs64_80_hi4.w;
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec2 by10 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 ]);
vec2 by116 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 8]);
vec2 by132 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 16]);
vec2 by148 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y1_idx) / 2 + 24]);
vec2 by20 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 ]);
vec2 by216 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 8]);
vec2 by232 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 16]);
vec2 by248 = vec2(data_b_v2[(j*p.batch_stride_b + b_offset + y2_idx) / 2 + 24]);
const FLOAT_TYPE sx =
fma(FLOAT_TYPE(by10.x), q4_0,
fma(FLOAT_TYPE(by10.y), q4_1,
fma(FLOAT_TYPE(by116.x), q4_2,
FLOAT_TYPE(by116.y) * q4_3)));
const FLOAT_TYPE sy =
fma(FLOAT_TYPE(by132.x), q4_4,
fma(FLOAT_TYPE(by132.y), q4_5,
fma(FLOAT_TYPE(by148.x), q4_6,
FLOAT_TYPE(by148.y) * q4_7)));
const FLOAT_TYPE sz =
fma(FLOAT_TYPE(by20.x), q4_8,
fma(FLOAT_TYPE(by20.y), q4_9,
fma(FLOAT_TYPE(by216.x), q4_10,
FLOAT_TYPE(by216.y) * q4_11)));
const FLOAT_TYPE sw =
fma(FLOAT_TYPE(by232.x), q4_12,
fma(FLOAT_TYPE(by232.y), q4_13,
fma(FLOAT_TYPE(by248.x), q4_14,
FLOAT_TYPE(by248.y) * q4_15)));
const FLOAT_TYPE smin =
fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2,
fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3,
fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6,
(FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7)));
temp[j][n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[j][n]));
}
}
}
reduce_result(temp, d_offset, first_row, num_rows, tid);
}

View File

@@ -6,77 +6,7 @@
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
shared FLOAT_TYPE sccache[BLOCK_SIZE/16][16];
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
void calc_superblock(const uint a_offset, const uint b_offset, const uint itid, const uint ix, const uint ql_offset, const uint qh_offset, const uint s_offset, const uint y_offset, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows, const bool all_threads) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
if (!all_threads) { // when we don't have enough blocks to use all threads
barrier();
if (i < num_blocks_per_row)
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
barrier();
if (i >= num_blocks_per_row)
continue;
}
const uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
const uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
const uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
const uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
const uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
const uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
const uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
const uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
const uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
const uint32_t qh4_u32 = (qh_u32 & 0x30303030);
const uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
const uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32;
const uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
const uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
const uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
const vec4 q0 = vec4(unpack8(q0_u32)) - 32;
const vec4 q1 = vec4(unpack8(q1_u32)) - 32;
const vec4 q2 = vec4(unpack8(q2_u32)) - 32;
const vec4 q3 = vec4(unpack8(q3_u32)) - 32;
if (all_threads) {
barrier();
sccache[ix][itid] = FLOAT_TYPE(data_a[ib0 + i].scales[itid]);
barrier();
}
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec4 by0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 ]);
vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8]);
vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]);
vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]);
FLOAT_TYPE sum[4] = {0, 0, 0, 0};
[[unroll]] for (uint l = 0; l < 4; ++l) {
sum[0] = fma(FLOAT_TYPE(by0[l]), q0[l], sum[0]);
sum[1] = fma(FLOAT_TYPE(by32[l]), q1[l], sum[1]);
sum[2] = fma(FLOAT_TYPE(by64[l]), q2[l], sum[2]);
sum[3] = fma(FLOAT_TYPE(by96[l]), q3[l], sum[3]);
}
temp[j][n] = fma(fma(sum[0], sccache[ix][s_offset], fma(sum[1], sccache[ix][s_offset + 2], fma(sum[2], sccache[ix][s_offset + 4], sum[3] * sccache[ix][s_offset + 6]))), d, temp[j][n]);
}
}
}
void compute_outputs(const uint first_row, const uint num_rows) {
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
uint a_offset, b_offset, d_offset;
get_offsets(a_offset, b_offset, d_offset);
@@ -85,11 +15,13 @@ void compute_outputs(const uint first_row, const uint num_rows) {
// 16 threads are used to process each block
const uint it_size = gl_WorkGroupSize.x/16;
const uint tid = gl_LocalInvocationID.x;
const uint itid = tid%16; // 0...15
const uint ix = tid/16;
const uint itid = tid%16; // 0...16
const uint ix = tid/16;
const uint v_im = itid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - 8*v_im; // 0...7
const uint step = 8;
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - step*v_im; // 0...15 or 0...7
const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28
const uint is = v_in / 4;
@@ -99,18 +31,68 @@ void compute_outputs(const uint first_row, const uint num_rows) {
const uint s_offset = 8*v_im + is;
const uint y_offset = 128*v_im + l0;
FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
temp[j][i] = FLOAT_TYPE(0);
}
}
const uint nbr_par_th = num_blocks_per_row%it_size;
const uint nbr_all_th = num_blocks_per_row - nbr_par_th;
uint i0 = 0;
[[unroll]] for (; i0 < nbr_all_th; i0 += it_size)
calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, true);
calc_superblock(a_offset, b_offset, itid, ix, ql_offset, qh_offset, s_offset, y_offset, i0 + ix, num_blocks_per_row, first_row, num_rows, false);
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;
[[unroll]] for (uint n = 0; n < num_rows; ++n) {
const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
FLOAT_TYPE scales[4];
scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]);
scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]);
scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]);
scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]);
uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16);
uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16);
uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F;
uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F;
uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F;
uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F;
uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16);
uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4;
uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2;
uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0;
uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2;
uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32;
uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32;
uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32;
uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32;
uvec4 q0 = uvec4(unpack8(q0_u32));
uvec4 q1 = uvec4(unpack8(q1_u32));
uvec4 q2 = uvec4(unpack8(q2_u32));
uvec4 q3 = uvec4(unpack8(q3_u32));
[[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
vec4 by0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 ]);
vec4 by32 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 8]);
vec4 by64 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 16]);
vec4 by96 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 24]);
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
[[unroll]] for (int l = 0; l < 4; ++l) {
sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32),
fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32),
fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32),
fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum))));
}
temp[j][n] += sum * d;
}
}
}
reduce_result(temp, d_offset, first_row, num_rows, tid);
}

View File

@@ -57,13 +57,17 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
#if QUANT_K > 1
#define DECODEFUNCA , dequantFuncA
#define MAT_A_TYPE float16_t
#include "dequant_funcs_cm2.comp"
#else
#define DECODEFUNCA
#define MAT_A_TYPE A_TYPE
#endif
#define MAT_B_TYPE B_TYPE
#ifdef MUL_MAT_ID
layout (binding = 3) readonly buffer IDS {int data_ids[];};
@@ -232,13 +236,16 @@ void main() {
for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
coopmat<MAT_A_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
coopmat<MAT_B_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
sum = coopMatMulAdd(mat_a, mat_b, sum);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
}
} else
#endif // !defined(MUL_MAT_ID)
@@ -254,8 +261,10 @@ void main() {
[[dont_unroll]]
for (uint block_k = start_k; block_k < end_k; block_k += BK) {
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
coopmat<MAT_A_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
coopmat<MAT_B_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a_ft;
coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b_ft;
// Clamping is expensive, so detect different code paths for each combination
// of A and B needing clamping.
@@ -272,12 +281,16 @@ void main() {
#else
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
#endif
sum = coopMatMulAdd(mat_a, mat_b, sum);
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
} else if (unclampedA && !unclampedB) {
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
sum = coopMatMulAdd(mat_a, mat_b, sum);
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
} else if (!unclampedA && unclampedB) {
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
#ifdef MUL_MAT_ID
@@ -285,12 +298,16 @@ void main() {
#else
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
#endif
sum = coopMatMulAdd(mat_a, mat_b, sum);
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
} else if (!unclampedA && !unclampedB) {
coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);
sum = coopMatMulAdd(mat_a, mat_b, sum);
mat_a_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA>(mat_a);
mat_b_ft = coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB>(mat_b);
sum = coopMatMulAdd(mat_a_ft, mat_b_ft, sum);
}
}
}

View File

@@ -227,11 +227,6 @@ struct block_q4_K_packed32
uint32_t qs[QUANT_K_Q4_K/2/4];
};
struct block_q4_K_packed128
{
uvec4 q4k[9];
};
#if defined(DATA_A_Q4_K)
#define QUANT_K QUANT_K_Q4_K
#define A_TYPE block_q4_K
@@ -257,11 +252,6 @@ struct block_q5_K_packed16
uint16_t qs[QUANT_K_Q5_K/2/2];
};
struct block_q5_K_packed128
{
uvec4 q5k[11];
};
#if defined(DATA_A_Q5_K)
#define QUANT_K QUANT_K_Q5_K
#define A_TYPE block_q5_K

View File

@@ -17,13 +17,13 @@
#include <cstring>
#include <cstdlib>
#include <cassert>
#include <algorithm>
#include <sys/stat.h>
#include <sys/types.h>
#ifdef _WIN32
#include <windows.h>
#include <direct.h> // For _mkdir on Windows
#include <algorithm> // For std::replace on w64devkit
#else
#include <unistd.h>
#include <sys/wait.h>
@@ -316,11 +316,8 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool
// For aligned matmul loads
std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? load_vec : "2";
// don't generate f32 variants for coopmat2
if (!coopmat2) {
string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
}
string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc);
if (tname != "f16" && tname != "f32") {
string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, {"B_IS_FLOAT", "1"}}), fp16, coopmat, coopmat2, f16acc);
@@ -420,11 +417,6 @@ void process_shaders() {
string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
for (std::string t : {"q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl"}) {
string_to_spv("cpy_f32_" + t, "copy_to_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("cpy_" + t + "_f32", "copy_from_quant.comp", {{"DATA_A_" + to_uppercase(t), "1"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
}
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
@@ -502,7 +494,6 @@ void write_output_files() {
fprintf(hdr, "#include <cstdint>\n\n");
fprintf(src, "#include \"%s\"\n\n", basename(target_hpp).c_str());
std::sort(shader_fnames.begin(), shader_fnames.end());
for (const auto& pair : shader_fnames) {
const std::string& name = pair.first;
#ifdef _WIN32

View File

@@ -3450,14 +3450,12 @@ struct ggml_tensor * ggml_soft_max_ext(
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
}
// ggml_soft_max_ext_back
// ggml_soft_max_back
static struct ggml_tensor * ggml_soft_max_ext_back_impl(
static struct ggml_tensor * ggml_soft_max_back_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
float scale,
float max_bias,
bool inplace) {
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -3465,28 +3463,21 @@ static struct ggml_tensor * ggml_soft_max_ext_back_impl(
result->src[0] = a;
result->src[1] = b;
memcpy((float *) result->op_params + 0, &scale, sizeof(float));
memcpy((float *) result->op_params + 1, &max_bias, sizeof(float));
return result;
}
struct ggml_tensor * ggml_soft_max_ext_back(
struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
float scale,
float max_bias) {
return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, false);
struct ggml_tensor * b) {
return ggml_soft_max_back_impl(ctx, a, b, false);
}
struct ggml_tensor * ggml_soft_max_ext_back_inplace(
struct ggml_tensor * ggml_soft_max_back_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
float scale,
float max_bias) {
return ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, true);
struct ggml_tensor * b) {
return ggml_soft_max_back_impl(ctx, a, b, true);
}
// ggml_rope
@@ -5085,10 +5076,10 @@ struct ggml_tensor * ggml_cross_entropy_loss_back(
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c) {
GGML_ASSERT(ggml_is_scalar(a));
GGML_ASSERT(ggml_are_same_shape(b, c));
GGML_ASSERT(ggml_are_same_shape(a, b));
GGML_ASSERT(ggml_is_scalar(c));
struct ggml_tensor * result = ggml_dup_tensor(ctx, b);
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
result->op = GGML_OP_CROSS_ENTROPY_LOSS_BACK;
result->src[0] = a;
@@ -5267,7 +5258,7 @@ static void ggml_sub_or_set(
}
static void ggml_compute_backward(
struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, const bool * grads_needed) {
struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i, bool * grads_needed) {
struct ggml_tensor * tensor = cgraph->nodes[i];
struct ggml_tensor * grad = ggml_graph_get_grad(cgraph, tensor);
@@ -5339,7 +5330,7 @@ static void ggml_compute_backward(
} break;
case GGML_OP_MUL: {
if (src0_needs_grads) {
ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, src1));
ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, src1, grad));
}
if (src1_needs_grads) {
struct ggml_tensor * tmp = ggml_mul(ctx, src0, grad);
@@ -5411,7 +5402,7 @@ static void ggml_compute_backward(
if (src0_needs_grads) {
float eps;
memcpy(&eps, tensor->op_params, sizeof(float));
ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, grad, src0, eps));
ggml_add_or_set(ctx, cgraph, isrc0, ggml_rms_norm_back(ctx, src0, grad, eps));
}
} break;
case GGML_OP_MUL_MAT: {
@@ -5431,25 +5422,21 @@ static void ggml_compute_backward(
// src1.shape [n,p,qq,rr]
if (src0_needs_grads) {
GGML_ASSERT(grad->ne[2] == src1->ne[2]);
GGML_ASSERT(grad->ne[3] == src1->ne[3]);
struct ggml_tensor * tmp =
struct ggml_tensor * s1_tg =
ggml_out_prod(ctx, // [n,m,qq,rr]
src1, // [n,p,qq,rr]
grad); // [m,p,qq,rr]
if (!ggml_are_same_shape(tmp, src0)) {
GGML_ASSERT(tmp->ne[0] == src0->ne[0]);
GGML_ASSERT(tmp->ne[1] == src0->ne[1]);
GGML_ASSERT(tmp->ne[3] == 1);
const int64_t nr2 = tmp->ne[2] / src0->ne[2];
const size_t nb2 = tmp->nb[2] * nr2;
const size_t nb3 = tmp->nb[2];
tmp = ggml_view_4d(ctx, tmp, src0->ne[0], src0->ne[1], src0->ne[2], nr2, tmp->nb[1], nb2, nb3, 0);
tmp = ggml_repeat_back(ctx, tmp, src0);
const int64_t qq = s1_tg->ne[2];
const int64_t rr = s1_tg->ne[3];
const int64_t q1 = src0->ne[2];
const int64_t r1 = src0->ne[3];
const bool ne2_broadcasted = qq > q1;
const bool ne3_broadcasted = rr > r1;
if (ne2_broadcasted || ne3_broadcasted) {
// sum broadcast repetitions of s1_tg into shape of src0
s1_tg = ggml_repeat_back(ctx, s1_tg, src0);
}
ggml_add_or_set(ctx, cgraph, isrc0, tmp);
ggml_add_or_set(ctx, cgraph, isrc0, s1_tg /*= [n,m,q1,r1]*/);
}
if (src1_needs_grads) {
ggml_add_or_set(ctx, cgraph, isrc1,
@@ -5518,9 +5505,7 @@ static void ggml_compute_backward(
if (src0_needs_grads) {
GGML_ASSERT(!cgraph->grads[isrc0] || ggml_is_contiguous(cgraph->grads[isrc0]));
GGML_ASSERT(ggml_is_contiguous(grad));
GGML_ASSERT(ggml_nelements(tensor) == ggml_nelements(src0));
ggml_add_or_set(ctx, cgraph, isrc0,
ggml_are_same_shape(tensor, src0) ? grad : ggml_reshape(ctx, grad, src0));
ggml_add_or_set(ctx, cgraph, isrc0, grad);
}
} break;
case GGML_OP_RESHAPE: {
@@ -5600,13 +5585,7 @@ static void ggml_compute_backward(
} break;
case GGML_OP_SOFT_MAX: {
if (src0_needs_grads) {
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (const float *) tensor->op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) tensor->op_params + 1, sizeof(float));
ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_ext_back(ctx, grad, tensor, scale, max_bias));
ggml_add_or_set(ctx, cgraph, isrc0, ggml_soft_max_back(ctx, grad, tensor));
}
GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
} break;
@@ -5647,7 +5626,7 @@ static void ggml_compute_backward(
const int32_t d1 = ggml_get_op_params_i32(tensor, 5);
const bool is_2D = ggml_get_op_params_i32(tensor, 6) == 1;
ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, grad, src0, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
ggml_add_or_set(ctx, cgraph, isrc1, ggml_im2col_back(ctx, src0, grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
}
} break;
case GGML_OP_POOL_2D: {
@@ -5690,7 +5669,7 @@ static void ggml_compute_backward(
} break;
case GGML_UNARY_OP_SILU: {
if (src0_needs_grads) {
ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, grad, src0));
ggml_add_or_set(ctx, cgraph, isrc0, ggml_silu_back(ctx, src0, grad));
}
} break;
case GGML_UNARY_OP_EXP: {
@@ -5707,7 +5686,7 @@ static void ggml_compute_backward(
} break;
case GGML_OP_CROSS_ENTROPY_LOSS: {
if (src0_needs_grads) {
ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, grad, src0, src1));
ggml_add_or_set(ctx, cgraph, isrc0, ggml_cross_entropy_loss_back(ctx, src0, src1, grad));
}
GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
} break;

View File

@@ -648,10 +648,6 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
ok = ok && data != nullptr;
if (ok) {
ggml_set_name(data, "GGUF tensor data binary blob");
}
// read the binary blob with the tensor data
ok = ok && gr.read(data->data, ctx->size);

Some files were not shown because too many files have changed in this diff Show More