mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
39 Commits
b2055
...
gg/flash-a
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b957b8f5f6 | ||
|
|
2e46013749 | ||
|
|
910b15bb40 | ||
|
|
8ad92dc1ec | ||
|
|
2ddc9bbef1 | ||
|
|
3d03bcb7af | ||
|
|
78df5527e4 | ||
|
|
d073e4f933 | ||
|
|
5fcb9c1c5a | ||
|
|
c6c1132e5e | ||
|
|
abeaf0d90e | ||
|
|
4794821a31 | ||
|
|
1db22d7032 | ||
|
|
134c81c78d | ||
|
|
0ad44baf33 | ||
|
|
8612864108 | ||
|
|
3a428a1097 | ||
|
|
ecc466a460 | ||
|
|
77f6976a87 | ||
|
|
b3dd7d975f | ||
|
|
6fea843b24 | ||
|
|
f9ca5dcbe8 | ||
|
|
40ea8cd1ac | ||
|
|
432ad04ffa | ||
|
|
d917746ddb | ||
|
|
1446a12b29 | ||
|
|
17720fad66 | ||
|
|
77d08f3272 | ||
|
|
a4b6341c7b | ||
|
|
f31955f5d1 | ||
|
|
8cde449b8b | ||
|
|
b97325800a | ||
|
|
52ae085750 | ||
|
|
528da7515e | ||
|
|
1173f49c3b | ||
|
|
a9681febd6 | ||
|
|
c3cdfffa88 | ||
|
|
fa7ebcca99 | ||
|
|
a1c004ef2e |
@@ -1,8 +1,8 @@
|
||||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
FROM intel/hpckit:$ONEAPI_VERSION as build
|
||||
|
||||
ARG LLAMA_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git
|
||||
|
||||
@@ -10,18 +10,16 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
|
||||
RUN mkdir build && \
|
||||
cd build && \
|
||||
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||
echo "LLAMA_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||
cmake --build . --config Release --target main
|
||||
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
|
||||
cmake --build . --config Release --target main server
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/build/bin/main /main
|
||||
COPY --from=build /app/build/bin/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
ARG UBUNTU_VERSION=jammy
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget
|
||||
|
||||
# Install Vulkan SDK
|
||||
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
||||
apt update -y && \
|
||||
apt-get install -y vulkan-sdk
|
||||
|
||||
# Build it
|
||||
WORKDIR /app
|
||||
COPY . .
|
||||
RUN mkdir build && \
|
||||
cd build && \
|
||||
cmake .. -DLLAMA_VULKAN=1 && \
|
||||
cmake --build . --config Release --target main
|
||||
|
||||
# Clean up
|
||||
WORKDIR /
|
||||
RUN cp /app/build/bin/main /main && \
|
||||
rm -rf /app
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
||||
@@ -1,8 +1,8 @@
|
||||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
FROM intel/hpckit:$ONEAPI_VERSION as build
|
||||
|
||||
ARG LLAMA_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git
|
||||
|
||||
@@ -10,16 +10,13 @@ WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
|
||||
RUN mkdir build && \
|
||||
cd build && \
|
||||
if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
||||
echo "LLAMA_SYCL_F16 is set" && \
|
||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
||||
fi && \
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||
cmake --build . --config Release --target server
|
||||
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
|
||||
cmake --build . --config Release --target main server
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/build/bin/server /server
|
||||
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
ARG UBUNTU_VERSION=jammy
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget
|
||||
|
||||
# Install Vulkan SDK
|
||||
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
||||
apt update -y && \
|
||||
apt-get install -y vulkan-sdk
|
||||
|
||||
# Build it
|
||||
WORKDIR /app
|
||||
COPY . .
|
||||
RUN mkdir build && \
|
||||
cd build && \
|
||||
cmake .. -DLLAMA_VULKAN=1 && \
|
||||
cmake --build . --config Release --target server
|
||||
|
||||
# Clean up
|
||||
WORKDIR /
|
||||
RUN cp /app/build/bin/server /server && \
|
||||
rm -rf /app
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
||||
6
.github/workflows/build.yml
vendored
6
.github/workflows/build.yml
vendored
@@ -356,8 +356,6 @@ jobs:
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'vulkan'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -408,7 +406,7 @@ jobs:
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
|
||||
if: ${{ matrix.build == 'kompute' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
@@ -453,7 +451,7 @@ jobs:
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
|
||||
if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
@@ -100,10 +100,6 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS"
|
||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
|
||||
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
|
||||
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
|
||||
option(LLAMA_VULKAN_VALIDATE "llama: enable Vulkan validation" OFF)
|
||||
option(LLAMA_VULKAN_RUN_TESTS "llama: run Vulkan tests" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||
@@ -427,7 +423,10 @@ if (LLAMA_VULKAN)
|
||||
if (Vulkan_FOUND)
|
||||
message(STATUS "Vulkan found")
|
||||
|
||||
add_library(ggml-vulkan OBJECT ggml-vulkan.cpp ggml-vulkan.h)
|
||||
set(GGML_HEADERS_VULKAN ggml-vulkan.h)
|
||||
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
|
||||
|
||||
add_library(ggml-vulkan STATIC ggml-vulkan.cpp ggml-vulkan.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml-vulkan PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
@@ -435,22 +434,6 @@ if (LLAMA_VULKAN)
|
||||
|
||||
add_compile_definitions(GGML_USE_VULKAN)
|
||||
|
||||
if (LLAMA_VULKAN_CHECK_RESULTS)
|
||||
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_CHECK_RESULTS)
|
||||
endif()
|
||||
|
||||
if (LLAMA_VULKAN_DEBUG)
|
||||
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_DEBUG)
|
||||
endif()
|
||||
|
||||
if (LLAMA_VULKAN_VALIDATE)
|
||||
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_VALIDATE)
|
||||
endif()
|
||||
|
||||
if (LLAMA_VULKAN_RUN_TESTS)
|
||||
target_compile_definitions(ggml-vulkan PRIVATE GGML_VULKAN_RUN_TESTS)
|
||||
endif()
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-vulkan)
|
||||
else()
|
||||
message(WARNING "Vulkan not found")
|
||||
@@ -1029,6 +1012,7 @@ add_library(ggml OBJECT
|
||||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
@@ -1110,7 +1094,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_VULKAN}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
|
||||
17
Makefile
17
Makefile
@@ -457,18 +457,6 @@ ifdef LLAMA_VULKAN_CHECK_RESULTS
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
|
||||
endif
|
||||
|
||||
ifdef LLAMA_VULKAN_DEBUG
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
|
||||
endif
|
||||
|
||||
ifdef LLAMA_VULKAN_VALIDATE
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
|
||||
endif
|
||||
|
||||
ifdef LLAMA_VULKAN_RUN_TESTS
|
||||
MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
|
||||
endif
|
||||
|
||||
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # LLAMA_VULKAN
|
||||
@@ -598,11 +586,8 @@ train.o: common/train.cpp common/train.h
|
||||
libllama.so: llama.o ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
|
||||
ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS)
|
||||
|
||||
clean:
|
||||
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
|
||||
#
|
||||
# Examples
|
||||
|
||||
168
README-sycl.md
168
README-sycl.md
@@ -1,15 +1,22 @@
|
||||
# llama.cpp for SYCL
|
||||
|
||||
- [Background](#background)
|
||||
- [OS](#os)
|
||||
- [Intel GPU](#intel-gpu)
|
||||
- [Docker](#docker)
|
||||
- [Linux](#linux)
|
||||
- [Windows](#windows)
|
||||
- [Environment Variable](#environment-variable)
|
||||
- [Known Issue](#known-issue)
|
||||
- [Q&A](#q&a)
|
||||
- [Todo](#todo)
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Windows](#windows)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Q&A](#q&a)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
@@ -29,65 +36,20 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Support|Windows 11|
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
### Verified
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770, 730M|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
|
||||
|
||||
### Memory
|
||||
|
||||
The memory is a limitation to run LLM on GPUs.
|
||||
|
||||
When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`.
|
||||
|
||||
For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+.
|
||||
|
||||
For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
|
||||
|
||||
## Docker
|
||||
|
||||
Note:
|
||||
- Only docker on Linux is tested. Docker on WSL may not work.
|
||||
- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
|
||||
|
||||
### Build the image
|
||||
|
||||
You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
|
||||
|
||||
|
||||
```sh
|
||||
# For F16:
|
||||
#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
|
||||
|
||||
# Or, for F32:
|
||||
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
|
||||
|
||||
# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
|
||||
```
|
||||
|
||||
### Run
|
||||
|
||||
```sh
|
||||
# Firstly, find all the DRI cards:
|
||||
ls -la /dev/dri
|
||||
# Then, pick the card that you want to use.
|
||||
|
||||
# For example with "/dev/dri/card1"
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
|
||||
## Linux
|
||||
|
||||
@@ -101,7 +63,7 @@ Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```sh
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
@@ -110,7 +72,7 @@ Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```sh
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
@@ -128,6 +90,7 @@ Platform #0: Intel(R) OpenCL HD Graphics
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
@@ -136,13 +99,13 @@ Following guide use the default folder as example. If you use other folder, plea
|
||||
|
||||
b. Check
|
||||
|
||||
```sh
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
@@ -155,25 +118,21 @@ Output (example):
|
||||
|
||||
2. Build locally:
|
||||
|
||||
Note:
|
||||
- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
```sh
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
# For FP16:
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
#for FP16
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
# Or, for FP32:
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# Build example/main only
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
# Or, build all binary
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
cd ..
|
||||
@@ -181,16 +140,18 @@ cd ..
|
||||
|
||||
or
|
||||
|
||||
```sh
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
@@ -201,10 +162,10 @@ source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```sh
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
# or running the "main" executable and look at the output log:
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
@@ -233,13 +194,13 @@ found 4 SYCL devices:
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```sh
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```sh
|
||||
./examples/sycl/run_llama2.sh
|
||||
```
|
||||
./examples/sycl/run-llama2.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
@@ -262,13 +223,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
|
||||
Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
|
||||
|
||||
Note: **The driver is mandatory for compute function**.
|
||||
|
||||
2. Install Visual Studio.
|
||||
|
||||
Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows.
|
||||
|
||||
3. Install Intel® oneAPI Base toolkit.
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
@@ -297,7 +252,7 @@ In oneAPI command line:
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
@@ -305,21 +260,15 @@ Output (example):
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
|
||||
|
||||
```
|
||||
|
||||
4. Install cmake & make
|
||||
3. Install cmake & make
|
||||
|
||||
a. Download & install cmake for Windows: https://cmake.org/download/
|
||||
a. Download & install cmake for windows: https://cmake.org/download/
|
||||
|
||||
b. Download & install make for Windows provided by mingw-w64
|
||||
b. Download & install make for windows provided by mingw-w64: https://www.mingw-w64.org/downloads/
|
||||
|
||||
- Download binary package for Windows in https://github.com/niXman/mingw-builds-binaries/releases.
|
||||
|
||||
Like [x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z](https://github.com/niXman/mingw-builds-binaries/releases/download/13.2.0-rt_v11-rev1/x86_64-13.2.0-release-win32-seh-msvcrt-rt_v11-rev1.7z).
|
||||
|
||||
- Unzip the binary package. In the **bin** sub-folder and rename **xxx-make.exe** to **make.exe**.
|
||||
|
||||
- Add the **bin** folder path in the Windows system PATH environment.
|
||||
|
||||
### Build locally:
|
||||
|
||||
@@ -360,8 +309,6 @@ Note:
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example.
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
- In Search, input 'oneAPI'.
|
||||
@@ -458,7 +405,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap** or **--mmap 0**.
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Q&A
|
||||
|
||||
@@ -472,25 +419,8 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
- Meet compile error.
|
||||
|
||||
Remove folder **build** and try again.
|
||||
|
||||
- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux.
|
||||
|
||||
Please run **sudo sycl-ls**.
|
||||
|
||||
If you see it in result, please add video/render group to your ID:
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Then **relogin**.
|
||||
|
||||
If you do not see it, please check the installation GPU steps again.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
||||
|
||||
65
README.md
65
README.md
@@ -143,7 +143,6 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
|
||||
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
|
||||
- [iohub/collama](https://github.com/iohub/coLLaMA)
|
||||
- [pythops/tenere](https://github.com/pythops/tenere)
|
||||
|
||||
---
|
||||
|
||||
@@ -394,28 +393,28 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
Check [BLIS.md](docs/BLIS.md) for more information.
|
||||
|
||||
- #### SYCL
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
||||
|
||||
llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
|
||||
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
|
||||
|
||||
- #### Intel oneMKL
|
||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
|
||||
|
||||
- Using manual oneAPI installation:
|
||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
|
||||
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
- Using oneAPI docker image:
|
||||
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
|
||||
If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
|
||||
|
||||
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
|
||||
|
||||
@@ -602,48 +601,14 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
You can get a list of platforms and devices from the `clinfo -l` command, etc.
|
||||
|
||||
- #### Vulkan
|
||||
- #### SYCL
|
||||
|
||||
**With docker**:
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
||||
|
||||
You don't need to install Vulkan SDK. It will be installed inside the container.
|
||||
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
|
||||
```sh
|
||||
# Build the image
|
||||
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
|
||||
For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
|
||||
|
||||
# Then, use it:
|
||||
docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
|
||||
**Without docker**:
|
||||
|
||||
Firstly, you need to make sure you installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
|
||||
|
||||
For example, on Ubuntu 22.04 (jammy), use the command below:
|
||||
|
||||
```bash
|
||||
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
|
||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
|
||||
apt update -y
|
||||
apt-get install -y vulkan-sdk
|
||||
# To verify the installation, use the command below:
|
||||
vulkaninfo
|
||||
```
|
||||
|
||||
Then, build llama.cpp using the cmake command below:
|
||||
|
||||
```bash
|
||||
mkdir -p build
|
||||
cd build
|
||||
cmake .. -DLLAMA_VULKAN=1
|
||||
cmake --build . --config Release
|
||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||
./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||
|
||||
# You should see in the output, ggml_vulkan detected your GPU. For example:
|
||||
# ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
|
||||
```
|
||||
|
||||
### Prepare Data & Run
|
||||
|
||||
|
||||
@@ -515,7 +515,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
||||
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-scaled") {
|
||||
if (++i >= argc) {
|
||||
@@ -527,7 +527,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
@@ -664,7 +664,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.antiprompt.emplace_back(argv[i]);
|
||||
params.antiprompt.push_back(argv[i]);
|
||||
} else if (arg == "-ld" || arg == "--logdir") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -880,7 +880,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
|
||||
if (!params.kv_overrides.empty()) {
|
||||
params.kv_overrides.emplace_back();
|
||||
params.kv_overrides.emplace_back(llama_model_kv_override());
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -75,7 +75,8 @@ struct gpt_params {
|
||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
|
||||
// pinging @cebtenzzre
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
||||
@@ -203,8 +203,6 @@ class Model:
|
||||
return CodeShellModel
|
||||
if model_architecture == "OrionForCausalLM":
|
||||
return OrionModel
|
||||
if model_architecture == "InternLM2ForCausalLM":
|
||||
return InternLM2Model
|
||||
return Model
|
||||
|
||||
def _is_model_safetensors(self) -> bool:
|
||||
@@ -256,8 +254,6 @@ class Model:
|
||||
return gguf.MODEL_ARCH.CODESHELL
|
||||
if arch == "OrionForCausalLM":
|
||||
return gguf.MODEL_ARCH.ORION
|
||||
if arch == "InternLM2ForCausalLM":
|
||||
return gguf.MODEL_ARCH.INTERNLM2
|
||||
|
||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||
|
||||
@@ -1138,7 +1134,7 @@ class GPT2Model(Model):
|
||||
|
||||
for name, data_torch in self.get_tensors():
|
||||
# we don't need these
|
||||
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
|
||||
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
|
||||
continue
|
||||
|
||||
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
|
||||
@@ -1348,154 +1344,6 @@ class CodeShellModel(Model):
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
|
||||
class InternLM2Model(Model):
|
||||
def set_vocab(self):
|
||||
# (TODO): Is there a better way?
|
||||
# Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
|
||||
# \x00 specially and convert it into an emoji character to prevent it from being mistakenly
|
||||
# recognized as an empty string in C++.
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
from sentencepiece import sentencepiece_model_pb2 as model
|
||||
|
||||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
sentencepiece_model = model.ModelProto()
|
||||
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
||||
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
||||
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||
|
||||
for token_id in range(vocab_size):
|
||||
piece = tokenizer.id_to_piece(token_id)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(token_id)
|
||||
if text == b"\x00":
|
||||
# (TODO): fixme
|
||||
# Hack here and replace the \x00 characters.
|
||||
print(f"InternLM2 convert token '{text}' to '🐉'!")
|
||||
text = "🐉"
|
||||
|
||||
toktype = SentencePieceTokenTypes.NORMAL
|
||||
if tokenizer.is_unknown(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
elif tokenizer.is_control(token_id):
|
||||
toktype = SentencePieceTokenTypes.CONTROL
|
||||
elif tokenizer.is_unused(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
elif tokenizer.is_byte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
added_tokens_file = self.dir_model / 'added_tokens.json'
|
||||
if added_tokens_file.is_file():
|
||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||
added_tokens_json = json.load(f)
|
||||
|
||||
for key in added_tokens_json:
|
||||
tokens.append(key.encode("utf-8"))
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
self.gguf_writer.add_add_space_prefix(add_prefix)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("InternLM2")
|
||||
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
|
||||
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
|
||||
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
|
||||
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
|
||||
|
||||
def post_write_tensors(self, tensor_map, name, data_torch):
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
def write_tensors(self):
|
||||
from einops import rearrange
|
||||
|
||||
num_heads = self.hparams.get("num_attention_heads")
|
||||
num_kv_heads = self.hparams.get("num_key_value_heads")
|
||||
hidden_size = self.hparams.get("hidden_size")
|
||||
q_per_kv = num_heads // num_kv_heads
|
||||
head_dim = hidden_size // num_heads
|
||||
num_groups = num_heads // q_per_kv
|
||||
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
model_kv = dict(self.get_tensors())
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
|
||||
for name, data_torch in model_kv.items():
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
if re.match(qkv_pattern, name):
|
||||
bid = re.findall(qkv_pattern, name)[0]
|
||||
qkv = data_torch
|
||||
qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
|
||||
q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
|
||||
q = rearrange(q, " o g n i -> o (g n i)").T
|
||||
k = rearrange(k, " o g n i -> o (g n i)").T
|
||||
v = rearrange(v, " o g n i -> o (g n i)").T
|
||||
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
|
||||
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
|
||||
self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
|
||||
else:
|
||||
self.post_write_tensors(tensor_map, name, data_torch)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
@@ -104,7 +104,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = n_kv_max;
|
||||
ctx_params.n_batch = 512;
|
||||
ctx_params.n_batch = 2048;
|
||||
ctx_params.mul_mat_q = mmq;
|
||||
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
|
||||
@@ -23,23 +23,19 @@ usage: ./llama-bench [options]
|
||||
|
||||
options:
|
||||
-h, --help
|
||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
-b, --batch-size <n> (default: 512)
|
||||
-ctk <t>, --cache-type-k <t> (default: f16)
|
||||
-ctv <t>, --cache-type-v <t> (default: f16)
|
||||
-t, --threads <n> (default: 112)
|
||||
-ngl, --n-gpu-layers <n> (default: 99)
|
||||
-sm, --split-mode <none|layer|row> (default: layer)
|
||||
-mg, --main-gpu <i> (default: 0)
|
||||
-nkvo, --no-kv-offload <0|1> (default: 0)
|
||||
-mmp, --mmap <0|1> (default: 1)
|
||||
-mmq, --mul-mat-q <0|1> (default: 1)
|
||||
-ts, --tensor_split <ts0/ts1/..> (default: 0)
|
||||
-r, --repetitions <n> (default: 5)
|
||||
-o, --output <csv|json|md|sql> (default: md)
|
||||
-v, --verbose (default: 0)
|
||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
-b, --batch-size <n> (default: 512)
|
||||
--memory-f32 <0|1> (default: 0)
|
||||
-t, --threads <n> (default: 16)
|
||||
-ngl N, --n-gpu-layers <n> (default: 99)
|
||||
-mg i, --main-gpu <i> (default: 0)
|
||||
-mmq, --mul-mat-q <0|1> (default: 1)
|
||||
-ts, --tensor_split <ts0/ts1/..>
|
||||
-r, --repetitions <n> (default: 5)
|
||||
-o, --output <csv|json|md|sql> (default: md)
|
||||
-v, --verbose (default: 0)
|
||||
|
||||
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
|
||||
```
|
||||
@@ -55,10 +51,6 @@ Each test is repeated the number of times given by `-r`, and the results are ave
|
||||
|
||||
For a description of the other options, see the [main example](../main/README.md).
|
||||
|
||||
Note:
|
||||
|
||||
- When using SYCL backend, there would be hang issue in some cases. Please set `--mmp 0`.
|
||||
|
||||
## Examples
|
||||
|
||||
### Text generation with different models
|
||||
|
||||
@@ -20,7 +20,6 @@
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml-sycl.h"
|
||||
|
||||
// utils
|
||||
static uint64_t get_time_ns() {
|
||||
@@ -121,22 +120,6 @@ static std::string get_gpu_info() {
|
||||
id += "/";
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#ifdef GGML_USE_SYCL
|
||||
int device_list[GGML_SYCL_MAX_DEVICES];
|
||||
ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
|
||||
|
||||
for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
|
||||
if (device_list[i] >0 ){
|
||||
char buf[128];
|
||||
ggml_sycl_get_device_description(i, buf, sizeof(buf));
|
||||
id += buf;
|
||||
id += "/";
|
||||
}
|
||||
}
|
||||
if (id.length() >2 ) {
|
||||
id.pop_back();
|
||||
}
|
||||
#endif
|
||||
// TODO: other backends
|
||||
return id;
|
||||
@@ -178,7 +161,6 @@ struct cmd_params {
|
||||
std::vector<bool> no_kv_offload;
|
||||
std::vector<bool> mul_mat_q;
|
||||
std::vector<std::vector<float>> tensor_split;
|
||||
std::vector<bool> use_mmap;
|
||||
int reps;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
@@ -198,7 +180,6 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* no_kv_offload */ {false},
|
||||
/* mul_mat_q */ {true},
|
||||
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
||||
/* use_mmap */ {true},
|
||||
/* reps */ 5,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN
|
||||
@@ -220,7 +201,6 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
||||
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
@@ -390,13 +370,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
|
||||
} else if (arg == "-mmp" || arg == "--mmap") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = split<bool>(argv[i], split_delim);
|
||||
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ts" || arg == "--tensor-split") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -468,7 +441,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
||||
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
||||
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||
|
||||
return params;
|
||||
@@ -488,7 +460,6 @@ struct cmd_params_instance {
|
||||
bool no_kv_offload;
|
||||
bool mul_mat_q;
|
||||
std::vector<float> tensor_split;
|
||||
bool use_mmap;
|
||||
|
||||
llama_model_params to_llama_mparams() const {
|
||||
llama_model_params mparams = llama_model_default_params();
|
||||
@@ -497,7 +468,6 @@ struct cmd_params_instance {
|
||||
mparams.split_mode = split_mode;
|
||||
mparams.main_gpu = main_gpu;
|
||||
mparams.tensor_split = tensor_split.data();
|
||||
mparams.use_mmap = use_mmap;
|
||||
|
||||
return mparams;
|
||||
}
|
||||
@@ -507,7 +477,6 @@ struct cmd_params_instance {
|
||||
n_gpu_layers == other.n_gpu_layers &&
|
||||
split_mode == other.split_mode &&
|
||||
main_gpu == other.main_gpu &&
|
||||
use_mmap == other.use_mmap &&
|
||||
tensor_split == other.tensor_split;
|
||||
}
|
||||
|
||||
@@ -534,7 +503,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & sm : params.split_mode)
|
||||
for (const auto & mg : params.main_gpu)
|
||||
for (const auto & ts : params.tensor_split)
|
||||
for (const auto & mmp : params.use_mmap)
|
||||
for (const auto & nb : params.n_batch)
|
||||
for (const auto & tk : params.type_k)
|
||||
for (const auto & tv : params.type_v)
|
||||
@@ -559,7 +527,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .use_mmap = */ mmp,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -582,7 +549,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .no_kv_offload= */ nkvo,
|
||||
/* .mul_mat_q = */ mmq,
|
||||
/* .tensor_split = */ ts,
|
||||
/* .use_mmap = */ mmp,
|
||||
};
|
||||
instances.push_back(instance);
|
||||
}
|
||||
@@ -599,7 +565,6 @@ struct test {
|
||||
static const bool vulkan;
|
||||
static const bool kompute;
|
||||
static const bool metal;
|
||||
static const bool sycl;
|
||||
static const bool gpu_blas;
|
||||
static const bool blas;
|
||||
static const std::string cpu_info;
|
||||
@@ -618,7 +583,6 @@ struct test {
|
||||
bool no_kv_offload;
|
||||
bool mul_mat_q;
|
||||
std::vector<float> tensor_split;
|
||||
bool use_mmap;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
std::string test_time;
|
||||
@@ -641,7 +605,6 @@ struct test {
|
||||
no_kv_offload = inst.no_kv_offload;
|
||||
mul_mat_q = inst.mul_mat_q;
|
||||
tensor_split = inst.tensor_split;
|
||||
use_mmap = inst.use_mmap;
|
||||
n_prompt = inst.n_prompt;
|
||||
n_gen = inst.n_gen;
|
||||
// RFC 3339 date-time format
|
||||
@@ -691,29 +654,25 @@ struct test {
|
||||
if (metal) {
|
||||
return "Metal";
|
||||
}
|
||||
if (sycl) {
|
||||
return GGML_SYCL_NAME;
|
||||
}
|
||||
if (gpu_blas) {
|
||||
return "GPU BLAS";
|
||||
}
|
||||
if (blas) {
|
||||
return "BLAS";
|
||||
}
|
||||
|
||||
return "CPU";
|
||||
}
|
||||
|
||||
static const std::vector<std::string> & get_fields() {
|
||||
static const std::vector<std::string> fields = {
|
||||
"build_commit", "build_number",
|
||||
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
|
||||
"cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_threads", "type_k", "type_v",
|
||||
"n_gpu_layers", "split_mode",
|
||||
"main_gpu", "no_kv_offload",
|
||||
"mul_mat_q", "tensor_split", "use_mmap",
|
||||
"mul_mat_q", "tensor_split",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
"avg_ns", "stddev_ns",
|
||||
"avg_ts", "stddev_ts"
|
||||
@@ -732,8 +691,8 @@ struct test {
|
||||
return INT;
|
||||
}
|
||||
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "mul_mat_q" || field == "use_mmap") {
|
||||
field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "mul_mat_q") {
|
||||
return BOOL;
|
||||
}
|
||||
if (field == "avg_ts" || field == "stddev_ts") {
|
||||
@@ -761,13 +720,13 @@ struct test {
|
||||
std::vector<std::string> values = {
|
||||
build_commit, std::to_string(build_number),
|
||||
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
|
||||
std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
|
||||
std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||
std::to_string(main_gpu), std::to_string(no_kv_offload),
|
||||
std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
|
||||
std::to_string(mul_mat_q), tensor_split_str,
|
||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
||||
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
||||
@@ -794,7 +753,6 @@ const bool test::kompute = !!ggml_cpu_has_kompute();
|
||||
const bool test::metal = !!ggml_cpu_has_metal();
|
||||
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
||||
const bool test::blas = !!ggml_cpu_has_blas();
|
||||
const bool test::sycl = !!ggml_cpu_has_sycl();
|
||||
const std::string test::cpu_info = get_cpu_info();
|
||||
const std::string test::gpu_info = get_gpu_info();
|
||||
|
||||
@@ -937,9 +895,6 @@ struct markdown_printer : public printer {
|
||||
if (field == "no_kv_offload") {
|
||||
return "nkvo";
|
||||
}
|
||||
if (field == "use_mmap") {
|
||||
return "mmap";
|
||||
}
|
||||
if (field == "tensor_split") {
|
||||
return "ts";
|
||||
}
|
||||
@@ -948,46 +903,43 @@ struct markdown_printer : public printer {
|
||||
|
||||
void print_header(const cmd_params & params) override {
|
||||
// select fields to print
|
||||
fields.emplace_back("model");
|
||||
fields.emplace_back("size");
|
||||
fields.emplace_back("params");
|
||||
fields.emplace_back("backend");
|
||||
fields.push_back("model");
|
||||
fields.push_back("size");
|
||||
fields.push_back("params");
|
||||
fields.push_back("backend");
|
||||
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
|
||||
if (!is_cpu_backend) {
|
||||
fields.emplace_back("n_gpu_layers");
|
||||
fields.push_back("n_gpu_layers");
|
||||
}
|
||||
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
||||
fields.emplace_back("n_threads");
|
||||
fields.push_back("n_threads");
|
||||
}
|
||||
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
||||
fields.emplace_back("n_batch");
|
||||
fields.push_back("n_batch");
|
||||
}
|
||||
if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
|
||||
fields.emplace_back("type_k");
|
||||
fields.push_back("type_k");
|
||||
}
|
||||
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
|
||||
fields.emplace_back("type_v");
|
||||
fields.push_back("type_v");
|
||||
}
|
||||
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
|
||||
fields.emplace_back("main_gpu");
|
||||
fields.push_back("main_gpu");
|
||||
}
|
||||
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
|
||||
fields.emplace_back("split_mode");
|
||||
fields.push_back("split_mode");
|
||||
}
|
||||
if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
|
||||
fields.emplace_back("mul_mat_q");
|
||||
fields.push_back("mul_mat_q");
|
||||
}
|
||||
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
|
||||
fields.emplace_back("no_kv_offload");
|
||||
fields.push_back("no_kv_offload");
|
||||
}
|
||||
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
||||
fields.emplace_back("tensor_split");
|
||||
fields.push_back("tensor_split");
|
||||
}
|
||||
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
||||
fields.emplace_back("use_mmap");
|
||||
}
|
||||
fields.emplace_back("test");
|
||||
fields.emplace_back("t/s");
|
||||
fields.push_back("test");
|
||||
fields.push_back("t/s");
|
||||
|
||||
fprintf(fout, "|");
|
||||
for (const auto & field : fields) {
|
||||
|
||||
@@ -352,12 +352,12 @@ int main(int argc, char ** argv) {
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.emplace_back("### Instruction:\n\n");
|
||||
params.antiprompt.push_back("### Instruction:\n\n");
|
||||
}
|
||||
// similar for chatml mode
|
||||
else if (params.chatml) {
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.emplace_back("<|im_start|>user\n");
|
||||
params.antiprompt.push_back("<|im_start|>user\n");
|
||||
}
|
||||
|
||||
// enable interactive mode if interactive start is specified
|
||||
|
||||
@@ -457,14 +457,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
|
||||
std::ofstream logits_stream;
|
||||
if (!params.logits_file.empty()) {
|
||||
logits_stream.open(params.logits_file.c_str(), std::ios::binary);
|
||||
logits_stream.open(params.logits_file.c_str());
|
||||
if (!logits_stream.is_open()) {
|
||||
fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
|
||||
return {};
|
||||
}
|
||||
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
|
||||
logits_stream.write("_logits_", 8);
|
||||
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
|
||||
logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
|
||||
}
|
||||
|
||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
@@ -881,7 +881,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
size_t li = hs_cur.common_prefix;
|
||||
for (int s = 0; s < 4; ++s) {
|
||||
for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
|
||||
eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
|
||||
eval_pairs.push_back(std::make_pair(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]));
|
||||
}
|
||||
++li;
|
||||
}
|
||||
@@ -1159,13 +1159,13 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
|
||||
size_t li = n_base1 - 1;
|
||||
for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
|
||||
eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
|
||||
eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[0][j+1]));
|
||||
}
|
||||
const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
|
||||
const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
|
||||
li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
|
||||
for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
|
||||
eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
|
||||
eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[1][j+1]));
|
||||
}
|
||||
}
|
||||
compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
|
||||
@@ -1524,7 +1524,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
size_t li = cur_task.common_prefix;
|
||||
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
|
||||
for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
|
||||
eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
|
||||
eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
|
||||
}
|
||||
++li;
|
||||
}
|
||||
|
||||
@@ -257,13 +257,13 @@ int main(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.include_layers.emplace_back(argv[i]);
|
||||
params.include_layers.push_back(argv[i]);
|
||||
} else if (arg == "-L" || arg == "--exclude-layer") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.exclude_layers.emplace_back(argv[i]);
|
||||
params.exclude_layers.push_back(argv[i]);
|
||||
} else if (arg == "-t" || arg == "--type") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
||||
@@ -208,13 +208,13 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
|
||||
if (arg_idx < argc-1) {
|
||||
included_weights.emplace_back(argv[++arg_idx]);
|
||||
included_weights.push_back(argv[++arg_idx]);
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
} else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
|
||||
if (arg_idx < argc-1) {
|
||||
excluded_weights.emplace_back(argv[++arg_idx]);
|
||||
excluded_weights.push_back(argv[++arg_idx]);
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
@@ -1884,7 +1884,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
sparams.api_keys.emplace_back(argv[i]);
|
||||
sparams.api_keys.push_back(argv[i]);
|
||||
}
|
||||
else if (arg == "--api-key-file")
|
||||
{
|
||||
@@ -2160,7 +2160,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.emplace_back(argv[i], 1.0f);
|
||||
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-scaled")
|
||||
@@ -2176,7 +2176,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
|
||||
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-base")
|
||||
@@ -2318,7 +2318,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
}
|
||||
}
|
||||
if (!params.kv_overrides.empty()) {
|
||||
params.kv_overrides.emplace_back();
|
||||
params.kv_overrides.emplace_back(llama_model_kv_override());
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
|
||||
|
||||
761
ggml-cuda.cu
761
ggml-cuda.cu
@@ -108,6 +108,7 @@
|
||||
#include <cuda.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <mma.h>
|
||||
|
||||
#if CUDART_VERSION < 11020
|
||||
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
|
||||
@@ -655,6 +656,19 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half warp_reduce_sum(half x) {
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
x = __hadd(__shfl_xor_sync(0xffffffff, x, mask, 32), x);
|
||||
}
|
||||
return x;
|
||||
#else
|
||||
(void) x;
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float warp_reduce_max(float x) {
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
@@ -676,6 +690,18 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half warp_reduce_max(half x) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
x = __hmax(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
||||
}
|
||||
return x;
|
||||
#else
|
||||
(void) x;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float op_repeat(const float a, const float b) {
|
||||
return b;
|
||||
GGML_UNUSED(a);
|
||||
@@ -989,6 +1015,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
|
||||
if (lane_id == 0) {
|
||||
s_sum[warp_id] = tmp;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
tmp = s_sum[lane_id];
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
@@ -5917,7 +5944,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int
|
||||
}
|
||||
|
||||
template <bool vals_smem, int ncols_template, int block_size_template, bool need_check>
|
||||
static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
||||
static __global__ void soft_max_f16(const float * x, const half * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template;
|
||||
const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2;
|
||||
@@ -5952,12 +5979,12 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
|
||||
if (need_check && col_data + 0 >= ncols_data) {
|
||||
val.x = -INFINITY;
|
||||
} else {
|
||||
val.x = x[ix + 0]*scale + (y ? y[iy + 0] : 0.0f);
|
||||
val.x = x[ix + 0]*scale + (y ? __half2float(y[iy + 0]) : 0.0f);
|
||||
}
|
||||
if (need_check && col_data + WARP_SIZE >= ncols_data) {
|
||||
val.y = -INFINITY;
|
||||
} else {
|
||||
val.y = x[ix + WARP_SIZE]*scale + (y ? y[iy + WARP_SIZE] : 0.0f);
|
||||
val.y = x[ix + WARP_SIZE]*scale + (y ? __half2float(y[iy + WARP_SIZE]) : 0.0f);
|
||||
}
|
||||
if (!need_check || col_smem < (vals_smem ? ncols_smem : ncols_data)) {
|
||||
vals[col_smem] = val;
|
||||
@@ -6047,7 +6074,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
|
||||
}
|
||||
|
||||
template <bool vals_smem, int ncols_template, int block_size_template>
|
||||
static __global__ void soft_max_f32(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
||||
static __global__ void soft_max_f32(const float * x, const half * y, float * dst, const int ncols_par, const int nrows_y, const float scale) {
|
||||
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
@@ -6077,7 +6104,7 @@ static __global__ void soft_max_f32(const float * x, const float * y, float * ds
|
||||
const int ix = rowx*ncols + col;
|
||||
const int iy = rowy*ncols + col;
|
||||
|
||||
const float val = x[ix]*scale + (y ? y[iy] : 0.0f);
|
||||
const float val = x[ix]*scale + (y ? __half2float(y[iy]) : 0.0f);
|
||||
vals[col] = val;
|
||||
max_val = max(max_val, val);
|
||||
}
|
||||
@@ -6249,6 +6276,528 @@ static __global__ void pool2d_nchw_kernel(
|
||||
o_ptr[cur_oh * ow + cur_ow] = res;
|
||||
}
|
||||
|
||||
#define CUDA_FLASH_ATTENTION_BLOCK_SIZE 256
|
||||
|
||||
template<int block_size, int k_seq_len, int k_head_dim>
|
||||
static __global__ void flash_attn_f32(
|
||||
const float* __restrict__ q,
|
||||
const float* __restrict__ k,
|
||||
const float* __restrict__ v,
|
||||
float* __restrict__ kqv,
|
||||
float kq_scale,
|
||||
int head_dim, int seq_len, int num_heads) {
|
||||
const int head = blockIdx.x / seq_len;
|
||||
const int head_size = head_dim * seq_len;
|
||||
const int s = blockIdx.x % seq_len;
|
||||
|
||||
extern __shared__ char flash_attn_shmem_f32[];
|
||||
float* S = (float*)flash_attn_shmem_f32;
|
||||
float* warp_data = (float*)(flash_attn_shmem_f32 + seq_len * sizeof(float));
|
||||
|
||||
// QK^T
|
||||
#pragma unroll
|
||||
for(int is0 = 0; is0 < k_seq_len; is0 += block_size) {
|
||||
const int is = threadIdx.x + is0;
|
||||
if(is >= seq_len) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int key_offset = is * head_dim + head * head_size;
|
||||
const int query_offset = s * head_dim + head * head_size;
|
||||
|
||||
float tmp = 0.0f;
|
||||
for(int d = 0; d < head_dim; d++) {
|
||||
tmp += k[key_offset + d] * q[query_offset + d];
|
||||
}
|
||||
S[is] = tmp * kq_scale;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
float max_val = -INFINITY;
|
||||
// get the max
|
||||
#pragma unroll
|
||||
for(int is0 = 0; is0 < k_seq_len; is0 += block_size) {
|
||||
const int is = threadIdx.x + is0;
|
||||
if(is >= seq_len) {
|
||||
break;
|
||||
}
|
||||
|
||||
max_val = fmaxf(max_val , S[is]);
|
||||
}
|
||||
|
||||
max_val = warp_reduce_max(max_val);
|
||||
|
||||
{ // get max from all threads
|
||||
int warp_id = threadIdx.x / WARP_SIZE;
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
warp_data[warp_id] = max_val;
|
||||
}
|
||||
__syncthreads();
|
||||
max_val = warp_data[lane_id];
|
||||
max_val = warp_reduce_max(max_val);
|
||||
}
|
||||
|
||||
// softmax(QK^T)
|
||||
float sum = 0.0f;
|
||||
#pragma unroll
|
||||
for(int is0 = 0; is0 < k_seq_len; is0 += block_size) {
|
||||
const int is = threadIdx.x + is0;
|
||||
if(is >= seq_len) {
|
||||
break;
|
||||
}
|
||||
float tmp = expf(S[is] - max_val);
|
||||
sum += tmp;
|
||||
S[is] = tmp;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
sum = warp_reduce_sum(sum);
|
||||
{ // softmax sum partials
|
||||
int warp_id = threadIdx.x / WARP_SIZE;
|
||||
int lane_id = threadIdx.x % WARP_SIZE;
|
||||
if (lane_id == 0) {
|
||||
warp_data[warp_id] = sum;
|
||||
}
|
||||
__syncthreads();
|
||||
sum = warp_data[lane_id];
|
||||
sum = warp_reduce_sum(sum);
|
||||
}
|
||||
|
||||
float inv_sum = 1.0f / sum;
|
||||
#pragma unroll
|
||||
for(int is0 = 0; is0 < k_seq_len; is0 += block_size) {
|
||||
const int is = threadIdx.x + is0;
|
||||
if(is >= seq_len) {
|
||||
break;
|
||||
}
|
||||
|
||||
S[is] *= inv_sum;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// softmax(QK^T)V
|
||||
#pragma unroll
|
||||
for (int d0 = threadIdx.x; d0 < k_head_dim; d0 += block_size) {
|
||||
const int d = threadIdx.x + d0;
|
||||
if(d >= head_dim) {
|
||||
break;
|
||||
}
|
||||
const int dst_index = d + s * head_dim + head * head_size;
|
||||
const int value_offset = d * seq_len + head * head_size;
|
||||
|
||||
float temp = 0.0f;
|
||||
#pragma unroll
|
||||
for(int ic = 0; ic < k_seq_len;ic++) {
|
||||
if(ic >= seq_len) {
|
||||
break;
|
||||
}
|
||||
temp += v[value_offset + ic] * S[ic];
|
||||
}
|
||||
kqv[dst_index] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half, nvcuda::wmma::row_major> half16x16_a;
|
||||
typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, half, nvcuda::wmma::row_major> half16x16_b;
|
||||
typedef nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, half, nvcuda::wmma::col_major> half16x16_bT;
|
||||
typedef nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, half> half16x16_acc;
|
||||
#endif
|
||||
|
||||
// based on metal version
|
||||
template<int D, int Q, int C> // D head size, Q queries per block, C cache items per block
|
||||
static __global__ void flash_attn_ext_f16(
|
||||
const char* __restrict__ q,
|
||||
const char* __restrict__ k,
|
||||
const char* __restrict__ v,
|
||||
const char* __restrict__ mask,
|
||||
float* __restrict__ dst,
|
||||
float scale,
|
||||
int ne00,
|
||||
int ne01,
|
||||
int ne02,
|
||||
int ne03,
|
||||
int ne10,
|
||||
int ne11,
|
||||
int ne12,
|
||||
int ne13,
|
||||
int ne31,
|
||||
int nb31,
|
||||
int nb01,
|
||||
int nb02,
|
||||
int nb03,
|
||||
int nb11,
|
||||
int nb12,
|
||||
int nb13,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3) {
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
const int warp_id = threadIdx.y;
|
||||
const int lane_id = threadIdx.x;
|
||||
|
||||
const int num_warps = blockDim.y; // number of warps
|
||||
const int iq3 = blockIdx.z;
|
||||
const int iq2 = blockIdx.y;
|
||||
const int iq1 = blockIdx.x * Q;
|
||||
|
||||
const int D2 = D/2;
|
||||
const int D16 = D/16;
|
||||
const int Q16 = Q/16;
|
||||
const int NW = WARP_SIZE;
|
||||
const int SH = (C + Q); // shared memory per simdgroup in (half)
|
||||
|
||||
const int T = D + num_warps*SH; // shared memory size per query in (half)
|
||||
const int T2 = T/2; // shared memory size per query in (half2)
|
||||
|
||||
extern __shared__ half __flash_attn_f16_shmem[];
|
||||
// pq
|
||||
half * sq = (half *) (__flash_attn_f16_shmem + 0*D); // holds the query data
|
||||
half2 * sq2 = (half2 *) (__flash_attn_f16_shmem + 0*D); // same as above but in half2
|
||||
half * ss = (half *) (__flash_attn_f16_shmem + warp_id*SH + 1*D); // scratch buffer for attention and diagonal matrix
|
||||
|
||||
half16x16_acc zr;
|
||||
half16x16_acc lo[Q16][D16];
|
||||
|
||||
// load heads from Q to shared memory
|
||||
for (int64_t j = warp_id; j < Q; j += num_warps) {
|
||||
const float2 * q2 = (const float2 *) (q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03));
|
||||
|
||||
for (int64_t i = lane_id; i < D2; i += NW) {
|
||||
if (iq1 + j < ne01) {
|
||||
sq2[j*T2 + i] = __float22half2_rn(q2[i]);
|
||||
} else {
|
||||
sq2[j*T2 + i] = make_half2(0.0, 0.0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
nvcuda::wmma::fill_fragment(zr, 0.0);
|
||||
|
||||
// zero out lo
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
for (int64_t i = 0; i < D16; ++i) {
|
||||
nvcuda::wmma::fill_fragment(lo[j][i], 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
// zero out shared memory SH
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
for (int64_t i = lane_id; i < SH; i += NW) {
|
||||
ss[j*T + i] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
{
|
||||
half S[Q];
|
||||
half M[Q];
|
||||
|
||||
for(int i = 0; i < Q; i++) {
|
||||
S[i] = __float2half(0.0f);
|
||||
M[i] = __float2half(-INFINITY);
|
||||
}
|
||||
|
||||
// assume K and V are same shape
|
||||
const int ne22 = ne12;
|
||||
const int ne23 = ne13;
|
||||
|
||||
const int nb21 = nb11;
|
||||
const int nb22 = nb12;
|
||||
const int nb23 = nb13;
|
||||
|
||||
// broadcast
|
||||
const int rk2 = ne02/ne12;
|
||||
const int rk3 = ne03/ne13;
|
||||
|
||||
const int rv2 = ne02/ne22;
|
||||
const int rv3 = ne03/ne23;
|
||||
|
||||
// k indices
|
||||
const int ik2 = iq2 / rk2;
|
||||
const int ik3 = iq3 / rk3;
|
||||
|
||||
// v indices
|
||||
const int iv2 = iq2 / rv2;
|
||||
const int iv3 = iq3 / rv3;
|
||||
|
||||
// load the queries from shared memory into local memory
|
||||
half16x16_a mq[Q16][D16];
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
for (int64_t i = 0; i < D16; ++i) {
|
||||
nvcuda::wmma::load_matrix_sync(mq[j][i], sq + 16*j*T + i*16, T);
|
||||
}
|
||||
}
|
||||
|
||||
// pointer to the mask
|
||||
const half * mp = mask ? (const half *) (mask + iq1*nb31) : nullptr;
|
||||
|
||||
// prepare diagonal scale matrix
|
||||
half16x16_b mscale;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
ss[i*T + i] = __float2half(scale);
|
||||
}
|
||||
nvcuda::wmma::load_matrix_sync(mscale, ss, T);
|
||||
|
||||
// loop over the KV cache
|
||||
// each simdgroup handles blocks of Q rows and C columns
|
||||
for (int64_t ic = C*warp_id; ic < ne11; ic += C*num_warps) {
|
||||
// Q*K^T
|
||||
{
|
||||
for (int cc = 0; cc < C/16; ++cc) {
|
||||
half16x16_acc mqk[Q16];
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
nvcuda::wmma::fill_fragment(mqk[j], 0);
|
||||
}
|
||||
|
||||
const half * pk = (const half *) ((const char *) k + ((ic + 16*cc)*nb11 + ik2*nb12 + ik3*nb13));
|
||||
|
||||
for (int64_t i = 0; i < D16; ++i) {
|
||||
half16x16_bT mk; // transposed key
|
||||
nvcuda::wmma::load_matrix_sync(mk, pk + i*16, nb11/sizeof(half));
|
||||
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
nvcuda::wmma::mma_sync(mqk[j], mq[j][i], mk, mqk[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// mqk = mqk*scale + mask
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
half16x16_a mqka;
|
||||
half16x16_acc mm;
|
||||
if(mp) {
|
||||
nvcuda::wmma::load_matrix_sync(mm, mp + 16*j*(nb31/sizeof(half)) + ic + 16*cc, nb31/sizeof(half), nvcuda::wmma::mem_row_major);
|
||||
}
|
||||
|
||||
// convert accumulator to matrix_a
|
||||
nvcuda::wmma::store_matrix_sync( ss + 16*j*T + 16*cc, mqk[j], T, nvcuda::wmma::mem_row_major);
|
||||
nvcuda::wmma::load_matrix_sync (mqka, ss + 16*j*T + 16*cc, T);
|
||||
|
||||
nvcuda::wmma::mma_sync(mqk[j], mqka, mscale, mp ? mm : zr);
|
||||
nvcuda::wmma::store_matrix_sync(ss + 16*j*T + 16*cc, mqk[j], T, nvcuda::wmma::mem_row_major);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// used to detect blocks full of -INF
|
||||
half smax = __float2half(-INFINITY);
|
||||
|
||||
// online softmax
|
||||
if (C == 32) {
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
const int64_t p = lane_id;
|
||||
|
||||
const half m = M[j];
|
||||
const half s = ss[j*T + p];
|
||||
|
||||
smax = warp_reduce_max(__hmax(smax, s));
|
||||
M[j] = warp_reduce_max(__hmax(M[j], s));
|
||||
|
||||
const half ms = __hisinf(m) ? __float2half(0.0f) : hexp(m - M[j]);
|
||||
const half vs = __hisinf(s) ? __float2half(0.0f) : hexp(s - M[j]);
|
||||
|
||||
S[j] = S[j]*ms + warp_reduce_sum(vs);
|
||||
|
||||
// create a QxQ diagonal matrix for rescaling the output
|
||||
if (p == j) {
|
||||
ss[j*T + C + j] = ms;
|
||||
}
|
||||
|
||||
// the P matrix from the paper (Q rows, C columns)
|
||||
ss[j*T + p] = vs;
|
||||
}
|
||||
} else {
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
const half m = M[j];
|
||||
|
||||
for (int64_t p = lane_id; p < C; p += NW) {
|
||||
const half s = ss[j*T + p];
|
||||
|
||||
smax = __hmax(smax, s);
|
||||
M[j] = __hmax(M[j], s);
|
||||
}
|
||||
|
||||
smax = warp_reduce_max(smax);
|
||||
M[j] = warp_reduce_max(M[j]);
|
||||
|
||||
const half ms = __hisinf(m) ? __float2half(0.0f) : hexp(m - M[j]);
|
||||
|
||||
// create a QxQ diagonal matrix for rescaling the output
|
||||
if (lane_id == j) {
|
||||
ss[j*T + C + j] = ms;
|
||||
}
|
||||
|
||||
// local sum
|
||||
half ls = 0.0f;
|
||||
|
||||
for (int64_t p = lane_id; p < C; p += NW) {
|
||||
const half s = ss[j*T + p];
|
||||
|
||||
const half vs = __hisinf(s) ? __float2half(0.0f) : hexp(s - M[j]);
|
||||
|
||||
ls += vs;
|
||||
|
||||
// the P matrix from the paper (Q rows, C columns)
|
||||
ss[j*T + p] = vs;
|
||||
}
|
||||
|
||||
S[j] = S[j]*ms + warp_reduce_sum(ls);
|
||||
}
|
||||
}
|
||||
|
||||
// skip -INF blocks
|
||||
if (__hisinf(smax)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// O = diag(ms)*O
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
half16x16_a mm;
|
||||
half16x16_b lob;
|
||||
|
||||
nvcuda::wmma::load_matrix_sync(mm, ss + 16*j*T + C + 16*j, T);
|
||||
|
||||
for (int64_t i = 0; i < D16; ++i) {
|
||||
// convert accumulator to matrix_b
|
||||
nvcuda::wmma::store_matrix_sync( ss + 16*j*T + C + 16*j, lo[j][i], T, nvcuda::wmma::mem_row_major);
|
||||
nvcuda::wmma::load_matrix_sync (lob, ss + 16*j*T + C + 16*j, T);
|
||||
|
||||
nvcuda::wmma::fill_fragment(lo[j][i], 0.0);
|
||||
nvcuda::wmma::mma_sync(lo[j][i], mm, lob, lo[j][i]);
|
||||
}
|
||||
|
||||
// restore zeros
|
||||
nvcuda::wmma::store_matrix_sync(ss + 16*j*T + C + 16*j, zr, T, nvcuda::wmma::mem_row_major);
|
||||
}
|
||||
|
||||
// O = O + (Q*K^T)*V
|
||||
{
|
||||
for (int cc = 0; cc < C/16; ++cc) {
|
||||
const half * pv = (const half *) ((const char *) v + ((ic + 16*cc)*nb21 + iv2*nb22 + iv3*nb23));
|
||||
|
||||
half16x16_b mk[D16];
|
||||
for (int64_t i = 0; i < D16; ++i) {
|
||||
nvcuda::wmma::load_matrix_sync(mk[i], pv + i*16, nb21/sizeof(half));
|
||||
}
|
||||
|
||||
half16x16_a mv[Q16];
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
nvcuda::wmma::load_matrix_sync(mv[j], ss + 16*j*T + 16*cc, T);
|
||||
}
|
||||
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
for (int64_t i = 0; i < D16; ++i) {
|
||||
nvcuda::wmma::mma_sync(lo[j][i], mv[j], mk[i], lo[j][i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// these are needed for reducing the results from the simdgroups (reuse the ss buffer)
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
if (lane_id == 0) {
|
||||
ss[j*T + 0] = S[j];
|
||||
ss[j*T + 1] = M[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reduce the warps sequentially
|
||||
for (int64_t sg = 1; sg < num_warps; ++sg) {
|
||||
half S = __float2half(0.0f);
|
||||
half M = __float2half(-INFINITY);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// each simdgroup stores its output to shared memory, reusing sq
|
||||
if (warp_id == sg) {
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
for (int64_t i = 0; i < D16; ++i) {
|
||||
nvcuda::wmma::store_matrix_sync(sq + 16*j*T + i*16, lo[j][i], T, nvcuda::wmma::mem_row_major);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// the first simdgroup accumulates the results from the other simdgroups
|
||||
if (warp_id == 0) {
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
const half S0 = ss[j*T + 0];
|
||||
const half S1 = ss[j*T + sg*SH + 0];
|
||||
|
||||
const half M0 = ss[j*T + 1];
|
||||
const half M1 = ss[j*T + sg*SH + 1];
|
||||
|
||||
M = __hmax(M0, M1);
|
||||
|
||||
const half ms0 = __hisinf(M0) ? __float2half(0.0f) : hexp(M0 - M);
|
||||
const half ms1 = __hisinf(M1) ? __float2half(0.0f) : hexp(M1 - M);
|
||||
|
||||
S = S0*ms0 + S1*ms1;
|
||||
|
||||
if (lane_id == 0) {
|
||||
ss[j*T + 0] = S;
|
||||
ss[j*T + 1] = M;
|
||||
|
||||
ss[j*T + C + j ] = ms0;
|
||||
ss[j*T + C + j + sg*SH] = ms1;
|
||||
}
|
||||
}
|
||||
|
||||
// O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
half16x16_a ms0;
|
||||
half16x16_a ms1;
|
||||
half16x16_b t;
|
||||
half16x16_acc t2;
|
||||
|
||||
nvcuda::wmma::load_matrix_sync(ms0, ss + 16*j*T + C + 16*j, T);
|
||||
nvcuda::wmma::load_matrix_sync(ms1, ss + 16*j*T + C + 16*j + sg*SH, T);
|
||||
|
||||
for (int64_t i = 0; i < D16; ++i) {
|
||||
nvcuda::wmma::fill_fragment(t2, 0.0);
|
||||
nvcuda::wmma::load_matrix_sync(t, sq + 16*j*T + i*16, T);
|
||||
nvcuda::wmma::mma_sync(t2, ms1, t, t2);
|
||||
|
||||
// convert accumulator to matrix_b
|
||||
nvcuda::wmma::store_matrix_sync( sq + 16*j*T + i*16, lo[j][i], T, nvcuda::wmma::mem_row_major);
|
||||
nvcuda::wmma::load_matrix_sync (t, sq + 16*j*T + i*16, T);
|
||||
|
||||
nvcuda::wmma::mma_sync(lo[j][i], ms0, t, t2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// store result to shared memory (reuse sq)
|
||||
if (warp_id == 0) {
|
||||
for (int64_t j = 0; j < Q16; ++j) {
|
||||
for (int64_t i = 0; i < D16; ++i) {
|
||||
nvcuda::wmma::store_matrix_sync(sq + 16*j*T + i*16, lo[j][i], T, nvcuda::wmma::mem_row_major);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// final rescale with 1/S and store to global memory
|
||||
if (warp_id == 0) {
|
||||
for (int64_t j = 0; j < Q && iq1 + j < ne01; ++j) {
|
||||
const half S = ss[j*T + 0];
|
||||
|
||||
for (int64_t i = lane_id; i < D; i += NW) {
|
||||
dst[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D + i] = __half2float(sq[j*T + i] / S);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<int qk, int qr, dequantize_kernel_t dq>
|
||||
static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||
const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) {
|
||||
@@ -7585,7 +8134,7 @@ static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols
|
||||
diag_mask_inf_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols_x, rows_per_channel, n_past);
|
||||
}
|
||||
|
||||
static void soft_max_f16_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
||||
static void soft_max_f16_cuda(const float * x, const half * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
||||
int nth = WARP_SIZE;
|
||||
while (nth < ncols_x/2 && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||
const dim3 block_dims(nth, 1, 1);
|
||||
@@ -7628,7 +8177,7 @@ static void soft_max_f16_cuda(const float * x, const float * y, float * dst, con
|
||||
}
|
||||
}
|
||||
|
||||
static void soft_max_f32_cuda(const float * x, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
||||
static void soft_max_f32_cuda(const float * x, const half * y, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, cudaStream_t stream) {
|
||||
int nth = WARP_SIZE;
|
||||
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
|
||||
const dim3 block_dims(nth, 1, 1);
|
||||
@@ -7682,6 +8231,13 @@ static void im2col_cuda(const float* x, T* dst,
|
||||
im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
|
||||
}
|
||||
|
||||
static void flash_attn_f32_cuda(const float* q, const float* k,const float* v, float* dst, float kq_scale, const int d_head, const int seq_len, const int num_heads, cudaStream_t stream) {
|
||||
int sram_memory_size = seq_len*sizeof(float) + WARP_SIZE * sizeof(float);
|
||||
int num_blocks = num_heads * seq_len;
|
||||
flash_attn_f32<CUDA_FLASH_ATTENTION_BLOCK_SIZE, 1024, 64><<<num_blocks, CUDA_FLASH_ATTENTION_BLOCK_SIZE, sram_memory_size, stream>>>(
|
||||
q, k, v, dst, kq_scale, d_head, seq_len, num_heads);
|
||||
}
|
||||
|
||||
// buffer pool for cuda
|
||||
#define MAX_CUDA_BUFFERS 256
|
||||
|
||||
@@ -8657,9 +9213,9 @@ static void ggml_cuda_op_dequantize_mul_mat_vec(
|
||||
|
||||
if (src1_convert_f16) {
|
||||
src1_dfloat = src1_dfloat_a.alloc(ne00);
|
||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||
to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream);
|
||||
ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
||||
ne00, 1, sizeof(float), 0, 0,
|
||||
ne00, 1, sizeof(half), 0, 0, 0, 0, 0, 0, stream);
|
||||
}
|
||||
#else
|
||||
const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
|
||||
@@ -9060,11 +9616,11 @@ static void ggml_cuda_op_soft_max(
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
|
||||
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16); // src1 contains mask and it is optional
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t nrows_x = ggml_nrows(src0);
|
||||
const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
|
||||
const int64_t nrows_y = src1 ? src0->ne[1] : 1; // note: using number of queries since mask can be padded!
|
||||
|
||||
float scale = 1.0f;
|
||||
memcpy(&scale, dst->op_params, sizeof(float));
|
||||
@@ -9080,9 +9636,9 @@ static void ggml_cuda_op_soft_max(
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX
|
||||
|
||||
if (use_f16_soft_max) {
|
||||
soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
||||
soft_max_f16_cuda(src0_dd, src1 ? (const half *) src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
||||
} else {
|
||||
soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
||||
soft_max_f32_cuda(src0_dd, src1 ? (const half *) src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
|
||||
}
|
||||
|
||||
(void) dst;
|
||||
@@ -10284,6 +10840,170 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
||||
}
|
||||
}
|
||||
|
||||
inline void ggml_cuda_flash_attn(const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV) {
|
||||
GGML_ASSERT(Q->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(K->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(V->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(Q->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(K->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(V->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(KQV->backend == GGML_BACKEND_GPU);
|
||||
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) Q->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) K->extra;
|
||||
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) V->extra;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) KQV->extra;
|
||||
|
||||
const int64_t d_head = Q->ne[0];
|
||||
const int64_t sequence_length = Q->ne[1];
|
||||
const int64_t num_heads = Q->ne[2];
|
||||
|
||||
GGML_ASSERT(Q->ne[0] == d_head);
|
||||
GGML_ASSERT(K->ne[0] == d_head);
|
||||
GGML_ASSERT(V->ne[1] == d_head);
|
||||
|
||||
GGML_ASSERT(Q->ne[1] == sequence_length);
|
||||
GGML_ASSERT(K->ne[1] == sequence_length);
|
||||
GGML_ASSERT(V->ne[0] == sequence_length);
|
||||
|
||||
GGML_ASSERT(Q->ne[2] == num_heads);
|
||||
GGML_ASSERT(K->ne[2] == num_heads);
|
||||
GGML_ASSERT(V->ne[2] == num_heads);
|
||||
|
||||
float KQ_scale = 1.0f / sqrtf((float)d_head);
|
||||
|
||||
flash_attn_f32_cuda(
|
||||
(float *) src0_extra->data_device[g_main_device], // Query
|
||||
(float *) src1_extra->data_device[g_main_device], // Key
|
||||
(float *) src2_extra->data_device[g_main_device], // Value
|
||||
(float *) dst_extra->data_device[g_main_device], // dst
|
||||
KQ_scale, d_head, sequence_length, num_heads, main_stream);
|
||||
}
|
||||
|
||||
|
||||
inline void ggml_cuda_flash_attn_ext(const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, const ggml_tensor * mask, ggml_tensor * KQV) {
|
||||
GGML_ASSERT(Q->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(K->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(V->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(KQV->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_ASSERT(Q->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(K->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(V->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(KQV->backend == GGML_BACKEND_GPU);
|
||||
|
||||
GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(!mask || mask->backend == GGML_BACKEND_GPU);
|
||||
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
|
||||
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
|
||||
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
const cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
|
||||
|
||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) Q->extra;
|
||||
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) K->extra;
|
||||
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) V->extra;
|
||||
ggml_tensor_extra_gpu * src3_extra = mask ? (ggml_tensor_extra_gpu *) mask->extra : nullptr;
|
||||
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) KQV->extra;
|
||||
|
||||
float scale;
|
||||
memcpy(&scale, KQV->op_params, sizeof(float));
|
||||
|
||||
#define NQPB 16
|
||||
#define NCPW 128
|
||||
|
||||
const int nqpb = NQPB; // queries per block
|
||||
const int ncpw = NCPW; // cache values per warp (does not work for other values)
|
||||
|
||||
const int nwarps_max = 8; // TODO: we don't want to launch too much warps. how much is too much?
|
||||
// TODO: produces wrong results for nwarps > 8 (RTX 2060) - not sure why
|
||||
const int nwarps = Q->ne[1] <= nqpb ? std::max(2, std::min((int) K->ne[1]/ncpw, nwarps_max)) : 2;
|
||||
|
||||
dim3 blocks_num((Q->ne[1] + nqpb - 1) / nqpb, Q->ne[2], Q->ne[3]);
|
||||
dim3 block_dim(32, nwarps, 1);
|
||||
|
||||
const size_t shmem = nqpb*(Q->ne[0] + nwarps*(ncpw + nqpb))*(sizeof(float)/2);
|
||||
|
||||
switch (Q->ne[0])
|
||||
{
|
||||
case 16:
|
||||
flash_attn_ext_f16<16, NQPB, NCPW>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) src0_extra->data_device[g_main_device], // Query
|
||||
(const char *) src1_extra->data_device[g_main_device], // Key
|
||||
(const char *) src2_extra->data_device[g_main_device], // Value
|
||||
mask ? ((const char *) src3_extra->data_device[g_main_device]) : nullptr, // Mask
|
||||
(float *) dst_extra->data_device[g_main_device], // dst
|
||||
scale,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
break;
|
||||
case 64:
|
||||
flash_attn_ext_f16<64, NQPB, NCPW>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) src0_extra->data_device[g_main_device], // Query
|
||||
(const char *) src1_extra->data_device[g_main_device], // Key
|
||||
(const char *) src2_extra->data_device[g_main_device], // Value
|
||||
mask ? ((const char *) src3_extra->data_device[g_main_device]) : nullptr, // Mask
|
||||
(float *) dst_extra->data_device[g_main_device], // dst
|
||||
scale,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
break;
|
||||
case 80:
|
||||
flash_attn_ext_f16<80, NQPB, NCPW>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) src0_extra->data_device[g_main_device], // Query
|
||||
(const char *) src1_extra->data_device[g_main_device], // Key
|
||||
(const char *) src2_extra->data_device[g_main_device], // Value
|
||||
mask ? ((const char *) src3_extra->data_device[g_main_device]) : nullptr, // Mask
|
||||
(float *) dst_extra->data_device[g_main_device], // dst
|
||||
scale,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
break;
|
||||
case 128:
|
||||
flash_attn_ext_f16<128, NQPB, NCPW>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) src0_extra->data_device[g_main_device], // Query
|
||||
(const char *) src1_extra->data_device[g_main_device], // Key
|
||||
(const char *) src2_extra->data_device[g_main_device], // Value
|
||||
mask ? ((const char *) src3_extra->data_device[g_main_device]) : nullptr, // Mask
|
||||
(float *) dst_extra->data_device[g_main_device], // dst
|
||||
scale,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
|
||||
}
|
||||
@@ -10573,6 +11293,10 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
|
||||
case GGML_OP_ARGSORT:
|
||||
func = ggml_cuda_argsort;
|
||||
break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -10587,7 +11311,13 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return true;
|
||||
}
|
||||
func(tensor->src[0], tensor->src[1], tensor);
|
||||
if(tensor->op == GGML_OP_FLASH_ATTN) {
|
||||
ggml_cuda_flash_attn(tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
||||
} else if(tensor->op == GGML_OP_FLASH_ATTN_EXT) {
|
||||
ggml_cuda_flash_attn_ext(tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
|
||||
} else {
|
||||
func(tensor->src[0], tensor->src[1], tensor);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -11403,6 +12133,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||
case GGML_OP_UPSCALE:
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
394
ggml-metal.m
394
ggml-metal.m
@@ -141,6 +141,12 @@ enum ggml_metal_kernel_type {
|
||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
|
||||
GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
|
||||
GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
|
||||
@@ -390,6 +396,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
id<MTLFunction> metal_function = [metal_library newFunctionWithName:@"kernel_"#name]; \
|
||||
kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:metal_function error:&error]; \
|
||||
[metal_function release]; \
|
||||
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
|
||||
(int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
|
||||
(int) kernel->pipeline.threadExecutionWidth); \
|
||||
if (error) { \
|
||||
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
[metal_library release]; \
|
||||
@@ -401,130 +410,136 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
|
||||
// simd_sum and simd_max requires MTLGPUFamilyApple7
|
||||
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, get_rows_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, get_rows_q4_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, get_rows_q4_1, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, get_rows_q5_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1, get_rows_q5_1, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0, get_rows_q8_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K, get_rows_q2_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K, get_rows_q3_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K, get_rows_q4_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K, get_rows_q5_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, cpy_f32_q5_0, true);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, cpy_f32_q5_1, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, get_rows_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, get_rows_q4_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, get_rows_q4_1, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, get_rows_q5_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1, get_rows_q5_1, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0, get_rows_q8_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K, get_rows_q2_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K, get_rows_q3_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K, get_rows_q4_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K, get_rows_q5_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, cpy_f32_q5_0, true);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, cpy_f32_q5_1, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
|
||||
}
|
||||
|
||||
[metal_library release];
|
||||
@@ -640,6 +655,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
||||
case GGML_OP_PAD:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
return true;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
@@ -1171,6 +1187,8 @@ static bool ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
{
|
||||
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16);
|
||||
|
||||
int nth = 32; // SIMD width
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
@@ -2178,6 +2196,110 @@ static bool ggml_metal_graph_compute(
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
struct ggml_tensor * src2 = gf->nodes[i]->src[2];
|
||||
struct ggml_tensor * src3 = gf->nodes[i]->src[3];
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src1, src2));
|
||||
GGML_ASSERT(src3);
|
||||
|
||||
size_t offs_src2 = 0;
|
||||
size_t offs_src3 = 0;
|
||||
|
||||
GGML_ASSERT(src2);
|
||||
id<MTLBuffer> id_src2 = ggml_metal_get_buffer(src2, &offs_src2);
|
||||
|
||||
id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil;
|
||||
|
||||
GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) &&
|
||||
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
|
||||
|
||||
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
|
||||
const int64_t ne31 = src3 ? src3->ne[1] : 0;
|
||||
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
|
||||
const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
|
||||
|
||||
const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30);
|
||||
const uint64_t nb31 = src3 ? src3->nb[1] : 0;
|
||||
const uint64_t nb32 = src3 ? src3->nb[2] : 0; GGML_UNUSED(nb32);
|
||||
const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33);
|
||||
|
||||
const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
|
||||
|
||||
float scale;
|
||||
memcpy(&scale, dst->op_params, sizeof(float));
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
switch (ne00) {
|
||||
case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
|
||||
case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
|
||||
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
|
||||
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
|
||||
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
|
||||
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
GGML_METAL_LOG_ERROR("add template specialization for this size\n");
|
||||
GGML_ASSERT(false && "add template specialization for this size");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: extend if necessary
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
|
||||
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5];
|
||||
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6];
|
||||
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7];
|
||||
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8];
|
||||
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10];
|
||||
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11];
|
||||
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12];
|
||||
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13];
|
||||
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14];
|
||||
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15];
|
||||
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16];
|
||||
[encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17];
|
||||
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
|
||||
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
|
||||
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
|
||||
[encoder setBytes:&ne31 length:sizeof( int64_t) atIndex:21];
|
||||
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:22];
|
||||
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:23];
|
||||
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:24];
|
||||
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:25];
|
||||
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26];
|
||||
[encoder setBytes:&scale length:sizeof( float) atIndex:27];
|
||||
|
||||
const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !!
|
||||
const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !!
|
||||
|
||||
GGML_ASSERT(nqptg % 8 == 0);
|
||||
GGML_ASSERT(ncpsg % 32 == 0);
|
||||
|
||||
// simdgroups per threadgroup (a.k.a. warps)
|
||||
// for small batches use more simdgroups (needs more tests, to confirm if it's worth it)
|
||||
const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)) : 4;
|
||||
|
||||
const size_t smem = nqptg*(ne00 + nsg*(ncpsg + nqptg))*(sizeof(float)/2);
|
||||
|
||||
//printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength);
|
||||
GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
|
||||
[encoder setThreadgroupMemoryLength:smem atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
|
||||
} break;
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_CONT:
|
||||
@@ -2379,10 +2501,13 @@ GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backe
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) {
|
||||
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
|
||||
#ifndef GGML_METAL_NDEBUG
|
||||
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
||||
if (@available(macOS 10.12, iOS 16.0, *)) {
|
||||
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
|
||||
__func__,
|
||||
size_aligned / 1024.0 / 1024.0,
|
||||
device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
|
||||
@@ -2392,10 +2517,15 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) {
|
||||
GGML_METAL_LOG_INFO("\n");
|
||||
}
|
||||
} else {
|
||||
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
|
||||
__func__,
|
||||
size_aligned / 1024.0 / 1024.0,
|
||||
device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
UNUSED(device);
|
||||
UNUSED(size_aligned);
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
@@ -2429,8 +2559,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
|
||||
return NULL;
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
|
||||
ggml_backend_metal_log_allocated_size(device);
|
||||
ggml_backend_metal_log_allocated_size(device, size_aligned);
|
||||
|
||||
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
|
||||
}
|
||||
@@ -2517,7 +2646,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
|
||||
ggml_backend_metal_log_allocated_size(device, size_aligned);
|
||||
|
||||
++ctx->n_buffers;
|
||||
} else {
|
||||
@@ -2540,7 +2669,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
|
||||
ggml_backend_metal_log_allocated_size(device, size_step_aligned);
|
||||
|
||||
if (i + size_step < size) {
|
||||
GGML_METAL_LOG_INFO("\n");
|
||||
}
|
||||
@@ -2549,8 +2679,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_metal_log_allocated_size(device);
|
||||
|
||||
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
|
||||
}
|
||||
|
||||
|
||||
425
ggml-metal.metal
425
ggml-metal.metal
@@ -349,9 +349,9 @@ kernel void kernel_sum_rows(
|
||||
}
|
||||
|
||||
kernel void kernel_soft_max(
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
@@ -366,9 +366,9 @@ kernel void kernel_soft_max(
|
||||
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
|
||||
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr;
|
||||
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
|
||||
device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
device const half * pmask = src1 != src0 ? (device const half *) src1 + i01*ne00 : nullptr;
|
||||
device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
|
||||
// parallel max
|
||||
float lmax = -INFINITY;
|
||||
@@ -435,14 +435,14 @@ kernel void kernel_soft_max(
|
||||
}
|
||||
|
||||
kernel void kernel_soft_max_4(
|
||||
device const float * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant float & scale,
|
||||
threadgroup float * buf [[threadgroup(0)]],
|
||||
threadgroup float * buf [[threadgroup(0)]],
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
uint tpitg[[thread_position_in_threadgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
@@ -452,15 +452,15 @@ kernel void kernel_soft_max_4(
|
||||
const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
|
||||
const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
|
||||
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
|
||||
device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
|
||||
device const half4 * pmask = src1 != src0 ? (device const half4 *) src1 + i01*ne00/4 : nullptr;
|
||||
device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
|
||||
|
||||
// parallel max
|
||||
float4 lmax4 = -INFINITY;
|
||||
|
||||
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
|
||||
lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4) (pmask ? pmask[i00] : 0.0f));
|
||||
}
|
||||
|
||||
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
|
||||
@@ -486,7 +486,7 @@ kernel void kernel_soft_max_4(
|
||||
// parallel sum
|
||||
float4 lsum4 = 0.0f;
|
||||
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
||||
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
|
||||
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4) (pmask ? pmask[i00] : 0.0f)) - max_val);
|
||||
lsum4 += exp_psrc4;
|
||||
pdst4[i00] = exp_psrc4;
|
||||
}
|
||||
@@ -1984,6 +1984,401 @@ kernel void kernel_leaky_relu_f32(
|
||||
dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope;
|
||||
}
|
||||
|
||||
typedef void (flash_attn_ext_f16_t)(
|
||||
device const char * q,
|
||||
device const char * k,
|
||||
device const char * v,
|
||||
device const char * mask,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant int64_t & ne31,
|
||||
constant uint64_t & nb31,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant float & scale,
|
||||
threadgroup half * shared,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]);
|
||||
|
||||
// ref: https://arxiv.org/pdf/2307.08691.pdf
|
||||
template<int64_t D, int64_t Q, int64_t C> // head size, queries per threadgroup, cache items per threadgroup
|
||||
kernel void kernel_flash_attn_ext_f16(
|
||||
device const char * q,
|
||||
device const char * k,
|
||||
device const char * v,
|
||||
device const char * mask,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant int64_t & ne31,
|
||||
constant uint64_t & nb31,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant float & scale,
|
||||
threadgroup half * shared [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
const uint nsg = ntg.y; // number of simdgroups
|
||||
|
||||
const int64_t iq3 = tgpig[2];
|
||||
const int64_t iq2 = tgpig[1];
|
||||
const int64_t iq1 = tgpig[0]*Q;
|
||||
|
||||
const int64_t D4 = D/4;
|
||||
const int64_t D8 = D/8;
|
||||
const int64_t Q8 = Q/8;
|
||||
const int64_t NW = N_SIMDWIDTH;
|
||||
const int64_t SH = (C + Q); // shared memory per simdgroup in (half)
|
||||
|
||||
const int64_t T = D + nsg*SH; // shared memory size per query in (half)
|
||||
const int64_t T4 = T/4; // shared memory size per query in (half4)
|
||||
|
||||
threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data
|
||||
threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4
|
||||
threadgroup half * ss = (threadgroup half *) (shared + sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
|
||||
|
||||
// store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper)
|
||||
simdgroup_half8x8 lo[Q8][D8];
|
||||
|
||||
// load heads from Q to shared memory
|
||||
for (int64_t j = sgitg; j < Q; j += nsg) {
|
||||
device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03));
|
||||
|
||||
for (int64_t i = tiisg; i < D4; i += NW) {
|
||||
if (iq1 + j < ne01) {
|
||||
sq4[j*T4 + i] = (half4) q4[i];
|
||||
} else {
|
||||
sq4[j*T4 + i] = 0.0h;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// zero out lo
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
for (int64_t i = 0; i < D8; ++i) {
|
||||
lo[j][i] = make_filled_simdgroup_matrix<half, 8>(0.0h);
|
||||
}
|
||||
}
|
||||
|
||||
// zero out shared memory SH
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
for (int64_t i = tiisg; i < SH; i += NW) {
|
||||
ss[j*T + i] = 0.0h;
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
{
|
||||
half S[Q] = { [0 ... Q-1] = 0.0h };
|
||||
half M[Q] = { [0 ... Q-1] = -INFINITY };
|
||||
|
||||
// assume K and V are same shape
|
||||
const int64_t ne22 = ne12;
|
||||
const int64_t ne23 = ne13;
|
||||
|
||||
const uint64_t nb21 = nb11;
|
||||
const uint64_t nb22 = nb12;
|
||||
const uint64_t nb23 = nb13;
|
||||
|
||||
// broadcast
|
||||
const int64_t rk2 = ne02/ne12;
|
||||
const int64_t rk3 = ne03/ne13;
|
||||
|
||||
const int64_t rv2 = ne02/ne22;
|
||||
const int64_t rv3 = ne03/ne23;
|
||||
|
||||
// k indices
|
||||
const int64_t ik2 = iq2 / rk2;
|
||||
const int64_t ik3 = iq3 / rk3;
|
||||
|
||||
// v indices
|
||||
const int64_t iv2 = iq2 / rv2;
|
||||
const int64_t iv3 = iq3 / rv3;
|
||||
|
||||
// load the queries from shared memory into local memory
|
||||
simdgroup_half8x8 mq[Q8][D8];
|
||||
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
for (int64_t i = 0; i < D8; ++i) {
|
||||
simdgroup_load(mq[j][i], sq + 8*j*T + i*8, T);
|
||||
}
|
||||
}
|
||||
|
||||
// pointer to the mask
|
||||
device const half * mp = (device const half *) (mask + iq1*nb31);
|
||||
|
||||
// prepare diagonal scale matrix
|
||||
simdgroup_half8x8 mscale(scale);
|
||||
|
||||
// loop over the KV cache
|
||||
// each simdgroup handles blocks of Q rows and C columns
|
||||
for (int64_t ic = C*sgitg; ic < ne11; ic += C*nsg) {
|
||||
// Q*K^T
|
||||
{
|
||||
for (int cc = 0; cc < C/8; ++cc) {
|
||||
simdgroup_half8x8 mqk[Q8];
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
mqk[j] = make_filled_simdgroup_matrix<half, 8>(0.h);
|
||||
}
|
||||
|
||||
device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13));
|
||||
|
||||
for (int64_t i = 0; i < D8; ++i) {
|
||||
simdgroup_half8x8 mk;
|
||||
simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose
|
||||
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
simdgroup_multiply_accumulate(mqk[j], mq[j][i], mk, mqk[j]);
|
||||
}
|
||||
}
|
||||
|
||||
// mqk = mqk*scale + mask
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
simdgroup_half8x8 mm;
|
||||
simdgroup_load(mm, mp + 8*j*(nb31/sizeof(half)) + ic + 8*cc, nb31/sizeof(half), 0, false);
|
||||
simdgroup_multiply_accumulate(mqk[j], mqk[j], mscale, mm);
|
||||
|
||||
simdgroup_store(mqk[j], ss + 8*j*T + 8*cc, T, 0, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// used to detect blocks full of -INF
|
||||
half smax = -INFINITY;
|
||||
|
||||
// online softmax
|
||||
if (C == 32) {
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
const int64_t p = tiisg;
|
||||
|
||||
const half m = M[j];
|
||||
const half s = ss[j*T + p];
|
||||
|
||||
smax = simd_max(max(smax, s));
|
||||
M[j] = simd_max(max(M[j], s));
|
||||
|
||||
const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]);
|
||||
const half vs = s == -INFINITY ? 0.0h : exp(s - M[j]);
|
||||
|
||||
S[j] = S[j]*ms + simd_sum(vs);
|
||||
|
||||
// create a QxQ diagonal matrix for rescaling the output
|
||||
if (p == j) {
|
||||
ss[j*T + C + j] = ms;
|
||||
}
|
||||
|
||||
// the P matrix from the paper (Q rows, C columns)
|
||||
ss[j*T + p] = vs;
|
||||
}
|
||||
} else {
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
const half m = M[j];
|
||||
|
||||
for (int64_t p = tiisg; p < C; p += NW) {
|
||||
const half s = ss[j*T + p];
|
||||
|
||||
smax = simd_max(max(smax, s));
|
||||
M[j] = simd_max(max(M[j], s));
|
||||
}
|
||||
|
||||
const half ms = m == -INFINITY ? 0.0h : exp(m - M[j]);
|
||||
|
||||
S[j] = S[j]*ms;
|
||||
|
||||
// create a QxQ diagonal matrix for rescaling the output
|
||||
if (tiisg == j) {
|
||||
ss[j*T + C + j] = ms;
|
||||
}
|
||||
|
||||
for (int64_t p = tiisg; p < C; p += NW) {
|
||||
const half s = ss[j*T + p];
|
||||
|
||||
const half vs = s == -INFINITY ? 0.0h : exp(s - M[j]);
|
||||
|
||||
S[j] = S[j] + simd_sum(vs);
|
||||
|
||||
// the P matrix from the paper (Q rows, C columns)
|
||||
ss[j*T + p] = vs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// skip -INF blocks
|
||||
if (smax == -INFINITY) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// O = diag(ms)*O
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
simdgroup_half8x8 mm;
|
||||
simdgroup_load(mm, ss + 8*j*T + C + 8*j, T, 0, false);
|
||||
|
||||
for (int64_t i = 0; i < D8; ++i) {
|
||||
simdgroup_multiply(lo[j][i], mm, lo[j][i]);
|
||||
}
|
||||
}
|
||||
|
||||
// O = O + (Q*K^T)*V
|
||||
{
|
||||
for (int cc = 0; cc < C/8; ++cc) {
|
||||
device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23));
|
||||
|
||||
for (int64_t i = 0; i < D8; ++i) {
|
||||
simdgroup_half8x8 mk;
|
||||
simdgroup_load(mk, pv + i*8, nb21/sizeof(half), 0, false);
|
||||
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
simdgroup_half8x8 mv;
|
||||
simdgroup_load(mv, ss + 8*j*T + 8*cc, T, 0, false);
|
||||
|
||||
simdgroup_multiply_accumulate(lo[j][i], mv, mk, lo[j][i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// these are needed for reducing the results from the simdgroups (reuse the ss buffer)
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
if (tiisg == 0) {
|
||||
ss[j*T + 0] = S[j];
|
||||
ss[j*T + 1] = M[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reduce the warps sequentially
|
||||
for (int64_t sg = 1; sg < nsg; ++sg) {
|
||||
half S = { 0.0h };
|
||||
half M = { -INFINITY };
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// each simdgroup stores its output to shared memory, reusing sq
|
||||
if (sgitg == sg) {
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
for (int64_t i = 0; i < D8; ++i) {
|
||||
simdgroup_store(lo[j][i], sq + 8*j*T + i*8, T, 0, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// the first simdgroup accumulates the results from the other simdgroups
|
||||
if (sgitg == 0) {
|
||||
for (int64_t j = 0; j < Q; ++j) {
|
||||
const half S0 = ss[j*T + 0];
|
||||
const half S1 = ss[j*T + sg*SH + 0];
|
||||
|
||||
const half M0 = ss[j*T + 1];
|
||||
const half M1 = ss[j*T + sg*SH + 1];
|
||||
|
||||
M = max(M0, M1);
|
||||
|
||||
const half ms0 = M0 == -INFINITY ? 0.0h : exp(M0 - M);
|
||||
const half ms1 = M1 == -INFINITY ? 0.0h : exp(M1 - M);
|
||||
|
||||
S = S0*ms0 + S1*ms1;
|
||||
|
||||
if (tiisg == 0) {
|
||||
ss[j*T + 0] = S;
|
||||
ss[j*T + 1] = M;
|
||||
|
||||
ss[j*T + C + j ] = ms0;
|
||||
ss[j*T + C + j + sg*SH] = ms1;
|
||||
}
|
||||
}
|
||||
|
||||
// O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
simdgroup_half8x8 t;
|
||||
simdgroup_half8x8 ms0;
|
||||
simdgroup_half8x8 ms1;
|
||||
|
||||
simdgroup_load(ms0, ss + 8*j*T + C + 8*j, T, 0, false);
|
||||
simdgroup_load(ms1, ss + 8*j*T + C + 8*j + sg*SH, T, 0, false);
|
||||
|
||||
for (int64_t i = 0; i < D8; ++i) {
|
||||
simdgroup_load (t, sq + 8*j*T + i*8, T, 0, false);
|
||||
simdgroup_multiply(t, ms1, t);
|
||||
|
||||
simdgroup_multiply_accumulate(lo[j][i], ms0, lo[j][i], t);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// store result to shared memory (reuse sq)
|
||||
if (sgitg == 0) {
|
||||
for (int64_t j = 0; j < Q8; ++j) {
|
||||
for (int64_t i = 0; i < D8; ++i) {
|
||||
simdgroup_store(lo[j][i], sq + 8*j*T + i*8, T, 0, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
device float4 * dst4 = (device float4 *) dst;
|
||||
|
||||
// final rescale with 1/S and store to global memory
|
||||
if (sgitg == 0) {
|
||||
for (int64_t j = 0; j < Q && iq1 + j < ne01; ++j) {
|
||||
const half S = ss[j*T + 0];
|
||||
|
||||
for (int64_t i = tiisg; i < D4; i += NW) {
|
||||
dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) sq4[j*T4 + i]/S;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64, 8, 32>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80, 8, 32>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96, 8, 32>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112, 8, 32>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128, 8, 32>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256, 8, 32>;
|
||||
|
||||
kernel void kernel_cpy_f16_f16(
|
||||
device const half * src0,
|
||||
device half * dst,
|
||||
|
||||
124
ggml-sycl.cpp
124
ggml-sycl.cpp
@@ -337,7 +337,6 @@ namespace dpct
|
||||
}
|
||||
size_t get_global_mem_size() const { return _global_mem_size; }
|
||||
size_t get_local_mem_size() const { return _local_mem_size; }
|
||||
size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; }
|
||||
/// Returns the maximum clock rate of device's global memory in kHz. If
|
||||
/// compiler does not support this API then returns default value 3200000 kHz.
|
||||
unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
|
||||
@@ -399,10 +398,6 @@ namespace dpct
|
||||
{
|
||||
_local_mem_size = local_mem_size;
|
||||
}
|
||||
void set_max_mem_alloc_size(size_t max_mem_alloc_size)
|
||||
{
|
||||
_max_mem_alloc_size = max_mem_alloc_size;
|
||||
}
|
||||
void set_max_work_group_size(int max_work_group_size)
|
||||
{
|
||||
_max_work_group_size = max_work_group_size;
|
||||
@@ -470,7 +465,6 @@ namespace dpct
|
||||
int _max_register_size_per_work_group;
|
||||
size_t _global_mem_size;
|
||||
size_t _local_mem_size;
|
||||
size_t _max_mem_alloc_size;
|
||||
size_t _max_nd_range_size[3];
|
||||
int _max_nd_range_size_i[3];
|
||||
uint32_t _device_id;
|
||||
@@ -522,7 +516,6 @@ namespace dpct
|
||||
dev.get_info<sycl::info::device::max_work_group_size>());
|
||||
prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
|
||||
prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
|
||||
prop.set_max_mem_alloc_size(dev.get_info<sycl::info::device::max_mem_alloc_size>());
|
||||
|
||||
#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
|
||||
if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
|
||||
@@ -651,11 +644,6 @@ namespace dpct
|
||||
return get_device_info().get_global_mem_size();
|
||||
}
|
||||
|
||||
size_t get_max_mem_alloc_size() const
|
||||
{
|
||||
return get_device_info().get_max_mem_alloc_size();
|
||||
}
|
||||
|
||||
/// Get the number of bytes of free and total memory on the SYCL device.
|
||||
/// \param [out] free_memory The number of bytes of free memory on the SYCL device.
|
||||
/// \param [out] total_memory The number of bytes of total memory on the SYCL device.
|
||||
@@ -1366,7 +1354,6 @@ namespace dpct
|
||||
}
|
||||
#else
|
||||
return q.memcpy(to_ptr, from_ptr, size, dep_events);
|
||||
GGML_UNUSED(direction);
|
||||
#endif // DPCT_USM_LEVEL_NONE
|
||||
}
|
||||
|
||||
@@ -1668,7 +1655,7 @@ namespace dpct
|
||||
using Ty = typename DataType<T>::T2;
|
||||
Ty s_h;
|
||||
if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
|
||||
detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host)
|
||||
detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
|
||||
.wait();
|
||||
else
|
||||
s_h = *reinterpret_cast<const Ty *>(s);
|
||||
@@ -1692,20 +1679,6 @@ namespace dpct
|
||||
int ldb, const void *beta, void *c, int ldc)
|
||||
{
|
||||
#ifndef __INTEL_MKL__
|
||||
GGML_UNUSED(q);
|
||||
GGML_UNUSED(a_trans);
|
||||
GGML_UNUSED(b_trans);
|
||||
GGML_UNUSED(m);
|
||||
GGML_UNUSED(n);
|
||||
GGML_UNUSED(k);
|
||||
GGML_UNUSED(alpha);
|
||||
GGML_UNUSED(a);
|
||||
GGML_UNUSED(lda);
|
||||
GGML_UNUSED(b);
|
||||
GGML_UNUSED(ldb);
|
||||
GGML_UNUSED(beta);
|
||||
GGML_UNUSED(c);
|
||||
GGML_UNUSED(ldc);
|
||||
throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
|
||||
"Project does not support this API.");
|
||||
#else
|
||||
@@ -1845,7 +1818,7 @@ namespace dpct
|
||||
|
||||
template <typename T>
|
||||
T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
|
||||
unsigned int logical_sub_group_size = 32)
|
||||
int logical_sub_group_size = 32)
|
||||
{
|
||||
unsigned int id = g.get_local_linear_id();
|
||||
unsigned int start_index =
|
||||
@@ -2175,7 +2148,6 @@ namespace dpct
|
||||
}
|
||||
#else
|
||||
return q.memcpy(to_ptr, from_ptr, size, dep_events);
|
||||
GGML_UNUSED(direction);
|
||||
#endif // DPCT_USM_LEVEL_NONE
|
||||
}
|
||||
|
||||
@@ -2956,6 +2928,7 @@ void ggml_sycl_set_main_device(int main_device);
|
||||
void ggml_sycl_set_mul_mat_q(bool mul_mat_q);
|
||||
void ggml_sycl_set_scratch_size(size_t scratch_size);
|
||||
void ggml_sycl_free_scratch(void);
|
||||
int ggml_sycl_get_device_count(void);
|
||||
void ggml_sycl_get_device_description(int device, char * description, size_t description_size);
|
||||
bool ggml_backend_is_sycl(ggml_backend_t backend);
|
||||
int ggml_backend_sycl_get_device(ggml_backend_t backend);
|
||||
@@ -3318,7 +3291,7 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo
|
||||
std::ofstream logfile;
|
||||
logfile.open(filename);
|
||||
// printf("local buf element %d\n", total_elements);
|
||||
for(size_t i=0; i<total_elements; i++){
|
||||
for(int i=0; i<total_elements; i++){
|
||||
if((i+1)%20 ==0) logfile <<std::endl;
|
||||
else logfile << local_buf[i] <<" ";
|
||||
}
|
||||
@@ -3412,7 +3385,6 @@ static __dpct_inline__ float warp_reduce_max(float x,
|
||||
|
||||
static __dpct_inline__ float op_repeat(const float a, const float b) {
|
||||
return b;
|
||||
GGML_UNUSED(a);
|
||||
}
|
||||
|
||||
static __dpct_inline__ float op_add(const float a, const float b) {
|
||||
@@ -8247,8 +8219,7 @@ static void clamp_f32(const float * x, float * dst, const float min, const float
|
||||
dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void im2col_kernel(const float *x, T *dst, int offset_delta,
|
||||
static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
|
||||
int IW, int IH, int OW, int KW, int KH,
|
||||
int pelements, int CHW, int s0, int s1, int p0,
|
||||
int p1, int d0, int d1,
|
||||
@@ -11020,8 +10991,7 @@ static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void im2col_sycl(const float *x, T *dst, int IW, int IH,
|
||||
static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH,
|
||||
int OW, int OH, int KW, int KH, int IC,
|
||||
int offset_delta, int s0, int s1, int p0,
|
||||
int p1, int d0, int d1,
|
||||
@@ -11038,7 +11008,7 @@ static void im2col_sycl(const float *x, T *dst, int IW, int IH,
|
||||
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
|
||||
sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
im2col_kernel(x, dst, offset_delta, IW, IH, OW, KW, KH,
|
||||
im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH,
|
||||
parallel_elements, (IC * KH * KW), s0, s1, p0,
|
||||
p1, d0, d1, item_ct1);
|
||||
});
|
||||
@@ -11175,10 +11145,10 @@ DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.
|
||||
// g_sycl_pool_handles[GGML_SYCL_MAX_DEVICES];
|
||||
static dpct::device_ptr g_sycl_pool_addr[GGML_SYCL_MAX_DEVICES] = {0};
|
||||
static size_t g_sycl_pool_used[GGML_SYCL_MAX_DEVICES] = {0};
|
||||
static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
|
||||
|
||||
static void *ggml_sycl_pool_malloc_vmm(size_t size, size_t *actual_size) try {
|
||||
GGML_UNUSED(size);
|
||||
GGML_UNUSED(actual_size);
|
||||
|
||||
return NULL;
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
@@ -11342,10 +11312,10 @@ void ggml_init_sycl() try {
|
||||
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
||||
int64_t total_vram = 0;
|
||||
|
||||
#if defined(GGML_SYCL_F16)
|
||||
fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__);
|
||||
#if defined(GGML_SYCL_FP16)
|
||||
fprintf(stderr, "%s: GGML_SYCL_FP16: yes\n", __func__);
|
||||
#else
|
||||
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
||||
fprintf(stderr, "%s: GGML_SYCL_FP16: no\n", __func__);
|
||||
#endif
|
||||
|
||||
|
||||
@@ -11368,8 +11338,9 @@ void ggml_init_sycl() try {
|
||||
if(id!=user_device_id) continue;
|
||||
|
||||
device_inx++;
|
||||
int device_vmm = 0;
|
||||
|
||||
g_device_caps[device_inx].vmm = 0;
|
||||
g_device_caps[device_inx].vmm = !!device_vmm;
|
||||
g_device_caps[device_inx].device_id = id;
|
||||
g_sycl_device_id2index[id].index = device_inx;
|
||||
|
||||
@@ -11377,12 +11348,18 @@ void ggml_init_sycl() try {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(id))));
|
||||
|
||||
// fprintf(stderr,
|
||||
// " Device %d: %s, compute capability %d.%d, VMM: %s\n", id,
|
||||
// prop.get_name(), prop.get_major_version(),
|
||||
// prop.get_minor_version(), device_vmm ? "yes" : "no");
|
||||
|
||||
g_tensor_split[device_inx] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
g_device_caps[device_inx].cc =
|
||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||
|
||||
// printf("g_device_caps[%d].cc=%d\n", device_inx, g_device_caps[device_inx].cc);
|
||||
}
|
||||
device_inx = -1;
|
||||
for (int id = 0; id < g_all_sycl_device_count; ++id) {
|
||||
@@ -12218,6 +12195,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
|
||||
// ldc == nrows of the matrix that cuBLAS writes into
|
||||
int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
|
||||
|
||||
const int compute_capability = g_device_caps[id].cc;
|
||||
#ifdef GGML_SYCL_F16
|
||||
bool use_fp16 = true; // TODO(Yu) SYCL capability check
|
||||
#else
|
||||
@@ -12426,7 +12404,7 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
@@ -12449,11 +12427,8 @@ inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
|
||||
|
||||
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
|
||||
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
||||
} else {
|
||||
im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
||||
}
|
||||
im2col_f32_f16_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
|
||||
IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
|
||||
|
||||
(void) src0;
|
||||
(void) src0_dd;
|
||||
@@ -12705,7 +12680,7 @@ static void ggml_sycl_set_peer_access(const int n_tokens) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// int can_access_peer;
|
||||
int can_access_peer;
|
||||
// SYCL_CHECK(syclDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
||||
// if (can_access_peer) {
|
||||
// if (enable_peer_access) {
|
||||
@@ -12730,6 +12705,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
const int64_t nrows0 = ggml_nrows(src0);
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
@@ -13825,6 +13801,13 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
|
||||
src1_row_extra.data_device[g_main_device_index] = src1_contiguous.get();
|
||||
dst_row_extra.data_device[g_main_device_index] = dst_contiguous.get();
|
||||
|
||||
const dpct::memcpy_direction src1_kind =
|
||||
src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device
|
||||
: dpct::device_to_device;
|
||||
const dpct::memcpy_direction dst_kind = dst->backend == GGML_BACKEND_CPU
|
||||
? dpct::device_to_host
|
||||
: dpct::device_to_device;
|
||||
|
||||
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
|
||||
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
|
||||
|
||||
@@ -14510,37 +14493,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
|
||||
int max_compute_units = -1;
|
||||
for(int i=0;i<max_len;i++) id_list[i] = 0;
|
||||
|
||||
int device_count = dpct::dev_mgr::instance().device_count();
|
||||
|
||||
for(int id=0; id< device_count; id++){
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
if (!device.is_gpu()) continue;
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if(max_compute_units < prop.get_max_compute_units()) max_compute_units = prop.get_max_compute_units();
|
||||
}
|
||||
|
||||
for(int id=0;id< device_count;id++){
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
if (!device.is_gpu()) continue;
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if(max_compute_units == prop.get_max_compute_units() && prop.get_major_version() == 1 ){
|
||||
id_list[id] = 1;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
int ggml_sycl_get_device_count() try {
|
||||
int device_count;
|
||||
if (CHECK_TRY_ERROR(device_count =
|
||||
@@ -14555,7 +14507,7 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
|
||||
void ggml_sycl_get_device_description(int device, char *description,
|
||||
size_t description_size) try {
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
@@ -14806,12 +14758,6 @@ static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_ty
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
return dpct::get_current_device().get_max_mem_alloc_size();
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
int64_t row_low = 0;
|
||||
int64_t row_high = ggml_nrows(tensor);
|
||||
@@ -14842,7 +14788,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_sycl_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
|
||||
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
|
||||
/* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
|
||||
/* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
|
||||
/* .is_host = */ nullptr,
|
||||
|
||||
@@ -22,8 +22,7 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
|
||||
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
11140
ggml-vulkan-shaders.hpp
11140
ggml-vulkan-shaders.hpp
File diff suppressed because it is too large
Load Diff
437
ggml-vulkan.cpp
437
ggml-vulkan.cpp
File diff suppressed because it is too large
Load Diff
335
ggml.c
335
ggml.c
@@ -865,7 +865,7 @@ do { \
|
||||
|
||||
#if defined(__F16C__)
|
||||
// the _mm256_cvt intrinsics require F16C
|
||||
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
|
||||
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
|
||||
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
|
||||
#else
|
||||
static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
|
||||
@@ -1371,6 +1371,37 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
|
||||
#endif
|
||||
}
|
||||
|
||||
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// xs and vs are byte strides of x and v
|
||||
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {
|
||||
|
||||
@@ -1455,6 +1486,35 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
||||
#endif
|
||||
}
|
||||
|
||||
inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); }
|
||||
inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
|
||||
inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
|
||||
@@ -1701,6 +1761,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"LEAKY_RELU",
|
||||
|
||||
"FLASH_ATTN",
|
||||
"FLASH_ATTN_EXT",
|
||||
"FLASH_FF",
|
||||
"FLASH_ATTN_BACK",
|
||||
"WIN_PART",
|
||||
@@ -1725,7 +1786,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"CROSS_ENTROPY_LOSS_BACK",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
|
||||
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -1787,6 +1848,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"leaky_relu(x)",
|
||||
|
||||
"flash_attn(x)",
|
||||
"flash_attn_ext(x)",
|
||||
"flash_ff(x)",
|
||||
"flash_attn_back(x)",
|
||||
"win_part(x)",
|
||||
@@ -1811,7 +1873,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"cross_entropy_loss_back(x,y)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
|
||||
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -4188,6 +4250,8 @@ struct ggml_tensor * ggml_mul_mat(
|
||||
void ggml_mul_mat_set_prec(
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_prec prec) {
|
||||
GGML_ASSERT(a->op == GGML_OP_MUL_MAT);
|
||||
|
||||
const int32_t prec_i32 = (int32_t) prec;
|
||||
|
||||
ggml_set_op_params_i32(a, 0, prec_i32);
|
||||
@@ -5021,10 +5085,11 @@ static struct ggml_tensor * ggml_soft_max_impl(
|
||||
bool inplace) {
|
||||
GGML_ASSERT(ggml_is_contiguous(a));
|
||||
if (mask) {
|
||||
GGML_ASSERT(mask->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(ggml_is_contiguous(mask));
|
||||
GGML_ASSERT(mask->ne[2] == 1);
|
||||
GGML_ASSERT(mask->ne[3] == 1);
|
||||
GGML_ASSERT(ggml_can_repeat_rows(mask, a));
|
||||
GGML_ASSERT(mask->ne[1] >= a->ne[1]);
|
||||
}
|
||||
|
||||
bool is_node = false;
|
||||
@@ -5775,6 +5840,59 @@ struct ggml_tensor * ggml_flash_attn(
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_flash_attn_ext
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * mask,
|
||||
float scale) {
|
||||
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
||||
// TODO: check if vT can be multiplied by (k*qT)
|
||||
if (mask) {
|
||||
GGML_ASSERT(ggml_is_contiguous(mask));
|
||||
GGML_ASSERT(mask->ne[2] == 1);
|
||||
GGML_ASSERT(mask->ne[3] == 1);
|
||||
GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
|
||||
"the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
|
||||
//GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
|
||||
}
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (q->grad || k->grad || v->grad) {
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
// permute(0, 2, 1, 3)
|
||||
int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, ne);
|
||||
|
||||
float params[] = { scale };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
result->op = GGML_OP_FLASH_ATTN_EXT;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = q;
|
||||
result->src[1] = k;
|
||||
result->src[2] = v;
|
||||
result->src[3] = mask;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
void ggml_flash_attn_ext_set_prec(
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_prec prec) {
|
||||
GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
|
||||
|
||||
const int32_t prec_i32 = (int32_t) prec;
|
||||
|
||||
ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
|
||||
}
|
||||
|
||||
// ggml_flash_ff
|
||||
|
||||
struct ggml_tensor * ggml_flash_ff(
|
||||
@@ -11437,12 +11555,14 @@ static void ggml_compute_forward_soft_max_f32(
|
||||
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
|
||||
|
||||
// broadcast the mask across rows
|
||||
float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
|
||||
ggml_fp16_t * mp = src1 ? (ggml_fp16_t *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
|
||||
|
||||
ggml_vec_cpy_f32 (nc, wp, sp);
|
||||
ggml_vec_scale_f32(nc, wp, scale);
|
||||
if (mp) {
|
||||
ggml_vec_acc_f32(nc, wp, mp);
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
wp[i] += GGML_FP16_TO_FP32(mp[i]);
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
@@ -13552,6 +13672,197 @@ static void ggml_compute_forward_flash_attn(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_flash_attn_ext
|
||||
|
||||
static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * q,
|
||||
const struct ggml_tensor * k,
|
||||
const struct ggml_tensor * v,
|
||||
const struct ggml_tensor * mask,
|
||||
struct ggml_tensor * dst) {
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t D = neq0;
|
||||
const int64_t N = neq1;
|
||||
|
||||
GGML_ASSERT(ne0 == D);
|
||||
GGML_ASSERT(ne2 == N);
|
||||
|
||||
GGML_ASSERT(nbq0 == sizeof(float));
|
||||
GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
|
||||
|
||||
GGML_ASSERT(neq0 == D);
|
||||
GGML_ASSERT(nek0 == D);
|
||||
GGML_ASSERT(nev0 == D);
|
||||
|
||||
GGML_ASSERT(neq1 == N);
|
||||
GGML_ASSERT(nev0 == D);
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
// broadcast factors
|
||||
const int64_t rk2 = neq2/nek2;
|
||||
const int64_t rk3 = neq3/nek3;
|
||||
|
||||
const int64_t rv2 = neq2/nev2;
|
||||
const int64_t rv3 = neq3/nev3;
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by q rows using ggml_vec_dot_f32
|
||||
|
||||
// total rows in q
|
||||
const int nr = neq1*neq2*neq3;
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
float scale = 1.0f;
|
||||
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
|
||||
|
||||
// loop over n_batch and n_head
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// q indices
|
||||
const int iq3 = ir/(neq2*neq1);
|
||||
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
||||
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
||||
|
||||
float S = 0.0f;
|
||||
float M = -INFINITY;
|
||||
|
||||
float * V32 = (float *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32);
|
||||
ggml_fp16_t * Q16 = (ggml_fp16_t *) (V32); // reuse memory
|
||||
ggml_fp16_t * V16 = (ggml_fp16_t *) (V32 + D);
|
||||
|
||||
memset(V16, 0, D*sizeof(ggml_fp16_t));
|
||||
|
||||
const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL;
|
||||
|
||||
// k indices
|
||||
const int ik3 = iq3 / rk3;
|
||||
const int ik2 = iq2 / rk2;
|
||||
|
||||
// v indices
|
||||
const int iv3 = iq3 / rv3;
|
||||
const int iv2 = iq2 / rv2;
|
||||
|
||||
// online softmax / attention
|
||||
// loop over n_kv and n_head_kv
|
||||
// ref: https://arxiv.org/pdf/2112.05682.pdf
|
||||
for (int64_t ic = 0; ic < nek1; ++ic) {
|
||||
const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
|
||||
if (mv == -INFINITY) {
|
||||
continue;
|
||||
}
|
||||
|
||||
float s;
|
||||
|
||||
// convert Q to F16 in V32
|
||||
{
|
||||
const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3));
|
||||
|
||||
for (int64_t d = 0; d < D; ++d) {
|
||||
Q16[d] = GGML_FP32_TO_FP16(pq[d]);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_vec_dot_f16(D,
|
||||
&s,
|
||||
(ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
||||
Q16);
|
||||
|
||||
s = s*scale + mv;
|
||||
|
||||
const float Mold = M;
|
||||
|
||||
float ms = 1.0f;
|
||||
float vs = 1.0f;
|
||||
|
||||
if (s > M) {
|
||||
M = s;
|
||||
ms = expf(Mold - M);
|
||||
|
||||
// V = V*expf(Mold - M)
|
||||
ggml_vec_scale_f16(D, V16, ms);
|
||||
} else {
|
||||
vs = expf(s - M);
|
||||
}
|
||||
|
||||
const ggml_fp16_t * v16 = (const ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
|
||||
|
||||
// V += v*expf(s - M)
|
||||
ggml_vec_mad_f16(D, V16, v16, vs);
|
||||
|
||||
S = S*ms + vs;
|
||||
}
|
||||
|
||||
// V /= S
|
||||
for (int64_t d = 0; d < D; ++d) {
|
||||
V32[d] = GGML_FP16_TO_FP32(V16[d])/S;
|
||||
}
|
||||
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// original
|
||||
//memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
|
||||
|
||||
// permute(0, 2, 1, 3)
|
||||
memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_flash_attn_ext(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * q,
|
||||
const struct ggml_tensor * k,
|
||||
const struct ggml_tensor * v,
|
||||
const struct ggml_tensor * mask,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (dst->op_params[1]) {
|
||||
case GGML_PREC_DEFAULT:
|
||||
{
|
||||
ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
// TODO: implement F32 precision
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_flash_ff
|
||||
|
||||
static void ggml_compute_forward_flash_ff_f16(
|
||||
@@ -15086,6 +15397,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
const bool masked = t != 0;
|
||||
ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
|
||||
@@ -16082,6 +16397,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
struct ggml_tensor * flash_grad = NULL;
|
||||
if (src0->grad || src1->grad || tensor->src[2]->grad) {
|
||||
@@ -16810,6 +17126,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
@@ -17204,6 +17521,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
const int64_t ne00 = node->src[0]->ne[0]; // D
|
||||
|
||||
cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
if (node->src[1]->type == GGML_TYPE_F32) {
|
||||
|
||||
20
ggml.h
20
ggml.h
@@ -454,6 +454,7 @@ extern "C" {
|
||||
GGML_OP_LEAKY_RELU,
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_ATTN_EXT,
|
||||
GGML_OP_FLASH_FF,
|
||||
GGML_OP_FLASH_ATTN_BACK,
|
||||
GGML_OP_WIN_PART,
|
||||
@@ -1645,6 +1646,25 @@ extern "C" {
|
||||
struct ggml_tensor * v,
|
||||
bool masked);
|
||||
|
||||
#define GGML_KQ_MASK_PAD 32
|
||||
|
||||
// q: [n_embd, n_batch, n_head, 1]
|
||||
// k: [n_embd, n_kv, n_head_kv, 1]
|
||||
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * mask,
|
||||
float scale);
|
||||
|
||||
GGML_API void ggml_flash_attn_ext_set_prec(
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_prec prec);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
|
||||
@@ -157,10 +157,19 @@ struct block_q6_K
|
||||
|
||||
# Dequant functions
|
||||
shader_f16_dequant_func = """
|
||||
#define DEQUANT_FUNC f16vec2 v = f16vec2(data_a[ib + 0], data_a[ib + 1]);
|
||||
"""
|
||||
shader_f16_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]);
|
||||
"""
|
||||
|
||||
shader_q4_0_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
||||
f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \
|
||||
v = (v - 8.0hf)*d;
|
||||
"""
|
||||
shader_q4_0_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
const uint vui = uint(data_a[ib].qs[iqs]); \
|
||||
vec2 v = vec2(vui & 0xF, vui >> 4); \
|
||||
@@ -168,6 +177,13 @@ v = (v - 8.0f)*d;
|
||||
"""
|
||||
|
||||
shader_q4_1_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
const float16_t m = data_a[ib].m; \
|
||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
||||
f16vec2 v = f16vec2(vui & 0xF, vui >> 4); \
|
||||
v = v*d + m;
|
||||
"""
|
||||
shader_q4_1_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
const float m = float(data_a[ib].m); \
|
||||
const uint vui = uint(data_a[ib].qs[iqs]); \
|
||||
@@ -176,6 +192,14 @@ v = v*d + m;
|
||||
"""
|
||||
|
||||
shader_q5_0_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
|
||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
|
||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
||||
f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
|
||||
v = (v - 16.0hf) * d;
|
||||
"""
|
||||
shader_q5_0_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \
|
||||
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \
|
||||
@@ -185,6 +209,14 @@ v = (v - 16.0f) * d;
|
||||
"""
|
||||
|
||||
shader_q5_1_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
const float16_t m = data_a[ib].m; \
|
||||
const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
|
||||
const uint8_t vui = data_a[ib].qs[iqs]; \
|
||||
f16vec2 v = f16vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \
|
||||
v = v*d + m;
|
||||
"""
|
||||
shader_q5_1_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
const float m = float(data_a[ib].m); \
|
||||
const ivec2 qh = ivec2(((data_a[ib].qh >> iqs) << 4) & 0x10, (data_a[ib].qh >> (iqs + 12)) & 0x10); \
|
||||
@@ -194,6 +226,11 @@ v = v*d + m;
|
||||
"""
|
||||
|
||||
shader_q8_0_dequant_func = """
|
||||
#define DEQUANT_FUNC const float16_t d = data_a[ib].d; \
|
||||
f16vec2 v = f16vec2(data_a[ib].qs[iqs], data_a[ib].qs[iqs + 1]); \
|
||||
v = v * d;
|
||||
"""
|
||||
shader_q8_0_dequant_func_compat = """
|
||||
#define DEQUANT_FUNC const float d = float(data_a[ib].d); \
|
||||
vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])); \
|
||||
v = v * d;
|
||||
@@ -1652,8 +1689,7 @@ void main() {
|
||||
}
|
||||
|
||||
const float xi = float(data_a[i]);
|
||||
const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
|
||||
data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1)));
|
||||
data_d[i] = D_TYPE(0.5f*xi*(1.0f + tanh(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))));
|
||||
}
|
||||
"""
|
||||
|
||||
@@ -2073,7 +2109,7 @@ lock = asyncio.Lock()
|
||||
shader_fnames = []
|
||||
|
||||
|
||||
async def string_to_spv(name, code, defines, fp16=True):
|
||||
async def string_to_spv(name, code, defines, fp16):
|
||||
f = NamedTemporaryFile(mode="w", delete=False)
|
||||
f.write(code)
|
||||
f.flush()
|
||||
@@ -2163,6 +2199,64 @@ async def main():
|
||||
tasks.append(string_to_spv("matmul_f16_f32_aligned_m", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv("matmul_f16_f32_aligned_s", "".join(stream), {"LOAD_VEC": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16))
|
||||
|
||||
# Build dequant shaders
|
||||
tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {}, fp16))
|
||||
|
||||
for i in range(0, VK_NUM_TYPES):
|
||||
stream.clear()
|
||||
|
||||
stream.extend((dequant_head, shader_int8_ext, shader_float_type))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q2_K:
|
||||
stream.extend((shader_q2_K_defines, dequant_q2_K_body))
|
||||
elif i == GGML_TYPE_Q3_K:
|
||||
stream.extend((shader_q3_K_defines, dequant_q3_K_body))
|
||||
elif i == GGML_TYPE_Q4_K:
|
||||
stream.extend((shader_q4_K_defines, dequant_q4_K_body))
|
||||
elif i == GGML_TYPE_Q5_K:
|
||||
stream.extend((shader_q5_K_defines, dequant_q5_K_body))
|
||||
elif i == GGML_TYPE_Q6_K:
|
||||
stream.extend((shader_q6_K_defines, dequant_q6_K_body))
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}, fp16))
|
||||
|
||||
# get_rows
|
||||
for i in range(0, VK_NUM_TYPES):
|
||||
stream.clear()
|
||||
stream.extend((generic_head, shader_int8_ext, shader_float_type))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat if not fp16 else shader_f16_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat if not fp16 else shader_q4_0_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat if not fp16 else shader_q4_1_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat if not fp16 else shader_q5_0_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat if not fp16 else shader_q5_1_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat if not fp16 else shader_q8_0_dequant_func, get_rows_body))
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}, fp16))
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
||||
|
||||
# Shaders where precision is needed, so no fp16 version
|
||||
|
||||
# mul mat vec
|
||||
@@ -2171,17 +2265,17 @@ async def main():
|
||||
stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body))
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func_compat, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body))
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func_compat, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, mul_mat_vec_body))
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func_compat, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, mul_mat_vec_body))
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func_compat, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, mul_mat_vec_body))
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func_compat, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, mul_mat_vec_body))
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func_compat, mul_mat_vec_body))
|
||||
elif i == GGML_TYPE_Q2_K:
|
||||
stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body))
|
||||
elif i == GGML_TYPE_Q3_K:
|
||||
@@ -2195,101 +2289,43 @@ async def main():
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}))
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION}, fp16))
|
||||
|
||||
# Dequant shaders
|
||||
for i in range(0, VK_NUM_TYPES):
|
||||
stream.clear()
|
||||
|
||||
stream.extend((dequant_head, shader_int8_ext, shader_f32))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, dequant_body))
|
||||
elif i == GGML_TYPE_Q2_K:
|
||||
stream.extend((shader_q2_K_defines, dequant_q2_K_body))
|
||||
elif i == GGML_TYPE_Q3_K:
|
||||
stream.extend((shader_q3_K_defines, dequant_q3_K_body))
|
||||
elif i == GGML_TYPE_Q4_K:
|
||||
stream.extend((shader_q4_K_defines, dequant_q4_K_body))
|
||||
elif i == GGML_TYPE_Q5_K:
|
||||
stream.extend((shader_q5_K_defines, dequant_q5_K_body))
|
||||
elif i == GGML_TYPE_Q6_K:
|
||||
stream.extend((shader_q6_K_defines, dequant_q6_K_body))
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"dequant_{type_names[i]}", "".join(stream), {"D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("f32_to_f16", f32_to_f16_src, {}))
|
||||
|
||||
# get_rows
|
||||
for i in range(0, VK_NUM_TYPES):
|
||||
stream.clear()
|
||||
stream.extend((generic_head, shader_int8_ext, shader_f32))
|
||||
|
||||
if i == GGML_TYPE_F16:
|
||||
stream.extend((shader_f16_defines, shader_f16_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q4_0:
|
||||
stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q4_1:
|
||||
stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_0:
|
||||
stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q5_1:
|
||||
stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body))
|
||||
elif i == GGML_TYPE_Q8_0:
|
||||
stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body))
|
||||
else:
|
||||
continue
|
||||
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
# Norms
|
||||
tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("cpy_f32_f32", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("cpy_f32_f16", f"{cpy_src}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"}, True))
|
||||
tasks.append(string_to_spv("cpy_f16_f16", f"{cpy_src}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True))
|
||||
|
||||
tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("add_f32", f"{generic_head}\n{shader_f32}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}))
|
||||
tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {}, True))
|
||||
tasks.append(string_to_spv("mul_f32", f"{generic_head}\n{shader_f32}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("scale_f32", f"{generic_head}\n{shader_f32}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("sqr_f32", f"{generic_head}\n{shader_f32}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("clamp_f32", f"{generic_head}\n{shader_f32}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("relu_f32", f"{generic_head}\n{shader_f32}\n{relu_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("soft_max_f32", f"{generic_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
|
||||
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True))
|
||||
|
||||
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("rope_neox_f32", rope_neox_src, {"A_TYPE": "float", "D_TYPE": "float"}, True))
|
||||
tasks.append(string_to_spv("rope_neox_f16", rope_neox_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}, True))
|
||||
|
||||
await asyncio.gather(*tasks)
|
||||
|
||||
|
||||
@@ -72,7 +72,6 @@ class Keys:
|
||||
PAD_ID = "tokenizer.ggml.padding_token_id"
|
||||
ADD_BOS = "tokenizer.ggml.add_bos_token"
|
||||
ADD_EOS = "tokenizer.ggml.add_eos_token"
|
||||
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
|
||||
HF_JSON = "tokenizer.huggingface.json"
|
||||
RWKV = "tokenizer.rwkv.world"
|
||||
CHAT_TEMPLATE = "tokenizer.chat_template"
|
||||
@@ -103,7 +102,6 @@ class MODEL_ARCH(IntEnum):
|
||||
PLAMO = auto()
|
||||
CODESHELL = auto()
|
||||
ORION = auto()
|
||||
INTERNLM2 = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -155,7 +153,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.PLAMO: "plamo",
|
||||
MODEL_ARCH.CODESHELL: "codeshell",
|
||||
MODEL_ARCH.ORION: "orion",
|
||||
MODEL_ARCH.INTERNLM2: "internlm2",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@@ -449,21 +446,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.INTERNLM2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
||||
@@ -411,9 +411,6 @@ class GGUFWriter:
|
||||
def add_add_eos_token(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
||||
|
||||
def add_add_space_prefix(self, value: bool) -> None:
|
||||
self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
|
||||
|
||||
def add_chat_template(self, value: str) -> None:
|
||||
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
|
||||
|
||||
|
||||
@@ -19,7 +19,6 @@ class TensorNameMap:
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
"wte", # gpt2
|
||||
"transformer.embd.wte", # phi2
|
||||
"model.tok_embeddings", # internlm2
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
@@ -43,7 +42,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen
|
||||
"output", # llama-pth bloom internlm2
|
||||
"output", # llama-pth bloom
|
||||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
),
|
||||
@@ -52,7 +51,7 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon
|
||||
"model.norm", # llama-hf baichuan internlm2
|
||||
"model.norm", # llama-hf baichuan
|
||||
"norm", # llama-pth
|
||||
"embeddings.LayerNorm", # bert
|
||||
"transformer.norm_f", # mpt
|
||||
@@ -85,7 +84,6 @@ class TensorNameMap:
|
||||
"h.{bid}.ln_1", # gpt2
|
||||
"transformer.h.{bid}.ln", # phi2
|
||||
"model.layers.layers.{bid}.norm", # plamo
|
||||
"model.layers.{bid}.attention_norm", # internlm2
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@@ -113,7 +111,6 @@ class TensorNameMap:
|
||||
"encoder.layer.{bid}.attention.self.query", # bert
|
||||
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
|
||||
"model.layers.{bid}.attention.wq" # internlm2
|
||||
),
|
||||
|
||||
# Attention key
|
||||
@@ -123,7 +120,6 @@ class TensorNameMap:
|
||||
"encoder.layer.{bid}.attention.self.key", # bert
|
||||
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
|
||||
"model.layers.{bid}.attention.wk" # internlm2
|
||||
),
|
||||
|
||||
# Attention value
|
||||
@@ -133,7 +129,6 @@ class TensorNameMap:
|
||||
"encoder.layer.{bid}.attention.self.value", # bert
|
||||
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
|
||||
"model.layers.{bid}.attention.wv" # internlm2
|
||||
),
|
||||
|
||||
# Attention output
|
||||
@@ -152,7 +147,6 @@ class TensorNameMap:
|
||||
"h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||
"model.layers.{bid}.attention.wo", # internlm2
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
@@ -175,7 +169,6 @@ class TensorNameMap:
|
||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln2", # yi
|
||||
"h.{bid}.ln_2", # gpt2
|
||||
"model.layers.{bid}.ffn_norm", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
@@ -201,7 +194,6 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w3", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
@@ -220,7 +212,6 @@ class TensorNameMap:
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
"transformer.h.{bid}.mlp.w2", # qwen
|
||||
"model.layers.layers.{bid}.mlp.gate_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||
@@ -245,7 +236,6 @@ class TensorNameMap:
|
||||
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||
"model.layers.{bid}.feed_forward.w2", # internlm2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
|
||||
351
llama.cpp
351
llama.cpp
@@ -102,6 +102,8 @@
|
||||
#define LLAMA_MAX_NODES 8192
|
||||
#define LLAMA_MAX_EXPERTS 8
|
||||
|
||||
#define LLAMA_FLASH_ATTN
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
@@ -204,11 +206,10 @@ enum llm_arch {
|
||||
LLM_ARCH_PLAMO,
|
||||
LLM_ARCH_CODESHELL,
|
||||
LLM_ARCH_ORION,
|
||||
LLM_ARCH_INTERNLM2,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_LLAMA, "llama" },
|
||||
{ LLM_ARCH_FALCON, "falcon" },
|
||||
{ LLM_ARCH_GPT2, "gpt2" },
|
||||
@@ -227,7 +228,6 @@ static std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_PLAMO, "plamo" },
|
||||
{ LLM_ARCH_CODESHELL, "codeshell" },
|
||||
{ LLM_ARCH_ORION, "orion" },
|
||||
{ LLM_ARCH_INTERNLM2, "internlm2" },
|
||||
};
|
||||
|
||||
enum llm_kv {
|
||||
@@ -280,12 +280,11 @@ enum llm_kv {
|
||||
LLM_KV_TOKENIZER_PAD_ID,
|
||||
LLM_KV_TOKENIZER_ADD_BOS,
|
||||
LLM_KV_TOKENIZER_ADD_EOS,
|
||||
LLM_KV_TOKENIZER_ADD_PREFIX,
|
||||
LLM_KV_TOKENIZER_HF_JSON,
|
||||
LLM_KV_TOKENIZER_RWKV,
|
||||
};
|
||||
|
||||
static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
static std::map<llm_kv, std::string> LLM_KV_NAMES = {
|
||||
{ LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" },
|
||||
{ LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" },
|
||||
{ LLM_KV_GENERAL_ALIGNMENT, "general.alignment" },
|
||||
@@ -335,7 +334,6 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
|
||||
{ LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
|
||||
{ LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
|
||||
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
|
||||
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
|
||||
};
|
||||
@@ -346,7 +344,7 @@ struct LLM_KV {
|
||||
llm_arch arch;
|
||||
|
||||
std::string operator()(llm_kv kv) const {
|
||||
return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]);
|
||||
return ::format(LLM_KV_NAMES[kv].c_str(), LLM_ARCH_NAMES[arch].c_str());
|
||||
}
|
||||
};
|
||||
|
||||
@@ -673,23 +671,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_INTERNLM2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -747,13 +729,13 @@ struct LLM_TN {
|
||||
// gguf helpers
|
||||
//
|
||||
|
||||
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
||||
static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
|
||||
{ LLAMA_ROPE_SCALING_NONE, "none" },
|
||||
{ LLAMA_ROPE_SCALING_LINEAR, "linear" },
|
||||
{ LLAMA_ROPE_SCALING_YARN, "yarn" },
|
||||
};
|
||||
|
||||
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
||||
static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
|
||||
for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
|
||||
if (kv.second == name) {
|
||||
return kv.first;
|
||||
@@ -1397,7 +1379,6 @@ enum e_model {
|
||||
MODEL_13B,
|
||||
MODEL_14B,
|
||||
MODEL_15B,
|
||||
MODEL_20B,
|
||||
MODEL_30B,
|
||||
MODEL_34B,
|
||||
MODEL_40B,
|
||||
@@ -1415,7 +1396,6 @@ static const size_t GiB = 1024*MiB;
|
||||
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
bool rope_finetuned;
|
||||
uint32_t n_vocab;
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
uint32_t n_embd;
|
||||
@@ -1435,7 +1415,8 @@ struct llama_hparams {
|
||||
float rope_freq_base_train;
|
||||
float rope_freq_scale_train;
|
||||
uint32_t n_yarn_orig_ctx;
|
||||
int32_t rope_scaling_type_train;
|
||||
int8_t rope_scaling_type_train : 3;
|
||||
bool rope_finetuned : 1;
|
||||
|
||||
float f_clamp_kqv;
|
||||
float f_max_alibi_bias;
|
||||
@@ -1639,8 +1620,6 @@ struct llama_vocab {
|
||||
id special_suffix_id = 32008;
|
||||
id special_eot_id = 32010;
|
||||
|
||||
bool add_space_prefix = true;
|
||||
|
||||
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
||||
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
||||
GGML_ASSERT(token_left.find('\n') == std::string::npos);
|
||||
@@ -2701,7 +2680,7 @@ struct llama_model_loader {
|
||||
// load LLaMA models
|
||||
//
|
||||
|
||||
static const char * llama_model_arch_name(llm_arch arch) {
|
||||
static std::string llama_model_arch_name(llm_arch arch) {
|
||||
auto it = LLM_ARCH_NAMES.find(arch);
|
||||
if (it == LLM_ARCH_NAMES.end()) {
|
||||
return "unknown";
|
||||
@@ -2754,7 +2733,6 @@ static const char * llama_model_type_name(e_model type) {
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_14B: return "14B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_20B: return "20B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_40B: return "40B";
|
||||
@@ -2767,14 +2745,6 @@ static const char * llama_model_type_name(e_model type) {
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||
switch (type) {
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
||||
model.arch = ml.get_arch();
|
||||
@@ -3038,15 +3008,6 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 48: model.type = e_model::MODEL_20B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -3098,11 +3059,6 @@ static void llm_load_vocab(
|
||||
vocab.special_unk_id = 0;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
|
||||
const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
|
||||
if (add_space_prefix_keyidx != -1) {
|
||||
vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
|
||||
} // The default value of add_space_prefix is true.
|
||||
} else if (tokenizer_name == "gpt2") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_BPE;
|
||||
|
||||
@@ -3310,12 +3266,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & vocab = model.vocab;
|
||||
|
||||
const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
||||
const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
|
||||
|
||||
// hparams
|
||||
LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
|
||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch));
|
||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
|
||||
LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
|
||||
LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix
|
||||
LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
|
||||
LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
|
||||
LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
|
||||
@@ -3336,7 +3292,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
||||
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
||||
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
||||
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
|
||||
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
||||
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
||||
LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx);
|
||||
@@ -4064,35 +4020,8 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
// layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -4361,23 +4290,34 @@ static void llm_build_kv_store(
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
// compute the transposed [n_tokens, n_embd] V matrix
|
||||
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
|
||||
//struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
|
||||
cb(v_cur_t, "v_cur_t", il);
|
||||
|
||||
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa,
|
||||
(ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
|
||||
cb(k_cache_view, "k_cache_view", il);
|
||||
|
||||
// important: storing RoPE-ed version of K in the KV cache!
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
|
||||
|
||||
#if defined(LLAMA_FLASH_ATTN)
|
||||
// NOTE: the V cache is not transposed when using FLASH attention !!
|
||||
struct ggml_tensor * v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa,
|
||||
(ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa))*kv_head);
|
||||
cb(v_cache_view, "v_cache_view", il);
|
||||
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view));
|
||||
|
||||
GGML_UNUSED(n_ctx);
|
||||
#else
|
||||
// compute the transposed [n_tokens, n_embd] V matrix
|
||||
//struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens));
|
||||
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
|
||||
cb(v_cur_t, "v_cur_t", il);
|
||||
|
||||
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
|
||||
( n_ctx)*ggml_element_size(kv.v_l[il]),
|
||||
(kv_head)*ggml_element_size(kv.v_l[il]));
|
||||
cb(v_cache_view, "v_cache_view", il);
|
||||
|
||||
// important: storing RoPE-ed version of K in the KV cache!
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
|
||||
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
|
||||
#endif
|
||||
}
|
||||
|
||||
static struct ggml_tensor * llm_build_norm(
|
||||
@@ -4538,6 +4478,28 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
0);
|
||||
cb(k, "k", il);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
#if defined(LLAMA_FLASH_ATTN)
|
||||
// split cached v into n_head heads (not transposed)
|
||||
struct ggml_tensor * v =
|
||||
ggml_view_3d(ctx, kv.v_l[il],
|
||||
n_embd_head_v, n_kv, n_head_kv,
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
|
||||
0);
|
||||
cb(v, "v", il);
|
||||
|
||||
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_DEFAULT);
|
||||
//printf("q: %4d %4d %4d %4d\n", q->ne[0], q->ne[1], q->ne[2], q->ne[3]);
|
||||
//printf("k: %4d %4d %4d %4d\n", k->ne[0], k->ne[1], k->ne[2], k->ne[3]);
|
||||
//printf("v: %4d %4d %4d %4d\n", v->ne[0], v->ne[1], v->ne[2], v->ne[3]);
|
||||
//printf("m: %4d %4d %4d %4d\n", kq_mask->ne[0], kq_mask->ne[1], kq_mask->ne[2], kq_mask->ne[3]);
|
||||
//printf("r: %4d %4d %4d %4d\n", kqv->ne[0], kqv->ne[1], kqv->ne[2], kqv->ne[3]);
|
||||
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
|
||||
#else
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
@@ -4570,7 +4532,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
cb(kq, "kq_soft_max_ext", il);
|
||||
}
|
||||
|
||||
// split cached v into n_head heads
|
||||
// split cached v into n_head heads (transposed)
|
||||
struct ggml_tensor * v =
|
||||
ggml_view_3d(ctx, kv.v_l[il],
|
||||
n_kv, n_embd_head_v, n_head_kv,
|
||||
@@ -4585,8 +4547,9 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
||||
cb(kqv_merged, "kqv_merged", il);
|
||||
|
||||
struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
|
||||
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
|
||||
cb(cur, "kqv_merged_cont", il);
|
||||
#endif
|
||||
|
||||
ggml_build_forward_expand(graph, cur);
|
||||
|
||||
@@ -4758,7 +4721,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -4942,7 +4905,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -5063,7 +5026,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -5185,7 +5148,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
@@ -5282,7 +5245,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
if (do_rope_shift) {
|
||||
@@ -5485,7 +5448,7 @@ struct llm_build_context {
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -5575,7 +5538,7 @@ struct llm_build_context {
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
inpL = llm_build_norm(ctx0, inpL, hparams,
|
||||
@@ -5668,7 +5631,7 @@ struct llm_build_context {
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -5768,7 +5731,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -5891,7 +5854,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -6005,7 +5968,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -6126,7 +6089,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -6248,7 +6211,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -6355,7 +6318,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
@@ -6453,7 +6416,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -6561,7 +6524,7 @@ struct llm_build_context {
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
struct ggml_tensor * KQ_mask = ggml_cast(ctx0, ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0), GGML_TYPE_F16);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
@@ -6661,126 +6624,6 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_internlm2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
@@ -6939,10 +6782,6 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_orion();
|
||||
} break;
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
{
|
||||
result = llm.build_internlm2();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -7042,7 +6881,8 @@ static int llama_decode_internal(
|
||||
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
||||
// after enough generations, the benefit from this heuristic disappears
|
||||
// if we start defragmenting the cache, the benefit from this will be more important
|
||||
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
|
||||
// note: we pad the n_kv because certain GPU kernels require it (e.g. ggml_flash_attn_ext)
|
||||
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(128, GGML_PAD(llama_kv_cache_cell_max(kv_self), 128)));
|
||||
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
||||
|
||||
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
||||
@@ -7885,9 +7725,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
//
|
||||
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
||||
if (&fragment == &fragment_buffer.front()) {
|
||||
if (vocab.add_space_prefix) {
|
||||
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
||||
}
|
||||
raw_text = " " + raw_text; // prefix with space if the first token is not special
|
||||
}
|
||||
|
||||
#ifdef PRETOKENIZERDEBUG
|
||||
@@ -10413,7 +10251,10 @@ struct llama_context * llama_new_context_with_model(
|
||||
const auto & hparams = model->hparams;
|
||||
auto & cparams = ctx->cparams;
|
||||
|
||||
cparams.n_batch = params.n_batch;
|
||||
// the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
|
||||
// this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
|
||||
cparams.n_batch = std::max((uint32_t) GGML_KQ_MASK_PAD, params.n_batch);
|
||||
|
||||
cparams.n_threads = params.n_threads;
|
||||
cparams.n_threads_batch = params.n_threads_batch;
|
||||
cparams.yarn_ext_factor = params.yarn_ext_factor;
|
||||
@@ -10539,8 +10380,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
}
|
||||
ctx->backends.push_back(ctx->backend_cpu);
|
||||
|
||||
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
|
||||
cparams.n_ctx, cparams.offload_kqv)) {
|
||||
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
|
||||
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
@@ -10594,6 +10434,9 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
|
||||
|
||||
// zero-out the input buffer to prevent NaNs in padded tensors
|
||||
ggml_backend_buffer_clear(ctx->buf_input, 0);
|
||||
|
||||
LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
|
||||
ggml_backend_buffer_name(ctx->buf_input),
|
||||
ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
|
||||
@@ -10735,7 +10578,7 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
|
||||
|
||||
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
||||
return snprintf(buf, buf_size, "%s %s %s",
|
||||
llama_model_arch_name(model->arch),
|
||||
llama_model_arch_name(model->arch).c_str(),
|
||||
llama_model_type_name(model->type),
|
||||
llama_model_ftype_name(model->ftype).c_str());
|
||||
}
|
||||
@@ -11377,24 +11220,22 @@ struct llama_batch llama_batch_get_one(
|
||||
};
|
||||
}
|
||||
|
||||
struct llama_batch llama_batch_init(int32_t n_tokens_alloc, int32_t embd, int32_t n_seq_max) {
|
||||
struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
|
||||
llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
|
||||
|
||||
if (embd) {
|
||||
batch.embd = (float *) malloc(sizeof(float) * n_tokens_alloc * embd);
|
||||
batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
|
||||
} else {
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens_alloc);
|
||||
batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
|
||||
}
|
||||
|
||||
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens_alloc);
|
||||
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens_alloc);
|
||||
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * (n_tokens_alloc + 1));
|
||||
for (int i = 0; i < n_tokens_alloc; ++i) {
|
||||
batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
|
||||
batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
|
||||
batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
|
||||
}
|
||||
batch.seq_id[n_tokens_alloc] = nullptr;
|
||||
|
||||
batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens_alloc);
|
||||
batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);
|
||||
|
||||
return batch;
|
||||
}
|
||||
@@ -11405,7 +11246,7 @@ void llama_batch_free(struct llama_batch batch) {
|
||||
if (batch.pos) free(batch.pos);
|
||||
if (batch.n_seq_id) free(batch.n_seq_id);
|
||||
if (batch.seq_id) {
|
||||
for (int i = 0; batch.seq_id[i] != nullptr; ++i) {
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
free(batch.seq_id[i]);
|
||||
}
|
||||
free(batch.seq_id);
|
||||
|
||||
2
llama.h
2
llama.h
@@ -213,7 +213,7 @@ extern "C" {
|
||||
uint32_t n_batch; // prompt processing maximum batch size
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
int8_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
|
||||
float rope_freq_base; // RoPE base frequency, 0 = from model
|
||||
|
||||
@@ -141,28 +141,6 @@ for wt in "${wtypes[@]}"; do
|
||||
wfiles+=("")
|
||||
done
|
||||
|
||||
# map wtype input to index
|
||||
if [[ ! -z "$wtype" ]]; then
|
||||
iw=-1
|
||||
is=0
|
||||
for wt in "${wtypes[@]}"; do
|
||||
# uppercase
|
||||
uwt=$(echo "$wt" | tr '[:lower:]' '[:upper:]')
|
||||
if [[ "$uwt" == "$wtype" ]]; then
|
||||
iw=$is
|
||||
break
|
||||
fi
|
||||
is=$((is+1))
|
||||
done
|
||||
|
||||
if [[ $iw -eq -1 ]]; then
|
||||
printf "[-] Invalid weight type: %s\n" "$wtype"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
wtype="$iw"
|
||||
fi
|
||||
|
||||
# sample repos
|
||||
repos=(
|
||||
"https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
|
||||
@@ -274,10 +252,8 @@ for file in $model_files; do
|
||||
printf " %2d) %s %s\n" $iw "$have" "$file"
|
||||
done
|
||||
|
||||
wfile="${wfiles[$wtype]}"
|
||||
|
||||
# ask for weights type until provided and available
|
||||
while [[ -z "$wfile" ]]; do
|
||||
while [[ -z "$wtype" ]]; do
|
||||
printf "\n"
|
||||
read -p "[+] Select weight type: " wtype
|
||||
wfile="${wfiles[$wtype]}"
|
||||
|
||||
@@ -1101,7 +1101,7 @@ struct test_soft_max : public test_case {
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_tensor * b = nullptr;
|
||||
if (mask) { b = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]); }
|
||||
if (mask) { b = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, ne[0], ne[1]); }
|
||||
ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, scale);
|
||||
return out;
|
||||
}
|
||||
@@ -1450,6 +1450,76 @@ struct test_leaky_relu : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_FLASH_ATTN_EXT
|
||||
struct test_flash_attn_ext : public test_case {
|
||||
const int64_t hs; // head size
|
||||
const int64_t nh; // num heads
|
||||
const int64_t kv; // kv size
|
||||
const int64_t nb; // batch size
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR4(hs, nh, kv, nb);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 5e-4;
|
||||
}
|
||||
|
||||
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
|
||||
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
|
||||
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// Attention
|
||||
struct test_attn : public test_case {
|
||||
const int64_t hs; // head size
|
||||
const int64_t nh; // num heads
|
||||
const int64_t kv; // kv size
|
||||
const int64_t nb; // batch size
|
||||
|
||||
std::string op_desc(ggml_tensor * t) override {
|
||||
return "ATTN";
|
||||
|
||||
GGML_UNUSED(t);
|
||||
}
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR4(hs, nh, kv, nb);
|
||||
}
|
||||
|
||||
double max_nmse_err() override {
|
||||
return 5e-4;
|
||||
}
|
||||
|
||||
test_attn(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
|
||||
: hs(hs), nh(nh), kv(kv), nb(nb) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
|
||||
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
|
||||
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, hs, nh, 1); // transposed
|
||||
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, nb, 1, 1);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
cur = ggml_mul_mat (ctx, k, q);
|
||||
cur = ggml_soft_max_ext(ctx, cur, mask, 1.0f/sqrtf(hs));
|
||||
cur = ggml_mul_mat (ctx, v, cur);
|
||||
cur = ggml_permute (ctx, cur, 0, 2, 1, 3);
|
||||
cur = ggml_cont_2d (ctx, cur, hs*nh, nb);
|
||||
|
||||
return cur;
|
||||
}
|
||||
};
|
||||
|
||||
// Mixtral MOE
|
||||
struct test_moe : public test_case {
|
||||
const int n_experts;
|
||||
@@ -1723,7 +1793,7 @@ struct test_llama : public test_llm {
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
||||
|
||||
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
@@ -1845,7 +1915,7 @@ struct test_falcon : public test_llm {
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1);
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1);
|
||||
|
||||
ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400);
|
||||
@@ -2129,6 +2199,30 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
||||
test_cases.emplace_back(new test_pad());
|
||||
test_cases.emplace_back(new test_leaky_relu());
|
||||
|
||||
#if 0
|
||||
for (int hs : { 64, 80, 96, 112, 128, 256, }) {
|
||||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, 2048, 4096, }) {
|
||||
for (int nb : { 1, 2, 4, 8, 512, 1024, 2048, }) {
|
||||
test_cases.emplace_back(new test_attn (hs, nh, kv, nb));
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (int hs : { 128, }) {
|
||||
for (int nh : { 32, }) {
|
||||
for (int kv : { 512, 1024, }) {
|
||||
for (int nb : { 1, 2, 4, 8, 512 }) {
|
||||
test_cases.emplace_back(new test_attn (hs, nh, kv, nb));
|
||||
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(__SANITIZE_THREAD__)
|
||||
// FIXME: these tests use too much memory with thread sanitizer
|
||||
test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
|
||||
|
||||
@@ -105,7 +105,7 @@ int main()
|
||||
|
||||
for (auto rule : expected_rules)
|
||||
{
|
||||
parsed_grammar.rules.emplace_back();
|
||||
parsed_grammar.rules.push_back({});
|
||||
for (auto element : rule)
|
||||
{
|
||||
parsed_grammar.rules.back().push_back(element);
|
||||
|
||||
Reference in New Issue
Block a user