mirror of https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00

Compare commits (28 commits):

f4bd8b3d26
51e9d02599
d273c1402b
27b040691c
29c60d8cdd
359cbe3f46
e18bc6aaf3
ee94172d33
934266c0e0
9c4fdcbec8
24ecb58168
9afdffe70e
3b3963c55c
dda64fc17c
0350f58152
ad52d5c259
172b78210a
13ad16af12
8f7080bf48
e1b40ac3b9
dc020985b8
344f9126cc
9a17ab914b
ea3b0590ee
29499bb593
48aa8fd1f2
583fd6b000
9f773486ab
61  .github/workflows/build.yml (vendored)
@@ -693,26 +693,28 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'rpc'
|
||||
- build: 'rpc-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'noavx'
|
||||
- build: 'noavx-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
- build: 'avx2-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx'
|
||||
- build: 'avx-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512'
|
||||
- build: 'avx512-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast'
|
||||
- build: 'clblast-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute'
|
||||
- build: 'kompute-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'vulkan'
|
||||
- build: 'vulkan-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'arm64'
|
||||
defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'llvm-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'msvc-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -723,13 +725,13 @@ jobs:
|
||||
|
||||
- name: Clone Kompute submodule
|
||||
id: clone_kompute
|
||||
if: ${{ matrix.build == 'kompute' }}
|
||||
if: ${{ matrix.build == 'kompute-x64' }}
|
||||
run: |
|
||||
git submodule update --init kompute
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
|
||||
mkdir $env:RUNNER_TEMP/opencl
|
||||
@@ -737,7 +739,7 @@ jobs:
|
||||
|
||||
- name: Download CLBlast
|
||||
id: get_clblast
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
|
||||
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
|
||||
@@ -750,7 +752,7 @@ jobs:
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
@@ -763,38 +765,41 @@ jobs:
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
|
||||
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. ${{ matrix.defines }}
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
cmake -S . -B build ${{ matrix.defines }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add clblast.dll
|
||||
id: add_clblast_dll
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
|
||||
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Check AVX512F support
|
||||
id: check_avx512f
|
||||
if: ${{ matrix.build == 'avx512' }}
|
||||
if: ${{ matrix.build == 'avx512-x64' }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cd build
|
||||
@@ -808,14 +813,14 @@ jobs:
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
|
||||
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Test (Intel SDE)
|
||||
id: cmake_test_sde
|
||||
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# for some weird reason windows tar doesn't like sde tar.xz
|
||||
@@ -843,14 +848,14 @@ jobs:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
|
||||
name: llama-bin-win-${{ matrix.build }}-x64.zip
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
runs-on: windows-latest
|
||||
|
||||
@@ -1007,6 +1007,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
endif ()

check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
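For context on these checks: they only define `__ARM_FEATURE_DOTPROD`, `__ARM_FEATURE_MATMUL_INT8` and `__ARM_FEATURE_FP16_VECTOR_ARITHMETIC` when the probe programs compile, and downstream code then selects intrinsics based on those macros. A minimal, self-contained C++ sketch of that pattern (illustrative only, not code from the repository; the function name is made up):

```cpp
// Hypothetical illustration: guard ARM-specific intrinsics with the feature
// macros that the CMake checks above define when the probe programs compile.
#include <cstdint>
#include <cstdio>
#if defined(__ARM_NEON)
#include <arm_neon.h>
#endif

// Sum of element-wise products of two int8 vectors of length 16.
static int32_t dot_i8x16(const int8_t * a, const int8_t * b) {
#if defined(__ARM_FEATURE_DOTPROD)
    // Fast path: vdotq_s32 accumulates four 4-way int8 dot products per call.
    int8x16_t va = vld1q_s8(a);
    int8x16_t vb = vld1q_s8(b);
    int32x4_t acc = vdotq_s32(vdupq_n_s32(0), va, vb);
    return vaddvq_s32(acc);
#else
    // Portable scalar fallback.
    int32_t sum = 0;
    for (int i = 0; i < 16; ++i) sum += (int32_t) a[i] * (int32_t) b[i];
    return sum;
#endif
}

int main() {
    int8_t a[16], b[16];
    for (int i = 0; i < 16; ++i) { a[i] = (int8_t) i; b[i] = (int8_t) (i + 1); }
    std::printf("%d\n", dot_i8x16(a, b));
    return 0;
}
```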
45  CMakePresets.json (new file)
@@ -0,0 +1,45 @@
{
    "version": 4,
    "configurePresets": [
        {
            "name": "base",
            "hidden": true,
            "generator": "Ninja",
            "binaryDir": "${sourceDir}/build-${presetName}",
            "cacheVariables": {
                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
                "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
            }
        },

        { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
        { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
        { "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },

        {
            "name": "arm64-windows-msvc", "hidden": true,
            "architecture": { "value": "arm64", "strategy": "external" },
            "toolset": { "value": "host=x86_64", "strategy": "external" },
            "cacheVariables": {
                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
            }
        },

        {
            "name": "arm64-windows-llvm", "hidden": true,
            "architecture": { "value": "arm64", "strategy": "external" },
            "toolset": { "value": "host=x86_64", "strategy": "external" },
            "cacheVariables": {
                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
            }
        },

        { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
        { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
        { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },

        { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
        { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
        { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
    ]
}
@@ -532,7 +532,7 @@ Building the program with BLAS support may lead to some performance improvements
cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).

- Using `make` (example for target gfx1030, build with 16 CPU threads):
@@ -712,6 +712,9 @@ Building the program with BLAS support may lead to some performance improvements

### Prepare and Quantize

> [!NOTE]
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.

To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
16  cmake/arm64-windows-llvm.cmake (new file)
@@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-pc-windows-msvc )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
6  cmake/arm64-windows-msvc.cmake (new file)
@@ -0,0 +1,6 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
@@ -2553,7 +2553,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
|
||||
size_t pos_start = 0;
|
||||
size_t pos_found = 0;
|
||||
|
||||
if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
|
||||
if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
|
||||
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
|
||||
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
|
||||
data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
|
||||
|
||||
@@ -26,7 +26,7 @@ namespace grammar_parser {
|
||||
|
||||
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
|
||||
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
|
||||
return result.first->second;
|
||||
}
|
||||
|
||||
|
||||
@@ -272,7 +272,7 @@ private:
|
||||
if (literal.empty()) {
|
||||
return false;
|
||||
}
|
||||
ret.push_back(std::make_pair(literal, true));
|
||||
ret.emplace_back(literal, true);
|
||||
literal.clear();
|
||||
return true;
|
||||
};
|
||||
@@ -298,7 +298,7 @@ private:
|
||||
while (i < length) {
|
||||
char c = sub_pattern[i];
|
||||
if (c == '.') {
|
||||
seq.push_back(std::make_pair(get_dot(), false));
|
||||
seq.emplace_back(get_dot(), false);
|
||||
i++;
|
||||
} else if (c == '(') {
|
||||
i++;
|
||||
@@ -307,7 +307,7 @@ private:
|
||||
_warnings.push_back("Unsupported pattern syntax");
|
||||
}
|
||||
}
|
||||
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
|
||||
seq.emplace_back("(" + to_rule(transform()) + ")", false);
|
||||
} else if (c == ')') {
|
||||
i++;
|
||||
if (start > 0 && sub_pattern[start - 1] != '(') {
|
||||
@@ -331,9 +331,9 @@ private:
|
||||
}
|
||||
square_brackets += ']';
|
||||
i++;
|
||||
seq.push_back(std::make_pair(square_brackets, false));
|
||||
seq.emplace_back(square_brackets, false);
|
||||
} else if (c == '|') {
|
||||
seq.push_back(std::make_pair("|", false));
|
||||
seq.emplace_back("|", false);
|
||||
i++;
|
||||
} else if (c == '*' || c == '+' || c == '?') {
|
||||
seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
|
||||
@@ -417,7 +417,7 @@ private:
|
||||
}
|
||||
}
|
||||
if (!literal.empty()) {
|
||||
seq.push_back(std::make_pair(literal, true));
|
||||
seq.emplace_back(literal, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
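The `push_back(std::make_pair(...))` to `emplace_back(...)` changes above rely on `emplace_back` forwarding its arguments to the element constructor, so no temporary pair has to be built first. A small standalone sketch of the two equivalent forms (illustrative only, not code from the repository):

```cpp
#include <string>
#include <utility>
#include <vector>

int main() {
    std::vector<std::pair<std::string, bool>> seq;

    // Before: build a temporary pair, then move/copy it into the vector.
    seq.push_back(std::make_pair(std::string("literal"), true));

    // After: construct the pair in place from the arguments.
    seq.emplace_back("literal", true);

    return seq.size() == 2 ? 0 : 1;
}
```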
10  common/log.h
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_FLF_FMT "%s"
|
||||
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "%s"
|
||||
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
// Main LOG macro.
|
||||
// behaves like printf, and supports arguments the exact same way.
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
// Secondary target can be changed just like LOG_TARGET
|
||||
// by defining LOG_TEE_TARGET
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||
#endif
|
||||
|
||||
// LOG macro variants with auto endline.
|
||||
#ifndef _MSC_VER
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
||||
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
||||
#else
|
||||
|
||||
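The `common/log.h` hunks above pair the `%5ld` conversion in `LOG_FLF_FMT` with an explicit `(long)__LINE__` argument: `__LINE__` expands to an `int`, and passing an argument whose type does not match a `printf` conversion specifier is undefined behavior, so the cast makes the macro well-defined even where `int` and `long` differ in size. A standalone illustration (not code from the repository):

```cpp
#include <cstdio>

int main() {
    // __LINE__ has type int, so it must be cast when the format expects long.
    std::printf("[%24s:%5ld][%24s] line passed as long\n", __FILE__, (long)__LINE__, __FUNCTION__);

    // Matching the conversion specifier to the argument type is equally valid.
    std::printf("[%24s:%5d][%24s] line passed as int\n", __FILE__, __LINE__, __FUNCTION__);
    return 0;
}
```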
@@ -20,11 +20,13 @@
|
||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||
#
|
||||
# TODO: generate tokenizer tests for llama.cpp
|
||||
# TODO: automate the update of convert-hf-to-gguf.py
|
||||
#
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
@@ -35,6 +37,7 @@ from transformers import AutoTokenizer
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logger = logging.getLogger("convert-hf-to-gguf-update")
|
||||
sess = requests.Session()
|
||||
|
||||
|
||||
class TOKENIZER_TYPE(IntEnum):
|
||||
@@ -79,63 +82,44 @@ models = [
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
]
|
||||
|
||||
# make directory "models/tokenizers" if it doesn't exist
|
||||
if not os.path.exists("models/tokenizers"):
|
||||
os.makedirs("models/tokenizers")
|
||||
|
||||
|
||||
def download_file_with_auth(url, token, save_path):
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
response = requests.get(url, headers=headers)
|
||||
if response.status_code == 200:
|
||||
with open(save_path, 'wb') as f:
|
||||
f.write(response.content)
|
||||
logger.info(f"File {save_path} downloaded successfully")
|
||||
else:
|
||||
logger.info(f"Failed to download file. Status code: {response.status_code}")
|
||||
response = sess.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
||||
with open(save_path, 'wb') as f:
|
||||
f.write(response.content)
|
||||
logger.info(f"File {save_path} downloaded successfully")
|
||||
|
||||
|
||||
# download the tokenizer models
|
||||
for model in models:
|
||||
def download_model(model):
|
||||
name = model["name"]
|
||||
repo = model["repo"]
|
||||
tokt = model["tokt"]
|
||||
|
||||
if not os.path.exists(f"models/tokenizers/{name}"):
|
||||
os.makedirs(f"models/tokenizers/{name}")
|
||||
else:
|
||||
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
|
||||
continue
|
||||
|
||||
logger.info(f"Downloading {name} to models/tokenizers/{name}")
|
||||
|
||||
url = f"{repo}/raw/main/config.json"
|
||||
save_path = f"models/tokenizers/{name}/config.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
url = f"{repo}/raw/main/tokenizer.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
# if downloaded file is less than 1KB, we likely need to download an LFS instead
|
||||
if os.path.getsize(save_path) < 1024:
|
||||
# remove the file
|
||||
os.remove(save_path)
|
||||
url = f"{repo}/resolve/main/tokenizer.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
|
||||
|
||||
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
|
||||
if tokt == TOKENIZER_TYPE.SPM:
|
||||
url = f"{repo}/resolve/main/tokenizer.model"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.model"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
files.append("tokenizer.model")
|
||||
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
logger.info(f"{name}: File {save_path} already exists - skipping")
|
||||
continue
|
||||
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
|
||||
|
||||
|
||||
for model in models:
|
||||
try:
|
||||
download_model(model)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||
|
||||
url = f"{repo}/raw/main/tokenizer_config.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
||||
# TODO: auto-update convert-hf-to-gguf.py with the generated function
|
||||
|
||||
src_ifs = ""
|
||||
for model in models:
|
||||
@@ -224,11 +208,18 @@ src_func = f"""
|
||||
return res
|
||||
"""
|
||||
|
||||
print(src_func) # noqa: NP100
|
||||
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||
convert_py = convert_py_pth.read_text()
|
||||
convert_py = re.sub(
|
||||
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||
lambda m: m.group(1) + src_func + m.group(3),
|
||||
convert_py,
|
||||
flags=re.DOTALL | re.MULTILINE,
|
||||
)
|
||||
|
||||
logger.info("\n")
|
||||
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
||||
logger.info("\n")
|
||||
convert_py_pth.write_text(convert_py)
|
||||
|
||||
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||
|
||||
# generate tests for each tokenizer model
|
||||
|
||||
|
||||
@@ -402,6 +402,7 @@ class Model:
|
||||
# NOTE: this function is generated by convert-hf-to-gguf-update.py
|
||||
# do not modify it manually!
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# Marker: Start get_vocab_base_pre
|
||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
# is specific for the BPE pre-tokenizer used by the model
|
||||
@@ -489,6 +490,7 @@ class Model:
|
||||
logger.debug(f"chkhsh: {chkhsh}")
|
||||
|
||||
return res
|
||||
# Marker: End get_vocab_base_pre
|
||||
|
||||
def _set_vocab_gpt2(self) -> None:
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
@@ -526,7 +528,7 @@ class Model:
|
||||
|
||||
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
||||
added_vocab = tokenizer.special_tokens
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
|
||||
@@ -1109,7 +1109,7 @@ class OutputFile:
|
||||
if metadata is not None and metadata.name is not None:
|
||||
name = metadata.name
|
||||
elif params.path_model is not None:
|
||||
name = str(params.path_model.parent).split("/")[-1]
|
||||
name = params.path_model.name
|
||||
elif params.n_ctx == 4096:
|
||||
# Heuristic detection of LLaMA v2 model
|
||||
name = "LLaMA v2"
|
||||
|
||||
@@ -1,6 +1,6 @@
# Debugging Tests Tips

## How to run & debug a specific test without anything else to keep the feedback loop short?
## How to run & execute or debug a specific test without anything else to keep the feedback loop short?

There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.

@@ -10,13 +10,27 @@ For example, running the following command will output an interactive list from

It will then build & run in the debugger for you.

To just execute a test and get back a PASS or FAIL message run:

```bash
./scripts/debug-test.sh test-tokenizer
```

To test in GDB use the `-g` flag to enable gdb test mode.

```bash
./scripts/debug-test.sh -g test-tokenizer

# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
>>> b main
```

To speed up the testing loop, if you know your test number you can just run it similar to below:

```bash
./scripts/debug-test.sh test 23
```

For further reference use `debug-test.sh -h` to print help.

@@ -41,7 +55,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
make -j
```

#### Step 3.1: Identify Test Command for Debugging
#### Step 3: Find all tests available that matches REGEX

The output of this command will give you the command & arguments needed to run GDB.

@@ -69,11 +83,13 @@ Labels: main
...
```

So for test #1 we can tell these two pieces of relevant information:
#### Step 4: Identify Test Command for Debugging

So for test #1 above we can tell these two pieces of relevant information:
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`

#### Step 3.2: Run GDB on test command
#### Step 5: Run GDB on test command

Based on the ctest 'test command' report above we can then run a gdb session via this command below:
@@ -211,6 +211,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// clean up
|
||||
llama_print_timings(ctx);
|
||||
llama_batch_free(batch);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
llama_backend_free();
|
||||
|
||||
@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
|
||||
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
|
||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
||||
struct {
|
||||
struct ggml_tensor * newline;
|
||||
struct ggml_context * ctx;
|
||||
} model;
|
||||
|
||||
@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
|
||||
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
|
||||
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
||||
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
|
||||
if (newline_tmp->buffer == NULL) {
|
||||
LOG_TEE("newline_tmp tensor buffer is NULL\n");
|
||||
}
|
||||
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
|
||||
} else {
|
||||
model.newline->data = newline_tmp->data;
|
||||
if (model.newline->data == NULL) {
|
||||
LOG_TEE("newline_tmp tensor data is NULL\n");
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
|
||||
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
|
||||
// fill it with the image embeddings, ignoring the base
|
||||
|
||||
@@ -1,6 +1,8 @@
# quantize

TODO
You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.

Note: It is synced from llama.cpp `main` every 6 hours.

## Llama 2 7B
@@ -42,7 +42,7 @@ cmake --build . --config Release
Then, start the `rpc-server` with the backend:

```bash
$ bin/rpc-server 0.0.0.0 50052
$ bin/rpc-server -p 50052
create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
@@ -53,7 +53,7 @@ Starting RPC server on 0.0.0.0:50052

When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
```
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
@@ -7,9 +7,64 @@
|
||||
#endif
|
||||
|
||||
#include "ggml-rpc.h"
|
||||
#ifdef _WIN32
|
||||
# include <windows.h>
|
||||
#else
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
struct rpc_server_params {
|
||||
std::string host = "0.0.0.0";
|
||||
int port = 50052;
|
||||
size_t backend_mem = 0;
|
||||
};
|
||||
|
||||
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
|
||||
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
|
||||
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
|
||||
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
if (arg == "-H" || arg == "--host") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
}
|
||||
params.host = argv[i];
|
||||
} else if (arg == "-p" || arg == "--port") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
}
|
||||
params.port = std::stoi(argv[i]);
|
||||
if (params.port <= 0 || params.port > 65535) {
|
||||
return false;
|
||||
}
|
||||
} else if (arg == "-m" || arg == "--mem") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
}
|
||||
params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static ggml_backend_t create_backend() {
|
||||
ggml_backend_t backend = NULL;
|
||||
#ifdef GGML_USE_CUDA
|
||||
@@ -38,21 +93,25 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
|
||||
#ifdef GGML_USE_CUDA
|
||||
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
|
||||
#else
|
||||
// TODO: implement for other backends
|
||||
*free_mem = 1;
|
||||
*total_mem = 1;
|
||||
#ifdef _WIN32
|
||||
MEMORYSTATUSEX status;
|
||||
status.dwLength = sizeof(status);
|
||||
GlobalMemoryStatusEx(&status);
|
||||
*total_mem = status.ullTotalPhys;
|
||||
*free_mem = status.ullAvailPhys;
|
||||
#else
|
||||
long pages = sysconf(_SC_PHYS_PAGES);
|
||||
long page_size = sysconf(_SC_PAGE_SIZE);
|
||||
*total_mem = pages * page_size;
|
||||
*free_mem = *total_mem;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
const char * host = argv[1];
|
||||
int port = std::stoi(argv[2]);
|
||||
if (port <= 0 || port > 65535) {
|
||||
fprintf(stderr, "Invalid port number: %d\n", port);
|
||||
rpc_server_params params;
|
||||
if (!rpc_server_params_parse(argc, argv, params)) {
|
||||
fprintf(stderr, "Invalid parameters\n");
|
||||
return 1;
|
||||
}
|
||||
ggml_backend_t backend = create_backend();
|
||||
@@ -60,10 +119,15 @@ int main(int argc, char * argv[]) {
|
||||
fprintf(stderr, "Failed to create backend\n");
|
||||
return 1;
|
||||
}
|
||||
printf("Starting RPC server on %s:%d\n", host, port);
|
||||
std::string endpoint = params.host + ":" + std::to_string(params.port);
|
||||
size_t free_mem, total_mem;
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
std::string endpoint = std::string(host) + ":" + std::to_string(port);
|
||||
if (params.backend_mem > 0) {
|
||||
free_mem = params.backend_mem;
|
||||
total_mem = params.backend_mem;
|
||||
} else {
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
}
|
||||
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
|
||||
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
|
||||
ggml_backend_free(backend);
|
||||
return 0;
|
||||
|
||||
@@ -17,7 +17,8 @@ The project is under active development, and we are [looking for feedback and co

**Command line options:**

- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
- `-t N`, `--threads N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
@@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
  - `--numa distribute`: Spread execution evenly over all nodes
  - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
  - `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437

  - `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
@@ -2387,6 +2387,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
printf(" --rpc SERVERS comma separated list of RPC servers\n");
|
||||
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
|
||||
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
|
||||
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
|
||||
@@ -2439,6 +2440,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
||||
break;
|
||||
}
|
||||
sparams.port = std::stoi(argv[i]);
|
||||
} else if (arg == "--rpc") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rpc_servers = argv[i];
|
||||
} else if (arg == "--host") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
||||
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
|
||||
|
||||
tensor->buffer = buffer;
|
||||
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
||||
tensor->backend = tensor->view_src->backend;
|
||||
ggml_backend_buffer_init_tensor(buffer, tensor);
|
||||
}
|
||||
|
||||
|
||||
@@ -2558,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
}
|
||||
|
||||
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
|
||||
if (cuda_graph_update_required) {
|
||||
if (use_cuda_graph && cuda_graph_update_required) {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates++;
|
||||
} else {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
|
||||
|
||||
@@ -1,35 +1,36 @@
|
||||
#include "upscale.cuh"
|
||||
|
||||
static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) {
|
||||
// blockIdx.z: idx of ne02*ne03
|
||||
// blockIdx.y: idx of ne01*scale_factor, aka ne1
|
||||
// blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE
|
||||
// ne00xne01: ne00 * ne01
|
||||
int ne0 = ne00 * scale_factor;
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
static __global__ void upscale_f32(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int ne13,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3) {
|
||||
int index = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (index >= ne10 * ne11 * ne12 * ne13) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
int i00 = nidx / scale_factor;
|
||||
int i01 = blockIdx.y / scale_factor;
|
||||
int offset_src =
|
||||
i00 +
|
||||
i01 * ne00 +
|
||||
blockIdx.z * ne00xne01;
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
|
||||
int i10 = index % ne10;
|
||||
int i11 = (index / ne10) % ne11;
|
||||
int i12 = (index / (ne10 * ne11)) % ne12;
|
||||
int i13 = (index / (ne10 * ne11 * ne12)) % ne13;
|
||||
|
||||
int i00 = i10 / sf0;
|
||||
int i01 = i11 / sf1;
|
||||
int i02 = i12 / sf2;
|
||||
int i03 = i13 / sf3;
|
||||
|
||||
dst[index] = *(float *)((char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00);
|
||||
}
|
||||
|
||||
static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03,
|
||||
const int scale_factor, cudaStream_t stream) {
|
||||
int ne0 = (ne00 * scale_factor);
|
||||
int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03);
|
||||
upscale_f32<<<gridDim, CUDA_UPSCALE_BLOCK_SIZE, 0, stream>>>(x, dst, ne00, ne00 * ne01, scale_factor);
|
||||
static void upscale_f32_cuda(const float * x, float * dst,
|
||||
const int nb00, const int nb01, const int nb02, const int nb03,
|
||||
const int ne10, const int ne11, const int ne12, const int ne13,
|
||||
const float sf0, const float sf1, const float sf2, const float sf3,
|
||||
cudaStream_t stream) {
|
||||
int dst_size = ne10 * ne11 * ne12 * ne13;
|
||||
int num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
|
||||
|
||||
upscale_f32<<<num_blocks, CUDA_UPSCALE_BLOCK_SIZE,0,stream>>>(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
@@ -39,10 +40,12 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int scale_factor = dst->op_params[0];
|
||||
const float sf0 = (float)dst->ne[0]/src0->ne[0];
|
||||
const float sf1 = (float)dst->ne[1]/src0->ne[1];
|
||||
const float sf2 = (float)dst->ne[2]/src0->ne[2];
|
||||
const float sf3 = (float)dst->ne[3]/src0->ne[3];
|
||||
|
||||
upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream);
|
||||
upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream);
|
||||
}
|
||||
|
||||
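To restate the indexing scheme of the rewritten kernel: each flat destination index is decomposed into `(i10, i11, i12, i13)`, each coordinate is divided by its per-dimension scale factor to locate the source element, and the load goes through the source byte strides `nb00..nb03`. A plain CPU sketch of the same mapping (assumptions: contiguous float data and the small example sizes used in `main`; this is not code from the repository):

```cpp
#include <cstdint>
#include <vector>

// Nearest-neighbour upscale of a float tensor, mirroring the index math of the
// generalized kernel: dst index -> (i10,i11,i12,i13) -> source coordinates via
// per-dimension scale factors -> byte-stride addressed load.
static void upscale_f32_ref(const float * x, float * dst,
                            const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03, // src byte strides
                            const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13, // dst extents
                            const float sf0, const float sf1, const float sf2, const float sf3) {
    const int64_t n = ne10 * ne11 * ne12 * ne13;
    for (int64_t index = 0; index < n; ++index) {
        const int64_t i10 =  index % ne10;
        const int64_t i11 = (index / ne10) % ne11;
        const int64_t i12 = (index / (ne10 * ne11)) % ne12;
        const int64_t i13 = (index / (ne10 * ne11 * ne12)) % ne13;

        const int64_t i00 = (int64_t)(i10 / sf0);
        const int64_t i01 = (int64_t)(i11 / sf1);
        const int64_t i02 = (int64_t)(i12 / sf2);
        const int64_t i03 = (int64_t)(i13 / sf3);

        const char * src = (const char *) x + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00;
        dst[index] = *(const float *) src;
    }
}

int main() {
    // Assumed example: upscale a 2x2 single-channel tensor to 4x4 (scale factor 2).
    const float src[4] = { 1.f, 2.f, 3.f, 4.f };
    std::vector<float> dst(16);
    upscale_f32_ref(src, dst.data(),
                    sizeof(float), 2*sizeof(float), 4*sizeof(float), 4*sizeof(float),
                    4, 4, 1, 1,
                    2.f, 2.f, 1.f, 1.f);
    return (dst[0] == 1.f && dst[5] == 1.f && dst[15] == 4.f) ? 0 : 1;
}
```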
10  ggml-metal.m
@@ -2353,7 +2353,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
{
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
|
||||
const int sf = dst->op_params[0];
|
||||
const float sf0 = (float)ne0/src0->ne[0];
|
||||
const float sf1 = (float)ne1/src0->ne[1];
|
||||
const float sf2 = (float)ne2/src0->ne[2];
|
||||
const float sf3 = (float)ne3/src0->ne[3];
|
||||
|
||||
const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
|
||||
|
||||
@@ -2376,7 +2379,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
|
||||
[encoder setBytes:&sf0 length:sizeof(sf0) atIndex:18];
|
||||
[encoder setBytes:&sf1 length:sizeof(sf1) atIndex:19];
|
||||
[encoder setBytes:&sf2 length:sizeof(sf2) atIndex:20];
|
||||
[encoder setBytes:&sf3 length:sizeof(sf3) atIndex:21];
|
||||
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||
|
||||
|
||||
@@ -1852,7 +1852,10 @@ kernel void kernel_upscale_f32(
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int32_t & sf,
|
||||
constant float & sf0,
|
||||
constant float & sf1,
|
||||
constant float & sf2,
|
||||
constant float & sf3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
@@ -1861,15 +1864,17 @@ kernel void kernel_upscale_f32(
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3;
|
||||
const int64_t i02 = i2;
|
||||
const int64_t i01 = i1/sf;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1);
|
||||
const int64_t i03 = i3/sf3;
|
||||
const int64_t i02 = i2/sf2;
|
||||
const int64_t i01 = i1/sf1;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
dst_ptr[i0] = src0_ptr[i0/sf];
|
||||
const int64_t i00 = i0/sf0;
|
||||
|
||||
device const float * src0_ptr = (device const float *) (src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
|
||||
device float * dst_ptr = (device float *) (dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
dst_ptr[0] = src0_ptr[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1986,7 +1986,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
||||
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
|
||||
const float * qw = quant_weights + QK_K * i + 16*j;
|
||||
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
|
||||
} else {
|
||||
for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
|
||||
@@ -3487,10 +3487,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q4_0 * restrict vx0 = vx;
|
||||
const block_q4_0 * restrict vx1 = vx + bx;
|
||||
|
||||
const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
|
||||
const block_q8_0 * restrict vy0 = vy;
|
||||
const block_q8_0 * restrict vy1 = vy + by;
|
||||
const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
|
||||
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
|
||||
@@ -3524,10 +3523,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||
|
||||
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||
float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||
|
||||
float32x4_t scale = vld1q_f32(_scale);
|
||||
|
||||
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
@@ -3894,9 +3895,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q4_1 * restrict vx0 = vx;
|
||||
const block_q4_1 * restrict vx1 = vx + bx;
|
||||
const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
|
||||
const block_q8_1 * restrict vy0 = vy;
|
||||
const block_q8_1 * restrict vy1 = vy + by;
|
||||
const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
|
||||
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t summs0 = vdupq_n_f32(0.0f);
|
||||
@@ -3907,11 +3908,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
const block_q8_1 * restrict b_y0 = &vy0[i];
|
||||
const block_q8_1 * restrict b_y1 = &vy1[i];
|
||||
|
||||
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
|
||||
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
|
||||
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
|
||||
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
|
||||
summs0 += summs_t;
|
||||
float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
|
||||
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
|
||||
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
|
||||
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
|
||||
summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||
|
||||
@@ -3931,10 +3932,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||
|
||||
// mmla into int32x4_t
|
||||
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
|
||||
GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
|
||||
GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
|
||||
GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
|
||||
float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
|
||||
GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
|
||||
GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
|
||||
GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
|
||||
float32x4_t scale = vld1q_f32(_scale);
|
||||
|
||||
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
@@ -3953,7 +3955,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||
sumv2 = sumv2 + summs0;
|
||||
sumv2 = vaddq_f32(sumv2, summs0);
|
||||
|
||||
vst1_f32(s, vget_low_f32(sumv2));
|
||||
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||
@@ -4837,9 +4839,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q8_0 * restrict vx0 = vx;
|
||||
const block_q8_0 * restrict vx1 = vx + bx;
|
||||
const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
|
||||
const block_q8_0 * restrict vy0 = vy;
|
||||
const block_q8_0 * restrict vy1 = vy + by;
|
||||
const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
|
||||
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
|
||||
@@ -4861,10 +4863,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||
|
||||
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||
float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||
float32x4_t scale = vld1q_f32(_scale);
|
||||
|
||||
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
|
||||
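A note on the recurring `(const block_q4_0 *) ((const uint8_t*)vx + bx)` pattern above: `bx` and `by` are byte offsets between rows, and arithmetic on a `void *` (as in the old `vx + bx`) is a compiler extension rather than standard C/C++, so the offset is now applied through a byte pointer before converting back to the block type. A small standalone illustration with a made-up block struct (not code from the repository):

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical block type standing in for block_q4_0 / block_q8_0.
struct block {
    float   d;
    uint8_t qs[16];
};

// Given an opaque base pointer and a byte stride between rows, return a pointer
// to the second row. The offset is applied through uint8_t*, as in the patch.
static const block * second_row(const void * base, size_t byte_stride) {
    return (const block *) ((const uint8_t *) base + byte_stride);
}

int main() {
    block rows[2] = {};
    rows[1].d = 42.0f;

    const block * r1 = second_row(rows, sizeof(block)); // one row, in bytes
    return r1->d == 42.0f ? 0 : 1;
}
```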
15  ggml-rpc.cpp
@@ -28,7 +28,7 @@
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
#define GGML_DEBUG 1
|
||||
#define GGML_DEBUG 0
|
||||
#if (GGML_DEBUG >= 1)
|
||||
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
@@ -134,7 +134,13 @@ static bool set_no_delay(sockfd_t sockfd) {
|
||||
int flag = 1;
|
||||
// set TCP_NODELAY to disable Nagle's algorithm
|
||||
int ret = setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, (char *)&flag, sizeof(int));
|
||||
return ret >= 0;
|
||||
return ret == 0;
|
||||
}
|
||||
|
||||
static bool set_reuse_addr(sockfd_t sockfd) {
|
||||
int flag = 1;
|
||||
int ret = setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof(int));
|
||||
return ret == 0;
|
||||
}
|
||||
|
||||
static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
||||
@@ -181,7 +187,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
||||
if (sock == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (!set_reuse_addr(sockfd)) {
|
||||
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
||||
return nullptr;
|
||||
}
|
||||
struct sockaddr_in serv_addr;
|
||||
serv_addr.sin_family = AF_INET;
|
||||
serv_addr.sin_addr.s_addr = inet_addr(host);
|
||||
|
||||
@@ -13987,6 +13987,10 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
|
||||
|
||||
#pragma message("TODO: generalize upscale operator")
|
||||
#pragma message(" https://github.com/ggerganov/ggml/pull/814")
|
||||
GGML_ASSERT(false && "TODO: generalize upscale operator");
|
||||
|
||||
const int scale_factor = dst->op_params[0];
|
||||
|
||||
upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
|
||||
|
||||
15  ggml.h
@@ -565,7 +565,8 @@ extern "C" {
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
enum ggml_backend_type backend;
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");

struct ggml_backend_buffer * buffer;

@@ -1674,12 +1675,24 @@ extern "C" {
float p1);

// nearest interpolate
// multiplies ne0 and ne1 by scale factor
// used in stable-diffusion
GGML_API struct ggml_tensor * ggml_upscale(
struct ggml_context * ctx,
struct ggml_tensor * a,
int scale_factor);

// nearest interpolate
// nearest interpolate to specified dimensions
// used in tortoise.cpp
GGML_API struct ggml_tensor * ggml_upscale_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int ne0,
int ne1,
int ne2,
int ne3);

// pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
GGML_API struct ggml_tensor * ggml_pad(
struct ggml_context * ctx,
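Taken together, the declarations above keep `ggml_upscale` as the single-factor form (ne0 and ne1 multiplied by `scale_factor`) and add `ggml_upscale_ext` with explicit target extents for all four dimensions. A hedged usage sketch built only from the declarations shown here (the context size and tensor shapes are arbitrary assumptions; graph building and execution are omitted):

```cpp
// Sketch only: shows how the two declarations relate. Assumes the usual ggml
// context setup; error handling is omitted.
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // a: 2D tensor of shape [ne0 = 32, ne1 = 24]
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32, 24);

    // Old API: one factor applied to ne0 and ne1 -> [64, 48, 1, 1]
    struct ggml_tensor * up2 = ggml_upscale(ctx, a, 2);

    // New API: explicit target extents for every dimension -> [128, 24, 1, 1]
    struct ggml_tensor * upx = ggml_upscale_ext(ctx, a, 128, 24, 1, 1);

    (void) up2; (void) upx;
    ggml_free(ctx);
    return 0;
}
```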
35  llama.cpp
@@ -6622,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
||||
cb(q, "q", il);
|
||||
@@ -6644,8 +6645,8 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * v =
|
||||
ggml_view_3d(ctx, kv.v_l[il],
|
||||
n_embd_head_v, n_kv, n_head_kv,
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
|
||||
0);
|
||||
cb(v, "v", il);
|
||||
|
||||
@@ -6655,7 +6656,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
|
||||
} else {
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
@@ -6700,7 +6701,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
||||
cb(kqv_merged, "kqv_merged", il);
|
||||
|
||||
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
|
||||
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
|
||||
cb(cur, "kqv_merged_cont", il);
|
||||
}
|
||||
|
||||
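The llm_build_kqv changes above size the V view, the reshape, and the ggml_cont_2d of the merged attention output with n_embd_head_v instead of n_embd_head_k. For most models the two are equal, but if a model were to use, say, n_embd_head_k = 192, n_embd_head_v = 128 and n_head = 32 (hypothetical numbers), the attention output would have n_embd_head_v*n_head = 4096 columns rather than 192*32 = 6144, which is why the V head size is the correct one to use here.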
@@ -12818,6 +12819,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                }
            }

            if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                LLAMA_LOG_WARN(
                    "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                    "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                    "Are you sure this is what you want?\n", __FUNCTION__);
            }

            if (add_special && vocab.special_add_eos == 1) {
                GGML_ASSERT(vocab.special_eos_id != -1);
                output.push_back(vocab.special_eos_id);
@@ -12844,6 +12852,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                }
            }

            if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
                LLAMA_LOG_WARN(
                    "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                    "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                    "Are you sure this is what you want?\n", __FUNCTION__);
            }

            if (add_special && vocab.special_add_eos == 1) {
                GGML_ASSERT(vocab.special_add_eos != -1);
                output.push_back(vocab.special_eos_id);
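The two warning blocks added above fire when the caller asks for special tokens to be added and the prompt text itself already carries a leading BOS, so the final sequence starts with two BOS tokens. A minimal sketch of how that can arise through the public API (assumes a loaded model whose vocabulary auto-adds BOS; the buffer size and prompt are purely illustrative):

// Tokenize a prompt that spells out the BOS marker while also requesting
// that special tokens be added and parsed.
llama_token toks[64];
int n = llama_tokenize(model, "<s> Hello", 9, toks, 64,
                       /*add_special=*/true, /*parse_special=*/true);
// toks[0] is the BOS inserted because of add_special, toks[1] is the BOS
// parsed from the prompt text: exactly the pattern the new warning reports.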
@@ -13904,9 +13919,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

    // Sample the next word X using top-k sampling
    llama_sample_top_k(nullptr, candidates, int(k), 1);
    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    llama_token X = llama_sample_token(ctx, candidates);
    t_start_sample_us = ggml_time_us();

@@ -13920,9 +13933,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
    // Update mu using the learning rate and error
    *mu = *mu - eta * e;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    return X;
}
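For context on the mu update above: e is computed a few lines earlier in this function as the difference between the observed surprise of the sampled token and the target surprise tau, so the line implements mirostat's rule mu <- mu - eta * (observed_surprise - tau).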
@@ -17015,13 +17026,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
            }
            else {
                if (cell_range_begin != kv_self.size) {
                    cell_ranges.push_back({ cell_range_begin, i });
                    cell_ranges.emplace_back(cell_range_begin, i);
                    cell_range_begin = kv_self.size;
                }
            }
        }
        if (cell_range_begin != kv_self.size) {
            cell_ranges.push_back({ cell_range_begin, kv_self.size });
            cell_ranges.emplace_back(cell_range_begin, kv_self.size);
        }

        // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
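The push_back -> emplace_back change above constructs each range pair directly inside the vector instead of building a brace-initialized temporary first. A generic illustration of the difference (the vector below is a stand-in, not the actual cell_ranges type):

#include <cstdint>
#include <utility>
#include <vector>

std::vector<std::pair<uint32_t, uint32_t>> ranges;
ranges.push_back({ 0, 8 });   // creates a temporary std::pair, then moves it in
ranges.emplace_back(0, 8);    // forwards the arguments, constructs the pair in place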

scripts/debug-test.sh
@@ -1,117 +1,203 @@
#!/bin/bash
test_suite=${1:-}
test_number=${2:-}

PROG=${0##*/}
build_dir="build-ci-debug"

if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
echo "Debug specific ctest program."
echo
echo "Options:"
echo " -h, --help Display this help and exit"
echo
echo "Arguments:"
echo " <test_regex> (Mandatory) Supply one regex to the script to filter tests"
echo " (test_number) (Optional) Test number to run a specific test"
echo
echo "Example:"
echo " $PROG test-tokenizer"
echo " $PROG test-tokenizer 3"
echo
exit 0
fi
# Print Color Commands
red=$(tput setaf 1)
green=$(tput setaf 2)
yellow=$(tput setaf 3)
blue=$(tput setaf 4)
magenta=$(tput setaf 5)
cyan=$(tput setaf 6)
normal=$(tput sgr0)

# Function to select and debug a test
function select_test() {
test_suite=${1:-test}
test_number=${2:-}

# Sanity Check If Tests Is Detected
printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
if [ ${#tests[@]} -eq 0 ]
then
echo "No tests available... check your compilation process..."
echo "Exiting."
exit 1
fi
# Print Help Message
####################

if [ -z $test_number ]
then
# List out available tests
printf "Which test would you like to debug?\n"
id=0
for s in "${tests[@]}"
do
echo "Test# ${id}"
echo " $s"
((id++))
done
print_full_help() {
cat << EOF
Usage: $PROG [OPTION]... <test_regex> (test_number)
Debug specific ctest program.

# Prompt user which test they wanted to run
printf "\nRun test#? "
read test_number
else
printf "\nUser Already Requested #${test_number}"
fi
Options:
  -h, --help            display this help and exit
  -g                    run in gdb mode

# Start GDB with the requested test binary and arguments
printf "Debugging(GDB) test: ${tests[test_number]}\n"
# Change IFS (Internal Field Separator)
sIFS=$IFS
IFS=$'\n'
Arguments:
  <test_regex>     (Mandatory) Supply one regex to the script to filter tests
  (test_number)    (Optional) Test number to run a specific test

# Get test args
gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' ))
IFS=$sIFS
printf "Debug arguments: ${gdb_args[test_number]}\n\n"

# Expand paths if needed
args=()
for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\<//' -e 's/\>"//')
do
args+=($(echo $x | sed -e 's/.*\/..\//..\//'))
done

# Execute debugger
echo "gdb args: ${args[@]}"
gdb --args ${args[@]}
Example:
  $PROG test-tokenizer
  $PROG test-tokenizer 3
EOF
}

abort() {
echo "Error: $1" >&2
cat << EOF >&2
Usage: $PROG [OPTION]... <test_regex> (test_number)
Debug specific ctest program.
Refer to --help for full instructions.
EOF
exit 1
}

# Dependency Sanity Check
#########################

check_dependency() {
command -v "$1" >/dev/null 2>&1 || {
abort "$1 is required but not found. Please install it and try again."
}
}

check_dependency ctest
check_dependency cmake


# Step 0: Check the args
if [ -z "$test_suite" ]
then
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
echo "Supply one regex to the script to filter tests,"
echo "and optionally a test number to run a specific test."
echo "Use --help flag for full instructions"
exit 1
########################

if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
print_full_help >&2
exit 0
fi

# Parse command-line options
gdb_mode=false
while getopts "g" opt; do
case $opt in
g)
gdb_mode=true
echo "gdb_mode Mode Enabled"
;;
esac
done

# Shift the option parameters
shift $((OPTIND - 1))

# Positional Argument Processing : <test_regex>
if [ -z "${1}" ]; then
abort "Test regex is required"
else
test_suite=${1:-}
fi

# Positional Argument Processing : (test_number)
test_number=${2:-}


# Step 1: Reset and Setup folder context
########################################

## Sanity check that we are actually in a git repo
repo_root=$(git rev-parse --show-toplevel)
if [ ! -d "$repo_root" ]; then
echo "Error: Not in a Git repository."
exit 1
abort "Not in a Git repository."
fi

## Reset folder to root context of git repo
pushd "$repo_root" || exit 1
## Reset folder to root context of git repo and Create and enter build directory
pushd "$repo_root"
rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"

## Create and enter build directory
rm -rf "$build_dir" && mkdir "$build_dir" || exit 1

# Step 2: Setup Build Environment and Compile Test Binaries
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1
pushd "$build_dir" && make -j || exit 1
###########################################################

# Step 3: Debug the Test
select_test "$test_suite" "$test_number"
# Note: test-eval-callback requires -DLLAMA_CURL
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build environment"
pushd "$build_dir"
make -j || abort "Failed to compile"
popd > /dev/null || exit 1

# Step 4: Return to the directory from which the user ran the command.
popd || exit 1
popd || exit 1
popd || exit 1

# Step 3: Find all tests available that match REGEX
####################################################

# Ctest Gather Tests
# `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
# `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
# `-V` : Verbose Mode
printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
pushd "$build_dir"
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
if [ ${#tests[@]} -eq 0 ]; then
abort "No tests available... check your compilation process..."
fi
popd > /dev/null || exit 1


# Step 4: Identify Test Command for Debugging
#############################################

# Select test number
if [ -z $test_number ]; then
# List out available tests
printf "Which test would you like to debug?\n"
id=0
for s in "${tests[@]}"
do
echo "Test# ${id}"
echo " $s"
((id++))
done

# Prompt user which test they wanted to run
printf "\nRun test#? "
read test_number

else
printf "\nUser Already Requested #${test_number}\n"

fi

# Grab all tests commands
pushd "$build_dir"
sIFS=$IFS # Save Initial IFS (Internal Field Separator)
IFS=$'\n' # Change IFS (Internal Field Separator) (So we split ctest output by newline rather than by spaces)
test_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) # Get test args
IFS=$sIFS # Reset IFS (Internal Field Separator)
popd > /dev/null || exit 1

# Grab specific test command
single_test_name="${tests[test_number]}"
single_test_command="${test_args[test_number]}"


# Step 5: Execute or GDB Debug
##############################

printf "${magenta}Running Test #${test_number}: ${single_test_name}${normal}\n"
printf "${cyan}single_test_command: ${single_test_command}${normal}\n"

if [ "$gdb_mode" = "true" ]; then
# Execute debugger
pushd "$repo_root" || exit 1
eval "gdb --args ${single_test_command}"
popd > /dev/null || exit 1

else
# Execute Test
pushd "$repo_root" || exit 1
eval "${single_test_command}"
exit_code=$?
popd > /dev/null || exit 1

# Print Result
printf "${blue}Ran Test #${test_number}: ${single_test_name}${normal}\n"
printf "${yellow}Command: ${single_test_command}${normal}\n"
if [ $exit_code -eq 0 ]; then
printf "${green}TEST PASS${normal}\n"
else
printf "${red}TEST FAIL${normal}\n"
fi

fi

# Return to the directory from which the user ran the command.
popd > /dev/null || exit 1
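After the rewrite above, the script accepts an optional -g flag (parsed with getopts before the positional arguments), a mandatory test regex, and an optional test number. An invocation such as debug-test.sh -g test-tokenizer 3 would therefore configure the build-ci-debug tree, gather the ctest entries matching test-tokenizer, and open the third one under gdb; without -g the selected test is simply run and reported as TEST PASS or TEST FAIL.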

scripts/sync-ggml-am.sh
@@ -112,6 +112,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    # src/ggml-opencl.h -> ggml-opencl.h
    # src/ggml-quants.c -> ggml-quants.c
    # src/ggml-quants.h -> ggml-quants.h
    # src/ggml-rpc.cpp -> ggml-rpc.cpp
    # src/ggml-rpc.h -> ggml-rpc.h
    # src/ggml-sycl.cpp -> ggml-sycl.cpp
    # src/ggml-sycl.h -> ggml-sycl.h
    # src/ggml-vulkan.cpp -> ggml-vulkan.cpp
@@ -149,6 +151,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
        -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
        -e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \
        -e 's/src\/ggml-rpc\.h/ggml-rpc.h/g' \
        -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
        -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
        -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \

scripts/sync-ggml.last
@@ -1 +1 @@
fafd5e7f89382b8cfb51e3dac8d4f1500ca44918
126d34985705a5a2222723c145cb4e125ac689f3

scripts/sync-ggml.sh
@@ -20,6 +20,8 @@ cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c
cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h
cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp
cp -rpv ../ggml/src/ggml-rpc.h ./ggml-rpc.h
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml-sycl.cpp
cp -rpv ../ggml/src/ggml-sycl.h ./ggml-sycl.h
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml-vulkan.cpp

tests/test-backend-ops.cpp
@@ -1329,23 +1329,47 @@ struct test_upscale : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const int32_t scale_factor;
    const bool transpose;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, scale_factor);
        return VARS_TO_STR4(type, ne, scale_factor, transpose);
    }

    test_upscale(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {512, 512, 3, 1},
            int32_t scale_factor = 2)
        : type(type), ne(ne), scale_factor(scale_factor) {}
            int32_t scale_factor = 2, bool transpose = false)
        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        if (transpose) a = ggml_transpose(ctx, a);
        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
        return out;
    }
};

// GGML_OP_UPSCALE (ext)
struct test_upscale_ext : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const std::array<int64_t, 4> ne_tgt;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, ne_tgt);
    }

    test_upscale_ext(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {2, 5, 7, 11},
            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
        : type(type), ne(ne), ne_tgt(ne_tgt) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1], ne_tgt[2], ne_tgt[3]);
        return out;
    }
};

// GGML_OP_GROUP_NORM
struct test_group_norm : public test_case {
    const ggml_type type;
@@ -2169,6 +2193,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op

    test_cases.emplace_back(new test_sum_rows());
    test_cases.emplace_back(new test_upscale());
    test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
    test_cases.emplace_back(new test_upscale_ext());
    test_cases.emplace_back(new test_group_norm());
    test_cases.emplace_back(new test_acc());
    test_cases.emplace_back(new test_pad());