mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-30 16:47:31 +03:00
Compare commits
26 Commits
b2890
...
ci-android
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
007f2ece0a | ||
|
|
99d1e7eb8a | ||
|
|
2117b30380 | ||
|
|
8725937362 | ||
|
|
51e9d02599 | ||
|
|
d273c1402b | ||
|
|
27b040691c | ||
|
|
29c60d8cdd | ||
|
|
359cbe3f46 | ||
|
|
e18bc6aaf3 | ||
|
|
ee94172d33 | ||
|
|
934266c0e0 | ||
|
|
9c4fdcbec8 | ||
|
|
24ecb58168 | ||
|
|
9afdffe70e | ||
|
|
3b3963c55c | ||
|
|
dda64fc17c | ||
|
|
0350f58152 | ||
|
|
ad52d5c259 | ||
|
|
172b78210a | ||
|
|
13ad16af12 | ||
|
|
8f7080bf48 | ||
|
|
e1b40ac3b9 | ||
|
|
dc020985b8 | ||
|
|
344f9126cc | ||
|
|
9a17ab914b |
61
.github/workflows/build.yml
vendored
61
.github/workflows/build.yml
vendored
@@ -693,26 +693,28 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'rpc'
|
||||
- build: 'rpc-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'noavx'
|
||||
- build: 'noavx-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
- build: 'avx2-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx'
|
||||
- build: 'avx-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512'
|
||||
- build: 'avx512-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast'
|
||||
- build: 'clblast-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
- build: 'openblas-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
- build: 'kompute'
|
||||
- build: 'kompute-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'vulkan'
|
||||
- build: 'vulkan-x64'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'arm64'
|
||||
defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'llvm-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'msvc-arm64'
|
||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -723,13 +725,13 @@ jobs:
|
||||
|
||||
- name: Clone Kompute submodule
|
||||
id: clone_kompute
|
||||
if: ${{ matrix.build == 'kompute' }}
|
||||
if: ${{ matrix.build == 'kompute-x64' }}
|
||||
run: |
|
||||
git submodule update --init kompute
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
|
||||
mkdir $env:RUNNER_TEMP/opencl
|
||||
@@ -737,7 +739,7 @@ jobs:
|
||||
|
||||
- name: Download CLBlast
|
||||
id: get_clblast
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
|
||||
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
|
||||
@@ -750,7 +752,7 @@ jobs:
|
||||
|
||||
- name: Download OpenBLAS
|
||||
id: get_openblas
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
|
||||
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
|
||||
@@ -763,38 +765,41 @@ jobs:
|
||||
|
||||
- name: Install Vulkan SDK
|
||||
id: get_vulkan
|
||||
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
|
||||
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
|
||||
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
|
||||
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
|
||||
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. ${{ matrix.defines }}
|
||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
cmake -S . -B build ${{ matrix.defines }}
|
||||
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||
|
||||
- name: Add clblast.dll
|
||||
id: add_clblast_dll
|
||||
if: ${{ matrix.build == 'clblast' }}
|
||||
if: ${{ matrix.build == 'clblast-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
|
||||
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
|
||||
|
||||
- name: Add libopenblas.dll
|
||||
id: add_libopenblas_dll
|
||||
if: ${{ matrix.build == 'openblas' }}
|
||||
if: ${{ matrix.build == 'openblas-x64' }}
|
||||
run: |
|
||||
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
|
||||
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
|
||||
|
||||
- name: Check AVX512F support
|
||||
id: check_avx512f
|
||||
if: ${{ matrix.build == 'avx512' }}
|
||||
if: ${{ matrix.build == 'avx512-x64' }}
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cd build
|
||||
@@ -808,14 +813,14 @@ jobs:
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
# not all machines have native AVX-512
|
||||
if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
|
||||
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Test (Intel SDE)
|
||||
id: cmake_test_sde
|
||||
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# for some weird reason windows tar doesn't like sde tar.xz
|
||||
@@ -843,14 +848,14 @@ jobs:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
|
||||
name: llama-bin-win-${{ matrix.build }}-x64.zip
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
|
||||
name: llama-bin-win-${{ matrix.build }}.zip
|
||||
|
||||
windows-latest-cmake-cuda:
|
||||
runs-on: windows-latest
|
||||
|
||||
@@ -1007,6 +1007,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
|
||||
if (GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
add_compile_definitions(__ARM_FEATURE_DOTPROD)
|
||||
endif ()
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
|
||||
endif ()
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
||||
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
||||
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
|
||||
45
CMakePresets.json
Normal file
45
CMakePresets.json
Normal file
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"version": 4,
|
||||
"configurePresets": [
|
||||
{
|
||||
"name": "base",
|
||||
"hidden": true,
|
||||
"generator": "Ninja",
|
||||
"binaryDir": "${sourceDir}/build-${presetName}",
|
||||
"cacheVariables": {
|
||||
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||
}
|
||||
},
|
||||
|
||||
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
|
||||
|
||||
{
|
||||
"name": "arm64-windows-msvc", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{
|
||||
"name": "arm64-windows-llvm", "hidden": true,
|
||||
"architecture": { "value": "arm64", "strategy": "external" },
|
||||
"toolset": { "value": "host=x86_64", "strategy": "external" },
|
||||
"cacheVariables": {
|
||||
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
|
||||
}
|
||||
},
|
||||
|
||||
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
|
||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
|
||||
]
|
||||
}
|
||||
@@ -532,7 +532,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||
&& cmake --build build --config Release -- -j 16
|
||||
```
|
||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
|
||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
|
||||
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||
|
||||
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
||||
@@ -712,6 +712,9 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
### Prepare and Quantize
|
||||
|
||||
> [!NOTE]
|
||||
> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours.
|
||||
|
||||
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
|
||||
|
||||
Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
|
||||
|
||||
16
cmake/arm64-windows-llvm.cmake
Normal file
16
cmake/arm64-windows-llvm.cmake
Normal file
@@ -0,0 +1,16 @@
|
||||
set( CMAKE_SYSTEM_NAME Windows )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-pc-windows-msvc )
|
||||
|
||||
set( CMAKE_C_COMPILER clang )
|
||||
set( CMAKE_CXX_COMPILER clang++ )
|
||||
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
|
||||
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
|
||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
|
||||
|
||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||
6
cmake/arm64-windows-msvc.cmake
Normal file
6
cmake/arm64-windows-msvc.cmake
Normal file
@@ -0,0 +1,6 @@
|
||||
set( CMAKE_SYSTEM_NAME Windows )
|
||||
set( CMAKE_SYSTEM_PROCESSOR arm64 )
|
||||
|
||||
set( target arm64-pc-windows-msvc )
|
||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||
@@ -2553,7 +2553,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
|
||||
size_t pos_start = 0;
|
||||
size_t pos_found = 0;
|
||||
|
||||
if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
|
||||
if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
|
||||
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
|
||||
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
|
||||
data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
|
||||
|
||||
@@ -26,7 +26,7 @@ namespace grammar_parser {
|
||||
|
||||
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
|
||||
auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
|
||||
return result.first->second;
|
||||
}
|
||||
|
||||
|
||||
@@ -272,7 +272,7 @@ private:
|
||||
if (literal.empty()) {
|
||||
return false;
|
||||
}
|
||||
ret.push_back(std::make_pair(literal, true));
|
||||
ret.emplace_back(literal, true);
|
||||
literal.clear();
|
||||
return true;
|
||||
};
|
||||
@@ -298,7 +298,7 @@ private:
|
||||
while (i < length) {
|
||||
char c = sub_pattern[i];
|
||||
if (c == '.') {
|
||||
seq.push_back(std::make_pair(get_dot(), false));
|
||||
seq.emplace_back(get_dot(), false);
|
||||
i++;
|
||||
} else if (c == '(') {
|
||||
i++;
|
||||
@@ -307,7 +307,7 @@ private:
|
||||
_warnings.push_back("Unsupported pattern syntax");
|
||||
}
|
||||
}
|
||||
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false));
|
||||
seq.emplace_back("(" + to_rule(transform()) + ")", false);
|
||||
} else if (c == ')') {
|
||||
i++;
|
||||
if (start > 0 && sub_pattern[start - 1] != '(') {
|
||||
@@ -331,9 +331,9 @@ private:
|
||||
}
|
||||
square_brackets += ']';
|
||||
i++;
|
||||
seq.push_back(std::make_pair(square_brackets, false));
|
||||
seq.emplace_back(square_brackets, false);
|
||||
} else if (c == '|') {
|
||||
seq.push_back(std::make_pair("|", false));
|
||||
seq.emplace_back("|", false);
|
||||
i++;
|
||||
} else if (c == '*' || c == '+' || c == '?') {
|
||||
seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
|
||||
@@ -417,7 +417,7 @@ private:
|
||||
}
|
||||
}
|
||||
if (!literal.empty()) {
|
||||
seq.push_back(std::make_pair(literal, true));
|
||||
seq.emplace_back(literal, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
10
common/log.h
10
common/log.h
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_FLF_FMT "%s"
|
||||
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "%s"
|
||||
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
// Main LOG macro.
|
||||
// behaves like printf, and supports arguments the exact same way.
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
|
||||
// Secondary target can be changed just like LOG_TARGET
|
||||
// by defining LOG_TEE_TARGET
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
|
||||
#endif
|
||||
|
||||
// LOG macro variants with auto endline.
|
||||
#ifndef _MSC_VER
|
||||
#if !defined(_MSC_VER) || defined(__clang__)
|
||||
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
||||
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
||||
#else
|
||||
|
||||
@@ -20,11 +20,13 @@
|
||||
# - Update llama.cpp with the new pre-tokenizer if necessary
|
||||
#
|
||||
# TODO: generate tokenizer tests for llama.cpp
|
||||
# TODO: automate the update of convert-hf-to-gguf.py
|
||||
#
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
@@ -35,6 +37,7 @@ from transformers import AutoTokenizer
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logger = logging.getLogger("convert-hf-to-gguf-update")
|
||||
sess = requests.Session()
|
||||
|
||||
|
||||
class TOKENIZER_TYPE(IntEnum):
|
||||
@@ -79,63 +82,44 @@ models = [
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
]
|
||||
|
||||
# make directory "models/tokenizers" if it doesn't exist
|
||||
if not os.path.exists("models/tokenizers"):
|
||||
os.makedirs("models/tokenizers")
|
||||
|
||||
|
||||
def download_file_with_auth(url, token, save_path):
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
response = requests.get(url, headers=headers)
|
||||
if response.status_code == 200:
|
||||
with open(save_path, 'wb') as f:
|
||||
f.write(response.content)
|
||||
logger.info(f"File {save_path} downloaded successfully")
|
||||
else:
|
||||
logger.info(f"Failed to download file. Status code: {response.status_code}")
|
||||
response = sess.get(url, headers=headers)
|
||||
response.raise_for_status()
|
||||
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
||||
with open(save_path, 'wb') as f:
|
||||
f.write(response.content)
|
||||
logger.info(f"File {save_path} downloaded successfully")
|
||||
|
||||
|
||||
# download the tokenizer models
|
||||
for model in models:
|
||||
def download_model(model):
|
||||
name = model["name"]
|
||||
repo = model["repo"]
|
||||
tokt = model["tokt"]
|
||||
|
||||
if not os.path.exists(f"models/tokenizers/{name}"):
|
||||
os.makedirs(f"models/tokenizers/{name}")
|
||||
else:
|
||||
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
|
||||
continue
|
||||
|
||||
logger.info(f"Downloading {name} to models/tokenizers/{name}")
|
||||
|
||||
url = f"{repo}/raw/main/config.json"
|
||||
save_path = f"models/tokenizers/{name}/config.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
url = f"{repo}/raw/main/tokenizer.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
# if downloaded file is less than 1KB, we likely need to download an LFS instead
|
||||
if os.path.getsize(save_path) < 1024:
|
||||
# remove the file
|
||||
os.remove(save_path)
|
||||
url = f"{repo}/resolve/main/tokenizer.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
|
||||
|
||||
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
|
||||
if tokt == TOKENIZER_TYPE.SPM:
|
||||
url = f"{repo}/resolve/main/tokenizer.model"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer.model"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
files.append("tokenizer.model")
|
||||
|
||||
for file in files:
|
||||
save_path = f"models/tokenizers/{name}/{file}"
|
||||
if os.path.isfile(save_path):
|
||||
logger.info(f"{name}: File {save_path} already exists - skipping")
|
||||
continue
|
||||
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
|
||||
|
||||
|
||||
for model in models:
|
||||
try:
|
||||
download_model(model)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||
|
||||
url = f"{repo}/raw/main/tokenizer_config.json"
|
||||
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
|
||||
download_file_with_auth(url, token, save_path)
|
||||
|
||||
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
|
||||
# TODO: auto-update convert-hf-to-gguf.py with the generated function
|
||||
|
||||
src_ifs = ""
|
||||
for model in models:
|
||||
@@ -224,11 +208,18 @@ src_func = f"""
|
||||
return res
|
||||
"""
|
||||
|
||||
print(src_func) # noqa: NP100
|
||||
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||
convert_py = convert_py_pth.read_text()
|
||||
convert_py = re.sub(
|
||||
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||
lambda m: m.group(1) + src_func + m.group(3),
|
||||
convert_py,
|
||||
flags=re.DOTALL | re.MULTILINE,
|
||||
)
|
||||
|
||||
logger.info("\n")
|
||||
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
||||
logger.info("\n")
|
||||
convert_py_pth.write_text(convert_py)
|
||||
|
||||
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||
|
||||
# generate tests for each tokenizer model
|
||||
|
||||
|
||||
@@ -402,6 +402,7 @@ class Model:
|
||||
# NOTE: this function is generated by convert-hf-to-gguf-update.py
|
||||
# do not modify it manually!
|
||||
# ref: https://github.com/ggerganov/llama.cpp/pull/6920
|
||||
# Marker: Start get_vocab_base_pre
|
||||
def get_vocab_base_pre(self, tokenizer) -> str:
|
||||
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
|
||||
# is specific for the BPE pre-tokenizer used by the model
|
||||
@@ -489,6 +490,7 @@ class Model:
|
||||
logger.debug(f"chkhsh: {chkhsh}")
|
||||
|
||||
return res
|
||||
# Marker: End get_vocab_base_pre
|
||||
|
||||
def _set_vocab_gpt2(self) -> None:
|
||||
tokens, toktypes, tokpre = self.get_vocab_base()
|
||||
@@ -526,7 +528,7 @@ class Model:
|
||||
|
||||
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
||||
added_vocab = tokenizer.special_tokens
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
|
||||
@@ -1109,7 +1109,7 @@ class OutputFile:
|
||||
if metadata is not None and metadata.name is not None:
|
||||
name = metadata.name
|
||||
elif params.path_model is not None:
|
||||
name = str(params.path_model.parent).split("/")[-1]
|
||||
name = params.path_model.name
|
||||
elif params.n_ctx == 4096:
|
||||
# Heuristic detection of LLaMA v2 model
|
||||
name = "LLaMA v2"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Debugging Tests Tips
|
||||
|
||||
## How to run & debug a specific test without anything else to keep the feedback loop short?
|
||||
## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
|
||||
|
||||
There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
|
||||
|
||||
@@ -10,13 +10,27 @@ For example, running the following command will output an interactive list from
|
||||
|
||||
It will then build & run in the debugger for you.
|
||||
|
||||
To just execute a test and get back a PASS or FAIL message run:
|
||||
|
||||
```bash
|
||||
./scripts/debug-test.sh test-tokenizer
|
||||
```
|
||||
|
||||
To test in GDB use the `-g` flag to enable gdb test mode.
|
||||
|
||||
```bash
|
||||
./scripts/debug-test.sh -g test-tokenizer
|
||||
|
||||
# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
|
||||
>>> b main
|
||||
```
|
||||
|
||||
To speed up the testing loop, if you know your test number you can just run it similar to below:
|
||||
|
||||
```bash
|
||||
./scripts/debug-test.sh test 23
|
||||
```
|
||||
|
||||
For further reference use `debug-test.sh -h` to print help.
|
||||
|
||||
|
||||
@@ -41,7 +55,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
|
||||
make -j
|
||||
```
|
||||
|
||||
#### Step 3.1: Identify Test Command for Debugging
|
||||
#### Step 3: Find all tests available that matches REGEX
|
||||
|
||||
The output of this command will give you the command & arguments needed to run GDB.
|
||||
|
||||
@@ -69,11 +83,13 @@ Labels: main
|
||||
...
|
||||
```
|
||||
|
||||
So for test #1 we can tell these two pieces of relevant information:
|
||||
#### Step 4: Identify Test Command for Debugging
|
||||
|
||||
So for test #1 above we can tell these two pieces of relevant information:
|
||||
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
|
||||
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
|
||||
|
||||
#### Step 3.2: Run GDB on test command
|
||||
#### Step 5: Run GDB on test command
|
||||
|
||||
Based on the ctest 'test command' report above we can then run a gdb session via this command below:
|
||||
|
||||
|
||||
@@ -12,15 +12,17 @@ cmake_minimum_required(VERSION 3.22.1)
|
||||
# build script scope).
|
||||
project("llama-android")
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
llama
|
||||
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
GIT_TAG master
|
||||
)
|
||||
#include(FetchContent)
|
||||
#FetchContent_Declare(
|
||||
# llama
|
||||
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
# GIT_TAG ci-android
|
||||
#)
|
||||
#
|
||||
## Also provides "common"
|
||||
#FetchContent_MakeAvailable(llama)
|
||||
|
||||
# Also provides "common"
|
||||
FetchContent_MakeAvailable(llama)
|
||||
add_subdirectory(../../../../../../ please-work)
|
||||
|
||||
# Creates and names a library, sets it as either STATIC
|
||||
# or SHARED, and provides the relative paths to its source code.
|
||||
|
||||
@@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
|
||||
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
|
||||
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
|
||||
struct {
|
||||
struct ggml_tensor * newline;
|
||||
struct ggml_context * ctx;
|
||||
} model;
|
||||
|
||||
@@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
|
||||
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
|
||||
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
||||
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
|
||||
if (newline_tmp->buffer == NULL) {
|
||||
LOG_TEE("newline_tmp tensor buffer is NULL\n");
|
||||
}
|
||||
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
|
||||
} else {
|
||||
model.newline->data = newline_tmp->data;
|
||||
if (model.newline->data == NULL) {
|
||||
LOG_TEE("newline_tmp tensor data is NULL\n");
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
|
||||
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
|
||||
// fill it with the image embeddings, ignoring the base
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
# quantize
|
||||
|
||||
TODO
|
||||
You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
|
||||
|
||||
Note: It is synced from llama.cpp `main` every 6 hours.
|
||||
|
||||
## Llama 2 7B
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ cmake --build . --config Release
|
||||
Then, start the `rpc-server` with the backend:
|
||||
|
||||
```bash
|
||||
$ bin/rpc-server 0.0.0.0 50052
|
||||
$ bin/rpc-server -p 50052
|
||||
create_backend: using CUDA backend
|
||||
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
||||
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
|
||||
@@ -53,7 +53,7 @@ Starting RPC server on 0.0.0.0:50052
|
||||
|
||||
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
|
||||
```bash
|
||||
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
|
||||
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
|
||||
```
|
||||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||
|
||||
|
||||
@@ -7,9 +7,60 @@
|
||||
#endif
|
||||
|
||||
#include "ggml-rpc.h"
|
||||
#ifdef _WIN32
|
||||
# include <windows.h>
|
||||
#else
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
struct rpc_server_params {
|
||||
std::string host = "0.0.0.0";
|
||||
int port = 50052;
|
||||
size_t backend_mem = 0;
|
||||
};
|
||||
|
||||
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
|
||||
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
|
||||
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
|
||||
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
if (arg == "-H" || arg == "--host") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
}
|
||||
params.host = argv[i];
|
||||
} else if (arg == "-p" || arg == "--port") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
}
|
||||
params.port = std::stoi(argv[i]);
|
||||
if (params.port <= 0 || params.port > 65535) {
|
||||
return false;
|
||||
}
|
||||
} else if (arg == "-m" || arg == "--mem") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
}
|
||||
params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static ggml_backend_t create_backend() {
|
||||
ggml_backend_t backend = NULL;
|
||||
#ifdef GGML_USE_CUDA
|
||||
@@ -38,21 +89,25 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
|
||||
#ifdef GGML_USE_CUDA
|
||||
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
|
||||
#else
|
||||
// TODO: implement for other backends
|
||||
*free_mem = 1;
|
||||
*total_mem = 1;
|
||||
#ifdef _WIN32
|
||||
MEMORYSTATUSEX status;
|
||||
status.dwLength = sizeof(status);
|
||||
GlobalMemoryStatusEx(&status);
|
||||
*total_mem = status.ullTotalPhys;
|
||||
*free_mem = status.ullAvailPhys;
|
||||
#else
|
||||
long pages = sysconf(_SC_PHYS_PAGES);
|
||||
long page_size = sysconf(_SC_PAGE_SIZE);
|
||||
*total_mem = pages * page_size;
|
||||
*free_mem = *total_mem;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
const char * host = argv[1];
|
||||
int port = std::stoi(argv[2]);
|
||||
if (port <= 0 || port > 65535) {
|
||||
fprintf(stderr, "Invalid port number: %d\n", port);
|
||||
rpc_server_params params;
|
||||
if (!rpc_server_params_parse(argc, argv, params)) {
|
||||
fprintf(stderr, "Invalid parameters\n");
|
||||
return 1;
|
||||
}
|
||||
ggml_backend_t backend = create_backend();
|
||||
@@ -60,10 +115,15 @@ int main(int argc, char * argv[]) {
|
||||
fprintf(stderr, "Failed to create backend\n");
|
||||
return 1;
|
||||
}
|
||||
printf("Starting RPC server on %s:%d\n", host, port);
|
||||
std::string endpoint = params.host + ":" + std::to_string(params.port);
|
||||
size_t free_mem, total_mem;
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
std::string endpoint = std::string(host) + ":" + std::to_string(port);
|
||||
if (params.backend_mem > 0) {
|
||||
free_mem = params.backend_mem;
|
||||
total_mem = params.backend_mem;
|
||||
} else {
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
}
|
||||
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
|
||||
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
|
||||
ggml_backend_free(backend);
|
||||
return 0;
|
||||
|
||||
@@ -17,7 +17,8 @@ The project is under active development, and we are [looking for feedback and co
|
||||
|
||||
**Command line options:**
|
||||
|
||||
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
|
||||
- `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
|
||||
- `-t N`, `--threads N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend.
|
||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU.
|
||||
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
|
||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||
@@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
|
||||
- `--numa distribute`: Spread execution evenly over all nodes
|
||||
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
|
||||
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system
|
||||
page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
|
||||
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
|
||||
- `--numa`: Attempt optimizations that may help on some NUMA systems.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
|
||||
@@ -293,14 +293,13 @@ def start_server_background(args):
|
||||
|
||||
|
||||
def is_server_listening(server_fqdn, server_port):
|
||||
try:
|
||||
url = f"{server_fqdn}:{server_port}/health"
|
||||
if not url.startswith("http://"):
|
||||
url = f"http://{url}"
|
||||
result = requests.get(url)
|
||||
return result.status_code == 200
|
||||
except Exception:
|
||||
return False
|
||||
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
|
||||
result = sock.connect_ex((server_fqdn, server_port))
|
||||
_is_server_listening = result == 0
|
||||
if _is_server_listening:
|
||||
print(f"server is listening on {server_fqdn}:{server_port}...")
|
||||
return _is_server_listening
|
||||
|
||||
|
||||
def escape_metric_name(metric_name):
|
||||
return re.sub('[^A-Z0-9]', '_', metric_name.upper())
|
||||
|
||||
@@ -2387,6 +2387,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
printf(" --rpc SERVERS comma separated list of RPC servers\n");
|
||||
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
|
||||
printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n");
|
||||
printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n");
|
||||
@@ -2439,6 +2440,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
||||
break;
|
||||
}
|
||||
sparams.port = std::stoi(argv[i]);
|
||||
} else if (arg == "--rpc") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.rpc_servers = argv[i];
|
||||
} else if (arg == "--host") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
||||
@@ -1895,7 +1895,6 @@ void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * t
|
||||
|
||||
tensor->buffer = buffer;
|
||||
tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
|
||||
tensor->backend = tensor->view_src->backend;
|
||||
ggml_backend_buffer_init_tensor(buffer, tensor);
|
||||
}
|
||||
|
||||
|
||||
@@ -2558,7 +2558,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
}
|
||||
|
||||
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
|
||||
if (cuda_graph_update_required) {
|
||||
if (use_cuda_graph && cuda_graph_update_required) {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates++;
|
||||
} else {
|
||||
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
|
||||
|
||||
@@ -1986,7 +1986,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
||||
|
||||
for (int j = 0; j < QK_K/16; ++j) {
|
||||
if (quant_weights) {
|
||||
const float * qw = quant_weights ? quant_weights + QK_K * i + 16*j : NULL;
|
||||
const float * qw = quant_weights + QK_K * i + 16*j;
|
||||
for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j+l]*x[16*j+l]);
|
||||
} else {
|
||||
for (int l = 0; l < 16; ++l) weight[l] = x[16*j+l]*x[16*j+l];
|
||||
@@ -3487,10 +3487,9 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q4_0 * restrict vx0 = vx;
|
||||
const block_q4_0 * restrict vx1 = vx + bx;
|
||||
|
||||
const block_q4_0 * restrict vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx);
|
||||
const block_q8_0 * restrict vy0 = vy;
|
||||
const block_q8_0 * restrict vy1 = vy + by;
|
||||
const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
|
||||
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
|
||||
@@ -3524,10 +3523,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||
|
||||
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||
float32_t _scale[4] = { GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||
|
||||
float32x4_t scale = vld1q_f32(_scale);
|
||||
|
||||
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
@@ -3894,9 +3895,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q4_1 * restrict vx0 = vx;
|
||||
const block_q4_1 * restrict vx1 = vx + bx;
|
||||
const block_q4_1 * restrict vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx);
|
||||
const block_q8_1 * restrict vy0 = vy;
|
||||
const block_q8_1 * restrict vy1 = vy + by;
|
||||
const block_q8_1 * restrict vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by);
|
||||
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t summs0 = vdupq_n_f32(0.0f);
|
||||
@@ -3907,11 +3908,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
const block_q8_1 * restrict b_y0 = &vy0[i];
|
||||
const block_q8_1 * restrict b_y1 = &vy1[i];
|
||||
|
||||
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
|
||||
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
|
||||
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
|
||||
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
|
||||
summs0 += summs_t;
|
||||
float32_t summs_t[4] = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
|
||||
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
|
||||
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
|
||||
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
|
||||
summs0 = vaddq_f32(summs0, vld1q_f32(summs_t));
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||
|
||||
@@ -3931,10 +3932,11 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||
|
||||
// mmla into int32x4_t
|
||||
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
|
||||
GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
|
||||
GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
|
||||
GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
|
||||
float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d,
|
||||
GGML_FP16_TO_FP32(b_x0->d)*b_y1->d,
|
||||
GGML_FP16_TO_FP32(b_x1->d)*b_y0->d,
|
||||
GGML_FP16_TO_FP32(b_x1->d)*b_y1->d};
|
||||
float32x4_t scale = vld1q_f32(_scale);
|
||||
|
||||
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
@@ -3953,7 +3955,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
||||
|
||||
float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
|
||||
float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
|
||||
sumv2 = sumv2 + summs0;
|
||||
sumv2 = vaddq_f32(sumv2, summs0);
|
||||
|
||||
vst1_f32(s, vget_low_f32(sumv2));
|
||||
vst1_f32(s + bs, vget_high_f32(sumv2));
|
||||
@@ -4837,9 +4839,9 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
#if defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (nrc == 2) {
|
||||
const block_q8_0 * restrict vx0 = vx;
|
||||
const block_q8_0 * restrict vx1 = vx + bx;
|
||||
const block_q8_0 * restrict vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx);
|
||||
const block_q8_0 * restrict vy0 = vy;
|
||||
const block_q8_0 * restrict vy1 = vy + by;
|
||||
const block_q8_0 * restrict vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by);
|
||||
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
|
||||
@@ -4861,10 +4863,11 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||
const int8x16_t y1_l = vld1q_s8(b_y1->qs);
|
||||
const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16);
|
||||
|
||||
float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||
float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d),
|
||||
GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)};
|
||||
float32x4_t scale = vld1q_f32(_scale);
|
||||
|
||||
int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l)));
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
#define GGML_DEBUG 1
|
||||
#define GGML_DEBUG 0
|
||||
#if (GGML_DEBUG >= 1)
|
||||
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
|
||||
#else
|
||||
|
||||
@@ -13989,7 +13989,7 @@ inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
|
||||
|
||||
#pragma message("TODO: generalize upscale operator")
|
||||
#pragma message(" https://github.com/ggerganov/ggml/pull/814")
|
||||
GGML_ASSERT(false && "TODO: generalize upscale operator);
|
||||
GGML_ASSERT(false && "TODO: generalize upscale operator");
|
||||
|
||||
const int scale_factor = dst->op_params[0];
|
||||
|
||||
|
||||
3
ggml.h
3
ggml.h
@@ -565,7 +565,8 @@ extern "C" {
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
enum ggml_backend_type backend;
|
||||
|
||||
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
|
||||
|
||||
struct ggml_backend_buffer * buffer;
|
||||
|
||||
|
||||
35
llama.cpp
35
llama.cpp
@@ -6622,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
||||
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
||||
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
||||
|
||||
struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
|
||||
cb(q, "q", il);
|
||||
@@ -6644,8 +6645,8 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * v =
|
||||
ggml_view_3d(ctx, kv.v_l[il],
|
||||
n_embd_head_v, n_kv, n_head_kv,
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
|
||||
ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
|
||||
0);
|
||||
cb(v, "v", il);
|
||||
|
||||
@@ -6655,7 +6656,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
|
||||
cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
|
||||
} else {
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
@@ -6700,7 +6701,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
|
||||
cb(kqv_merged, "kqv_merged", il);
|
||||
|
||||
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
|
||||
cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
|
||||
cb(cur, "kqv_merged_cont", il);
|
||||
}
|
||||
|
||||
@@ -12818,6 +12819,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
||||
LLAMA_LOG_WARN(
|
||||
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
||||
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
||||
"Are you sure this is what you want?\n", __FUNCTION__);
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_eos == 1) {
|
||||
GGML_ASSERT(vocab.special_eos_id != -1);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
@@ -12844,6 +12852,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
}
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
|
||||
LLAMA_LOG_WARN(
|
||||
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
||||
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
||||
"Are you sure this is what you want?\n", __FUNCTION__);
|
||||
}
|
||||
|
||||
if (add_special && vocab.special_add_eos == 1) {
|
||||
GGML_ASSERT(vocab.special_add_eos != -1);
|
||||
output.push_back(vocab.special_eos_id);
|
||||
@@ -13904,9 +13919,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
||||
|
||||
// Sample the next word X using top-k sampling
|
||||
llama_sample_top_k(nullptr, candidates, int(k), 1);
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
llama_token X = llama_sample_token(ctx, candidates);
|
||||
t_start_sample_us = ggml_time_us();
|
||||
|
||||
@@ -13920,9 +13933,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
||||
// Update mu using the learning rate and error
|
||||
*mu = *mu - eta * e;
|
||||
|
||||
if (ctx) {
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
}
|
||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||
return X;
|
||||
}
|
||||
|
||||
@@ -17015,13 +17026,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
|
||||
}
|
||||
else {
|
||||
if (cell_range_begin != kv_self.size) {
|
||||
cell_ranges.push_back({ cell_range_begin, i });
|
||||
cell_ranges.emplace_back(cell_range_begin, i);
|
||||
cell_range_begin = kv_self.size;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cell_range_begin != kv_self.size) {
|
||||
cell_ranges.push_back({ cell_range_begin, kv_self.size });
|
||||
cell_ranges.emplace_back(cell_range_begin, kv_self.size);
|
||||
}
|
||||
|
||||
// DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
|
||||
|
||||
@@ -1,117 +1,203 @@
|
||||
#!/bin/bash
|
||||
test_suite=${1:-}
|
||||
test_number=${2:-}
|
||||
|
||||
PROG=${0##*/}
|
||||
build_dir="build-ci-debug"
|
||||
|
||||
if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
|
||||
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
|
||||
echo "Debug specific ctest program."
|
||||
echo
|
||||
echo "Options:"
|
||||
echo " -h, --help Display this help and exit"
|
||||
echo
|
||||
echo "Arguments:"
|
||||
echo " <test_regex> (Mandatory) Supply one regex to the script to filter tests"
|
||||
echo " (test_number) (Optional) Test number to run a specific test"
|
||||
echo
|
||||
echo "Example:"
|
||||
echo " $PROG test-tokenizer"
|
||||
echo " $PROG test-tokenizer 3"
|
||||
echo
|
||||
exit 0
|
||||
fi
|
||||
# Print Color Commands
|
||||
red=$(tput setaf 1)
|
||||
green=$(tput setaf 2)
|
||||
yellow=$(tput setaf 3)
|
||||
blue=$(tput setaf 4)
|
||||
magenta=$(tput setaf 5)
|
||||
cyan=$(tput setaf 6)
|
||||
normal=$(tput sgr0)
|
||||
|
||||
# Function to select and debug a test
|
||||
function select_test() {
|
||||
test_suite=${1:-test}
|
||||
test_number=${2:-}
|
||||
|
||||
# Sanity Check If Tests Is Detected
|
||||
printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
|
||||
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
|
||||
if [ ${#tests[@]} -eq 0 ]
|
||||
then
|
||||
echo "No tests avaliable... check your compliation process..."
|
||||
echo "Exiting."
|
||||
exit 1
|
||||
fi
|
||||
# Print Help Message
|
||||
####################
|
||||
|
||||
if [ -z $test_number ]
|
||||
then
|
||||
# List out avaliable tests
|
||||
printf "Which test would you like to debug?\n"
|
||||
id=0
|
||||
for s in "${tests[@]}"
|
||||
do
|
||||
echo "Test# ${id}"
|
||||
echo " $s"
|
||||
((id++))
|
||||
done
|
||||
print_full_help() {
|
||||
cat << EOF
|
||||
Usage: $PROG [OPTION]... <test_regex> (test_number)
|
||||
Debug specific ctest program.
|
||||
|
||||
# Prompt user which test they wanted to run
|
||||
printf "\nRun test#? "
|
||||
read test_number
|
||||
else
|
||||
printf "\nUser Already Requested #${test_number}"
|
||||
fi
|
||||
Options:
|
||||
-h, --help display this help and exit
|
||||
-g run in gdb mode
|
||||
|
||||
# Start GDB with the requested test binary and arguments
|
||||
printf "Debugging(GDB) test: ${tests[test_number]}\n"
|
||||
# Change IFS (Internal Field Separator)
|
||||
sIFS=$IFS
|
||||
IFS=$'\n'
|
||||
Arguments:
|
||||
<test_regex> (Mandatory) Supply one regex to the script to filter tests
|
||||
(test_number) (Optional) Test number to run a specific test
|
||||
|
||||
# Get test args
|
||||
gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' ))
|
||||
IFS=$sIFS
|
||||
printf "Debug arguments: ${gdb_args[test_number]}\n\n"
|
||||
|
||||
# Expand paths if needed
|
||||
args=()
|
||||
for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\<//' -e 's/\>"//')
|
||||
do
|
||||
args+=($(echo $x | sed -e 's/.*\/..\//..\//'))
|
||||
done
|
||||
|
||||
# Execute debugger
|
||||
echo "gdb args: ${args[@]}"
|
||||
gdb --args ${args[@]}
|
||||
Example:
|
||||
$PROG test-tokenizer
|
||||
$PROG test-tokenizer 3
|
||||
EOF
|
||||
}
|
||||
|
||||
abort() {
|
||||
echo "Error: $1" >&2
|
||||
cat << EOF >&2
|
||||
Usage: $PROG [OPTION]... <test_regex> (test_number)
|
||||
Debug specific ctest program.
|
||||
Refer to --help for full instructions.
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
|
||||
# Dependency Sanity Check
|
||||
#########################
|
||||
|
||||
check_dependency() {
|
||||
command -v "$1" >/dev/null 2>&1 || {
|
||||
abort "$1 is required but not found. Please install it and try again."
|
||||
}
|
||||
}
|
||||
|
||||
check_dependency ctest
|
||||
check_dependency cmake
|
||||
|
||||
|
||||
# Step 0: Check the args
|
||||
if [ -z "$test_suite" ]
|
||||
then
|
||||
echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
|
||||
echo "Supply one regex to the script to filter tests,"
|
||||
echo "and optionally a test number to run a specific test."
|
||||
echo "Use --help flag for full instructions"
|
||||
exit 1
|
||||
########################
|
||||
|
||||
if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
|
||||
print_full_help >&2
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Parse command-line options
|
||||
gdb_mode=false
|
||||
while getopts "g" opt; do
|
||||
case $opt in
|
||||
g)
|
||||
gdb_mode=true
|
||||
echo "gdb_mode Mode Enabled"
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# Shift the option parameters
|
||||
shift $((OPTIND - 1))
|
||||
|
||||
# Positionial Argument Processing : <test_regex>
|
||||
if [ -z "${1}" ]; then
|
||||
abort "Test regex is required"
|
||||
else
|
||||
test_suite=${1:-}
|
||||
fi
|
||||
|
||||
# Positionial Argument Processing : (test_number)
|
||||
test_number=${2:-}
|
||||
|
||||
|
||||
# Step 1: Reset and Setup folder context
|
||||
########################################
|
||||
|
||||
## Sanity check that we are actually in a git repo
|
||||
repo_root=$(git rev-parse --show-toplevel)
|
||||
if [ ! -d "$repo_root" ]; then
|
||||
echo "Error: Not in a Git repository."
|
||||
exit 1
|
||||
abort "Not in a Git repository."
|
||||
fi
|
||||
|
||||
## Reset folder to root context of git repo
|
||||
pushd "$repo_root" || exit 1
|
||||
## Reset folder to root context of git repo and Create and enter build directory
|
||||
pushd "$repo_root"
|
||||
rm -rf "$build_dir" && mkdir "$build_dir" || abort "Failed to make $build_dir"
|
||||
|
||||
## Create and enter build directory
|
||||
rm -rf "$build_dir" && mkdir "$build_dir" || exit 1
|
||||
|
||||
# Step 2: Setup Build Environment and Compile Test Binaries
|
||||
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1
|
||||
pushd "$build_dir" && make -j || exit 1
|
||||
###########################################################
|
||||
|
||||
# Step 3: Debug the Test
|
||||
select_test "$test_suite" "$test_number"
|
||||
# Note: test-eval-callback requires -DLLAMA_CURL
|
||||
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_CURL=1 || abort "Failed to build enviroment"
|
||||
pushd "$build_dir"
|
||||
make -j || abort "Failed to compile"
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
# Step 4: Return to the directory from which the user ran the command.
|
||||
popd || exit 1
|
||||
popd || exit 1
|
||||
popd || exit 1
|
||||
|
||||
# Step 3: Find all tests available that matches REGEX
|
||||
####################################################
|
||||
|
||||
# Ctest Gather Tests
|
||||
# `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
|
||||
# `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
|
||||
# `-V` : Verbose Mode
|
||||
printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
|
||||
pushd "$build_dir"
|
||||
tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
|
||||
if [ ${#tests[@]} -eq 0 ]; then
|
||||
abort "No tests avaliable... check your compliation process..."
|
||||
fi
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
|
||||
# Step 4: Identify Test Command for Debugging
|
||||
#############################################
|
||||
|
||||
# Select test number
|
||||
if [ -z $test_number ]; then
|
||||
# List out avaliable tests
|
||||
printf "Which test would you like to debug?\n"
|
||||
id=0
|
||||
for s in "${tests[@]}"
|
||||
do
|
||||
echo "Test# ${id}"
|
||||
echo " $s"
|
||||
((id++))
|
||||
done
|
||||
|
||||
# Prompt user which test they wanted to run
|
||||
printf "\nRun test#? "
|
||||
read test_number
|
||||
|
||||
else
|
||||
printf "\nUser Already Requested #${test_number}\n"
|
||||
|
||||
fi
|
||||
|
||||
# Grab all tests commands
|
||||
pushd "$build_dir"
|
||||
sIFS=$IFS # Save Initial IFS (Internal Field Separator)
|
||||
IFS=$'\n' # Change IFS (Internal Field Separator) (So we split ctest output by newline rather than by spaces)
|
||||
test_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' )) # Get test args
|
||||
IFS=$sIFS # Reset IFS (Internal Field Separator)
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
# Grab specific test command
|
||||
single_test_name="${tests[test_number]}"
|
||||
single_test_command="${test_args[test_number]}"
|
||||
|
||||
|
||||
# Step 5: Execute or GDB Debug
|
||||
##############################
|
||||
|
||||
printf "${magenta}Running Test #${test_number}: ${single_test_name}${normal}\n"
|
||||
printf "${cyan}single_test_command: ${single_test_command}${normal}\n"
|
||||
|
||||
if [ "$gdb_mode" = "true" ]; then
|
||||
# Execute debugger
|
||||
pushd "$repo_root" || exit 1
|
||||
eval "gdb --args ${single_test_command}"
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
else
|
||||
# Execute Test
|
||||
pushd "$repo_root" || exit 1
|
||||
eval "${single_test_command}"
|
||||
exit_code=$?
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
# Print Result
|
||||
printf "${blue}Ran Test #${test_number}: ${single_test_name}${normal}\n"
|
||||
printf "${yellow}Command: ${single_test_command}${normal}\n"
|
||||
if [ $exit_code -eq 0 ]; then
|
||||
printf "${green}TEST PASS${normal}\n"
|
||||
else
|
||||
printf "${red}TEST FAIL${normal}\n"
|
||||
fi
|
||||
|
||||
fi
|
||||
|
||||
# Return to the directory from which the user ran the command.
|
||||
popd > /dev/null || exit 1
|
||||
|
||||
Reference in New Issue
Block a user