mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-05 13:53:23 +02:00)
Compare commits
4 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | 35a2ee9143 |  |
|  | ec903c0341 |  |
|  | a1d6df129b |  |
|  | bbe7c56c99 |  |
CMakeLists.txt

```diff
@@ -466,17 +466,17 @@ function(get_flags CCID CCVER)
         (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
         (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
     )
-        set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
+        list(APPEND C_FLAGS -Wdouble-promotion)
     endif()
 elseif (CCID STREQUAL "GNU")
     set(C_FLAGS   -Wdouble-promotion)
     set(CXX_FLAGS -Wno-array-bounds)
 
     if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-        set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
+        list(APPEND CXX_FLAGS -Wno-format-truncation)
     endif()
     if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-        set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
+        list(APPEND CXX_FLAGS -Wextra-semi)
     endif()
 elseif (CCID MATCHES "Intel")
     # enable max optimization level when using Intel compiler
@@ -510,16 +510,18 @@ if (LLAMA_ALL_WARNINGS)
     endif()
 endif()
 
+set(CUDA_CXX_FLAGS "")
+
 if (LLAMA_CUBLAS)
     set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
     if (NOT MSVC)
-        set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
+        list(APPEND CUDA_FLAGS -Wno-pedantic)
     endif()
 
     if (LLAMA_ALL_WARNINGS AND NOT MSVC)
         set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
         if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
+            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
         endif()
 
         execute_process(
@@ -547,13 +549,8 @@ if (LLAMA_CUBLAS)
         message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
 
         get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS)  # pass host compiler flags as a single argument
-        if (NOT CUDA_CXX_FLAGS STREQUAL "")
-            set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
-        endif()
+        list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
     endif()
-
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
 endif()
 
 if (WIN32)
@@ -618,12 +615,7 @@ if (NOT MSVC)
     endif()
 endif()
 
-function(add_compile_option_cpp ARG)
-    # Adds a compile option to C/C++ only, but not for Cuda.
-    # Use, e.g., for CPU-architecture flags.
-    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
-    add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
-endfunction()
+set(ARCH_FLAGS "")
 
 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
     message(STATUS "ARM detected")
@@ -636,19 +628,19 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
     else()
         check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
         if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-            add_compile_options(-mfp16-format=ieee)
+            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
             # Raspberry Pi 1, Zero
-            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
             # Raspberry Pi 2
-            add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
             # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            add_compile_options(-mno-unaligned-access)
+            list(APPEND ARCH_FLAGS -mno-unaligned-access)
         endif()
     endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
@@ -659,7 +651,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
         include(cmake/FindSIMD.cmake)
     endif ()
     if (LLAMA_AVX512)
-        add_compile_option_cpp(/arch:AVX512)
+        list(APPEND ARCH_FLAGS /arch:AVX512)
         # MSVC has no compile-time flags enabling specific
         # AVX512 extensions, neither it defines the
         # macros corresponding to the extensions.
@@ -673,49 +665,61 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
             add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
         endif()
     elseif (LLAMA_AVX2)
-        add_compile_option_cpp(/arch:AVX2)
+        list(APPEND ARCH_FLAGS /arch:AVX2)
     elseif (LLAMA_AVX)
-        add_compile_option_cpp(/arch:AVX)
+        list(APPEND ARCH_FLAGS /arch:AVX)
     endif()
 else()
     if (LLAMA_NATIVE)
-        add_compile_option_cpp(-march=native)
+        list(APPEND ARCH_FLAGS -march=native)
     endif()
     if (LLAMA_F16C)
-        add_compile_option_cpp(-mf16c)
+        list(APPEND ARCH_FLAGS -mf16c)
     endif()
     if (LLAMA_FMA)
-        add_compile_option_cpp(-mfma)
+        list(APPEND ARCH_FLAGS -mfma)
     endif()
     if (LLAMA_AVX)
-        add_compile_option_cpp(-mavx)
+        list(APPEND ARCH_FLAGS -mavx)
     endif()
     if (LLAMA_AVX2)
-        add_compile_option_cpp(-mavx2)
+        list(APPEND ARCH_FLAGS -mavx2)
     endif()
     if (LLAMA_AVX512)
-        add_compile_option_cpp(-mavx512f)
-        add_compile_option_cpp(-mavx512bw)
+        list(APPEND ARCH_FLAGS -mavx512f)
+        list(APPEND ARCH_FLAGS -mavx512bw)
     endif()
     if (LLAMA_AVX512_VBMI)
-        add_compile_option_cpp(-mavx512vbmi)
+        list(APPEND ARCH_FLAGS -mavx512vbmi)
     endif()
     if (LLAMA_AVX512_VNNI)
-        add_compile_option_cpp(-mavx512vnni)
+        list(APPEND ARCH_FLAGS -mavx512vnni)
     endif()
 endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
     if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        add_compile_options(-mcpu=powerpc64le)
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
     else()
-        add_compile_options(-mcpu=native -mtune=native)
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
         #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
     endif()
else()
     message(STATUS "Unknown architecture")
endif()
 
+add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
+add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+
+if (LLAMA_CUBLAS)
+    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
+    list(JOIN   CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
+    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
+        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
+    endif()
+    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+endif()
+
 if (MINGW)
     # Target Windows 8 for PrefetchVirtualMemory
     add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
```
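The pattern in this change is consistent: instead of calling `add_compile_options()` (or mutating a variable with `set(VAR ${VAR} ...)`) at each decision point, flags are accumulated with `list(APPEND ...)` into `C_FLAGS`, `CXX_FLAGS`, `ARCH_FLAGS`, and `CUDA_CXX_FLAGS`, and applied once at the end through `$<COMPILE_LANGUAGE:...>` generator expressions. Keeping the architecture flags in a list is what lets the final `LLAMA_CUBLAS` block join them with `list(JOIN ...)` and forward the very same flags to the CUDA host compiler as a single `-Xcompiler` argument.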
common/sampling.cpp

```diff
@@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
         // will be empty (default) if there are parse errors
         if (result->parsed_grammar.rules.empty()) {
             fprintf(stderr, "%s: failed to parse grammar\n", __func__);
+            delete result;
             return nullptr;
         }
```
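The one added line fixes an early-return path that previously leaked the freshly allocated sampling context when grammar parsing failed. A minimal sketch of the same failure-path pattern expressed with RAII instead of a manual `delete` — the `context`/`context_init` names here are hypothetical stand-ins, not the library's API:

```cpp
#include <cstdio>
#include <memory>

struct context {
    bool ok = false; // stands in for !parsed_grammar.rules.empty()
};

// Returns nullptr on parse failure without leaking the allocation.
static context * context_init(bool parse_ok) {
    auto result = std::make_unique<context>();
    result->ok = parse_ok;
    if (!result->ok) {
        std::fprintf(stderr, "%s: failed to parse grammar\n", __func__);
        return nullptr;          // unique_ptr frees the object here
    }
    return result.release();     // caller owns the raw pointer, as in the C-style API
}

int main() {
    context * ctx = context_init(false); // exercise the failure path
    return ctx == nullptr ? 0 : 1;
}
```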
examples/infill/infill.cpp

```diff
@@ -241,7 +241,7 @@ int main(int argc, char ** argv) {
     LOG("add_bos: %d\n", add_bos);
 
     bool suff_rm_leading_spc = params.escape;
-    if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
+    if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
         params.input_suffix.erase(0, 1);
         suff_rm_leading_spc = false;
     }
```
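`find_first_of(' ')` with a `char` argument behaves identically to `find_first_of(" ")` here, since the searched-for set contains a single character, but it avoids constructing a temporary `std::string` for the argument. The same single-character substitution appears in the server change below (`prompt.find(']', pos)`).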
examples/llava/clip.cpp

```diff
@@ -1277,7 +1277,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         ".*weight",
     };
 
     std::vector<uint8_t> read_data(512);
     std::vector<uint8_t> work(512);
     std::vector<float> conv_buf(512);
-    std::vector<int64_t> hist_all(1 << 4, 0);
```
examples/server/README.md

```diff
@@ -30,7 +30,8 @@ Command line options:
 - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
 - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
+- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
+- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
 
 ## Build
 
 server is build alongside everything else from the root of the project
```
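Both flags feed the Self-Extend machinery wired through `server.cpp` below: `--grp-attn-n` is the compression factor applied to positions inside each group-attention window, and `--grp-attn-w` is the width of that window. Judging by the (commented-out) assertion `n_ctx >= n_ctx_train * ga_n` later in this change, the intent is that a factor of `n` lets a slot cover roughly `n` times the model's trained context.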
examples/server/server.cpp

```diff
@@ -184,6 +184,12 @@ struct llama_client_slot
     struct llama_sampling_params sparams;
     llama_sampling_context *ctx_sampling = nullptr;
 
+    int32_t ga_i = 0;   // group-attention state
+    int32_t ga_n = 1;   // group-attention factor
+    int32_t ga_w = 512; // group-attention width
+
+    int32_t n_past_se = 0; // self-extend
+
     // multimodal
     std::vector<slot_image> images;
```
```diff
@@ -212,7 +218,8 @@ struct llama_client_slot
         sent_count = 0;
         sent_token_probs_index = 0;
         infill = false;
-
+        ga_i = 0;
+        n_past_se = 0;
         generated_token_probs.clear();
 
         for (slot_image & img : images)
```
```diff
@@ -399,9 +406,26 @@ struct llama_server_context
 
             slot.id = i;
             slot.n_ctx = n_ctx_slot;
-            slot.reset();
 
             LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
+
+            const int ga_n = params.grp_attn_n;
+            const int ga_w = params.grp_attn_w;
+
+            if (ga_n != 1) {
+                GGML_ASSERT(ga_n > 0         && "ga_n must be positive");                // NOLINT
+                GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n");      // NOLINT
+                //GGML_ASSERT(n_ctx_train % ga_w == 0     && "n_ctx_train must be a multiple of ga_w");    // NOLINT
+                //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
+                LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
+            }
+
+            slot.ga_i = 0;
+            slot.ga_n = ga_n;
+            slot.ga_w = ga_w;
+
+            slot.reset();
+
             slots.push_back(slot);
         }
```
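The two live assertions encode the invariants the position arithmetic below depends on: `ga_n` must be positive, and `ga_w` must divide evenly into `ga_n` groups, since each filled window of `ga_w` positions is compressed down to `ga_w / ga_n` positions.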
```diff
@@ -657,7 +681,7 @@ struct llama_server_context
                 while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
                     size_t end_prefix = pos;
                     pos += pattern.length();
-                    size_t end_pos = prompt.find("]", pos);
+                    size_t end_pos = prompt.find(']', pos);
                     if (end_pos != std::string::npos)
                     {
                         std::string image_id = prompt.substr(pos, end_pos - pos);
```
```diff
@@ -1349,32 +1373,35 @@ struct llama_server_context
 
         for (llama_client_slot &slot : slots)
         {
-            if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
+            if (slot.ga_n == 1)
             {
-                // Shift context
-                const int n_left    = slot.n_past - slot.params.n_keep - 1;
-                const int n_discard = n_left / 2;
-
-                LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
-                llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
-
-                for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
+                if (slot.is_processing() && slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                 {
-                    slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+                    // Shift context
+                    const int n_left    = slot.n_past - slot.params.n_keep - 1;
+                    const int n_discard = n_left / 2;
+
+                    LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
+                    llama_kv_cache_seq_rm   (ctx, slot.id, slot.params.n_keep + 1            , slot.params.n_keep + n_discard + 1);
+                    llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);
+
+                    for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++)
+                    {
+                        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
+                    }
+
+                    slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+
+                    slot.n_past -= n_discard;
+
+                    slot.truncated = true;
+
+                    LOG_VERBOSE("context shift", {
+                        { "n_ctx",  n_ctx },
+                        { "n_keep", params.n_keep },
+                        { "n_left", n_left },
+                    });
                 }
-
-                slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
-
-                slot.n_past -= n_discard;
-
-                slot.truncated = true;
-
-                LOG_VERBOSE("context shift", {
-                    {"n_ctx",  n_ctx},
-                    {"n_keep", params.n_keep},
-                    {"n_left", n_left},
-                });
             }
         }
```
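For slots without self-extend (`ga_n == 1`) the shifting itself is unchanged; only the guard moved. A self-contained toy version of the token bookkeeping — the real code additionally applies the same removal and shift to the KV cache via `llama_kv_cache_seq_rm`/`llama_kv_cache_seq_shift`:

```cpp
#include <cstdio>
#include <vector>

// Keep the first n_keep+1 tokens, drop half of the remainder, slide the rest down.
int main() {
    std::vector<int> cache_tokens(16);
    for (int i = 0; i < 16; ++i) cache_tokens[i] = i; // token ids 0..15

    int n_past = 16;
    const int n_keep    = 3;
    const int n_left    = n_past - n_keep - 1; // 12 tokens are movable
    const int n_discard = n_left / 2;          // drop 6 of them

    for (size_t i = n_keep + 1 + n_discard; i < cache_tokens.size(); i++) {
        cache_tokens[i - n_discard] = cache_tokens[i];
    }
    cache_tokens.resize(cache_tokens.size() - n_discard);
    n_past -= n_discard;

    // prints: 0 1 2 3 10 11 12 13 14 15 (n_past = 10)
    for (int t : cache_tokens) std::printf("%d ", t);
    std::printf("(n_past = %d)\n", n_past);
    return 0;
}
```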
```diff
@@ -1401,7 +1428,8 @@ struct llama_server_context
 
             slot.i_batch = batch.n_tokens;
 
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, { slot.id }, true);
+            const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
+            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
 
             slot.n_past += 1;
         }
```
```diff
@@ -1499,6 +1527,8 @@ struct llama_server_context
                         llama_sampling_reset(slot.ctx_sampling);
 
                         slot.n_past = 0;
+                        slot.n_past_se = 0;
+                        slot.ga_i = 0;
                         slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
                     }
                     else
```
```diff
@@ -1512,6 +1542,25 @@ struct llama_server_context
                         slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
                         slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
 
+                        if (slot.ga_n != 1)
+                        {
+                            int ga_i = 0;
+                            int32_t ga_n = slot.ga_n;
+                            int32_t ga_w = slot.ga_w;
+                            int32_t slot_npast = 0;
+                            for (int k = 0; k < slot.n_past; ++k)
+                            {
+                                while (slot_npast >= ga_i + ga_w) {
+                                    const int bd = (ga_w/ga_n)*(ga_n - 1);
+                                    slot_npast -= bd;
+                                    ga_i += ga_w/ga_n;
+                                }
+                                slot_npast++;
+                            }
+                            slot.n_past_se = slot_npast;
+                            slot.ga_i = ga_i;
+                        }
+
                         LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
                     }
```
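When a cached prefix is reused, the slot has to recover the self-extend state it would have reached had it streamed those tokens one at a time; the loop above replays the position bookkeeping without touching the KV cache. A standalone version with small numbers:

```cpp
#include <cstdio>

// Given how many tokens are already in the cache (n_past), recover the
// compressed position (slot_npast) and group-attention state (ga_i).
int main() {
    const int ga_n   = 2; // group-attention factor
    const int ga_w   = 4; // group-attention width
    const int n_past = 8; // tokens restored from the prompt cache

    int ga_i = 0;
    int slot_npast = 0;
    for (int k = 0; k < n_past; ++k) {
        while (slot_npast >= ga_i + ga_w) {
            const int bd = (ga_w/ga_n)*(ga_n - 1); // positions saved per filled window
            slot_npast -= bd;
            ga_i += ga_w/ga_n;
        }
        slot_npast++;
    }
    // 8 real tokens occupy only 6 compressed positions here
    std::printf("slot_npast = %d, ga_i = %d\n", slot_npast, ga_i);
    return 0;
}
```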
```diff
@@ -1526,6 +1575,10 @@ struct llama_server_context
                         // we have to evaluate at least 1 token to generate logits.
                         LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
                         slot.n_past--;
+                        if (slot.ga_i > 0)
+                        {
+                            slot.n_past_se--;
+                        }
                     }
 
                     LOG_VERBOSE("prompt ingested", {
```
```diff
@@ -1538,9 +1591,22 @@ struct llama_server_context
 
                     // process the prefix of first image
                     std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
+
+                    int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
+                    int ga_i = slot.ga_i;
+                    int32_t ga_n = slot.ga_n;
+                    int32_t ga_w = slot.ga_w;
                     for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
                     {
-                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
+                        if (slot.ga_n != 1)
+                        {
+                            while (slot_npast >= ga_i + ga_w) {
+                                const int bd = (ga_w/ga_n)*(ga_n - 1);
+                                slot_npast -= bd;
+                                ga_i += ga_w/ga_n;
+                            }
+                        }
+                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                        slot_npast += 1;
                     }
 
                     if (has_images && !ingest_images(slot, n_batch))
```
```diff
@@ -1570,6 +1636,36 @@ struct llama_server_context
         for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
         {
             const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+
+            for (auto & slot : slots)
+            {
+                if (slot.ga_n != 1)
+                {
+                    // context extension via Self-Extend
+                    while (slot.n_past_se >= slot.ga_i + slot.ga_w)
+                    {
+                        const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
+                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
+                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
+
+                        LOG_TEE("\n");
+                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+
+                        llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
+                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
+                        llama_kv_cache_seq_shift(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd);
+
+                        slot.n_past_se -= bd;
+
+                        slot.ga_i += slot.ga_w / slot.ga_n;
+
+                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                    }
+                    slot.n_past_se += n_tokens;
+                }
+            }
+
             llama_batch batch_view =
             {
                 n_tokens,
```
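The three `llama_kv_cache_seq_*` calls compact each filled window in place: shift the already-compressed prefix, divide the positions of the freshly filled window by `ga_n`, then shift the tail down. A worked pass through the arithmetic, as a runnable sketch with assumed values `ga_n = 4`, `ga_w = 512`:

```cpp
#include <cstdio>

// One iteration of the while loop above, with the window just filled
// (ga_i = 0, n_past_se = 512). Prints the quantities the LOG_TEE calls show.
int main() {
    const int ga_n = 4, ga_w = 512;
    int ga_i = 0, n_past_se = 512;

    const int ib = (ga_n * ga_i) / ga_w;           // 0: whole windows already compressed
    const int bd = (ga_w / ga_n) * (ga_n - 1);     // 384: positions reclaimed per window
    const int dd = (ga_w / ga_n) - ib * bd - ga_w; // -384: shift applied to the tail

    std::printf("shift [%d, %d] by %d\n", ga_i, n_past_se, ib * bd); // no-op here
    std::printf("div   [%d, %d] by %d\n", ga_i + ib * bd, ga_i + ib * bd + ga_w, ga_n);
    std::printf("shift tail by %d\n", dd);

    n_past_se -= bd;          // 512 -> 128
    ga_i      += ga_w / ga_n; //   0 -> 128

    // ga_n = 4 consecutive tokens now share each position in [0, 128),
    // which is what lets the model attend past its trained context length.
    std::printf("n_past_se = %d, ga_i = %d\n", n_past_se, ga_i);
    return 0;
}
```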
```diff
@@ -1583,6 +1679,7 @@ struct llama_server_context
             };
 
             const int ret = llama_decode(ctx, batch_view);
+
             if (ret != 0)
             {
                 if (n_batch == 1 || ret < 0)
```
```diff
@@ -1728,6 +1825,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
     printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("  -gan N, --grp-attn-n N  Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
+    printf("  -gaw N, --grp-attn-w N  Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
     printf("\n");
 }
```
```diff
@@ -1913,6 +2012,25 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         params.n_threads = std::stoi(argv[i]);
     }
+    else if (arg == "--grp-attn-n" || arg == "-gan")
+    {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+
+        params.grp_attn_n = std::stoi(argv[i]);
+    }
+    else if (arg == "--grp-attn-w" || arg == "-gaw")
+    {
+        if (++i >= argc)
+        {
+            invalid_param = true;
+            break;
+        }
+
+        params.grp_attn_w = std::stoi(argv[i]);
+    }
     else if (arg == "--threads-batch" || arg == "-tb")
     {
         if (++i >= argc)
```
ggml-opencl.cpp

```diff
@@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
         dst[row] = tmp[0];
     }
 }
-
 
 );
 
@@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
         dst[row] = tmp[0];
     }
 }
+
 
 );
```
```diff
@@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
 }
 );
 
+std::string add_template = MULTILINE_QUOTE(
+__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
+}
+);
+
 #define CL_CHECK(err) \
     do { \
         cl_int err_ = (err); \
```
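The kernel's only subtlety is the `i%ky` index: `y` wraps every `ky` elements, so a tensor with fewer rows can be added to every row of `x` — the repeat-rows broadcast that `ggml.c` asserts below. A CPU reference of the same semantics:

```cpp
#include <cstdio>
#include <vector>

// CPU reference for the add_f32 kernel above: y is indexed modulo ky, so a
// single row of y is broadcast across every row of x.
static void add_f32_ref(const std::vector<float> & x, const std::vector<float> & y,
                        std::vector<float> & dst, int ky) {
    for (size_t i = 0; i < x.size(); ++i) {
        dst[i] = x[i] + y[i % ky];
    }
}

int main() {
    // two rows of four elements, plus one bias row of four elements
    std::vector<float> x = {0, 1, 2, 3, 10, 11, 12, 13};
    std::vector<float> y = {100, 200, 300, 400};
    std::vector<float> dst(x.size());

    add_f32_ref(x, y, dst, /*ky =*/ 4);

    for (float v : dst) std::printf("%g ", v); // 100 201 302 403 110 211 312 413
    std::printf("\n");
    return 0;
}
```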
```diff
@@ -878,6 +890,7 @@ static std::string generate_kernels() {
         }
         src << mul_kernel << '\n';
     }
+    src << add_template << '\n';
 
     return src.str();
 }
@@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
 static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
 static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
 static cl_kernel mul_f32_cl;
+static cl_kernel add_f32_cl;
 static bool fp16_support;
 
 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
```
```diff
@@ -1100,9 +1114,10 @@ void ggml_cl_init(void) {
     char *ext_buffer = (char *)alloca(ext_str_size + 1);
     clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
     ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+    // Disabled due to faulty outputs
     // Check if ext_buffer contains cl_khr_fp16
-    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
-    fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
+    fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
 
     cl_context_properties properties[] = {
         (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
@@ -1150,6 +1165,8 @@ void ggml_cl_init(void) {
 
     // mul kernel
     CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
+
+    CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
 }
 
 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
```
```diff
@@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
     ggml_cl_mul_f32(src0, src1, dst);
 }
 
+static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+
+            const int64_t i13 = i03%ne13;
+            const int64_t i12 = i02%ne12;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
+
+            cl_int x_offset = 0;
+            cl_int y_offset = i1*ne10;
+            cl_int d_offset = 0;
+
+            size_t global = ne00 * ne01;
+            cl_int ky = ne10 * ne11;
+
+            CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_add_f32(src0, src1, dst);
+}
+
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
```
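The host side mirrors `ggml_cl_mul_f32`: `src0` is streamed to the device one `ne00 x ne01` slice per (`i02`, `i03`) pair and the result is read straight back, while `src1` is required to be resident on the GPU already (`src1->extra` holds its `cl_mem`), so only its starting offset `i1*ne10` changes per slice. With `ky = ne10*ne11`, the kernel's `i%ky` index wraps within one `ne10 x ne11` plane of `src1`, which is what implements the row broadcast.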
ggml-opencl.h

```diff
@@ -10,6 +10,7 @@ extern "C" {
 GGML_API void ggml_cl_init(void);
 
 GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
 GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
```
ggml.c
```diff
@@ -7207,6 +7207,17 @@ static void ggml_compute_forward_add_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
+#ifdef GGML_USE_CLBLAST
+    if (src1->backend == GGML_BACKEND_GPU) {
+        // TODO: OpenCL kernel support full broadcast
+        GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
+        if (ith == 0) {
+            ggml_cl_add(src0, src1, dst);
+        }
+        return;
+    }
+#endif
+
     const int nr = ggml_nrows(src0);
 
     GGML_TENSOR_BINARY_OP_LOCALS
```
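Two details worth noting in this dispatch: the whole thread pool enters the function, so only thread 0 (`ith == 0`) issues the GPU add while every thread takes the early `return`; and the `ggml_can_repeat_rows` assertion restricts `src1` to shapes the kernel's `i%ky` row broadcast can express, per the TODO about full broadcast support.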
```diff
@@ -243,7 +243,6 @@ int main(int argc, char** argv) {
     if (useQ4_1) q41.resize(n4);
     else q40.resize(n4);
     std::vector<block_q8_0> q8(n8);
-    std::vector<int64_t> H(16, 0);
     double sumt = 0, sumt2 = 0, maxt = 0;
     double sumqt = 0, sumqt2 = 0, maxqt = 0;
     double sum = 0, sumq = 0, exactSum = 0;
```
tests/test-backend-ops.cpp

```diff
@@ -102,7 +102,6 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
             } else if (t->type == GGML_TYPE_I8) {
                 tv.push_back((float)*(int8_t *) &buf[i]);
             } else if (quantized) {
-                std::vector<float> vq(ggml_blck_size(t->type));
                 tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
                 tv.insert(tv.end(), vq.begin(), vq.end());
             } else {
```
tests/test-llama-grammar.cpp

```diff
@@ -190,7 +190,6 @@ int main()
         index++;
     }
 
-    std::vector<std::vector<const llama_grammar_element *>> next_stacks;
     std::vector<llama_grammar_candidate> next_candidates;
     next_candidates.resize(24);
```