mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
1 Commits
b6628
...
gg/fix-sve
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a8b0089a5b |
@@ -1616,36 +1616,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
);
|
||||
});
|
||||
|
||||
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
||||
"\"<|start|>assistant\"? \" to=functions.\" ( " +
|
||||
string_join(tool_rules_recipient_in_role, " | ") + " )"
|
||||
);
|
||||
|
||||
auto recipient_in_channel = builder.add_rule("recipient_in_channel",
|
||||
channel + " \" to=functions.\" ( " +
|
||||
string_join(tool_rules_recipient_in_channel, " | ") + " )"
|
||||
);
|
||||
|
||||
if (data.grammar_lazy) {
|
||||
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
||||
"\"<|start|>assistant\"? \" to=functions.\" ( " +
|
||||
string_join(tool_rules_recipient_in_role, " | ") + " )"
|
||||
);
|
||||
|
||||
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
|
||||
} else {
|
||||
auto not_end = builder.add_rule("not-end",
|
||||
"[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
|
||||
auto analysis = builder.add_rule("analysis",
|
||||
"\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
|
||||
auto commentary = builder.add_rule("commentary",
|
||||
"\"<|channel|>commentary<|message|>\" ( " + not_end + " )* \"<|end|>\"");
|
||||
|
||||
auto recipient_in_role = builder.add_rule("recipient_in_role",
|
||||
"\" to=functions.\" ( " + string_join(tool_rules_recipient_in_role, " | ") + " )"
|
||||
);
|
||||
|
||||
builder.add_rule("root",
|
||||
"( " + analysis + " \"<|start|>assistant\" )? " +
|
||||
"( " + commentary + " \"<|start|>assistant\" )? " +
|
||||
"( " + recipient_in_role + " | " + recipient_in_channel + " )"
|
||||
);
|
||||
}
|
||||
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
|
||||
|
||||
// Trigger on tool calls that appear in the commentary channel
|
||||
data.grammar_triggers.push_back({
|
||||
|
||||
@@ -135,10 +135,6 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
return p;
|
||||
}
|
||||
|
||||
static const char * dl_error() {
|
||||
return "";
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
using dl_handle = void;
|
||||
@@ -159,11 +155,6 @@ static void * dl_get_sym(dl_handle * handle, const char * name) {
|
||||
return dlsym(handle, name);
|
||||
}
|
||||
|
||||
static const char * dl_error() {
|
||||
const char *rslt = dlerror();
|
||||
return rslt != nullptr ? rslt : "";
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
|
||||
@@ -249,7 +240,7 @@ struct ggml_backend_registry {
|
||||
dl_handle_ptr handle { dl_load_library(path) };
|
||||
if (!handle) {
|
||||
if (!silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(path).c_str(), dl_error());
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(path).c_str());
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
@@ -539,7 +530,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
|
||||
dl_handle_ptr handle { dl_load_library(entry) };
|
||||
if (!handle && !silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path_str(entry.path()).c_str(), dl_error());
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
|
||||
}
|
||||
if (handle) {
|
||||
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
|
||||
|
||||
@@ -8646,41 +8646,7 @@ static void ggml_compute_forward_ssm_scan_f32(
|
||||
const int ii = i1 + h*nr;
|
||||
const float x_dt = x[ii] * dt_soft_plus;
|
||||
float sumf = 0.0f;
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int ggml_f32_epr = svcntw();
|
||||
const int ggml_f32_step = 1 * ggml_f32_epr;
|
||||
|
||||
const int np = (nc & ~(ggml_f32_step - 1));
|
||||
|
||||
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
|
||||
|
||||
GGML_F32_VEC adA = GGML_F32_VEC_SET1(dA);
|
||||
GGML_F32_VEC axdt = GGML_F32_VEC_SET1(x_dt);
|
||||
|
||||
for (int i = 0; i < np; i += ggml_f32_step) {
|
||||
// TODO: maybe unroll more?
|
||||
for (int j = 0; j < 1; j++) {
|
||||
GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc);
|
||||
GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + g*nc);
|
||||
GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + g*nc);
|
||||
|
||||
t0 = GGML_F32_VEC_MUL(t0, adA);
|
||||
t1 = GGML_F32_VEC_MUL(t1, axdt);
|
||||
|
||||
t0 = GGML_F32_VEC_ADD(t0, t1);
|
||||
|
||||
sum = GGML_F32_VEC_FMA(sum, t0, t2);
|
||||
|
||||
GGML_F32_VEC_STORE(s + i + j*ggml_f32_epr + ii*nc, t0);
|
||||
}
|
||||
}
|
||||
|
||||
sumf = GGML_F32xt_REDUCE_ONE(sum);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
// todo: RVV implementation
|
||||
const int np = 0;
|
||||
#else
|
||||
#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
|
||||
const int np = (nc & ~(GGML_F32_STEP - 1));
|
||||
|
||||
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
||||
@@ -8711,7 +8677,6 @@ static void ggml_compute_forward_ssm_scan_f32(
|
||||
|
||||
// reduce sum0..sum3 to sum0
|
||||
GGML_F32_VEC_REDUCE(sumf, sum);
|
||||
#endif
|
||||
#else
|
||||
const int np = 0;
|
||||
#endif
|
||||
@@ -8741,30 +8706,6 @@ static void ggml_compute_forward_ssm_scan_f32(
|
||||
for (int i1 = 0; i1 < nr; ++i1) {
|
||||
const int ii = i1 + h*nr;
|
||||
const float x_dt = x[ii] * dt_soft_plus;
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt);
|
||||
svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus);
|
||||
svfloat32_t r1_vector = GGML_F32_VEC_ZERO;
|
||||
|
||||
// d_state
|
||||
// TODO: what happens when (d_state % svcntw()) != 0?
|
||||
for (int64_t k = 0; k < nc; k += svcntw()) {
|
||||
svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]);
|
||||
svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + g*nc]);
|
||||
svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + g*nc]);
|
||||
svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]);
|
||||
|
||||
svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA);
|
||||
t1 = exp_ps_sve(svptrue_b32(), t1);
|
||||
svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB);
|
||||
|
||||
vs0 = GGML_F32_VEC_FMA(t2, vs0, t1);
|
||||
r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector);
|
||||
|
||||
GGML_F32_VEC_STORE(&s[ii*nc + k], vs0);
|
||||
}
|
||||
y[ii] = GGML_F32xt_REDUCE_ONE(r1_vector);
|
||||
#else
|
||||
float sumf = 0.0f;
|
||||
// NOTE: can't really use GGML_SIMD here because d_state is usually 16
|
||||
// and also because expf is used within the loop.
|
||||
@@ -8779,7 +8720,6 @@ static void ggml_compute_forward_ssm_scan_f32(
|
||||
s[i] = state;
|
||||
}
|
||||
y[ii] = sumf;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -9231,14 +9171,6 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
|
||||
#define GGML_F32X_MUL GGML_F32x16_MUL
|
||||
#define GGML_F32X_FMA GGML_F32x16_FMA
|
||||
#define WKV_VECTOR_SIZE 16
|
||||
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
||||
#define GGML_F32X GGML_F32xt
|
||||
#define GGML_F32X_SET1 GGML_F32xt_SET1
|
||||
#define GGML_F32X_LOAD GGML_F32xt_LOAD
|
||||
#define GGML_F32X_STORE GGML_F32xt_STORE
|
||||
#define GGML_F32X_MUL GGML_F32xt_MUL
|
||||
#define GGML_F32X_FMA GGML_F32xt_FMA
|
||||
#define WKV_VECTOR_SIZE 8
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
#define GGML_F32X GGML_F32x4
|
||||
#define GGML_F32X_SET1 GGML_F32x4_SET1
|
||||
@@ -9251,11 +9183,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32(
|
||||
|
||||
#ifdef WKV_VECTOR_SIZE
|
||||
int wkv_vector_size;
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
wkv_vector_size = svcntw();
|
||||
#else
|
||||
wkv_vector_size = WKV_VECTOR_SIZE;
|
||||
#endif
|
||||
wkv_vector_size = WKV_VECTOR_SIZE;
|
||||
const int64_t vec_count = head_size / wkv_vector_size;
|
||||
|
||||
for (int64_t t = 0; t < T; t++) {
|
||||
@@ -9447,14 +9375,6 @@ static void ggml_compute_forward_gla_f32(
|
||||
#define GGML_F32X_MUL GGML_F32x16_MUL
|
||||
#define GGML_F32X_FMA GGML_F32x16_FMA
|
||||
#define GLA_VECTOR_SIZE 16
|
||||
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
|
||||
#define GGML_F32X GGML_F32xt
|
||||
#define GGML_F32X_SET1 GGML_F32xt_SET1
|
||||
#define GGML_F32X_LOAD GGML_F32xt_LOAD
|
||||
#define GGML_F32X_STORE GGML_F32xt_STORE
|
||||
#define GGML_F32X_MUL GGML_F32xt_MUL
|
||||
#define GGML_F32X_FMA GGML_F32xt_FMA
|
||||
#define GLA_VECTOR_SIZE 8
|
||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
||||
#define GGML_F32X GGML_F32x4
|
||||
#define GGML_F32X_SET1 GGML_F32x4_SET1
|
||||
@@ -9467,11 +9387,7 @@ static void ggml_compute_forward_gla_f32(
|
||||
|
||||
#ifdef GLA_VECTOR_SIZE
|
||||
int gla_vector_size;
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
gla_vector_size = svcntw();
|
||||
#else
|
||||
gla_vector_size = GLA_VECTOR_SIZE;
|
||||
#endif
|
||||
gla_vector_size = GLA_VECTOR_SIZE;
|
||||
const int64_t vec_count = head_size / gla_vector_size;
|
||||
|
||||
for (int64_t t = 0; t < T; t++) {
|
||||
@@ -9631,127 +9547,84 @@ static void ggml_compute_forward_rwkv_wkv7_f32(
|
||||
GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS
|
||||
int64_t h_stride_2d = head_size * head_size;
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE) || defined(__riscv_v_intrinsic)
|
||||
// scalar Route to scalar implementation //TODO: Write SVE code and RVV code
|
||||
for (int64_t t = 0; t < T; t++) {
|
||||
int64_t t_offset = t * t_stride;
|
||||
int64_t state_offset = head_size * C * (t / (T / n_seqs));
|
||||
float * state_cur = state + state_offset;
|
||||
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
|
||||
#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
|
||||
for (int64_t t = 0; t < T; t++) {
|
||||
int64_t t_offset = t * t_stride;
|
||||
int64_t state_offset = head_size * C * (t / (T / n_seqs));
|
||||
float * state_cur = state + state_offset;
|
||||
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
|
||||
|
||||
for (int64_t h = h_start; h < h_end; h++) {
|
||||
int64_t h_offset = h * h_stride;
|
||||
int64_t t_h_offset = t_offset + h_offset;
|
||||
int64_t h_2d_offset = h * h_stride_2d;
|
||||
for (int64_t h = h_start; h < h_end; h++) {
|
||||
int64_t h_offset = h * h_stride;
|
||||
int64_t t_h_offset = t_offset + h_offset;
|
||||
int64_t h_2d_offset = h * h_stride_2d;
|
||||
|
||||
for (int64_t i = 0; i < head_size; i++) {
|
||||
int64_t t_h_i_offset = t_h_offset + i;
|
||||
int64_t h_2d_i_offset = h_2d_offset + i * h_stride;
|
||||
for (int64_t ii = 0; ii < head_size; ii++) {
|
||||
int64_t t_h_i_offset = t_h_offset + ii;
|
||||
int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
|
||||
|
||||
float v_val = v[t_h_i_offset];
|
||||
GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
|
||||
|
||||
float sa = 0, result = 0;
|
||||
for (int64_t j = 0; j < head_size; j++) {
|
||||
sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j];
|
||||
}
|
||||
|
||||
for (int64_t j = 0; j < head_size; j++) {
|
||||
int64_t t_h_j_offset = t_h_offset + j;
|
||||
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
|
||||
|
||||
float r_val = r[t_h_j_offset];
|
||||
float w_val = w[t_h_j_offset];
|
||||
float k_val = k[t_h_j_offset];
|
||||
float b_val = b[t_h_j_offset];
|
||||
float kv_val = v_val * k_val;
|
||||
float prev_state_val = state_prev[h_2d_i_j_offset];
|
||||
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
|
||||
result += state_cur[h_2d_i_j_offset] * r_val;
|
||||
}
|
||||
dst_data[t_h_i_offset] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
for (int64_t t = 0; t < T; t++) {
|
||||
int64_t t_offset = t * t_stride;
|
||||
int64_t state_offset = head_size * C * (t / (T / n_seqs));
|
||||
float * state_cur = state + state_offset;
|
||||
float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset;
|
||||
|
||||
for (int64_t h = h_start; h < h_end; h++) {
|
||||
int64_t h_offset = h * h_stride;
|
||||
int64_t t_h_offset = t_offset + h_offset;
|
||||
int64_t h_2d_offset = h * h_stride_2d;
|
||||
|
||||
for (int64_t ii = 0; ii < head_size; ii++) {
|
||||
int64_t t_h_i_offset = t_h_offset + ii;
|
||||
int64_t h_2d_i_offset = h_2d_offset + ii * h_stride;
|
||||
|
||||
GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]);
|
||||
|
||||
float sa = 0;
|
||||
{
|
||||
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
||||
GGML_F32_VEC ax[GGML_F32_ARR];
|
||||
GGML_F32_VEC ay[GGML_F32_ARR];
|
||||
for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
|
||||
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
|
||||
ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
|
||||
ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
|
||||
sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
|
||||
}
|
||||
}
|
||||
GGML_F32_VEC_REDUCE(sa, sum);
|
||||
}
|
||||
|
||||
GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
|
||||
|
||||
int64_t j = 0;
|
||||
GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
||||
for (; j < head_size; j += GGML_F32_STEP) {
|
||||
float sa = 0;
|
||||
{
|
||||
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
||||
GGML_F32_VEC ax[GGML_F32_ARR];
|
||||
GGML_F32_VEC ay[GGML_F32_ARR];
|
||||
for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) {
|
||||
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
|
||||
int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
|
||||
int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
|
||||
|
||||
GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
|
||||
GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
|
||||
GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
|
||||
GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
|
||||
|
||||
k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
|
||||
|
||||
GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
|
||||
// kv + s * decay + sa * b
|
||||
state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
|
||||
state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
|
||||
GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
|
||||
|
||||
result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
|
||||
ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]);
|
||||
ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]);
|
||||
sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]);
|
||||
}
|
||||
}
|
||||
GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
|
||||
GGML_F32_VEC_REDUCE(sa, sum);
|
||||
}
|
||||
|
||||
// There shouldn't be left-overs though.
|
||||
for (; j < head_size; j++) {
|
||||
int64_t t_h_j_offset = t_h_offset + j;
|
||||
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
|
||||
GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa);
|
||||
|
||||
float r_val = r[t_h_j_offset];
|
||||
float w_val = w[t_h_j_offset];
|
||||
float k_val = k[t_h_j_offset];
|
||||
float b_val = b[t_h_j_offset];
|
||||
float kv_val = v[t_h_i_offset] * k_val;
|
||||
int64_t j = 0;
|
||||
GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
|
||||
for (; j < head_size; j += GGML_F32_STEP) {
|
||||
for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) {
|
||||
int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR;
|
||||
int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR;
|
||||
|
||||
float prev_state_val = state_prev[h_2d_i_j_offset];
|
||||
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
|
||||
dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
|
||||
GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]);
|
||||
GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]);
|
||||
GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]);
|
||||
GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]);
|
||||
|
||||
k_vec = GGML_F32_VEC_MUL(v_vec, k_vec);
|
||||
|
||||
GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]);
|
||||
// kv + s * decay + sa * b
|
||||
state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec);
|
||||
state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec);
|
||||
GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec);
|
||||
|
||||
result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec);
|
||||
}
|
||||
}
|
||||
GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec);
|
||||
|
||||
// There shouldn't be left-overs though.
|
||||
for (; j < head_size; j++) {
|
||||
int64_t t_h_j_offset = t_h_offset + j;
|
||||
int64_t h_2d_i_j_offset = h_2d_i_offset + j;
|
||||
|
||||
float r_val = r[t_h_j_offset];
|
||||
float w_val = w[t_h_j_offset];
|
||||
float k_val = k[t_h_j_offset];
|
||||
float b_val = b[t_h_j_offset];
|
||||
float kv_val = v[t_h_i_offset] * k_val;
|
||||
|
||||
float prev_state_val = state_prev[h_2d_i_j_offset];
|
||||
state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val;
|
||||
dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
for (int64_t t = 0; t < T; t++) {
|
||||
int64_t t_offset = t * t_stride;
|
||||
|
||||
@@ -2,10 +2,6 @@
|
||||
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
@@ -149,164 +145,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
// number of elements to fit in a single register
|
||||
//
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32 SVE
|
||||
#define GGML_F32_EPR 8
|
||||
#define DEFAULT_PG svptrue_b32()
|
||||
|
||||
#define GGML_F32xt svfloat32_t
|
||||
#define GGML_F32xt_ZERO svdup_n_f32(0.0f)
|
||||
#define GGML_F32xt_SET1(x) svdup_n_f32(x)
|
||||
#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
|
||||
#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
|
||||
#define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
|
||||
#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
|
||||
#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
|
||||
#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
|
||||
#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
|
||||
{ \
|
||||
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
|
||||
sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
|
||||
sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
|
||||
sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
|
||||
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
|
||||
sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
|
||||
sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
|
||||
(res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
|
||||
}
|
||||
#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
|
||||
|
||||
#define GGML_F32_VEC GGML_F32xt
|
||||
#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32xt_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32xt_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32xt_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32xt_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32xt_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
|
||||
|
||||
// F16 SVE
|
||||
#define DEFAULT_PG32 svptrue_b32()
|
||||
#define DEFAULT_PG16 svptrue_b16()
|
||||
|
||||
#define GGML_F32Cxt svfloat16_t
|
||||
#define GGML_F32Cxt_ZERO svdup_n_f16(0.0f)
|
||||
#define GGML_F32Cxt_SET1(x) svdup_n_f16(x)
|
||||
#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p))
|
||||
#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec))
|
||||
|
||||
#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a)
|
||||
#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b)
|
||||
#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b)
|
||||
#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED
|
||||
|
||||
#define GGML_F16x_VEC GGML_F32Cxt
|
||||
#define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO
|
||||
#define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1
|
||||
#define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p)
|
||||
#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r)
|
||||
#define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA
|
||||
#define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD
|
||||
#define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL
|
||||
#define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE
|
||||
|
||||
#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a)
|
||||
#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
|
||||
#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \
|
||||
{ \
|
||||
sum1 = svadd_f16_x(pg16, sum1, sum2); \
|
||||
sum3 = svadd_f16_x(pg16, sum3, sum4); \
|
||||
sum1 = svadd_f16_x(pg16, sum1, sum3); \
|
||||
__fp16 sum_f16 = svaddv_f16(pg16, sum1); \
|
||||
(res) = (ggml_float) sum_f16; \
|
||||
}
|
||||
#define GGML_F16xt_REDUCE_MIXED(...) GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__)
|
||||
|
||||
// F16 NEON
|
||||
|
||||
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
#define GGML_F16_STEP 32
|
||||
#define GGML_F16_EPR 8
|
||||
|
||||
#define GGML_F16x8 float16x8_t
|
||||
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
|
||||
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
|
||||
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
|
||||
#define GGML_F16x8_STORE vst1q_f16
|
||||
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
||||
#define GGML_F16x8_ADD vaddq_f16
|
||||
#define GGML_F16x8_MUL vmulq_f16
|
||||
#define GGML_F16x8_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
|
||||
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F16x8
|
||||
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
|
||||
#else
|
||||
// if FP16 vector arithmetic is not supported, we use FP32 instead
|
||||
// and take advantage of the vcvt_ functions to convert to/from FP16
|
||||
|
||||
#define GGML_F16_STEP 16
|
||||
#define GGML_F16_EPR 4
|
||||
|
||||
#define GGML_F32Cx4 float32x4_t
|
||||
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
|
||||
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
|
||||
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
|
||||
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
|
||||
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
|
||||
#define GGML_F32Cx4_ADD vaddq_f32
|
||||
#define GGML_F32Cx4_MUL vmulq_f32
|
||||
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
#define GGML_F16_VEC GGML_F32Cx4
|
||||
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
||||
#endif
|
||||
|
||||
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
|
||||
@@ -18,73 +18,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
|
||||
#if defined(GGML_SIMD)
|
||||
float sumf = 0.0f;
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
|
||||
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
|
||||
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
|
||||
|
||||
const int np = (n & ~(ggml_f32_step - 1));
|
||||
svfloat32_t sum1 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum2 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum3 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum4 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum5 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum6 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum7 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sum8 = svdup_n_f32(0.0f);
|
||||
svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8;
|
||||
svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8;
|
||||
for (int i = 0; i < np; i += ggml_f32_step) {
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
|
||||
|
||||
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
|
||||
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
||||
sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2);
|
||||
|
||||
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
|
||||
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
|
||||
sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3);
|
||||
|
||||
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
|
||||
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
|
||||
sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4);
|
||||
|
||||
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
|
||||
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
|
||||
sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5);
|
||||
|
||||
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
|
||||
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
|
||||
sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6);
|
||||
|
||||
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
|
||||
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
|
||||
sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7);
|
||||
|
||||
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
|
||||
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
|
||||
sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8);
|
||||
}
|
||||
// leftovers
|
||||
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
|
||||
const int np2 = (n & ~(ggml_f32_epr - 1));
|
||||
for (int i = np; i < np2; i += ggml_f32_epr) {
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1);
|
||||
}
|
||||
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b32(np2, n);
|
||||
ax1 = svld1_f32(pg, x + np2);
|
||||
ay1 = svld1_f32(pg, y + np2);
|
||||
sum1 = svmad_f32_m(pg, ax1, ay1, sum1);
|
||||
}
|
||||
// reduce sum1,sum2 to sum1
|
||||
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
int vl = __riscv_vsetvlmax_e32m8();
|
||||
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
|
||||
vfloat32m8_t vsum;
|
||||
@@ -215,69 +149,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
|
||||
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8; //get vector length
|
||||
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
||||
|
||||
const int np= (n & ~(ggml_f16_step - 1));
|
||||
svfloat16_t sum1 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum2 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum3 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum4 = svdup_n_f16(0.0f);
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1);
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8);
|
||||
}
|
||||
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry);
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
|
||||
sum1 = svmad_f16_x(pg, hx, hy, sum1);
|
||||
}
|
||||
GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_zvfh)
|
||||
int vl = __riscv_vsetvlmax_e32m2();
|
||||
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
|
||||
|
||||
@@ -118,150 +118,37 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
|
||||
x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
|
||||
}
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16; // running when 16
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers
|
||||
GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
|
||||
|
||||
const int np = (n & ~(ggml_f16_step - 1));
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
svfloat16_t sum_00 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_01 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_02 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_03 = svdup_n_f16(0.0f);
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
|
||||
svfloat16_t sum_10 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_11 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_12 = svdup_n_f16(0.0f);
|
||||
svfloat16_t sum_13 = svdup_n_f16(0.0f);
|
||||
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements
|
||||
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);
|
||||
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements
|
||||
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
|
||||
ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
|
||||
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);
|
||||
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
|
||||
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
|
||||
ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
|
||||
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);
|
||||
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
|
||||
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
|
||||
ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
|
||||
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);
|
||||
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
|
||||
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
|
||||
ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);
|
||||
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
|
||||
|
||||
sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
|
||||
ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
|
||||
sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);
|
||||
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
|
||||
|
||||
sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
|
||||
ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
|
||||
sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);
|
||||
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
|
||||
|
||||
sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
|
||||
ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
|
||||
sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
|
||||
}
|
||||
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1));
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
|
||||
sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
|
||||
rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
|
||||
sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
|
||||
svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
|
||||
sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
|
||||
sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
|
||||
}
|
||||
GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
|
||||
GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
for (int i = 0; i < n; ++i) {
|
||||
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
||||
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
|
||||
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
|
||||
|
||||
sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
|
||||
}
|
||||
sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reduce sum0..sum3 to sum0
|
||||
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
|
||||
GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
|
||||
}
|
||||
// reduce sum0..sum3 to sum0
|
||||
for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
|
||||
GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
||||
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
||||
sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
for (int i = 0; i < n; ++i) {
|
||||
for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
|
||||
@@ -277,86 +164,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
|
||||
|
||||
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
|
||||
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
|
||||
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
|
||||
const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers
|
||||
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
|
||||
|
||||
const int np = (n & ~(ggml_f32_step - 1));
|
||||
svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f32_step) {
|
||||
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i, ay1);
|
||||
|
||||
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
|
||||
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
||||
ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
|
||||
|
||||
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
|
||||
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
|
||||
ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);
|
||||
|
||||
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
|
||||
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
|
||||
ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);
|
||||
|
||||
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
|
||||
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
|
||||
ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);
|
||||
|
||||
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
|
||||
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
|
||||
ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);
|
||||
|
||||
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
|
||||
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
|
||||
ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);
|
||||
|
||||
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
|
||||
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
|
||||
ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
|
||||
}
|
||||
// leftovers
|
||||
// Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop
|
||||
const int np2 = (n & ~(ggml_f32_epr - 1));
|
||||
for (int i = np; i < np2; i += ggml_f32_epr) {
|
||||
ax1 = GGML_F32_VEC_LOAD(x + i);
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i, ay1);
|
||||
}
|
||||
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
|
||||
if (np2 < n) {
|
||||
svbool_t pg =svwhilelt_b32(np2, n);
|
||||
ax1 = svld1_f32(pg, x + np2);
|
||||
ay1 = svld1_f32(pg, y + np2);
|
||||
ay1 = svmad_f32_m(pg, ax1, vx, ay1);
|
||||
|
||||
svst1_f32(pg, y + np2, ay1);
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
for (int i = 0, avl; i < n; i += avl) {
|
||||
avl = __riscv_vsetvl_e32m8(n - i);
|
||||
vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
|
||||
@@ -396,113 +204,28 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
|
||||
}
|
||||
|
||||
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16;
|
||||
const int ggml_f16_step = 8 * ggml_f16_epr;
|
||||
#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
const int np= (n & ~(ggml_f16_step - 1));
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
|
||||
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);
|
||||
|
||||
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);
|
||||
|
||||
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
|
||||
ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);
|
||||
|
||||
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
|
||||
ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);
|
||||
|
||||
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
|
||||
ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);
|
||||
|
||||
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
|
||||
ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);
|
||||
|
||||
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
|
||||
ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);
|
||||
|
||||
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
|
||||
ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
|
||||
|
||||
GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
const int np2 = (n & ~(ggml_f16_epr - 1));
|
||||
for (int k = np; k < np2; k += ggml_f16_epr) {
|
||||
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
|
||||
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
|
||||
ry = GGML_F16x_VEC_FMA(ry, rx, vx);
|
||||
}
|
||||
|
||||
GGML_F16x_VEC_STORE(y + k, ry, 0);
|
||||
}
|
||||
|
||||
if (np2 < n) {
|
||||
svbool_t pg = svwhilelt_b16(np2, n);
|
||||
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
|
||||
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
|
||||
hy = svmad_f16_x(pg, hx, vx, hy);
|
||||
svst1_f16(pg, (__fp16 *)(y + np2), hy);
|
||||
}
|
||||
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
GGML_F16_VEC ax[GGML_F16_ARR];
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#endif
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
@@ -523,14 +246,7 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
|
||||
}
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
// scalar Route to scalar implementation //TODO: Write SVE code
|
||||
for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] += x[k][i]*v[k][0];
|
||||
}
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
for (int i = 0, avl; i < n; i += avl) {
|
||||
avl = __riscv_vsetvl_e32m8(n - i);
|
||||
vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
|
||||
@@ -586,12 +302,7 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
|
||||
#elif defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
// scalar ; TODO: Write SVE code
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = x[i]*s + b;
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
for (int i = 0, avl; i < n; i += avl) {
|
||||
avl = __riscv_vsetvl_e32m8(n - i);
|
||||
vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
|
||||
@@ -610,7 +321,7 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
|
||||
for (int i = 0; i < np; i += GGML_F32_STEP) {
|
||||
for (int j = 0; j < GGML_F32_ARR; j++) {
|
||||
ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
|
||||
ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);
|
||||
ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
|
||||
|
||||
GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
|
||||
}
|
||||
@@ -634,33 +345,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
vDSP_vsmul(y, 1, &v, y, 1, n);
|
||||
#elif defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
|
||||
const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16
|
||||
const int ggml_f32_step = 2 * ggml_f32_epr;
|
||||
|
||||
GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
|
||||
const int np = (n & ~(ggml_f32_step - 1));
|
||||
svfloat32_t ay1;
|
||||
svfloat32_t ay2;
|
||||
for (int i = 0; i < np; i += ggml_f32_step) {
|
||||
ay1 = GGML_F32_VEC_LOAD(y + i);
|
||||
ay1 = GGML_F32_VEC_MUL(ay1, vx);
|
||||
GGML_F32_VEC_STORE(y + i, ay1);
|
||||
|
||||
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
|
||||
ay2 = GGML_F32_VEC_MUL(ay2, vx);
|
||||
GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
|
||||
}
|
||||
// leftovers
|
||||
// maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only
|
||||
if (np < n) {
|
||||
svbool_t pg = svwhilelt_b32(np, n);
|
||||
ay1 = svld1_f32(pg, y + np);
|
||||
ay1 = svmul_f32_m(pg, ay1, vx);
|
||||
svst1_f32(pg, y + np, ay1);
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
for (int i = 0, avl; i < n; i += avl) {
|
||||
avl = __riscv_vsetvl_e32m8(n - i);
|
||||
vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
|
||||
@@ -697,60 +382,26 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
|
||||
}
|
||||
|
||||
inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
|
||||
#if defined(GGML_SIMD)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const int sve_register_length = svcntb() * 8;
|
||||
const int ggml_f16_epr = sve_register_length / 16;
|
||||
const int ggml_f16_step = 2 * ggml_f16_epr;
|
||||
#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
|
||||
GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
|
||||
const int np = (n & ~(ggml_f16_step - 1));
|
||||
svfloat16_t ay1, ay2;
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
for (int i = 0; i < np; i += ggml_f16_step) {
|
||||
ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
|
||||
ay1 = GGML_F16x_VEC_MUL(ay1, vx);
|
||||
GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
|
||||
ay2 = GGML_F16x_VEC_MUL(ay2, vx);
|
||||
GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
// leftovers
|
||||
// maximum number of leftover elements will be less that ggmlF_16x_epr. Apply predicated svmad on available elements only
|
||||
if (np < n) {
|
||||
svbool_t pg = svwhilelt_b16(np, n);
|
||||
svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
|
||||
svfloat16_t out = svmul_f16_m(pg, hy, vx);
|
||||
svst1_f16(pg, (__fp16 *)(y + np), out);
|
||||
}
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
// todo: RVV impl
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#else
|
||||
const int np = (n & ~(GGML_F16_STEP - 1));
|
||||
}
|
||||
|
||||
GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
|
||||
|
||||
GGML_F16_VEC ay[GGML_F16_ARR];
|
||||
|
||||
for (int i = 0; i < np; i += GGML_F16_STEP) {
|
||||
for (int j = 0; j < GGML_F16_ARR; j++) {
|
||||
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
|
||||
ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
|
||||
|
||||
GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
|
||||
}
|
||||
}
|
||||
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#endif
|
||||
// leftovers
|
||||
for (int i = np; i < n; ++i) {
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
for (int i = 0; i < n; ++i) {
|
||||
|
||||
@@ -3639,11 +3639,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
return true;
|
||||
case GGML_OP_ARGSORT:
|
||||
// TODO: Support arbitrary column width
|
||||
return op->src[0]->ne[0] <= 1024;
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
|
||||
@@ -683,11 +683,9 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
|
||||
(ggml_get_op_params_i32(op, 4) == 0) && (ggml_get_op_params_i32(op, 6) == 0);
|
||||
case GGML_OP_PAD_REFLECT_1D:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_ARGSORT:
|
||||
// TODO: Support arbitrary column width
|
||||
return op->src[0]->ne[0] <= 1024;
|
||||
case GGML_OP_ARANGE:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
|
||||
@@ -67,48 +67,30 @@ layout (binding = 5) writeonly buffer O {D_TYPE data_o[];};
|
||||
#if defined(A_TYPE_PACKED16)
|
||||
#define BINDING_IDX_K 0
|
||||
#define BINDING_IDX_V 1
|
||||
layout (binding = 1) readonly buffer K_PACKED16 {A_TYPE_PACKED16 k_data_packed16[];} k_packed;
|
||||
layout (binding = 2) readonly buffer V_PACKED16 {A_TYPE_PACKED16 v_data_packed16[];} v_packed;
|
||||
layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2];
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q4_0)
|
||||
#define BLOCK_BYTE_SIZE 18
|
||||
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
if (binding_idx == BINDING_IDX_K) {
|
||||
uint vui_lo = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
return float(k_packed.k_data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
|
||||
} else {
|
||||
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
return float(v_packed.v_data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
|
||||
}
|
||||
return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q8_0)
|
||||
#define BLOCK_BYTE_SIZE 34
|
||||
vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
if (binding_idx == BINDING_IDX_K) {
|
||||
const i8vec2 v0 = unpack8(int32_t(k_packed.k_data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
|
||||
const i8vec2 v1 = unpack8(int32_t(k_packed.k_data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
|
||||
const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
|
||||
const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
|
||||
|
||||
return float(k_packed.k_data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
|
||||
} else {
|
||||
const i8vec2 v0 = unpack8(int32_t(v_packed.v_data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
|
||||
const i8vec2 v1 = unpack8(int32_t(v_packed.v_data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
|
||||
|
||||
return float(v_packed.v_data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
|
||||
}
|
||||
return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -3687,7 +3687,6 @@ struct ggml_tensor * ggml_set_rows(
|
||||
result->op = GGML_OP_SET_ROWS;
|
||||
result->src[0] = b;
|
||||
result->src[1] = c;
|
||||
result->src[2] = a; // note: order is weird due to legacy reasons (https://github.com/ggml-org/llama.cpp/pull/16063#discussion_r2385795931)
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -6155,7 +6155,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_add1());
|
||||
test_cases.emplace_back(new test_scale());
|
||||
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
|
||||
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {100, 10, 10, 10}, 2.0f, 1.0f));
|
||||
test_cases.emplace_back(new test_softcap(GGML_TYPE_F32, {10, 10, 10, 10}, 50.0f));
|
||||
test_cases.emplace_back(new test_silu_back());
|
||||
|
||||
@@ -6567,7 +6566,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order));
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {1024, 1, 1, 1}, order));
|
||||
test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16384, 1, 1, 1}, order)); // bailingmoe2 (group selection)
|
||||
}
|
||||
|
||||
for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) {
|
||||
|
||||
@@ -707,10 +707,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
embd.push_back(id);
|
||||
|
||||
if (params.conversation_mode && !waiting_for_first_input && !llama_vocab_is_eog(vocab, id)) {
|
||||
assistant_ss << common_token_to_piece(ctx, id, false);
|
||||
}
|
||||
|
||||
// echo this to console
|
||||
input_echo = true;
|
||||
|
||||
@@ -828,7 +824,11 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
// if current token is not EOG, we add it to current assistant message
|
||||
if (params.conversation_mode && !waiting_for_first_input) {
|
||||
const auto id = common_sampler_last(smpl);
|
||||
assistant_ss << common_token_to_piece(ctx, id, false);
|
||||
|
||||
if (!prompt.empty()) {
|
||||
prompt.clear();
|
||||
is_interacting = false;
|
||||
|
||||
@@ -1931,13 +1931,11 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
|
||||
LOG("Maximum KLD: %10.6f\n", kld_values.back());
|
||||
LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
|
||||
LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
|
||||
LOG("95.0%% KLD: %10.6f\n", percentile(kld_values, 0.950f));
|
||||
LOG("90.0%% KLD: %10.6f\n", percentile(kld_values, 0.900f));
|
||||
LOG("Median KLD: %10.6f\n", kld_median);
|
||||
LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
|
||||
LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
|
||||
LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
|
||||
LOG(" 0.1%% KLD: %10.6f\n", percentile(kld_values, 0.001f));
|
||||
LOG("Minimum KLD: %10.6f\n", kld_values.front());
|
||||
|
||||
LOG("\n");
|
||||
|
||||
Binary file not shown.
@@ -39,7 +39,6 @@
|
||||
--sidebar-ring: oklch(0.708 0 0);
|
||||
--code-background: oklch(0.225 0 0);
|
||||
--code-foreground: oklch(0.875 0 0);
|
||||
--layer-popover: 1000000;
|
||||
}
|
||||
|
||||
.dark {
|
||||
|
||||
@@ -362,8 +362,7 @@
|
||||
|
||||
<Dialog.Root {open} onOpenChange={handleClose}>
|
||||
<Dialog.Content
|
||||
class="z-999999 flex h-[100dvh] max-h-[100dvh] min-h-[100dvh] flex-col gap-0 rounded-none p-0
|
||||
md:h-[64vh] md:max-h-[64vh] md:min-h-0 md:rounded-lg"
|
||||
class="z-999999 flex h-[100vh] flex-col gap-0 rounded-none p-0 md:h-[64vh] md:rounded-lg"
|
||||
style="max-width: 48rem;"
|
||||
>
|
||||
<div class="flex flex-1 flex-col overflow-hidden md:flex-row">
|
||||
@@ -442,7 +441,7 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<ScrollArea class="max-h-[calc(100dvh-13.5rem)] flex-1 md:max-h-[calc(100vh-13.5rem)]">
|
||||
<ScrollArea class="max-h-[calc(100vh-13.5rem)] flex-1">
|
||||
<div class="space-y-6 p-4 md:p-6">
|
||||
<div>
|
||||
<div class="mb-6 flex hidden items-center gap-2 border-b border-border/30 pb-6 md:flex">
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
import * as Select from '$lib/components/ui/select';
|
||||
import { Textarea } from '$lib/components/ui/textarea';
|
||||
import { SETTING_CONFIG_DEFAULT, SETTING_CONFIG_INFO } from '$lib/constants/settings-config';
|
||||
import { IsMobile } from '$lib/hooks/is-mobile.svelte';
|
||||
import { supportsVision } from '$lib/stores/server.svelte';
|
||||
import type { Component } from 'svelte';
|
||||
|
||||
@@ -16,6 +17,8 @@
|
||||
}
|
||||
|
||||
let { fields, localConfig, onConfigChange, onThemeChange }: Props = $props();
|
||||
|
||||
let isMobile = $state(new IsMobile());
|
||||
</script>
|
||||
|
||||
{#each fields as field (field.key)}
|
||||
@@ -27,10 +30,10 @@
|
||||
|
||||
<Input
|
||||
id={field.key}
|
||||
value={String(localConfig[field.key] ?? '')}
|
||||
value={String(localConfig[field.key] || '')}
|
||||
onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
|
||||
class="w-full md:max-w-md"
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] || 'none'}`}
|
||||
class={isMobile ? 'w-full' : 'max-w-md'}
|
||||
/>
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="mt-1 text-xs text-muted-foreground">
|
||||
@@ -44,10 +47,10 @@
|
||||
|
||||
<Textarea
|
||||
id={field.key}
|
||||
value={String(localConfig[field.key] ?? '')}
|
||||
value={String(localConfig[field.key] || '')}
|
||||
onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
|
||||
class="min-h-[100px] w-full md:max-w-2xl"
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] || 'none'}`}
|
||||
class={isMobile ? 'min-h-[100px] w-full' : 'min-h-[100px] max-w-2xl'}
|
||||
/>
|
||||
{#if field.help || SETTING_CONFIG_INFO[field.key]}
|
||||
<p class="mt-1 text-xs text-muted-foreground">
|
||||
@@ -75,7 +78,7 @@
|
||||
}
|
||||
}}
|
||||
>
|
||||
<Select.Trigger class="w-full md:w-auto md:max-w-md">
|
||||
<Select.Trigger class={isMobile ? 'w-full' : 'max-w-md'}>
|
||||
<div class="flex items-center gap-2">
|
||||
{#if selectedOption?.icon}
|
||||
{@const IconComponent = selectedOption.icon}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
<script lang="ts">
|
||||
import { Button } from '$lib/components/ui/button';
|
||||
import * as AlertDialog from '$lib/components/ui/alert-dialog';
|
||||
|
||||
interface Props {
|
||||
onReset?: () => void;
|
||||
@@ -9,15 +8,8 @@
|
||||
|
||||
let { onReset, onSave }: Props = $props();
|
||||
|
||||
let showResetDialog = $state(false);
|
||||
|
||||
function handleResetClick() {
|
||||
showResetDialog = true;
|
||||
}
|
||||
|
||||
function handleConfirmReset() {
|
||||
function handleReset() {
|
||||
onReset?.();
|
||||
showResetDialog = false;
|
||||
}
|
||||
|
||||
function handleSave() {
|
||||
@@ -26,23 +18,7 @@
|
||||
</script>
|
||||
|
||||
<div class="flex justify-between border-t border-border/30 p-6">
|
||||
<Button variant="outline" onclick={handleResetClick}>Reset to default</Button>
|
||||
<Button variant="outline" onclick={handleReset}>Reset to default</Button>
|
||||
|
||||
<Button onclick={handleSave}>Save settings</Button>
|
||||
</div>
|
||||
|
||||
<AlertDialog.Root bind:open={showResetDialog}>
|
||||
<AlertDialog.Content>
|
||||
<AlertDialog.Header>
|
||||
<AlertDialog.Title>Reset Settings to Default</AlertDialog.Title>
|
||||
<AlertDialog.Description>
|
||||
Are you sure you want to reset all settings to their default values? This action cannot be
|
||||
undone and will permanently remove all your custom configurations.
|
||||
</AlertDialog.Description>
|
||||
</AlertDialog.Header>
|
||||
<AlertDialog.Footer>
|
||||
<AlertDialog.Cancel>Cancel</AlertDialog.Cancel>
|
||||
<AlertDialog.Action onclick={handleConfirmReset}>Reset to Default</AlertDialog.Action>
|
||||
</AlertDialog.Footer>
|
||||
</AlertDialog.Content>
|
||||
</AlertDialog.Root>
|
||||
|
||||
@@ -87,7 +87,7 @@
|
||||
<Sidebar.GroupContent>
|
||||
<Sidebar.Menu>
|
||||
{#each filteredConversations as conversation (conversation.id)}
|
||||
<Sidebar.MenuItem class="mb-1">
|
||||
<Sidebar.MenuItem class="mb-1" onclick={handleMobileSidebarItemClick}>
|
||||
<ChatSidebarConversationItem
|
||||
conversation={{
|
||||
id: conversation.id,
|
||||
@@ -95,7 +95,6 @@
|
||||
lastModified: conversation.lastModified,
|
||||
currNode: conversation.currNode
|
||||
}}
|
||||
{handleMobileSidebarItemClick}
|
||||
isActive={currentChatId === conversation.id}
|
||||
onSelect={selectConversation}
|
||||
onEdit={editConversation}
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
interface Props {
|
||||
isActive?: boolean;
|
||||
conversation: DatabaseConversation;
|
||||
handleMobileSidebarItemClick?: () => void;
|
||||
onDelete?: (id: string) => void;
|
||||
onEdit?: (id: string, name: string) => void;
|
||||
onSelect?: (id: string) => void;
|
||||
@@ -17,7 +16,6 @@
|
||||
|
||||
let {
|
||||
conversation,
|
||||
handleMobileSidebarItemClick,
|
||||
onDelete,
|
||||
onEdit,
|
||||
onSelect,
|
||||
@@ -49,7 +47,6 @@
|
||||
|
||||
function handleConfirmEdit() {
|
||||
if (!editedName.trim()) return;
|
||||
showEditDialog = false;
|
||||
onEdit?.(conversation.id, editedName);
|
||||
}
|
||||
|
||||
@@ -88,12 +85,7 @@
|
||||
: ''}"
|
||||
onclick={handleSelect}
|
||||
>
|
||||
<!-- svelte-ignore a11y_click_events_have_key_events -->
|
||||
<!-- svelte-ignore a11y_no_static_element_interactions -->
|
||||
<div
|
||||
class="text flex min-w-0 flex-1 items-center space-x-3"
|
||||
onclick={handleMobileSidebarItemClick}
|
||||
>
|
||||
<div class="text flex min-w-0 flex-1 items-center space-x-3">
|
||||
<div class="min-w-0 flex-1">
|
||||
<p class="truncate text-sm font-medium">{conversation.name}</p>
|
||||
|
||||
@@ -186,10 +178,5 @@
|
||||
&:is(:hover) :global([data-slot='dropdown-menu-trigger']) {
|
||||
opacity: 1;
|
||||
}
|
||||
@media (max-width: 768px) {
|
||||
:global([data-slot='dropdown-menu-trigger']) {
|
||||
opacity: 1 !important;
|
||||
}
|
||||
}
|
||||
}
|
||||
</style>
|
||||
|
||||
@@ -37,7 +37,6 @@
|
||||
<DropdownMenu.Root bind:open>
|
||||
<DropdownMenu.Trigger
|
||||
class="flex h-6 w-6 cursor-pointer items-center justify-center rounded-md p-0 text-sm font-medium transition-colors hover:bg-accent hover:text-accent-foreground focus:bg-accent focus:text-accent-foreground focus:outline-none disabled:pointer-events-none disabled:opacity-50 data-[state=open]:bg-accent data-[state=open]:text-accent-foreground {triggerClass}"
|
||||
onclick={(e) => e.stopPropagation()}
|
||||
>
|
||||
{#if triggerTooltip}
|
||||
<Tooltip.Root delayDuration={TOOLTIP_DELAY_DURATION}>
|
||||
@@ -54,7 +53,7 @@
|
||||
{/if}
|
||||
</DropdownMenu.Trigger>
|
||||
|
||||
<DropdownMenu.Content {align} class="z-[999999] w-48">
|
||||
<DropdownMenu.Content {align} class="z-999 w-48">
|
||||
{#each actions as action, index (action.label)}
|
||||
{#if action.separator && index > 0}
|
||||
<DropdownMenu.Separator />
|
||||
|
||||
@@ -19,15 +19,7 @@
|
||||
bind:ref
|
||||
data-slot="alert-dialog-content"
|
||||
class={cn(
|
||||
'fixed z-[999999] grid w-full gap-4 border bg-background p-6 shadow-lg duration-200',
|
||||
// Mobile: Bottom sheet behavior
|
||||
'right-0 bottom-0 left-0 max-h-[100dvh] translate-x-0 translate-y-0 overflow-y-auto rounded-t-lg',
|
||||
'data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:slide-out-to-bottom-full',
|
||||
'data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:slide-in-from-bottom-full',
|
||||
// Desktop: Centered dialog behavior
|
||||
'sm:top-[50%] sm:right-auto sm:bottom-auto sm:left-[50%] sm:max-h-[100vh] sm:max-w-lg sm:translate-x-[-50%] sm:translate-y-[-50%] sm:rounded-lg',
|
||||
'sm:data-[state=closed]:slide-out-to-bottom-0 sm:data-[state=closed]:zoom-out-95',
|
||||
'sm:data-[state=open]:slide-in-from-bottom-0 sm:data-[state=open]:zoom-in-95',
|
||||
'fixed top-[50%] left-[50%] z-50 grid w-full max-w-[calc(100%-2rem)] translate-x-[-50%] translate-y-[-50%] gap-4 rounded-lg border bg-background p-6 shadow-lg duration-200 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 sm:max-w-lg',
|
||||
className
|
||||
)}
|
||||
{...restProps}
|
||||
|
||||
@@ -13,10 +13,7 @@
|
||||
<div
|
||||
bind:this={ref}
|
||||
data-slot="alert-dialog-footer"
|
||||
class={cn(
|
||||
'mt-6 flex flex-row gap-2 sm:mt-0 sm:justify-end [&>*]:flex-1 sm:[&>*]:flex-none',
|
||||
className
|
||||
)}
|
||||
class={cn('flex flex-col-reverse gap-2 sm:flex-row sm:justify-end', className)}
|
||||
{...restProps}
|
||||
>
|
||||
{@render children?.()}
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
bind:ref
|
||||
data-slot="dialog-content"
|
||||
class={cn(
|
||||
`fixed top-[50%] left-[50%] z-50 grid max-h-[100dvh] w-full max-w-[calc(100%-2rem)] translate-x-[-50%] translate-y-[-50%] gap-4 overflow-y-auto rounded-lg border border-border/30 bg-background p-6 shadow-lg duration-200 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 sm:max-w-lg md:max-h-[100vh]`,
|
||||
'fixed top-[50%] left-[50%] z-50 grid w-full max-w-[calc(100%-2rem)] translate-x-[-50%] translate-y-[-50%] gap-4 rounded-lg border border-border/30 bg-background p-6 shadow-lg duration-200 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95 sm:max-w-lg',
|
||||
className
|
||||
)}
|
||||
{...restProps}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
<script lang="ts">
|
||||
import { onDestroy, onMount } from 'svelte';
|
||||
import { Select as SelectPrimitive } from 'bits-ui';
|
||||
import SelectScrollUpButton from './select-scroll-up-button.svelte';
|
||||
import SelectScrollDownButton from './select-scroll-down-button.svelte';
|
||||
@@ -15,76 +14,6 @@
|
||||
}: WithoutChild<SelectPrimitive.ContentProps> & {
|
||||
portalProps?: SelectPrimitive.PortalProps;
|
||||
} = $props();
|
||||
|
||||
let cleanupInternalListeners: (() => void) | undefined;
|
||||
|
||||
onMount(() => {
|
||||
const listenerOptions: AddEventListenerOptions = { passive: false };
|
||||
|
||||
const blockOutsideWheel = (event: WheelEvent) => {
|
||||
if (!ref) {
|
||||
return;
|
||||
}
|
||||
|
||||
const target = event.target as Node | null;
|
||||
|
||||
if (!target || !ref.contains(target)) {
|
||||
event.preventDefault();
|
||||
event.stopPropagation();
|
||||
}
|
||||
};
|
||||
|
||||
const blockOutsideTouchMove = (event: TouchEvent) => {
|
||||
if (!ref) {
|
||||
return;
|
||||
}
|
||||
|
||||
const target = event.target as Node | null;
|
||||
|
||||
if (!target || !ref.contains(target)) {
|
||||
event.preventDefault();
|
||||
event.stopPropagation();
|
||||
}
|
||||
};
|
||||
|
||||
document.addEventListener('wheel', blockOutsideWheel, listenerOptions);
|
||||
document.addEventListener('touchmove', blockOutsideTouchMove, listenerOptions);
|
||||
|
||||
return () => {
|
||||
document.removeEventListener('wheel', blockOutsideWheel, listenerOptions);
|
||||
document.removeEventListener('touchmove', blockOutsideTouchMove, listenerOptions);
|
||||
};
|
||||
});
|
||||
|
||||
$effect(() => {
|
||||
const element = ref;
|
||||
|
||||
cleanupInternalListeners?.();
|
||||
|
||||
if (!element) {
|
||||
return;
|
||||
}
|
||||
|
||||
const stopWheelPropagation = (event: WheelEvent) => {
|
||||
event.stopPropagation();
|
||||
};
|
||||
|
||||
const stopTouchPropagation = (event: TouchEvent) => {
|
||||
event.stopPropagation();
|
||||
};
|
||||
|
||||
element.addEventListener('wheel', stopWheelPropagation);
|
||||
element.addEventListener('touchmove', stopTouchPropagation);
|
||||
|
||||
cleanupInternalListeners = () => {
|
||||
element.removeEventListener('wheel', stopWheelPropagation);
|
||||
element.removeEventListener('touchmove', stopTouchPropagation);
|
||||
};
|
||||
});
|
||||
|
||||
onDestroy(() => {
|
||||
cleanupInternalListeners?.();
|
||||
});
|
||||
</script>
|
||||
|
||||
<SelectPrimitive.Portal {...portalProps}>
|
||||
@@ -93,7 +22,7 @@
|
||||
{sideOffset}
|
||||
data-slot="select-content"
|
||||
class={cn(
|
||||
'relative z-[var(--layer-popover,1000000)] max-h-(--bits-select-content-available-height) min-w-[8rem] origin-(--bits-select-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border bg-popover text-popover-foreground shadow-md data-[side=bottom]:translate-y-1 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:-translate-x-1 data-[side=left]:slide-in-from-right-2 data-[side=right]:translate-x-1 data-[side=right]:slide-in-from-left-2 data-[side=top]:-translate-y-1 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
|
||||
'relative z-50 max-h-(--bits-select-content-available-height) min-w-[8rem] origin-(--bits-select-content-transform-origin) overflow-x-hidden overflow-y-auto rounded-md border bg-popover text-popover-foreground shadow-md data-[side=bottom]:translate-y-1 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:-translate-x-1 data-[side=left]:slide-in-from-right-2 data-[side=right]:translate-x-1 data-[side=right]:slide-in-from-left-2 data-[side=top]:-translate-y-1 data-[side=top]:slide-in-from-bottom-2 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[state=open]:animate-in data-[state=open]:fade-in-0 data-[state=open]:zoom-in-95',
|
||||
className
|
||||
)}
|
||||
{...restProps}
|
||||
|
||||
Reference in New Issue
Block a user