Compare commits

..

3 Commits

4 changed files with 103 additions and 6 deletions

View File

@@ -203,7 +203,6 @@
#elif defined(__riscv)
// quants.c
#define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
#define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0
// repack.cpp
#define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1
#define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4

View File

@@ -480,6 +480,104 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
#endif
}
#if defined(__riscv_v)
static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl256(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
const int qk = QK1_0;
const int nb = n / qk;
assert(n % qk == 0);
const block_q1_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
//LMUL = 1, VLMAX = 32
const size_t vl32 = __riscv_vsetvl_e8m1(32);
assert(vl32 == 32);
const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
float sumf = 0;
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
float acc = 0;
for (int k = 0; k < 4; ++k) {
const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
const vbool8_t is_not_zero = __riscv_vlm_v_b8(x[ib].qs + 4 * k, vl32);
const vint8m1_t qy = __riscv_vle8_v_i8m1(yb->qs, vl32);
const vint8m1_t neg_qy = __riscv_vneg_v_i8m1(qy, vl32);
const vint8m1_t sy = __riscv_vmerge_vvm_i8m1(neg_qy, qy, is_not_zero, vl32);
const vint16m1_t red = __riscv_vwredsum_vs_i8m1_i16m1(sy, zero, vl32);
acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
}
sumf += d0 * acc;
}
*s = sumf;
}
static NOINLINE void ggml_vec_dot_q1_0_q8_0_vl128(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) {
const int qk = QK1_0;
const int nb = n / qk;
assert(n % qk == 0);
const block_q1_0 * GGML_RESTRICT x = vx;
const block_q8_0 * GGML_RESTRICT y = vy;
//LMUL = 2, VLMAX = 32
const size_t vl32 = __riscv_vsetvl_e8m2(32);
assert(vl32 == 32);
const vint16m1_t zero = __riscv_vmv_v_x_i16m1(0, 1);
float sumf = 0;
for (int ib = 0; ib < nb; ++ib) {
const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
float acc = 0;
for (int k = 0; k < 4; ++k) {
const block_q8_0 * GGML_RESTRICT yb = &y[ib * 4 + k];
const vbool4_t is_not_zero = __riscv_vlm_v_b4(x[ib].qs + 4 * k, vl32);
const vint8m2_t qy = __riscv_vle8_v_i8m2(yb->qs, vl32);
const vint8m2_t neg_qy =__riscv_vneg_v_i8m2(qy, vl32);
const vint8m2_t sy = __riscv_vmerge_vvm_i8m2(neg_qy, qy, is_not_zero, vl32);
const vint16m1_t red = __riscv_vwredsum_vs_i8m2_i16m1(sy, zero, vl32);
acc += GGML_CPU_FP16_TO_FP32(yb->d) * (float)__riscv_vmv_x_s_i16m1_i16(red);
}
sumf += d0 * acc;
}
*s = sumf;
}
#endif
void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
#if defined(__riscv_v)
assert(nrc == 1);
const size_t vlen_bits = __riscv_vlenb() * 8;
if (vlen_bits >= 256) {
ggml_vec_dot_q1_0_q8_0_vl256(n, s, vx, vy);
} else if (vlen_bits >= 128) {
ggml_vec_dot_q1_0_q8_0_vl128(n, s, vx, vy);
} else {
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
}
#else
ggml_vec_dot_q1_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc);
#endif
}
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
assert(nrc == 1);
UNUSED(nrc);

View File

@@ -2656,13 +2656,8 @@ size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * sr
throw std::runtime_error("wrong sequence state magic");
}
const bool need_seq_match = (flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
llama_seq_id seq_id_read;
io->read(&seq_id_read, sizeof(seq_id_read));
if (need_seq_match && seq_id != seq_id_read) {
throw std::runtime_error("wrong sequence id");
}
return state_seq_read_data(*io, seq_id, flags);
} catch (const std::exception & err) {

View File

@@ -403,6 +403,11 @@ static bool log_mel_spectrogram(
return false;
}
std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
// expose the padded buffer to downstream FFT and to out.n_len computation
// mirrors the no_padding and center_padding branches above
samples = samples_padded.data();
n_samples = samples_padded.size();
}
// preemphasis