mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
3 Commits
master-129
...
master-be2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
be2301bcda | ||
|
|
1aa18ef994 | ||
|
|
9a08eaf3c4 |
@@ -61,6 +61,13 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
|
||||
// get data from the device into host memory
|
||||
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
||||
|
||||
// try to find operations that can be run concurrently in the graph
|
||||
// you should run it again if the topology of your graph changes
|
||||
void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
||||
|
||||
// if the graph has been optimized for concurrently dispatch
|
||||
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
|
||||
|
||||
// same as ggml_graph_compute but uses Metal
|
||||
// creates gf->n_threads command buffers in parallel
|
||||
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
||||
|
||||
147
ggml-metal.m
147
ggml-metal.m
@@ -36,6 +36,9 @@ struct ggml_metal_context {
|
||||
int n_buffers;
|
||||
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
||||
|
||||
int concur_list[GGML_MAX_NODES];
|
||||
int concur_list_len;
|
||||
|
||||
// custom kernels
|
||||
#define GGML_METAL_DECL_KERNEL(name) \
|
||||
id<MTLFunction> function_##name; \
|
||||
@@ -98,6 +101,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
ctx->device = MTLCreateSystemDefaultDevice();
|
||||
ctx->queue = [ctx->device newCommandQueue];
|
||||
ctx->n_buffers = 0;
|
||||
ctx->concur_list_len = 0;
|
||||
|
||||
// determine if we can use MPS
|
||||
if (MPSSupportsMTLDevice(ctx->device)) {
|
||||
@@ -217,6 +221,13 @@ void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
|
||||
ctx->n_cb = n_cb;
|
||||
}
|
||||
|
||||
bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
|
||||
if (ctx->concur_list_len) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||
// Metal buffer based on the host memory pointer
|
||||
@@ -355,11 +366,98 @@ void ggml_metal_get_tensor(
|
||||
memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
|
||||
}
|
||||
|
||||
void ggml_metal_graph_find_concurrency(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_cgraph * gf) {
|
||||
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
|
||||
int nodes_unused[GGML_MAX_NODES];
|
||||
|
||||
for (int i = 0; i < GGML_MAX_NODES; i++) {ctx->concur_list[i] = 0;}
|
||||
for (int i = 0; i < gf->n_nodes; i++) {nodes_unused[i] = 1;}
|
||||
ctx->concur_list_len = 0;
|
||||
|
||||
int n_left = gf->n_nodes;
|
||||
int n_start = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
|
||||
int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
|
||||
|
||||
while (n_left > 0) {
|
||||
// number of nodes at a layer (that can be issued concurrently)
|
||||
int concurrency = 0;
|
||||
for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
|
||||
if (nodes_unused[i]) {
|
||||
// if the requirements for gf->nodes[i] are satisfied
|
||||
int exe_flag=1;
|
||||
// scan all srcs
|
||||
for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
|
||||
struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
|
||||
if (src_cur) {
|
||||
// if is leaf nodes it's satisfied.
|
||||
if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {continue;}
|
||||
|
||||
// otherwise this src should be the output from previous nodes.
|
||||
int is_found = 0;
|
||||
// scan 2*search_depth back because we inserted barrier.
|
||||
for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
|
||||
if (gf->nodes[ctx->concur_list[j]] == src_cur) {is_found = 1; break;}
|
||||
}
|
||||
if (is_found == 0) {exe_flag = 0; break;}
|
||||
}
|
||||
}
|
||||
if (exe_flag) {
|
||||
// check if nodes[i]'s data will be overwritten by a node before nodes[i].
|
||||
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
|
||||
int64_t data_start = (int64_t) gf->nodes[i]->data;
|
||||
int64_t length = (int64_t) ggml_nbytes(gf->nodes[i]);
|
||||
for (int j = n_start; j < i; j++) {
|
||||
if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
|
||||
&& gf->nodes[j]->op != GGML_OP_VIEW \
|
||||
&& gf->nodes[j]->op != GGML_OP_TRANSPOSE \
|
||||
&& gf->nodes[j]->op != GGML_OP_PERMUTE) {
|
||||
if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
|
||||
((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
|
||||
continue;
|
||||
} else {
|
||||
exe_flag = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (exe_flag) {
|
||||
ctx->concur_list[level_pos + concurrency] = i;
|
||||
nodes_unused[i] = 0;
|
||||
concurrency++;
|
||||
ctx->concur_list_len++;
|
||||
}
|
||||
}
|
||||
}
|
||||
n_left -= concurrency;
|
||||
// adding a barrier different layer
|
||||
ctx->concur_list[level_pos + concurrency] = -1;
|
||||
ctx->concur_list_len++;
|
||||
// jump all sorted nodes at nodes_bak
|
||||
while (!nodes_unused[n_start]) {n_start++;}
|
||||
level_pos += concurrency + 1;
|
||||
}
|
||||
|
||||
if (ctx->concur_list_len > GGML_MAX_NODES) {
|
||||
fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_metal_graph_compute(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_cgraph * gf) {
|
||||
metal_printf("%s: evaluating graph\n", __func__);
|
||||
|
||||
// if there is ctx->concur_list, dispatch concurrently
|
||||
// else fallback to serial dispatch
|
||||
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
|
||||
|
||||
const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_NODES;
|
||||
|
||||
const int n_nodes = has_concur ? ctx->concur_list_len : gf->n_nodes;
|
||||
edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
|
||||
|
||||
// create multiple command buffers and enqueue them
|
||||
// then, we encode the graph into the command buffers in parallel
|
||||
|
||||
@@ -378,7 +476,7 @@ void ggml_metal_graph_compute(
|
||||
dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
|
||||
const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
|
||||
const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
|
||||
|
||||
dispatch_async(queue, ^{
|
||||
size_t offs_src0 = 0;
|
||||
@@ -389,10 +487,21 @@ void ggml_metal_graph_compute(
|
||||
|
||||
id<MTLComputeCommandEncoder> encoder = nil;
|
||||
|
||||
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
|
||||
const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
|
||||
const int node_start = (cb_idx + 0) * n_nodes_per_cb;
|
||||
const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
|
||||
|
||||
for (int ind = node_start; ind < node_end; ++ind) {
|
||||
const int i = has_concur ? ctx->concur_list[ind] : ind;
|
||||
|
||||
if (i == -1) {
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
continue;
|
||||
}
|
||||
[encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = node_start; i < node_end; ++i) {
|
||||
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
|
||||
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
|
||||
@@ -463,7 +572,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
if (ggml_nelements(src1) == ne10) {
|
||||
@@ -484,7 +593,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
if (ggml_nelements(src1) == ne10) {
|
||||
@@ -505,7 +614,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_SCALE:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
const float scale = *(const float *) src1->data;
|
||||
@@ -524,7 +633,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_UNARY_OP_SILU:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_silu];
|
||||
@@ -538,7 +647,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_UNARY_OP_RELU:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_relu];
|
||||
@@ -552,7 +661,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_UNARY_OP_GELU:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_gelu];
|
||||
@@ -572,7 +681,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_SOFT_MAX:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
const int nth = 32;
|
||||
@@ -590,7 +699,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
const int n_past = ((int32_t *)(dst->op_params))[0];
|
||||
@@ -653,7 +762,7 @@ void ggml_metal_graph_compute(
|
||||
}
|
||||
} else {
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
int nth0 = 32;
|
||||
@@ -780,7 +889,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
switch (src0->type) {
|
||||
@@ -809,7 +918,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_RMS_NORM:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
float eps;
|
||||
@@ -832,7 +941,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_NORM:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
const float eps = 1e-5f;
|
||||
@@ -854,7 +963,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_ALIBI:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
GGML_ASSERT((src0t == GGML_TYPE_F32));
|
||||
@@ -897,7 +1006,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_ROPE:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
const int n_past = ((int32_t *) dst->op_params)[0];
|
||||
@@ -941,7 +1050,7 @@ void ggml_metal_graph_compute(
|
||||
case GGML_OP_CONT:
|
||||
{
|
||||
if (encoder == nil) {
|
||||
encoder = [command_buffer computeCommandEncoder];
|
||||
encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
|
||||
}
|
||||
|
||||
const int nth = 32;
|
||||
|
||||
115
ggml-metal.metal
115
ggml-metal.metal
@@ -387,87 +387,90 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
}
|
||||
|
||||
// function for calculate inner product between a q4_0 block and 32 floats (yl), sumy is SUM(yl[i])
|
||||
float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl) {
|
||||
// function for calculate inner product between half a q4_0 block and 16 floats (yl), sumy is SUM(yl[i])
|
||||
// il indicates where the q4 quants begin (0 or QK4_0/4)
|
||||
// we assume that the yl's have been multiplied with the appropriate scale factor
|
||||
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
|
||||
inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
|
||||
float d = qb_curr->d;
|
||||
float4 acc = 0.f;
|
||||
device uint16_t * qs = ((device uint16_t *)qb_curr + 1);
|
||||
for (int i = 0; i < 16; i+=2) {
|
||||
acc[0] += yl[i] * (qs[i / 2] & 0x000F);
|
||||
acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
|
||||
acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
|
||||
acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
|
||||
float2 acc = 0.f;
|
||||
device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
|
||||
for (int i = 0; i < 8; i+=2) {
|
||||
acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
|
||||
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
|
||||
acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
|
||||
+ yl[i + 9] * (qs[i / 2] & 0xF000);
|
||||
}
|
||||
return d * (sumy * -8.f + acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f);
|
||||
return d * (sumy * -8.f + acc[0] + acc[1]);
|
||||
}
|
||||
|
||||
// function for calculate inner product between a q4_1 block and 32 floats (yl), sumy is SUM(yl[i])
|
||||
float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl) {
|
||||
// function for calculate inner product between half a q4_1 block and 16 floats (yl), sumy is SUM(yl[i])
|
||||
// il indicates where the q4 quants begin (0 or QK4_0/4)
|
||||
// we assume that the yl's have been multiplied with the appropriate scale factor
|
||||
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
|
||||
inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
|
||||
float d = qb_curr->d;
|
||||
float m = qb_curr->m;
|
||||
float4 acc = 0.f;
|
||||
device uint16_t * qs = ((device uint16_t *)qb_curr + 2);
|
||||
for (int i = 0; i < 16; i+=2) {
|
||||
acc[0] += yl[i] * (qs[i / 2] & 0x000F);
|
||||
acc[1] += yl[i + 16] * (qs[i / 2] & 0x00F0);
|
||||
acc[2] += yl[i + 1] * (qs[i / 2] & 0x0F00);
|
||||
acc[3] += yl[i + 17] * (qs[i / 2] & 0xF000);
|
||||
device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
|
||||
float2 acc = 0.f;
|
||||
for (int i = 0; i < 8; i+=2) {
|
||||
acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
|
||||
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
|
||||
acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
|
||||
+ yl[i + 9] * (qs[i / 2] & 0xF000);
|
||||
}
|
||||
return d * (acc[0] + acc[1]/16.f + acc[2]/256.f + acc[3]/4096.f) + sumy * m;
|
||||
return d * (acc[0] + acc[1]) + sumy * m;
|
||||
}
|
||||
|
||||
// putting them in the kernel cause a significant performance penalty
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
template<typename block_q_type>
|
||||
//Note: This is a template, but strictly speaking it only applies to
|
||||
// quantizations where the block size is 32. It also does not
|
||||
// giard against the number of rows not being divisible by
|
||||
// N_DST, so this is another explicit assumption of the implementation.
|
||||
template<typename block_q_type, int nr, int nsg, int nw>
|
||||
void mul_vec_q_n_f32(device const void * src0, device const float * src1, device float * dst,
|
||||
int64_t ne00, int64_t ne10, int64_t ne0, int64_t ne01,
|
||||
uint2 tgpig, uint tiisg, uint sgitg) {
|
||||
const int nb = ne00/QK4_0;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
device const block_q_type * x = (device const block_q_type *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
|
||||
const int first_row = (r0 * nsg + sgitg) * nr;
|
||||
device const block_q_type * x = (device const block_q_type *) src0 + first_row * nb;
|
||||
device const float * y = (device const float *) src1 + r1*ne10;
|
||||
float4 y_curr[8]; // src1 vector cache
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
thread float * yl=(thread float *)y_curr;
|
||||
float yl[16]; // src1 vector cache
|
||||
float sumf[nr]={0.f};
|
||||
|
||||
// each thread in a SIMD group deals with 1 block.
|
||||
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
|
||||
const int ix = tiisg/2;
|
||||
const int il = 8*(tiisg%2);
|
||||
|
||||
device const float * yb = y + ix * QK4_0 + il;
|
||||
|
||||
// each thread in a SIMD group deals with half a block.
|
||||
for (int ib = ix; ib < nb; ib += nw/2) {
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0)) + i);
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
sumy += yb[i] + yb[i+1];
|
||||
yl[i+0] = yb[i+ 0];
|
||||
yl[i+1] = yb[i+ 1]/256.f;
|
||||
sumy += yb[i+16] + yb[i+17];
|
||||
yl[i+8] = yb[i+16]/16.f;
|
||||
yl[i+9] = yb[i+17]/4096.f;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
sumf[row] += block_q_n_dot_y(x+(tiisg + row * nb + column * N_SIMDWIDTH), sumy, yl);
|
||||
for (int row = 0; row < nr; row++) {
|
||||
sumf[row] += block_q_n_dot_y(x+ib+row*nb, sumy, yl, il);
|
||||
}
|
||||
|
||||
yb += QK4_0 * 16;
|
||||
}
|
||||
|
||||
// from now loads two rows every time and 16 blocks per row
|
||||
int ir = tiisg / (N_SIMDWIDTH / 2);
|
||||
int ib = tiisg % (N_SIMDWIDTH / 2);
|
||||
for (int ind = 0; ind < (nb % N_SIMDWIDTH + N_SIMDWIDTH / 2 - 1)/(N_SIMDWIDTH / 2); ind++) {
|
||||
int nb_start = (nb / N_SIMDWIDTH) * N_SIMDWIDTH + ind * (N_SIMDWIDTH / 2); //where the left blocks start
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + (nb_start + ib) * QK4_0) + i);
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row+=2) {
|
||||
if (nb_start + ib < nb) {
|
||||
sumf[row + ir] += block_q_n_dot_y(x + (nb_start + ib + (row + ir) * nb), sumy, yl);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
const float tot = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && first_row + row < ne01) {
|
||||
dst[r1*ne0 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -483,7 +486,7 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
mul_vec_q_n_f32<block_q4_0>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
||||
mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q4_1_f32(
|
||||
@@ -497,7 +500,7 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
mul_vec_q_n_f32<block_q4_1>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
||||
mul_vec_q_n_f32<block_q4_1, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne10,ne0,ne01,tgpig,tiisg,sgitg);
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32(
|
||||
|
||||
325
k_quants.c
325
k_quants.c
@@ -1666,6 +1666,62 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
|
||||
#elif defined __AVX__
|
||||
|
||||
const __m128i m3 = _mm_set1_epi8(3);
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
uint32_t ud, um;
|
||||
const uint8_t * restrict db = (const uint8_t *)&ud;
|
||||
const uint8_t * restrict mb = (const uint8_t *)&um;
|
||||
|
||||
float summs = 0;
|
||||
|
||||
// TODO: optimize this
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
|
||||
const float dmin = -y[i].d * ggml_fp16_to_fp32(x[i].dmin);
|
||||
|
||||
const uint8_t * restrict q2 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const uint32_t * restrict sc = (const uint32_t *)x[i].scales;
|
||||
ud = (sc[0] >> 0) & 0x0f0f0f0f;
|
||||
um = (sc[0] >> 4) & 0x0f0f0f0f;
|
||||
|
||||
int32_t smin = mb[0] * y[i].bsums[0] + mb[1] * y[i].bsums[1] + mb[2] * y[i].bsums[2] + mb[3] * y[i].bsums[3];
|
||||
summs += dmin * smin;
|
||||
|
||||
const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
|
||||
const __m128i q2_0 = _mm_and_si128(q2bits, m3);
|
||||
const __m128i q2_1 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3);
|
||||
const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3);
|
||||
const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3);
|
||||
|
||||
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
|
||||
|
||||
const __m128i p0 = _mm_maddubs_epi16(q2_0, _mm256_extractf128_si256(q8_0, 0));
|
||||
const __m128i p1 = _mm_maddubs_epi16(q2_1, _mm256_extractf128_si256(q8_0, 1));
|
||||
const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
|
||||
const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
|
||||
|
||||
const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
|
||||
const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
|
||||
const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
|
||||
const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
|
||||
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[2]), _mm256_cvtepi32_ps(p_2)), acc);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[3]), _mm256_cvtepi32_ps(p_3)), acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
|
||||
#else
|
||||
|
||||
float sumf = 0;
|
||||
@@ -2295,6 +2351,93 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#elif defined __AVX__
|
||||
|
||||
const __m128i m3 = _mm_set1_epi8(3);
|
||||
const __m128i m1 = _mm_set1_epi8(1);
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
uint64_t aux64;
|
||||
|
||||
uint16_t aux16[2];
|
||||
const int8_t * aux8 = (const int8_t *)aux16;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
|
||||
|
||||
const uint8_t * restrict q3 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const uint16_t a = *(const uint16_t *)x[i].scales;
|
||||
aux16[0] = a & 0x0f0f;
|
||||
aux16[1] = (a >> 4) & 0x0f0f;
|
||||
|
||||
const __m128i scale_0 = _mm_set1_epi16(aux8[0] - 8);
|
||||
const __m128i scale_1 = _mm_set1_epi16(aux8[2] - 8);
|
||||
const __m128i scale_2 = _mm_set1_epi16(aux8[1] - 8);
|
||||
const __m128i scale_3 = _mm_set1_epi16(aux8[3] - 8);
|
||||
|
||||
memcpy(&aux64, x[i].hmask, 8);
|
||||
|
||||
__m128i q3h_0 = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
|
||||
__m128i q3h_1 = _mm_srli_epi16(q3h_0, 2);
|
||||
__m128i q3h_2 = _mm_srli_epi16(q3h_0, 4);
|
||||
__m128i q3h_3 = _mm_srli_epi16(q3h_0, 6);
|
||||
q3h_0 = _mm_slli_epi16(_mm_andnot_si128(q3h_0, m1), 2);
|
||||
q3h_1 = _mm_slli_epi16(_mm_andnot_si128(q3h_1, m1), 2);
|
||||
q3h_2 = _mm_slli_epi16(_mm_andnot_si128(q3h_2, m1), 2);
|
||||
q3h_3 = _mm_slli_epi16(_mm_andnot_si128(q3h_3, m1), 2);
|
||||
|
||||
// load low 2 bits
|
||||
const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
|
||||
|
||||
// prepare low and high bits
|
||||
const __m128i q3l_0 = _mm_and_si128(q3bits, m3);
|
||||
const __m128i q3l_1 = _mm_and_si128(_mm_srli_epi16(q3bits, 2), m3);
|
||||
const __m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits, 4), m3);
|
||||
const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits, 6), m3);
|
||||
|
||||
// load Q8 quants
|
||||
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
|
||||
|
||||
// Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm_maddubs_epi16,
|
||||
// and then subtract. The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set,
|
||||
// and 2 if the high bit was set)
|
||||
const __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, _mm256_extractf128_si256(q8_0, 0));
|
||||
const __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, _mm256_extractf128_si256(q8_0, 1));
|
||||
const __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, _mm256_extractf128_si256(q8_1, 0));
|
||||
const __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, _mm256_extractf128_si256(q8_1, 1));
|
||||
|
||||
__m128i p16_0 = _mm_maddubs_epi16(q3l_0, _mm256_extractf128_si256(q8_0, 0));
|
||||
__m128i p16_1 = _mm_maddubs_epi16(q3l_1, _mm256_extractf128_si256(q8_0, 1));
|
||||
__m128i p16_2 = _mm_maddubs_epi16(q3l_2, _mm256_extractf128_si256(q8_1, 0));
|
||||
__m128i p16_3 = _mm_maddubs_epi16(q3l_3, _mm256_extractf128_si256(q8_1, 1));
|
||||
|
||||
p16_0 = _mm_sub_epi16(p16_0, q8s_0);
|
||||
p16_1 = _mm_sub_epi16(p16_1, q8s_1);
|
||||
p16_2 = _mm_sub_epi16(p16_2, q8s_2);
|
||||
p16_3 = _mm_sub_epi16(p16_3, q8s_3);
|
||||
|
||||
// multiply with scales
|
||||
p16_0 = _mm_madd_epi16(scale_0, p16_0);
|
||||
p16_1 = _mm_madd_epi16(scale_1, p16_1);
|
||||
p16_2 = _mm_madd_epi16(scale_2, p16_2);
|
||||
p16_3 = _mm_madd_epi16(scale_3, p16_3);
|
||||
|
||||
p16_0 = _mm_add_epi32(p16_0, p16_2);
|
||||
p16_1 = _mm_add_epi32(p16_1, p16_3);
|
||||
__m256i p16 = _mm256_set_m128i(p16_1, p16_0);
|
||||
|
||||
// multiply with block scale and accumulate
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
|
||||
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
@@ -2781,6 +2924,60 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc) - summs;
|
||||
|
||||
#elif defined __AVX__
|
||||
|
||||
const __m128i m4 = _mm_set1_epi8(0xF);
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
float summs = 0;
|
||||
|
||||
uint16_t aux16[2];
|
||||
const uint8_t * scales = (const uint8_t *)aux16;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = ggml_fp16_to_fp32(x[i].d[0]) * y[i].d;
|
||||
const float m = ggml_fp16_to_fp32(x[i].d[1]) * y[i].d;
|
||||
const __m256 vd = _mm256_set1_ps(d);
|
||||
|
||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||
aux16[0] = a[0] & 0x0f0f;
|
||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||
|
||||
summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3]));
|
||||
|
||||
const uint8_t * restrict q4 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4);
|
||||
const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0);
|
||||
const __m128i q4bits_1 = _mm256_extractf128_si256(q4bits, 1);
|
||||
const __m128i q4_0 = _mm_and_si128(q4bits_0, m4);
|
||||
const __m128i q4_1 = _mm_and_si128(q4bits_1, m4);
|
||||
const __m128i q4_2 = _mm_and_si128(_mm_srli_epi16(q4bits_0, 4), m4);
|
||||
const __m128i q4_3 = _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4);
|
||||
|
||||
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
|
||||
|
||||
const __m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
|
||||
const __m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
|
||||
const __m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
|
||||
const __m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
|
||||
|
||||
const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
|
||||
const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
|
||||
|
||||
const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
|
||||
const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
|
||||
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc) - summs;
|
||||
|
||||
#else
|
||||
|
||||
uint8_t aux8[QK_K];
|
||||
@@ -3295,6 +3492,63 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#elif defined __AVX__
|
||||
|
||||
const __m128i m4 = _mm_set1_epi8(0xF);
|
||||
const __m128i mone = _mm_set1_epi8(1);
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const uint8_t * restrict q5 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
|
||||
|
||||
const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
|
||||
|
||||
const __m128i scale_0 = _mm_set1_epi16(x[i].scales[0]);
|
||||
const __m128i scale_1 = _mm_set1_epi16(x[i].scales[1]);
|
||||
const __m128i scale_2 = _mm_set1_epi16(x[i].scales[2]);
|
||||
const __m128i scale_3 = _mm_set1_epi16(x[i].scales[3]);
|
||||
|
||||
int64_t aux64;
|
||||
memcpy(&aux64, x[i].qh, 8);
|
||||
const __m128i haux128_0 = _mm_set_epi64x(aux64 >> 1, aux64);
|
||||
const __m128i haux128_1 = _mm_srli_epi16(haux128_0, 2);
|
||||
|
||||
const __m128i q5h_0 = _mm_slli_epi16(_mm_andnot_si128(haux128_0, mone), 4);
|
||||
const __m128i q5h_1 = _mm_slli_epi16(_mm_andnot_si128(haux128_1, mone), 4);
|
||||
const __m128i q5h_2 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_0, 4), mone), 4);
|
||||
const __m128i q5h_3 = _mm_slli_epi16(_mm_andnot_si128(_mm_srli_epi16(haux128_1, 4), mone), 4);
|
||||
|
||||
const __m128i q5l_0 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 0), m4);
|
||||
const __m128i q5l_1 = _mm_and_si128(_mm256_extractf128_si256(q5bits, 1), m4);
|
||||
const __m128i q5l_2 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 0), 4), m4);
|
||||
const __m128i q5l_3 = _mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q5bits, 1), 4), m4);
|
||||
|
||||
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
|
||||
|
||||
const __m128i p16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5l_0, _mm256_extractf128_si256(q8_0, 0)));
|
||||
const __m128i p16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5l_1, _mm256_extractf128_si256(q8_0, 1)));
|
||||
const __m128i p16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5l_2, _mm256_extractf128_si256(q8_1, 0)));
|
||||
const __m128i p16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5l_3, _mm256_extractf128_si256(q8_1, 1)));
|
||||
const __m128i s16_0 = _mm_madd_epi16(scale_0, _mm_maddubs_epi16(q5h_0, _mm256_extractf128_si256(q8_0, 0)));
|
||||
const __m128i s16_1 = _mm_madd_epi16(scale_1, _mm_maddubs_epi16(q5h_1, _mm256_extractf128_si256(q8_0, 1)));
|
||||
const __m128i s16_2 = _mm_madd_epi16(scale_2, _mm_maddubs_epi16(q5h_2, _mm256_extractf128_si256(q8_1, 0)));
|
||||
const __m128i s16_3 = _mm_madd_epi16(scale_3, _mm_maddubs_epi16(q5h_3, _mm256_extractf128_si256(q8_1, 1)));
|
||||
|
||||
const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
|
||||
const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
|
||||
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
|
||||
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
@@ -3857,6 +4111,77 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#elif defined __AVX__
|
||||
|
||||
const __m128i m4 = _mm_set1_epi8(0xF);
|
||||
const __m128i m2 = _mm_set1_epi8(3);
|
||||
const __m128i m32s = _mm_set1_epi8(32);
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
const float d = y[i].d * ggml_fp16_to_fp32(x[i].d);
|
||||
|
||||
const uint8_t * restrict q4 = x[i].ql;
|
||||
const uint8_t * restrict qh = x[i].qh;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]);
|
||||
const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]);
|
||||
const __m64 scales_3 = _mm_set1_pi8(x[i].scales[2]);
|
||||
const __m64 scales_4 = _mm_set1_pi8(x[i].scales[3]);
|
||||
|
||||
__m128i sumi_0 = _mm_setzero_si128();
|
||||
__m128i sumi_1 = _mm_setzero_si128();
|
||||
|
||||
const __m128i scale_0 = _mm_set_epi64(scales_2, scales_1);
|
||||
const __m128i scale_1 = _mm_set_epi64(scales_4, scales_3);
|
||||
|
||||
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
|
||||
const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
|
||||
|
||||
const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH, m2), 4);
|
||||
const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 2), m2), 4);
|
||||
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 4), m2), 4);
|
||||
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH, 6), m2), 4);
|
||||
|
||||
const __m128i q4_0 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 0), m4), q4h_0);
|
||||
const __m128i q4_1 = _mm_or_si128(_mm_and_si128(_mm256_extractf128_si256(q4bits1, 1), m4), q4h_1);
|
||||
const __m128i q4_2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 0), 4), m4), q4h_2);
|
||||
const __m128i q4_3 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(_mm256_extractf128_si256(q4bits1, 1), 4), m4), q4h_3);
|
||||
|
||||
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
|
||||
|
||||
__m128i q8s_0 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 0));
|
||||
__m128i q8s_1 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_0, 1));
|
||||
__m128i q8s_2 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 0));
|
||||
__m128i q8s_3 = _mm_maddubs_epi16(m32s, _mm256_extractf128_si256(q8_1, 1));
|
||||
|
||||
__m128i p16_0 = _mm_maddubs_epi16(q4_0, _mm256_extractf128_si256(q8_0, 0));
|
||||
__m128i p16_1 = _mm_maddubs_epi16(q4_1, _mm256_extractf128_si256(q8_0, 1));
|
||||
__m128i p16_2 = _mm_maddubs_epi16(q4_2, _mm256_extractf128_si256(q8_1, 0));
|
||||
__m128i p16_3 = _mm_maddubs_epi16(q4_3, _mm256_extractf128_si256(q8_1, 1));
|
||||
|
||||
p16_0 = _mm_sub_epi16(p16_0, q8s_0);
|
||||
p16_1 = _mm_sub_epi16(p16_1, q8s_1);
|
||||
p16_2 = _mm_sub_epi16(p16_2, q8s_2);
|
||||
p16_3 = _mm_sub_epi16(p16_3, q8s_3);
|
||||
|
||||
p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
|
||||
p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
|
||||
p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
|
||||
p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
|
||||
|
||||
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
|
||||
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
|
||||
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
|
||||
#else
|
||||
|
||||
int8_t aux8[QK_K];
|
||||
|
||||
@@ -1720,6 +1720,9 @@ static bool llama_eval_internal(
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
if (lctx.ctx_metal && N == 1) {
|
||||
if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
|
||||
ggml_metal_graph_find_concurrency(lctx.ctx_metal,&gf);
|
||||
}
|
||||
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
||||
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
|
||||
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
||||
|
||||
Reference in New Issue
Block a user