mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
9 Commits
master-4e7
...
master-697
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
697966680b | ||
|
|
27ad57a69b | ||
|
|
32c5411631 | ||
|
|
ff5d58faec | ||
|
|
b782422a3e | ||
|
|
1cbf561466 | ||
|
|
975221e954 | ||
|
|
4523d10d0c | ||
|
|
680e6f9177 |
@@ -10,13 +10,13 @@ shift
|
||||
# Join the remaining arguments into a single string
|
||||
arg2="$@"
|
||||
|
||||
if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
|
||||
python3 ./convert.py $arg2
|
||||
elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
|
||||
./quantize $arg2
|
||||
elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
|
||||
./main $arg2
|
||||
elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
|
||||
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
|
||||
python3 ./convert.py "$arg2"
|
||||
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
|
||||
./quantize "$arg2"
|
||||
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
|
||||
./main "$arg2"
|
||||
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
|
||||
echo "Converting PTH to GGML..."
|
||||
for i in `ls $1/$2/ggml-model-f16.bin*`; do
|
||||
if [ -f "${i/f16/q4_0}" ]; then
|
||||
@@ -26,8 +26,8 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
|
||||
./quantize "$i" "${i/f16/q4_0}" q4_0
|
||||
fi
|
||||
done
|
||||
elif [[ $arg1 == '--server' || $arg1 == '-s' ]]; then
|
||||
./server $arg2
|
||||
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
|
||||
./server "$arg2"
|
||||
else
|
||||
echo "Unknown command: $arg1"
|
||||
echo "Available commands: "
|
||||
|
||||
@@ -285,6 +285,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.lora_adapter = argv[i];
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -520,7 +521,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
|
||||
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
|
||||
@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
|
||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
|
||||
@@ -16,7 +16,7 @@ Command line options:
|
||||
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
|
||||
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
|
||||
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
|
||||
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
|
||||
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
||||
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
|
||||
|
||||
@@ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
|
||||
fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
@@ -820,6 +820,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
break;
|
||||
}
|
||||
params.lora_adapter = argv[i];
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--lora-base")
|
||||
{
|
||||
|
||||
108
ggml-cuda.cu
108
ggml-cuda.cu
@@ -212,6 +212,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
||||
|
||||
#define CUDA_ADD_BLOCK_SIZE 256
|
||||
#define CUDA_MUL_BLOCK_SIZE 256
|
||||
#define CUDA_GELU_BLOCK_SIZE 256
|
||||
#define CUDA_SILU_BLOCK_SIZE 256
|
||||
#define CUDA_CPY_BLOCK_SIZE 32
|
||||
#define CUDA_SCALE_BLOCK_SIZE 256
|
||||
@@ -266,6 +267,19 @@ static __global__ void mul_f32(const float * x, const float * y, float * dst, co
|
||||
dst[i] = x[i] * y[i%ky];
|
||||
}
|
||||
|
||||
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
||||
const float GELU_COEF_A = 0.044715f;
|
||||
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
|
||||
float xi = x[i];
|
||||
dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
|
||||
}
|
||||
|
||||
static __global__ void silu_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
@@ -1653,6 +1667,40 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int half_n_dims = ncols/4;
|
||||
|
||||
if (col >= half_n_dims) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int i = row*ncols + col;
|
||||
|
||||
const float col_theta_scale = powf(theta_scale, col);
|
||||
|
||||
const float theta = p*col_theta_scale;
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + half_n_dims];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
|
||||
|
||||
const float block_theta = block_p*col_theta_scale;
|
||||
const float sin_block_theta = sinf(block_theta);
|
||||
const float cos_block_theta = cosf(block_theta);
|
||||
|
||||
const float x2 = x[i + half_n_dims * 2];
|
||||
const float x3 = x[i + half_n_dims * 3];
|
||||
|
||||
dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
|
||||
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
|
||||
}
|
||||
|
||||
static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) {
|
||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int row = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
@@ -1733,6 +1781,11 @@ static void mul_f32_cuda(const float * x, const float * y, float * dst, const in
|
||||
mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
|
||||
}
|
||||
|
||||
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
||||
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
|
||||
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
@@ -2045,6 +2098,14 @@ static void rope_f32_cuda(const float * x, float * dst, const int ncols, const i
|
||||
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, theta_scale);
|
||||
}
|
||||
|
||||
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
||||
GGML_ASSERT(nrows % 4 == 0);
|
||||
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
||||
const int num_blocks_x = (ncols + 4*CUDA_ROPE_BLOCK_SIZE - 1) / (4*CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(num_blocks_x, nrows, 1);
|
||||
rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p, block_p, theta_scale);
|
||||
}
|
||||
|
||||
static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) {
|
||||
const dim3 block_dims(CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1, 1);
|
||||
const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
|
||||
@@ -2280,7 +2341,7 @@ inline void ggml_cuda_op_add(
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t i01_diff = i01_high - i01_low;
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
// const int64_t ne10 = src1->ne[0];
|
||||
|
||||
// compute
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
@@ -2327,6 +2388,28 @@ inline void ggml_cuda_op_mul(
|
||||
(void) i02;
|
||||
}
|
||||
|
||||
inline void ggml_cuda_op_gelu(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
||||
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
||||
cudaStream_t & cudaStream_main){
|
||||
|
||||
GGML_ASSERT(src0_ddf_i != nullptr);
|
||||
GGML_ASSERT(dst_ddf_i != nullptr);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t i01_diff = i01_high - i01_low;
|
||||
|
||||
// compute
|
||||
gelu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
|
||||
|
||||
(void) src1;
|
||||
(void) dst;
|
||||
(void) src0_ddq_i;
|
||||
(void) src1_ddf_i;
|
||||
(void) i02;
|
||||
(void) i1;
|
||||
}
|
||||
|
||||
inline void ggml_cuda_op_silu(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
||||
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
||||
@@ -2577,13 +2660,21 @@ inline void ggml_cuda_op_rope(
|
||||
const int n_past = ((int32_t *) src1->data)[0];
|
||||
const int n_dims = ((int32_t *) src1->data)[1];
|
||||
const int mode = ((int32_t *) src1->data)[2];
|
||||
GGML_ASSERT(mode == 0);
|
||||
const int n_ctx = ((int32_t *) src1->data)[3];
|
||||
|
||||
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
||||
const float p = ((mode & 1) == 0 ? n_past + i02 : i02);
|
||||
|
||||
bool is_glm = mode & 4;
|
||||
|
||||
// compute
|
||||
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
|
||||
if (is_glm) {
|
||||
const float id_p = min(p, n_ctx - 2.f);
|
||||
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
||||
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
||||
} else {
|
||||
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
|
||||
}
|
||||
|
||||
(void) dst;
|
||||
(void) src0_ddq_i;
|
||||
@@ -2986,6 +3077,11 @@ void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul, true, false); // TODO ggml_cuda_op needs modification for flatten
|
||||
}
|
||||
|
||||
void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_gelu, true, true);
|
||||
}
|
||||
|
||||
void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_silu, true, true);
|
||||
@@ -3382,6 +3478,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
||||
}
|
||||
func = ggml_cuda_mul;
|
||||
break;
|
||||
case GGML_OP_GELU:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cuda_gelu;
|
||||
break;
|
||||
case GGML_OP_SILU:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
||||
@@ -740,8 +740,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
|
||||
[encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q2_K ||
|
||||
src0t == GGML_TYPE_Q3_K ||
|
||||
|
||||
237
ggml-metal.metal
237
ggml-metal.metal
@@ -365,6 +365,10 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
}
|
||||
|
||||
// putting them in the kernel cause a significant performance penalty
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
kernel void kernel_mul_mat_q4_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -372,64 +376,83 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne0,
|
||||
threadgroup float * sum [[threadgroup(0)]],
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
const int nb = ne00/QK4_0;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q4_0 * x = (device const block_q4_0 *) src0 + r0*nb;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
device const block_q4_0 * x = (device const block_q4_0 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
|
||||
device const float * y = (device const float *) src1 + r1*ne10;
|
||||
block_q4_0 qb_curr, qb_next;
|
||||
float4 y_curr[8]; // src1 vector cache
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
thread float * yl=(thread float *)y_curr;
|
||||
|
||||
const int nth = tptg.x*tptg.y;
|
||||
const int ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
const int ix = tpitg.y/4; // 0 or 1
|
||||
const int iy = tpitg.y - 4*ix; // 0...3
|
||||
|
||||
const int first = 4 * iy;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
|
||||
|
||||
const float d = (float)x[i].d;
|
||||
|
||||
device const uint8_t * xl = x[i].qs + first;
|
||||
device const float * yl = y + i * QK4_0 + first;
|
||||
|
||||
float2 acc = {0.0f, 0.0f};
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
|
||||
acc[0] += yl[j] * (xl[j] & 0xF) + yl[j+16] * (xl[j] >> 4);
|
||||
acc[1] += yl[j] + yl[j+16];
|
||||
// bootstrap
|
||||
qb_curr = x[tiisg];
|
||||
// each thread in a SIMD group deals with 1 block.
|
||||
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
|
||||
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
sumy *= (-8.f);
|
||||
|
||||
sumf += d * (acc[0] - 8.f*acc[1]);
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
float d = qb_curr.d;
|
||||
float acc = sumy;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
sumf[row] += d * acc;
|
||||
qb_curr = qb_next;
|
||||
}
|
||||
}
|
||||
|
||||
sum[ith] = sumf;
|
||||
if (nb % N_SIMDWIDTH == 0) {
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
//
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%4 == 0) {
|
||||
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%16 == 0) {
|
||||
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith == 0) {
|
||||
for (int i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
dst[r1*ne0 + r0] = sum[0];
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
sumy *= (-8.f);
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
float d = qb_curr.d;
|
||||
float acc = sumy;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
if (tiisg < nb % N_SIMDWIDTH) {
|
||||
sumf[row] += d * acc;
|
||||
}
|
||||
qb_curr = qb_next;
|
||||
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -440,65 +463,83 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne0,
|
||||
threadgroup float * sum [[threadgroup(0)]],
|
||||
constant int64_t & ne01[[buffer(4)]],
|
||||
uint2 tgpig[[threadgroup_position_in_grid]],
|
||||
uint2 tpitg[[thread_position_in_threadgroup]],
|
||||
uint2 tptg[[threads_per_threadgroup]]) {
|
||||
const int nb = ne00/QK4_1;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
|
||||
device const block_q4_1 * x = (device const block_q4_1 *) src0 + r0*nb;
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
const int nb = ne00/QK4_0;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
device const block_q4_1 * x = (device const block_q4_1 *) src0 + (r0 * N_SIMDGROUP + sgitg) * N_DST * nb;
|
||||
device const float * y = (device const float *) src1 + r1*ne10;
|
||||
block_q4_1 qb_curr, qb_next;
|
||||
float4 y_curr[8]; // src1 vector cache
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
thread float * yl=(thread float *)y_curr;
|
||||
|
||||
const uint nth = tptg.x*tptg.y;
|
||||
const uint ith = tptg.y*tpitg.x + tpitg.y;
|
||||
|
||||
const int ix = tpitg.y/4; // 0 or 1
|
||||
const int iy = tpitg.y - 4*ix; // 0...3
|
||||
|
||||
const int first = 4 * iy;
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
for (int i = 2*tpitg.x + ix; i < nb; i += 2*tptg.x) {
|
||||
|
||||
const float d = (float)x[i].d;
|
||||
const float m = (float)x[i].m;
|
||||
|
||||
device const uint8_t * xl = x[i].qs + first;
|
||||
device const float * yl = y + i * QK4_1 + first;
|
||||
|
||||
float2 acc = {0.0f, 0.0f};
|
||||
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
|
||||
acc[0] += yl[j+ 0] * (d * (xl[j] & 0xF) + m);
|
||||
acc[1] += yl[j+16] * (d * (xl[j] >> 4) + m);
|
||||
// bootstrap
|
||||
qb_curr = x[tiisg];
|
||||
// each thread in a SIMD group deals with 1 block.
|
||||
for (int column = 0; column < nb / N_SIMDWIDTH; column++) {
|
||||
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + column * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
|
||||
sumf += acc[0] + acc[1];
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (column + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
const float d = qb_curr.d;
|
||||
const float m = qb_curr.m;
|
||||
float acc = 0.f;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
sumf[row] += d * acc + m * sumy;
|
||||
qb_curr = qb_next;
|
||||
}
|
||||
}
|
||||
|
||||
sum[ith] = sumf;
|
||||
if (nb % N_SIMDWIDTH == 0) {
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
//
|
||||
// Accumulate the sum from all threads in the threadgroup
|
||||
//
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%4 == 0) {
|
||||
sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith%16 == 0) {
|
||||
sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12];
|
||||
}
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
if (ith == 0) {
|
||||
for (uint i = 16; i < nth; i += 16) sum[0] += sum[i];
|
||||
dst[r1*ne0 + r0] = sum[0];
|
||||
float sumy = 0;
|
||||
for (int i = 0; i < QK4_0 / 4; i++) {
|
||||
y_curr[i] = *((device float4 *)(y + N_SIMDWIDTH * (tiisg + (nb / N_SIMDWIDTH) * QK4_0) + 4 * i));
|
||||
sumy += y_curr[i][0] + y_curr[i][1] + y_curr[i][2] + y_curr[i][3];
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
// prefetch next x block
|
||||
qb_next = x[tiisg + ((row + 1) % N_DST) * nb + (nb / N_SIMDWIDTH + ((row + 1) / N_DST)) * N_SIMDWIDTH];
|
||||
|
||||
// calculate
|
||||
const float d = qb_curr.d;
|
||||
const float m = qb_curr.m;
|
||||
float acc = 0.f;
|
||||
for (int i = 0; i < 16; i++) {
|
||||
acc += yl[i] * (qb_curr.qs[i] & 0xF) + yl[i+16] * (qb_curr.qs[i] >> 4);
|
||||
}
|
||||
if (tiisg < nb % N_SIMDWIDTH) {
|
||||
sumf[row] += d * acc + m * sumy;
|
||||
}
|
||||
qb_curr = qb_next;
|
||||
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && ((r0 * N_SIMDGROUP + sgitg) * N_DST + row) < ne01) {
|
||||
dst[r1*ne0 + (r0 * N_SIMDGROUP + sgitg) * N_DST + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
458
ggml.c
458
ggml.c
@@ -3787,6 +3787,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"CLAMP",
|
||||
"CONV_1D",
|
||||
"CONV_2D",
|
||||
"POOL_1D",
|
||||
"POOL_2D",
|
||||
|
||||
"FLASH_ATTN",
|
||||
"FLASH_FF",
|
||||
@@ -3805,7 +3807,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"CROSS_ENTROPY_LOSS_BACK",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
|
||||
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -3865,6 +3867,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"clamp(x)",
|
||||
"conv_1d(x)",
|
||||
"conv_2d(x)",
|
||||
"pool_1d(x)",
|
||||
"pool_2d(x)",
|
||||
|
||||
"flash_attn(x)",
|
||||
"flash_ff(x)",
|
||||
@@ -3883,7 +3887,9 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"cross_entropy_loss_back(x,y)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 66, "GGML_OP_COUNT != 66");
|
||||
static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
||||
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
||||
@@ -4162,10 +4168,9 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
|
||||
static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||
|
||||
return
|
||||
(t0->ne[0] == t1->ne[0]) &&
|
||||
(t0->ne[2] == t1->ne[2]) &&
|
||||
(t0->ne[3] == t1->ne[3]);
|
||||
return (t0->ne[0] == t1->ne[0]) &&
|
||||
(t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable
|
||||
(t1->ne[3]%t0->ne[3] == 0);
|
||||
}
|
||||
|
||||
static inline bool ggml_can_out_prod(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
|
||||
@@ -6030,8 +6035,8 @@ struct ggml_tensor * ggml_mul_mat(
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
|
||||
const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(a->n_dims, b->n_dims), ne);
|
||||
|
||||
result->op = GGML_OP_MUL_MAT;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
@@ -7167,7 +7172,6 @@ struct ggml_tensor* ggml_conv_2d(
|
||||
int d0,
|
||||
int d1) {
|
||||
|
||||
GGML_ASSERT(b->ne[3] == 1);
|
||||
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
||||
bool is_node = false;
|
||||
|
||||
@@ -7179,7 +7183,7 @@ struct ggml_tensor* ggml_conv_2d(
|
||||
const int64_t ne[4] = {
|
||||
ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
|
||||
ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
|
||||
a->ne[3], 1,
|
||||
a->ne[3], b->ne[3],
|
||||
};
|
||||
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
|
||||
@@ -7214,6 +7218,98 @@ struct ggml_tensor* ggml_conv_1d_ph(
|
||||
return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
|
||||
}
|
||||
|
||||
|
||||
// ggml_pool_*
|
||||
|
||||
static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
|
||||
return (ins + 2 * p - ks) / s + 1;
|
||||
}
|
||||
|
||||
// ggml_pool_2d
|
||||
|
||||
struct ggml_tensor* ggml_pool_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_op_pool op,
|
||||
int k0,
|
||||
int s0,
|
||||
int p0) {
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad) {
|
||||
GGML_ASSERT(false); // TODO: implement backward
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
const int64_t ne[3] = {
|
||||
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
|
||||
a->ne[1],
|
||||
};
|
||||
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
|
||||
((int32_t*)c->data)[0] = op;
|
||||
((int32_t*)c->data)[1] = k0;
|
||||
((int32_t*)c->data)[2] = s0;
|
||||
((int32_t*)c->data)[3] = p0;
|
||||
ggml_scratch_load(ctx);
|
||||
|
||||
result->op = GGML_OP_POOL_1D;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = c;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_pool_2d
|
||||
|
||||
struct ggml_tensor* ggml_pool_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_op_pool op,
|
||||
int k0,
|
||||
int k1,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1) {
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad) {
|
||||
GGML_ASSERT(false); // TODO: implement backward
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
const int64_t ne[3] = {
|
||||
ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
|
||||
ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
|
||||
a->ne[2],
|
||||
};
|
||||
struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
||||
|
||||
ggml_scratch_save(ctx);
|
||||
struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7);
|
||||
((int32_t*)c->data)[0] = op;
|
||||
((int32_t*)c->data)[1] = k0;
|
||||
((int32_t*)c->data)[2] = k1;
|
||||
((int32_t*)c->data)[3] = s0;
|
||||
((int32_t*)c->data)[4] = s1;
|
||||
((int32_t*)c->data)[5] = p0;
|
||||
((int32_t*)c->data)[6] = p1;
|
||||
ggml_scratch_load(ctx);
|
||||
|
||||
result->op = GGML_OP_POOL_2D;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = c;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_flash_attn
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn(
|
||||
@@ -10543,7 +10639,6 @@ static void ggml_compute_forward_rms_norm_back(
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_mul_mat
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
@@ -10587,17 +10682,19 @@ static void ggml_compute_forward_mul_mat(
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_ASSERT(ne02 == ne12);
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
GGML_ASSERT(ne2 == ne12);
|
||||
GGML_ASSERT(ne3 == ne13);
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == GGML_TYPE_SIZE[type]);
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
@@ -10608,16 +10705,16 @@ static void ggml_compute_forward_mul_mat(
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
GGML_ASSERT(ne0 == ne01);
|
||||
GGML_ASSERT(ne1 == ne11);
|
||||
GGML_ASSERT(ne2 == ne02);
|
||||
GGML_ASSERT(ne3 == ne03);
|
||||
|
||||
// nb01 >= nb00 - src0 is not transposed
|
||||
// compute by src0 rows
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
// TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
|
||||
// ref: https://github.com/ggerganov/ggml/pull/224
|
||||
GGML_ASSERT(ne02 == ne12);
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
@@ -10627,6 +10724,11 @@ static void ggml_compute_forward_mul_mat(
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
// TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
|
||||
// ref: https://github.com/ggerganov/ggml/pull/224
|
||||
GGML_ASSERT(ne02 == ne12);
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
@@ -10647,7 +10749,7 @@ static void ggml_compute_forward_mul_mat(
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
if (type != GGML_TYPE_F32) {
|
||||
float * const wdata = params->wdata;
|
||||
float * const wdata = params->wdata;
|
||||
ggml_to_float_t const to_float = type_traits[type].to_float;
|
||||
|
||||
size_t id = 0;
|
||||
@@ -10696,41 +10798,52 @@ static void ggml_compute_forward_mul_mat(
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by src0 rows using ggml_vec_dot_q
|
||||
// parallelize by src0 rows
|
||||
const int64_t dr = (ne01 + nth - 1)/nth;
|
||||
|
||||
// total rows in src0
|
||||
const int nr = ne01*ne02*ne03;
|
||||
const int64_t ir10 = dr*ith;
|
||||
const int64_t ir11 = MIN(ir10 + dr, ne01);
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
// src1 rows
|
||||
const int64_t nr1 = ne11*ne12*ne13;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
|
||||
|
||||
void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
|
||||
for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
|
||||
const int64_t i13 = (ir1/(ne12*ne11));
|
||||
const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
|
||||
const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// src0 indices
|
||||
const int i03 = ir/(ne02*ne01);
|
||||
const int i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
|
||||
const int64_t i03 = (ir0/(ne02));
|
||||
// Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
|
||||
// See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
|
||||
// GG: this is likely the correct way to broadcast, though need some more thought
|
||||
// therefore leaving the comments to remind us for now
|
||||
const int64_t i02 = (i12 / (ne12 / ne02));
|
||||
// Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
|
||||
// const int64_t i02 = (ir0 - i03*ne02);
|
||||
|
||||
const int i13 = i03;
|
||||
const int i12 = i02;
|
||||
const int64_t i1 = i11;
|
||||
const int64_t i2 = i12;
|
||||
const int64_t i3 = i13;
|
||||
|
||||
const int i0 = i01;
|
||||
const int i2 = i02;
|
||||
const int i3 = i03;
|
||||
const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
|
||||
|
||||
void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
|
||||
char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size));
|
||||
// desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
|
||||
// if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
|
||||
// the original src1 data pointer, so we should index using the indices directly
|
||||
// TODO: this is a bit of a hack, we should probably have a better way to handle this
|
||||
const char * src1_col = (const char *) wdata +
|
||||
(src1_cont || src1->type != vec_dot_type
|
||||
? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
|
||||
: (i11*nb11 + i12*nb12 + i13*nb13));
|
||||
|
||||
float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
|
||||
float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
|
||||
|
||||
for (int64_t ic = 0; ic < ne11; ++ic) {
|
||||
vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
|
||||
for (int64_t ir = ir10; ir < ir11; ++ir) {
|
||||
vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12879,12 +12992,13 @@ static void ggml_compute_forward_conv_1d(
|
||||
};
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_2d_sk_p0
|
||||
// ggml_compute_forward_conv_2d
|
||||
|
||||
static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
||||
static void ggml_compute_forward_conv_2d_f16_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
const struct ggml_tensor * opt0,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
@@ -12904,11 +13018,17 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
||||
// size of the convolution row - the kernel size unrolled across all channels
|
||||
const int ew0 = nk0*nk1*ne02;
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
|
||||
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
|
||||
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
|
||||
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
|
||||
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
|
||||
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
// TODO: fix this memset (wsize is overestimated)
|
||||
memset(params->wdata, 0, params->wsize);
|
||||
|
||||
// prepare source data (src1)
|
||||
@@ -12923,8 +13043,13 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
for (int ik1 = 0; ik1 < nk1; ik1++) {
|
||||
for (int ik0 = 0; ik0 < nk0; ik0++) {
|
||||
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
|
||||
GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
|
||||
const int idx0 = i0*s0 + ik0*d0 - p0;
|
||||
const int idx1 = i1*s1 + ik1*d1 - p1;
|
||||
|
||||
if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
|
||||
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
|
||||
GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -12951,32 +13076,36 @@ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
||||
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
|
||||
for (int i2 = ip0; i2 < ip1; i2++) {
|
||||
float * dst_data = (float *)((char *) dst->data + i2*nb2);
|
||||
for (int i3 = 0; i3 < ne3; i3++) {
|
||||
for (int i2 = ip0; i2 < ip1; i2++) {
|
||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
|
||||
|
||||
for (int i1 = 0; i1 < ne1; ++i1) {
|
||||
for (int i0 = 0; i0 < ne0; ++i0) {
|
||||
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
|
||||
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
|
||||
(ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
|
||||
for (int i1 = 0; i1 < ne1; ++i1) {
|
||||
for (int i0 = 0; i0 < ne0; ++i0) {
|
||||
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
|
||||
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
|
||||
(ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_sk_p0(
|
||||
static void ggml_compute_forward_conv_2d(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
const struct ggml_tensor * opt0,
|
||||
struct ggml_tensor * dst
|
||||
) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
|
||||
ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
//ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
|
||||
//ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst);
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
default:
|
||||
@@ -12986,31 +13115,164 @@ static void ggml_compute_forward_conv_2d_sk_p0(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_2d
|
||||
// ggml_compute_forward_pool_1d_sk_p0
|
||||
|
||||
static void ggml_compute_forward_conv_2d(
|
||||
static void ggml_compute_forward_pool_1d_sk_p0(
|
||||
const struct ggml_compute_params * params,
|
||||
const enum ggml_op_pool op,
|
||||
const struct ggml_tensor * src,
|
||||
const int k,
|
||||
struct ggml_tensor * dst) {
|
||||
assert(src->type == GGML_TYPE_F32);
|
||||
assert(params->ith == 0);
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
const char * cdata = (const char *)src->data;
|
||||
const char * const data_end = cdata + ggml_nbytes(src);
|
||||
float * drow = (float *)dst->data;
|
||||
|
||||
const int64_t rs = dst->ne[0];
|
||||
|
||||
while (cdata < data_end) {
|
||||
const float * const srow = (const float *)cdata;
|
||||
|
||||
int j = 0;
|
||||
|
||||
for (int64_t i = 0; i < rs; ++i) {
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: drow[i] = 0; break;
|
||||
case GGML_OP_POOL_MAX: drow[i] = -FLT_MAX; break;
|
||||
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
||||
}
|
||||
for (int ki = 0; ki < k; ++ki) {
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: drow[i] += srow[j]; break;
|
||||
case GGML_OP_POOL_MAX: if (srow[j] > drow[i]) drow[i] = srow[j]; break;
|
||||
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
||||
}
|
||||
++j;
|
||||
}
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: drow[i] /= k; break;
|
||||
case GGML_OP_POOL_MAX: break;
|
||||
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
||||
}
|
||||
}
|
||||
|
||||
cdata += src->nb[1];
|
||||
drow += rs;
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_pool_1d
|
||||
|
||||
static void ggml_compute_forward_pool_1d(
|
||||
const struct ggml_compute_params* params,
|
||||
const struct ggml_tensor* src0,
|
||||
const struct ggml_tensor* src1,
|
||||
const struct ggml_tensor* opt0,
|
||||
struct ggml_tensor* dst) {
|
||||
const int32_t s0 = ((const int32_t*)(opt0->data))[0];
|
||||
const int32_t s1 = ((const int32_t*)(opt0->data))[1];
|
||||
const int32_t p0 = ((const int32_t*)(opt0->data))[2];
|
||||
const int32_t p1 = ((const int32_t*)(opt0->data))[3];
|
||||
const int32_t d0 = ((const int32_t*)(opt0->data))[4];
|
||||
const int32_t d1 = ((const int32_t*)(opt0->data))[5];
|
||||
GGML_ASSERT(d0 == 1); // dilation not supported
|
||||
GGML_ASSERT(d1 == 1);
|
||||
GGML_ASSERT(opt0->ne[0] == 4);
|
||||
const int* opts = (const int*)opt0->data;
|
||||
enum ggml_op_pool op = opts[0];
|
||||
const int k0 = opts[1];
|
||||
const int s0 = opts[2];
|
||||
const int p0 = opts[3];
|
||||
GGML_ASSERT(p0 == 0); // padding not supported
|
||||
GGML_ASSERT(p1 == 0);
|
||||
GGML_ASSERT(k0 == s0); // only s = k supported
|
||||
|
||||
if (s0 == src0->ne[0] && s1 == src0->ne[1]) {
|
||||
ggml_compute_forward_conv_2d_sk_p0(params, src0, src1, dst);
|
||||
ggml_compute_forward_pool_1d_sk_p0(params, op, src0, k0, dst);
|
||||
}
|
||||
|
||||
// ggml_compute_forward_pool_2d_sk_p0
|
||||
|
||||
static void ggml_compute_forward_pool_2d_sk_p0(
|
||||
const struct ggml_compute_params * params,
|
||||
const enum ggml_op_pool op,
|
||||
const struct ggml_tensor * src,
|
||||
const int k0,
|
||||
const int k1,
|
||||
struct ggml_tensor * dst) {
|
||||
assert(src->type == GGML_TYPE_F32);
|
||||
assert(params->ith == 0);
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(false); // only stride equal to kernel size is supported
|
||||
};
|
||||
|
||||
const char * cdata = (const char*)src->data;
|
||||
const char * const data_end = cdata + ggml_nbytes(src);
|
||||
|
||||
const int64_t px = dst->ne[0];
|
||||
const int64_t py = dst->ne[1];
|
||||
const int64_t pa = px * py;
|
||||
|
||||
float * dplane = (float *)dst->data;
|
||||
|
||||
const int ka = k0 * k1;
|
||||
|
||||
while (cdata < data_end) {
|
||||
for (int oy = 0; oy < py; ++oy) {
|
||||
float * const drow = dplane + oy * px;
|
||||
for (int ox = 0; ox < px; ++ox) {
|
||||
float * const out = drow + ox;
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: *out = 0; break;
|
||||
case GGML_OP_POOL_MAX: *out = -FLT_MAX; break;
|
||||
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
||||
}
|
||||
|
||||
const int ix = ox * k0;
|
||||
const int iy = oy * k1;
|
||||
|
||||
for (int ky = 0; ky < k1; ++ky) {
|
||||
const float * const srow = (const float *)(cdata + src->nb[1] * (iy + ky));
|
||||
for (int kx = 0; kx < k0; ++kx) {
|
||||
int j = ix + kx;
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: *out += srow[j]; break;
|
||||
case GGML_OP_POOL_MAX: if (srow[j] > *out) *out = srow[j]; break;
|
||||
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
switch (op) {
|
||||
case GGML_OP_POOL_AVG: *out /= ka; break;
|
||||
case GGML_OP_POOL_MAX: break;
|
||||
case GGML_OP_POOL_COUNT: GGML_ASSERT(false); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cdata += src->nb[2];
|
||||
dplane += pa;
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_pool_2d
|
||||
|
||||
static void ggml_compute_forward_pool_2d(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * opt0,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(opt0->ne[0] == 7);
|
||||
const int* opts = (const int*)opt0->data;
|
||||
enum ggml_op_pool op = opts[0];
|
||||
const int k0 = opts[1];
|
||||
const int k1 = opts[2];
|
||||
const int s0 = opts[3];
|
||||
const int s1 = opts[4];
|
||||
const int p0 = opts[5];
|
||||
const int p1 = opts[6];
|
||||
GGML_ASSERT(p0 == 0);
|
||||
GGML_ASSERT(p1 == 0); // padding not supported
|
||||
GGML_ASSERT(k0 == s0);
|
||||
GGML_ASSERT(k1 == s1); // only s = k supported
|
||||
|
||||
ggml_compute_forward_pool_2d_sk_p0(params, op, src0, k0, k1, dst);
|
||||
}
|
||||
|
||||
|
||||
@@ -14794,6 +15056,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
||||
{
|
||||
ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
|
||||
} break;
|
||||
case GGML_OP_POOL_1D:
|
||||
{
|
||||
ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_POOL_2D:
|
||||
{
|
||||
ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
{
|
||||
const int32_t t = ggml_get_i32_1d(tensor->src[3], 0);
|
||||
@@ -15494,6 +15764,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_POOL_1D:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_POOL_2D:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
{
|
||||
struct ggml_tensor * flash_grad = NULL;
|
||||
@@ -16284,8 +16562,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
|
||||
GGML_ASSERT(node->src[1]->ne[3] == 1);
|
||||
|
||||
const int64_t ne00 = node->src[0]->ne[0]; // W
|
||||
const int64_t ne01 = node->src[0]->ne[1]; // H
|
||||
const int64_t ne02 = node->src[0]->ne[2]; // C
|
||||
@@ -16295,19 +16571,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||
const int64_t ne11 = node->src[1]->ne[1]; // H
|
||||
const int64_t ne12 = node->src[1]->ne[2]; // C
|
||||
|
||||
const int64_t ne0 = node->ne[0];
|
||||
const int64_t ne1 = node->ne[1];
|
||||
const int64_t ne2 = node->ne[2];
|
||||
const int64_t nk = ne00*ne01;
|
||||
const int64_t ew0 = nk * ne02;
|
||||
|
||||
UNUSED(ne02);
|
||||
UNUSED(ne03);
|
||||
UNUSED(nk);
|
||||
UNUSED(ne2);
|
||||
|
||||
size_t cur = 0;
|
||||
|
||||
if (node->src[0]->type == GGML_TYPE_F16 &&
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
|
||||
} else if (node->src[0]->type == GGML_TYPE_F32 &&
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)* (ne10*ne11*ne12);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
@@ -16315,6 +16594,11 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
||||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_POOL_1D:
|
||||
case GGML_OP_POOL_2D:
|
||||
{
|
||||
n_tasks = 1;
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
|
||||
27
ggml.h
27
ggml.h
@@ -368,6 +368,8 @@ extern "C" {
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_FF,
|
||||
@@ -1173,6 +1175,31 @@ extern "C" {
|
||||
int s,
|
||||
int d);
|
||||
|
||||
enum ggml_op_pool {
|
||||
GGML_OP_POOL_MAX,
|
||||
GGML_OP_POOL_AVG,
|
||||
GGML_OP_POOL_COUNT,
|
||||
};
|
||||
|
||||
GGML_API struct ggml_tensor* ggml_pool_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_op_pool op,
|
||||
int k0, // kernel size
|
||||
int s0, // stride
|
||||
int p0); // padding
|
||||
|
||||
GGML_API struct ggml_tensor* ggml_pool_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_op_pool op,
|
||||
int k0,
|
||||
int k1,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
|
||||
@@ -175,13 +175,13 @@ struct llama_mmap {
|
||||
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_PRIVATE;
|
||||
int flags = MAP_SHARED;
|
||||
// prefetch/readahead impairs performance on NUMA systems
|
||||
if (numa) { prefetch = 0; }
|
||||
#ifdef __linux__
|
||||
if (prefetch) { flags |= MAP_POPULATE; }
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
if (addr == MAP_FAILED) {
|
||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
@@ -223,7 +223,7 @@ struct llama_mmap {
|
||||
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
error = GetLastError();
|
||||
CloseHandle(hMapping);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user