mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-03-05 14:33:24 +02:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
65c2c1c5ab | ||
|
|
80834daecf | ||
|
|
a40f2b656f | ||
|
|
d119c04c15 | ||
|
|
8781013ef6 | ||
|
|
7ddf185537 | ||
|
|
ee66942d7e | ||
|
|
111163e246 | ||
|
|
8b428c9bc8 | ||
|
|
578d8c8f5c | ||
|
|
b541b4f0b1 |
14
.github/workflows/build.yml
vendored
14
.github/workflows/build.yml
vendored
@@ -265,17 +265,17 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'noavx'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
@@ -406,6 +406,7 @@ jobs:
|
||||
id: cuda-toolkit
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
method: 'network'
|
||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||
|
||||
- name: Build
|
||||
@@ -413,7 +414,7 @@ jobs:
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
@@ -467,6 +468,7 @@ jobs:
|
||||
with:
|
||||
operating_system: freebsd
|
||||
version: '13.2'
|
||||
hypervisor: 'qemu'
|
||||
run: |
|
||||
sudo pkg update
|
||||
sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
|
||||
|
||||
@@ -80,6 +80,8 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
|
||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
"llama: max. batch size for using peer access")
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
@@ -304,6 +306,7 @@ if (LLAMA_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_F16)
|
||||
endif()
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
|
||||
19
Makefile
19
Makefile
@@ -368,6 +368,11 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
|
||||
else
|
||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||
endif
|
||||
ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
||||
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
|
||||
else
|
||||
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
|
||||
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
|
||||
#ifdef LLAMA_CUDA_CUBLAS
|
||||
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
|
||||
#endif # LLAMA_CUDA_CUBLAS
|
||||
@@ -509,22 +514,22 @@ main: examples/main/main.cpp build-info.h ggml.
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
simple: examples/simple/simple.cpp ggml.o llama.o common.o $(OBJS)
|
||||
simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
|
||||
quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
|
||||
perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
|
||||
embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o common.o $(OBJS)
|
||||
save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
|
||||
@@ -577,7 +582,7 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
|
||||
|
||||
tests: $(TEST_TARGETS)
|
||||
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
|
||||
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
./$@
|
||||
|
||||
|
||||
15
README.md
15
README.md
@@ -391,13 +391,14 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
<!---
|
||||
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
|
||||
--->
|
||||
| Option | Legal values | Default | Description |
|
||||
|-------------------------|------------------------|---------|-------------|
|
||||
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
|
||||
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
| Option | Legal values | Default | Description |
|
||||
|--------------------------------|------------------------|---------|-------------|
|
||||
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
|
||||
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink, enabling peer access for larger batch sizes is potentially beneficial. |
|
||||
|
||||
- #### hipBLAS
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#define LOG_NO_FILE_LINE_FUNCTION
|
||||
#include "log.h"
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
|
||||
@@ -20,7 +21,7 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
@@ -31,19 +32,19 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
|
||||
ggml_graph_compute(graph, &plan);
|
||||
}
|
||||
|
||||
float tensor_sum_elements(const ggml_tensor * tensor) {
|
||||
float sum = 0;
|
||||
if (tensor->type==GGML_TYPE_F32) {
|
||||
static float tensor_sum_elements(const ggml_tensor * tensor) {
|
||||
double sum = 0;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
for (int j = 0; j < tensor->ne[1]; j++) {
|
||||
for (int k = 0; k < tensor->ne[0]; k++) {
|
||||
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
|
||||
sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
||||
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
||||
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
|
||||
tensor->type, ggml_type_name(tensor->type),
|
||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
|
||||
@@ -58,7 +59,7 @@ struct benchmark_params_struct {
|
||||
int32_t n_iterations = 10;
|
||||
};
|
||||
|
||||
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
||||
static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
@@ -125,12 +126,15 @@ int main(int argc, char ** argv) {
|
||||
|
||||
//printf("Memsize required = %i\n", sizex*sizex);
|
||||
|
||||
// TODO: perform the bench for all types or for a user specified type
|
||||
const ggml_type qtype = GGML_TYPE_Q4_1;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
||||
ctx_size += 1024*1024*16;
|
||||
@@ -163,7 +167,7 @@ int main(int argc, char ** argv) {
|
||||
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
|
||||
ggml_set_f32(m2, 2.0f);
|
||||
|
||||
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
|
||||
printf("\n------ Test 1 - Matrix Mult via F32 code\n");
|
||||
// printf("Creating new tensor m11xm2\n");
|
||||
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
||||
|
||||
@@ -181,17 +185,16 @@ int main(int argc, char ** argv) {
|
||||
|
||||
TENSOR_DUMP(gf.nodes[0]);
|
||||
|
||||
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
|
||||
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
||||
|
||||
int32_t nelements = sizex*sizey;
|
||||
int32_t ne[2] = { sizex, sizey };
|
||||
|
||||
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||
|
||||
// Set up the benchmark matrices
|
||||
// printf("Creating new tensor q11 & Running quantize\n");
|
||||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
|
||||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
|
||||
|
||||
// Set up the compute graph
|
||||
// printf("Creating new tensor q31\n");
|
||||
@@ -202,8 +205,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||
// printf("Creating new tensor q12 & Running quantize\n");
|
||||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
|
||||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
|
||||
|
||||
// printf("Creating new tensor q32\n");
|
||||
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
||||
@@ -220,7 +223,7 @@ int main(int argc, char ** argv) {
|
||||
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
|
||||
|
||||
|
||||
// Let's use the F32 result from above as a reference for the q4_0 multiplication
|
||||
// Let's use the F32 result from above as a reference for the quantized multiplication
|
||||
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
|
||||
|
||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
||||
@@ -250,7 +253,7 @@ int main(int argc, char ** argv) {
|
||||
// Check that the matrix multiplication result is in the right ballpark
|
||||
// We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
|
||||
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
|
||||
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||
|
||||
if (delta > allowed_delta) {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "embd-input.h"
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -52,7 +52,8 @@
|
||||
in
|
||||
{
|
||||
packages.default = pkgs.stdenv.mkDerivation {
|
||||
inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
|
||||
inherit name src meta postPatch nativeBuildInputs postInstall;
|
||||
buildInputs = osSpecific;
|
||||
cmakeFlags = cmakeFlags
|
||||
++ (if isAarch64 && isDarwin then [
|
||||
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
|
||||
|
||||
53
ggml-cuda.cu
53
ggml-cuda.cu
@@ -31,6 +31,9 @@
|
||||
#define cublasSetStream hipblasSetStream
|
||||
#define cublasSgemm hipblasSgemm
|
||||
#define cublasStatus_t hipblasStatus_t
|
||||
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
||||
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
||||
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
||||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
@@ -424,6 +427,10 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
||||
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
|
||||
#endif
|
||||
|
||||
#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
|
||||
#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
|
||||
#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
|
||||
|
||||
#define MUL_MAT_SRC1_COL_STRIDE 128
|
||||
|
||||
#define MAX_STREAMS 8
|
||||
@@ -6258,6 +6265,43 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
|
||||
}
|
||||
}
|
||||
|
||||
// Enables or disables peer access between the main device and every other
// device, depending on the batch size.
//
// n_tokens: batch size of the current operation. Peer access is wanted when
//           n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE and unwanted otherwise.
//
// The last requested state is cached in a function-local static, so repeated
// calls with the same outcome return immediately without touching the devices.
// NOTE(review): the static flag is read and written without any visible
// synchronization — confirm callers only invoke this from a single thread.
void ggml_cuda_set_peer_access(const int n_tokens) {
|
||||
static bool peer_access_enabled = false;
|
||||
|
||||
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
|
||||
|
||||
// Nothing to do when the requested state matches the cached one.
if (peer_access_enabled == enable_peer_access) {
|
||||
return;
|
||||
}
|
||||
|
||||
// NOTE(review): the device loop below is compiled only when NDEBUG is defined;
// without NDEBUG no peer access call is made, yet the cached flag at the end
// of the function is still updated — confirm this is intentional.
#ifdef NDEBUG
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||
|
||||
for (int id_other = 0; id_other < g_device_count; ++id_other) {
|
||||
if (id == id_other) {
|
||||
continue;
|
||||
}
|
||||
// Only pairs that involve the main device are considered.
if (id != g_main_device && id_other != g_main_device) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int can_access_peer;
|
||||
CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
|
||||
// Pairs that report no peer capability are left untouched.
if (can_access_peer) {
|
||||
if (enable_peer_access) {
|
||||
CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
|
||||
} else {
|
||||
CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // NDEBUG
|
||||
|
||||
// Remember the state that was just requested for the early-return check above.
peer_access_enabled = enable_peer_access;
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_mul_mat(
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
|
||||
const bool convert_src1_to_q8_1) {
|
||||
@@ -6282,6 +6326,8 @@ static void ggml_cuda_op_mul_mat(
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
ggml_cuda_set_peer_access(ne11);
|
||||
|
||||
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
|
||||
|
||||
@@ -6970,6 +7016,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
|
||||
return;
|
||||
}
|
||||
if (g_scratch_buffer == nullptr) {
|
||||
ggml_cuda_set_device(g_main_device);
|
||||
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
||||
}
|
||||
|
||||
@@ -7009,7 +7056,7 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
|
||||
ggml_cuda_assign_buffers_impl(tensor, false, true, false);
|
||||
}
|
||||
|
||||
void ggml_cuda_set_main_device(int main_device) {
|
||||
void ggml_cuda_set_main_device(const int main_device) {
|
||||
if (main_device >= g_device_count) {
|
||||
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
|
||||
main_device, g_device_count, g_main_device);
|
||||
@@ -7023,11 +7070,11 @@ void ggml_cuda_set_main_device(int main_device) {
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_set_mul_mat_q(bool mul_mat_q) {
|
||||
void ggml_cuda_set_mul_mat_q(const bool mul_mat_q) {
|
||||
g_mul_mat_q = mul_mat_q;
|
||||
}
|
||||
|
||||
void ggml_cuda_set_scratch_size(size_t scratch_size) {
|
||||
void ggml_cuda_set_scratch_size(const size_t scratch_size) {
|
||||
g_scratch_size = scratch_size;
|
||||
}
|
||||
|
||||
|
||||
12
llama.cpp
12
llama.cpp
@@ -927,6 +927,7 @@ enum e_model {
|
||||
|
||||
static const size_t kB = 1024;
|
||||
static const size_t MB = kB*kB;
|
||||
static const size_t GB = kB*kB*kB;
|
||||
|
||||
// default hparams (LLaMA 7B)
|
||||
struct llama_hparams {
|
||||
@@ -1280,6 +1281,7 @@ struct llama_model_loader {
|
||||
int n_created = 0;
|
||||
|
||||
int64_t n_elements = 0;
|
||||
size_t n_bytes = 0;
|
||||
|
||||
bool use_mmap = false;
|
||||
|
||||
@@ -1312,6 +1314,7 @@ struct llama_model_loader {
|
||||
const char * name = gguf_get_tensor_name(ctx_gguf, i);
|
||||
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
|
||||
n_elements += ggml_nelements(t);
|
||||
n_bytes += ggml_nbytes(t);
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
|
||||
@@ -1909,7 +1912,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
|
||||
LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
|
||||
LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
|
||||
LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
|
||||
if (ml.n_bytes < GB) {
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
||||
} else {
|
||||
LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
|
||||
}
|
||||
|
||||
// general kv
|
||||
LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
|
||||
@@ -3495,7 +3503,7 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
|
||||
ggml_allocr_alloc(lctx.alloc, token);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
memcpy(token->data, embd, N * n_embd * ggml_element_size(inpL));
|
||||
memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user