make : fix indentation

ci : fix MNT realpath usage (#2250 )
make : support customized LLAMA_CUDA_NVCC and LLAMA_CUDA_CCBIN (#2275 )
2026-02-26 14:23:22 +02:00 · 2023-07-21 13:50:55 +03:00 · 2023-07-21 13:49:18 +03:00 · 2023-07-21 13:38:57 +03:00 · 2023-07-21 13:26:34 +03:00 · 2023-07-21 13:10:51 +03:00
14 changed files with 622 additions and 523 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -61,3 +61,12 @@ qnt-*.txt
 perf-*.txt

 examples/jeopardy/results.txt
+
+# Test binaries
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -186,16 +186,7 @@ if (LLAMA_BLAS)
                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
                # all Intel* libraries share the same include path
-                pkg_check_modules(DepBLAS mkl-sdl)
-                if (NOT DepBLAS)
-                    if (BUILD_SHARED_LIBS)
-                        set(LINK_METHOD dynamic)
-                    else()
-                        set(LINK_METHOD static)
-                    endif()
-                    string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
-                    pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp)
-                endif()
+                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
                # this doesn't provide pkg-config
                # suggest to assign BLAS_INCLUDE_DIRS on your own
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+
+# Binaries only useful for tests
+TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0

 default: $(BUILD_TARGETS)

@@ -90,6 +93,28 @@ ifeq ($(UNAME_S),Haiku)
 	CXXFLAGS += -pthread
 endif

+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+	_WIN32 := 1
+endif
+
+# library name prefix
+ifneq ($(_WIN32),1)
+	LIB_PRE := lib
+endif
+
+# Dynamic Shared Object extension
+ifneq ($(_WIN32),1)
+	DSO_EXT := .so
+else
+	DSO_EXT := .dll
+endif
+
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+	LWINSOCK2 := -lws2_32
+endif
+
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
@@ -168,8 +193,12 @@ ifdef LLAMA_CUBLAS
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
-	NVCC      = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef LLAMA_CUDA_NVCC
+	NVCC = $(LLAMA_CUDA_NVCC)
+else
+	NVCC = nvcc
+endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else
@@ -198,7 +227,9 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
-
+ifdef LLAMA_CUDA_CCBIN
+	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
@@ -294,7 +325,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)

 #
 # Examples
@@ -325,14 +356,14 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

-libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)


-embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput

 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -349,6 +380,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 # Tests
 #

+tests: $(TEST_TARGETS)
+
 benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
@@ -356,6 +389,23 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-.PHONY: tests clean
-tests:
-	bash ./tests/run-tests.sh
+tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
--- a/README.md
+++ b/README.md
@@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements
  ```bash
  mkdir build
  cd build
-  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
  cmake --build . --config Release
  ```

--- a/ci/run.sh
+++ b/ci/run.sh
@@ -243,7 +243,7 @@ function gg_sum_open_llama_3b_v2 {
 if [ -z $GG_BUILD_LOW_PERF ]; then
    rm -rf ${SRC}/models-mnt

-    mnt_models=$(realpath ${MNT}/models)
+    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -2,21 +2,21 @@
 set -e

 AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
 USER_NAME="${USER_NAME:-Anon}"

 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
+CTX_SIZE="${CTX_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"

 GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048
+--ctx_size "$CTX_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647
--temp 0.7
--top_k 40
--top_p 0.5)
+--temp 0.6
+--mirostat 2)

 if [ -n "$N_THREAD" ]; then
    GEN_OPTIONS+=(--threads "$N_THREAD")
@@ -24,16 +24,17 @@ fi

 ./main "${GEN_OPTIONS[@]}" \
    --model "$MODEL" \
+    --in-prefix " " \
+    --in-suffix "${AI_NAME}:" \
    --n_predict "$N_PREDICTS" \
    --color --interactive \
    --reverse-prompt "${USER_NAME}:" \
-    --prompt "
-This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+    --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
 ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
 ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only between ${USER_NAME} and ${AI_NAME}.
 The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
 ${AI_NAME} can only communicate through text, so she can't send images or videos.

--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -586,7 +586,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    lparams.n_batch      = params.n_batch;
    lparams.n_gpu_layers = params.n_gpu_layers;
    lparams.main_gpu     = params.main_gpu;
-    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.tensor_split = params.tensor_split;
    lparams.low_vram     = params.low_vram;
    lparams.seed         = params.seed;
    lparams.f16_kv       = params.memory_f16;
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -7,6 +7,9 @@ target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
--- a/flake.nix
+++ b/flake.nix
@@ -6,7 +6,7 @@
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
-        inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin;
+        inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
        osSpecific = with pkgs; [ openmpi ] ++
        (
          if isAarch64 && isDarwin then
@@ -22,14 +22,13 @@
              CoreGraphics
              CoreVideo
            ]
-          else if isx86_32 || isx86_64 then
-            with pkgs; [ mkl ]
          else
            with pkgs; [ openblas ]
        );
        pkgs = import nixpkgs { inherit system; };
+        nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
        llama-python =
-          pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
+          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
      in {
        packages.default = pkgs.stdenv.mkDerivation {
          name = "llama.cpp";
@@ -37,33 +36,21 @@
          postPatch = ''
            substituteInPlace ./ggml-metal.m \
              --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+            substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
          '';
-          nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
+          nativeBuildInputs = nativeBuildInputs;
          buildInputs = osSpecific;
          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
            ++ (if isAarch64 && isDarwin then [
              "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
              "-DLLAMA_METAL=ON"
-            ] else if isx86_32 || isx86_64 then [
-              "-DLLAMA_BLAS=ON"
-              "-DLLAMA_BLAS_VENDOR=Intel10_lp64"
            ] else [
              "-DLLAMA_BLAS=ON"
              "-DLLAMA_BLAS_VENDOR=OpenBLAS"
          ]);
-          installPhase = ''
-            runHook preInstall
-
-            install -D bin/* -t $out/bin
-            install -Dm644 lib*.so -t $out/lib
+          postInstall = ''
            mv $out/bin/main $out/bin/llama
            mv $out/bin/server $out/bin/llama-server
-
-            echo "#!${llama-python}/bin/python" > $out/bin/convert.py
-            cat ${./convert.py} >> $out/bin/convert.py
-            chmod +x $out/bin/convert.py
-
-            runHook postInstall
          '';
          meta.mainProgram = "llama";
        };
@@ -81,7 +68,7 @@
        };
        apps.default = self.apps.${system}.llama;
        devShells.default = pkgs.mkShell {
-          packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
+          packages = nativeBuildInputs ++ osSpecific;
        };
      });
 }
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -2512,6 +2512,9 @@ void ggml_init_cublas() {
 }

 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
    bool all_zero = true;
    for (int i = 0; i < g_device_count; ++i) {
        if (tensor_split[i] != 0.0f) {
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -676,8 +676,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
                                        } break;
                                    case GGML_TYPE_Q3_K:
@@ -694,8 +694,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                                        } break;
                                    case GGML_TYPE_Q5_K:
@@ -703,8 +703,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                                        } break;
                                    case GGML_TYPE_Q6_K:
@@ -712,8 +712,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                                        } break;
                                    default:
@@ -739,14 +739,17 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];

-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                                    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
-                                else if (src0t == GGML_TYPE_Q2_K ||
-                                         src0t == GGML_TYPE_Q3_K ||
-                                         src0t == GGML_TYPE_Q4_K ||
-                                         src0t == GGML_TYPE_Q5_K ||
-                                         src0t == GGML_TYPE_Q6_K) {
+                                else if (src0t == GGML_TYPE_Q5_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src0t == GGML_TYPE_Q6_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src0t == GGML_TYPE_Q3_K) {
                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                } else {
@@ -792,7 +795,7 @@ void ggml_metal_graph_compute(

                            const float eps = 1e-6f;

-                            const int nth = 256;
+                            const int nth = 512;

                            [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -800,7 +803,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                            [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                            [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];

                            const int64_t nrows = ggml_nrows(src0);

--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/llama.cpp
+++ b/llama.cpp
@@ -555,7 +555,9 @@ struct llama_file_loader {
            }

            // skip to the next multiple of 32 bytes
-            file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+            }

            tensor.file_off = file.tell();
            tensor.name = name;
@@ -847,7 +849,7 @@ struct llama_context_params llama_context_default_params() {
        /*.n_batch                     =*/ 512,
        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
-        /*.tensor_split                =*/ {0},
+        /*.tensor_split                =*/ nullptr,
        /*.rope_freq_base              =*/ 10000.0f,
        /*.rope_freq_scale             =*/ 1.0f,
        /*.progress_callback           =*/ nullptr,
@@ -875,6 +877,10 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
    return result;
 }

+int llama_max_devices() {
+    return LLAMA_MAX_DEVICES;
+}
+
 bool llama_mmap_supported() {
    return llama_mmap::SUPPORTED;
 }
@@ -1283,7 +1289,7 @@ static bool llama_model_load(
        int n_batch,
        int n_gpu_layers,
        int main_gpu,
-        float * tensor_split,
+        const float * tensor_split,
        float rope_freq_base,
        float rope_freq_scale,
        bool low_vram,
--- a/llama.h
+++ b/llama.h
@@ -88,7 +88,8 @@ extern "C" {
        int32_t  n_batch;                      // prompt processing batch size
        int32_t  n_gpu_layers;                 // number of layers to store in VRAM
        int32_t  main_gpu;                     // the GPU that is used for scratch and small tensors
-        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;  // RoPE base frequency
@@ -153,6 +154,8 @@ extern "C" {
        int32_t n_eval;
    };

+    LLAMA_API int llama_max_devices();
+
    LLAMA_API struct llama_context_params llama_context_default_params();
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
Author	SHA1	Message	Date
Georgi Gerganov	a814d04f81	make : fix indentation	2023-07-21 13:50:55 +03:00
Georgi Gerganov	4c013bb738	ci : fix MNT realpath usage (#2250 )	2023-07-21 13:49:18 +03:00
Sky Yan	42c7c2e2e9	make : support customized LLAMA_CUDA_NVCC and LLAMA_CUDA_CCBIN (#2275 ) Under certain environments, nvcc and gcc are installed under a customized path rather than the standard path Co-authored-by: Yan Lin <yanlin@baidu.com>	2023-07-21 13:38:57 +03:00
wzy	78a3d13424	flake : remove intel mkl from flake.nix due to missing files (#2277 ) NixOS's mkl misses some libraries like mkl-sdl.pc. See #2261 Currently NixOS doesn't have intel C compiler (icx, icpx). See https://discourse.nixos.org/t/packaging-intel-math-kernel-libraries-mkl/975 So remove it from flake.nix Some minor changes: - Change pkgs.python310 to pkgs.python3 to keep latest - Add pkgconfig to devShells.default - Remove installPhase because we have `cmake --install` from #2256	2023-07-21 13:26:34 +03:00
Georgi Gerganov	ae178ab46b	llama : make tensor_split ptr instead of array (#2272 )	2023-07-21 13:10:51 +03:00
Jiří Podivín	54e3bc76fe	make : add new target for test binaries (#2244 ) Programs in the tests directory are now built with target tests and placed in the same location. * clean target was expanded to remove new binaries * test target binaries are listed in a variable * Locations of binaries were added to the .gitignore Signed-off-by: Jiri Podivin <jpodivin@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-07-21 13:09:16 +03:00
Hatsune Miku	019fe257bb	MIKU MAYHEM: Upgrading the Default Model for Maximum Fun 🎉 (#2287 ) * Miku.sh: Set default model to llama-2-7b-chat * Miku.sh: Set ctx_size to 4096 * Miku.sh: Add in-prefix/in-suffix opts * Miku.sh: Switch sampler to mirostat_v2 and tiny prompt improvements	2023-07-21 11:13:18 +03:00
Kawrakow	e68c96f7fe	Faster Q2_K on Metal (#2297 ) * Faster Q2_K on Metal * Deleting unnoticed and dangerous trailing white space * Fixed bug in new metal Q2_K implementation --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2023-07-21 10:44:40 +03:00
Przemysław Pawełczyk	9cf022a188	make : fix embdinput library and server examples building on MSYS2 (#2235 ) * make : fix embdinput library and server examples building on MSYS2 * cmake : fix server example building on MSYS2	2023-07-21 10:42:21 +03:00
Kawrakow	e782c9e735	Faster Q5_K and Q6_K on Metal (#2294 ) * Faster Q6_K on Metal * Faster Q5_K on Metal * Another Q5_K speedup --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2023-07-20 18:19:45 +03:00
Kawrakow	785829dfe8	Faster Q4_K on Metal (#2290 ) Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>	2023-07-20 15:18:43 +03:00
Georgi Gerganov	fff0e0eafe	llama : fix regression from #2000 - could not load no-mmap models	2023-07-20 13:47:26 +03:00
Shouzheng Liu	417a85a001	metal: minor q4 optimization and reduce code size (#2248 ) * metal: use uint16_t instead of uint8_t. Apple GPU doesn't like uint8_t. For every operation on uint8_t the gpu needs to copy the uint8_t to an empty 16 bit register, then it can issue other instructions. For the matrix-vector multiplication kernel only, we observed a 340~350 GB/s memory read speed on M1 Max after this commit, which is very close to the reported hardware limit. * metal: update rms_norm kernel This commit doubles the speed of rms_norm operations by using 512 threads per threadgroup, combining with SIMD primitives to minimize the need for thread group barriers. * metal: use template to reduce size Revert modifications on block_q4_0 and block_q4_1.	2023-07-20 13:32:22 +03:00
Rinne	294f424554	llama : extend API to get max devices at runtime (#2253 )	2023-07-19 10:06:40 +03:00