llama : revert n_threads_batch logic

ggml-ci
llama : use n_threads_batch only when n_tokens >= 32
2026-04-23 16:37:33 +03:00 · 2023-11-27 21:47:35 +02:00 · 2023-11-27 20:49:17 +02:00 · 2023-11-27 20:48:27 +02:00
18 changed files with 106 additions and 171 deletions
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    ./main "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -36,8 +34,6 @@ else
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
-    echo "              See documentation for finetune for command-line parameters"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
    echo "  --server (-s): Run a model on the server"
--- a/.gitignore
+++ b/.gitignore
@@ -88,16 +88,15 @@ poetry.lock
 poetry.toml

 # Test binaries
-/tests/test-grammar-parser
-/tests/test-llama-grammar
-/tests/test-double-float
-/tests/test-grad0
-/tests/test-opt
-/tests/test-quantize-fns
-/tests/test-quantize-perf
-/tests/test-sampling
-/tests/test-tokenizer-0-llama
-/tests/test-tokenizer-0-falcon
-/tests/test-tokenizer-1-llama
-/tests/test-tokenizer-1-bpe
-/tests/test-rope
+tests/test-grammar-parser
+tests/test-llama-grammar
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0-llama
+tests/test-tokenizer-0-falcon
+tests/test-tokenizer-1-llama
+tests/test-tokenizer-1-bpe
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,6 @@ else()
 endif()

 # general
-option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
@@ -101,9 +100,6 @@ option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALO
 option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)

-# Required for relocatable CMake package
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
-
 #
 # Compile flags
 #
@@ -165,7 +161,7 @@ if (LLAMA_METAL)
    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)

    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
        ${FOUNDATION_LIBRARY}
--- a/25
+++ b/25
@@ -8,7 +8,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -30,7 +30,7 @@ ifeq '' '$(findstring clang,$(shell $(CC) --version))'
 	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
 else
 	CC_IS_CLANG=1
-	ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
+	ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
 		CC_IS_LLVM_CLANG=1
 	else
 		CC_IS_APPLE_CLANG=1
@@ -648,7 +648,7 @@ beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS)
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
@@ -701,28 +701,28 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
@@ -737,8 +737,5 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 tests/test-c.o: tests/test-c.c llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
--- a/README.md
+++ b/README.md
@@ -117,7 +117,6 @@ as the main playground for developing new features for the [ggml](https://github
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [semperai/amica](https://github.com/semperai/amica)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)

 ---

@@ -324,7 +323,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128

 ### BLAS Build

-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:

 - #### Accelerate Framework:

@@ -896,7 +895,7 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)

-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).

 #### Usage

--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -11,12 +11,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
    if(NOT IS_DIRECTORY "${GIT_DIR}")
        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
-        if (SLASH_POS EQUAL 0)
-            set(GIT_DIR "${REAL_GIT_DIR}")
-        else()
-            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
-        endif()
+        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
    endif()

    set(GIT_INDEX "${GIT_DIR}/index")
@@ -31,7 +26,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
--- a/convert.py
+++ b/convert.py
@@ -267,7 +267,7 @@ class Params:
            n_ctx = 2048

        return Params(
-            n_vocab          = model["tok_embeddings.weight"].shape[0],
+            n_vocab          = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
            n_embd           = config["dim"],
            n_layer          = config["n_layers"],
            n_ctx            = n_ctx,
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@@ -1,4 +1,4 @@
 This is a swift clone of `examples/batched`.

 $ `make`
-$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -5,7 +5,7 @@ import json
 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
+from transformers import CLIPModel, CLIPProcessor

 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -78,19 +78,11 @@ ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
-ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
-                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
-# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
-default_image_mean = [0.48145466, 0.4578275, 0.40821073]
-default_image_std = [0.26862954, 0.26130258, 0.27577711]
-ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
-ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

-# with proper
 args = ap.parse_args()


@@ -104,22 +96,15 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir

-if args.clip_model_is_vision:
-    vocab = None
-    tokens = None
-else:
-    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-        vocab = json.load(f)
-        tokens = [key for key in vocab]
+
+with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+    vocab = json.load(f)
+    tokens = [key for key in vocab]

 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
-    if args.clip_model_is_vision:
-        v_hparams = config
-        t_hparams = None
-    else:
-        v_hparams = config["vision_config"]
-        t_hparams = config["text_config"]
+    v_hparams = config["vision_config"]
+    t_hparams = config["text_config"]

 # possible data types
 #   ftype == 0 -> float32
@@ -132,12 +117,9 @@ ftype = 1
 if args.use_f32:
    ftype = 0

-if args.clip_model_is_vision:
-    model = CLIPVisionModel.from_pretrained(dir_model)
-    processor = None
-else:
-    model = CLIPModel.from_pretrained(dir_model)
-    processor = CLIPProcessor.from_pretrained(dir_model)
+
+model = CLIPModel.from_pretrained(dir_model)
+processor = CLIPProcessor.from_pretrained(dir_model)

 fname_middle = None
 has_text_encoder = True
@@ -146,13 +128,13 @@ has_llava_projector = False
 if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
 elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True
-elif args.vision_only:
-    fname_middle = "vision-"
-    has_text_encoder = False
 else:
    fname_middle = ""

@@ -200,12 +182,8 @@ if has_vision_encoder:
    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)

-    if processor is not None:
-        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
-        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
-    else:
-        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
-        image_std = args.image_std if args.image_std is not None else default_image_std
+    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
+    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

--- a/examples/lookahead/README.md
+++ b/examples/lookahead/README.md
@@ -1,7 +0,0 @@
-# llama.cpp/examples/lookahead
-
-Demonstartion of lookahead decoding technique:
-
-https://lmsys.org/blog/2023-11-21-lookahead-decoding/
-
-More info: https://github.com/ggerganov/llama.cpp/pull/4207
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -100,12 +100,6 @@ static void sigint_handler(int signo) {
 }
 #endif

-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    LOG_TEE("%s", text);
-}
-
 int main(int argc, char ** argv) {
    gpt_params params;
    g_params = &params;
@@ -119,7 +113,6 @@ int main(int argc, char ** argv) {
    log_set_target(log_filename_generator("main", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
 #endif // LOG_DISABLE_LOGS

    // TODO: Dump params ?
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@@ -11,10 +11,10 @@ app = Flask(__name__)
 slot_id = -1

 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
-parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')
-parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ")
-parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ")
-parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ")
+parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
+parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ")
+parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ")
+parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ")
 parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
 parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
 parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
@@ -34,19 +34,19 @@ def is_present(json, key):

 #convert chat to prompt
 def convert_chat(messages):
+    prompt = "" + args.chat_prompt.replace("\\n", "\n")

-    system_n = args.system_name
-    user_n = args.user_name
-    ai_n = args.ai_name
-    stop = args.stop
+    system_n = args.system_name.replace("\\n", "\n")
+    user_n = args.user_name.replace("\\n", "\n")
+    ai_n = args.ai_name.replace("\\n", "\n")
+    stop = args.stop.replace("\\n", "\n")

-    prompt = "" + args.chat_prompt + stop

    for line in messages:
        if (line["role"] == "system"):
-            prompt += f"{system_n}{line['content']}{stop}"
+            prompt += f"{system_n}{line['content']}"
        if (line["role"] == "user"):
-            prompt += f"{user_n}{line['content']}{stop}"
+            prompt += f"{user_n}{line['content']}"
        if (line["role"] == "assistant"):
            prompt += f"{ai_n}{line['content']}{stop}"
    prompt += ai_n.rstrip()
@@ -130,7 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
            }
        ]
    }
-    slot_id = data.get("slot_id")
+    slot_id = data["slot_id"]
    if (chat):
        if (start):
            resData["choices"][0]["delta"] =  {
@@ -150,13 +150,11 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
    return resData


-@app.route('/chat/completions', methods=['POST', 'OPTIONS'])
-@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS'])
+@app.route('/chat/completions', methods=['POST'])
+@app.route('/v1/chat/completions', methods=['POST'])
 def chat_completions():
    if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
        return Response(status=403)
-    if request.method == 'OPTIONS':
-        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
    body = request.get_json()
    stream = False
    tokenize = False
@@ -179,22 +177,20 @@ def chat_completions():
            data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
            time_now = int(time.time())
            resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
-            yield 'data: {}\n\n'.format(json.dumps(resData))
+            yield 'data: {}\n'.format(json.dumps(resData))
            for line in data.iter_lines():
                if line:
                    decoded_line = line.decode('utf-8')
                    resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
-                    yield 'data: {}\n\n'.format(json.dumps(resData))
-        return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
+                    yield 'data: {}\n'.format(json.dumps(resData))
+        return Response(generate(), mimetype='text/event-stream')


-@app.route('/completions', methods=['POST', 'OPTIONS'])
-@app.route('/v1/completions', methods=['POST', 'OPTIONS'])
+@app.route('/completions', methods=['POST'])
+@app.route('/v1/completions', methods=['POST'])
 def completion():
    if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
        return Response(status=403)
-    if request.method == 'OPTIONS':
-        return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
    body = request.get_json()
    stream = False
    tokenize = False
@@ -220,8 +216,8 @@ def completion():
                if line:
                    decoded_line = line.decode('utf-8')
                    resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
-                    yield 'data: {}\n\n'.format(json.dumps(resData))
-        return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"})
+                    yield 'data: {}\n'.format(json.dumps(resData))
+        return Response(generate(), mimetype='text/event-stream')

 if __name__ == '__main__':
    app.run(args.host, port=args.port)
--- a/examples/speculative/README.md
+++ b/examples/speculative/README.md
@@ -1,8 +0,0 @@
-# llama.cpp/examples/speculative
-
-Demonstartion of speculative decoding and tree-based speculative decoding techniques
-
-More info:
-
- https://github.com/ggerganov/llama.cpp/pull/2926
- https://github.com/ggerganov/llama.cpp/pull/3624
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -1,18 +1,20 @@
-#include "ggml.h"
 #include "ggml-opencl.h"

 #include <array>
 #include <atomic>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <limits>
 #include <sstream>
 #include <vector>
+#include <limits>

 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>

+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "ggml.h"
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
--- a/ggml.h
+++ b/ggml.h
@@ -244,10 +244,11 @@
 #define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
-            fflush(stdout); \
            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            fflush(stderr); \
+            fflush(stdout); \
            ggml_print_backtrace(); \
-            abort(); \
+            exit(1); \
        } \
    } while (0)

--- a/llama.cpp
+++ b/llama.cpp
@@ -46,6 +46,7 @@
    #endif
    #include <windows.h>
    #include <io.h>
+    #include <stdio.h> // for _fseeki64
 #endif

 #include <algorithm>
@@ -2644,15 +2645,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    }

    // general kv
-    LLAMA_LOG_INFO("%s: general.name     = %s\n",    __func__, model.name.c_str());
+    LLAMA_LOG_INFO("%s: general.name   = %s\n",    __func__, model.name.c_str());

    // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
 }

 static void llm_load_tensors(
@@ -7026,7 +7027,6 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
    // Replace the data in candidates with the new_candidates data
    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
    candidates->size = new_candidates.size();
-    candidates->sorted = false;

    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
--- a/scripts/build-info.cmake
+++ b/scripts/build-info.cmake
@@ -1,3 +1,5 @@
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")
 set(BUILD_COMPILER "unknown")
@@ -56,3 +58,23 @@ else()
    )
    set(BUILD_TARGET ${OUT})
 endif()
+
+# Only write the build info if it changed
+if(EXISTS ${OUTPUT_FILE})
+    file(READ ${OUTPUT_FILE} CONTENTS)
+    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
+    set(OLD_COMMIT ${CMAKE_MATCH_1})
+    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
+    set(OLD_COMPILER ${CMAKE_MATCH_1})
+    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
+    set(OLD_TARGET ${CMAKE_MATCH_1})
+    if (
+        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
+        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
+        NOT OLD_TARGET   STREQUAL BUILD_TARGET
+    )
+        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+    endif()
+else()
+    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+endif()
--- a/scripts/gen-build-info-cpp.cmake
+++ b/scripts/gen-build-info-cpp.cmake
@@ -1,24 +0,0 @@
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
-
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
-
-# Only write the build info if it changed
-if(EXISTS ${OUTPUT_FILE})
-    file(READ ${OUTPUT_FILE} CONTENTS)
-    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
-    set(OLD_TARGET ${CMAKE_MATCH_1})
-    if (
-        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
-        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
-        NOT OLD_TARGET   STREQUAL BUILD_TARGET
-    )
-        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-    endif()
-else()
-    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
-endif()
Author	SHA1	Message	Date
Georgi Gerganov	87f4102a70	llama : revert n_threads_batch logic ggml-ci	2023-11-27 21:47:35 +02:00
Georgi Gerganov	e9b7a5cbd0	llama : use n_threads_batch only when n_tokens >= 32 ggml-ci	2023-11-27 20:49:17 +02:00
Georgi Gerganov	f815fe43d3	ggml : use blas even if src0 is not F32	2023-11-27 20:48:27 +02:00