llm : cleanup + comments

llm : restore the non-graph llm_build_ functional API
ggml-ci
2026-04-16 16:27:32 +03:00 · 2023-11-01 20:08:02 +02:00 · 2023-11-01 16:47:04 +02:00 · 2023-11-01 11:19:58 +02:00 · 2023-11-01 10:52:24 +02:00
56 changed files with 730 additions and 1828 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,6 @@
 .DS_Store
 .build/
 .cache/
-.ccls-cache/
 .direnv/
 .envrc
 .swiftpm
@@ -65,7 +64,7 @@ models-mnt
 /parallel
 /train-text-from-scratch
 /vdot
-/common/build-info.cpp
+build-info.h
 arm_neon.h
 compile_commands.json
 CMakeSettings.json
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,7 +44,7 @@ endif()

 # general
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
-option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      OFF)
+option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)

 # debug
@@ -100,6 +100,39 @@ option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALO
 option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)

+#
+# Build info header
+#
+
+# Generate initial build-info.h
+include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
+    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
+
+    # Is git submodule
+    if(NOT IS_DIRECTORY "${GIT_DIR}")
+        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
+        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
+    endif()
+
+    # Add a custom target for build-info.h
+    add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
+
+    # Add a custom command to rebuild build-info.h when .git/index changes
+    add_custom_command(
+        OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
+        COMMENT "Generating build details from Git"
+        COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        DEPENDS "${GIT_DIR}/index"
+        VERBATIM
+    )
+else()
+    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
+endif()
+
 #
 # Compile flags
 #
--- a/Makefile
+++ b/Makefile
@@ -542,9 +542,9 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h l
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o
+COMMON_DEPS   = common.o sampling.o grammar-parser.o

-common.o: common/common.cpp $(COMMON_H_DEPS)
+common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
@@ -563,46 +563,46 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)

 #
 # Examples
 #

-main: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp                                  build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo

-infill: examples/infill/infill.cpp                            ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+infill: examples/infill/infill.cpp                            build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-simple: examples/simple/simple.cpp                            ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+simple: examples/simple/simple.cpp                            build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-batched: examples/batched/batched.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+batched: examples/batched/batched.cpp                         build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp       build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp                      build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp                      build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.o ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp                ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp                build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+embedding: examples/embedding/embedding.cpp                   build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
@@ -614,7 +614,7 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
@@ -623,19 +623,19 @@ llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 ifdef LLAMA_METAL
@@ -648,7 +648,7 @@ swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
 endif

-common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
+build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
 		mv $@.tmp $@; \
@@ -656,16 +656,13 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
 		rm $@.tmp; \
 	fi

-build-info.o: common/build-info.cpp
-	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
-
 #
 # Tests
 #

 tests: $(TEST_TARGETS)

-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 run-benchmark-matmult: benchmark-matmult
@@ -679,40 +676,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-c.o: tests/test-c.c llama.h
--- a/build.zig
+++ b/build.zig
@@ -10,6 +10,7 @@ const Maker = struct {
    builder: *std.build.Builder,
    target: CrossTarget,
    optimize: Mode,
+    config_header: *ConfigHeader,
    enable_lto: bool,

    include_dirs: ArrayList([]const u8),
@@ -40,24 +41,26 @@ const Maker = struct {
        const commit_hash = try std.ChildProcess.exec(
            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
        );
-        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
-            \\int LLAMA_BUILD_NUMBER = {};
-            \\char const *LLAMA_COMMIT = "{s}";
-            \\char const *LLAMA_COMPILER = "Zig {s}";
-            \\char const *LLAMA_BUILD_TARGET = "{s}";
-            \\
-        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
+        const config_header = builder.addConfigHeader(
+            .{ .style = .blank, .include_path = "build-info.h" },
+            .{
+                .BUILD_NUMBER = 0,
+                .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
+                .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
+                .BUILD_TARGET = try target.allocDescription(builder.allocator),
+            },
+        );
        var m = Maker{
            .builder = builder,
            .target = target,
            .optimize = builder.standardOptimizeOption(.{}),
+            .config_header = config_header,
            .enable_lto = false,
            .include_dirs = ArrayList([]const u8).init(builder.allocator),
            .cflags = ArrayList([]const u8).init(builder.allocator),
            .cxxflags = ArrayList([]const u8).init(builder.allocator),
            .objs = ArrayList(*Compile).init(builder.allocator),
        };
-
        try m.addCFlag("-std=c11");
        try m.addCxxFlag("-std=c++11");
        try m.addProjectInclude(&.{});
@@ -69,7 +72,7 @@ const Maker = struct {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (o.target.getAbi() != .msvc)
            o.defineCMacro("_GNU_SOURCE", null);
-
+        o.addConfigHeader(m.config_header);
        if (std.mem.endsWith(u8, src, ".c")) {
            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
@@ -82,6 +85,7 @@ const Maker = struct {
                o.linkLibCpp();
            }
        }
+        o.addConfigHeader(m.config_header);
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
        o.want_lto = m.enable_lto;
        return o;
@@ -101,6 +105,7 @@ const Maker = struct {
            // linkLibCpp already add (libc++ + libunwind + libc)
            e.linkLibCpp();
        }
+        e.addConfigHeader(m.config_header);
        m.builder.installArtifact(e);
        e.want_lto = m.enable_lto;
        return e;
@@ -116,7 +121,6 @@ pub fn build(b: *std.build.Builder) !void {
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
    const llama = make.obj("llama", "llama.cpp");
-    const buildinfo = make.obj("common", "common/build-info.cpp");
    const common = make.obj("common", "common/common.cpp");
    const console = make.obj("console", "common/console.cpp");
    const sampling = make.obj("sampling", "common/sampling.cpp");
@@ -124,14 +128,14 @@ pub fn build(b: *std.build.Builder) !void {
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,46 +1,8 @@
 # common

-
-# Build info header
-#
-
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
-
-    # Is git submodule
-    if(NOT IS_DIRECTORY "${GIT_DIR}")
-        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
-        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
-    endif()
-
-    set(GIT_INDEX "${GIT_DIR}/index")
-else()
-    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-    set(GIT_INDEX "")
-endif()
-
-# Add a custom command to rebuild build-info.cpp when .git/index changes
-add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
-    COMMENT "Generating build details from Git"
-    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
-    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
-    VERBATIM
-)
-set(TARGET build_info)
-add_library(${TARGET} OBJECT build-info.cpp)
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-
 set(TARGET common)

-add_library(${TARGET} STATIC
+add_library(${TARGET} OBJECT
    common.h
    common.cpp
    sampling.h
@@ -59,4 +21,4 @@ endif()

 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama build_info)
+target_link_libraries(${TARGET} PRIVATE llama)
--- a/common/build-info.cpp.in
+++ b/common/build-info.cpp.in
@@ -1,4 +0,0 @@
-int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
-char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
-char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "build-info.h"
 #include "llama.h"

 #include <algorithm>
@@ -102,24 +103,9 @@ void process_escapes(std::string& input) {
 }

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    bool result = true;
-    try {
-        if (!gpt_params_parse_ex(argc, argv, params)) {
-            gpt_print_usage(argc, argv, gpt_params());
-            exit(0);
-        }
-    }
-    catch (const std::invalid_argument & ex) {
-        fprintf(stderr, "%s\n", ex.what());
-        gpt_print_usage(argc, argv, gpt_params());
-        exit(1);
-    }
-    return result;
-}
-
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
    bool invalid_param = false;
    std::string arg;
+    gpt_params default_params;
    const std::string arg_prefix = "--";
    llama_sampling_params & sparams = params.sparams;

@@ -218,52 +204,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.rope_freq_scale = std::stof(argv[i]);
-        } else if (arg == "--rope-scaling") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
-            else { invalid_param = true; break; }
        } else if (arg == "--rope-scale") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rope_freq_scale = 1.0f/std::stof(argv[i]);
-        } else if (arg == "--yarn-orig-ctx") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_orig_ctx = std::stoi(argv[i]);
-        } else if (arg == "--yarn-ext-factor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_ext_factor = std::stof(argv[i]);
-        } else if (arg == "--yarn-attn-factor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_attn_factor = std::stof(argv[i]);
-        } else if (arg == "--yarn-beta-fast") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_fast = std::stof(argv[i]);
-        } else if (arg == "--yarn-beta-slow") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_slow = std::stof(argv[i]);
        } else if (arg == "--memory-f32") {
            params.memory_f16 = false;
        } else if (arg == "--top-p") {
@@ -608,8 +554,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
        } else if (arg == "-h" || arg == "--help") {
-            return false;
-
+            gpt_print_usage(argc, argv, default_params);
+#ifndef LOG_DISABLE_LOGS
+            log_print_usage();
+#endif // LOG_DISABLE_LOGS
+            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix-bos") {
@@ -668,17 +617,22 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        // End of Parse args for logging parameters
 #endif // LOG_DISABLE_LOGS
        } else {
-            throw std::invalid_argument("error: unknown argument: " + arg);
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, default_params);
+            exit(1);
        }
    }
    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        gpt_print_usage(argc, argv, default_params);
+        exit(1);
    }
    if (params.prompt_cache_all &&
            (params.interactive || params.interactive_first ||
             params.instruct)) {
-
-        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+        fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
+        gpt_print_usage(argc, argv, default_params);
+        exit(1);
    }

    if (params.escape) {
@@ -697,7 +651,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    const llama_sampling_params & sparams = params.sparams;

-    printf("\n");
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
    printf("options:\n");
@@ -755,16 +708,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --cfg-negative-prompt-file FNAME\n");
    printf("                        negative prompt file to use for guidance. (default: empty)\n");
    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
-    printf("  --rope-scaling {none,linear,yarn}\n");
-    printf("                        RoPE frequency scaling method, defaults to linear unless specified by the model\n");
-    printf("  --rope-scale N        RoPE context scaling factor, expands context by a factor of N\n");
+    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
-    printf("  --rope-freq-scale N   RoPE frequency scaling factor, expands context by a factor of 1/N\n");
-    printf("  --yarn-orig-ctx N     YaRN: original context size of model (default: 0 = model training context size)\n");
-    printf("  --yarn-ext-factor N   YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
-    printf("  --yarn-attn-factor N  YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
-    printf("  --yarn-beta-slow N    YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
-    printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
+    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
@@ -816,9 +762,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
    printf("\n");
-#ifndef LOG_DISABLE_LOGS
-    log_print_usage();
-#endif // LOG_DISABLE_LOGS
 }

 std::string get_system_info(const gpt_params & params) {
@@ -872,23 +815,17 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto cparams = llama_context_default_params();

-    cparams.n_ctx             = params.n_ctx;
-    cparams.n_batch           = params.n_batch;
-    cparams.n_threads         = params.n_threads;
-    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    cparams.mul_mat_q         = params.mul_mat_q;
-    cparams.seed              = params.seed;
-    cparams.f16_kv            = params.memory_f16;
-    cparams.logits_all        = params.logits_all;
-    cparams.embedding         = params.embedding;
-    cparams.rope_scaling_type = params.rope_scaling_type;
-    cparams.rope_freq_base    = params.rope_freq_base;
-    cparams.rope_freq_scale   = params.rope_freq_scale;
-    cparams.yarn_ext_factor   = params.yarn_ext_factor;
-    cparams.yarn_attn_factor  = params.yarn_attn_factor;
-    cparams.yarn_beta_fast    = params.yarn_beta_fast;
-    cparams.yarn_beta_slow    = params.yarn_beta_slow;
-    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
+    cparams.n_ctx           = params.n_ctx;
+    cparams.n_batch         = params.n_batch;
+    cparams.n_threads       = params.n_threads;
+    cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.mul_mat_q       = params.mul_mat_q;
+    cparams.seed            = params.seed;
+    cparams.f16_kv          = params.memory_f16;
+    cparams.logits_all      = params.logits_all;
+    cparams.embedding       = params.embedding;
+    cparams.rope_freq_base  = params.rope_freq_base;
+    cparams.rope_freq_scale = params.rope_freq_scale;

    return cparams;
 }
@@ -1198,8 +1135,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
    const llama_sampling_params & sparams = params.sparams;

-    fprintf(stream, "build_commit: %s\n",        LLAMA_COMMIT);
-    fprintf(stream, "build_number: %d\n",        LLAMA_BUILD_NUMBER);
+    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
+    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
--- a/common/common.h
+++ b/common/common.h
@@ -9,7 +9,6 @@
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"

-#include <cmath>
 #include <string>
 #include <vector>
 #include <random>
@@ -26,17 +25,11 @@
 #define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

-#define print_build_info() do {                                                                     \
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);           \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
+#define print_build_info() do {                                                             \
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);         \
+    fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);  \
 } while(0)

-// build info
-extern int LLAMA_BUILD_NUMBER;
-extern char const *LLAMA_COMMIT;
-extern char const *LLAMA_COMPILER;
-extern char const *LLAMA_BUILD_TARGET;
-
 //
 // CLI argument parsing
 //
@@ -61,12 +54,6 @@ struct gpt_params {
    int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
    float   rope_freq_base                  = 0.0f; // RoPE base frequency
    float   rope_freq_scale                 = 0.0f; // RoPE frequency scaling factor
-    float   yarn_ext_factor                 = NAN;  // YaRN extrapolation mix factor
-    float   yarn_attn_factor                = 1.0f; // YaRN magnitude scaling factor
-    float   yarn_beta_fast                  = 32.0f;// YaRN low correction dim
-    float   yarn_beta_slow                  = 1.0f; // YaRN high correction dim
-    int32_t yarn_orig_ctx                   = 0;    // YaRN original context length
-    int8_t  rope_scaling_type               = LLAMA_ROPE_SCALING_UNSPECIFIED;

    // // sampling parameters
    struct llama_sampling_params sparams;
@@ -123,8 +110,6 @@ struct gpt_params {
    std::string image = ""; // path to an image file
 };

-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
-
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

 void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
--- a/common/log.h
+++ b/common/log.h
@@ -97,56 +97,38 @@
    #define LOG_TEE_TARGET stderr
 #endif

-// Utility for synchronizing log configuration state
-//  since std::optional was introduced only in c++17
-enum LogTriState
-{
-    LogTriStateSame,
-    LogTriStateFalse,
-    LogTriStateTrue
-};
-
+// NOTE: currently disabled as it produces too many log files
 // Utility to obtain "pid" like unique process id and use it when creating log files.
-inline std::string log_get_pid()
-{
-   static std::string pid;
-   if (pid.empty())
-   {
-       // std::this_thread::get_id() is the most portable way of obtaining a "process id"
-       //  it's not the same as "pid" but is unique enough to solve multiple instances
-       //  trying to write to the same log.
-       std::stringstream ss;
-       ss << std::this_thread::get_id();
-       pid = ss.str();
-   }
-
-   return pid;
-}
+//inline std::string log_get_pid()
+//{
+//    static std::string pid;
+//    if (pid.empty())
+//    {
+//        // std::this_thread::get_id() is the most portable way of obtaining a "process id"
+//        //  it's not the same as "pid" but is unique enough to solve multiple instances
+//        //  trying to write to the same log.
+//        std::stringstream ss;
+//        ss << std::this_thread::get_id();
+//        pid = ss.str();
+//    }
+//
+//    return pid;
+//}

 // Utility function for generating log file names with unique id based on thread id.
 //  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
 //  where the number is a runtime id of the current thread.

-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
+#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)

 // INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
+inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
 {
-    static bool _multilog = false;
-
-    if (multilog != LogTriStateSame)
-    {
-        _multilog = multilog == LogTriStateTrue;
-    }
-
    std::stringstream buf;

    buf << log_file_basename;
-    if (_multilog)
-    {
-        buf << ".";
-        buf << log_get_pid();
-    }
+    //buf << ".";
+    //buf << log_get_pid();
    buf << ".";
    buf << log_file_extension;

@@ -231,6 +213,15 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
    #define LOG_TEE_FLF_VAL ,""
 #endif

+// Utility for synchronizing log configuration state
+//  since std::optional was introduced only in c++17
+enum LogTriState
+{
+    LogTriStateSame,
+    LogTriStateFalse,
+    LogTriStateTrue
+};
+
 // INTERNAL, DO NOT USE
 //  USE LOG() INSTEAD
 //
@@ -324,23 +315,16 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 #endif

 // INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
+inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
 {
-    static bool _initialized = false;
-    static bool _append = false;
-    static bool _disabled = filename.empty() && target == nullptr;
+    static bool _initialized{false};
+    static bool _disabled{(filename.empty() && target == nullptr)};
    static std::string log_current_filename{filename};
    static FILE *log_current_target{target};
    static FILE *logfile = nullptr;

    if (change)
    {
-        if (append != LogTriStateSame)
-        {
-            _append = append == LogTriStateTrue;
-            return logfile;
-        }
-
        if (disable == LogTriStateTrue)
        {
            // Disable primary target
@@ -393,7 +377,7 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriS
            }
        }

-        logfile = fopen(filename.c_str(), _append ? "a" : "w");
+        logfile = fopen(filename.c_str(), "w");
    }

    if (!logfile)
@@ -414,9 +398,9 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriS
 }

 // INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
+inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
 {
-    return log_handler1_impl(change, append, disable, filename, target);
+    return log_handler1_impl(change, disable, filename, target);
 }

 // Disables logs entirely at runtime.
@@ -427,7 +411,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
 // INTERNAL, DO NOT USE
 inline FILE *log_disable_impl()
 {
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
+    return log_handler1_impl(true, LogTriStateTrue);
 }

 // Enables logs at runtime.
@@ -436,31 +420,19 @@ inline FILE *log_disable_impl()
 // INTERNAL, DO NOT USE
 inline FILE *log_enable_impl()
 {
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
+    return log_handler1_impl(true, LogTriStateFalse);
 }

 // Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
 #define log_set_target(target) log_set_target_impl(target)

 // INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
+inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
+inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }

 // INTERNAL, DO NOT USE
 inline FILE *log_handler() { return log_handler1_impl(); }

-// Enable or disable creating separate log files for each run.
-//  can ONLY be invoked BEFORE first log use.
-#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
-// Enable or disable append mode for log file.
-//  can ONLY be invoked BEFORE first log use.
-#define log_append(enable) log_append_impl(enable)
-// INTERNAL, DO NOT USE
-inline FILE *log_append_impl(bool enable)
-{
-    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
-}
-
 inline void log_test()
 {
    log_disable();
@@ -522,18 +494,6 @@ inline bool log_param_single_parse(const std::string & param)
        return true;
    }

-    if (param == "--log-new")
-    {
-        log_multilog(true);
-        return true;
-    }
-
-    if (param == "--log-append")
-    {
-        log_append(true);
-        return true;
-    }
-
    return false;
 }

@@ -563,9 +523,7 @@ inline void log_print_usage()
    printf("  --log-disable         Disable trace logs\n");
    printf("  --log-enable          Enable trace logs\n");
    printf("  --log-file            Specify a log filename (without extension)\n");
-    printf("  --log-new             Create a separate new log file on start. "
-                                   "Each log file will have unique name: \"<name>.<ID>.log\"\n");
-    printf("  --log-append          Don't truncate the old log file.\n");
+    printf("                        Log file will be written as \"<name>.log\"\n");
 }

 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -39,7 +39,6 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
 void llama_sampling_reset(llama_sampling_context * ctx) {
    if (ctx->grammar != NULL) {
        llama_grammar_free(ctx->grammar);
-        ctx->grammar = NULL;
    }

    if (!ctx->parsed_grammar.rules.empty()) {
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1045,7 +1045,6 @@ struct train_params_common get_default_train_params_common() {
    params.n_batch    =    8;
    params.n_gradient_accumulation = 1;
    params.n_epochs   = -1;
-    params.n_gpu_layers = 0;

    params.custom_n_ctx = false;

@@ -1081,7 +1080,6 @@ struct train_params_common get_default_train_params_common() {
    params.adam_beta2          = 0.999f;
    params.adam_gclip          = 1.0f;
    params.adam_eps_f          = 0.0f;
-
    return params;
 }

--- a/common/train.h
+++ b/common/train.h
@@ -44,7 +44,6 @@ struct train_params_common {
    int n_batch;
    int n_gradient_accumulation;
    int n_epochs;
-    int n_gpu_layers;

    bool custom_n_ctx;

--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -163,8 +163,7 @@ gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
 if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
    if "type" in hparams["rope_scaling"]:
        if hparams["rope_scaling"]["type"] == "linear":
-            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])


 # TOKENIZATION
--- a/convert.py
+++ b/convert.py
@@ -151,11 +151,8 @@ class Params:
    n_head_kv:  int
    f_norm_eps: float

-    rope_scaling_type: gguf.RopeScalingType | None = None
    f_rope_freq_base: float | None = None
    f_rope_scale: float | None = None
-    n_orig_ctx: int | None = None
-    rope_finetuned: bool | None = None

    ftype: GGMLFileType | None = None

@@ -201,20 +198,20 @@ class Params:
    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))

-        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
-        rope_scaling = config.get("rope_scaling")
+        n_vocab          = config["vocab_size"]
+        n_embd           = config["hidden_size"]
+        n_layer          = config["num_hidden_layers"]
+        n_ff             = config["intermediate_size"]
+        n_head           = config["num_attention_heads"]
+        n_head_kv        = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
+        f_norm_eps       = config["rms_norm_eps"]
+        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None

-        if rope_scaling is not None and (typ := rope_scaling.get("type")):
-            rope_factor = rope_scaling.get("factor")
-            f_rope_scale = rope_factor
-            if typ == "linear":
-                rope_scaling_type = gguf.RopeScalingType.LINEAR
-            elif typ == "yarn":
-                rope_scaling_type = gguf.RopeScalingType.YARN
-                n_orig_ctx = rope_scaling['original_max_position_embeddings']
-                rope_finetuned = rope_scaling['finetuned']
-            else:
-                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
+        rope_scaling = config.get("rope_scaling")
+        if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
+            f_rope_scale = config["rope_scaling"].get("factor")
+        else:
+            f_rope_scale = None

        if "max_sequence_length" in config:
            n_ctx = config["max_sequence_length"]
@@ -225,19 +222,16 @@ class Params:
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

        return Params(
-            n_vocab           = config["vocab_size"],
-            n_embd            = config["hidden_size"],
-            n_layer           = config["num_hidden_layers"],
-            n_ctx             = n_ctx,
-            n_ff              = config["intermediate_size"],
-            n_head            = (n_head := config["num_attention_heads"]),
-            n_head_kv         = config.get("num_key_value_heads", n_head),
-            f_norm_eps        = config["rms_norm_eps"],
-            f_rope_freq_base  = config.get("rope_theta"),
-            rope_scaling_type = rope_scaling_type,
-            f_rope_scale      = f_rope_scale,
-            n_orig_ctx        = n_orig_ctx,
-            rope_finetuned    = rope_finetuned,
+            n_vocab          = n_vocab,
+            n_embd           = n_embd,
+            n_layer          = n_layer,
+            n_ctx            = n_ctx,
+            n_ff             = n_ff,
+            n_head           = n_head,
+            n_head_kv        = n_head_kv,
+            f_norm_eps       = f_norm_eps,
+            f_rope_freq_base = f_rope_freq_base,
+            f_rope_scale     = f_rope_scale,
        )

    # LLaMA v2 70B params.json
@@ -246,8 +240,17 @@ class Params:
    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))

+        n_vocab          = config["vocab_size"] if "vocab_size" in config else -1
+        n_embd           = config["dim"]
+        n_layer          = config["n_layers"]
+        n_ff             = -1
+        n_head           = config["n_heads"]
+        n_head_kv        = config["n_kv_heads"] if "n_kv_heads" in config else n_head
+        f_norm_eps       = config["norm_eps"]
+        f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
+
        # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if config.get("rope_theta") == 1000000:
+        if f_rope_freq_base == 1000000:
            # CodeLlama
            n_ctx = 16384
        elif config["norm_eps"] == 1e-05:
@@ -257,16 +260,22 @@ class Params:
            # LLaMA v1
            n_ctx = 2048

+        if n_vocab == -1:
+            n_vocab = model["tok_embeddings.weight"].shape[0]
+
+        if n_ff == -1:
+            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
+
        return Params(
-            n_vocab          = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
-            n_embd           = config["dim"],
-            n_layer          = config["n_layers"],
+            n_vocab          = n_vocab,
+            n_embd           = n_embd,
+            n_layer          = n_layer,
            n_ctx            = n_ctx,
-            n_ff             = model["layers.0.feed_forward.w1.weight"].shape[0],
-            n_head           = (n_head := config["n_heads"]),
-            n_head_kv        = config.get("n_kv_heads", n_head),
-            f_norm_eps       = config["norm_eps"],
-            f_rope_freq_base = config.get("rope_theta"),
+            n_ff             = n_ff,
+            n_head           = n_head,
+            n_head_kv        = n_head_kv,
+            f_norm_eps       = f_norm_eps,
+            f_rope_freq_base = f_rope_freq_base,
        )

    @staticmethod
@@ -822,16 +831,8 @@ class OutputFile:
        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)

-        if params.rope_scaling_type:
-            assert params.f_rope_scale is not None
-            self.gguf.add_rope_scaling_type(params.rope_scaling_type)
-            self.gguf.add_rope_scaling_factor(params.f_rope_scale)
-
-        if params.n_orig_ctx is not None:
-            self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
-
-        if params.rope_finetuned is not None:
-            self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
+        if params.f_rope_scale is not None:
+            self.gguf.add_rope_scale_linear(params.f_rope_scale)

        if params.ftype is not None:
            self.gguf.add_file_type(params.ftype)
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -1,6 +1,9 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "ggml.h"

--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -3,3 +3,6 @@ add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"

--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -642,9 +642,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        const int rope_mode = 0;

        return ggml_rope_custom(ctx,
-            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
-            rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f
-        );
+            t, KQ_pos, n_rot, rope_mode, n_ctx,
+            rope_freq_base, rope_freq_scale);
    };

    set_name(tokens_input, "tokens_input");
@@ -653,7 +652,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);

    auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+        if (ggml_is_quantized(a->type)) {
            return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
        } else if (a->type == GGML_TYPE_F32) {
            return ggml_add(ctx, a, b);
@@ -1460,17 +1459,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
            }
            params->n_rank_w3 = std::stoi(argv[i]);
            params->custom_n_rank_w3 = true;
-        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-            params->common.n_gpu_layers = std::stoi(argv[i]);
-#else
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            train_print_usage(argc, argv, &default_params);
@@ -1557,7 +1545,6 @@ int main(int argc, char ** argv) {
    srand(params.common.seed);

    struct llama_model_params llama_mparams = llama_model_default_params();
-    llama_mparams.n_gpu_layers = params.common.n_gpu_layers;
    llama_mparams.vocab_only = false;

    printf("%s: model base = '%s'\n", __func__, params.fn_model_base);
--- a/examples/finetune/finetune.sh
+++ b/examples/finetune/finetune.sh
@@ -1,34 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-cd ../..
-
-EXE="./finetune"
-
-if [[ ! $LLAMA_MODEL_DIR ]]; then LLAMA_MODEL_DIR="./models"; fi
-if [[ ! $LLAMA_TRAINING_DIR ]]; then LLAMA_TRAINING_DIR="."; fi
-
-# MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2-q8_0.gguf" # This is the model the readme uses.
-MODEL="$LLAMA_MODEL_DIR/openllama-3b-v2.gguf" # An f16 model. Note in this case with "-g", you get an f32-format .BIN file that isn't yet supported if you use it with "main --lora" with GPU inferencing.
-
-while getopts "dg" opt; do
-  case $opt in
-    d)
-      DEBUGGER="gdb --args"
-      ;;
-    g)
-      EXE="./build/bin/Release/finetune"
-      GPUARG="--gpu-layers 25"
-      ;;
-  esac
-done
-
-$DEBUGGER $EXE \
-        --model-base $MODEL \
-        $GPUARG \
-        --checkpoint-in  chk-ol3b-shakespeare-LATEST.gguf \
-        --checkpoint-out chk-ol3b-shakespeare-ITERATION.gguf \
-        --lora-out lora-ol3b-shakespeare-ITERATION.bin \
-        --train-data "$LLAMA_TRAINING_DIR\shakespeare.txt" \
-        --save-every 10 \
-        --threads 10 --adam-iter 30 --batch 4 --ctx 64 \
-        --use-checkpointing
--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@@ -3,3 +3,6 @@ add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -2,6 +2,7 @@

 #include "console.h"
 #include "llama.h"
+#include "build-info.h"
 #include "grammar-parser.h"

 #include <cassert>
@@ -183,8 +184,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
--- a/examples/llama-bench/CMakeLists.txt
+++ b/examples/llama-bench/CMakeLists.txt
@@ -3,3 +3,6 @@ add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -19,6 +19,7 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
+#include "build-info.h"
 #include "ggml-cuda.h"

 // utils
@@ -640,8 +641,8 @@ struct test {
    }
 };

-const std::string test::build_commit = LLAMA_COMMIT;
-const int         test::build_number = LLAMA_BUILD_NUMBER;
+const std::string test::build_commit = BUILD_COMMIT;
+const int         test::build_number = BUILD_NUMBER;
 const bool        test::cuda         = !!ggml_cpu_has_cublas();
 const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::metal        = !!ggml_cpu_has_metal();
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -5,6 +5,9 @@ target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
 endif()

 set(TARGET llava)
@@ -12,3 +15,6 @@ add_executable(${TARGET} llava.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -3,3 +3,6 @@ add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -2,6 +2,7 @@

 #include "console.h"
 #include "llama.h"
+#include "build-info.h"

 #include <cassert>
 #include <cinttypes>
@@ -152,8 +153,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);
+    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
--- a/examples/parallel/CMakeLists.txt
+++ b/examples/parallel/CMakeLists.txt
@@ -3,3 +3,6 @@ add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -1,6 +1,8 @@
 // A basic application simulating a server with multiple clients.
 // The clients submite requests to the server and they are processed in parallel.

+#include "build-info.h"
+
 #include "common.h"
 #include "llama.h"

--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -3,3 +3,6 @@ add_executable(${TARGET} perplexity.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"

--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -1,6 +1,9 @@
 set(TARGET quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,4 +1,5 @@
 #define LLAMA_API_INTERNAL
+#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,6 +1,9 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"

--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -3,3 +3,6 @@ add_executable(${TARGET} save-load-state.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,3 +1,4 @@
+#include "build-info.h"
 #include "common.h"
 #include "llama.h"

--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -11,3 +11,6 @@ if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "llama.h"
+#include "build-info.h"
 #include "grammar-parser.h"

 #include "../llava/clip.h"
@@ -148,7 +149,6 @@ struct task_server {
    task_type type;
    json data;
    bool infill_mode = false;
-    bool embedding_mode = false;
 };

 struct task_result {
@@ -371,7 +371,6 @@ struct llama_client_slot
    std::vector<completion_token_output> generated_token_probs;

    bool infill = false;
-    bool embedding = false;
    bool has_next_token = true;
    bool truncated = false;
    bool stopped_eos = false;
@@ -1245,14 +1244,13 @@ struct llama_server_context
        queue_results.push_back(res);
    }

-    int request_completion(json data, bool infill, bool embedding)
+    int request_completion(json data, bool infill)
    {
        std::lock_guard<std::mutex> lock(mutex_tasks);
        task_server task;
        task.id = id_gen++;
        task.data = data;
        task.infill_mode = infill;
-        task.embedding_mode = embedding;
        task.type = COMPLETION_TASK;
        queue_tasks.push_back(task);
        return task.id;
@@ -1378,7 +1376,7 @@ struct llama_server_context
                    {
                        LOG_TEE("slot unavailable\n");
                        // send error result
-                        send_error(task.id, "slot unavailable");
+                        send_error(task.id, "slot unavailable");
                        return;
                    }

@@ -1390,7 +1388,6 @@ struct llama_server_context
                    slot->reset();

                    slot->infill = task.infill_mode;
-                    slot->embedding = task.embedding_mode;
                    slot->task_id = task.id;

                    if (!launch_slot_with_data(slot, task.data))
@@ -1698,7 +1695,7 @@ struct llama_server_context
                }

                // prompt evaluated for embedding
-                if (slot.embedding)
+                if (params.embedding)
                {
                    send_embedding(slot);
                    slot.release();
@@ -1754,18 +1751,12 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    printf("options:\n");
    printf("  -h, --help                show this help message and exit\n");
    printf("  -v, --verbose             verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    printf("  -t N, --threads N         number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N,  --threads N        number of threads to use during computation (default: %d)\n", params.n_threads);
    printf("  -tb N, --threads-batch N  number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf("  -c N, --ctx-size N        size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-scaling {none,linear,yarn}\n");
-    printf("                            RoPE frequency scaling method, defaults to linear unless specified by the model\n");
+    printf("  -c N,  --ctx-size N       size of the prompt context (default: %d)\n", params.n_ctx);
    printf("  --rope-freq-base N        RoPE base frequency (default: loaded from model)\n");
-    printf("  --rope-freq-scale N       RoPE frequency scaling factor, expands context by a factor of 1/N\n");
-    printf("  --yarn-ext-factor N       YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
-    printf("  --yarn-attn-factor N      YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
-    printf("  --yarn-beta-slow N        YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
-    printf("  --yarn-beta-fast N        YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
-    printf("  -b N, --batch-size N      batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  --rope-freq-scale N       RoPE frequency scaling factor (default: loaded from model)\n");
+    printf("  -b N,  --batch-size N     batch size for prompt processing (default: %d)\n", params.n_batch);
    printf("  --memory-f32              use f32 instead of f16 for memory key+value (default: disabled)\n");
    printf("                            not recommended: doubles context memory required and no measurable increase in quality\n");
    if (llama_mlock_supported())
@@ -1886,19 +1877,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_ctx = std::stoi(argv[i]);
        }
-        else if (arg == "--rope-scaling")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            std::string value(argv[i]);
-            /**/ if (value == "none")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
-            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
-            else if (value == "yarn")   { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
-            else { invalid_param = true; break; }
-        }
        else if (arg == "--rope-freq-base")
        {
            if (++i >= argc)
@@ -1917,38 +1895,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.rope_freq_scale = std::stof(argv[i]);
        }
-        else if (arg == "--yarn-ext-factor")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_ext_factor = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-attn-factor")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_attn_factor = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-beta-fast")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_fast = std::stof(argv[i]);
-        }
-        else if (arg == "--yarn-beta-slow")
-        {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.yarn_beta_slow = std::stof(argv[i]);
-        }
        else if (arg == "--memory-f32" || arg == "--memory_f32")
        {
            params.memory_f16 = false;
@@ -2263,8 +2209,8 @@ int main(int argc, char **argv)

    llama_backend_init(params.numa);

-    LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
-                            {"commit", LLAMA_COMMIT}});
+    LOG_INFO("build info", {{"build", BUILD_NUMBER},
+                            {"commit", BUILD_COMMIT}});

    LOG_INFO("system info", {
                                {"n_threads", params.n_threads},
@@ -2328,7 +2274,7 @@ int main(int argc, char **argv)
    svr.Post("/completion", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, false, false);
+                const int task_id = llama.request_completion(data, false);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
@@ -2383,7 +2329,7 @@ int main(int argc, char **argv)
    svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);
-                const int task_id = llama.request_completion(data, true, false);
+                const int task_id = llama.request_completion(data, true);
                if (!json_value(data, "stream", false)) {
                    std::string completion_text;
                    task_result result = llama.next_result(task_id);
@@ -2487,7 +2433,7 @@ int main(int argc, char **argv)
                {
                    prompt = "";
                }
-                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false, true);
+                const int task_id = llama.request_completion({ {"prompt", prompt}, { "n_predict", 0} }, false);
                task_result result = llama.next_result(task_id);
                return res.set_content(result.result_json.dump(), "application/json");
            });
--- a/examples/speculative/CMakeLists.txt
+++ b/examples/speculative/CMakeLists.txt
@@ -3,3 +3,6 @@ add_executable(${TARGET} speculative.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -1,3 +1,5 @@
+#include "build-info.h"
+
 #include "common.h"
 #include "llama.h"

--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -349,9 +349,9 @@ static struct ggml_tensor * llama_build_train_graphs(
        // not capturing these, to silcence warnings
        const int rope_mode = 0;

-        return ggml_rope_custom(
-            ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
-        );
+        return ggml_rope_custom(ctx,
+            t, KQ_pos, n_rot, rope_mode, n_ctx,
+            rope_freq_base, rope_freq_scale);
    };

    set_name(tokens_input, "tokens_input");
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -181,11 +181,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
    do {                                                                                \
        cudaError_t err_ = (err);                                                       \
        if (err_ != cudaSuccess) {                                                      \
-            int dev_id;                                                                     \
-            cudaGetDevice(&dev_id);                                                         \
+            int id;                                                                     \
+            cudaGetDevice(&id);                                                         \
            fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                cudaGetErrorString(err_));                                              \
-            fprintf(stderr, "current device: %d\n", dev_id);                                \
+            fprintf(stderr, "current device: %d\n", id);                                \
            exit(1);                                                                    \
        }                                                                               \
    } while (0)
@@ -195,11 +195,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
    do {                                                                                \
        cublasStatus_t err_ = (err);                                                    \
        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
-            int dev_id;                                                                     \
-            cudaGetDevice(&dev_id);                                                         \
+            int id;                                                                     \
+            cudaGetDevice(&id);                                                         \
            fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n",                         \
                    err_, __FILE__, __LINE__, cublasGetStatusString(err_));             \
-            fprintf(stderr, "current device: %d\n", dev_id);                                \
+            fprintf(stderr, "current device: %d\n", id);                                \
            exit(1);                                                                    \
        }                                                                               \
    } while (0)
@@ -465,7 +465,6 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA

 #define MAX_STREAMS 8
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
-static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };

 struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -514,15 +513,6 @@ static __global__ void add_f16_f32_f16(const half * x, const float * y, half * d
    dst[i] = __hadd(x[i], __float2half(y[i]));
 }

-static __global__ void add_f16_f32_f32(const half * x, const float * y, float * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = __half2float(x[i]) + y[i];
-}
-
 static __global__ void mul_f32(const float * x, const float * y, float * dst, const int kx, const int ky) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -4494,41 +4484,11 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
    cpy_1(cx + x_offset, cdst + dst_offset);
 }

-static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / max(0.001f, high - low);
-    return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-struct rope_corr_dims {
-    float v[4];
-};
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static __device__ void rope_yarn(
-    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
-    }
-    *cos_theta = cosf(theta) * mscale;
-    *sin_theta = sinf(theta) * mscale;
-}
-
 // rope == RoPE == rotary positional embedding
+
 template<typename T, bool has_pos>
-static __global__ void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
-) {
+static __global__ void rope(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                            const int p_delta_rows, const float theta_scale) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
@@ -4540,10 +4500,10 @@ static __global__ void rope(
    const int i2 = row/p_delta_rows;

    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(freq_base, -float(col)/ncols);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + 1];
@@ -4553,10 +4513,8 @@ static __global__ void rope(
 }

 template<typename T, bool has_pos>
-static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
-) {
+static __global__ void rope_neox(const T * x, T * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                                 const int p_delta_rows, const float theta_scale) {
    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

    if (col >= ncols) {
@@ -4567,14 +4525,11 @@ static __global__ void rope_neox(
    const int i = row*ncols + col/2;
    const int i2 = row/p_delta_rows;

-    // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
-    const float cur_rot = -float(col)/ncols;
-
    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(freq_base, cur_rot);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+    const float p0 = p*freq_scale;
+    const float theta = p0*powf(theta_scale, col/2);
+    const float sin_theta = sinf(theta);
+    const float cos_theta = cosf(theta);

    const float x0 = x[i + 0];
    const float x1 = x[i + ncols/2];
@@ -4583,10 +4538,8 @@ static __global__ void rope_neox(
    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
 }

-static __global__ void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    int n_ctx
-) {
+static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const int32_t * pos, const float freq_scale,
+                                    const int p_delta_rows, const float theta_scale, const int n_ctx) {
    const int col = blockDim.x*blockIdx.x + threadIdx.x;
    const int half_n_dims = ncols/4;

@@ -4598,7 +4551,7 @@ static __global__ void rope_glm_f32(
    const int i = row*ncols + col;
    const int i2 = row/p_delta_rows;

-    const float col_theta_scale = powf(freq_base, -2.0f*col/ncols);
+    const float col_theta_scale = powf(theta_scale, col);
     // FIXME: this is likely wrong
    const int p = pos != nullptr ? pos[i2] : 0;

@@ -4740,11 +4693,6 @@ static void add_f16_f32_f16_cuda(const half * x, const float * y, half * dst, co
    add_f16_f32_f16<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
 }

-static void add_f16_f32_f32_cuda(const half * x, const float * y, float * dst, const int k, cudaStream_t stream) {
-    const int num_blocks = (k + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
-    add_f16_f32_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, k);
-}
-
 static void mul_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
    const int num_blocks = (kx + CUDA_MUL_BLOCK_SIZE - 1) / CUDA_MUL_BLOCK_SIZE;
    mul_f32<<<num_blocks, CUDA_MUL_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5622,54 +5570,40 @@ static void clamp_f32_cuda(const float * x, float * dst, const float min, const
 }

 template<typename T>
-static void rope_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
-) {
+static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                          const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);
    if (pos == nullptr) {
-        rope<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
+        rope<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
    } else {
-        rope<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
+        rope<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
    }
 }

 template<typename T>
-static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
-) {
+static void rope_neox_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                          const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
    GGML_ASSERT(ncols % 2 == 0);
    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    const dim3 block_nums(nrows, num_blocks_x, 1);
    if (pos == nullptr) {
-        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
+        rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
    } else {
-        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
-        );
+        rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale);
    }
 }

-static void rope_glm_f32_cuda(
-    const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float freq_base, int n_ctx, cudaStream_t stream
-) {
+static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
+                              const int p_delta_rows, const float theta_scale, const int n_ctx, cudaStream_t stream) {
    GGML_ASSERT(ncols % 4 == 0);
    const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1);
    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
    const dim3 block_nums(num_blocks_x, nrows, 1);
-    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx);
+    rope_glm_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, pos, freq_scale, p_delta_rows, theta_scale, n_ctx);
 }

 static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
@@ -5773,16 +5707,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
    return ptr;
 }

-static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) {
-    if (g_cudaMemPools[id] == nullptr) {
-        return ggml_cuda_pool_malloc(size, actual_size);
-    }
-    void *ptr;
-    CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream));
-    *actual_size = size;
-    return ptr;
-}
-
 static void ggml_cuda_pool_free(void * ptr, size_t size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
@@ -5801,13 +5725,6 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
 }


-static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
-    if (g_cudaMemPools[id] == nullptr) {
-        return ggml_cuda_pool_free(ptr, actual_size);
-    }
-    CUDA_CHECK(cudaFreeAsync(ptr, stream));
-}
-
 void ggml_init_cublas() {
    static bool initialized = false;

@@ -5862,13 +5779,6 @@ void ggml_init_cublas() {
            // create cublas handle
            CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
            CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
-
-            // configure memory pool
-            cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
-            if (err == cudaSuccess) {
-                size_t treshold = UINT64_MAX;
-                CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
-            }
        }

        // configure logging to stdout
@@ -6086,10 +5996,7 @@ inline void ggml_cuda_op_add(
        add_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(src0), ne10*ne11, main_stream);
    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
        add_f16_f32_f16_cuda((const half *) src0_dd, src1_dd, (half *) dst_dd, ggml_nelements(src0), main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        add_f16_f32_f32_cuda((const half *) src0_dd, src1_dd, dst_dd, ggml_nelements(src0), main_stream);
    } else {
-        fprintf(stderr, "src0->type: %d  dst->type: %d\n", src0->type, dst->type);
        GGML_ASSERT(false);
    }

@@ -6462,7 +6369,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = row_diff*ne00;
-            src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream);
+            src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
        }
        const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
@@ -6473,12 +6380,13 @@ inline void ggml_cuda_op_mul_mat_cublas(
            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
            GGML_ASSERT(to_fp16_cuda != nullptr);
            size_t ne = src1_ncols*ne10;
-            src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream);
+            src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
        }
        const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddq_i : src1_as_f16;
-        size_t dst_f16_as = 0;
-        half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream);
+
+        size_t dst_as = 0;
+        half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);

        const half alpha_f16 = 1.0f;
        const half beta_f16 = 0.0f;
@@ -6496,15 +6404,14 @@ inline void ggml_cuda_op_mul_mat_cublas(
        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);

-        if (dst_f16_as != 0) {
-            ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream);
-        }
+        ggml_cuda_pool_free(dst_f16, dst_as);

        if (src0_as != 0) {
-            ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream);
+            ggml_cuda_pool_free(src0_as_f16, src0_as);
        }
+
        if (src1_as != 0) {
-            ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream);
+            ggml_cuda_pool_free(src1_as_f16, src1_as);
        }
    }
    else {
@@ -6514,7 +6421,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
        if (src0->type != GGML_TYPE_F32) {
            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT
+            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
        }
        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
@@ -6531,7 +6438,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
                    &beta,  dst_dd_i,   ldc));

        if (src0_as != 0) {
-            ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream);
+            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
        }
    }

@@ -6553,20 +6460,17 @@ inline void ggml_cuda_op_rope(
    const int64_t ne2 = dst->ne[2];
    const int64_t nrows = ggml_nrows(src0);

-    //const int n_past      = ((int32_t *) dst->op_params)[0];
-    const int n_dims      = ((int32_t *) dst->op_params)[1];
-    const int mode        = ((int32_t *) dst->op_params)[2];
-    const int n_ctx       = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
-
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode   = ((int32_t *) dst->op_params)[2];
+    const int n_ctx  = ((int32_t *) dst->op_params)[3];
    // RoPE alteration for extended context
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    float freq_base, freq_scale;
+    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);

    const int32_t * pos = nullptr;
    if ((mode & 1) == 0) {
@@ -6578,39 +6482,24 @@ inline void ggml_cuda_op_rope(
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

-    rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
-
    // compute
    if (is_glm) {
        GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, n_ctx, main_stream);
    } else if (is_neox) {
        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
+            rope_neox_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
+            rope_neox_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
        } else {
            GGML_ASSERT(false);
        }
    } else {
        if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
+            rope_cuda((const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
        } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
+            rope_cuda((const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, theta_scale, main_stream);
        } else {
            GGML_ASSERT(false);
        }
@@ -6721,10 +6610,8 @@ inline void ggml_cuda_op_clamp(
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+    const float min = ((float *) dst->op_params)[0];
+    const float max = ((float *) dst->op_params)[1];

    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
    CUDA_CHECK(cudaGetLastError());
@@ -6954,22 +6841,21 @@ static void ggml_cuda_op_mul_mat(
            src0_dd[id] = (char *) src0_extra->data_device[id];
        } else {
            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-            src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream);
+            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
        }

        if (src1_on_device && src1_is_contiguous) {
            src1_ddf[id] = (float *) src1_extra->data_device[id];
        } else {
-            src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream);
+            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
        }

        if (convert_src1_to_q8_1) {
-            const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
-            src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream);
+            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);

            if (src1_on_device && src1_is_contiguous) {
                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
-                // CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaGetLastError());
            }
        }

@@ -6977,7 +6863,7 @@ static void ggml_cuda_op_mul_mat(
            dst_dd[id] = (float *) dst_extra->data_device[id];
        } else {
            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-            dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id,  stream);
+            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
        }
    }

@@ -7103,6 +6989,24 @@ static void ggml_cuda_op_mul_mat(
        }
    }

+    for (int64_t id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        // free buffers again when done
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (src1_asq[id] > 0) {
+            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+        }
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+        }
+    }
+
    // main device waits for all other devices to be finished
    if (split && g_device_count > 1) {
        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
@@ -7120,21 +7024,6 @@ static void ggml_cuda_op_mul_mat(
        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
        CUDA_CHECK(cudaDeviceSynchronize());
    }
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (src0_as[id] > 0) {
-            ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]);
-        }
-        if (src1_asq[id] > 0) {
-            ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]);
-        }
-        if (dst_as[id] > 0) {
-            ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]);
-        }
-    }
 }

 static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7246,30 +7135,6 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }

-__global__ void k_compute_batched_ptrs(
-        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
-        const void ** ptrs_src, void ** ptrs_dst,
-        int ne12, int ne13,
-        int ne23,
-        int nb02, int nb03,
-        int nb12, int nb13,
-        int nb2, int nb3,
-        int r2, int r3) {
-    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int i03 = i13 / r3;
-    int i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)     dst_f16 + i12* nb2/2 + i13* nb3/2;
-}
-
 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
@@ -7321,11 +7186,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
    GGML_ASSERT(to_fp16_cuda != nullptr);

    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream);
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);

    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream);
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);

    GGML_ASSERT(ne12 % ne02 == 0);
    GGML_ASSERT(ne13 % ne03 == 0);
@@ -7371,55 +7236,57 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
    } else {
        // use cublasGemmBatchedEx
+        // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
        const int ne23 = ne12*ne13;

-        const void ** ptrs_src = nullptr;
-              void ** ptrs_dst = nullptr;
+        // TODO: avoid this alloc
+        void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));

-        size_t ptrs_src_s = 0;
-        size_t ptrs_dst_s = 0;
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;

-        ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream);
-        ptrs_dst = (      void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream);
+                ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3];
+                ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
+                ptrs[2*ne23 + i12 + i13*ne12] = (char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
+            }
+        }
+
+        // allocate device memory for pointers
+        void ** ptrs_as = nullptr;
+        CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
+
+        // TODO: this does not work for some reason -- not sure why?
+        //size_t ptrs_s = 0;
+        //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+
+        // copy pointers to device
+        CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
+
+        free(ptrs);

-        dim3 block_dims(ne13, ne12);
-        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
-                src0_as_f16, src1_as_f16, dst_f16,
-                ptrs_src, ptrs_dst,
-                ne12, ne13,
-                ne23,
-                nb02, nb03,
-                nb12, nb13,
-                dst->nb[2], dst->nb[3],
-                r2, r3);
-        CUDA_CHECK(cudaGetLastError());
        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
-                &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                            (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
-                &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+                &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16,  (      void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
                ne23,
                CUBLAS_COMPUTE_16F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));

-        if (ptrs_src_s != 0) {
-            ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream);
-        }
-        if (ptrs_dst_s != 0) {
-            ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream);
-        }
+        // free device memory for pointers
+        CUDA_CHECK(cudaFree(ptrs_as));
+        //ggml_cuda_pool_free(ptrs_as, ptrs_s);
    }
 #endif

    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-    if (src1_as != 0) {
-        ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream);
-    }
-    if (dst_as != 0) {
-        ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream);
-    }
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
 }

 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7455,7 +7322,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
    } else if (src0->type == GGML_TYPE_F32) {
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1001,15 +1001,11 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_SOFT_MAX:
                        {
-                            int nth = 32; // SIMD width
+                            const int nth = MIN(32, ne00);

                            if (ne00%4 == 0) {
                                [encoder setComputePipelineState:ctx->pipeline_soft_max_4];
                            } else {
-                                do {
-                                    nth *= 2;
-                                } while (nth <= ne00 && nth <= 1024);
-                                nth /= 2;
                                [encoder setComputePipelineState:ctx->pipeline_soft_max];
                            }
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -1017,9 +1013,8 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];

-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_DIAG_MASK_INF:
                        {
@@ -1400,18 +1395,14 @@ void ggml_metal_graph_compute(

                            const int nth = MIN(1024, ne00);

-                            const int n_past     = ((int32_t *) dst->op_params)[0];
-                            const int n_dims     = ((int32_t *) dst->op_params)[1];
-                            const int mode       = ((int32_t *) dst->op_params)[2];
-                            const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
+                            const int n_past = ((int32_t *) dst->op_params)[0];
+                            const int n_dims = ((int32_t *) dst->op_params)[1];
+                            const int mode   = ((int32_t *) dst->op_params)[2];

-                            float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-                            memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-                            memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-                            memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-                            memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-                            memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-                            memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+                            float freq_base;
+                            float freq_scale;
+                            memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+                            memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

                            switch (src0->type) {
                                case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_rope_f32]; break;
@@ -1419,35 +1410,30 @@ void ggml_metal_graph_compute(
                                default: GGML_ASSERT(false);
                            };

-                            [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
-                            [encoder setBuffer:id_dst      offset:offs_dst         atIndex:2];
-                            [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:6];
-                            [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:10];
-                            [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:14];
-                            [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:18];
-                            [encoder setBytes:&n_past      length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:20];
-                            [encoder setBytes:&mode        length:sizeof(     int) atIndex:21];
-                            [encoder setBytes:&n_orig_ctx  length:sizeof(     int) atIndex:22];
-                            [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
-                            [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
-                            [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
-                            [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
-                            [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
-                            [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
+                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1        atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:2];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:6];
+                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:10];
+                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:14];
+                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:18];
+                            [encoder setBytes:&n_past  length:sizeof(     int) atIndex:19];
+                            [encoder setBytes:&n_dims  length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&mode    length:sizeof(     int) atIndex:21];
+                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:22];
+                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -184,73 +184,36 @@ kernel void kernel_soft_max(
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
-        threadgroup float  * buf [[threadgroup(0)]],
-        uint  tgpig[[threadgroup_position_in_grid]],
-        uint  tpitg[[thread_position_in_threadgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint    ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = (tgpig) / (ne02*ne01);
-    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
-    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];

    device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
    device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

    // parallel max
-    float lmax = tpitg < ne00 ? psrc0[tpitg] : -INFINITY;
-
-    for (int i00 = tpitg + ntg; i00 < ne00; i00 += ntg) {
+    float lmax = tpitg[0] < ne00 ? psrc0[tpitg[0]] : -INFINITY;
+    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00; i00 += ntg[0]) {
        lmax = MAX(lmax, psrc0[i00]);
    }
-
-    float max = simd_max(lmax);
-    if (tiisg == 0) {
-        buf[sgitg] = max;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-       if (tpitg < i) {
-           buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-       }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    max = buf[0];
+    const float max = simd_max(lmax);

    // parallel sum
    float lsum = 0.0f;
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
        const float exp_psrc0 = exp(psrc0[i00] - max);
        lsum += exp_psrc0;
        // Remember the result of exp here. exp is expensive, so we really do not
-        // wish to compute it twice.
+        // wish to compute it twice.
        pdst[i00] = exp_psrc0;
    }

-    float sum = simd_sum(lsum);
-    if (tiisg == 0) {
-        buf[sgitg] = sum;
-    }
+    const float sum = simd_sum(lsum);

-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-       if (tpitg < i) {
-           buf[tpitg] += buf[tpitg + i];
-       }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    sum = buf[0];
-
-    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+    for (int i00 = tpitg[0]; i00 < ne00; i00 += ntg[0]) {
        pdst[i00] /= sum;
    }
 }
@@ -261,73 +224,37 @@ kernel void kernel_soft_max_4(
        constant   int64_t & ne00,
        constant   int64_t & ne01,
        constant   int64_t & ne02,
-        threadgroup float  * buf [[threadgroup(0)]],
-        uint  tgpig[[threadgroup_position_in_grid]],
-        uint  tpitg[[thread_position_in_threadgroup]],
-        uint  sgitg[[simdgroup_index_in_threadgroup]],
-        uint  tiisg[[thread_index_in_simdgroup]],
-        uint    ntg[[threads_per_threadgroup]]) {
-    const int64_t i03 = (tgpig) / (ne02*ne01);
-    const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01;
-    const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01);
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];

    device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
    device       float4 * pdst4 = (device       float4 *)(dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

    // parallel max
-    float4 lmax4 = tpitg < ne00/4 ? psrc4[tpitg] : -INFINITY;
-
-    for (int i00 = tpitg + ntg; i00 < ne00/4; i00 += ntg) {
+    float4 lmax4 = tpitg[0] < ne00/4 ? psrc4[tpitg[0]] : -INFINITY;
+    for (int i00 = tpitg[0] + ntg[0]; i00 < ne00/4; i00 += ntg[0]) {
        lmax4 = fmax(lmax4, psrc4[i00]);
    }
+    float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));

-    const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
-    float max = simd_max(lmax);
-    if (tiisg == 0) {
-        buf[sgitg] = max;
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-       if (tpitg < i) {
-           buf[tpitg] = MAX(buf[tpitg], buf[tpitg + i]);
-       }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    max = buf[0];
+    const float max = simd_max(lmax);

    // parallel sum
    float4 lsum4 = 0.0f;
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
        const float4 exp_psrc4 = exp(psrc4[i00] - max);
        lsum4 += exp_psrc4;
        pdst4[i00] = exp_psrc4;
    }
+    float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];

-    const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
-    float sum = simd_sum(lsum);
-    if (tiisg == 0) {
-        buf[sgitg] = sum;
-    }
+    const float sum = simd_sum(lsum);

-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    // broadcast, simd group number is ntg / 32
-    for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
-       if (tpitg < i) {
-           buf[tpitg] += buf[tpitg + i];
-       }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    sum = buf[0];
-
-    for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
+    for (int i00 = tpitg[0]; i00 < ne00/4; i00 += ntg[0]) {
        pdst4[i00] /= sum;
    }
 }
@@ -347,7 +274,7 @@ kernel void kernel_diag_mask_inf(
        dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
    } else {
        dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
-    }
+     }
 }

 kernel void kernel_diag_mask_inf_8(
@@ -1061,45 +988,6 @@ kernel void kernel_alibi_f32(
    }
 }

-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / max(0.001f, high - low);
-    return 1.0f - min(1.0f, max(0.0f, y));
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
-    thread float * cos_theta, thread float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
-    }
-    *cos_theta = cos(theta) * mscale;
-    *sin_theta = sin(theta) * mscale;
-}
-
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) {
-    return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base));
-}
-
-static void rope_yarn_corr_dims(
-    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
-) {
-    // start and end correction dims
-    dims[0] = max(0.0f,         floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base)));
-    dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base)));
-}
-
 typedef void (rope_t)(
        device const    void * src0,
        device const int32_t * src1,
@@ -1123,13 +1011,8 @@ typedef void (rope_t)(
        constant         int & n_past,
        constant         int & n_dims,
        constant         int & mode,
-        constant         int & n_orig_ctx,
        constant       float & freq_base,
        constant       float & freq_scale,
-        constant       float & ext_factor,
-        constant       float & attn_factor,
-        constant       float & beta_fast,
-        constant       float & beta_slow,
        uint  tiitg[[thread_index_in_threadgroup]],
        uint3 tptg[[threads_per_threadgroup]],
        uint3 tgpig[[threadgroup_position_in_grid]]);
@@ -1158,13 +1041,8 @@ kernel void kernel_rope(
        constant         int & n_past,
        constant         int & n_dims,
        constant         int & mode,
-        constant         int & n_orig_ctx,
        constant       float & freq_base,
        constant       float & freq_scale,
-        constant       float & ext_factor,
-        constant       float & attn_factor,
-        constant       float & beta_fast,
-        constant       float & beta_slow,
        uint  tiitg[[thread_index_in_threadgroup]],
        uint3 tptg[[threads_per_threadgroup]],
        uint3 tgpig[[threadgroup_position_in_grid]]) {
@@ -1174,22 +1052,19 @@ kernel void kernel_rope(

    const bool is_neox = mode & 2;

-    float corr_dims[2];
-    rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);
-
    device const int32_t * pos = src1;

    const int64_t p = pos[i2];

-    const float theta_0 = (float)p;
+    const float theta_0 = freq_scale * (float)p;
    const float inv_ndims = -1.f/n_dims;

    if (!is_neox) {
        for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {

            const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
-            float cos_theta, sin_theta;
-            rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+            const float cos_theta = cos(theta);
+            const float sin_theta = sin(theta);

            device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
            device       T * dst_data  = (device T *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -1204,12 +1079,9 @@ kernel void kernel_rope(
        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
            for (int64_t ic = 2*tiitg; ic < n_dims; ic += 2*tptg.x) {

-                // simplified from `(ib * n_dims + ic) * inv_ndims`
-                const float cur_rot = inv_ndims*ic - ib;
-
-                const float theta = theta_0 * pow(freq_base, cur_rot);
-                float cos_theta, sin_theta;
-                rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+                const float theta = theta_0 * pow(freq_base, inv_ndims*ic - ib);
+                const float cos_theta = cos(theta);
+                const float sin_theta = sin(theta);

                const int64_t i0 = ib*n_dims + ic/2;

--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -716,7 +716,6 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
    }
 #else
-    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_0_reference(x, y, k);
 #endif
@@ -970,7 +969,6 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
        y[i].s = sum*d;
    }
 #else
-    GGML_UNUSED(nb);
    // scalar
    quantize_row_q8_1_reference(x, y, k);
 #endif
--- a/ggml.c
+++ b/ggml.c
@@ -1,5 +1,4 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
-#define _USE_MATH_DEFINES // For M_PI on MSVC

 #include "ggml-impl.h"
 #include "ggml-quants.h"
@@ -3154,7 +3153,7 @@ static struct ggml_tensor * ggml_add_cast_impl(
    // TODO: support less-strict constraint
    //       GGML_ASSERT(ggml_can_repeat(b, a));
    GGML_ASSERT(ggml_can_repeat_rows(b, a));
-    GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16
+    GGML_ASSERT(ggml_is_quantized(a->type)); // currently only supported for quantized input

    bool is_node = false;

@@ -4846,13 +4845,8 @@ static struct ggml_tensor * ggml_rope_impl(
        int                   n_dims,
        int                   mode,
        int                   n_ctx,
-        int                   n_orig_ctx,
        float                 freq_base,
        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow,
        float                 xpos_base,
        bool                  xpos_down,
        bool                  inplace) {
@@ -4868,15 +4862,11 @@ static struct ggml_tensor * ggml_rope_impl(

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
-    memcpy(params +  5, &freq_base,    sizeof(float));
-    memcpy(params +  6, &freq_scale,   sizeof(float));
-    memcpy(params +  7, &ext_factor,   sizeof(float));
-    memcpy(params +  8, &attn_factor,  sizeof(float));
-    memcpy(params +  9, &beta_fast,    sizeof(float));
-    memcpy(params + 10, &beta_slow,    sizeof(float));
-    memcpy(params + 11, &xpos_base,    sizeof(float));
-    memcpy(params + 12, &xpos_down,    sizeof(bool));
+    int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
+    memcpy(params + 4, &freq_base,  sizeof(float));
+    memcpy(params + 5, &freq_scale, sizeof(float));
+    memcpy(params + 6, &xpos_base,  sizeof(float));
+    memcpy(params + 7, &xpos_down,  sizeof(bool));
    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_ROPE;
@@ -4894,9 +4884,7 @@ struct ggml_tensor * ggml_rope(
        int                   n_dims,
        int                   mode,
        int                   n_ctx) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false
-    );
+    return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, false);
 }

 struct ggml_tensor * ggml_rope_inplace(
@@ -4906,9 +4894,7 @@ struct ggml_tensor * ggml_rope_inplace(
        int                   n_dims,
        int                   mode,
        int                   n_ctx) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true
-    );
+    return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, 10000.0f, 1.0f, 0.0f, false, true);
 }

 struct ggml_tensor * ggml_rope_custom(
@@ -4918,17 +4904,9 @@ struct ggml_tensor * ggml_rope_custom(
        int                   n_dims,
        int                   mode,
        int                   n_ctx,
-        int                   n_orig_ctx,
        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false
-    );
+        float                 freq_scale) {
+    return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, false);
 }

 struct ggml_tensor * ggml_rope_custom_inplace(
@@ -4938,17 +4916,9 @@ struct ggml_tensor * ggml_rope_custom_inplace(
        int                   n_dims,
        int                   mode,
        int                   n_ctx,
-        int                   n_orig_ctx,
        float                 freq_base,
-        float                 freq_scale,
-        float                 ext_factor,
-        float                 attn_factor,
-        float                 beta_fast,
-        float                 beta_slow) {
-    return ggml_rope_impl(
-        ctx, a, b, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale,
-        ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true
-    );
+        float                 freq_scale) {
+    return ggml_rope_impl(ctx, a, b, n_dims, mode, n_ctx, freq_base, freq_scale, 0.0f, false, true);
 }

 struct ggml_tensor * ggml_rope_xpos_inplace(
@@ -4958,7 +4928,7 @@ struct ggml_tensor * ggml_rope_xpos_inplace(
        int                   n_dims,
        float                 base,
        bool                  down) {
-    return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true);
+    return ggml_rope_impl(ctx, a, b, n_dims, 0, 0, 10000.0f, 1.0f, base, down, true);
 }

 // ggml_rope_back
@@ -6957,15 +6927,9 @@ static void ggml_compute_forward_add_f16_f32(

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F16);

-    if (dst->type == GGML_TYPE_F32) {
-        GGML_ASSERT( nb0 == sizeof(float));
-    }
-    else {
-        GGML_ASSERT(dst->type  == GGML_TYPE_F16);
-        GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    }
-
+    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));

    // rows per thread
@@ -6976,35 +6940,18 @@ static void ggml_compute_forward_add_f16_f32(
    const int ir1 = MIN(ir0 + dr, nr);

    if (nb10 == sizeof(float)) {
-        if (dst->type == GGML_TYPE_F16) {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int ir = ir0; ir < ir1; ++ir) {
+            // src0, src1 and dst are same shape => same indices
+            const int i3 = ir/(ne2*ne1);
+            const int i2 = (ir - i3*ne2*ne1)/ne1;
+            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);

-                ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
+            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
+            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+            float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);

-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
-                }
-            }
-        } else {
-            for (int ir = ir0; ir < ir1; ++ir) {
-                // src0, src1 and dst are same shape => same indices
-                const int i3 = ir/(ne2*ne1);
-                const int i2 = (ir - i3*ne2*ne1)/ne1;
-                const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
-
-                float *       dst_ptr  = (float *)       ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1);
-                ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-                float *       src1_ptr = (float *)       ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
-
-                for (int i = 0; i < ne0; i++) {
-                    dst_ptr[i] = GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i];
-                }
+            for (int i = 0; i < ne0; i++) {
+                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + src1_ptr[i]);
            }
        }
    }
@@ -10931,45 +10878,6 @@ static void ggml_compute_forward_clamp(

 // ggml_compute_forward_rope

-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
-    return 1 - MIN(1, MAX(0, y));
-}
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
-    }
-    *cos_theta = cosf(theta) * mscale;
-    *sin_theta = sinf(theta) * mscale;
-}
-
-// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
-// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
-static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) {
-    return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
-}
-
-void ggml_rope_yarn_corr_dims(
-    int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
-) {
-    // start and end correction dims
-    dims[0] = MAX(0,         floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)));
-    dims[1] = MIN(n_dims - 1, ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)));
-}
-
 static void ggml_compute_forward_rope_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@@ -10979,26 +10887,21 @@ static void ggml_compute_forward_rope_f32(
        return;
    }

-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    float freq_base;
+    float freq_scale;

    // these two only relevant for xPos RoPE:
    float xpos_base;
    bool  xpos_down;

-    //const int n_past     = ((int32_t *) dst->op_params)[0];
-    const int n_dims     = ((int32_t *) dst->op_params)[1];
-    const int mode       = ((int32_t *) dst->op_params)[2];
-    const int n_ctx      = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-    memcpy(&xpos_base,   (int32_t *) dst->op_params + 11, sizeof(float));
-    memcpy(&xpos_down,   (int32_t *) dst->op_params + 12, sizeof(bool));
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode   = ((int32_t *) dst->op_params)[2];
+    const int n_ctx  = ((int32_t *) dst->op_params)[3];
+    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));

    GGML_TENSOR_UNARY_OP_LOCALS

@@ -11026,9 +10929,6 @@ static void ggml_compute_forward_rope_f32(
    int ir = 0;

    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
@@ -11042,18 +10942,18 @@ static void ggml_compute_forward_rope_f32(
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;

-                float theta_base = (float)p;
+                float theta = freq_scale * (float)p;

                if (is_glm) {
-                    theta_base = MIN(p, n_ctx - 2);
+                    theta = MIN(p, n_ctx - 2);
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
                        const float cos_block_theta = cosf(block_theta);
                        const float sin_block_theta = sinf(block_theta);

-                        theta_base *= theta_scale;
+                        theta *= theta_scale;
                        block_theta *= theta_scale;

                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -11071,16 +10971,13 @@ static void ggml_compute_forward_rope_f32(
                    }
                } else if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        float cos_theta, sin_theta;
-                        rope_yarn(
-                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                        );
-
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
                        // zeta scaling for xPos only:
                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
                        if (xpos_down) zeta = 1.0f / zeta;

-                        theta_base *= theta_scale;
+                        theta *= theta_scale;

                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -11094,19 +10991,12 @@ static void ggml_compute_forward_rope_f32(
                } else {
                    // TODO: this might be wrong for ne0 != n_dims - need double check
                    // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
-                    theta_base *= freq_scale;
                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
+                            const float cos_theta = cosf(theta);
+                            const float sin_theta = sinf(theta);

-                            float cos_theta, sin_theta;
-                            rope_yarn(
-                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
-                                &cos_theta, &sin_theta
-                            );
-
-                            theta_base *= theta_scale;
+                            theta *= theta_scale;

                            const int64_t i0 = ib*n_dims + ic/2;

@@ -11135,19 +11025,15 @@ static void ggml_compute_forward_rope_f16(
        return;
    }

-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    float freq_base;
+    float freq_scale;

-    //const int n_past     = ((int32_t *) dst->op_params)[0];
-    const int n_dims     = ((int32_t *) dst->op_params)[1];
-    const int mode       = ((int32_t *) dst->op_params)[2];
-    const int n_ctx      = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_dims = ((int32_t *) dst->op_params)[1];
+    const int mode   = ((int32_t *) dst->op_params)[2];
+    const int n_ctx  = ((int32_t *) dst->op_params)[3];
+    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

    GGML_TENSOR_UNARY_OP_LOCALS

@@ -11175,9 +11061,6 @@ static void ggml_compute_forward_rope_f16(
    int ir = 0;

    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.f/n_dims;
-    float corr_dims[2];
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);

    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;
@@ -11191,18 +11074,18 @@ static void ggml_compute_forward_rope_f16(
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;

-                float theta_base = (float)p;
+                float theta = freq_scale * (float)p;

                if (is_glm) {
-                    theta_base = MIN(p, n_ctx - 2);
+                    theta = MIN(p, n_ctx - 2);
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
                        const float cos_block_theta = cosf(block_theta);
                        const float sin_block_theta = sinf(block_theta);

-                        theta_base *= theta_scale;
+                        theta *= theta_scale;
                        block_theta *= theta_scale;

                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
@@ -11220,12 +11103,10 @@ static void ggml_compute_forward_rope_f16(
                    }
                } else if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        float cos_theta, sin_theta;
-                        rope_yarn(
-                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
-                        );
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);

-                        theta_base *= theta_scale;
+                        theta *= theta_scale;

                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -11239,19 +11120,12 @@ static void ggml_compute_forward_rope_f16(
                } else {
                    // TODO: this might be wrong for ne0 != n_dims - need double check
                    // ref:  https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
-                    theta_base *= freq_scale;
                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            // simplified from `(ib * n_dims + ic) * inv_ndims`
-                            float cur_rot = inv_ndims * ic - ib;
+                            const float cos_theta = cosf(theta);
+                            const float sin_theta = sinf(theta);

-                            float cos_theta, sin_theta;
-                            rope_yarn(
-                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
-                                &cos_theta, &sin_theta
-                            );
-
-                            theta_base *= theta_scale;
+                            theta *= theta_scale;

                            const int64_t i0 = ib*n_dims + ic/2;

@@ -11359,18 +11233,17 @@ static void ggml_compute_forward_rope_back_f32(
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;

-                float theta_base = freq_scale * (float)p;
+                float theta = freq_scale * (float)p;

                if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);
                        // zeta scaling for xPos only:
                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
                        if (xpos_down) zeta = 1.0f / zeta;

-                        theta_base *= theta_scale;
+                        theta *= theta_scale;

                        const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                              float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -11384,10 +11257,10 @@ static void ggml_compute_forward_rope_back_f32(
                } else {
                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
+                            const float cos_theta = cosf(theta);
+                            const float sin_theta = sinf(theta);

-                            theta_base *= theta_scale;
+                            theta *= theta_scale;

                            const int64_t i0 = ib*n_dims + ic/2;

@@ -11460,14 +11333,14 @@ static void ggml_compute_forward_rope_back_f16(
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;

-                float theta_base = (float)p;
+                float theta = (float)p;

                if (!is_neox) {
                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float cos_theta = cosf(theta);
+                        const float sin_theta = sinf(theta);

-                        theta_base *= theta_scale;
+                        theta *= theta_scale;

                        const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                              ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
@@ -11481,10 +11354,10 @@ static void ggml_compute_forward_rope_back_f16(
                } else {
                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
+                            const float cos_theta = cosf(theta);
+                            const float sin_theta = sinf(theta);

-                            theta_base *= theta_scale;
+                            theta *= theta_scale;

                            const int64_t i0 = ib*n_dims + ic/2;

@@ -15609,14 +15482,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                src1,
                                n_dims,
                                mode,
-                                0,
                                n_ctx,
                                freq_base,
                                freq_scale,
-                                0.0f,
-                                1.0f,
-                                0.0f,
-                                0.0f,
                                xpos_base,
                                xpos_down,
                                false),
@@ -18811,7 +18679,8 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
    return n == size;
 }

-static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
+// NOTE: the _cur/_v1 reader split is temporary handling of GGUFv1 >> remove after Oct 2023
+static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n    = 0;
    p->data = NULL;

@@ -18823,6 +18692,19 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
    return ok;
 }

+// GGUFv1 string reader: 32-bit length prefix, then the bytes; returns false on a short read or failed allocation
+static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
+    p->n    = 0;
+    p->data = NULL;
+
+    uint32_t n = 0;
+    if (!gguf_fread_el(file, &n, sizeof(n), offset) || n == UINT32_MAX) { return false; } // n + 1 below must not wrap to 0
+    p->data = calloc((size_t) n + 1, 1); // zero-filled, so the extra trailing byte stays 0
+    if (p->data == NULL) { return false; }
+    p->n = n;
+    return gguf_fread_el(file, p->data, p->n, offset);
+}
+
 struct gguf_context * gguf_init_empty(void) {
    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

@@ -18881,14 +18763,20 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        ctx->data  = NULL;

        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
-        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
-        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);

        if (ctx->header.version == 1) {
-            fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
-            fclose(file);
-            gguf_free(ctx);
-            return NULL;
+            // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+            uint32_t n_tensors = 0;
+            uint32_t n_kv      = 0;
+
+            ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
+            ok = ok && gguf_fread_el(file, &n_kv,      sizeof(n_kv),      &offset);
+
+            ctx->header.n_tensors = n_tensors;
+            ctx->header.n_kv      = n_kv;
+        } else {
+            ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+            ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
        }

        if (!ok) {
@@ -18899,6 +18787,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        }
    }

+    // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+    bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
+    if (ctx->header.version == 1) {
+        gguf_fread_str = gguf_fread_str_v1;
+    }
+
    // read the kv pairs
    {
        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
@@ -18929,7 +18823,15 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                case GGUF_TYPE_ARRAY:
                    {
                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
-                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n), &offset);
+
+                        if (ctx->header.version == 1) {
+                            // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+                            uint32_t n = 0;
+                            ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
+                            kv->value.arr.n = n;
+                        } else {
+                            ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
+                        }

                        switch (kv->value.arr.type) {
                            case GGUF_TYPE_UINT8:
@@ -18988,7 +18890,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            ok = ok && gguf_fread_str(file, &info->name,                          &offset);
            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
            for (uint32_t j = 0; j < info->n_dims; ++j) {
-                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+                if (ctx->header.version == 1) {
+                    // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
+                    uint32_t t = 0;
+                    ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
+                    info->ne[j] = t;
+                } else {
+                    ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+                }
            }
            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
--- a/ggml.h
+++ b/ggml.h
@@ -219,7 +219,7 @@
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
 #define GGML_MAX_NAME          64
-#define GGML_MAX_OP_PARAMS     64
+#define GGML_MAX_OP_PARAMS     32
 #define GGML_DEFAULT_N_THREADS 4

 #if UINTPTR_MAX == 0xFFFFFFFF
@@ -1326,13 +1326,8 @@ extern "C" {
            int                   n_dims,
            int                   mode,
            int                   n_ctx,
-            int                   n_orig_ctx,
            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow);
+            float                 freq_scale);

    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
@@ -1342,17 +1337,8 @@ extern "C" {
            int                   n_dims,
            int                   mode,
            int                   n_ctx,
-            int                   n_orig_ctx,
            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow);
-
-    // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
-        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+            float                 freq_scale);

    // xPos RoPE, in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -7,7 +7,7 @@ import shutil
 import struct
 import sys
 import tempfile
-from enum import Enum, IntEnum, auto
+from enum import IntEnum, auto
 from io import BufferedWriter
 from pathlib import Path
 from typing import IO, Any, BinaryIO, Callable, Sequence
@@ -53,12 +53,9 @@ KEY_ATTENTION_LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
 KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"

 # RoPE
-KEY_ROPE_DIMENSION_COUNT         = "{arch}.rope.dimension_count"
-KEY_ROPE_FREQ_BASE               = "{arch}.rope.freq_base"
-KEY_ROPE_SCALING_TYPE            = "{arch}.rope.scaling.type"
-KEY_ROPE_SCALING_FACTOR          = "{arch}.rope.scaling.factor"
-KEY_ROPE_SCALING_ORIG_CTX_LEN    = "{arch}.rope.scaling.original_context_length"
-KEY_ROPE_SCALING_FINETUNED       = "{arch}.rope.scaling.finetuned"
+KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_FREQ_BASE       = "{arch}.rope.freq_base"
+KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"

 # tokenization
 KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
@@ -580,11 +577,6 @@ class TokenType(IntEnum):
    UNUSED       = 5
    BYTE         = 6

-class RopeScalingType(Enum):
-    NONE   = 'none'
-    LINEAR = 'linear'
-    YARN   = 'yarn'
-
 #
 # implementation
 #
@@ -956,17 +948,8 @@ class GGUFWriter:
    def add_rope_freq_base(self, value: float):
        self.add_float32(KEY_ROPE_FREQ_BASE.format(arch=self.arch), value)

-    def add_rope_scaling_type(self, value: RopeScalingType):
-        self.add_string(KEY_ROPE_SCALING_TYPE.format(arch=self.arch), value.value)
-
-    def add_rope_scaling_factor(self, value: float):
-        self.add_float32(KEY_ROPE_SCALING_FACTOR.format(arch=self.arch), value)
-
-    def add_rope_scaling_orig_ctx_len(self, value: int):
-        self.add_uint32(KEY_ROPE_SCALING_ORIG_CTX_LEN.format(arch=self.arch), value)
-
-    def add_rope_scaling_finetuned(self, value: bool):
-        self.add_bool(KEY_ROPE_SCALING_FINETUNED.format(arch=self.arch), value)
+    def add_rope_scale_linear(self, value: float):
+        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)

    def add_tokenizer_model(self, model: str):
        self.add_string(KEY_TOKENIZER_MODEL, model)
--- a/llama.cpp
+++ b/llama.cpp
@@ -54,7 +54,6 @@
 #include <cassert>
 #include <cinttypes>
 #include <climits>
-#include <cmath>
 #include <cstdarg>
 #include <cstddef>
 #include <cstdint>
@@ -236,10 +235,6 @@ enum llm_kv {
    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,
-    LLM_KV_ROPE_SCALING_TYPE,
-    LLM_KV_ROPE_SCALING_FACTOR,
-    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
-    LLM_KV_ROPE_SCALING_FINETUNED,

    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_LIST,
@@ -281,13 +276,9 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_LAYERNORM_EPS,       "%s.attention.layer_norm_epsilon"     },
    { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,   "%s.attention.layer_norm_rms_epsilon" },

-    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count"                 },
-    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"                       },
-    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"                    },
-    { LLM_KV_ROPE_SCALING_TYPE,             "%s.rope.scaling.type"                    },
-    { LLM_KV_ROPE_SCALING_FACTOR,           "%s.rope.scaling.factor"                  },
-    { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,     "%s.rope.scaling.original_context_length" },
-    { LLM_KV_ROPE_SCALING_FINETUNED,        "%s.rope.scaling.finetuned"               },
+    { LLM_KV_ROPE_DIMENSION_COUNT,          "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_FREQ_BASE,                "%s.rope.freq_base"       },
+    { LLM_KV_ROPE_SCALE_LINEAR,             "%s.rope.scale_linear"    },

    { LLM_KV_TOKENIZER_MODEL,               "tokenizer.ggml.model"              },
    { LLM_KV_TOKENIZER_LIST,                "tokenizer.ggml.tokens"             },
@@ -561,22 +552,6 @@ do { \
    } \
 } while (0)

-static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_NONE,   "none"   },
-    { LLAMA_ROPE_SCALING_LINEAR, "linear" },
-    { LLAMA_ROPE_SCALING_YARN,   "yarn"   },
-};
-
-static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
-    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
-        if (kv.second == name) {
-            return kv.first;
-        }
-    }
-
-    return LLAMA_ROPE_SCALING_UNSPECIFIED;
-}
-
 //
 // ggml helpers
 //
@@ -1060,11 +1035,8 @@ struct llama_hparams {
    float f_norm_eps;
    float f_norm_rms_eps;

-    float    rope_freq_base_train;
-    float    rope_freq_scale_train;
-    uint32_t n_yarn_orig_ctx;
-    int8_t   rope_scaling_type_train : 3;
-    bool     rope_finetuned : 1;
+    float rope_freq_base_train;
+    float rope_freq_scale_train;

    float f_clamp_kqv;
    float f_max_alibi_bias;
@@ -1079,8 +1051,6 @@ struct llama_hparams {
        if (this->n_layer     != other.n_layer)     return true;
        if (this->n_rot       != other.n_rot)       return true;
        if (this->n_ff        != other.n_ff)        return true;
-        if (this->rope_finetuned  != other.rope_finetuned)  return true;
-        if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

        const float EPSILON = 1e-9;

@@ -1111,16 +1081,8 @@ struct llama_cparams {
    uint32_t n_threads;       // number of threads to use for generation
    uint32_t n_threads_batch; // number of threads to use for batch processing

-    float    rope_freq_base;
-    float    rope_freq_scale;
-
-    uint32_t n_yarn_orig_ctx;
-    // These hyperparameters are not exposed in GGUF, because all
-    // existing YaRN models use the same values for them.
-    float yarn_ext_factor;
-    float yarn_attn_factor;
-    float yarn_beta_fast;
-    float yarn_beta_slow;
+    float rope_freq_base;
+    float rope_freq_scale;

    bool mul_mat_q;
 };
@@ -1837,12 +1799,6 @@ struct llama_model_loader {
            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
        }

-        if (backend == GGML_BACKEND_GPU_SPLIT) {
-            if (ne.size() == 1) {
-                throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
-            }
-        }
-
        {
            bool is_ok = true;
            for (size_t i = 0; i < ne.size(); ++i) {
@@ -2058,30 +2014,14 @@ static void llm_load_hparams(
    hparams.n_head_kv = hparams.n_head;
    GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

-    hparams.rope_finetuned = false;
-    GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
-                 kv(LLM_KV_ROPE_SCALING_FINETUNED));
-
-    hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
-    GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
-                 kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
-
    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));

-    std::string rope_scaling("linear");
-    GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
-    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
-    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
-
    // rope_freq_scale (inverse of the kv) is optional
-    float ropescale = 0.0f;
-    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
-    if (ropescale == 0.0f) { // try the old key name
-        GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-    }
-    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
+    float ropescale = 1.0f;
+    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    hparams.rope_freq_scale_train = 1.0f/ropescale;

    // sanity check for n_rot (optional)
    {
@@ -2431,8 +2371,6 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    const auto & hparams = model.hparams;
    const auto & vocab   = model.vocab;

-    const auto rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
-
    // hparams
    LLAMA_LOG_INFO("%s: format           = %s\n",     __func__, llama_file_version_name(ml.fver));
    LLAMA_LOG_INFO("%s: arch             = %s\n",     __func__, LLM_ARCH_NAMES.at(model.arch).c_str());
@@ -2451,11 +2389,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    LLAMA_LOG_INFO("%s: f_clamp_kqv      = %.1e\n",   __func__, hparams.f_clamp_kqv);
    LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n",   __func__, hparams.f_max_alibi_bias);
    LLAMA_LOG_INFO("%s: n_ff             = %u\n",     __func__, hparams.n_ff);
-    LLAMA_LOG_INFO("%s: rope scaling     = %s\n",     __func__, rope_scaling_type.c_str());
    LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
    LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
-    LLAMA_LOG_INFO("%s: n_yarn_orig_ctx  = %u\n",     __func__, hparams.n_yarn_orig_ctx);
-    LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
    LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
    LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
@@ -2823,8 +2758,8 @@ static void llm_load_tensors(
                        layer.ffn_down   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
                        layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend);

-                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, backend_split);
-                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),           {n_ff}, backend);
+                        layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend);

                        if (backend == GGML_BACKEND_GPU) {
                            vram_weights +=
@@ -2883,13 +2818,13 @@ static void llm_load_tensors(
                        layer.attn_norm     = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, backend);
                        layer.attn_norm_b   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias",   i), {n_embd}, backend);
                        layer.wqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                        layer.bqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "bias",   i), {n_embd + 2*n_embd_gqa},         backend);
+                        layer.bqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "bias",   i), {n_embd + 2*n_embd_gqa},         backend_split);
                        layer.wo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "weight", i), {n_embd, n_embd},   backend_split);
-                        layer.bo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "bias",   i), {n_embd},           backend);
+                        layer.bo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "bias",   i), {n_embd},           backend_split);
                        layer.ffn_down      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "weight", i), {n_ff, n_embd}, backend_split);
-                        layer.ffn_down_b    = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "bias",   i), {n_embd},       backend);
+                        layer.ffn_down_b    = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "bias",   i), {n_embd},       backend_split);
                        layer.ffn_up        = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "weight", i), {n_embd,   n_ff}, backend_split);
-                        layer.ffn_up_b      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "bias",   i), {n_ff},           backend);
+                        layer.ffn_up_b      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "bias",   i), {n_ff},           backend_split);
                        layer.ffn_norm      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, backend);
                        layer.ffn_norm_b    = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM,    "bias",   i), {n_embd}, backend);
                        layer.attn_q_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
@@ -2955,19 +2890,19 @@ static void llm_load_tensors(
                        layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, backend);

                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend);
+                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend_split);

                        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},                backend_split);
-                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},                        backend);
+                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},                        backend_split);

                        layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
                        layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);

                        layer.ffn_down   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-                        layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend);
+                        layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend_split);

-                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
-                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend);
+                        layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend_split);

                        if (backend == GGML_BACKEND_GPU) {
                            vram_weights +=
@@ -3112,11 +3047,21 @@ static void llm_load_tensors(
    model.t_load_us = ggml_time_us() - model.t_start_us;
 }

-static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+static bool llama_model_load(
+        const std::string & fname,
+        llama_model & model,
+        int n_gpu_layers,
+        int main_gpu,
+        const float * tensor_split,
+        bool use_mmap,
+        bool use_mlock,
+        bool vocab_only,
+        llama_progress_callback progress_callback,
+        void *progress_callback_user_data) {
    try {
-        llama_model_loader ml(fname, params.use_mmap);
+        llama_model_loader ml(fname, use_mmap);

-        model.hparams.vocab_only = params.vocab_only;
+        model.hparams.vocab_only = vocab_only;

        llm_load_arch   (ml, model);
        llm_load_hparams(ml, model);
@@ -3128,15 +3073,15 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
            throw std::runtime_error("vocab size mismatch");
        }

-        if (params.vocab_only) {
+        if (vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
            return true;
        }

        llm_load_tensors(
-            ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
-            params.progress_callback, params.progress_callback_user_data
-        );
+                ml, model, n_gpu_layers,
+                main_gpu, tensor_split,
+                use_mlock, progress_callback, progress_callback_user_data);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
        return false;
@@ -3205,7 +3150,6 @@ static struct ggml_tensor * llm_build_inp_embd(
 static void llm_build_k_shift(
      struct ggml_context * ctx,
      const llama_hparams & hparams,
-      const llama_cparams & cparams,
     const llama_kv_cache & kv,
       struct ggml_cgraph * graph,
            llm_rope_type   type,
@@ -3218,11 +3162,6 @@ static void llm_build_k_shift(
    const int64_t n_head_kv   = hparams.n_head_kv;
    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
    const int64_t n_embd_head = hparams.n_embd_head();
-    const int32_t n_orig_ctx  = cparams.n_yarn_orig_ctx;
-    const float   ext_factor  = cparams.yarn_ext_factor;
-    const float   attn_factor = cparams.yarn_attn_factor;
-    const float   beta_fast   = cparams.yarn_beta_fast;
-    const float   beta_slow   = cparams.yarn_beta_slow;

    GGML_ASSERT(n_embd_head % n_rot == 0);

@@ -3246,8 +3185,7 @@ static void llm_build_k_shift(
                        ggml_element_size(kv.k)*n_embd_head,
                        ggml_element_size(kv.k)*n_embd_gqa,
                        ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
-                    K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
+                    K_shift, n_rot, rope_type, 0, freq_base, freq_scale);
        cb(tmp, "K_shifted", il);
        ggml_build_forward_expand(graph, tmp);
    }
@@ -3407,6 +3345,7 @@ static struct ggml_tensor * llm_build_ffn(
 // if max_alibi_bias > 0 then apply ALiBi
 static struct ggml_tensor * llm_build_kqv(
        struct ggml_context * ctx,
+         struct ggml_tensor * cur,
        const llama_hparams & hparams,
       const llama_kv_cache & kv,
         struct ggml_tensor * wo,
@@ -3472,7 +3411,7 @@ static struct ggml_tensor * llm_build_kqv(
    struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
    cb(kqv_merged, "kqv_merged", il);

-    struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
+    cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens);
    cb(cur, "kqv_merged_cont", il);

    cur = ggml_mul_mat(ctx, wo, cur);
@@ -3504,17 +3443,12 @@ struct llm_build_context {

    const float freq_base;
    const float freq_scale;
-    const float ext_factor;
-    const float attn_factor;
-    const float beta_fast;
-    const float beta_slow;
    const float norm_eps;
    const float norm_rms_eps;

    const int32_t n_tokens;
    const int32_t n_kv;     // size of KV cache to consider (n_kv <= n_ctx)
    const int32_t kv_head;  // index of where we store new KV data in the cache
-    const int32_t n_orig_ctx;

    const bool do_rope_shift;

@@ -3544,16 +3478,11 @@ struct llm_build_context {
        n_embd_gqa    (hparams.n_embd_gqa()),
        freq_base     (cparams.rope_freq_base),
        freq_scale    (cparams.rope_freq_scale),
-        ext_factor    (cparams.yarn_ext_factor),
-        attn_factor   (cparams.yarn_attn_factor),
-        beta_fast     (cparams.yarn_beta_fast),
-        beta_slow     (cparams.yarn_beta_slow),
        norm_eps      (hparams.f_norm_eps),
        norm_rms_eps  (hparams.f_norm_rms_eps),
        n_tokens      (batch.n_tokens),
        n_kv          (worst_case ? n_ctx            : kv_self.n),
        kv_head       (worst_case ? n_ctx - n_tokens : kv_self.head),
-        n_orig_ctx    (cparams.n_yarn_orig_ctx),
        do_rope_shift (worst_case || kv_self.has_shift),
        cb            (cb),
        buf_compute   (lctx.buf_compute) {
@@ -3604,7 +3533,7 @@ struct llm_build_context {

        // shift the entire K-cache if needed
        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
        }

        for (int il = 0; il < n_layer; ++il) {
@@ -3628,23 +3557,15 @@ struct llm_build_context {
                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                cb(Vcur, "Vcur", il);

-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos,
-                    n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
+                Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
                cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                    n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
+                Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
                cb(Kcur, "Kcur", il);

                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, cur, hparams, kv_self,
                        model.layers[il].wo, NULL,
                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                cb(cur, "kqv_out", il);
@@ -3714,7 +3635,7 @@ struct llm_build_context {

        // shift the entire K-cache if needed
        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
        }

        for (int il = 0; il < n_layer; ++il) {
@@ -3738,16 +3659,8 @@ struct llm_build_context {

                switch (model.type) {
                    case MODEL_7B:
-                        Qcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-                            n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
-                            ext_factor, attn_factor, beta_fast, beta_slow
-                        );
-                        Kcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-                            n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
-                            ext_factor, attn_factor, beta_fast, beta_slow
-                        );
+                        Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens),    inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
+                        Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale);
                        break;
                    case MODEL_13B:
                        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens);
@@ -3764,7 +3677,7 @@ struct llm_build_context {
                // apply ALiBi for 13B model
                const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;

-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, cur, hparams, kv_self,
                        model.layers[il].wo, NULL,
                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, cb, il);
                cb(cur, "kqv_out", il);
@@ -3834,7 +3747,7 @@ struct llm_build_context {

        // shift the entire K-cache if needed
        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
        }

        for (int il = 0; il < n_layer; ++il) {
@@ -3850,7 +3763,7 @@ struct llm_build_context {
            {
                if (model.layers[il].attn_norm_2) {
                    // Falcon-40B
-                    cur = llm_build_norm(ctx0, inpL, hparams,
+                    cur = llm_build_norm(ctx0, attn_norm, hparams,
                            model.layers[il].attn_norm_2,
                            model.layers[il].attn_norm_2_b,
                            LLM_NORM, cb, il);
@@ -3874,21 +3787,15 @@ struct llm_build_context {
                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

                // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
+                Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
                cb(Qcur, "Qcur", il);

-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
+                Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
                cb(Kcur, "Kcur", il);

                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, attn_norm, hparams, kv_self,
                        model.layers[il].wo, NULL,
                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                cb(cur, "kqv_out", il);
@@ -3988,7 +3895,7 @@ struct llm_build_context {

                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, cur, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                cb(cur, "kqv_out", il);
@@ -4054,7 +3961,7 @@ struct llm_build_context {
        cb(KQ_mask, "KQ_mask", -1);

        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+            llm_build_k_shift(ctx0, hparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
        }

        for (int il = 0; il < n_layer; ++il) {
@@ -4147,15 +4054,13 @@ struct llm_build_context {
                cb(kpass, "kpass", il);

                struct ggml_tensor * qrotated = ggml_rope_custom(
-                    ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
+                        ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale
+                        );
                cb(qrotated, "qrotated", il);

                struct ggml_tensor * krotated = ggml_rope_custom(
-                    ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
+                        ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale
+                        );
                cb(krotated, "krotated", il);

                // ggml currently only supports concatenation on dim=2
@@ -4195,7 +4100,7 @@ struct llm_build_context {
                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

                // TODO: not tested, could be broken
-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, Q, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
                        Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                cb(cur, "kqv_out", il);
@@ -4286,7 +4191,7 @@ struct llm_build_context {

                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self,
                        model.layers[il].wo, NULL,
                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
                cb(cur, "kqv_out", il);
@@ -4383,7 +4288,7 @@ struct llm_build_context {

                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self,
                        model.layers[il].wo, model.layers[il].bo,
                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, cb, il);
                cb(cur, "kqv_out", il);
@@ -4477,7 +4382,7 @@ struct llm_build_context {

                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

-                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                cur = llm_build_kqv(ctx0, Qcur, hparams, kv_self,
                        model.layers[il].wo, NULL,
                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, cb, il);
                cb(cur, "kqv_out", il);
@@ -7844,7 +7749,7 @@ static int llama_apply_lora_from_file_internal(
            if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
                if (dest_t->type != GGML_TYPE_F16) {
                    throw std::runtime_error(format(
-                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type));
+                        "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models", __func__));
                }
                offload_func = ggml_cuda_assign_buffers;
                offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
@@ -7979,14 +7884,8 @@ struct llama_context_params llama_context_default_params() {
        /*.n_batch                     =*/ 512,
        /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
        /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
        /*.rope_freq_base              =*/ 0.0f,
        /*.rope_freq_scale             =*/ 0.0f,
-        /*.yarn_ext_factor             =*/ NAN,
-        /*.yarn_attn_factor            =*/ 1.0f,
-        /*.yarn_beta_fast              =*/ 32.0f,
-        /*.yarn_beta_slow              =*/ 1.0f,
-        /*.yarn_orig_ctx               =*/ 0,
        /*.mul_mat_q                   =*/ true,
        /*.f16_kv                      =*/ true,
        /*.logits_all                  =*/ false,
@@ -8073,7 +7972,10 @@ struct llama_model * llama_load_model_from_file(
        };
    }

-    if (!llama_model_load(path_model, *model, params)) {
+    if (!llama_model_load(path_model, *model, params.n_gpu_layers,
+                params.main_gpu, params.tensor_split,
+                params.use_mmap, params.use_mlock, params.vocab_only,
+                params.progress_callback, params.progress_callback_user_data)) {
        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
        delete model;
        return nullptr;
@@ -8099,35 +8001,13 @@ struct llama_context * llama_new_context_with_model(
    const auto & hparams = model->hparams;
    auto       & cparams = ctx->cparams;

-    cparams.n_batch          = params.n_batch;
-    cparams.n_threads        = params.n_threads;
-    cparams.n_threads_batch  = params.n_threads_batch;
-    cparams.yarn_ext_factor  = params.yarn_ext_factor;
-    cparams.yarn_attn_factor = params.yarn_attn_factor;
-    cparams.yarn_beta_fast   = params.yarn_beta_fast;
-    cparams.yarn_beta_slow   = params.yarn_beta_slow;
-    cparams.mul_mat_q        = params.mul_mat_q;
-
-    cparams.n_ctx            = params.n_ctx           == 0    ? hparams.n_ctx_train           : params.n_ctx;
-    cparams.rope_freq_base   = params.rope_freq_base  == 0.0f ? hparams.rope_freq_base_train  : params.rope_freq_base;
-    cparams.rope_freq_scale  = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
-
-    cparams.n_yarn_orig_ctx  = params.yarn_orig_ctx    != 0 ? params.yarn_orig_ctx    :
-                               hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
-                                                              hparams.n_ctx_train;
-
-    auto rope_scaling_type = params.rope_scaling_type;
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
-        rope_scaling_type = hparams.rope_scaling_type_train;
-    }
-
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
-        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
-    }
-
-    if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
-        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
-    }
+    cparams.n_batch         = params.n_batch;
+    cparams.n_ctx           = params.n_ctx == 0           ? hparams.n_ctx_train           : params.n_ctx;
+    cparams.rope_freq_base  = params.rope_freq_base == 0  ? hparams.rope_freq_base_train  : params.rope_freq_base;
+    cparams.rope_freq_scale = params.rope_freq_scale == 0 ? hparams.rope_freq_scale_train : params.rope_freq_scale;
+    cparams.n_threads       = params.n_threads;
+    cparams.n_threads_batch = params.n_threads_batch;
+    cparams.mul_mat_q       = params.mul_mat_q;

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
--- a/llama.h
+++ b/llama.h
@@ -106,14 +106,6 @@ extern "C" {
        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };

-    enum llama_rope_scaling_type {
-        LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
-        LLAMA_ROPE_SCALING_NONE        = 0,
-        LLAMA_ROPE_SCALING_LINEAR      = 1,
-        LLAMA_ROPE_SCALING_YARN        = 2,
-        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
-    };
-
    typedef struct llama_token_data {
        llama_token id; // token id
        float logit;    // log-odds of the token
@@ -180,16 +172,10 @@ extern "C" {
        uint32_t n_batch;         // prompt processing maximum batch size
        uint32_t n_threads;       // number of threads to use for generation
        uint32_t n_threads_batch; // number of threads to use for batch processing
-        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
-        float    rope_freq_base;   // RoPE base frequency, 0 = from model
-        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
-        float    yarn_attn_factor; // YaRN magnitude scaling factor
-        float    yarn_beta_fast;   // YaRN low correction dim
-        float    yarn_beta_slow;   // YaRN high correction dim
-        uint32_t yarn_orig_ctx;    // YaRN original context size
+        float rope_freq_base;  // RoPE base frequency, 0 = from model
+        float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
--- a/models/ggml-vocab-llama.gguf
+++ b/models/ggml-vocab-llama.gguf
--- a/scripts/build-info.cmake
+++ b/scripts/build-info.cmake
@@ -1,5 +1,5 @@
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
-set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
+set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")
 set(BUILD_COMPILER "unknown")
@@ -24,21 +24,15 @@ if(Git_FOUND)
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE HEAD
        OUTPUT_STRIP_TRAILING_WHITESPACE
-        RESULT_VARIABLE RES
    )
-    if (RES EQUAL 0)
-        set(BUILD_COMMIT ${HEAD})
-    endif()
    execute_process(
        COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE COUNT
        OUTPUT_STRIP_TRAILING_WHITESPACE
-        RESULT_VARIABLE RES
    )
-    if (RES EQUAL 0)
-        set(BUILD_NUMBER ${COUNT})
-    endif()
+    set(BUILD_COMMIT ${HEAD})
+    set(BUILD_NUMBER ${COUNT})
 endif()

 if(MSVC)
@@ -59,22 +53,22 @@ else()
    set(BUILD_TARGET ${OUT})
 endif()

-# Only write the build info if it changed
-if(EXISTS ${OUTPUT_FILE})
-    file(READ ${OUTPUT_FILE} CONTENTS)
-    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
+# Only write the header if it's changed to prevent unnecessary recompilation
+if(EXISTS ${HEADER_FILE})
+    file(READ ${HEADER_FILE} CONTENTS)
+    string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
    set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
+    string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
    set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
+    string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
    set(OLD_TARGET ${CMAKE_MATCH_1})
    if (
        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
        NOT OLD_TARGET   STREQUAL BUILD_TARGET
    )
-        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+        configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
    endif()
 else()
-    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
+    configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
 endif()
--- a/scripts/build-info.h.in
+++ b/scripts/build-info.h.in
@@ -0,0 +1,9 @@
+#ifndef BUILD_INFO_H
+#define BUILD_INFO_H
+
+#define BUILD_NUMBER @BUILD_NUMBER@
+#define BUILD_COMMIT "@BUILD_COMMIT@"
+#define BUILD_COMPILER "@BUILD_COMPILER@"
+#define BUILD_TARGET "@BUILD_TARGET@"
+
+#endif // BUILD_INFO_H
--- a/scripts/build-info.sh
+++ b/scripts/build-info.sh
@@ -24,7 +24,12 @@ if out=$($CC -dumpmachine); then
  build_target=$out
 fi

-echo "int LLAMA_BUILD_NUMBER = ${build_number};"
-echo "char const *LLAMA_COMMIT = \"${build_commit}\";"
-echo "char const *LLAMA_COMPILER = \"${build_compiler}\";"
-echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";"
+echo "#ifndef BUILD_INFO_H"
+echo "#define BUILD_INFO_H"
+echo
+echo "#define BUILD_NUMBER $build_number"
+echo "#define BUILD_COMMIT \"$build_commit\""
+echo "#define BUILD_COMPILER \"$build_compiler\""
+echo "#define BUILD_TARGET \"$build_target\""
+echo
+echo "#endif // BUILD_INFO_H"
--- a/scripts/server-llm.sh
+++ b/scripts/server-llm.sh
@@ -1,391 +0,0 @@
-#!/bin/bash
-#
-# Helper script for deploying llama.cpp server with a single Bash command
-#
-# - Works on Linux and macOS
-# - Supports: CPU, CUDA, Metal, OpenCL
-# - Can run all GGUF models from HuggingFace
-# - Can serve requests in parallel
-# - Always builds latest llama.cpp from GitHub
-#
-# Limitations
-#
-# - Chat templates are poorly supported (base models recommended)
-# - Might be unstable!
-#
-# Usage:
-#   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
-#
-#   --port:       port number, default is 8888
-#   --repo:       path to a repo containing GGUF model files
-#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
-#   --backend:    cpu, cuda, metal, opencl, depends on the OS
-#   --gpu-id:     gpu id, default is 0
-#   --n-parallel: number of parallel requests, default is 8
-#   --n-kv:       KV cache size, default is 4096
-#   --verbose:    verbose output
-#
-# Example:
-#
-#   bash -c "$(curl -s https://ggml.ai/server-llm.sh)"
-#
-
-set -e
-
-# required utils: curl, git, make
-if ! command -v curl &> /dev/null; then
-    printf "[-] curl not found\n"
-    exit 1
-fi
-if ! command -v git &> /dev/null; then
-    printf "[-] git not found\n"
-    exit 1
-fi
-if ! command -v make &> /dev/null; then
-    printf "[-] make not found\n"
-    exit 1
-fi
-
-# parse arguments
-port=8888
-repo=""
-wtype=""
-backend="cpu"
-
-# if macOS, use metal backend by default
-if [[ "$OSTYPE" == "darwin"* ]]; then
-    backend="metal"
-elif command -v nvcc &> /dev/null; then
-    backend="cuda"
-fi
-
-gpu_id=0
-n_parallel=8
-n_kv=4096
-verbose=0
-
-function print_usage {
-    printf "Usage:\n"
-    printf "  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]\n\n"
-    printf "  --port:       port number, default is 8888\n"
-    printf "  --repo:       path to a repo containing GGUF model files\n"
-    printf "  --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input\n"
-    printf "  --backend:    cpu, cuda, metal, opencl, depends on the OS\n"
-    printf "  --gpu-id:     gpu id, default is 0\n"
-    printf "  --n-parallel: number of parallel requests, default is 8\n"
-    printf "  --n-kv:       KV cache size, default is 4096\n"
-    printf "  --verbose:    verbose output\n\n"
-    printf "Example:\n\n"
-    printf '  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"\n\n'
-}
-
-while [[ $# -gt 0 ]]; do
-    key="$1"
-    case $key in
-        --port)
-            port="$2"
-            shift
-            shift
-            ;;
-        --repo)
-            repo="$2"
-            shift
-            shift
-            ;;
-        --wtype)
-            wtype="$2"
-            shift
-            shift
-            ;;
-        --backend)
-            backend="$2"
-            shift
-            shift
-            ;;
-        --gpu-id)
-            gpu_id="$2"
-            shift
-            shift
-            ;;
-        --n-parallel)
-            n_parallel="$2"
-            shift
-            shift
-            ;;
-        --n-kv)
-            n_kv="$2"
-            shift
-            shift
-            ;;
-        --verbose)
-            verbose=1
-            shift
-            ;;
-        --help)
-            print_usage
-            exit 0
-            ;;
-        *)
-            echo "Unknown argument: $key"
-            print_usage
-            exit 1
-            ;;
-    esac
-done
-
-# available weights types
-wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
-
-wfiles=()
-for wt in "${wtypes[@]}"; do
-    wfiles+=("")
-done
-
-# sample repos
-repos=(
-    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
-    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
-    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF"
-    "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF"
-    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
-    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
-    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
-    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
-)
-
-printf "\n"
-printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
-printf "    Based on the options that follow, the script might download a model file\n"
-printf "    from the internet, which can be a few GBs in size. The script will also\n"
-printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n"
-printf "\n"
-printf "    Upon success, an HTTP server will be started and it will serve the selected\n"
-printf "    model using llama.cpp for demonstration purposes.\n"
-printf "\n"
-printf "    Please note:\n"
-printf "\n"
-printf "    - All new data will be stored in the current folder\n"
-printf "    - The server will be listening on all network interfaces\n"
-printf "    - The server will run with default settings which are not always optimal\n"
-printf "    - Do not judge the quality of a model based on the results from this script\n"
-printf "    - Do not use this script to benchmark llama.cpp\n"
-printf "    - Do not use this script in production\n"
-printf "    - This script is only for demonstration purposes\n"
-printf "\n"
-printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n"
-printf "\n"
-printf "    Press Enter to continue ...\n\n"
-
-read
-
-if [[ -z "$repo" ]]; then
-    printf "[+] No repo provided from the command line\n"
-    printf "    Please select a number from the list below or enter an URL:\n\n"
-
-    is=0
-    for r in "${repos[@]}"; do
-        printf "    %2d) %s\n" $is "$r"
-        is=$((is+1))
-    done
-
-    # ask for repo until index of sample repo is provided or an URL
-    while [[ -z "$repo" ]]; do
-        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
-        read -p "[+] Select repo: " repo
-
-        # check if the input is a number
-        if [[ "$repo" =~ ^[0-9]+$ ]]; then
-            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
-                repo="${repos[$repo]}"
-            else
-                printf "[-] Invalid repo index: %s\n" "$repo"
-                repo=""
-            fi
-        elif [[ "$repo" =~ ^https?:// ]]; then
-            repo="$repo"
-        else
-            printf "[-] Invalid repo URL: %s\n" "$repo"
-            repo=""
-        fi
-    done
-fi
-
-# remove suffix
-repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
-
-printf "[+] Checking for GGUF model files in %s\n" "$repo"
-
-# find GGUF files in the source
-# TODO: better logic
-model_tree="${repo%/}/tree/main"
-model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
-
-# list all files in the provided git repo
-printf "[+] Model files:\n\n"
-for file in $model_files; do
-    # determine iw by grepping the filename with wtypes
-    iw=-1
-    is=0
-    for wt in "${wtypes[@]}"; do
-        # uppercase
-        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
-        if [[ "$ufile" =~ "$wt" ]]; then
-            iw=$is
-            break
-        fi
-        is=$((is+1))
-    done
-
-    if [[ $iw -eq -1 ]]; then
-        continue
-    fi
-
-    wfiles[$iw]="$file"
-
-    have=" "
-    if [[ -f "$file" ]]; then
-        have="*"
-    fi
-
-    printf "    %2d) %s %s\n" $iw "$have" "$file"
-done
-
-# ask for weights type until provided and available
-while [[ -z "$wtype" ]]; do
-    printf "\n"
-    read -p "[+] Select weight type: " wtype
-    wfile="${wfiles[$wtype]}"
-
-    if [[ -z "$wfile" ]]; then
-        printf "[-] Invalid weight type: %s\n" "$wtype"
-        wtype=""
-    fi
-done
-
-printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
-
-url="${repo%/}/resolve/main/$wfile"
-
-# check file if the model has been downloaded before
-chk="$wfile.chk"
-
-# check if we should download the file
-# - if $wfile does not exist
-# - if $wfile exists but $chk does not exist
-# - if $wfile exists and $chk exists but $wfile is newer than $chk
-# TODO: better logic using git lfs info
-
-do_download=0
-
-if [[ ! -f "$wfile" ]]; then
-    do_download=1
-elif [[ ! -f "$chk" ]]; then
-    do_download=1
-elif [[ "$wfile" -nt "$chk" ]]; then
-    do_download=1
-fi
-
-if [[ $do_download -eq 1 ]]; then
-    printf "[+] Downloading weights from %s\n" "$url"
-
-    # download the weights file
-    curl -o "$wfile" -# -L "$url"
-
-    # create a check file if successful
-    if [[ $? -eq 0 ]]; then
-        printf "[+] Creating check file %s\n" "$chk"
-        touch "$chk"
-    fi
-else
-    printf "[+] Using cached weights %s\n" "$wfile"
-fi
-
-# get latest llama.cpp and build
-
-printf "[+] Downloading latest llama.cpp\n"
-
-llama_cpp_dir="__llama_cpp_port_${port}__"
-
-if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
-    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
-    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
-    printf "[-] Please remove it and try again\n"
-    exit 1
-elif [[ -d "$llama_cpp_dir" ]]; then
-    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
-    printf "[+] Using cached llama.cpp\n"
-
-    cd "$llama_cpp_dir"
-    git reset --hard
-    git fetch
-    git checkout origin/master
-
-    cd ..
-else
-    printf "[+] Cloning llama.cpp\n"
-
-    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
-fi
-
-# mark that that the directory is made by this script
-touch "$llama_cpp_dir/__ggml_script__"
-
-if [[ $verbose -eq 1 ]]; then
-    set -x
-fi
-
-# build
-cd "$llama_cpp_dir"
-
-make clean
-
-log="--silent"
-if [[ $verbose -eq 1 ]]; then
-    log=""
-fi
-
-if [[ "$backend" == "cuda" ]]; then
-    printf "[+] Building with CUDA backend\n"
-    LLAMA_CUBLAS=1 make -j server $log
-elif [[ "$backend" == "cpu" ]]; then
-    printf "[+] Building with CPU backend\n"
-    make -j server $log
-elif [[ "$backend" == "metal" ]]; then
-    printf "[+] Building with Metal backend\n"
-    make -j server $log
-elif [[ "$backend" == "opencl" ]]; then
-    printf "[+] Building with OpenCL backend\n"
-    LLAMA_CLBLAST=1 make -j server $log
-else
-    printf "[-] Unknown backend: %s\n" "$backend"
-    exit 1
-fi
-
-# run the server
-
-printf "[+] Running server\n"
-
-args=""
-if [[ "$backend" == "cuda" ]]; then
-    export CUDA_VISIBLE_DEVICES=$gpu_id
-    args="-ngl 999"
-elif [[ "$backend" == "cpu" ]]; then
-    args="-ngl 0"
-elif [[ "$backend" == "metal" ]]; then
-    args="-ngl 999"
-elif [[ "$backend" == "opencl" ]]; then
-    args="-ngl 999"
-else
-    printf "[-] Unknown backend: %s\n" "$backend"
-    exit 1
-fi
-
-if [[ $verbose -eq 1 ]]; then
-    args="$args --verbose"
-fi
-
-./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
-
-exit 0
Author	SHA1	Message	Date
Georgi Gerganov	a8796f9609	llm : cleanup + comments	2023-11-01 20:08:02 +02:00
Georgi Gerganov	78186f4009	llm : restore the non-graph llm_build_ functional API ggml-ci	2023-11-01 16:47:04 +02:00
Georgi Gerganov	995ee0919f	llm : deduce norm eps based on type + explicit max_alibi_bias, clamp_kqv	2023-11-01 11:19:58 +02:00
Georgi Gerganov	9284aa6a70	llm : add llm_build_context	2023-11-01 10:52:24 +02:00