better documentation

format -> std::to_string
llama : add functions to get the model's metadata
2026-04-23 16:37:33 +03:00 · 2023-11-10 01:38:20 +01:00 · 2023-11-10 01:26:12 +01:00 · 2023-11-10 00:50:17 +01:00 · 2023-11-09 11:09:29 +01:00 · 2023-11-08 20:00:34 -06:00
72 changed files with 5045 additions and 4730 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -288,6 +288,7 @@ jobs:
      OPENBLAS_VERSION: 0.3.23
      OPENCL_VERSION: 2023.04.17
      CLBLAST_VERSION: 1.6.0
+      SDE_VERSION: 9.21.1-2023-04-24

    strategy:
      matrix:
@@ -383,11 +384,23 @@ jobs:

      - name: Test
        id: cmake_test
-        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
+        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
        run: |
          cd build
          ctest -C Release --verbose --timeout 900

+      - name: Test (Intel SDE)
+        id: cmake_test_sde
+        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
+          # for some weird reason windows tar doesn't like sde tar.xz
+          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
+          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
+          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
+          cd build
+          & $sde -future -- ctest -C Release --verbose --timeout 900
+
      - name: Determine tag name
        id: tag
        shell: bash
--- a/.gitignore
+++ b/.gitignore
@@ -46,7 +46,7 @@ models-mnt
 /infill
 /libllama.so
 /llama-bench
-/llava
+/llava-cli
 /main
 /metal
 /perplexity
@@ -65,7 +65,7 @@ models-mnt
 /parallel
 /train-text-from-scratch
 /vdot
-build-info.h
+/common/build-info.cpp
 arm_neon.h
 compile_commands.json
 CMakeSettings.json
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ endif()

 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

-if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(LLAMA_STANDALONE ON)

    # configure project version
@@ -100,39 +100,6 @@ option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALO
 option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)

-#
-# Build info header
-#
-
-# Generate initial build-info.h
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
-
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-
-    # Is git submodule
-    if(NOT IS_DIRECTORY "${GIT_DIR}")
-        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
-        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${REAL_GIT_DIR}")
-    endif()
-
-    # Add a custom target for build-info.h
-    add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
-
-    # Add a custom command to rebuild build-info.h when .git/index changes
-    add_custom_command(
-        OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
-        COMMENT "Generating build details from Git"
-        COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION} -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        DEPENDS "${GIT_DIR}/index"
-        VERBATIM
-    )
-else()
-    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-endif()
-
 #
 # Compile flags
 #
@@ -543,6 +510,10 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
    message(STATUS "x86 detected")
    if (MSVC)
+        # instruction set detection for MSVC only
+        if (LLAMA_NATIVE)
+            include(cmake/FindSIMD.cmake)
+        endif ()
        if (LLAMA_AVX512)
            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
--- a/78
+++ b/78
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search  \
+	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
 	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
@@ -542,9 +542,9 @@ llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h l
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o

-common.o: common/common.cpp build-info.h $(COMMON_H_DEPS)
+common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
@@ -563,46 +563,46 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)

 #
 # Examples
 #

-main: examples/main/main.cpp                                  build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo

-infill: examples/infill/infill.cpp                            build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
+infill: examples/infill/infill.cpp                            ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-simple: examples/simple/simple.cpp                            build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+simple: examples/simple/simple.cpp                            ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-batched: examples/batched/batched.cpp                         build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+batched: examples/batched/batched.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp       build-info.h ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize: examples/quantize/quantize.cpp                      build-info.h ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp                      build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.h ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.o ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp                build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp                ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp                   build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
@@ -614,28 +614,31 @@ train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratc
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
+
+llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
+finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 ifdef LLAMA_METAL
@@ -648,7 +651,7 @@ swift: examples/batched.swift
 	(cd examples/batched.swift; make build)
 endif

-build-info.h: $(wildcard .git/index) scripts/build-info.sh
+common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh $(CC) > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
 		mv $@.tmp $@; \
@@ -656,13 +659,16 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 		rm $@.tmp; \
 	fi

+build-info.o: common/build-info.cpp
+	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
+
 #
 # Tests
 #

 tests: $(TEST_TARGETS)

-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 run-benchmark-matmult: benchmark-matmult
@@ -676,40 +682,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-c.o: tests/test-c.c llama.h
--- a/README.md
+++ b/README.md
@@ -2,7 +2,6 @@

 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-[![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -11,8 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

- LLaVA support: https://github.com/ggerganov/llama.cpp/pull/3436
- ‼️ BPE tokenizer update: existing Falcon and Starcoder `.gguf` models will need to be reconverted: [#3252](https://github.com/ggerganov/llama.cpp/pull/3252)
+- ⚠️ **Upcoming change that might break functionality. Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912

 ----

--- a/build.zig
+++ b/build.zig
@@ -10,7 +10,6 @@ const Maker = struct {
    builder: *std.build.Builder,
    target: CrossTarget,
    optimize: Mode,
-    config_header: *ConfigHeader,
    enable_lto: bool,

    include_dirs: ArrayList([]const u8),
@@ -41,26 +40,24 @@ const Maker = struct {
        const commit_hash = try std.ChildProcess.exec(
            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
        );
-        const config_header = builder.addConfigHeader(
-            .{ .style = .blank, .include_path = "build-info.h" },
-            .{
-                .BUILD_NUMBER = 0,
-                .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline
-                .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}),
-                .BUILD_TARGET = try target.allocDescription(builder.allocator),
-            },
-        );
+        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
+            \\int LLAMA_BUILD_NUMBER = {};
+            \\char const *LLAMA_COMMIT = "{s}";
+            \\char const *LLAMA_COMPILER = "Zig {s}";
+            \\char const *LLAMA_BUILD_TARGET = "{s}";
+            \\
+        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
        var m = Maker{
            .builder = builder,
            .target = target,
            .optimize = builder.standardOptimizeOption(.{}),
-            .config_header = config_header,
            .enable_lto = false,
            .include_dirs = ArrayList([]const u8).init(builder.allocator),
            .cflags = ArrayList([]const u8).init(builder.allocator),
            .cxxflags = ArrayList([]const u8).init(builder.allocator),
            .objs = ArrayList(*Compile).init(builder.allocator),
        };
+
        try m.addCFlag("-std=c11");
        try m.addCxxFlag("-std=c++11");
        try m.addProjectInclude(&.{});
@@ -72,7 +69,7 @@ const Maker = struct {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (o.target.getAbi() != .msvc)
            o.defineCMacro("_GNU_SOURCE", null);
-        o.addConfigHeader(m.config_header);
+
        if (std.mem.endsWith(u8, src, ".c")) {
            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
@@ -85,7 +82,6 @@ const Maker = struct {
                o.linkLibCpp();
            }
        }
-        o.addConfigHeader(m.config_header);
        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
        o.want_lto = m.enable_lto;
        return o;
@@ -105,7 +101,6 @@ const Maker = struct {
            // linkLibCpp already add (libc++ + libunwind + libc)
            e.linkLibCpp();
        }
-        e.addConfigHeader(m.config_header);
        m.builder.installArtifact(e);
        e.want_lto = m.enable_lto;
        return e;
@@ -121,6 +116,7 @@ pub fn build(b: *std.build.Builder) !void {
    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
    const llama = make.obj("llama", "llama.cpp");
+    const buildinfo = make.obj("common", "common/build-info.cpp");
    const common = make.obj("common", "common/common.cpp");
    const console = make.obj("console", "common/console.cpp");
    const sampling = make.obj("sampling", "common/sampling.cpp");
@@ -128,14 +124,14 @@ pub fn build(b: *std.build.Builder) !void {
    const train = make.obj("train", "common/train.cpp");
    const clip = make.obj("clip", "examples/llava/clip.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/cmake/FindSIMD.cmake
+++ b/cmake/FindSIMD.cmake
@@ -0,0 +1,100 @@
+include(CheckCSourceRuns)
+
+set(AVX_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 a;
+        a = _mm256_set1_ps(0);
+        return 0;
+    }
+")
+
+set(AVX512_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0,
+                                    0, 0, 0, 0, 0, 0, 0, 0);
+        __m512i b = a;
+        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
+        return 0;
+    }
+")
+
+set(AVX2_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256i a = {0};
+        a = _mm256_abs_epi16(a);
+        __m256i x;
+        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
+        return 0;
+    }
+")
+
+set(FMA_CODE "
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 acc = _mm256_setzero_ps();
+        const __m256 d = _mm256_setzero_ps();
+        const __m256 p = _mm256_setzero_ps();
+        acc = _mm256_fmadd_ps( d, p, acc );
+        return 0;
+    }
+")
+
+macro(check_sse type flags)
+    set(__FLAG_I 1)
+    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+    foreach (__FLAG ${flags})
+        if (NOT ${type}_FOUND)
+            set(CMAKE_REQUIRED_FLAGS ${__FLAG})
+            check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
+            if (HAS_${type}_${__FLAG_I})
+                set(${type}_FOUND TRUE CACHE BOOL "${type} support")
+                set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
+            endif()
+            math(EXPR __FLAG_I "${__FLAG_I}+1")
+        endif()
+    endforeach()
+    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+    if (NOT ${type}_FOUND)
+        set(${type}_FOUND FALSE CACHE BOOL "${type} support")
+        set(${type}_FLAGS "" CACHE STRING "${type} flags")
+    endif()
+
+    mark_as_advanced(${type}_FOUND ${type}_FLAGS)
+endmacro()
+
+# flags are for MSVC only!
+check_sse("AVX" " ;/arch:AVX")
+if (NOT ${AVX_FOUND})
+    set(LLAMA_AVX OFF)
+else()
+    set(LLAMA_AVX ON)
+endif()
+
+check_sse("AVX2" " ;/arch:AVX2")
+check_sse("FMA" " ;/arch:AVX2")
+if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
+    set(LLAMA_AVX2 OFF)
+else()
+    set(LLAMA_AVX2 ON)
+endif()
+
+check_sse("AVX512" " ;/arch:AVX512")
+if (NOT ${AVX512_FOUND})
+    set(LLAMA_AVX512 OFF)
+else()
+    set(LLAMA_AVX512 ON)
+endif()
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,8 +1,47 @@
 # common

+
+# Build info header
+#
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
+
+    # Is git submodule
+    if(NOT IS_DIRECTORY "${GIT_DIR}")
+        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
+        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
+        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
+    endif()
+
+    set(GIT_INDEX "${GIT_DIR}/index")
+else()
+    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
+    set(GIT_INDEX "")
+endif()
+
+# Add a custom command to rebuild build-info.cpp when .git/index changes
+add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
+    COMMENT "Generating build details from Git"
+    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
+            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
+    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
+    VERBATIM
+)
+set(TARGET build_info)
+add_library(${TARGET} OBJECT build-info.cpp)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+
 set(TARGET common)

-add_library(${TARGET} OBJECT
+add_library(${TARGET} STATIC
+    base64.hpp
    common.h
    common.cpp
    sampling.h
@@ -21,4 +60,4 @@ endif()

 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
+target_link_libraries(${TARGET} PRIVATE llama build_info)
--- a/common/base64.hpp
+++ b/common/base64.hpp
@@ -0,0 +1,392 @@
+/*
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org>
+*/
+
+#ifndef PUBLIC_DOMAIN_BASE64_HPP_
+#define PUBLIC_DOMAIN_BASE64_HPP_
+
+#include <cstdint>
+#include <iterator>
+#include <stdexcept>
+#include <string>
+
+class base64_error : public std::runtime_error
+{
+public:
+    using std::runtime_error::runtime_error;
+};
+
+class base64
+{
+public:
+    enum class alphabet
+    {
+        /** the alphabet is detected automatically */
+        auto_,
+        /** the standard base64 alphabet is used */
+        standard,
+        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
+        url_filename_safe
+    };
+
+    enum class decoding_behavior
+    {
+        /** if the input is not padded, the remaining bits are ignored */
+        moderate,
+        /** if a padding character is encounter decoding is finished */
+        loose
+    };
+
+    /**
+     Encodes all the elements from `in_begin` to `in_end` to `out`.
+
+     @warning The source and destination cannot overlap. The destination must be able to hold at least
+     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
+
+     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
+     8 bits
+     @tparam Output_iterator the destination; the elements written to it are from the type `char`
+     @param in_begin the beginning of the source
+     @param in_end the ending of the source
+     @param out the destination iterator
+     @param alphabet which alphabet should be used
+     @returns the iterator to the next element past the last element copied
+     @throws see `Input_iterator` and `Output_iterator`
+    */
+    template<typename Input_iterator, typename Output_iterator>
+    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
+                                  alphabet alphabet = alphabet::standard)
+    {
+        constexpr auto pad = '=';
+        const char* alpha  = alphabet == alphabet::url_filename_safe
+                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+        while (in_begin != in_end) {
+            std::uint8_t i0 = 0, i1 = 0, i2 = 0;
+
+            // first character
+            i0 = static_cast<std::uint8_t>(*in_begin);
+            ++in_begin;
+
+            *out = alpha[i0 >> 2 & 0x3f];
+            ++out;
+
+            // part of first character and second
+            if (in_begin != in_end) {
+                i1 = static_cast<std::uint8_t>(*in_begin);
+                ++in_begin;
+
+                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
+                ++out;
+            } else {
+                *out = alpha[(i0 & 0x3) << 4];
+                ++out;
+
+                // last padding
+                *out = pad;
+                ++out;
+
+                // last padding
+                *out = pad;
+                ++out;
+
+                break;
+            }
+
+            // part of second character and third
+            if (in_begin != in_end) {
+                i2 = static_cast<std::uint8_t>(*in_begin);
+                ++in_begin;
+
+                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
+                ++out;
+            } else {
+                *out = alpha[(i1 & 0xf) << 2];
+                ++out;
+
+                // last padding
+                *out = pad;
+                ++out;
+
+                break;
+            }
+
+            // rest of third
+            *out = alpha[i2 & 0x3f];
+            ++out;
+        }
+
+        return out;
+    }
+    /**
+     Encodes a string.
+
+     @param str the string that should be encoded
+     @param alphabet which alphabet should be used
+     @returns the encoded base64 string
+     @throws see base64::encode()
+    */
+    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
+    {
+        std::string result;
+
+        result.reserve(required_encode_size(str.length()) + 1);
+
+        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
+
+        return result;
+    }
+    /**
+     Encodes a char array.
+
+     @param buffer the char array
+     @param size the size of the array
+     @param alphabet which alphabet should be used
+     @returns the encoded string
+    */
+    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
+    {
+        std::string result;
+
+        result.reserve(required_encode_size(size) + 1);
+
+        encode(buffer, buffer + size, std::back_inserter(result), alphabet);
+
+        return result;
+    }
+    /**
+     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
+     in other words: inplace decoding is possible.
+
+     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
+     otherwise the behavior depends on the output iterator.
+
+     @tparam Input_iterator the source; the returned elements are cast to `char`
+     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
+     @param in_begin the beginning of the source
+     @param in_end the ending of the source
+     @param out the destination iterator
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @returns the iterator to the next element past the last element copied
+     @throws base64_error depending on the set behavior
+     @throws see `Input_iterator` and `Output_iterator`
+    */
+    template<typename Input_iterator, typename Output_iterator>
+    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
+                                  alphabet alphabet          = alphabet::auto_,
+                                  decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        //constexpr auto pad = '=';
+        std::uint8_t last  = 0;
+        auto bits          = 0;
+
+        while (in_begin != in_end) {
+            auto c = *in_begin;
+            ++in_begin;
+
+            if (c == '=') {
+                break;
+            }
+
+            auto part = _base64_value(alphabet, c);
+
+            // enough bits for one byte
+            if (bits + 6 >= 8) {
+                *out = (last << (8 - bits)) | (part >> (bits - 2));
+                ++out;
+
+                bits -= 2;
+            } else {
+                bits += 6;
+            }
+
+            last = part;
+        }
+
+        // check padding
+        if (behavior != decoding_behavior::loose) {
+            while (in_begin != in_end) {
+                auto c = *in_begin;
+                ++in_begin;
+
+                if (c != '=') {
+                    throw base64_error("invalid base64 character.");
+                }
+            }
+        }
+
+        return out;
+    }
+    /**
+     Decodes a string.
+
+     @param str the base64 encoded string
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @returns the decoded string
+     @throws see base64::decode()
+    */
+    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
+                              decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        std::string result;
+
+        result.reserve(max_decode_size(str.length()));
+
+        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
+
+        return result;
+    }
+    /**
+     Decodes a string.
+
+     @param buffer the base64 encoded buffer
+     @param size the size of the buffer
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @returns the decoded string
+     @throws see base64::decode()
+    */
+    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
+                              decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        std::string result;
+
+        result.reserve(max_decode_size(size));
+
+        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
+
+        return result;
+    }
+    /**
+     Decodes a string inplace.
+
+     @param[in,out] str the base64 encoded string
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @throws base64::decode_inplace()
+    */
+    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
+                               decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
+    }
+    /**
+     Decodes a char array inplace.
+
+     @param[in,out] str the string array
+     @param size the length of the array
+     @param alphabet which alphabet should be used
+     @param behavior the behavior when an error was detected
+     @returns the pointer to the next element past the last element decoded
+     @throws base64::decode_inplace()
+    */
+    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
+                                decoding_behavior behavior = decoding_behavior::moderate)
+    {
+        return decode(str, str + size, str, alphabet, behavior);
+    }
+    /**
+     Returns the required decoding size for a given size. The value is calculated with the following formula:
+
+     $$
+     \lceil \frac{size}{4} \rceil \cdot 3
+     $$
+
+     @param size the size of the encoded input
+     @returns the size of the resulting decoded buffer; this the absolute maximum
+    */
+    static std::size_t max_decode_size(std::size_t size) noexcept
+    {
+        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
+    }
+    /**
+     Returns the required encoding size for a given size. The value is calculated with the following formula:
+
+     $$
+     \lceil \frac{size}{3} \rceil \cdot 4
+     $$
+
+     @param size the size of the decoded input
+     @returns the size of the resulting encoded buffer
+    */
+    static std::size_t required_encode_size(std::size_t size) noexcept
+    {
+        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
+    }
+
+private:
+    static std::uint8_t _base64_value(alphabet& alphabet, char c)
+    {
+        if (c >= 'A' && c <= 'Z') {
+            return c - 'A';
+        } else if (c >= 'a' && c <= 'z') {
+            return c - 'a' + 26;
+        } else if (c >= '0' && c <= '9') {
+            return c - '0' + 52;
+        }
+
+        // comes down to alphabet
+        if (alphabet == alphabet::standard) {
+            if (c == '+') {
+                return 62;
+            } else if (c == '/') {
+                return 63;
+            }
+        } else if (alphabet == alphabet::url_filename_safe) {
+            if (c == '-') {
+                return 62;
+            } else if (c == '_') {
+                return 63;
+            }
+        } // auto detect
+        else {
+            if (c == '+') {
+                alphabet = alphabet::standard;
+
+                return 62;
+            } else if (c == '/') {
+                alphabet = alphabet::standard;
+
+                return 63;
+            } else if (c == '-') {
+                alphabet = alphabet::url_filename_safe;
+
+                return 62;
+            } else if (c == '_') {
+                alphabet = alphabet::url_filename_safe;
+
+                return 63;
+            }
+        }
+
+        throw base64_error("invalid base64 character.");
+    }
+};
+
+#endif // !PUBLIC_DOMAIN_BASE64_HPP_
--- a/common/build-info.cpp.in
+++ b/common/build-info.cpp.in
@@ -0,0 +1,4 @@
+int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
+char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
+char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
+char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,5 +1,4 @@
 #include "common.h"
-#include "build-info.h"
 #include "llama.h"

 #include <algorithm>
@@ -91,6 +90,19 @@ void process_escapes(std::string& input) {
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
+                case 'x':
+                    // Handle \x12, etc
+                    if (input_idx + 2 < input_len) {
+                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
+                        char *err_p = nullptr;
+                        const long val = std::strtol(x, &err_p, 16);
+                        if (err_p == x + 2) {
+                            input_idx += 2;
+                            input[output_idx++] = char(val);
+                            break;
+                        }
+                    }
+                    // fall through
                default:   input[output_idx++] = '\\';
                           input[output_idx++] = input[input_idx]; break;
            }
@@ -404,6 +416,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_sequences = std::stoi(argv[i]);
+        } else if (arg == "--p-accept" || arg == "-pa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_accept = std::stof(argv[i]);
+        } else if (arg == "--p-split" || arg == "-ps") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.p_split = std::stof(argv[i]);
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
@@ -779,6 +803,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
    printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
    printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
+    printf("  -pa N, --p-accept N   speculative decoding accept probability (default: %.1f)\n", (double)params.p_accept);
+    printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
    printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
@@ -1199,8 +1225,8 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
    const llama_sampling_params & sparams = params.sparams;

-    fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
-    fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
+    fprintf(stream, "build_commit: %s\n",        LLAMA_COMMIT);
+    fprintf(stream, "build_number: %d\n",        LLAMA_BUILD_NUMBER);
    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
--- a/common/common.h
+++ b/common/common.h
@@ -26,41 +26,51 @@
 #define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)

-#define print_build_info() do {                                                             \
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);         \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);  \
+#define print_build_info() do {                                                                     \
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);           \
+    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
 } while(0)

+// build info
+extern int LLAMA_BUILD_NUMBER;
+extern char const *LLAMA_COMMIT;
+extern char const *LLAMA_COMPILER;
+extern char const *LLAMA_BUILD_TARGET;
+
 //
 // CLI argument parsing
 //
 int32_t get_num_physical_cores();

 struct gpt_params {
-    uint32_t seed                           = -1;   // RNG seed
+    uint32_t seed                           = -1;    // RNG seed
+
    int32_t n_threads                       = get_num_physical_cores();
-    int32_t n_threads_batch                 = -1;   // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_predict                       = -1;   // new tokens to predict
-    int32_t n_ctx                           = 512;  // context size
-    int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
-    int32_t n_draft                         = 16;   // number of tokens to draft during speculative decoding
-    int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel                      = 1;    // number of parallel sequences to decode
-    int32_t n_sequences                     = 1;    // number of sequences to decode
-    int32_t n_gpu_layers                    = -1;   // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft              = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
-    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
-    int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
-    float   rope_freq_base                  = 0.0f; // RoPE base frequency
-    float   rope_freq_scale                 = 0.0f; // RoPE frequency scaling factor
-    float   yarn_ext_factor                 = NAN;  // YaRN extrapolation mix factor
-    float   yarn_attn_factor                = 1.0f; // YaRN magnitude scaling factor
-    float   yarn_beta_fast                  = 32.0f;// YaRN low correction dim
-    float   yarn_beta_slow                  = 1.0f; // YaRN high correction dim
-    int32_t yarn_orig_ctx                   = 0;    // YaRN original context length
-    int8_t  rope_scaling_type               = LLAMA_ROPE_SCALING_UNSPECIFIED;
+    int32_t n_threads_batch                 = -1;    // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_predict                       = -1;    // new tokens to predict
+    int32_t n_ctx                           = 512;   // context size
+    int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                          = 0;     // number of tokens to keep from initial prompt
+    int32_t n_draft                         = 16;    // number of tokens to draft during speculative decoding
+    int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel                      = 1;     // number of parallel sequences to decode
+    int32_t n_sequences                     = 1;     // number of sequences to decode
+    float   p_accept                        = 0.5f;  // speculative decoding accept probability
+    float   p_split                         = 0.1f;  // speculative decoding split probability
+    int32_t n_gpu_layers                    = -1;    // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft              = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
+    int32_t n_beams                         = 0;     // if non-zero then use beam search of given width.
+    float   rope_freq_base                  = 0.0f;  // RoPE base frequency
+    float   rope_freq_scale                 = 0.0f;  // RoPE frequency scaling factor
+    float   yarn_ext_factor                 = -1.0f; // YaRN extrapolation mix factor
+    float   yarn_attn_factor                = 1.0f;  // YaRN magnitude scaling factor
+    float   yarn_beta_fast                  = 32.0f; // YaRN low correction dim
+    float   yarn_beta_slow                  = 1.0f;  // YaRN high correction dim
+    int32_t yarn_orig_ctx                   = 0;     // YaRN original context length
+    int8_t  rope_scaling_type               = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+                                                                              //       pinging @cebtenzzre

    // // sampling parameters
    struct llama_sampling_params sparams;
@@ -84,7 +94,7 @@ struct gpt_params {
    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                    //                                       (which is more convenient to use for plotting)
                                    //
-    bool hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
--- a/convert-bloom-hf-to-gguf.py
+++ b/convert-bloom-hf-to-gguf.py
@@ -1,247 +0,0 @@
-#!/usr/bin/env python3
-# HF bloom --> gguf conversion
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import re
-import struct
-import sys
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import torch
-from transformers import AutoTokenizer  # type: ignore[import]
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-
-def count_model_parts(dir_model: Path) -> int:
-    num_parts = 0
-    for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
-            num_parts += 1
-
-    if num_parts > 0:
-        print("gguf: found " + str(num_parts) + " model parts")
-    return num_parts
-
-
-# Supported Models:
-#   https://huggingface.co/bigscience/bloom-1b7
-#   https://huggingface.co/bigscience/bloom-3b
-#   https://huggingface.co/bigscience/bloom-7b1
-#   https://huggingface.co/Langboat/bloom-1b4-zh
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
-    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
-    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
-    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
-    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
-    return parser.parse_args()
-
-args = parse_args()
-
-dir_model = args.model
-ftype = args.ftype
-if not dir_model.is_dir():
-    print(f'Error: {args.model} is not a directory', file = sys.stderr)
-    sys.exit(1)
-
-# possible tensor data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-if args.outfile is not None:
-    fname_out = args.outfile
-else:
-    # output in the same directory as the model by default
-    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
-
-print("gguf: loading model "+dir_model.name)
-
-with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-if hparams["architectures"][0] != "BloomForCausalLM":
-    print("Model architecture not supported: " + hparams["architectures"][0])
-    sys.exit(1)
-
-# get number of model parts
-num_parts = count_model_parts(dir_model)
-
-ARCH=gguf.MODEL_ARCH.BLOOM
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
-
-print("gguf: get model metadata")
-
-block_count = hparams["n_layer"]
-
-gguf_writer.add_name("Bloom")
-n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
-n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
-gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
-gguf_writer.add_embedding_length(n_embed)
-gguf_writer.add_feed_forward_length(4 * n_embed)
-gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(n_head)
-gguf_writer.add_head_count_kv(n_head)
-gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
-gguf_writer.add_file_type(ftype)
-
-# TOKENIZATION
-
-print("gguf: get tokenizer metadata")
-
-tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
-
-# gpt2 tokenizer
-gguf_writer.add_tokenizer_model("gpt2")
-
-print("gguf: get gpt2 tokenizer vocab")
-
-# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-assert max(tokenizer.vocab.values()) < vocab_size
-
-added_vocab = tokenizer.get_added_vocab()
-reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-
-for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
-
-gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_types(toktypes)
-
-special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
-special_vocab.add_to_gguf(gguf_writer)
-
-# TENSORS
-
-tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
-
-# params for qkv transform
-n_head_kv = hparams.get("n_head_kv", n_head)
-head_dim = n_embed // n_head
-
-# tensor info
-print("gguf: get tensor metadata")
-
-if num_parts == 0:
-    part_names = iter(("pytorch_model.bin",))
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-
-for part_name in part_names:
-    if args.vocab_only:
-        break
-    print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(dir_model / part_name, map_location="cpu")
-
-    has_lm_head = True
-    if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
-        has_lm_head = False
-
-    for original_name in model_part.keys():
-        data = model_part[original_name]
-        name = re.sub(r'transformer\.', '', original_name)
-
-        old_dtype = data.dtype
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        data = data.squeeze().numpy()
-
-        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
-            # Map bloom-style qkv_linear to gpt-style qkv_linear
-            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
-            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
-            qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
-            data = np.concatenate(
-                (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
-                 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
-                 qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
-                axis=0
-            )
-            print("re-format attention.linear_qkv.weight")
-        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
-            qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
-            data = np.concatenate(
-                (qkv_bias[:, 0, :].reshape((n_embed,)),
-                 qkv_bias[:, 1, :].reshape((n_embed,)),
-                 qkv_bias[:, 2, :].reshape((n_embed,))),
-                axis=0
-            )
-            print("re-format attention.linear_qkv.bias")
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-        gguf_writer.add_tensor(new_name, data)
-
-        if not has_lm_head and name == "word_embeddings.weight":
-            gguf_writer.add_tensor("output.weight", data)
-            print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))  # noqa
-
-
-print("gguf: write header")
-gguf_writer.write_header_to_file()
-print("gguf: write metadata")
-gguf_writer.write_kv_data_to_file()
-if not args.vocab_only:
-    print("gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-
-gguf_writer.close()
-
-print(f"gguf: model successfully exported to '{fname_out}'")
-print("")
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -1,253 +0,0 @@
-#!/usr/bin/env python3
-# HF falcon--> gguf conversion
-
-from __future__ import annotations
-
-import argparse
-import contextlib
-import json
-import os
-import struct
-import sys
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import torch
-from transformers import AutoTokenizer  # type: ignore[import]
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-
-def count_model_parts(dir_model: Path, prefix: str) -> int:
-    num_parts = 0
-    for filename in os.listdir(dir_model):
-        if filename.startswith(prefix):
-            num_parts += 1
-
-    if num_parts > 0:
-        print("gguf: found " + str(num_parts) + " model parts")
-    return num_parts
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
-    parser.add_argument(
-        "--vocab-only", action="store_true",
-        help="extract only the vocab",
-    )
-    parser.add_argument(
-        "--outfile", type=Path,
-        help="path to write to; default: based on input",
-    )
-    parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file, or model file itself (*.bin)",
-    )
-    parser.add_argument(
-        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
-        help="output format - use 0 for float32, 1 for float16",
-    )
-    return parser.parse_args()
-
-args = parse_args()
-
-dir_model = args.model
-ftype = args.ftype
-if not dir_model.is_dir():
-    print(f'Error: {args.model} is not a directory', file = sys.stderr)
-    sys.exit(1)
-
-# possible tensor data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-if args.outfile is not None:
-    fname_out = args.outfile
-else:
-    # output in the same directory as the model by default
-    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
-
-print("gguf: loading model "+dir_model.name)
-
-with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"):
-    print("Model architecture not supported: " + hparams["architectures"][0])
-
-    sys.exit(1)
-
-# get number of model parts
-num_parts = count_model_parts(dir_model, "model-00")
-if num_parts:
-    is_safetensors = True
-    from safetensors import safe_open
-else:
-    is_safetensors = False
-    num_parts = count_model_parts(dir_model, "pytorch_model-")
-
-ARCH=gguf.MODEL_ARCH.FALCON
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
-
-print("gguf: get model metadata")
-
-block_count = hparams.get("num_hidden_layers")
-if block_count is None:
-    block_count = hparams["n_layer"]  # old name
-
-n_head = hparams.get("num_attention_heads")
-if n_head is None:
-    n_head = hparams["n_head"]  # old name
-
-n_head_kv = hparams.get("num_kv_heads")
-if n_head_kv is None:
-    n_head_kv = hparams.get("n_head_kv", 1)  # old name
-
-gguf_writer.add_name("Falcon")
-gguf_writer.add_context_length(2048) # not in config.json
-gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
-gguf_writer.add_embedding_length(hparams["hidden_size"])
-gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
-gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(n_head)
-gguf_writer.add_head_count_kv(n_head_kv)
-gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
-gguf_writer.add_file_type(ftype)
-
-# TOKENIZATION
-
-print("gguf: get tokenizer metadata")
-
-tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
-
-# gpt2 tokenizer
-gguf_writer.add_tokenizer_model("gpt2")
-
-print("gguf: get gpt2 tokenizer vocab")
-
-# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-assert max(tokenizer.vocab.values()) < vocab_size
-
-reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-
-for i in range(vocab_size):
-    tokens.append(reverse_vocab[i])
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
-
-gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
-
-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
-special_vocab.add_to_gguf(gguf_writer)
-
-# TENSORS
-
-tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
-
-head_dim = hparams["hidden_size"] // n_head
-
-# tensor info
-print("gguf: get tensor metadata")
-
-if num_parts == 0:
-    part_names = iter(("pytorch_model.bin",))
-elif is_safetensors:
-    part_names = (
-        f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
-    )
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-
-for part_name in part_names:
-    if args.vocab_only:
-        break
-    print("gguf: loading model part '" + part_name + "'")
-    if is_safetensors:
-        ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
-    else:
-        ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
-
-    with ctx as model_part:
-        for name in model_part.keys():
-            data = model_part.get_tensor(name) if is_safetensors else model_part[name]
-
-            old_dtype = data.dtype
-
-            # convert any unsupported data types to float32
-            if data.dtype != torch.float16 and data.dtype != torch.float32:
-                data = data.to(torch.float32)
-
-            # QKV tensor transform
-            # The original query_key_value tensor contains n_head_kv "kv groups",
-            # each consisting of n_head/n_head_kv query weights followed by one key
-            # and one value weight (shared by all query heads in the kv group).
-            # This layout makes it a big pain to work with in GGML.
-            # So we rearrange them here,, so that we have n_head query weights
-            # followed by n_head_kv key weights followed by n_head_kv value weights,
-            # in contiguous fashion.
-            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
-
-            if "query_key_value" in name:
-                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
-                q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
-                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
-                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
-                data = torch.cat((q,k,v)).reshape_as(data)
-
-            data = data.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-            if new_name is None:
-                print("Can not map tensor '" + name + "'")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-            gguf_writer.add_tensor(new_name, data)
-
-
-print("gguf: write header")
-gguf_writer.write_header_to_file()
-print("gguf: write metadata")
-gguf_writer.write_kv_data_to_file()
-if not args.vocab_only:
-    print("gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-
-gguf_writer.close()
-
-print(f"gguf: model successfully exported to '{fname_out}'")
-print("")
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@@ -1,221 +0,0 @@
-#!/usr/bin/env python3
-# HF gptneox--> gguf conversion
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import struct
-import sys
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import torch
-from transformers import AutoTokenizer  # type: ignore[import]
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-
-def count_model_parts(dir_model: Path) -> int:
-    num_parts = 0
-    for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
-            num_parts += 1
-
-    if num_parts > 0:
-        print("gguf: found " + str(num_parts) + " model parts")
-    return num_parts
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
-    parser.add_argument(
-        "--vocab-only", action="store_true",
-        help="extract only the vocab",
-    )
-    parser.add_argument(
-        "--outfile", type=Path,
-        help="path to write to; default: based on input",
-    )
-    parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file, or model file itself (*.bin)",
-    )
-    parser.add_argument(
-        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
-        help="output format - use 0 for float32, 1 for float16",
-    )
-    return parser.parse_args()
-
-args = parse_args()
-
-dir_model = args.model
-ftype = args.ftype
-if not dir_model.is_dir():
-    print(f'Error: {args.model} is not a directory', file = sys.stderr)
-    sys.exit(1)
-
-# possible tensor data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-if args.outfile is not None:
-    fname_out = args.outfile
-else:
-    # output in the same directory as the model by default
-    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
-
-print("gguf: loading model "+dir_model.name)
-
-with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-if hparams["architectures"][0] != "GPTNeoXForCausalLM":
-    print("Model architecture not supported: " + hparams["architectures"][0])
-
-    sys.exit()
-
-# get number of model parts
-num_parts = count_model_parts(dir_model)
-
-ARCH=gguf.MODEL_ARCH.GPTNEOX
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
-
-print("gguf: get model metadata")
-
-block_count = hparams["num_hidden_layers"]
-
-gguf_writer.add_name(dir_model.name)
-gguf_writer.add_context_length(hparams["max_position_embeddings"])
-gguf_writer.add_embedding_length(hparams["hidden_size"])
-gguf_writer.add_block_count(block_count)
-gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
-gguf_writer.add_head_count(hparams["num_attention_heads"])
-gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
-gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
-
-# TOKENIZATION
-
-print("gguf: get tokenizer metadata")
-
-tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
-
-# gpt2 tokenizer
-gguf_writer.add_tokenizer_model("gpt2")
-
-print("gguf: get gpt2 tokenizer vocab")
-
-# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-assert max(tokenizer.vocab.values()) < vocab_size
-
-added_vocab = tokenizer.get_added_vocab()
-reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-
-for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
-
-gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_types(toktypes)
-
-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
-special_vocab.add_to_gguf(gguf_writer)
-
-# TENSORS
-
-tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
-
-# tensor info
-print("gguf: get tensor metadata")
-
-if num_parts == 0:
-    part_names = iter(("pytorch_model.bin",))
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-
-for part_name in part_names:
-    if args.vocab_only:
-        break
-    print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
-
-    for name in model_part.keys():
-        data = model_part[name]
-
-        # we don't need these
-        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
-            continue
-
-        old_dtype = data.dtype
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        data = data.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-        gguf_writer.add_tensor(new_name, data)
-
-
-print("gguf: write header")
-gguf_writer.write_header_to_file()
-print("gguf: write metadata")
-gguf_writer.write_kv_data_to_file()
-if not args.vocab_only:
-    print("gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-
-gguf_writer.close()
-
-print(f"gguf: model successfully exported to '{fname_out}'")
-print("")
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -0,0 +1,890 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import contextlib
+import json
+import os
+import re
+import sys
+from enum import IntEnum
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
+
+import numpy as np
+import torch
+
+if TYPE_CHECKING:
+    from torch import Tensor
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+import gguf
+
+
+###### MODEL DEFINITIONS ######
+
+class SentencePieceTokenTypes(IntEnum):
+    NORMAL = 1
+    UNKNOWN = 2
+    CONTROL = 3
+    USER_DEFINED = 4
+    UNUSED = 5
+    BYTE = 6
+
+
+class Model:
+    def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool):
+        self.dir_model = dir_model
+        self.ftype = ftype
+        self.fname_out = fname_out
+        self.is_big_endian = is_big_endian
+        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
+        self.is_safetensors = self._is_model_safetensors()
+        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
+        self.part_names = self._get_part_names()
+        self.hparams = Model.load_hparams(self.dir_model)
+        self.model_arch = self._get_model_architecture()
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
+        for part_name in self.part_names:
+            print(f"gguf: loading model part '{part_name}'")
+            ctx: ContextManager[Any]
+            if self.is_safetensors:
+                from safetensors import safe_open
+                ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
+            else:
+                ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu"))
+
+            with ctx as model_part:
+                for name in model_part.keys():
+                    data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
+                    yield name, data
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_block_count(self.hparams.get(
+            "n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
+        ))
+        if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
+            self.gguf_writer.add_context_length(n_ctx)
+        if (n_embd := self.hparams.get("hidden_size")) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+        if (n_ff := self.hparams.get("intermediate_size")) is not None:
+            self.gguf_writer.add_feed_forward_length(n_ff)
+        if (n_head := self.hparams.get("num_attention_head")) is not None:
+            self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+    def write(self):
+        self.write_tensors()
+        self.gguf_writer.write_header_to_file()
+        self.gguf_writer.write_kv_data_to_file()
+        self.gguf_writer.write_tensors_to_file()
+        self.gguf_writer.close()
+
+    def write_vocab(self):
+        self.gguf_writer.write_header_to_file()
+        self.gguf_writer.write_kv_data_to_file()
+        self.gguf_writer.close()
+
+    @staticmethod
+    def count_model_parts(dir_model: Path, prefix: str) -> int:
+        num_parts = 0
+        for filename in os.listdir(dir_model):
+            if filename.endswith(prefix):
+                num_parts += 1
+
+        return num_parts
+
+    @staticmethod
+    def load_hparams(dir_model):
+        with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+            return json.load(f)
+
+    @staticmethod
+    def from_model_architecture(model_architecture):
+        if model_architecture == "StableLMEpochForCausalLM":
+            return StableLMModel
+        if model_architecture == "GPTNeoXForCausalLM":
+            return GPTNeoXModel
+        if model_architecture == "BloomForCausalLM":
+            return BloomModel
+        if model_architecture == "MPTForCausalLM":
+            return MPTModel
+        if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
+            return BaichuanModel
+        if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
+            return FalconModel
+        if model_architecture == "GPTBigCodeForCausalLM":
+            return StarCoderModel
+        if model_architecture == "GPTRefactForCausalLM":
+            return RefactModel
+        if model_architecture == "PersimmonForCausalLM":
+            return PersimmonModel
+        return Model
+
+    def _is_model_safetensors(self) -> bool:
+        return Model.count_model_parts(self.dir_model, ".safetensors") > 0
+
+    def _get_part_names(self):
+        if self.is_safetensors:
+            if self.num_parts == 1:  # there's only one .safetensors file
+                return ("model.safetensors",)
+            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
+
+        if self.num_parts == 1:  # there's only one .bin file
+            return ("pytorch_model.bin",)
+        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
+
+    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
+        arch = self.hparams["architectures"][0]
+        if arch == "GPTNeoXForCausalLM":
+            return gguf.MODEL_ARCH.GPTNEOX
+        if arch == "BloomForCausalLM":
+            return gguf.MODEL_ARCH.BLOOM
+        if arch == "MPTForCausalLM":
+            return gguf.MODEL_ARCH.MPT
+        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
+            return gguf.MODEL_ARCH.BAICHUAN
+        if arch == "FalconForCausalLM":
+            return gguf.MODEL_ARCH.FALCON
+        if arch == "GPTBigCodeForCausalLM":
+            return gguf.MODEL_ARCH.STARCODER
+        if arch == "GPTRefactForCausalLM":
+            return gguf.MODEL_ARCH.REFACT
+        if arch == "PersimmonForCausalLM":
+            return gguf.MODEL_ARCH.PERSIMMON
+
+        raise NotImplementedError(f'Architecture "{arch}" not supported!')
+
+    def _set_vocab_gpt2(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer  # type: ignore[attr-defined]
+        tokenizer = AutoTokenizer.from_pretrained(dir_model)
+        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                pad_token = f"[PAD{i}]".encode('utf-8')
+                tokens.append(bytearray(pad_token))
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _set_vocab_sentencepiece(self):
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        tokens: list[bytes] = []
+        scores: list[float] = []
+        toktypes: list[int] = []
+
+        if not tokenizer_path.is_file():
+            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
+            sys.exit(1)
+
+        tokenizer = SentencePieceProcessor(str(tokenizer_path))
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        for token_id in range(vocab_size):
+            piece = tokenizer.id_to_piece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.get_score(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+
+                for key in added_tokens_json:
+                    tokens.append(key.encode("utf-8"))
+                    scores.append(-1000.0)
+                    toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+
+class StableLMModel(Model):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_rope_dimension_count(
+            int(self.hparams["rope_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
+        )
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+
+
+class GPTNeoXModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(
+            int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
+        )
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
+        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
+
+
+class BloomModel(Model):
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("Bloom")
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(4 * n_embed)
+        self.gguf_writer.add_block_count(self.hparams["n_layer"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        block_count = self.hparams["n_layer"]
+        tensors = dict(self.get_tensors())
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        has_lm_head = True
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+        for name, data_torch in tensors.items():
+            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
+                has_lm_head = False
+
+            name = re.sub(r'transformer\.', '', name)
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+                # Map bloom-style qkv_linear to gpt-style qkv_linear
+                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
+                data = np.concatenate(
+                    (
+                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.weight")
+            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
+                data = np.concatenate(
+                    (
+                        qkv_bias[:, 0, :].reshape((n_embed,)),
+                        qkv_bias[:, 1, :].reshape((n_embed,)),
+                        qkv_bias[:, 2, :].reshape((n_embed,)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.bias")
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+            if not has_lm_head and name == "word_embeddings.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+
+class MPTModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layers"]
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
+        self.gguf_writer.add_head_count(self.hparams["n_heads"])
+        if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
+            self.gguf_writer.add_head_count_kv(kv_n_heads)
+        self.gguf_writer.add_layer_norm_eps(1e-5)
+        if self.hparams["attn_config"]["clip_qkv"] is not None:
+            self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
+        self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
+
+    def write_tensors(self):
+        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        for name, data_torch in self.get_tensors():
+            # we don't need these
+            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+            # note: MPT output is tied to (same as) wte in original model;
+            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
+            if new_name == "token_embd.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+
+
+class BaichuanModel(Model):
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+        hf_repo = self.hparams.get("_name_or_path", "")
+
+        ctx_length = 0
+        if "max_sequence_length" in self.hparams:
+            ctx_length = self.hparams["max_sequence_length"]
+        elif "max_position_embeddings" in self.hparams:
+            ctx_length = self.hparams["max_position_embeddings"]
+        elif "model_max_length" in self.hparams:
+            ctx_length = self.hparams["model_max_length"]
+        else:
+            print("gguf: can not find ctx length parameter.")
+            sys.exit()
+
+        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_source_hf_repo(hf_repo)
+        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
+        self.gguf_writer.add_context_length(ctx_length)
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
+        self.gguf_writer.add_head_count(head_count)
+        self.gguf_writer.add_head_count_kv(head_count_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
+    def write_tensors(self):
+        # Collect tensors from generator object
+        model_kv = dict(self.get_tensors())
+        block_count = self.hparams["num_hidden_layers"]
+        head_count = self.hparams["num_attention_heads"]
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
+
+        for i in range(block_count):
+            if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
+                print(f"Unpacking and permuting layer {i}")
+                model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
+                    self._reverse_hf_permute_part(w, 0, head_count, head_count)
+                model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
+                    self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
+                model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
+                    self._reverse_hf_part(w, 2)
+                del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
+
+        for name, data_torch in model_kv.items():
+            # we don't need these
+            if name.endswith(".rotary_emb.inv_freq"):
+                continue
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            self.gguf_writer.add_tensor(new_name, data)
+
+    def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
+        if n_kv_head is not None and n_head != n_kv_head:
+            n_head //= n_kv_head
+
+        return (
+            weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape)
+        )
+
+    def _reverse_hf_permute_part(
+        self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
+    ) -> Tensor:
+        r = weights.shape[0] // 3
+        return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
+
+    def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
+        r = weights.shape[0] // 3
+        return weights[r * n_part:r * n_part + r, ...]
+
+
+class FalconModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams.get("num_hidden_layers")
+        if block_count is None:
+            block_count = self.hparams["n_layer"]  # old name
+
+        n_head = self.hparams.get("num_attention_heads")
+        if n_head is None:
+            n_head = self.hparams["n_head"]  # old name
+
+        n_head_kv = self.hparams.get("num_kv_heads")
+        if n_head_kv is None:
+            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
+
+        self.gguf_writer.add_name("Falcon")
+        self.gguf_writer.add_context_length(2048)  # not in config.json
+        self.gguf_writer.add_tensor_data_layout("jploski")  # qkv tensor transform
+        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("num_hidden_layers")
+        if block_count is None:
+            block_count = self.hparams["n_layer"]  # old name
+
+        n_head = self.hparams.get("num_attention_heads")
+        if n_head is None:
+            n_head = self.hparams["n_head"]  # old name
+
+        n_head_kv = self.hparams.get("num_kv_heads")
+        if n_head_kv is None:
+            n_head_kv = self.hparams.get("n_head_kv", 1)  # old name
+
+        head_dim = self.hparams["hidden_size"] // n_head
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in self.get_tensors():
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            # QKV tensor transform
+            # The original query_key_value tensor contains n_head_kv "kv groups",
+            # each consisting of n_head/n_head_kv query weights followed by one key
+            # and one value weight (shared by all query heads in the kv group).
+            # This layout makes it a big pain to work with in GGML.
+            # So we rearrange them here,, so that we have n_head query weights
+            # followed by n_head_kv key weights followed by n_head_kv value weights,
+            # in contiguous fashion.
+            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+            if "query_key_value" in name:
+                qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
+                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
+class StarCoderModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams["n_layer"]
+
+        self.gguf_writer.add_name("StarCoder")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_head_count_kv(1)
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+
+class RefactModel(Model):
+    def set_gguf_parameters(self):
+        hidden_dim = self.hparams["n_embd"]
+        inner_dim = 4 * hidden_dim
+        hidden_dim = int(2 * inner_dim / 3)
+        multiple_of = 256
+        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+        block_count = self.hparams["n_layer"]
+
+        self.gguf_writer.add_name("Refact")
+        # refact uses Alibi. So this is from config.json which might be used by training.
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
+
+        self.gguf_writer.add_feed_forward_length(ff_dim)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(self.hparams["n_head"])
+        self.gguf_writer.add_head_count_kv(1)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def write_tensors(self):
+        hidden_dim = self.hparams["n_embd"]
+        inner_dim = 4 * hidden_dim
+        hidden_dim = int(2 * inner_dim / 3)
+        multiple_of = 256
+        ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        n_head = self.hparams["n_head"]
+        n_head_kv = 1
+        head_dim = self.hparams["n_embd"] // n_head
+        block_count = self.hparams["n_layer"]
+
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        tensors = dict(self.get_tensors())
+        for i in range(block_count):
+            if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
+                tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
+                tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
+                del tensors[f"transformer.h.{i}.attn.kv.weight"]
+            if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
+                tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
+                del tensors[f"transformer.h.{i}.attn.q.weight"]
+            if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
+                tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
+                tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
+                del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
+
+        for name, data_torch in tensors.items():
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+
+class PersimmonModel(Model):
+    def set_gguf_parameters(self):
+        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
+        head_count = self.hparams["num_attention_heads"]
+        head_count_kv = head_count
+        hidden_size = self.hparams["hidden_size"]
+
+        self.gguf_writer.add_name('persimmon-8b-chat')
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
+        self.gguf_writer.add_head_count(head_count)
+        self.gguf_writer.add_head_count_kv(head_count_kv)
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
+
+    def set_vocab(self):
+        self._set_vocab_sentencepiece()
+        # self.gguf_writer.add_bos_token_id(71013)
+        # self.gguf_writer.add_eos_token_id(71013)
+
+    def write_tensors(self):
+        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        for name, data_torch in self.get_tensors():
+            if name.endswith(".self_attention.rotary_emb.inv_freq"):
+                continue
+            old_dtype = data_torch.dtype
+            # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
+            data = data_torch.to(torch.float32).squeeze().numpy()
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+            n_dims = len(data.shape)
+            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
+            self.gguf_writer.add_tensor(new_name, data)
+
+
+###### CONVERSION LOGIC ######
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
+    parser.add_argument(
+        "--vocab-only", action="store_true",
+        help="extract only the vocab",
+    )
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input",
+    )
+    parser.add_argument(
+        "--outtype", type=str, choices=["f32", "f16"], default="f16",
+        help="output format - use f32 for float32, f16 for float16",
+    )
+    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    parser.add_argument(
+        "model", type=Path,
+        help="directory containing model file",
+    )
+
+    return parser.parse_args()
+
+
+args = parse_args()
+
+dir_model = args.model
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file=sys.stderr)
+    sys.exit(1)
+
+ftype_map = {
+    "f32": gguf.GGMLQuantizationType.F32,
+    "f16": gguf.GGMLQuantizationType.F16,
+}
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
+
+print(f"Loading model: {dir_model.name}")
+
+hparams = Model.load_hparams(dir_model)
+
+model_class = Model.from_model_architecture(hparams["architectures"][0])
+model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
+
+print("Set model parameters")
+model_instance.set_gguf_parameters()
+
+print("Set model tokenizer")
+model_instance.set_vocab()
+
+if args.vocab_only:
+    print(f"Exporting model vocab to '{fname_out}'")
+    model_instance.write_vocab()
+else:
+    print(f"Exporting model to '{fname_out}'")
+    model_instance.write()
+
+print(f"Model successfully exported to '{fname_out}'")
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@@ -1,227 +0,0 @@
-#!/usr/bin/env python3
-# HF mpt--> gguf conversion
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import struct
-import sys
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import torch
-from transformers import AutoTokenizer  # type: ignore[import]
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-
-def count_model_parts(dir_model: Path) -> int:
-    num_parts = 0
-    for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
-            num_parts += 1
-
-    if num_parts > 0:
-        print("gguf: found " + str(num_parts) + " model parts")
-    return num_parts
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
-    parser.add_argument(
-        "--vocab-only", action="store_true",
-        help="extract only the vocab",
-    )
-    parser.add_argument(
-        "--outfile", type=Path,
-        help="path to write to; default: based on input",
-    )
-    parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file, or model file itself (*.bin)",
-    )
-    parser.add_argument(
-        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
-        help="output format - use 0 for float32, 1 for float16",
-    )
-    return parser.parse_args()
-
-args = parse_args()
-
-dir_model = args.model
-ftype = args.ftype
-if not dir_model.is_dir():
-    print(f'Error: {args.model} is not a directory', file = sys.stderr)
-    sys.exit(1)
-
-# possible tensor data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-if args.outfile is not None:
-    fname_out = args.outfile
-else:
-    # output in the same directory as the model by default
-    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
-
-print("gguf: loading model "+dir_model.name)
-
-with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-if hparams["architectures"][0] != "MPTForCausalLM":
-    print("Model architecture not supported: " + hparams["architectures"][0])
-
-    sys.exit()
-
-# get number of model parts
-num_parts = count_model_parts(dir_model)
-
-ARCH=gguf.MODEL_ARCH.MPT
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
-
-print("gguf: get model metadata")
-
-block_count = hparams["n_layers"]
-
-gguf_writer.add_name(dir_model.name)
-gguf_writer.add_context_length(hparams["max_seq_len"])
-gguf_writer.add_embedding_length(hparams["d_model"])
-gguf_writer.add_block_count(block_count)
-gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
-gguf_writer.add_head_count(hparams["n_heads"])
-if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
-    gguf_writer.add_head_count_kv(kv_n_heads)
-gguf_writer.add_layer_norm_eps(1e-05)
-if hparams["attn_config"]["clip_qkv"] is not None:
-    gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
-gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
-
-# TOKENIZATION
-
-print("gguf: get tokenizer metadata")
-
-tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
-
-# gpt2 tokenizer
-gguf_writer.add_tokenizer_model("gpt2")
-
-print("gguf: get gpt2 tokenizer vocab")
-
-# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
-# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
-# accomodate some "reserved" tokens; this is causing problems down the line in
-# llama.cpp, so we pad the vocab with dummy tokens:
-
-vocab_size = hparams["vocab_size"]
-
-# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-
-added_vocab = tokenizer.get_added_vocab()
-reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-
-for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
-
-gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_types(toktypes)
-
-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
-special_vocab.add_to_gguf(gguf_writer)
-
-# TENSORS
-
-tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
-
-# tensor info
-print("gguf: get tensor metadata")
-
-if num_parts == 0:
-    part_names = iter(("pytorch_model.bin",))
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-
-for part_name in part_names:
-    if args.vocab_only:
-        break
-    print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
-
-    for name in model_part.keys():
-        data = model_part[name]
-
-        old_dtype = data.dtype
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        data = data.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-        if new_name is None:
-            print("Cannot map tensor '" + name + "'")
-            continue # for the sake of compatibility with some old published models, don't quit
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-        gguf_writer.add_tensor(new_name, data)
-
-        # note: MPT output is tied to (same as) wte in original model;
-        # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-        if new_name == "token_embd.weight":
-            gguf_writer.add_tensor("output.weight", data)
-
-print("gguf: write header")
-gguf_writer.write_header_to_file()
-print("gguf: write metadata")
-gguf_writer.write_kv_data_to_file()
-if not args.vocab_only:
-    print("gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-
-gguf_writer.close()
-
-print(f"gguf: model successfully exported to '{fname_out}'")
-print("")
--- a/convert-refact-hf-to-gguf.py
+++ b/convert-refact-hf-to-gguf.py
@@ -1,272 +0,0 @@
-#!/usr/bin/env python3
-# HF refact--> gguf conversion
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import sys
-from pathlib import Path
-
-import numpy as np
-import torch
-from transformers import AutoTokenizer  # type: ignore[import]
-
-if "NO_LOCAL_GGUF" not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
-import gguf
-
-def count_model_parts(dir_model: Path) -> int:
-    num_parts = 0
-    for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
-            num_parts += 1
-
-    if num_parts > 0:
-        print("gguf: found " + str(num_parts) + " model parts")
-    return num_parts
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Convert a Refact model to a GGML compatible file"
-    )
-    parser.add_argument(
-        "--vocab-only",
-        action="store_true",
-        help="extract only the vocab",
-    )
-    parser.add_argument(
-        "--outfile",
-        type=Path,
-        help="path to write to; default: based on input",
-    )
-    parser.add_argument(
-        "model",
-        type=Path,
-        help="directory containing model file, or model file itself (*.bin)",
-    )
-    parser.add_argument(
-        "ftype",
-        type=int,
-        choices=[0, 1],
-        default=1,
-        nargs="?",
-        help="output format - use 0 for float32, 1 for float16",
-    )
-    return parser.parse_args()
-
-
-args = parse_args()
-
-dir_model = args.model
-ftype = args.ftype
-if not dir_model.is_dir():
-    print(f"Error: {args.model} is not a directory", file=sys.stderr)
-    sys.exit(1)
-
-# possible tensor data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-if args.outfile is not None:
-    fname_out = args.outfile
-else:
-    # output in the same directory as the model by default
-    fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
-
-print("gguf: loading model " + dir_model.name)
-
-with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-if hparams["architectures"][0] != "GPTRefactForCausalLM":
-    print("Model architecture not supported: " + hparams["architectures"][0])
-
-    sys.exit(1)
-
-# get number of model parts
-num_parts = count_model_parts(dir_model)
-
-ARCH = gguf.MODEL_ARCH.REFACT
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
-
-print("gguf: get model metadata")
-
-# Get refact feed forward dimension
-hidden_dim = hparams["n_embd"]
-inner_dim = 4 * hidden_dim
-hidden_dim = int(2 * inner_dim / 3)
-multiple_of = 256
-ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-
-block_count = hparams["n_layer"]
-
-gguf_writer.add_name("Refact")
-# refact uses Alibi. So this is from config.json which might be used by training.
-gguf_writer.add_context_length(hparams["n_positions"])
-gguf_writer.add_embedding_length(hparams["n_embd"])
-
-gguf_writer.add_feed_forward_length(ff_dim)
-gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams["n_head"])
-gguf_writer.add_head_count_kv(1)
-gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
-gguf_writer.add_file_type(ftype)
-
-# TOKENIZATION
-
-print("gguf: get tokenizer metadata")
-
-tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
-
-# gpt2 tokenizer
-gguf_writer.add_tokenizer_model("gpt2")
-
-print("gguf: get gpt2 tokenizer vocab")
-
-# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-assert max(tokenizer.vocab.values()) < vocab_size
-
-added_vocab = tokenizer.get_added_vocab()
-reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-
-for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
-
-gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_types(toktypes)
-
-special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
-special_vocab.add_to_gguf(gguf_writer)
-
-# TENSORS
-
-tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
-
-# params for qkv transform
-n_head = hparams["n_head"]
-n_head_kv = 1
-
-head_dim = hparams["n_embd"] // n_head
-
-# tensor info
-print("gguf: get tensor metadata")
-
-if num_parts == 0:
-    part_names = iter(("pytorch_model.bin",))
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-for part_name in part_names:
-    if args.vocab_only:
-        break
-    print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(dir_model / part_name, map_location="cpu")
-
-    for i in range(block_count):
-        if f"transformer.h.{i}.attn.kv.weight" in model_part:
-            data = model_part[f"transformer.h.{i}.attn.kv.weight"]
-            model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
-                : n_head_kv * head_dim
-            ]
-            model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
-                n_head_kv * head_dim :
-            ]
-            del model_part[f"transformer.h.{i}.attn.kv.weight"]
-        if f"transformer.h.{i}.attn.q.weight" in model_part:
-            model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
-                f"transformer.h.{i}.attn.q.weight"
-            ]
-            del model_part[f"transformer.h.{i}.attn.q.weight"]
-        if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
-            data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
-            model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
-            model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
-            del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
-
-    for name in model_part.keys():
-        data = model_part[name]
-
-        old_dtype = data.dtype
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        data = data.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
-        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if (
-            ftype == 1
-            and data_dtype == np.float32
-            and name.endswith(".weight")
-            and n_dims == 2
-        ):
-            data = data.astype(np.float16)
-
-        print(
-            new_name
-            + ", n_dims = "
-            + str(n_dims)
-            + ", "
-            + str(old_dtype)
-            + " --> "
-            + str(data.dtype)
-        )
-
-        gguf_writer.add_tensor(new_name, data)
-
-
-print("gguf: write header")
-gguf_writer.write_header_to_file()
-print("gguf: write metadata")
-gguf_writer.write_kv_data_to_file()
-if not args.vocab_only:
-    print("gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-
-gguf_writer.close()
-
-print(f"gguf: model successfully exported to '{fname_out}'")
-print("")
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@@ -1,210 +0,0 @@
-#!/usr/bin/env python3
-# HF starcoder --> gguf conversion
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import struct
-import sys
-from pathlib import Path
-from typing import Any
-
-import numpy as np
-import torch
-from transformers import AutoTokenizer  # type: ignore[import]
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-
-def count_model_parts(dir_model: Path) -> int:
-    num_parts = 0
-    for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
-            num_parts += 1
-
-    if num_parts > 0:
-        print("gguf: found " + str(num_parts) + " model parts")
-    return num_parts
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
-    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
-    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
-    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
-    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
-    return parser.parse_args()
-
-args = parse_args()
-
-dir_model = args.model
-ftype = args.ftype
-if not dir_model.is_dir():
-    print(f'Error: {args.model} is not a directory', file = sys.stderr)
-    sys.exit(1)
-
-# possible tensor data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-if args.outfile is not None:
-    fname_out = args.outfile
-else:
-    # output in the same directory as the model by default
-    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
-
-print("gguf: loading model "+dir_model.name)
-
-with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-
-if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
-    print("Model architecture not supported: " + hparams["architectures"][0])
-
-    sys.exit(1)
-
-# get number of model parts
-num_parts = count_model_parts(dir_model)
-
-ARCH=gguf.MODEL_ARCH.STARCODER
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
-
-print("gguf: get model metadata")
-
-block_count = hparams["n_layer"]
-
-gguf_writer.add_name("StarCoder")
-gguf_writer.add_context_length(hparams["n_positions"])
-gguf_writer.add_embedding_length(hparams["n_embd"])
-gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
-gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams["n_head"])
-gguf_writer.add_head_count_kv(1)
-gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
-gguf_writer.add_file_type(ftype)
-
-# TOKENIZATION
-
-print("gguf: get tokenizer metadata")
-
-tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []
-
-# gpt2 tokenizer
-gguf_writer.add_tokenizer_model("gpt2")
-
-print("gguf: get gpt2 tokenizer vocab")
-
-# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-tokenizer = AutoTokenizer.from_pretrained(dir_model)
-
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
-assert max(tokenizer.vocab.values()) < vocab_size
-
-added_vocab = tokenizer.get_added_vocab()
-reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-
-for i in range(vocab_size):
-    if i not in reverse_vocab:
-        tokens.append(f"[PAD{i}]")
-        toktypes.append(gguf.TokenType.USER_DEFINED)
-    elif reverse_vocab[i] in added_vocab:
-        tokens.append(reverse_vocab[i])
-        if tokenizer.added_tokens_decoder[i].special:
-            toktypes.append(gguf.TokenType.CONTROL)
-        else:
-            toktypes.append(gguf.TokenType.USER_DEFINED)
-    else:
-        tokens.append(reverse_vocab[i])
-        toktypes.append(gguf.TokenType.NORMAL)
-
-gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_types(toktypes)
-special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
-special_vocab.add_to_gguf(gguf_writer)
-
-# TENSORS
-
-tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
-
-# params for qkv transform
-n_head    = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
-
-head_dim = hparams["n_embd"] // n_head
-
-# tensor info
-print("gguf: get tensor metadata")
-
-if num_parts == 0:
-    part_names = iter(("pytorch_model.bin",))
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-
-for part_name in part_names:
-    if args.vocab_only:
-        break
-    print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(dir_model / part_name, map_location="cpu")
-
-    for name in model_part.keys():
-        data = model_part[name]
-
-        old_dtype = data.dtype
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        data = data.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-        gguf_writer.add_tensor(new_name, data)
-
-
-print("gguf: write header")
-gguf_writer.write_header_to_file()
-print("gguf: write metadata")
-gguf_writer.write_kv_data_to_file()
-if not args.vocab_only:
-    print("gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-
-gguf_writer.close()
-
-print(f"gguf: model successfully exported to '{fname_out}'")
-print("")
--- a/convert.py
+++ b/convert.py
@@ -26,7 +26,7 @@ from pathlib import Path
 from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar

 import numpy as np
-from sentencepiece import SentencePieceProcessor  # type: ignore[import]
+from sentencepiece import SentencePieceProcessor

 import os
 if 'NO_LOCAL_GGUF' not in os.environ:
@@ -328,7 +328,7 @@ class BpeVocab:

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.bpe_tokenizer
-        from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
+        from transformers.models.gpt2 import tokenization_gpt2
        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}

        for i, _ in enumerate(tokenizer):
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -1,9 +1,6 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "ggml.h"

--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -3,6 +3,3 @@ add_executable(${TARGET} embedding.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "llama.h"

--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -643,7 +643,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(

        return ggml_rope_custom(ctx,
            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
-            rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f
+            rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
        );
    };

--- a/examples/infill/CMakeLists.txt
+++ b/examples/infill/CMakeLists.txt
@@ -3,6 +3,3 @@ add_executable(${TARGET} infill.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -2,7 +2,6 @@

 #include "console.h"
 #include "llama.h"
-#include "build-info.h"
 #include "grammar-parser.h"

 #include <cassert>
@@ -184,8 +183,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
+    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
--- a/examples/llama-bench/CMakeLists.txt
+++ b/examples/llama-bench/CMakeLists.txt
@@ -3,6 +3,3 @@ add_executable(${TARGET} llama-bench.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -19,7 +19,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "build-info.h"
 #include "ggml-cuda.h"

 // utils
@@ -641,8 +640,8 @@ struct test {
    }
 };

-const std::string test::build_commit = BUILD_COMMIT;
-const int         test::build_number = BUILD_NUMBER;
+const std::string test::build_commit = LLAMA_COMMIT;
+const int         test::build_number = LLAMA_BUILD_NUMBER;
 const bool        test::cuda         = !!ggml_cpu_has_cublas();
 const bool        test::opencl       = !!ggml_cpu_has_clblast();
 const bool        test::metal        = !!ggml_cpu_has_metal();
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -1,20 +1,36 @@
-set(TARGET clip)
-add_library(${TARGET} clip.cpp clip.h)
-install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if (NOT MSVC)
-    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
-    endif()
-if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
+add_library(llava OBJECT
+            llava.cpp
+            llava.h
+            clip.cpp
+            clip.h
+            )
+
+target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+
+target_include_directories(llava PUBLIC .)
+target_include_directories(llava PUBLIC ../..)
+target_include_directories(llava PUBLIC ../../common)
+
+target_compile_features(llava PRIVATE cxx_std_11)
+
+add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
+if (BUILD_SHARED_LIBS)
+    set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
+    target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
+    install(TARGETS llava_shared LIBRARY)
 endif()

-set(TARGET llava)
-add_executable(${TARGET} llava.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (NOT MSVC)
+    target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
+    endif()
 if(TARGET BUILD_INFO)
-    add_dependencies(${TARGET} BUILD_INFO)
+    add_dependencies(llava BUILD_INFO)
 endif()
+
+set(TARGET llava-cli)
+add_executable(llava-cli llava-cli.cpp)
+install(TARGETS llava-cli RUNTIME)
+target_link_libraries(llava-cli PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(llava PRIVATE cxx_std_11)
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -9,12 +9,12 @@ models are available.
 After API is confirmed, more models will be supported / uploaded.

 ## Usage
-Build with cmake or run `make llava` to build it.
+Build with cmake or run `make llava-cli` to build it.

-After building, run: `./llava` to see the usage. For example:
+After building, run: `./llava-cli` to see the usage. For example:

 ```sh
-./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```

 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -51,7 +51,6 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director

 ## TODO

- [ ] Support server mode.
 - [ ] Support non-CPU backend for the image encoding part.
 - [ ] Support different sampling methods.
 - [ ] Support more model variants.
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -680,26 +680,44 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    return new_clip;
 }

-clip_image_u8 * make_clip_image_u8() { return new clip_image_u8(); }
-
+clip_image_u8 * make_clip_image_u8() {
+    auto img = new clip_image_u8();
+    return img;
+}
 clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }

-bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
-    int nx, ny, nc;
-    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
-    if (!data) {
-        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname);
-        return false;
-    }
+void clip_image_u8_free(clip_image_u8 * img) { if (img->data) { delete[] img->data; } delete img; }
+void clip_image_f32_free(clip_image_f32 * img) { if (img->data) { delete[] img->data; } delete img; }

+static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
    img->nx = nx;
    img->ny = ny;
    img->size = nx * ny * 3;
    img->data = new uint8_t[img->size]();
    memcpy(img->data, data, img->size);
+}

+bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
+    int nx, ny, nc;
+    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
+    if (!data) {
+        fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
+        return false;
+    }
+    build_clip_img_from_data(data, nx, ny, img);
    stbi_image_free(data);
+    return true;
+}

+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
+    int nx, ny, nc;
+    auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
+    if (!data) {
+        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
+        return false;
+    }
+    build_clip_img_from_data(data, nx, ny, img);
+    stbi_image_free(data);
    return true;
 }

@@ -714,39 +732,40 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156

-    clip_image_u8 temp; // we will keep the input image data here temporarily
+    clip_image_u8 * temp = make_clip_image_u8(); // we will keep the input image data here temporarily
    if (pad2square && img->nx != img->ny) {
        int longer_side = std::max(img->nx, img->ny);
-        temp.nx = longer_side;
-        temp.ny = longer_side;
-        temp.size = 3 * longer_side * longer_side;
-        temp.data = new uint8_t[temp.size]();
+        temp->nx = longer_side;
+        temp->ny = longer_side;
+        temp->size = 3 * longer_side * longer_side;
+        temp->data = new uint8_t[temp->size]();
        uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA

        // fill with background color
-        for (size_t i = 0; i < temp.size; i++) {
-            temp.data[i] = bc[i % 3];
+        for (size_t i = 0; i < temp->size; i++) {
+            temp->data[i] = bc[i % 3];
        }

        // copy from the input image
        for (int y = 0; y < img->ny; y++) {
            for (int x = 0; x < img->nx; x++) {
                const int i = 3 * (y * img->nx + x);
-                const int j = 3 * (y * temp.nx + x);
-                temp.data[j] = img->data[i];
-                temp.data[j+1] = img->data[i+1];
-                temp.data[j+2] = img->data[i+2];
+                const int j = 3 * (y * temp->nx + x);
+                temp->data[j] = img->data[i];
+                temp->data[j+1] = img->data[i+1];
+                temp->data[j+2] = img->data[i+2];
            }
        }
    } else {
-        temp.nx   = img->nx;
-        temp.ny   = img->ny;
-        temp.size = img->size;
-        temp.data = img->data;
+        temp->nx   = img->nx;
+        temp->ny   = img->ny;
+        temp->size = img->size;
+        temp->data = new uint8_t[temp->size]();
+        *temp->data = *img->data; // copy
    }

-    const int nx = temp.nx;
-    const int ny = temp.ny;
+    const int nx = temp->nx;
+    const int ny = temp->ny;

    const int nx2 = ctx->vision_model.hparams.image_size;
    const int ny2 = ctx->vision_model.hparams.image_size;
@@ -785,10 +804,10 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
                const int j10 = 3 * (y1 * nx + x0) + c;
                const int j11 = 3 * (y1 * nx + x1) + c;

-                const float v00 = temp.data[j00];
-                const float v01 = temp.data[j01];
-                const float v10 = temp.data[j10];
-                const float v11 = temp.data[j11];
+                const float v00 = temp->data[j00];
+                const float v01 = temp->data[j01];
+                const float v10 = temp->data[j10];
+                const float v11 = temp->data[j11];

                const float v0 = v00 * (1.0f - dx) + v01 * dx;
                const float v1 = v10 * (1.0f - dx) + v11 * dx;
@@ -803,6 +822,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
            }
        }
    }
+    clip_image_u8_free(temp);

    return true;
 }
@@ -1049,16 +1069,16 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    return true;
 }

-int clip_n_mmproj_embd(struct clip_ctx * ctx) {
+int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
    return ctx->vision_model.mm_2_b->ne[0];
 }

-int clip_n_patches(struct clip_ctx * ctx) {
+int clip_n_patches(const struct clip_ctx * ctx) {
    auto & params = ctx->vision_model.hparams;

    return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 }

-size_t clip_embd_nbytes(struct clip_ctx * ctx) {
+size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -1,7 +1,22 @@
 #ifndef CLIP_H
 #define CLIP_H

-#include "ggml.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef LLAMA_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_BUILD
+#            define CLIP_API __declspec(dllexport)
+#        else
+#            define CLIP_API __declspec(dllimport)
+#        endif
+#    else
+#        define CLIP_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define CLIP_API
+#endif

 struct clip_ctx;

@@ -20,19 +35,20 @@ struct clip_vision_hparams {
    float eps;
 };

-struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+/** load mmproj model */
+CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity);
+/** free mmproj model */
+CLIP_API void clip_free(struct clip_ctx * ctx);

-void clip_free(struct clip_ctx * ctx);
-
-size_t clip_embd_nbytes(struct clip_ctx * ctx);
-int clip_n_patches(struct clip_ctx * ctx);
-int clip_n_mmproj_embd(struct clip_ctx * ctx);
+size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+int clip_n_patches(const struct clip_ctx * ctx);
+int clip_n_mmproj_embd(const struct clip_ctx * ctx);

 // RGB uint8 image
 struct clip_image_u8 {
    int nx;
    int ny;
-    uint8_t * data;
+    uint8_t * data = NULL;
    size_t size;
 };

@@ -41,7 +57,7 @@ struct clip_image_u8 {
 struct clip_image_f32 {
    int nx;
    int ny;
-    float * data;
+    float * data = NULL;
    size_t size;
 };

@@ -57,7 +73,12 @@ struct clip_image_f32_batch {

 struct clip_image_u8 * make_clip_image_u8();
 struct clip_image_f32 * make_clip_image_f32();
-bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+CLIP_API void clip_image_u8_free(clip_image_u8 * img);
+CLIP_API void clip_image_f32_free(clip_image_f32 * img);
+CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
+CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
+
 bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
 bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);

--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -0,0 +1,313 @@
+#include "ggml.h"
+#include "common.h"
+#include "clip.h"
+#include "llava.h"
+#include "llama.h"
+
+#include "base64.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+            fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string              str2     = str;
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
+    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+    return true;
+}
+
+// TODO: use common/sampling.h
+static llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
+    auto & sparams = params.sparams;
+
+    // out of user input, sample next token
+    const float   temp      = sparams.temp;
+    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
+    const float   top_p     = sparams.top_p;
+    const float   tfs_z     = sparams.tfs_z;
+    const float   typical_p = sparams.typical_p;
+    // const int32_t repeat_last_n   = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
+    // const float   repeat_penalty  = sparams.repeat_penalty;
+    // const float   alpha_presence  = sparams.presence_penalty;
+    // const float   alpha_frequency = sparams.frequency_penalty;
+    const int     mirostat     = sparams.mirostat;
+    const float   mirostat_tau = sparams.mirostat_tau;
+    const float   mirostat_eta = sparams.mirostat_eta;
+    // const bool    penalize_nl     = sparams.penalize_nl;
+
+    llama_token id = 0;
+    {
+        auto logits  = llama_get_logits(ctx_llama);
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
+
+        // Apply params.logit_bias map
+        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
+            logits[it->first] += it->second;
+        }
+
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+        if (temp <= 0) {
+              // Greedy sampling
+            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
+        } else {
+            if (mirostat == 1) {
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                const  int mirostat_m    = 100;
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+            } else if (mirostat == 2) {
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+            } else {
+                  // Temperature sampling
+                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
+                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
+                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
+                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token(ctx_llama, &candidates_p);
+            }
+        }
+    }
+
+    return id;
+}
+
+static const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
+    int id = sample_id(ctx_llama, params);
+    static std::string ret;
+    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
+
+static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
+static const char* IMG_BASE64_TAG_END = "\">";
+
+static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
+    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
+    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
+}
+
+static bool prompt_contains_image(const std::string& prompt) {
+    size_t begin, end;
+    find_image_tag_in_prompt(prompt, begin, end);
+    return (begin != std::string::npos);
+}
+
+// replaces the base64 image tag in the prompt with `replacement`
+static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
+    size_t img_base64_str_start, img_base64_str_end;
+    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
+    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
+        fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        return NULL;
+    }
+
+    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
+    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
+    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
+
+    auto required_bytes = base64::required_encode_size(base64_str.size());
+    auto img_bytes = std::vector<unsigned char>(required_bytes);
+    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
+
+    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
+    if (!embed) {
+        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
+        return NULL;
+    }
+
+    return embed;
+}
+
+static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
+    size_t begin, end;
+    find_image_tag_in_prompt(prompt, begin, end);
+    if (begin == std::string::npos || end == std::string::npos) {
+        return prompt;
+    }
+    auto pre = prompt.substr(0, begin);
+    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
+    return pre + replacement + post;
+}
+
+struct llava_context {
+    struct clip_ctx * ctx_clip = NULL;
+    struct llama_context * ctx_llama = NULL;
+    struct llama_model * model = NULL;
+};
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
+
+    // load and preprocess the image
+    llava_image_embed * embed = NULL;
+    auto prompt = params->prompt;
+    if (prompt_contains_image(prompt)) {
+        if (!params->image.empty()) {
+            printf("using base64 encoded image instead of command line image path\n");
+        }
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        if (!embed) {
+            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
+            return NULL;
+        }
+        params->prompt = remove_image_from_prompt(prompt);
+    } else {
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
+        if (!embed) {
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
+            return NULL;
+        }
+    }
+
+    return embed;
+}
+
+static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
+    int n_past = 0;
+
+    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+
+    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
+    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, true);
+    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+
+    // generate the response
+
+    printf("\n");
+
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
+        if (strcmp(tmp, "</s>") == 0) break;
+
+        printf("%s", tmp);
+        fflush(stdout);
+    }
+
+    printf("\n");
+}
+
+
+static struct llava_context * llava_init(gpt_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+
+    llama_backend_init(params->numa);
+
+    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+
+    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return NULL;
+    }
+
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return NULL;
+    }
+
+    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+
+    ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->ctx_clip = ctx_clip;
+    ctx_llava->model = model;
+    return ctx_llava;
+}
+
+static void llava_free(struct llava_context * ctx_llava) {
+    if (ctx_llava->ctx_clip) {
+        clip_free(ctx_llava->ctx_clip);
+        ctx_llava->ctx_clip = NULL;
+    }
+
+    llama_free(ctx_llava->ctx_llama);
+    llama_free_model(ctx_llava->model);
+    llama_backend_free();
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+        gpt_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    auto ctx_llava = llava_init(&params);
+    if (ctx_llava == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
+        return 1;
+    }
+
+    auto image_embed = load_image(ctx_llava, &params);
+
+    // process the prompt
+    process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+    llama_print_timings(ctx_llava->ctx_llama);
+
+    llava_image_embed_free(image_embed);
+    llava_free(ctx_llava);
+    return 0;
+}
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@@ -1,147 +0,0 @@
-#pragma once
-
-// this one and clip lib will be eventually merged to a single lib, let's keep it this way for now
-
-#include "common.h"
-#include "llama.h"
-
-#include <cstdio>
-#include <cstdlib>
-#include <vector>
-
-inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
-    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
-
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = N - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-        if (llama_decode(ctx_llama, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
-    int N = (int) tokens.size();
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(ctx_llama, tokens, 1, n_past);
-}
-
-inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
-    std::string              str2     = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
-    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
-    return true;
-}
-
-// TODO: use common/sampling.h
-inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
-    auto & sparams = params.sparams;
-
-    // out of user input, sample next token
-    const float   temp      = sparams.temp;
-    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
-    const float   top_p     = sparams.top_p;
-    const float   tfs_z     = sparams.tfs_z;
-    const float   typical_p = sparams.typical_p;
-    // const int32_t repeat_last_n   = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
-    // const float   repeat_penalty  = sparams.repeat_penalty;
-    // const float   alpha_presence  = sparams.presence_penalty;
-    // const float   alpha_frequency = sparams.frequency_penalty;
-    const int     mirostat     = sparams.mirostat;
-    const float   mirostat_tau = sparams.mirostat_tau;
-    const float   mirostat_eta = sparams.mirostat_eta;
-    // const bool    penalize_nl     = sparams.penalize_nl;
-
-    llama_token id = 0;
-    {
-        auto logits  = llama_get_logits(ctx_llama);
-        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
-
-        // Apply params.logit_bias map
-        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
-            logits[it->first] += it->second;
-        }
-
-        std::vector<llama_token_data> candidates;
-        candidates.reserve(n_vocab);
-        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-        }
-
-        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-        // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl(ctx)];
-        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-        // llama_sample_repetition_penalty(ctx, &candidates_p,
-        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        //      last_n_repeat, repeat_penalty);
-        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-        // last_n_repeat, alpha_frequency, alpha_presence);
-        // if (!penalize_nl) {
-        //     logits[llama_token_nl(ctx)] = nl_logit;
-        // }
-
-        if (temp <= 0) {
-              // Greedy sampling
-            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
-        } else {
-            if (mirostat == 1) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                const  int mirostat_m    = 100;
-                llama_sample_temp(ctx_llama, &candidates_p, temp);
-                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-            } else if (mirostat == 2) {
-                static float mirostat_mu = 2.0f * mirostat_tau;
-                llama_sample_temp(ctx_llama, &candidates_p, temp);
-                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-            } else {
-                  // Temperature sampling
-                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
-                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
-                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
-                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
-                llama_sample_temp(ctx_llama, &candidates_p, temp);
-                id = llama_sample_token(ctx_llama, &candidates_p);
-            }
-        }
-    }
-
-    return id;
-}
-
-inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
-    int id = sample_id(ctx_llama, params);
-    static std::string ret;
-    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
-        ret = "</s>";
-    } else {
-        ret = llama_token_to_piece(ctx_llama, id);
-    }
-    eval_id(ctx_llama, id, n_past);
-    return ret.c_str();
-}
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -1,164 +1,156 @@
 #include "clip.h"
-#include "llava-utils.h"
 #include "common.h"
 #include "llama.h"
+#include "llava.h"

 #include <cstdio>
 #include <cstdlib>
 #include <vector>

-static void show_additional_info(int /*argc*/, char ** argv) {
-    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
+#include "base64.hpp"

-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        show_additional_info(argc, argv);
-        return 1;
+static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
+    clip_image_f32 * img_res = make_clip_image_f32();
+    if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
+        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
+        clip_image_f32_free(img_res);
+        return false;
    }

-    if (params.mmproj.empty() || params.image.empty()) {
-        gpt_print_usage(argc, argv, params);
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    const char * clip_path = params.mmproj.c_str();
-    const char * img_path = params.image.c_str();
-
-    if (params.prompt.empty()) {
-        params.prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
-    // load and preprocess the image
-    clip_image_u8 img;
-    clip_image_f32 img_res;
-
-    if (!clip_image_load_from_file(img_path, &img)) {
-        fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
-
-        clip_free(ctx_clip);
-        return 1;
-    }
-
-    if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
-        fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);
-
-        clip_free(ctx_clip);
-        return 1;
-    }
-
-    int n_img_pos  = clip_n_patches(ctx_clip);
-    int n_img_embd = clip_n_mmproj_embd(ctx_clip);
-
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
-
-    if (!image_embd) {
-        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
-
-        return 1;
-    }
+    *n_img_pos = clip_n_patches(ctx_clip);

    const int64_t t_img_enc_start_us = ggml_time_us();
-    if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) {
+    bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
+    clip_image_f32_free(img_res);
+    if (!encoded) {
        fprintf(stderr, "Unable to encode image\n");

-        return 1;
+        return false;
    }
+
    const int64_t t_img_enc_end_us = ggml_time_us();
+    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

-    // we get the embeddings, free up the memory required for CLIP
-    clip_free(ctx_clip);
+    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

-    llama_backend_init(params.numa);
-
-    llama_model_params model_params              = llama_model_default_params();
-                       model_params.n_gpu_layers = params.n_gpu_layers;
-                       model_params.main_gpu     = params.main_gpu;
-                       model_params.tensor_split = params.tensor_split;
-                       model_params.use_mmap     = params.use_mmap;
-                       model_params.use_mlock    = params.use_mlock;
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    llama_context_params ctx_params = llama_context_default_params();
-
-    ctx_params.n_ctx           = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
-    ctx_params.n_threads       = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-    ctx_params.seed            = params.seed;
-
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx_llama == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
-
-    // make sure that the correct mmproj was used, i.e., compare apples to apples
-    const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
-
-    if (n_img_embd != n_llama_embd) {
-        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
-
-        llama_free(ctx_llama);
-        llama_free_model(model);
-        llama_backend_free();
-        free(image_embd);
-
-        return 1;
-    }
-
-    // process the prompt
-    // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"
-
-    int n_past = 0;
-
-    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-
-    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
-    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
-    eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);
-
-    // generate the response
-
-    printf("\n");
-    printf("prompt: '%s'\n", params.prompt.c_str());
-    printf("\n");
-
-    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(ctx_llama, params, &n_past);
-        if (strcmp(tmp, "</s>") == 0) break;
-
-        printf("%s", tmp);
-        fflush(stdout);
-    }
-
-    printf("\n");
-
-    {
-        const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
-
-        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
-    }
-
-    llama_print_timings(ctx_llama);
-
-    llama_free(ctx_llama);
-    llama_free_model(model);
-    llama_backend_free();
-    free(image_embd);
-
-    return 0;
+    return true;
+}
+
+bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
+        // make sure that the correct mmproj was used, i.e., compare apples to apples
+    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
+    if (n_image_embd != n_llama_embd) {
+        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
+        return false;
+    }
+    return true;
+}
+
+static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
+    if (!image_embd) {
+        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
+        free(image_embd);
+        return false;
+    }
+
+    int n_img_pos;
+    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
+        fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
+        free(image_embd);
+        return false;
+    }
+    *image_embd_out = image_embd;
+    *n_img_pos_out = n_img_pos;
+
+    return true;
+}
+
+bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
+    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
+
+    for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
+        int n_eval = image_embed->n_image_pos - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        if (llama_decode(ctx_llama, batch)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
+    clip_image_u8 * img = make_clip_image_u8();
+    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
+        clip_image_u8_free(img);
+        fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
+        return NULL;
+    }
+
+    float* image_embed = NULL;
+    int n_image_pos = 0;
+    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
+    if (!image_embed_result) {
+        clip_image_u8_free(img);
+        fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
+        return NULL;
+    }
+
+    clip_image_u8_free(img);
+    auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    result->embed = image_embed;
+    result->n_image_pos = n_image_pos;
+    return result;
+}
+
+static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
+    auto file = fopen(path, "rb");
+    if (file == NULL) {
+        fprintf(stderr, "%s: can't read file %s\n", __func__, path);
+        return false;
+    }
+
+    fseek(file, 0, SEEK_END);
+    auto fileSize = ftell(file);
+    fseek(file, 0, SEEK_SET);
+
+    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
+    if (buffer == NULL) {
+        fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        perror("Memory allocation error");
+        fclose(file);
+        return false;
+    }
+    fread(buffer, 1, fileSize, file); // Read the file into the buffer
+    fclose(file); // Close the file
+
+    *bytesOut = buffer;
+    *sizeOut = fileSize;
+    return true;
+}
+
+LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
+    unsigned char* image_bytes;
+    long image_bytes_length;
+    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
+    if (!loaded) {
+        fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
+        return NULL;
+    }
+
+    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
+    free(image_bytes);
+
+    return embed;
+}
+
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) {
+    free(embed->embed);
+    free(embed);
 }
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -0,0 +1,50 @@
+#ifndef LLAVA_H
+#define LLAVA_H
+
+#include "ggml.h"
+
+
+#ifdef LLAMA_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_BUILD
+#            define LLAVA_API __declspec(dllexport)
+#        else
+#            define LLAVA_API __declspec(dllimport)
+#        endif
+#    else
+#        define LLAVA_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define LLAVA_API
+#endif
+
+struct clip_ctx;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct llava_image_embed {
+    float * embed;
+    int n_image_pos;
+};
+
+/** sanity check for clip <-> llava embed size match */
+LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+
+/** build an image embed from image file bytes */
+LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
+/** build an image embed from a path to an image filename */
+LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
+/** free an embedding made with llava_image_embed_make_* */
+
+/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
+LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -3,6 +3,3 @@ add_executable(${TARGET} main.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -2,7 +2,6 @@

 #include "console.h"
 #include "llama.h"
-#include "build-info.h"

 #include <cassert>
 #include <cinttypes>
@@ -153,8 +152,8 @@ int main(int argc, char ** argv) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

-    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
-    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
+    LOG_TEE("%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
+    LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
--- a/examples/parallel/CMakeLists.txt
+++ b/examples/parallel/CMakeLists.txt
@@ -3,6 +3,3 @@ add_executable(${TARGET} parallel.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -1,8 +1,6 @@
 // A basic application simulating a server with multiple clients.
 // The clients submite requests to the server and they are processed in parallel.

-#include "build-info.h"
-
 #include "common.h"
 #include "llama.h"

--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -3,6 +3,3 @@ add_executable(${TARGET} perplexity.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "llama.h"

--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(TARGET quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,5 +1,4 @@
 #define LLAMA_API_INTERNAL
-#include "build-info.h"
 #include "common.h"
 #include "ggml.h"
 #include "llama.h"
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,9 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "llama.h"

--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -3,6 +3,3 @@ add_executable(${TARGET} save-load-state.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,4 +1,3 @@
-#include "build-info.h"
 #include "common.h"
 #include "llama.h"

--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -6,11 +6,8 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama llava ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -7,7 +7,7 @@ Command line options:
 -   `--threads N`, `-t N`: Set the number of threads to use during generation.
 -   `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
 -   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
-   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
+-   `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 -   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 -   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
@@ -122,6 +122,8 @@ node index.js

    `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).

+    `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05).
+
    `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).

    `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded.
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -219,6 +219,7 @@
      repeat_penalty: 1.18, // 1.0 = disabled
      top_k: 40, // <= 0 to use vocab size
      top_p: 0.5, // 1.0 = disabled
+      min_p: 0.05, // 0 = disabled
      tfs_z: 1.0, // 1.0 = disabled
      typical_p: 1.0, // 1.0 = disabled
      presence_penalty: 0.0, // 0.0 = disabled
@@ -744,6 +745,7 @@
            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
+            ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })}
          </fieldset>
          <details>
            <summary>More options</summary>
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,6 +1,5 @@
 #include "common.h"
 #include "llama.h"
-#include "build-info.h"
 #include "grammar-parser.h"

 #include "../llava/clip.h"
@@ -680,6 +679,7 @@ struct llama_server_context
        slot->params.n_predict        = json_value(data, "n_predict",         default_params.n_predict);
        slot->sparams.top_k           = json_value(data, "top_k",             default_sparams.top_k);
        slot->sparams.top_p           = json_value(data, "top_p",             default_sparams.top_p);
+        slot->sparams.min_p           = json_value(data, "min_p",             default_sparams.min_p);
        slot->sparams.tfs_z           = json_value(data, "tfs_z",             default_sparams.tfs_z);
        slot->sparams.typical_p       = json_value(data, "typical_p",         default_sparams.typical_p);
        slot->sparams.temp            = json_value(data, "temperature",       default_sparams.temp);
@@ -1114,6 +1114,7 @@ struct llama_server_context
            {"temp",              slot.sparams.temp},
            {"top_k",             slot.sparams.top_k},
            {"top_p",             slot.sparams.top_p},
+            {"min_p",             slot.sparams.min_p},
            {"tfs_z",             slot.sparams.tfs_z},
            {"typical_p",         slot.sparams.typical_p},
            {"repeat_last_n",     slot.sparams.penalty_last_n},
@@ -2264,8 +2265,8 @@ int main(int argc, char **argv)

    llama_backend_init(params.numa);

-    LOG_INFO("build info", {{"build", BUILD_NUMBER},
-                            {"commit", BUILD_COMMIT}});
+    LOG_INFO("build info", {{"build", LLAMA_BUILD_NUMBER},
+                            {"commit", LLAMA_COMMIT}});

    LOG_INFO("system info", {
                                {"n_threads", params.n_threads},
--- a/examples/speculative/CMakeLists.txt
+++ b/examples/speculative/CMakeLists.txt
@@ -3,6 +3,3 @@ add_executable(${TARGET} speculative.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if(TARGET BUILD_INFO)
-  add_dependencies(${TARGET} BUILD_INFO)
-endif()
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -1,5 +1,3 @@
-#include "build-info.h"
-
 #include "common.h"
 #include "llama.h"

@@ -39,9 +37,11 @@ int main(int argc, char ** argv) {
    // max number of parallel drafting sequences (i.e. tree branches)
    const int n_seq_dft = params.n_parallel;

-    // TODO: make this configurable
-    const float p_accept = 0.80f;
-    const float p_split  = 0.10f;
+    // probability threshold for accepting a token from the draft model
+    const float p_accept = params.p_accept;
+
+    // probability threshold for splitting a draft branch (only for n_seq_dft > 1)
+    const float p_split  = params.p_split;

 #ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("speculative", "log"));
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -378,9 +378,13 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
    }
 }

-static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view, bool update_backend) {
    assert(view->view_src != NULL && view->view_src->data != NULL);
-    view->backend = view->view_src->backend;
+
+    if (update_backend) {
+        view->backend = view->view_src->backend;
+    }
+
    view->buffer  = view->view_src->buffer;
    view->data    = (char *)view->view_src->data + view->view_offs;

@@ -394,7 +398,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
    struct hash_node * ht = alloc->hash_table;
    if (node->data == NULL) {
        if (ggml_is_view(node)) {
-            init_view(alloc, node);
+            init_view(alloc, node, true);
        } else {
            // see if we can reuse a parent's buffer (inplace)
            if (ggml_op_can_inplace(node->op)) {
@@ -424,15 +428,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                                node->view_src = view_src;
                                view_src_hn->n_views += 1;
-                                init_view(alloc, node);
+                                init_view(alloc, node, false);
                                return;
                            }
-                        }
-                        else {
+                        } else {
                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                            node->view_src = parent;
                            p_hn->n_views += 1;
-                            init_view(alloc, node);
+                            init_view(alloc, node, false);
                            return;
                        }
                    }
@@ -463,7 +466,7 @@ size_t ggml_allocr_alloc_graph_n(
                hash_get(ht, view_src)->n_views += 1;
                if (node->buffer == NULL && node->data != NULL) {
                    // view of a pre-allocated tensor, didn't call init_view() yet
-                    init_view(alloc, node);
+                    init_view(alloc, node, true);
                }
            }

@@ -474,7 +477,7 @@ size_t ggml_allocr_alloc_graph_n(
                }
                hash_get(ht, parent)->n_children += 1;
                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                    init_view(alloc, parent);
+                    init_view(alloc, parent, true);
                }
            }
        }
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -982,7 +982,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
@@ -1086,7 +1086,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,

 static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
@@ -1190,7 +1190,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,

 static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {

-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;
@@ -1444,7 +1444,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,

    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");

-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;
    if (row > nrows) return;

    const int num_blocks_per_row = ncols / QK_K;
@@ -4254,7 +4254,7 @@ template <bool need_check> static __global__ void

 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
@@ -4294,7 +4294,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
    // qk = quantized weights per x block
    // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+    const int row = blockIdx.x*blockDim.y + threadIdx.y;

    if (row >= nrows) {
        return;
@@ -4539,7 +4539,7 @@ static __global__ void rope(
    const int i2 = row/p_delta_rows;

    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(freq_base, -col/ncols);
+    const float theta_base = p*powf(freq_base, -float(col)/ncols);

    float cos_theta, sin_theta;
    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
@@ -4566,8 +4566,8 @@ static __global__ void rope_neox(
    const int i = row*ncols + col/2;
    const int i2 = row/p_delta_rows;

-    // simplified from `(row * ncols + col) * (-1 / ncols)`
-    const float cur_rot = -col/ncols - row;
+    // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
+    const float cur_rot = -float(col)/ncols;

    const int p = has_pos ? pos[i2] : 0;
    const float theta_base = p*powf(freq_base, cur_rot);
@@ -4867,7 +4867,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4876,7 +4877,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4885,7 +4886,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4894,7 +4895,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4903,7 +4904,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -4913,7 +4914,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4922,7 +4923,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4931,7 +4932,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4946,7 +4947,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
    GGML_ASSERT(ncols % QK_K == 0);
    const int ny = 2 / K_QUANTS_PER_ITERATION;
    const int block_num_y = (nrows + ny - 1) / ny;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(32, ny, 1);
    dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -4954,7 +4955,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
 static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK4_0 == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4963,7 +4964,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK4_1 == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4972,7 +4973,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK5_0 == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4981,7 +4982,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK5_1 == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4990,7 +4991,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK8_0 == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -4999,7 +5000,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5008,7 +5009,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5017,7 +5018,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5026,7 +5027,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5035,7 +5036,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
 static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@@ -5054,7 +5055,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_nums(block_num_y, 1, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
    dequantize_mul_mat_vec<1, 1, convert_f16>
        <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@@ -5789,6 +5790,11 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
    CUDA_CHECK(cudaFree(ptr));
 }

+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}

 void ggml_init_cublas() {
    static bool initialized = false;
@@ -5802,7 +5808,12 @@ void ggml_init_cublas() {
        CUDA_CHECK(cudaDeviceSynchronize());
 #endif

-        CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
+        if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
        int64_t total_vram = 0;
 #if defined(GGML_CUDA_FORCE_MMQ)
@@ -5850,6 +5861,7 @@ void ggml_init_cublas() {
        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));

        initialized = true;
+        g_cublas_loaded = true;
    }
 }

@@ -6892,6 +6904,8 @@ static void ggml_cuda_op_mul_mat(
    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
    int64_t row_high[GGML_CUDA_MAX_DEVICES];

+    int used_devices = 0;
+
    for (int64_t id = 0; id < g_device_count; ++id) {
        // by default, use all rows
        row_low[id]  = 0;
@@ -6919,6 +6933,8 @@ static void ggml_cuda_op_mul_mat(
            continue;
        }

+        used_devices++;
+
        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;

@@ -6957,12 +6973,12 @@ static void ggml_cuda_op_mul_mat(

    // if multiple devices are used they need to wait for the main device
    // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && g_device_count > 1) {
+    if (split && used_devices > 1) {
        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
    }

-    const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -7078,6 +7094,9 @@ static void ggml_cuda_op_mul_mat(
    }

    for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
        CUDA_CHECK(ggml_cuda_set_device(id));

        // free buffers again when done
@@ -7102,6 +7121,9 @@ static void ggml_cuda_op_mul_mat(

        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
        for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
            for (int64_t is = 0; is < is_max; ++is) {
                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
            }
@@ -7147,6 +7169,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }

 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];
@@ -7225,7 +7249,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor

 __global__ void k_compute_batched_ptrs(
        const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
-        void ** ptrs,
+        const void ** ptrs_src, void ** ptrs_dst,
        int ne12, int ne13,
        int ne23,
        int nb02, int nb03,
@@ -7242,9 +7266,9 @@ __global__ void k_compute_batched_ptrs(
    int i03 = i13 / r3;
    int i02 = i12 / r2;

-    ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02   + i03*nb03;
-    ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs[2*ne23 + i12 + i13*ne12] = (char *)     dst_f16 + i12* nb2/2 + i13* nb3/2;
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)     dst_f16 + i12* nb2/2 + i13* nb3/2;
 }

 static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -7350,14 +7374,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        // use cublasGemmBatchedEx
        const int ne23 = ne12*ne13;

-        void ** ptrs_as = nullptr;
-        size_t ptrs_s = 0;
-        ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
+        const void ** ptrs_src = nullptr;
+              void ** ptrs_dst = nullptr;
+
+        size_t ptrs_src_s = 0;
+        size_t ptrs_dst_s = 0;
+
+        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);

        dim3 block_dims(ne13, ne12);
        k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
                src0_as_f16, src1_as_f16, dst_f16,
-                ptrs_as,
+                ptrs_src, ptrs_dst,
                ne12, ne13,
                ne23,
                nb02, nb03,
@@ -7369,14 +7398,19 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
        CUBLAS_CHECK(
        cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
-                &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
-                            (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
-                &beta_f16,  (      void **       ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
+                &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
+                            (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
+                &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
                ne23,
                CUBLAS_COMPUTE_16F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));

-        ggml_cuda_pool_free(ptrs_as, ptrs_s);
+        if (ptrs_src_s != 0) {
+            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+        }
+        if (ptrs_dst_s != 0) {
+            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+        }
    }
 #endif

@@ -7389,10 +7423,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const

 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU) &&
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
        (src1->backend == GGML_BACKEND_GPU) &&
        ( dst->backend == GGML_BACKEND_GPU);

+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
    int64_t min_compute_capability = INT_MAX;
    for (int64_t id = 0; id < g_device_count; ++id) {
        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@@ -7414,13 +7450,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // KQ single-batch
        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
        // KQV single-batch
        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
        // KQ + KQV multi-batch
        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
    } else if (src0->type == GGML_TYPE_F32) {
@@ -7820,6 +7856,8 @@ void ggml_cuda_free_scratch() {
 }

 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
    ggml_cuda_func_t func;
    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -17,7 +17,12 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES       16

+// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
 GGML_API void   ggml_init_cublas(void);
+
+// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool   ggml_cublas_loaded(void);
+
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);

--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -1017,7 +1017,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
+                            [encoder setThreadgroupMemoryLength:MAX(16, nth/32*sizeof(float)) atIndex:0];

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
@@ -1348,7 +1348,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
                            [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                            [encoder setThreadgroupMemoryLength:MAX(16, nth*sizeof(float)) atIndex:0];

                            const int64_t nrows = ggml_nrows(src0);

@@ -1403,7 +1403,8 @@ void ggml_metal_graph_compute(
                            const int n_past     = ((int32_t *) dst->op_params)[0];
                            const int n_dims     = ((int32_t *) dst->op_params)[1];
                            const int mode       = ((int32_t *) dst->op_params)[2];
-                            const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
+                            // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                            const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

                            float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                            memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
@@ -1419,34 +1420,35 @@ void ggml_metal_graph_compute(
                                default: GGML_ASSERT(false);
                            };

-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1        atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:2];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:6];
-                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:10];
-                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:14];
-                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:18];
-                            [encoder setBytes:&n_past  length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&n_dims  length:sizeof(     int) atIndex:20];
-                            [encoder setBytes:&mode    length:sizeof(     int) atIndex:21];
-                            [encoder setBytes:&freq_base  length:sizeof(float) atIndex:22];
-                            [encoder setBytes:&freq_scale length:sizeof(float) atIndex:23];
-                            [encoder setBytes:&ext_factor  length:sizeof(float) atIndex:24];
-                            [encoder setBytes:&attn_factor length:sizeof(float) atIndex:25];
-                            [encoder setBytes:&beta_fast   length:sizeof(float) atIndex:26];
-                            [encoder setBytes:&beta_slow   length:sizeof(float) atIndex:27];
+                            [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
+                            [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
+                            [encoder setBuffer:id_dst      offset:offs_dst         atIndex:2];
+                            [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:6];
+                            [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:10];
+                            [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:14];
+                            [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:18];
+                            [encoder setBytes:&n_past      length:sizeof(     int) atIndex:19];
+                            [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:20];
+                            [encoder setBytes:&mode        length:sizeof(     int) atIndex:21];
+                            [encoder setBytes:&n_orig_ctx  length:sizeof(     int) atIndex:22];
+                            [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
+                            [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
+                            [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
+                            [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
+                            [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
+                            [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1070,20 +1070,20 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {
 // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
 static void rope_yarn(
    float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
+    thread float * cos_theta, thread float * sin_theta
 ) {
    // Get n-d rotational scaling corrected for extrapolation
    float theta_interp = freq_scale * theta_extrap;
    float theta = theta_interp;
    if (ext_factor != 0.0f) {
-        ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
+        float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor;
        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;

        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+        mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
    }
-    *cos_theta = cosf(theta) * mscale;
-    *sin_theta = sinf(theta) * mscale;
+    *cos_theta = cos(theta) * mscale;
+    *sin_theta = sin(theta) * mscale;
 }

 // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
@@ -1123,8 +1123,13 @@ typedef void (rope_t)(
        constant         int & n_past,
        constant         int & n_dims,
        constant         int & mode,
+        constant         int & n_orig_ctx,
        constant       float & freq_base,
        constant       float & freq_scale,
+        constant       float & ext_factor,
+        constant       float & attn_factor,
+        constant       float & beta_fast,
+        constant       float & beta_slow,
        uint  tiitg[[thread_index_in_threadgroup]],
        uint3 tptg[[threads_per_threadgroup]],
        uint3 tgpig[[threadgroup_position_in_grid]]);
@@ -1153,6 +1158,7 @@ kernel void kernel_rope(
        constant         int & n_past,
        constant         int & n_dims,
        constant         int & mode,
+        constant         int & n_orig_ctx,
        constant       float & freq_base,
        constant       float & freq_scale,
        constant       float & ext_factor,
--- a/ggml.c
+++ b/ggml.c
@@ -4970,8 +4970,13 @@ struct ggml_tensor * ggml_rope_back(
        int                   n_dims,
        int                   mode,
        int                   n_ctx,
+        int                   n_orig_ctx,
        float                 freq_base,
        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
        float                 xpos_base,
        bool                  xpos_down) {
    GGML_ASSERT(ggml_is_vector(b));
@@ -4988,11 +4993,15 @@ struct ggml_tensor * ggml_rope_back(

    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

-    int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base,  sizeof(float));
-    memcpy(params + 5, &freq_scale, sizeof(float));
-    memcpy(params + 6, &xpos_base,  sizeof(float));
-    memcpy(params + 7, &xpos_down,  sizeof(bool));
+    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &xpos_base,    sizeof(float));
+    memcpy(params + 12, &xpos_down,    sizeof(bool));
    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_ROPE_BACK;
@@ -10974,7 +10983,8 @@ static void ggml_compute_forward_rope_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
@@ -11033,6 +11043,11 @@ static void ggml_compute_forward_rope_f32(
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
    const int32_t * pos = (const int32_t *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11049,9 +11064,9 @@ static void ggml_compute_forward_rope_f32(
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;

                        theta_base *= theta_scale;
                        block_theta *= theta_scale;
@@ -11075,6 +11090,7 @@ static void ggml_compute_forward_rope_f32(
                        rope_yarn(
                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                        );
+                        sin_theta *= sin_sign;

                        // zeta scaling for xPos only:
                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -11105,6 +11121,7 @@ static void ggml_compute_forward_rope_f32(
                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                &cos_theta, &sin_theta
                            );
+                            sin_theta *= sin_sign;

                            theta_base *= theta_scale;

@@ -11130,7 +11147,8 @@ static void ggml_compute_forward_rope_f16(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
@@ -11182,6 +11200,11 @@ static void ggml_compute_forward_rope_f16(
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
    const int32_t * pos = (const int32_t *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11198,9 +11221,9 @@ static void ggml_compute_forward_rope_f16(
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;

                        theta_base *= theta_scale;
                        block_theta *= theta_scale;
@@ -11224,6 +11247,7 @@ static void ggml_compute_forward_rope_f16(
                        rope_yarn(
                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                        );
+                        sin_theta *= sin_sign;

                        theta_base *= theta_scale;

@@ -11250,6 +11274,7 @@ static void ggml_compute_forward_rope_f16(
                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                &cos_theta, &sin_theta
                            );
+                            sin_theta *= sin_sign;

                            theta_base *= theta_scale;

@@ -11279,11 +11304,11 @@ static void ggml_compute_forward_rope(
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
            } break;
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
            } break;
        default:
            {
@@ -11294,216 +11319,6 @@ static void ggml_compute_forward_rope(

 // ggml_compute_forward_rope_back

-static void ggml_compute_forward_rope_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    float freq_base;
-    float freq_scale;
-
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool xpos_down;
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
-    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
-    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
-    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = freq_scale * (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
-
-                        theta_base *= theta_scale;
-
-                        const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = dy[0];
-                        const float dy1 = dy[1];
-
-                        dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
-                        dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = dy[0];
-                            const float dy1 = dy[n_dims/2];
-
-                            dx[0]        =   dy0*cos_theta + dy1*sin_theta;
-                            dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_back_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        theta_base *= theta_scale;
-
-                        const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                        const float dy1 = GGML_FP16_TO_FP32(dy[1]);
-
-                        dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                        dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                            const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
-
-                            dx[0]        = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                            dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 static void ggml_compute_forward_rope_back(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@@ -11512,11 +11327,11 @@ static void ggml_compute_forward_rope_back(
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-                ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
            } break;
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
            } break;
        default:
            {
@@ -15559,17 +15374,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                // necessary for llama
                if (src0->grad) {
                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));

                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@@ -15579,8 +15397,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                n_dims,
                                mode,
                                n_ctx,
+                                n_orig_ctx,
                                freq_base,
                                freq_scale,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                xpos_base,
                                xpos_down),
                            zero_table);
@@ -15590,17 +15413,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            {
                if (src0->grad) {
                    //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));

                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@@ -15609,14 +15435,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                src1,
                                n_dims,
                                mode,
-                                0,
                                n_ctx,
+                                n_orig_ctx,
                                freq_base,
                                freq_scale,
-                                0.0f,
-                                1.0f,
-                                0.0f,
-                                0.0f,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                xpos_base,
                                xpos_down,
                                false),
@@ -18811,8 +18637,7 @@ static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset)
    return n == size;
 }

-// NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
-static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset) {
+static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
    p->n    = 0;
    p->data = NULL;

@@ -18824,19 +18649,6 @@ static bool gguf_fread_str_cur(FILE * file, struct gguf_str * p, size_t * offset
    return ok;
 }

-static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset) {
-    p->n    = 0;
-    p->data = NULL;
-
-    bool ok = true;
-
-    uint32_t n = 0;
-    ok = ok && gguf_fread_el(file, &n,       sizeof(n), offset); p->data = calloc(n + 1, 1); p->n = n;
-    ok = ok && gguf_fread_el(file,  p->data, p->n,      offset);
-
-    return ok;
-}
-
 struct gguf_context * gguf_init_empty(void) {
    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

@@ -18895,20 +18707,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        ctx->data  = NULL;

        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);

        if (ctx->header.version == 1) {
-            // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
-            uint32_t n_tensors = 0;
-            uint32_t n_kv      = 0;
-
-            ok = ok && gguf_fread_el(file, &n_tensors, sizeof(n_tensors), &offset);
-            ok = ok && gguf_fread_el(file, &n_kv,      sizeof(n_kv),      &offset);
-
-            ctx->header.n_tensors = n_tensors;
-            ctx->header.n_kv      = n_kv;
-        } else {
-            ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
-            ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
+            fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
        }

        if (!ok) {
@@ -18919,12 +18725,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        }
    }

-    // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
-    bool (* gguf_fread_str)(FILE *, struct gguf_str *, size_t *) = gguf_fread_str_cur;
-    if (ctx->header.version == 1) {
-        gguf_fread_str = gguf_fread_str_v1;
-    }
-
    // read the kv pairs
    {
        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
@@ -18955,15 +18755,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                case GGUF_TYPE_ARRAY:
                    {
                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
-
-                        if (ctx->header.version == 1) {
-                            // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
-                            uint32_t n = 0;
-                            ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
-                            kv->value.arr.n = n;
-                        } else {
-                            ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
-                        }
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n), &offset);

                        switch (kv->value.arr.type) {
                            case GGUF_TYPE_UINT8:
@@ -19022,14 +18814,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
            ok = ok && gguf_fread_str(file, &info->name,                          &offset);
            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
            for (uint32_t j = 0; j < info->n_dims; ++j) {
-                if (ctx->header.version == 1) {
-                    // NOTE: temporary handling of GGUFv1 >> remove after Oct 2023
-                    uint32_t t = 0;
-                    ok = ok && gguf_fread_el(file, &t, sizeof(t), &offset);
-                    info->ne[j] = t;
-                } else {
-                    ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
-                }
+                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
            }
            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
@@ -19270,24 +19055,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
 }

 const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    return ctx->kv[key_id].key.data;
 }

 enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    return ctx->kv[key_id].type;
 }

 enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
    return ctx->kv[key_id].value.arr.type;
 }

 const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
    return ctx->kv[key_id].value.arr.data;
 }

 const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
    struct gguf_kv * kv = &ctx->kv[key_id];
    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
@@ -19295,70 +19085,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
 }

 int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
    return ctx->kv[key_id].value.arr.n;
 }

 uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
    return ctx->kv[key_id].value.uint8;
 }

 int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
    return ctx->kv[key_id].value.int8;
 }

 uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
    return ctx->kv[key_id].value.uint16;
 }

 int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
    return ctx->kv[key_id].value.int16;
 }

 uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
    return ctx->kv[key_id].value.uint32;
 }

 int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
    return ctx->kv[key_id].value.int32;
 }

 float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
    return ctx->kv[key_id].value.float32;
 }

 uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
    return ctx->kv[key_id].value.uint64;
 }

 int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
    return ctx->kv[key_id].value.int64;
 }

 double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
    return ctx->kv[key_id].value.float64;
 }

 bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
    return ctx->kv[key_id].value.bool_;
 }

 const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
    return ctx->kv[key_id].value.str.data;
 }

+const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
+    return &ctx->kv[key_id].value;
+}
+
 int gguf_get_n_tensors(const struct gguf_context * ctx) {
    return ctx->header.n_tensors;
 }
--- a/ggml.h
+++ b/ggml.h
@@ -1372,8 +1372,13 @@ extern "C" {
            int                   n_dims,
            int                   mode,
            int                   n_ctx,
+            int                   n_orig_ctx,
            float                 freq_base,
            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow,
            float                 xpos_base,
            bool                  xpos_down);

@@ -2022,6 +2027,7 @@ extern "C" {
    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -393,6 +393,7 @@ class TensorNameMap:
            "layers.{bid}.attention_norm",                         # llama-pth
            "encoder.layer.{bid}.attention.output.LayerNorm",      # bert
            "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
+            "model.layers.{bid}.ln1",                              # yi
        ),

        # Attention norm 2
@@ -464,6 +465,7 @@ class TensorNameMap:
            "layers.{bid}.ffn_norm",                                        # llama-pth
            "encoder.layer.{bid}.output.LayerNorm",                         # bert
            "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
+            "model.layers.{bid}.ln2",                                       # yi
        ),

        # Feed-forward up
@@ -644,18 +646,17 @@ class GGUFValueType(IntEnum):
            sys.exit()


+class WriterState(Enum):
+    EMPTY   = auto()
+    HEADER  = auto()
+    KV_DATA = auto()
+    TI_DATA = auto()
+
+
 class GGUFWriter:
    fout: BufferedWriter
-    arch: str
-    offset_tensor = 0
-    data_alignment = GGUF_DEFAULT_ALIGNMENT
-    kv_data = b""
-    kv_data_count = 0
-    ti_data = b""
-    ti_data_count = 0
-    use_temp_file: bool
-    temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
-    tensors: list[tuple[np.ndarray[Any, Any], int]]
+    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
+    tensors: list[np.ndarray[Any, Any]]

    @property
    def pack_prefix(self):
@@ -681,27 +682,47 @@ class GGUFWriter:
            GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
            GGUFValueType.BOOL:    "?" ,
        }
-        self.add_architecture()
+        self.offset_tensor = 0
+        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
+        self.kv_data = b""
+        self.kv_data_count = 0
+        self.ti_data = b""
+        self.ti_data_count = 0
        self.use_temp_file = use_temp_file
+        self.temp_file = None
        self.tensors = []
        endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
        print(f"This gguf file is for {endianess_str} only")
+        self.state = WriterState.EMPTY
+
+        self.add_architecture()

    def write_header_to_file(self):
+        if self.state is not WriterState.EMPTY:
+            raise ValueError(f'Expected output file to be empty, got {self.state}')
+
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
        self.flush()
-#        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
+        self.state = WriterState.HEADER

    def write_kv_data_to_file(self):
+        if self.state is not WriterState.HEADER:
+            raise ValueError(f'Expected output file to contain the header, got {self.state}')
+
        self.fout.write(self.kv_data)
        self.flush()
+        self.state = WriterState.KV_DATA

    def write_ti_data_to_file(self):
+        if self.state is not WriterState.KV_DATA:
+            raise ValueError(f'Expected output file to contain KV data, got {self.state}')
+
        self.fout.write(self.ti_data)
        self.flush()
+        self.state = WriterState.TI_DATA

    def add_key(self, key: str):
        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
@@ -794,6 +815,9 @@ class GGUFWriter:
        return ((x + n - 1) // n) * n

    def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None):
+        if self.state is not WriterState.EMPTY:
+            raise ValueError(f'Expected output file to be empty, got {self.state}')
+
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
@@ -823,23 +847,22 @@ class GGUFWriter:
        shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)

-        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
-
-        if  self.temp_file is None:
-            self.tensors.append((tensor, pad))
+        if self.temp_file is None:
+            self.tensors.append(tensor)
            return

        tensor.tofile(self.temp_file)
+        self.write_padding(self.temp_file, tensor.nbytes)

-        if pad != 0:
-            self.temp_file.write(bytes([0] * pad))
-
-    def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
+    def write_padding(self, fp: IO[bytes], n: int, align: int | None = None):
        pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
        if pad != 0:
            fp.write(bytes([0] * pad))

    def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
+        if self.state is not WriterState.TI_DATA:
+            raise ValueError(f'Expected output file to contain tensor info, got {self.state}')
+
        if self.endianess==GGUFEndian.BIG:
            tensor.byteswap(inplace=True)
        self.write_padding(self.fout, self.fout.tell())
@@ -852,10 +875,13 @@ class GGUFWriter:
        self.write_padding(self.fout, self.fout.tell())

        if self.temp_file is None:
-            for (currtensor, currpad) in self.tensors:
-                currtensor.tofile(self.fout)
-                if currpad != 0:
-                    self.fout.write(bytes([0] * currpad))
+            while True:
+                try:
+                    tensor = self.tensors.pop(0)
+                except IndexError:
+                    break
+                tensor.tofile(self.fout)
+                self.write_padding(self.fout, tensor.nbytes)
            return

        self.temp_file.seek(0)
@@ -1000,11 +1026,8 @@ class GGUFWriter:


 class SpecialVocab:
-    load_merges: bool = False
-    merges: list[str] = []
-    special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
-    special_token_ids: dict[str, int] = {}
-    n_vocab: int | None = None
+    merges: list[str]
+    special_token_ids: dict[str, int]

    def __init__(
        self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -1014,8 +1037,11 @@ class SpecialVocab:
        self.special_token_ids = {}
        self.n_vocab = n_vocab
        self.load_merges = load_merges
+        self.merges = []
        if special_token_types is not None:
            self.special_token_types = special_token_types
+        else:
+            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
        self._load(Path(path))

    def _load(self, path: Path) -> None:
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.5"
+version = "0.4.6"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
--- a/llama.cpp
+++ b/llama.cpp
@@ -577,6 +577,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
    return LLAMA_ROPE_SCALING_UNSPECIFIED;
 }

+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
 //
 // ggml helpers
 //
@@ -596,19 +650,37 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // llama helpers
 //

+inline void * llama_host_malloc(size_t n) {
 #ifdef GGML_USE_CUBLAS
-#   define llama_host_malloc(n)  ggml_cuda_host_malloc(n)
-#   define llama_host_free(data) ggml_cuda_host_free(data)
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_malloc(n);
+    } else {
+        return malloc(n);
+    }
 #elif GGML_USE_METAL
-#   define llama_host_malloc(n)  ggml_metal_host_malloc(n)
-#   define llama_host_free(data) ggml_metal_host_free(data)
+    return ggml_metal_host_malloc(n);
 #elif GGML_USE_CPU_HBM
-#   define llama_host_malloc(n)  hbw_malloc(n)
-#   define llama_host_free(data) if (data != NULL) hbw_free(data)
+    return hbw_malloc(n);
 #else
-#   define llama_host_malloc(n)  malloc(n)
-#   define llama_host_free(data) free(data)
+    return malloc(n);
 #endif
+}
+
+inline void llama_host_free(void * ptr) {
+#ifdef GGML_USE_CUBLAS
+    if (ggml_cublas_loaded()) {
+        return ggml_cuda_host_free(ptr);
+    } else {
+        return free(ptr);
+    }
+#elif GGML_USE_METAL
+    return ggml_metal_host_free(ptr);
+#elif GGML_USE_CPU_HBM
+    return hbw_free(ptr);
+#else
+    return free(ptr);
+#endif
+}

 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
@@ -1200,9 +1272,11 @@ struct llama_kv_cache {
        }

 #ifdef GGML_USE_CUBLAS
-        ggml_cuda_free_data(k);
-        ggml_cuda_free_data(v);
-#endif // GGML_USE_CUBLAS
+        if (ggml_cublas_loaded()) {
+            ggml_cuda_free_data(k);
+            ggml_cuda_free_data(v);
+        }
+#endif
    }
 };

@@ -1277,6 +1351,9 @@ struct llama_model {

    int n_gpu_layers;

+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
    // context
    struct ggml_context * ctx = NULL;

@@ -1302,11 +1379,15 @@ struct llama_model {
        }

 #ifdef GGML_USE_CUBLAS
-        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-            ggml_cuda_free_data(tensors_by_name[i].second);
+        if (ggml_cublas_loaded()) {
+            for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+                ggml_cuda_free_data(tensors_by_name[i].second);
+            }
+            ggml_cuda_free_scratch();
        }
-        ggml_cuda_free_scratch();
-#elif defined(GGML_USE_CLBLAST)
+#endif
+
+#if defined(GGML_USE_CLBLAST)
        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
            ggml_cl_free_data(tensors_by_name[i].second);
        }
@@ -1418,23 +1499,26 @@ static bool llama_kv_cache_init(
    ggml_set_name(cache.v, "cache_v");

    (void) n_gpu_layers;
-#ifdef GGML_USE_CUBLAS
-    size_t vram_kv_cache = 0;

-    if (n_gpu_layers > (int)n_layer + 1) {
-        ggml_cuda_assign_buffers_no_scratch(cache.v);
-        LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.v);
+#ifdef GGML_USE_CUBLAS
+    if (ggml_cublas_loaded()) {
+        size_t vram_kv_cache = 0;
+
+        if (n_gpu_layers > (int)n_layer + 1) {
+            ggml_cuda_assign_buffers_no_scratch(cache.v);
+            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.v);
+        }
+        if (n_gpu_layers > (int)n_layer + 2) {
+            ggml_cuda_assign_buffers_no_scratch(cache.k);
+            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
+            vram_kv_cache += ggml_nbytes(cache.k);
+        }
+        if (vram_kv_cache > 0) {
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+        }
    }
-    if (n_gpu_layers > (int)n_layer + 2) {
-        ggml_cuda_assign_buffers_no_scratch(cache.k);
-        LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-        vram_kv_cache += ggml_nbytes(cache.k);
-    }
-    if (vram_kv_cache > 0) {
-        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
-    }
-#endif // GGML_USE_CUBLAS
+#endif

    return true;
 }
@@ -1728,10 +1812,10 @@ struct llama_model_loader {
                case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
                case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K;   break;
                default:
-                     {
-                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
-                         ftype = LLAMA_FTYPE_ALL_F32;
-                     } break;
+                    {
+                        LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
+                        ftype = LLAMA_FTYPE_ALL_F32;
+                    } break;
            }

            // this is a way to mark that we have "guessed" the file type
@@ -1745,10 +1829,20 @@ struct llama_model_loader {
            }

            for (int i = 0; i < n_kv; i++) {
-                const char * name         = gguf_get_key(ctx_gguf, i);
-                const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+                const char * name           = gguf_get_key(ctx_gguf, i);
+                const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
+                const std::string type_name =
+                    type == GGUF_TYPE_ARRAY
+                    ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                    : gguf_type_name(type);

-                LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+                std::string value          = gguf_kv_to_str(ctx_gguf, i);
+                const size_t MAX_VALUE_LEN = 40;
+                if (value.size() > MAX_VALUE_LEN) {
+                    value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+                }
+
+                LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
            }

            // print type counts
@@ -1837,6 +1931,12 @@ struct llama_model_loader {
            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
        }

+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            if (ne.size() == 1) {
+                throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
+            }
+        }
+
        {
            bool is_ok = true;
            for (size_t i = 0; i < ne.size(); ++i) {
@@ -2037,6 +2137,17 @@ static void llm_load_hparams(

    auto & hparams = model.hparams;

+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        model.gguf_kv.emplace(name, value);
+    }
+
    // get general kv
    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));

@@ -2515,18 +2626,22 @@ static void llm_load_tensors(
    }

    (void) main_gpu;
+
+    enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
+    enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
+
 #ifdef GGML_USE_CUBLAS
-    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
-    ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
+    if (ggml_cublas_loaded()) {
+        LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
+        ggml_cuda_set_main_device(main_gpu);
+
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
+    }
 #elif defined(GGML_USE_CLBLAST)
-    LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
-#else
-#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_CPU
-#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
+        LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
+        llama_backend_offload = GGML_BACKEND_GPU;
+        llama_backend_offload_split = GGML_BACKEND_GPU;
 #endif

    // prepare memory for the weights
@@ -2553,12 +2668,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@@ -2582,8 +2697,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);

                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

                        auto & layer = model.layers[i];

@@ -2619,12 +2734,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@@ -2648,8 +2763,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);

                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

                        auto & layer = model.layers[i];

@@ -2689,12 +2804,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@@ -2720,8 +2835,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);

                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

                        auto & layer = model.layers[i];

@@ -2766,12 +2881,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@@ -2797,8 +2912,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);

                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

                        auto & layer = model.layers[i];

@@ -2817,8 +2932,8 @@ static void llm_load_tensors(
                        layer.ffn_down   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
                        layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend);

-                        layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
-                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend);
+                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, backend_split);
+                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),           {n_ff}, backend);

                        if (backend == GGML_BACKEND_GPU) {
                            vram_weights +=
@@ -2843,12 +2958,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@@ -2871,19 +2986,19 @@ static void llm_load_tensors(
                    const int i_gpu_start = n_layer - n_gpu_layers;
                    model.layers.resize(n_layer);
                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
                        auto & layer = model.layers[i];
                        layer.attn_norm     = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "weight", i), {n_embd}, backend);
                        layer.attn_norm_b   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias",   i), {n_embd}, backend);
                        layer.wqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                        layer.bqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "bias",   i), {n_embd + 2*n_embd_gqa},         backend_split);
+                        layer.bqkv          = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV,    "bias",   i), {n_embd + 2*n_embd_gqa},         backend);
                        layer.wo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "weight", i), {n_embd, n_embd},   backend_split);
-                        layer.bo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "bias",   i), {n_embd},           backend_split);
+                        layer.bo            = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT,    "bias",   i), {n_embd},           backend);
                        layer.ffn_down      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "weight", i), {n_ff, n_embd}, backend_split);
-                        layer.ffn_down_b    = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "bias",   i), {n_embd},       backend_split);
+                        layer.ffn_down_b    = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN,    "bias",   i), {n_embd},       backend);
                        layer.ffn_up        = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "weight", i), {n_embd,   n_ff}, backend_split);
-                        layer.ffn_up_b      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "bias",   i), {n_ff},           backend_split);
+                        layer.ffn_up_b      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,      "bias",   i), {n_ff},           backend);
                        layer.ffn_norm      = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM,    "weight", i), {n_embd}, backend);
                        layer.ffn_norm_b    = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM,    "bias",   i), {n_embd}, backend);
                        layer.attn_q_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
@@ -2909,12 +3024,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@@ -2940,8 +3055,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);

                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

                        auto & layer = model.layers[i];

@@ -2949,19 +3064,19 @@ static void llm_load_tensors(
                        layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM,   "bias", i),   {n_embd}, backend);

                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend_split);
+                        layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend);

                        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},                backend_split);
-                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},                        backend_split);
+                        layer.bo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},                        backend);

                        layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
                        layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);

                        layer.ffn_down   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
-                        layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend_split);
+                        layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend);

-                        layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
-                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend_split);
+                        layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+                        layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "bias", i),   {n_ff},           backend);

                        if (backend == GGML_BACKEND_GPU) {
                            vram_weights +=
@@ -2987,12 +3102,12 @@ static void llm_load_tensors(
                            // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                            // on Windows however this is detrimental unless everything is on the GPU
 #ifndef _WIN32
-                            backend_norm = LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = llama_backend_offload;
 #else
-                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+                            backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
 #endif // _WIN32

-                            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                            backend_output = llama_backend_offload_split;
                        } else {
                            backend_norm   = GGML_BACKEND_CPU;
                            backend_output = GGML_BACKEND_CPU;
@@ -3016,8 +3131,8 @@ static void llm_load_tensors(
                    model.layers.resize(n_layer);

                    for (uint32_t i = 0; i < n_layer; ++i) {
-                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
-                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+                        const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                        const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT

                        auto & layer = model.layers[i];

@@ -5158,11 +5273,12 @@ static int llama_decode_internal(

    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA    ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON   ||
-        model.arch == LLM_ARCH_REFACT   ||
-        model.arch == LLM_ARCH_MPT;
+        model.arch == LLM_ARCH_LLAMA      ||
+        model.arch == LLM_ARCH_BAICHUAN   ||
+        model.arch == LLM_ARCH_FALCON     ||
+        model.arch == LLM_ARCH_REFACT     ||
+        model.arch == LLM_ARCH_MPT        ||
+        model.arch == LLM_ARCH_STARCODER;

    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
@@ -7976,7 +8092,7 @@ struct llama_context_params llama_context_default_params() {
        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
        /*.rope_freq_base              =*/ 0.0f,
        /*.rope_freq_scale             =*/ 0.0f,
-        /*.yarn_ext_factor             =*/ NAN,
+        /*.yarn_ext_factor             =*/ -1.0f,
        /*.yarn_attn_factor            =*/ 1.0f,
        /*.yarn_beta_fast              =*/ 32.0f,
        /*.yarn_beta_slow              =*/ 1.0f,
@@ -8119,7 +8235,7 @@ struct llama_context * llama_new_context_with_model(
        cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
    }

-    if (std::isnan(cparams.yarn_ext_factor)) { // NaN indicates 'not set'
+    if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
    }

@@ -8315,6 +8431,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
    return model->hparams.rope_freq_scale_train;
 }

+int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s %s %s",
            llama_model_arch_name(model->arch).c_str(),
--- a/llama.h
+++ b/llama.h
@@ -175,11 +175,11 @@ extern "C" {
    };

    struct llama_context_params {
-        uint32_t seed;            // RNG seed, -1 for random
-        uint32_t n_ctx;           // text context, 0 = from model
-        uint32_t n_batch;         // prompt processing maximum batch size
-        uint32_t n_threads;       // number of threads to use for generation
-        uint32_t n_threads_batch; // number of threads to use for batch processing
+        uint32_t seed;              // RNG seed, -1 for random
+        uint32_t n_ctx;             // text context, 0 = from model
+        uint32_t n_batch;           // prompt processing maximum batch size
+        uint32_t n_threads;         // number of threads to use for generation
+        uint32_t n_threads_batch;   // number of threads to use for batch processing
        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
@@ -301,6 +301,23 @@ extern "C" {
    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+    // Functions to access the model's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
    // Get a string describing the model type
    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

--- a/models/ggml-vocab-llama.gguf
+++ b/models/ggml-vocab-llama.gguf
--- a/mypy.ini
+++ b/mypy.ini
@@ -3,3 +3,4 @@ strict = true
 allow_untyped_calls = true
 allow_untyped_defs = true
 allow_incomplete_defs = true
+disable_error_code = import-untyped
--- a/scripts/build-info.cmake
+++ b/scripts/build-info.cmake
@@ -1,5 +1,5 @@
-set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
-set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
+set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
+set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
 set(BUILD_NUMBER 0)
 set(BUILD_COMMIT "unknown")
 set(BUILD_COMPILER "unknown")
@@ -24,15 +24,21 @@ if(Git_FOUND)
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE HEAD
        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
    )
+    if (RES EQUAL 0)
+        set(BUILD_COMMIT ${HEAD})
+    endif()
    execute_process(
        COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE COUNT
        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE RES
    )
-    set(BUILD_COMMIT ${HEAD})
-    set(BUILD_NUMBER ${COUNT})
+    if (RES EQUAL 0)
+        set(BUILD_NUMBER ${COUNT})
+    endif()
 endif()

 if(MSVC)
@@ -53,22 +59,22 @@ else()
    set(BUILD_TARGET ${OUT})
 endif()

-# Only write the header if it's changed to prevent unnecessary recompilation
-if(EXISTS ${HEADER_FILE})
-    file(READ ${HEADER_FILE} CONTENTS)
-    string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
+# Only write the build info if it changed
+if(EXISTS ${OUTPUT_FILE})
+    file(READ ${OUTPUT_FILE} CONTENTS)
+    string(REGEX MATCH "LLAMA_COMMIT = \"([^\"]*)\";" _ ${CONTENTS})
    set(OLD_COMMIT ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_COMPILER = \"([^\"]*)\";" _ ${CONTENTS})
    set(OLD_COMPILER ${CMAKE_MATCH_1})
-    string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
+    string(REGEX MATCH "LLAMA_BUILD_TARGET = \"([^\"]*)\";" _ ${CONTENTS})
    set(OLD_TARGET ${CMAKE_MATCH_1})
    if (
        NOT OLD_COMMIT   STREQUAL BUILD_COMMIT   OR
        NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
        NOT OLD_TARGET   STREQUAL BUILD_TARGET
    )
-        configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
+        configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
    endif()
 else()
-    configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
+    configure_file(${TEMPLATE_FILE} ${OUTPUT_FILE})
 endif()
--- a/scripts/build-info.h.in
+++ b/scripts/build-info.h.in
@@ -1,9 +0,0 @@
-#ifndef BUILD_INFO_H
-#define BUILD_INFO_H
-
-#define BUILD_NUMBER @BUILD_NUMBER@
-#define BUILD_COMMIT "@BUILD_COMMIT@"
-#define BUILD_COMPILER "@BUILD_COMPILER@"
-#define BUILD_TARGET "@BUILD_TARGET@"
-
-#endif // BUILD_INFO_H
--- a/scripts/build-info.sh
+++ b/scripts/build-info.sh
@@ -24,12 +24,7 @@ if out=$($CC -dumpmachine); then
  build_target=$out
 fi

-echo "#ifndef BUILD_INFO_H"
-echo "#define BUILD_INFO_H"
-echo
-echo "#define BUILD_NUMBER $build_number"
-echo "#define BUILD_COMMIT \"$build_commit\""
-echo "#define BUILD_COMPILER \"$build_compiler\""
-echo "#define BUILD_TARGET \"$build_target\""
-echo
-echo "#endif // BUILD_INFO_H"
+echo "int LLAMA_BUILD_NUMBER = ${build_number};"
+echo "char const *LLAMA_COMMIT = \"${build_commit}\";"
+echo "char const *LLAMA_COMPILER = \"${build_compiler}\";"
+echo "char const *LLAMA_BUILD_TARGET = \"${build_target}\";"
Author	SHA1	Message	Date
slaren	d0445a2eff	better documentation	2023-11-10 01:38:20 +01:00
slaren	bfcbb5bc32	format -> std::to_string	2023-11-10 01:26:12 +01:00
slaren	07352f4950	llama : add functions to get the model's metadata	2023-11-10 00:50:17 +01:00
Galunid	a75fa576ab	scripts: Generalize convert scripts (#3838 ) * Replace convert-*-hf-to-gguf.py files with convert-hf-to-gguf.py	2023-11-09 11:09:29 +01:00
Mihai	57ad015dc3	server : add min_p param (#3877 ) * Update server.cpp with min_p after it was introduced in https://github.com/ggerganov/llama.cpp/pull/3841 * Use spaces instead of tabs * Update index.html.hpp after running deps.sh * Fix test - fix line ending	2023-11-08 20:00:34 -06:00
slaren	875fb42871	ggml-alloc : fix backend assignments of views (#3982 )	2023-11-08 13:15:14 +01:00
Jared Van Bortel	0a7c980b6f	gguf : track writer state, free unneeded tensors, cleanup (#3871 )	2023-11-07 12:43:04 -05:00
Georgi Gerganov	413503d4b9	make : do not add linker flags when compiling static llava lib (#3977 )	2023-11-07 20:25:32 +03:00
xaedes	e9c1cecb9d	ggml : fix backward rope after YaRN (#3974 ) * fix backward process of rope rope backward process was broken after YaRN RoPE (#2268) implementation, due to missing changes in backward functions. the code for the backward process is nearly identically to the forward process: the only difference is the sign of the sin-values. to avoid future regressions remove the near-duplicate backward functions and reuse the forward code: for this a new function argument `bool forward` was added to `ggml_compute_forward_rope_f32` and `ggml_compute_forward_rope_f16`. the sin-values will be negated when forward is false. * fix finetune rope call to use correct default attn_factor of 1.0f * remove unused `ggml_rope_xpos_back` it is better to have only one `ggml_rope_back` function that accepts all rope parameters, so that `ggml_compute_backward` can propagate all parameters without having to switch between different rope_back variants. * fix comments explaining the sinus sign in ggml_forward_rope * add missing function arguments in declaration * fix function argument type in declaration	2023-11-07 10:04:51 +02:00
Matthew Tejo	54b4df8886	Use params when loading models in llava-cli (#3976 ) llava-cli was loading models with default params and ignoring settings from the cli. This switches to a generic function to load the params from the cli options.	2023-11-07 10:43:59 +03:00
Meng Zhang	46876d2a2c	cuda : supports running on CPU for GGML_USE_CUBLAS=ON build (#3946 ) * protyping the idea that supports running on CPU for a GGML_USE_CUBLAS=on build * doc: add comments to ggml_cublas_loaded() * fix defined(...)	2023-11-07 08:49:08 +02:00
Damian Stewart	381efbf480	llava : expose as a shared library for downstream projects (#3613 ) * wip llava python bindings compatibility * add external llava API * add base64 in-prompt image support * wip refactor image loading * refactor image load out of llava init * cleanup * further cleanup; move llava-cli into its own file and rename * move base64.hpp into common/ * collapse clip and llava libraries * move llava into its own subdir * wip * fix bug where base64 string was not removed from the prompt * get libllava to output in the right place * expose llava methods in libllama.dylib * cleanup memory usage around clip_image_* * cleanup and refactor again * update headerdoc * build with cmake, not tested (WIP) * Editorconfig * Editorconfig * Build with make * Build with make * Fix cyclical depts on Windows * attempt to fix build on Windows * attempt to fix build on Windows * Upd TODOs * attempt to fix build on Windows+CUDA * Revert changes in cmake * Fix according to review comments * Support building as a shared library * address review comments --------- Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com> Co-authored-by: Jared Van Bortel <jared@nomic.ai>	2023-11-07 00:36:23 +03:00
slaren	2833a6f63c	ggml-cuda : fix f16 mul mat (#3961 ) * ggml-cuda : fix f16 mul mat ggml-ci * silence common.cpp warning (bonus)	2023-11-05 18:45:16 +01:00
Kerfuffle	d9ccce2e33	Allow common process_escapes to handle \x sequences (#3928 ) * Allow common process_escapes to handle \x sequences * Fix edge case when second hex digit is NUL	2023-11-05 10:06:06 -07:00
Thái Hoàng Tâm	bb60fd0bf6	server : fix typo for --alias shortcut from -m to -a (#3958 )	2023-11-05 18:15:27 +02:00
Jared Van Bortel	132d25b8a6	cuda : fix disabling device with --tensor-split 1,0 (#3951 ) Co-authored-by: slaren <slarengh@gmail.com>	2023-11-05 10:08:57 -05:00
Meng Zhang	3d48f42efc	llama : mark LLM_ARCH_STARCODER as full offload supported (#3945 ) as done in https://github.com/ggerganov/llama.cpp/pull/3827	2023-11-05 14:40:08 +02:00
Eve	c41ea36eaa	cmake : MSVC instruction detection (fixed up #809 ) (#3923 ) * Add detection code for avx * Only check hardware when option is ON * Modify per code review sugguestions * Build locally will detect CPU * Fixes CMake style to use lowercase like everywhere else * cleanup * fix merge * linux/gcc version for testing * msvc combines avx2 and fma into /arch:AVX2 so check for both * cleanup * msvc only version * style * Update FindSIMD.cmake --------- Co-authored-by: Howard Su <howard0su@gmail.com> Co-authored-by: Jeremy Dunn <jeremydunn123@gmail.com>	2023-11-05 10:03:09 +02:00
Eve	a7fac013cf	ci : use intel sde when ci cpu doesn't support avx512 (#3949 )	2023-11-05 09:46:44 +02:00
slaren	48ade94538	cuda : revert CUDA pool stuff (#3944 ) * Revert "cuda : add ROCM aliases for CUDA pool stuff (#3918)" This reverts commit `629f917cd6`. * Revert "cuda : use CUDA memory pool with async memory allocation/deallocation when available (#3903)" This reverts commit `d6069051de`. ggml-ci	2023-11-05 09:12:13 +02:00
Kerfuffle	f28af0d81a	gguf-py: Support 01.AI Yi models (#3943 )	2023-11-04 16:20:34 -06:00
Peter Sugihara	d9b33fe95b	metal : round up to 16 to fix MTLDebugComputeCommandEncoder assertion (#3938 )	2023-11-03 21:18:18 +02:00
Xiao-Yong Jin	5ba3746171	ggml-metal: fix yarn rope (#3937 )	2023-11-03 14:00:31 -04:00
slaren	abb77e7319	ggml-cuda : move row numbers to x grid dim in mmv kernels (#3921 )	2023-11-03 12:13:09 +01:00
Georgi Gerganov	8f961abdc4	speculative : change default p_accept to 0.5 + CLI args (#3919 ) ggml-ci	2023-11-03 09:41:56 +02:00
Georgi Gerganov	05816027d6	common : YAYF (yet another YARN fix) (#3925 ) ggml-ci	2023-11-03 09:24:00 +02:00
cebtenzzre	3fdbe6b66b	llama : change yarn_ext_factor placeholder to -1 (#3922 )	2023-11-03 08:31:58 +02:00
Kerfuffle	629f917cd6	cuda : add ROCM aliases for CUDA pool stuff (#3918 )	2023-11-02 21:58:22 +02:00
Andrei	51b2fc11f7	cmake : fix relative path to git submodule index (#3915 )	2023-11-02 21:40:31 +02:00
Georgi Gerganov	224e7d5b14	readme : add notice about #3912	2023-11-02 20:44:12 +02:00
Georgi Gerganov	c7743fe1c1	cuda : fix const ptrs warning causing ROCm build issues (#3913 )	2023-11-02 20:32:11 +02:00
Oleksii Maryshchenko	d6069051de	cuda : use CUDA memory pool with async memory allocation/deallocation when available (#3903 ) * Using cuda memory pools for async alloc/dealloc. * If cuda device doesnt support memory pool than use old implementation. * Removed redundant cublasSetStream --------- Co-authored-by: Oleksii Maryshchenko <omaryshchenko@dtis.com>	2023-11-02 19:10:39 +02:00
Georgi Gerganov	4ff1046d75	gguf : print error for GGUFv1 files (#3908 )	2023-11-02 16:22:30 +02:00
slaren	21958bb393	cmake : disable LLAMA_NATIVE by default (#3906 )	2023-11-02 14:10:33 +02:00
Georgi Gerganov	2756c4fbff	gguf : remove special-case code for GGUFv1 (#3901 ) ggml-ci	2023-11-02 11:20:21 +02:00
Georgi Gerganov	1efae9b7dc	llm : prevent from 1-D tensors being GPU split (#3697 )	2023-11-02 09:54:44 +02:00
cebtenzzre	b12fa0d1c1	build : link against build info instead of compiling against it (#3879 ) * cmake : fix build when .git does not exist * cmake : simplify BUILD_INFO target * cmake : add missing dependencies on BUILD_INFO * build : link against build info instead of compiling against it * zig : make build info a .cpp source instead of a header Co-authored-by: Matheus C. França <matheus-catarino@hotmail.com> * cmake : revert change to CMP0115 --------- Co-authored-by: Matheus C. França <matheus-catarino@hotmail.com>	2023-11-02 08:50:16 +02:00
Georgi Gerganov	4d719a6d4e	cuda : check if this fixes Pascal card regression (#3882 )	2023-11-02 08:35:10 +02:00
Georgi Gerganov	183b3fac6c	metal : fix build errors and kernel sig after #2268 (#3898 )	2023-11-02 08:33:37 +02:00
cebtenzzre	2fffa0d61f	cuda : fix RoPE after #2268 (#3897 )	2023-11-02 07:49:44 +02:00