mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
41 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
74b239b3d5 | ||
|
|
852aafb163 | ||
|
|
0136966daf | ||
|
|
10b1e45876 | ||
|
|
197c00681b | ||
|
|
95f84d5ce8 | ||
|
|
5487593bc7 | ||
|
|
1d8fca72ae | ||
|
|
62bfef5194 | ||
|
|
eaf6e03174 | ||
|
|
d6ef0e77dd | ||
|
|
dff451cfa1 | ||
|
|
d298382ad9 | ||
|
|
32a28217f4 | ||
|
|
c429b33beb | ||
|
|
9146d36fe7 | ||
|
|
b9adcbbf92 | ||
|
|
9588f196b1 | ||
|
|
3cbd23ed88 | ||
|
|
00c6390793 | ||
|
|
faa0e6979a | ||
|
|
9791f40258 | ||
|
|
902184dd3a | ||
|
|
57684331fc | ||
|
|
b83bab15a5 | ||
|
|
d041d2ceaa | ||
|
|
27891f6db0 | ||
|
|
fbca2f27fc | ||
|
|
0df0aa8e43 | ||
|
|
74f33adf5f | ||
|
|
1debe72737 | ||
|
|
007489e895 | ||
|
|
8b94e799df | ||
|
|
3015851c5a | ||
|
|
55ac3b7aea | ||
|
|
dacfcebd60 | ||
|
|
9b82476ee9 | ||
|
|
a61a94e543 | ||
|
|
152da28ae5 | ||
|
|
d48c88cbd5 | ||
|
|
e84b71c2c6 |
50
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/01-bug-low.yml
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
name: Low Severity Bugs
|
||||
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "low severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
50
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/02-bug-medium.yml
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
name: Medium Severity Bug
|
||||
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "medium severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
50
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/03-bug-high.yml
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
name: High Severity Bug
|
||||
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "high severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
50
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
Normal file
50
.github/ISSUE_TEMPLATE/04-bug-critical.yml
vendored
Normal file
@@ -0,0 +1,50 @@
|
||||
name: Critical Severity Bug
|
||||
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
|
||||
title: "Bug: "
|
||||
labels: ["bug-unconfirmed", "critical severity"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
Thanks for taking the time to fill out this bug report!
|
||||
Please include information about your system, the steps to reproduce the bug,
|
||||
and the version of llama.cpp that you are using.
|
||||
If possible, please provide a minimal code example that reproduces the bug.
|
||||
- type: textarea
|
||||
id: what-happened
|
||||
attributes:
|
||||
label: What happened?
|
||||
description: Also tell us, what did you expect to happen?
|
||||
placeholder: Tell us what you see!
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
label: Name and Version
|
||||
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
|
||||
placeholder: |
|
||||
$./main --version
|
||||
version: 2999 (42b4109e)
|
||||
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
|
||||
validations:
|
||||
required: true
|
||||
- type: dropdown
|
||||
id: operating-system
|
||||
attributes:
|
||||
label: What operating system are you seeing the problem on?
|
||||
multiple: true
|
||||
options:
|
||||
- Linux
|
||||
- Mac
|
||||
- Windows
|
||||
- BSD
|
||||
- Other? (Please let us know in description)
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Relevant log output
|
||||
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
|
||||
render: shell
|
||||
51
.github/ISSUE_TEMPLATE/05-enhancement.yml
vendored
Normal file
51
.github/ISSUE_TEMPLATE/05-enhancement.yml
vendored
Normal file
@@ -0,0 +1,51 @@
|
||||
name: Enhancement template
|
||||
description: Used to request enhancements for llama.cpp
|
||||
title: "Feature Request: "
|
||||
labels: ["enhancement"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
|
||||
|
||||
- type: checkboxes
|
||||
id: prerequisites
|
||||
attributes:
|
||||
label: Prerequisites
|
||||
description: Please confirm the following before submitting your enhancement request.
|
||||
options:
|
||||
- label: I am running the latest code. Mention the version if possible as well.
|
||||
required: true
|
||||
- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
||||
required: true
|
||||
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
|
||||
required: true
|
||||
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: feature-description
|
||||
attributes:
|
||||
label: Feature Description
|
||||
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
|
||||
placeholder: Detailed description of the enhancement
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: motivation
|
||||
attributes:
|
||||
label: Motivation
|
||||
description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
|
||||
placeholder: Explanation of why this feature is needed and its benefits
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: possible-implementation
|
||||
attributes:
|
||||
label: Possible Implementation
|
||||
description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
|
||||
placeholder: Detailed description of potential implementation
|
||||
validations:
|
||||
required: false
|
||||
38
.github/ISSUE_TEMPLATE/06-question.yml
vendored
Normal file
38
.github/ISSUE_TEMPLATE/06-question.yml
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
name: Question template
|
||||
description: Used to ask questions about llama.cpp
|
||||
title: "Question: "
|
||||
labels: ["question"]
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
[Please search your question first in Discussion if you got a common general question.](https://github.com/ggerganov/llama.cpp/discussions/categories/q-a)
|
||||
|
||||
- type: checkboxes
|
||||
id: prerequisites
|
||||
attributes:
|
||||
label: Prerequisites
|
||||
description: Please confirm the following before submitting your question.
|
||||
options:
|
||||
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
|
||||
required: true
|
||||
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new useful question to share that cannot be answered within Discussions.
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: background-description
|
||||
attributes:
|
||||
label: Background Description
|
||||
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an question.
|
||||
placeholder: Detailed description of your question
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: possible-answer
|
||||
attributes:
|
||||
label: Possible Answer
|
||||
description: If you have some idea of possible answers you want to confirm, that would also be appreciated.
|
||||
placeholder: Your idea of possible answers
|
||||
validations:
|
||||
required: false
|
||||
11
.github/ISSUE_TEMPLATE/bug.md
vendored
11
.github/ISSUE_TEMPLATE/bug.md
vendored
@@ -1,11 +0,0 @@
|
||||
---
|
||||
name: Bug template
|
||||
about: Used to report bugs in llama.cpp
|
||||
labels: ["bug-unconfirmed"]
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
|
||||
|
||||
If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
|
||||
28
.github/ISSUE_TEMPLATE/enhancement.md
vendored
28
.github/ISSUE_TEMPLATE/enhancement.md
vendored
@@ -1,28 +0,0 @@
|
||||
---
|
||||
name: Enhancement template
|
||||
about: Used to request enhancements for llama.cpp
|
||||
labels: ["enhancement"]
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
# Prerequisites
|
||||
|
||||
Please answer the following questions for yourself before submitting an issue.
|
||||
|
||||
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
|
||||
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
|
||||
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
|
||||
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
|
||||
|
||||
# Feature Description
|
||||
|
||||
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
|
||||
|
||||
# Motivation
|
||||
|
||||
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
|
||||
|
||||
# Possible Implementation
|
||||
|
||||
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
|
||||
19
.github/labeler.yml
vendored
19
.github/labeler.yml
vendored
@@ -1,5 +1,16 @@
|
||||
# https://github.com/actions/labeler
|
||||
|
||||
Kompute:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-kompute.h
|
||||
- ggml-kompute.cpp
|
||||
- README-kompute.md
|
||||
Apple Metal:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-metal.h
|
||||
- ggml-metal.cpp
|
||||
- README-metal.md
|
||||
SYCL:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -9,6 +20,7 @@ SYCL:
|
||||
Nvidia GPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-cuda.h
|
||||
- ggml-cuda/**
|
||||
Vulkan:
|
||||
- changed-files:
|
||||
@@ -62,6 +74,8 @@ server:
|
||||
ggml:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml.c
|
||||
- ggml.h
|
||||
- ggml-*.c
|
||||
- ggml-*.h
|
||||
- ggml-cuda/**
|
||||
@@ -71,3 +85,6 @@ nix:
|
||||
- "**/*.nix"
|
||||
- .github/workflows/nix-*.yml
|
||||
- .devops/nix/nixpkgs-instances.nix
|
||||
embedding:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file: examples/embedding/
|
||||
|
||||
5
.github/workflows/docker.yml
vendored
5
.github/workflows/docker.yml
vendored
@@ -42,8 +42,9 @@ jobs:
|
||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
# TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
|
||||
#- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
#- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
|
||||
@@ -72,6 +72,7 @@ else()
|
||||
set(INS_ENB ON)
|
||||
endif()
|
||||
|
||||
option(LLAMA_SVE "llama: enable SVE" OFF)
|
||||
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
|
||||
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
@@ -124,7 +125,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
||||
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
|
||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||
option(LLAMA_RPC "llama: use RPC" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
|
||||
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
|
||||
@@ -384,10 +384,6 @@ if (LLAMA_LLAMAFILE)
|
||||
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
|
||||
endif()
|
||||
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
|
||||
set(LLAMA_CUDA ON)
|
||||
@@ -1045,6 +1041,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
endif()
|
||||
if (LLAMA_SVE)
|
||||
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
|
||||
endif()
|
||||
endif()
|
||||
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
{
|
||||
{
|
||||
"version": 4,
|
||||
"configurePresets": [
|
||||
{
|
||||
@@ -40,6 +40,10 @@
|
||||
|
||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
|
||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
|
||||
|
||||
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
|
||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
|
||||
]
|
||||
}
|
||||
|
||||
7
Makefile
7
Makefile
@@ -389,10 +389,6 @@ else
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifdef LLAMA_QKK_64
|
||||
MK_CPPFLAGS += -DGGML_QKK_64
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
# Mac OS - include Accelerate framework.
|
||||
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
||||
@@ -445,6 +441,9 @@ endif # JETSON_EOL_MODULE_DETECT
|
||||
ifdef LLAMA_DEBUG
|
||||
MK_NVCCFLAGS += -lineinfo
|
||||
endif # LLAMA_DEBUG
|
||||
ifdef LLAMA_CUDA_DEBUG
|
||||
MK_NVCCFLAGS += --device-debug
|
||||
endif # LLAMA_CUDA_DEBUG
|
||||
ifdef LLAMA_CUDA_NVCC
|
||||
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
|
||||
else
|
||||
|
||||
@@ -127,6 +127,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||
- [x] [OLMo](https://allenai.org/olmo)
|
||||
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
|
||||
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
||||
|
||||
@@ -140,6 +141,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
|
||||
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
|
||||
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
|
||||
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
|
||||
|
||||
**HTTP server**
|
||||
|
||||
@@ -201,6 +203,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
|
||||
**Tools:**
|
||||
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||
|
||||
---
|
||||
|
||||
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
||||
|
||||
423
ci/run.sh
423
ci/run.sh
@@ -202,12 +202,15 @@ function gg_sum_test_scripts_release {
|
||||
}
|
||||
|
||||
function gg_get_model {
|
||||
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
||||
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||
if [[ -s $gguf_3b ]]; then
|
||||
echo -n "$gguf_3b"
|
||||
elif [[ -s $gguf_7b ]]; then
|
||||
echo -n "$gguf_7b"
|
||||
local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
|
||||
local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
|
||||
local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||
if [[ -s $gguf_0 ]]; then
|
||||
echo -n "$gguf_0"
|
||||
elif [[ -s $gguf_1 ]]; then
|
||||
echo -n "$gguf_1"
|
||||
elif [[ -s $gguf_2 ]]; then
|
||||
echo -n "$gguf_2"
|
||||
else
|
||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||
exit 1
|
||||
@@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release {
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
# open_llama_3b_v2
|
||||
|
||||
function gg_run_open_llama_3b_v2 {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||
|
||||
path_models="../models-mnt/open-llama/3B-v2"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_open_llama_3b_v2 {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||
}
|
||||
|
||||
# open_llama_7b_v2
|
||||
# requires: GG_BUILD_CUDA
|
||||
|
||||
@@ -417,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
@@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 {
|
||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||
}
|
||||
|
||||
# pythia_1.4b
|
||||
|
||||
function gg_run_pythia_1_4b {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||
|
||||
path_models="../models-mnt/pythia/1.4B"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||
|
||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_pythia_1_4b {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Pythia 1.4B:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||
}
|
||||
|
||||
# pythia_2_8b
|
||||
# requires: GG_BUILD_CUDA
|
||||
|
||||
function gg_run_pythia_2_8b {
|
||||
cd ${SRC}
|
||||
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
|
||||
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
|
||||
|
||||
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||
|
||||
path_models="../models-mnt/pythia/2.8B"
|
||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||
|
||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||
|
||||
wiki_test="${path_wiki}/wiki.test.raw"
|
||||
|
||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
||||
|
||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
qnt="$1"
|
||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||
|
||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||
return 20
|
||||
fi
|
||||
|
||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||
return 0
|
||||
}
|
||||
|
||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
function gg_sum_pythia_2_8b {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Pythia 2.8B:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||
}
|
||||
|
||||
# bge-small
|
||||
|
||||
function gg_run_embd_bge_small {
|
||||
@@ -552,7 +688,7 @@ function gg_run_embd_bge_small {
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert-hf-to-gguf.py ${path_models}
|
||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||
|
||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||
@@ -606,9 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
|
||||
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
||||
if [ -z ${GG_BUILD_CUDA} ]; then
|
||||
test $ret -eq 0 && gg_run open_llama_3b_v2
|
||||
test $ret -eq 0 && gg_run pythia_1_4b
|
||||
else
|
||||
test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
test $ret -eq 0 && gg_run pythia_2_8b
|
||||
#test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||
|
||||
@@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
params.interactive_specials = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--special") {
|
||||
params.special = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--embedding") {
|
||||
params.embedding = true;
|
||||
return true;
|
||||
@@ -1362,6 +1366,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" --version show version and build info\n");
|
||||
printf(" -i, --interactive run in interactive mode\n");
|
||||
printf(" --special special tokens output enabled\n");
|
||||
printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
|
||||
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
||||
printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
|
||||
@@ -1855,11 +1860,15 @@ bool fs_create_directory_with_parents(const std::string & path) {
|
||||
|
||||
std::string fs_get_cache_directory() {
|
||||
std::string cache_directory = "";
|
||||
auto ensure_trailing_slash = [](std::string p) {
|
||||
// Make sure to add trailing slash
|
||||
if (p.back() != DIRECTORY_SEPARATOR) {
|
||||
p += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
return p;
|
||||
};
|
||||
if (getenv("LLAMA_CACHE")) {
|
||||
cache_directory = std::getenv("LLAMA_CACHE");
|
||||
if (cache_directory.back() != DIRECTORY_SEPARATOR) {
|
||||
cache_directory += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
} else {
|
||||
#ifdef __linux__
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
@@ -1870,12 +1879,12 @@ std::string fs_get_cache_directory() {
|
||||
#elif defined(__APPLE__)
|
||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
#elif defined(_WIN32)
|
||||
cache_directory = std::getenv("APPDATA");
|
||||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#endif // __linux__
|
||||
cache_directory = ensure_trailing_slash(cache_directory);
|
||||
cache_directory += "llama.cpp";
|
||||
cache_directory += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
return cache_directory;
|
||||
return ensure_trailing_slash(cache_directory);
|
||||
}
|
||||
|
||||
|
||||
@@ -2840,6 +2849,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
||||
|
||||
@@ -146,6 +146,7 @@ struct gpt_params {
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
|
||||
bool special = false; // enable special token output
|
||||
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||
|
||||
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
|
||||
|
||||
params.custom_n_ctx = false;
|
||||
|
||||
params.use_flash = true;
|
||||
params.use_flash = false;
|
||||
params.use_checkpointing = true;
|
||||
|
||||
params.sample_start = "";
|
||||
|
||||
@@ -81,6 +81,7 @@ models = [
|
||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -313,11 +313,10 @@ class Model:
|
||||
data = data.astype(np.float32)
|
||||
data_qtype = gguf.GGMLQuantizationType.F32
|
||||
|
||||
block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
|
||||
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
|
||||
|
||||
# reverse shape to make it similar to the internal ggml dimension order
|
||||
shape_str = f"""{{{', '.join(str(n) for n in reversed(
|
||||
(*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
|
||||
)}}}"""
|
||||
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
|
||||
|
||||
# n_dims is implicit in the shape
|
||||
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||
@@ -474,6 +473,9 @@ class Model:
|
||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||
res = "smaug-bpe"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -673,6 +675,44 @@ class GPTNeoXModel(Model):
|
||||
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
|
||||
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
|
||||
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
|
||||
# Map bloom-style qkv_linear to gpt-style qkv_linear
|
||||
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
|
||||
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
|
||||
qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
|
||||
data_torch = torch.cat(
|
||||
(
|
||||
qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
|
||||
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
|
||||
qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
|
||||
),
|
||||
dim=0,
|
||||
)
|
||||
logger.info("re-format attention.linear_qkv.weight")
|
||||
elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
|
||||
qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
|
||||
data_torch = torch.cat(
|
||||
(
|
||||
qkv_bias[:, 0, :].reshape((n_embed,)),
|
||||
qkv_bias[:, 1, :].reshape((n_embed,)),
|
||||
qkv_bias[:, 2, :].reshape((n_embed,)),
|
||||
),
|
||||
dim=0,
|
||||
)
|
||||
logger.info("re-format attention.linear_qkv.bias")
|
||||
|
||||
tensors.append((self.map_tensor_name(name), data_torch))
|
||||
|
||||
return tensors
|
||||
|
||||
|
||||
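Not part of the diff: the reshape in `GPTNeoXModel.modify_tensors` above converts the bloom/gpt-neox fused `query_key_value` layout (Q, K and V rows grouped per attention head) into the gpt-2 style layout expected downstream (all Q rows, then all K rows, then all V rows). A toy-sized sketch of the same re-ordering, with made-up dimensions:

```python
# Toy-sized illustration of the QKV re-ordering done above (not part of the diff).
import torch

n_head, n_embed = 2, 8                       # assumed toy sizes; head_dim = n_embed // n_head
qkv = torch.randn(3 * n_embed, n_embed)      # fused query_key_value.weight

w = qkv.reshape(n_head, 3, n_embed // n_head, n_embed)
reordered = torch.cat(
    (
        w[:, 0, :, :].reshape(-1, n_embed),  # all query rows
        w[:, 1, :, :].reshape(-1, n_embed),  # all key rows
        w[:, 2, :, :].reshape(-1, n_embed),  # all value rows
    ),
    dim=0,
)
assert reordered.shape == qkv.shape          # same data, rows grouped by Q/K/V instead of by head
```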
@Model.register("BloomForCausalLM")
|
||||
class BloomModel(Model):
|
||||
@@ -2355,7 +2395,8 @@ class CommandR2Model(Model):
|
||||
|
||||
# max_position_embeddings = 8192 in config.json but model was actually
|
||||
# trained on 128k context length
|
||||
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
|
||||
# aya-23 models don't have model_max_length specified
|
||||
self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
@@ -2428,6 +2469,157 @@ class JinaBertV2Model(BertModel):
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
|
||||
@Model.register("ArcticForCausalLM")
|
||||
class ArcticModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.ARCTIC
|
||||
|
||||
def set_vocab(self):
|
||||
# The reason for using a custom implementation here is that the
|
||||
# snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
|
||||
# tokenizer.model and used them as BOS and EOS instead of adding new tokens.
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
tokenizer_path = self.dir_model / 'tokenizer.model'
|
||||
|
||||
if not tokenizer_path.is_file():
|
||||
logger.error(f'Error: Missing {tokenizer_path}')
|
||||
sys.exit(1)
|
||||
|
||||
# Read the whole vocabulary from the tokenizer.model file
|
||||
tokenizer = SentencePieceProcessor()
|
||||
tokenizer.LoadFromFile(str(tokenizer_path))
|
||||
|
||||
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||
|
||||
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||
scores: list[float] = [-10000.0] * vocab_size
|
||||
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||
|
||||
for token_id in range(tokenizer.vocab_size()):
|
||||
|
||||
piece = tokenizer.IdToPiece(token_id)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.GetScore(token_id)
|
||||
|
||||
toktype = SentencePieceTokenTypes.NORMAL
|
||||
if tokenizer.IsUnknown(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||
elif tokenizer.IsControl(token_id):
|
||||
toktype = SentencePieceTokenTypes.CONTROL
|
||||
elif tokenizer.IsUnused(token_id):
|
||||
toktype = SentencePieceTokenTypes.UNUSED
|
||||
elif tokenizer.IsByte(token_id):
|
||||
toktype = SentencePieceTokenTypes.BYTE
|
||||
|
||||
tokens[token_id] = text
|
||||
scores[token_id] = score
|
||||
toktypes[token_id] = toktype
|
||||
|
||||
# Use the added_tokens_decoder field from tokeniser_config.json as the source
|
||||
# of information about added/redefined tokens and modify them accordingly.
|
||||
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
|
||||
if tokenizer_config_file.is_file():
|
||||
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_config_json = json.load(f)
|
||||
|
||||
if "added_tokens_decoder" in tokenizer_config_json:
|
||||
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
|
||||
for token_id, token_json in added_tokens_decoder.items():
|
||||
token_id = int(token_id)
|
||||
if (token_id >= vocab_size):
|
||||
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||
continue
|
||||
|
||||
token_content = token_json["content"]
|
||||
token_type = SentencePieceTokenTypes.USER_DEFINED
|
||||
token_score = -10000.0
|
||||
|
||||
# Map unk_token to UNKNOWN, other special tokens to CONTROL
|
||||
# Set the score to 0.0 as in the original tokenizer.model
|
||||
if ("special" in token_json) and token_json["special"]:
|
||||
if token_content == tokenizer_config_json["unk_token"]:
|
||||
token_type = SentencePieceTokenTypes.UNKNOWN
|
||||
else:
|
||||
token_type = SentencePieceTokenTypes.CONTROL
|
||||
token_score = 0.0
|
||||
|
||||
logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
|
||||
tokens[token_id] = token_content.encode("utf-8")
|
||||
toktypes[token_id] = token_type
|
||||
scores[token_id] = token_score
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("llama")
|
||||
self.gguf_writer.add_tokenizer_pre("default")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_scores(scores)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
|
||||
if name.endswith("q_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
|
||||
if name.endswith("k_proj.weight"):
|
||||
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
|
||||
|
||||
# process the experts separately
|
||||
if name.find("block_sparse_moe.experts") != -1:
|
||||
n_experts = self.hparams["num_local_experts"]
|
||||
|
||||
assert bid is not None
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
# merge the experts into a single 3d tensor
|
||||
for wid in ["w1", "w2", "w3"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
return tensors
|
||||
else:
|
||||
return []
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def write_tensors(self):
|
||||
super().write_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
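Not part of the diff: `ArcticModel.modify_tensors` above buffers each layer's per-expert `w1`/`w2`/`w3` weights and, once all `n_experts * 3` tensors for a layer have arrived, stacks each projection into a single 3-D tensor. A reduced sketch of that buffering/stacking step, with made-up sizes:

```python
# Reduced illustration of the per-layer expert stacking used above (not part of the diff).
import torch

n_experts, rows, cols = 4, 6, 3
per_expert = {
    f"model.layers.0.block_sparse_moe.experts.{xid}.w1.weight": torch.randn(rows, cols)
    for xid in range(n_experts)
}

# once every expert's w1 for the layer is present, stack into shape (n_experts, rows, cols)
w1_merged = torch.stack(
    [per_expert[f"model.layers.0.block_sparse_moe.experts.{xid}.w1.weight"] for xid in range(n_experts)],
    dim=0,
)
assert w1_merged.shape == (n_experts, rows, cols)
```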
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
|
||||
|
||||
params.samples_start_after_nl = false;
|
||||
params.use_adam = true;
|
||||
params.use_flash = true;
|
||||
params.use_flash = false;
|
||||
params.use_scratch = true;
|
||||
|
||||
// only adam
|
||||
|
||||
@@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
|
||||
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
|
||||
struct ggml_tensor * t16;
|
||||
if (enable_flash_attn) {
|
||||
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
|
||||
GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
|
||||
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
|
||||
} else {
|
||||
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
||||
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
||||
|
||||
@@ -7,8 +7,6 @@ android {
|
||||
namespace = "com.example.llama"
|
||||
compileSdk = 34
|
||||
|
||||
ndkVersion = "26.1.10909125"
|
||||
|
||||
defaultConfig {
|
||||
applicationId = "com.example.llama"
|
||||
minSdk = 33
|
||||
@@ -20,17 +18,6 @@ android {
|
||||
vectorDrawables {
|
||||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
@@ -55,17 +42,6 @@ android {
|
||||
composeOptions {
|
||||
kotlinCompilerExtensionVersion = "1.5.1"
|
||||
}
|
||||
packaging {
|
||||
resources {
|
||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path = file("src/main/cpp/CMakeLists.txt")
|
||||
version = "3.22.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
@@ -78,6 +54,7 @@ dependencies {
|
||||
implementation("androidx.compose.ui:ui-graphics")
|
||||
implementation("androidx.compose.ui:ui-tooling-preview")
|
||||
implementation("androidx.compose.material3:material3")
|
||||
implementation(project(":llama"))
|
||||
testImplementation("junit:junit:4.13.2")
|
||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.llama.cpp.LLamaAndroid
|
||||
import android.util.Log
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
@@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
|
||||
import kotlinx.coroutines.flow.catch
|
||||
import kotlinx.coroutines.launch
|
||||
|
||||
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
|
||||
companion object {
|
||||
@JvmStatic
|
||||
private val NanosPerSecond = 1_000_000_000.0
|
||||
@@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.unload()
|
||||
llamaAndroid.unload()
|
||||
} catch (exc: IllegalStateException) {
|
||||
messages += exc.message!!
|
||||
}
|
||||
@@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
messages += ""
|
||||
|
||||
viewModelScope.launch {
|
||||
llm.send(text)
|
||||
llamaAndroid.send(text)
|
||||
.catch {
|
||||
Log.e(tag, "send() failed", it)
|
||||
messages += it.message!!
|
||||
@@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
val start = System.nanoTime()
|
||||
val warmupResult = llm.bench(pp, tg, pl, nr)
|
||||
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
|
||||
val end = System.nanoTime()
|
||||
|
||||
messages += warmupResult
|
||||
@@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
return@launch
|
||||
}
|
||||
|
||||
messages += llm.bench(512, 128, 1, 3)
|
||||
messages += llamaAndroid.bench(512, 128, 1, 3)
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "bench() failed", exc)
|
||||
messages += exc.message!!
|
||||
@@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
fun load(pathToModel: String) {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llm.load(pathToModel)
|
||||
llamaAndroid.load(pathToModel)
|
||||
messages += "Loaded $pathToModel"
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "load() failed", exc)
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
plugins {
|
||||
id("com.android.application") version "8.2.0" apply false
|
||||
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
||||
id("com.android.library") version "8.2.0" apply false
|
||||
}
|
||||
|
||||
1
examples/llama.android/llama/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/build
|
||||
@@ -42,7 +42,7 @@ add_subdirectory(../../../../../../ build-llama)
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
68
examples/llama.android/llama/build.gradle.kts
Normal file
@@ -0,0 +1,68 @@
|
||||
plugins {
|
||||
id("com.android.library")
|
||||
id("org.jetbrains.kotlin.android")
|
||||
}
|
||||
|
||||
android {
|
||||
namespace = "android.llama.cpp"
|
||||
compileSdk = 34
|
||||
|
||||
defaultConfig {
|
||||
minSdk = 33
|
||||
|
||||
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
|
||||
consumerProguardFiles("consumer-rules.pro")
|
||||
ndk {
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
|
||||
cppFlags("")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
release {
|
||||
isMinifyEnabled = false
|
||||
proguardFiles(
|
||||
getDefaultProguardFile("proguard-android-optimize.txt"),
|
||||
"proguard-rules.pro"
|
||||
)
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path("src/main/cpp/CMakeLists.txt")
|
||||
version = "3.22.1"
|
||||
}
|
||||
}
|
||||
compileOptions {
|
||||
sourceCompatibility = JavaVersion.VERSION_1_8
|
||||
targetCompatibility = JavaVersion.VERSION_1_8
|
||||
}
|
||||
kotlinOptions {
|
||||
jvmTarget = "1.8"
|
||||
}
|
||||
|
||||
packaging {
|
||||
resources {
|
||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
|
||||
implementation("androidx.core:core-ktx:1.12.0")
|
||||
implementation("androidx.appcompat:appcompat:1.6.1")
|
||||
implementation("com.google.android.material:material:1.11.0")
|
||||
testImplementation("junit:junit:4.13.2")
|
||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||
}
|
||||
0
examples/llama.android/llama/consumer-rules.pro
Normal file
21
examples/llama.android/llama/proguard-rules.pro
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
# Add project specific ProGuard rules here.
|
||||
# You can control the set of applied configuration files using the
|
||||
# proguardFiles setting in build.gradle.
|
||||
#
|
||||
# For more details, see
|
||||
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||
|
||||
# If your project uses WebView with JS, uncomment the following
|
||||
# and specify the fully qualified class name to the JavaScript interface
|
||||
# class:
|
||||
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||
# public *;
|
||||
#}
|
||||
|
||||
# Uncomment this to preserve the line number information for
|
||||
# debugging stack traces.
|
||||
#-keepattributes SourceFile,LineNumberTable
|
||||
|
||||
# If you keep the line number information, uncomment this to
|
||||
# hide the original source file name.
|
||||
#-renamesourcefileattribute SourceFile
|
||||
@@ -0,0 +1,24 @@
|
||||
package android.llama.cpp
|
||||
|
||||
import androidx.test.platform.app.InstrumentationRegistry
|
||||
import androidx.test.ext.junit.runners.AndroidJUnit4
|
||||
|
||||
import org.junit.Test
|
||||
import org.junit.runner.RunWith
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Instrumented test, which will execute on an Android device.
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
@RunWith(AndroidJUnit4::class)
|
||||
class ExampleInstrumentedTest {
|
||||
@Test
|
||||
fun useAppContext() {
|
||||
// Context of the app under test.
|
||||
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
|
||||
assertEquals("android.llama.cpp.test", appContext.packageName)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,4 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
|
||||
</manifest>
|
||||
49
examples/llama.android/llama/src/main/cpp/CMakeLists.txt
Normal file
@@ -0,0 +1,49 @@
|
||||
# For more information about using CMake with Android Studio, read the
|
||||
# documentation: https://d.android.com/studio/projects/add-native-code.html.
|
||||
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
|
||||
|
||||
# Sets the minimum CMake version required for this project.
|
||||
cmake_minimum_required(VERSION 3.22.1)
|
||||
|
||||
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
|
||||
# Since this is the top level CMakeLists.txt, the project name is also accessible
|
||||
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
|
||||
# build script scope).
|
||||
project("llama-android")
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
llama
|
||||
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
GIT_TAG master
|
||||
)
|
||||
|
||||
# Also provides "common"
|
||||
FetchContent_MakeAvailable(llama)
|
||||
|
||||
# Creates and names a library, sets it as either STATIC
|
||||
# or SHARED, and provides the relative paths to its source code.
|
||||
# You can define multiple libraries, and CMake builds them for you.
|
||||
# Gradle automatically packages shared libraries with your APK.
|
||||
#
|
||||
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
|
||||
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
|
||||
# is preferred for the same purpose.
|
||||
#
|
||||
# In order to load a library into your app from Java/Kotlin, you must call
|
||||
# System.loadLibrary() and pass the name of the library defined here;
|
||||
# for GameActivity/NativeActivity derived applications, the same library name must be
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
# build script, prebuilt third-party libraries, or Android system libraries.
|
||||
target_link_libraries(${CMAKE_PROJECT_NAME}
|
||||
# List libraries link to the target library
|
||||
llama
|
||||
common
|
||||
android
|
||||
log)
|
||||
@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
||||
@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
llama_free_model(reinterpret_cast<llama_model *>(model));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
auto model = reinterpret_cast<llama_model *>(jmodel);
|
||||
|
||||
if (!model) {
|
||||
@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
llama_free(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
|
||||
llama_log_set(log_callback, NULL);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_bench_1model(
|
||||
Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
|
||||
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
||||
|
||||
@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
|
||||
llama_backend_init();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
|
||||
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
|
||||
return env->NewStringUTF(llama_print_system_info());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jint JNICALL
|
||||
Java_com_example_llama_Llm_completion_1init(
|
||||
Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_com_example_llama_Llm_completion_1loop(
|
||||
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
||||
JNIEnv * env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.example.llama
|
||||
package android.llama.cpp
|
||||
|
||||
import android.util.Log
|
||||
import kotlinx.coroutines.CoroutineDispatcher
|
||||
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
|
||||
import java.util.concurrent.Executors
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
class Llm {
|
||||
class LLamaAndroid {
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
|
||||
@@ -165,8 +165,8 @@ class Llm {
|
||||
}
|
||||
|
||||
// Enforce only one instance of Llm.
|
||||
private val _instance: Llm = Llm()
|
||||
private val _instance: LLamaAndroid = LLamaAndroid()
|
||||
|
||||
fun instance(): Llm = _instance
|
||||
fun instance(): LLamaAndroid = _instance
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
package android.llama.cpp
|
||||
|
||||
import org.junit.Test
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Example local unit test, which will execute on the development machine (host).
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
class ExampleUnitTest {
|
||||
@Test
|
||||
fun addition_isCorrect() {
|
||||
assertEquals(4, 2 + 2)
|
||||
}
|
||||
}
|
||||
@@ -15,3 +15,4 @@ dependencyResolutionManagement {
|
||||
|
||||
rootProject.name = "LlamaAndroid"
|
||||
include(":app")
|
||||
include(":llama")
|
||||
|
||||
@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
|
||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||
|
||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
|
||||
@@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
|
||||
// display text
|
||||
if (input_echo && display) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
|
||||
printf("%s", token_str.c_str());
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
||||
|
||||
// Console/Stream Output
|
||||
fprintf(stdout, "%s", token_str.c_str());
|
||||
|
||||
// Record Displayed Tokens To Log
|
||||
// Note: Generated tokens are created one by one hence this check
|
||||
if (embd.size() > 1) {
|
||||
// Incoming Requested Tokens
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
// Outgoing Generated Tokens
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>SimpleChat (LlamaCPP, ...) </title>
|
||||
<title>SimpleChat LlamaCppEtal </title>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<meta name="message" content="Save Nature Save Earth" />
|
||||
@@ -30,20 +30,17 @@
|
||||
<hr>
|
||||
<div class="sameline">
|
||||
<label for="system-in">System</label>
|
||||
<input type="text" name="system" id="system-in" class="flex-grow"/>
|
||||
<input type="text" name="system" id="system-in" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"/>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
<div id="chat-div">
|
||||
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||
<p> Enter your text to the ai assistant below.</p>
|
||||
<p> Use shift+enter for inserting enter.</p>
|
||||
<p> Refresh the page to start over fresh.</p>
|
||||
<p> You need to have javascript enabled.</p>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
<div class="sameline">
|
||||
<textarea id="user-in" class="flex-grow" rows="3"></textarea>
|
||||
<textarea id="user-in" class="flex-grow" rows="3" placeholder="enter your query to the ai model here" ></textarea>
|
||||
<button id="user-btn">submit</button>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -14,11 +14,15 @@ own system prompts.
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
enough manner, in general.

NOTE: Given that the idea is for basic minimal testing, it doesn't bother with any model context length and
culling of old messages from the chat.
Allows the developer/end-user to control some of the behaviour by updating gMe members from the browser's devel-tool
console.

NOTE: It doesn't set any parameters other than temperature for now. However if someone wants they can update
the js file as needed.
NOTE: Given that the idea is for basic minimal testing, it doesn't bother with any model context length and
culling of old messages from the chat by default. However by enabling the sliding window chat logic, a crude
form of old-message culling can be achieved.

NOTE: It doesn't set any parameters other than temperature and max_tokens for now. However if someone wants
they can update the js file or the equivalent member in gMe as needed.

## usage
@@ -43,11 +47,33 @@ next run this web front end in examples/server/public_simplechat

### using the front end

Open this simple web front end from your local browser

* http://127.0.0.1:PORT/index.html

Once inside

* Select between chat and completion mode. By default it is set to chat mode.

* In completion mode
  * the logic by default doesn't insert any role-specific "ROLE: " prefix wrt each role's message.
    If the model requires any prefix wrt user role messages, then the end user has to
    explicitly add the needed prefix when they enter their chat message.
    Similarly, if the model requires any prefix to trigger the assistant/ai-model response,
    then the end user needs to enter the same.
    This keeps the logic simple, while still giving flexibility to the end user to
    manage any templating/tagging requirement wrt their messages to the model.
  * the logic doesn't insert a newline at the beginning and end of the generated prompt message.
    However if the chat being sent to the /completions end point has more than one role's message,
    then a newline is inserted when moving from one role's message to the next role's message, so
    that they can be clearly identified/distinguished.
  * given that the /completions endpoint normally doesn't add additional chat-templating of its
    own, the above ensures that the end user can create a custom single/multi message combo with
    any tags/special-tokens related chat templating to test out the model handshake. Or the end user
    can use it just for a normal completion related/based query.

* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
  Normally Completion mode doesn't need a system prompt, while Chat mode can generate better/more interesting
  responses with a suitable system prompt.
  * if chat.add_system_begin is used
    * you can't change the system prompt after it has been submitted once along with a user query.
    * you can't set a system prompt after you have submitted any user query
@@ -55,27 +81,121 @@ Once inside
  * one can change the system prompt any time during chat, by changing the contents of the system prompt.
    * in turn the updated/changed system prompt will be inserted into the chat session.
    * this allows the subsequent user chatting to be driven by the new system prompt set above.

* Enter your query and either press enter or click on the submit button.
  If you want to insert enter (\n) as part of your chat/query to the ai model, use shift+enter.

* Wait for the logic to communicate with the server and get the response.
  * the user is not allowed to enter any fresh query during this time.
  * the user input box will be disabled and a working message will be shown in it.

* just refresh the page to reset the chat history and/or system prompt and start afresh.

* Using NewChat one can start independent chat sessions.
  * two independent chat sessions are set up by default.

## Devel note

### Reason behind this

The idea is to be easy enough to use for basic purposes, while also being simple and easily discernible
by developers who may not be from a web frontend background (so in turn may not be familiar with template /
end-use-specific-language-extensions driven flows), so that they can use it to explore/experiment with things.

And given that the idea is also to help developers explore/experiment, some flexibility is provided
to change behaviour easily using the devel-tools/console, for now. And skeletal logic has been implemented
to explore some of the end points and ideas/implications around them.


### General

Me/gMe consolidates the settings which control the behaviour into one object.
One can see the current settings, as well as change/update them, using the browser's devel-tool/console.

bCompletionFreshChatAlways - whether Completion mode collates the complete/sliding-window history when
communicating with the server, or only sends the latest user query/message.

bCompletionInsertStandardRolePrefix - whether Completion mode inserts a role-related prefix wrt the
messages that get inserted into the prompt field for the /completions endpoint.

chatRequestOptions - maintains the list of options/fields to send along with a chat request,
irrespective of whether the /chat/completions or /completions endpoint is used.

  If you want to add additional options/fields to send to the server/ai-model, and/or
  modify the existing options' values or remove them, for now you can update this global var
  using the browser's development-tools/console.

iRecentUserMsgCnt - a simple-minded sliding window to limit the context window load at the ai-model end.
  This is disabled by default. However if enabled, then in addition to the latest system message, only
  the last/latest iRecentUserMsgCnt user messages after the latest system prompt, and their responses
  from the ai model, will be sent to the ai-model when querying for a new response. IE if enabled,
  only user messages after the latest system message/prompt will be considered.

  This specified sliding window user message count also includes the latest user query.
  <0 : Send the entire chat history to the server
   0 : Send only the system message, if any, to the server
  >0 : Send the latest chat history from the latest system prompt, limited to the specified count.


By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
implications of loading the ai-model's context window with chat history, wrt the chat response, to
some extent in a simple, crude way.

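A compact Python sketch of the iRecentUserMsgCnt selection rule described above (illustrative only; the real logic is `SimpleChat.recent_chat()` in simplechat.js):

```python
# Illustrative sketch of the iRecentUserMsgCnt sliding window (not the shipped code).
def recent_chat(xchat, i_last_sys, i_recent_user_msg_cnt):
    """xchat: list of {"role": ..., "content": ...} dicts; i_last_sys: index of the
    latest system message, or -1 if there is none."""
    if i_recent_user_msg_cnt < 0:
        return list(xchat)                       # send the full history
    rchat = []
    if i_last_sys >= 0:
        rchat.append(xchat[i_last_sys])          # always keep the latest system prompt
    # walk backwards until enough user messages (and their responses) are collected
    i_start = len(xchat)
    user_cnt = 0
    for i in range(len(xchat) - 1, i_last_sys, -1):
        if user_cnt >= i_recent_user_msg_cnt:
            break
        if xchat[i]["role"] == "user":
            i_start = i
            user_cnt += 1
    rchat.extend(m for m in xchat[i_start:] if m["role"] != "system")
    return rchat
```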
Sometimes the browser may be stubborn with caching of the files, so your updates to html/css/js
may not be visible. Also remember that just refreshing/reloading the page in the browser, or for that
matter clearing site data, don't directly override site caching in all cases. Worst case you may
have to change the port. Or in the browser's dev tools you may be able to disable caching fully.


The concept of multiple chat sessions with different servers, as well as saving and restoring of
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
its instances relatively easily; however, given the current goal of keeping this simple, it has
not been added, for now.


By switching between chat.add_system_begin/anytime, one can control whether the system prompt can be
changed anytime during the conversation or only at the beginning.


read_json_early is to experiment with reading json response data early on, if available,
so that the user can be shown generated data as and when it is being generated, rather than
at the end when the full data is available.

  the server flow doesn't seem to be sending back data early, at least for the requests (inc options)
  that are currently sent.

  if it becomes possible to read json data early on in future, as and when the ai model is generating data, then
  this helper needs to indirectly update the chat div with the received data, without waiting
  for the overall data to be available.

### Default setup

By default things are set up to try and make the user experience a bit better, if possible.
However a developer, when testing the server or ai-model, may want to change these values.

Using iRecentUserMsgCnt reduces the chat history context sent to the server/ai-model to
just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of the
full chat history. This way if there is any response with garbage/repetition, it doesn't
mess with things beyond the next question/request/query, in some ways.

max_tokens is set to 1024, so that a relatively large previous response doesn't eat up the space
available for the next query-response. However don't forget that the server should also be started
with a model context size of 1k or more, to be on the safe side.

The /completions endpoint of examples/server doesn't take max_tokens; instead it takes the
internal n_predict. For now the same is added here on the client side; maybe later max_tokens
will be added to the /completions endpoint handling code on the server side.

Frequency and presence penalty fields are set to 1.2 in the set of fields sent to the server
along with the user query, so that the model is partly set to try to avoid repeating text in
its response.

An end user can change these behaviours by editing gMe from the browser's devel-tool/console.

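To make the defaults above concrete, the following hedged sketch reproduces roughly the request body the frontend builds from gMe.chatRequestOptions and sends to the chat endpoint; the server address/port and the exact option set are assumptions based on the description above and the usual examples/server defaults:

```python
# Rough reproduction of the request the frontend builds with the default
# chatRequestOptions (illustrative; assumes a llama.cpp server on 127.0.0.1:8080).
import json
import urllib.request

options = {
    "temperature": 0.7,
    "max_tokens": 1024,        # used by the chat endpoint
    "n_predict": 1024,         # used by the /completions endpoint
    "frequency_penalty": 1.2,
    "presence_penalty": 1.2,
}

body = {
    "messages": [
        {"role": "system", "content": "you are a helpful ai assistant, who provides concise answers"},
        {"role": "user", "content": "hello"},
    ],
    **options,
}

req = urllib.request.Request(
    "http://127.0.0.1:8080/chat/completions",
    data=json.dumps(body).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))
```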
## At the end
|
||||
|
||||
Also a thank you to all open source and open model developers, who strive for the common good.
|
||||
|
||||
@@ -48,6 +48,13 @@ button {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.ul1 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
.ul2 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
|
||||
* {
|
||||
margin: 0.6vmin;
|
||||
}
|
||||
|
||||
@@ -14,23 +14,86 @@ class ApiEP {
|
||||
}
|
||||
|
||||
let gUsageMsg = `
<p> Enter the system prompt above, before entering/submitting any user query.</p>
<p> Enter your text to the ai assistant below.</p>
<p> Use shift+enter for inserting enter.</p>
<p> Refresh the page to start over fresh.</p>
<p class="role-system">Usage</p>
<ul class="ul1">
<li> Set the system prompt above, to try to control the ai response characteristics, if the model supports same.</li>
<ul class="ul2">
<li> Completion mode normally won't have a system prompt.</li>
</ul>
<li> Enter your query to the ai assistant below.</li>
<ul class="ul2">
<li> Completion mode doesn't insert a user/role: prefix implicitly.</li>
<li> Use shift+enter for inserting enter/newline.</li>
</ul>
<li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
<ul class="ul2">
<li> experiment with iRecentUserMsgCnt, max_tokens, model ctxt window to expand</li>
</ul>
</ul>
`;
|
||||
|
||||
/** @typedef {{role: string, content: string}[]} ChatMessages */
|
||||
|
||||
class SimpleChat {
|
||||
|
||||
constructor() {
|
||||
/**
|
||||
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
||||
* @type {{role: string, content: string}[]}
|
||||
* @type {ChatMessages}
|
||||
*/
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
clear() {
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recent chat messages.
|
||||
* If iRecentUserMsgCnt < 0
|
||||
* Then return the full chat history
|
||||
* Else
|
||||
* Return chat messages from latest going back till the last/latest system prompt.
|
||||
* While keeping track that the number of user queries/messages doesnt exceed iRecentUserMsgCnt.
|
||||
* @param {number} iRecentUserMsgCnt
|
||||
*/
|
||||
recent_chat(iRecentUserMsgCnt) {
|
||||
if (iRecentUserMsgCnt < 0) {
|
||||
return this.xchat;
|
||||
}
|
||||
if (iRecentUserMsgCnt == 0) {
|
||||
console.warn("WARN:SimpleChat:SC:RecentChat:iRecentUsermsgCnt of 0 means no user message/query sent");
|
||||
}
|
||||
/** @type{ChatMessages} */
|
||||
let rchat = [];
|
||||
let sysMsg = this.get_system_latest();
|
||||
if (sysMsg.length != 0) {
|
||||
rchat.push({role: Roles.System, content: sysMsg});
|
||||
}
|
||||
let iUserCnt = 0;
|
||||
let iStart = this.xchat.length;
|
||||
for(let i=this.xchat.length-1; i > this.iLastSys; i--) {
|
||||
if (iUserCnt >= iRecentUserMsgCnt) {
|
||||
break;
|
||||
}
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.User) {
|
||||
iStart = i;
|
||||
iUserCnt += 1;
|
||||
}
|
||||
}
|
||||
for(let i = iStart; i < this.xchat.length; i++) {
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.System) {
|
||||
continue;
|
||||
}
|
||||
rchat.push({role: msg.role, content: msg.content});
|
||||
}
|
||||
return rchat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an entry into xchat
|
||||
* @param {string} role
|
||||
@@ -57,7 +120,7 @@ class SimpleChat {
|
||||
div.replaceChildren();
|
||||
}
|
||||
let last = undefined;
|
||||
for(const x of this.xchat) {
|
||||
for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
let entry = document.createElement("p");
|
||||
entry.className = `role-${x.role}`;
|
||||
entry.innerText = `${x.role}: ${x.content}`;
|
||||
@@ -69,17 +132,21 @@ class SimpleChat {
|
||||
} else {
|
||||
if (bClear) {
|
||||
div.innerHTML = gUsageMsg;
|
||||
gMe.show_info(div);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint
|
||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
* The needed fields/options are picked from a global object.
|
||||
* Convert the json into string.
|
||||
* @param {Object} obj
|
||||
*/
|
||||
request_jsonstr(obj) {
|
||||
obj["temperature"] = 0.7;
|
||||
for(let k in gMe.chatRequestOptions) {
|
||||
obj[k] = gMe.chatRequestOptions[k];
|
||||
}
|
||||
return JSON.stringify(obj);
|
||||
}
|
||||
|
||||
@@ -88,18 +155,27 @@ class SimpleChat {
|
||||
*/
|
||||
request_messages_jsonstr() {
|
||||
let req = {
|
||||
messages: this.xchat,
|
||||
messages: this.recent_chat(gMe.iRecentUserMsgCnt),
|
||||
}
|
||||
return this.request_jsonstr(req);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string form of json object suitable for /completions
|
||||
* @param {boolean} bInsertStandardRolePrefix Insert "<THE_ROLE>: " as prefix wrt each role's message
|
||||
*/
|
||||
request_prompt_jsonstr() {
|
||||
request_prompt_jsonstr(bInsertStandardRolePrefix) {
|
||||
let prompt = "";
|
||||
for(const chat of this.xchat) {
|
||||
prompt += `${chat.role}: ${chat.content}\n`;
|
||||
let iCnt = 0;
|
||||
for(const chat of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
iCnt += 1;
|
||||
if (iCnt > 1) {
|
||||
prompt += "\n";
|
||||
}
|
||||
if (bInsertStandardRolePrefix) {
|
||||
prompt += `${chat.role}: `;
|
||||
}
|
||||
prompt += `${chat.content}`;
|
||||
}
|
||||
let req = {
|
||||
prompt: prompt,
|
||||
@@ -171,7 +247,6 @@ let gChatURL = {
|
||||
'chat': `${gBaseURL}/chat/completions`,
|
||||
'completion': `${gBaseURL}/completions`,
|
||||
}
|
||||
const gbCompletionFreshChatAlways = true;
|
||||
|
||||
|
||||
/**
|
||||
@@ -291,6 +366,8 @@ class MultiChatUI {
|
||||
// allow user to insert enter into their message using shift+enter.
|
||||
// while just pressing enter key will lead to submitting.
|
||||
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||
let value = this.elInUser.value;
|
||||
this.elInUser.value = value.substring(0,value.length-1);
|
||||
this.elBtnUser.click();
|
||||
ev.preventDefault();
|
||||
}
|
||||
@@ -321,6 +398,29 @@ class MultiChatUI {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try read json response early, if available.
|
||||
* @param {Response} resp
|
||||
*/
|
||||
async read_json_early(resp) {
|
||||
if (!resp.body) {
|
||||
throw Error("ERRR:SimpleChat:MCUI:ReadJsonEarly:No body...");
|
||||
}
|
||||
let tdUtf8 = new TextDecoder("utf-8");
|
||||
let rr = resp.body.getReader();
|
||||
let gotBody = "";
|
||||
while(true) {
|
||||
let { value: cur, done: done} = await rr.read();
|
||||
let curBody = tdUtf8.decode(cur);
|
||||
console.debug("DBUG:SC:PART:", curBody);
|
||||
gotBody += curBody;
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return JSON.parse(gotBody);
|
||||
}
|
||||
|
||||
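Not part of the diff: `read_json_early()` above accumulates the response body chunk by chunk with a streaming reader, so the UI could in principle surface partial output before the request completes. A rough Python counterpart of the same idea (illustrative; assumes a local examples/server and the /completions request shape used elsewhere in this file; as the README notes, the server may still deliver the body in one piece for this kind of request):

```python
# Illustrative Python counterpart of read_json_early(): read the HTTP body
# incrementally and only JSON-decode once the stream ends.
import json
import urllib.request

req = urllib.request.Request(
    "http://127.0.0.1:8080/completions",
    data=json.dumps({"prompt": "Hello", "n_predict": 16}).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

body = bytearray()
with urllib.request.urlopen(req) as resp:
    while True:
        chunk = resp.read(4096)        # read whatever has arrived so far
        if not chunk:
            break
        body.extend(chunk)             # a UI could surface partial text here
print(json.loads(body.decode("utf-8")))
```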
/**
|
||||
* Handle user query submit request, wrt specified chat session.
|
||||
* @param {string} chatId
|
||||
@@ -330,6 +430,14 @@ class MultiChatUI {
|
||||
|
||||
let chat = this.simpleChats[chatId];
|
||||
|
||||
// In completion mode, if configured, clear any previous chat history.
|
||||
// So if user wants to simulate a multi-chat based completion query,
|
||||
// they will have to enter the full thing, as a suitable multiline
|
||||
// user input/query.
|
||||
if ((apiEP == ApiEP.Completion) && (gMe.bCompletionFreshChatAlways)) {
|
||||
chat.clear();
|
||||
}
|
||||
|
||||
chat.add_system_anytime(this.elInSystem.value, chatId);
|
||||
|
||||
let content = this.elInUser.value;
|
||||
@@ -344,7 +452,7 @@ class MultiChatUI {
|
||||
if (apiEP == ApiEP.Chat) {
|
||||
theBody = chat.request_messages_jsonstr();
|
||||
} else {
|
||||
theBody = chat.request_prompt_jsonstr();
|
||||
theBody = chat.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
|
||||
}
|
||||
|
||||
this.elInUser.value = "working...";
|
||||
@@ -359,6 +467,7 @@ class MultiChatUI {
|
||||
});
|
||||
|
||||
let respBody = await resp.json();
|
||||
//let respBody = await this.read_json_early(resp);
|
||||
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
||||
let assistantMsg;
|
||||
if (apiEP == ApiEP.Chat) {
|
||||
@@ -376,13 +485,6 @@ class MultiChatUI {
|
||||
} else {
|
||||
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
||||
}
|
||||
// Purposefully clear at end rather than begin of this function
|
||||
// so that one can switch from chat to completion mode and sequece
|
||||
// in a completion mode with multiple user-assistant chat data
|
||||
// from before to be sent/occur once.
|
||||
if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
|
||||
chat.xchat.length = 0;
|
||||
}
|
||||
this.ui_reset_userinput();
|
||||
}
|
||||
|
||||
@@ -462,17 +564,66 @@ class MultiChatUI {
|
||||
}
|
||||
|
||||
|
||||
let gMuitChat;
|
||||
const gChatIds = [ "Default", "Other" ];
|
||||
class Me {
|
||||
|
||||
constructor() {
|
||||
this.defaultChatIds = [ "Default", "Other" ];
|
||||
this.multiChat = new MultiChatUI();
|
||||
this.bCompletionFreshChatAlways = true;
|
||||
this.bCompletionInsertStandardRolePrefix = false;
|
||||
this.iRecentUserMsgCnt = 2;
|
||||
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
this.chatRequestOptions = {
|
||||
"temperature": 0.7,
|
||||
"max_tokens": 1024,
|
||||
"frequency_penalty": 1.2,
|
||||
"presence_penalty": 1.2,
|
||||
"n_predict": 1024
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {HTMLDivElement} elDiv
|
||||
*/
|
||||
show_info(elDiv) {
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = "Settings (devel-tools-console gMe)";
|
||||
p.className = "role-system";
|
||||
elDiv.appendChild(p);
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = `bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `chatRequestOptions:${JSON.stringify(this.chatRequestOptions)}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** @type {Me} */
|
||||
let gMe;
|
||||
|
||||
function startme() {
|
||||
console.log("INFO:SimpleChat:StartMe:Starting...");
|
||||
gMuitChat = new MultiChatUI();
|
||||
for (let cid of gChatIds) {
|
||||
gMuitChat.new_chat_session(cid);
|
||||
gMe = new Me();
|
||||
for (let cid of gMe.defaultChatIds) {
|
||||
gMe.multiChat.new_chat_session(cid);
|
||||
}
|
||||
gMuitChat.setup_ui(gChatIds[0]);
|
||||
gMuitChat.show_sessions();
|
||||
gMe.multiChat.setup_ui(gMe.defaultChatIds[0], true);
|
||||
gMe.multiChat.show_sessions();
|
||||
}
|
||||
|
||||
document.addEventListener("DOMContentLoaded", startme);
|
||||
|
||||
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
|
||||
|
||||
:: for FP16
|
||||
:: faster for long-prompt inference
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||
|
||||
:: for FP32
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||
if %errorlevel% neq 0 goto ERROR
|
||||
:: build example/main only
|
||||
:: make main
|
||||
|
||||
@@ -3,40 +3,390 @@
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#include <shellapi.h> // For CommandLineToArgvW
|
||||
#endif
|
||||
|
||||
static void print_usage_information(const char * argv0, FILE * stream) {
|
||||
fprintf(stream, "usage: %s [options]\n\n", argv0);
|
||||
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
|
||||
fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
|
||||
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
|
||||
fprintf(stream, "to control the behavior of the tokenizer.\n\n");
|
||||
fprintf(stream, " The possible options are:\n");
|
||||
fprintf(stream, "\n");
|
||||
fprintf(stream, " -h, --help print this help and exit\n");
|
||||
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
||||
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
|
||||
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
||||
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
||||
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) text;
|
||||
(void) user_data;
|
||||
}
|
||||
|
||||
static std::string read_prompt_from_file(const char * filepath, bool & success) {
|
||||
success = false;
|
||||
|
||||
std::ifstream in(filepath, std::ios::binary);
|
||||
if (!in) {
|
||||
fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
|
||||
return std::string();
|
||||
}
|
||||
// do not assume the file is seekable (e.g. /dev/stdin)
|
||||
std::stringstream buffer;
|
||||
buffer << in.rdbuf();
|
||||
if (in.fail()) {
|
||||
fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
|
||||
return std::string();
|
||||
}
|
||||
|
||||
success = true;
|
||||
return buffer.str();
|
||||
}
|
||||
|
||||
//
|
||||
// Function: ingest_args(...) -> vector<string>
|
||||
//
|
||||
// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
|
||||
// strings, as an STL vector<string>.
|
||||
//
|
||||
// In particular, it handles character encoding shenanigans on Windows.
|
||||
//
|
||||
// Note: raw_argc and raw_argv are not actually read at all on Windows.
|
||||
// On Windows we call GetCommandLineW to get the arguments in wchar_t
|
||||
// format, ignoring the regular argc/argv arguments to main().
|
||||
//
|
||||
// TODO: potential opportunity to roll common stuff into common/console.cpp
|
||||
// in relation to Windows wchar_t shenanigans.
|
||||
static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
|
||||
std::vector<std::string> argv;
|
||||
|
||||
// Handle Windows, if given non-ASCII arguments.
|
||||
// We convert wchar_t arguments into UTF-8 char* on this platform.
|
||||
// Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
|
||||
// without throwing tantrums.
|
||||
#if defined(_WIN32)
|
||||
int argc;
|
||||
const LPWSTR cmdline_wargv = GetCommandLineW();
|
||||
LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc);
|
||||
|
||||
// silence unused arg warnings
|
||||
(void) raw_argc;
|
||||
(void) raw_argv;
|
||||
|
||||
for (int i = 0; i < argc; ++i) {
|
||||
int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL);
|
||||
char * output_buf = (char *) calloc(length_needed+1, sizeof(char));
|
||||
GGML_ASSERT(output_buf);
|
||||
|
||||
WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL);
|
||||
output_buf[length_needed] = '\0';
|
||||
|
||||
argv.push_back(output_buf);
|
||||
free(output_buf);
|
||||
}
|
||||
|
||||
LocalFree((HLOCAL) wargv);
|
||||
#else
|
||||
int argc = raw_argc;
|
||||
for (int i = 0; i < argc; ++i) {
|
||||
argv.push_back(raw_argv[i]);
|
||||
}
|
||||
#endif
|
||||
|
||||
GGML_ASSERT((unsigned int) argc == argv.size());
|
||||
|
||||
return argv;
|
||||
}
|
||||
|
||||
//
|
||||
// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
|
||||
//
|
||||
// writes a string to standard output; taking into account that on Windows
|
||||
// to display correctly you have to use special handling. Works even if the
|
||||
// user has not set a unicode code page on a Windows cmd.exe.
|
||||
//
|
||||
// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something
// human-readable is written instead.
|
||||
//
|
||||
// On non-Windows systems, simply printfs() the string.
|
||||
static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
|
||||
invalid_utf8 = false;
|
||||
|
||||
#if defined(_WIN32)
|
||||
// Are we in a console?
|
||||
HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
|
||||
DWORD dwMode = 0;
|
||||
|
||||
// According to Microsoft docs:
|
||||
// "WriteConsole fails if it is used with a standard handle that is redirected to a file."
|
||||
// Also according to the docs, you can use GetConsoleMode to check for that.
|
||||
if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
|
||||
printf("%s", str);
|
||||
return;
|
||||
}
|
||||
|
||||
// MultiByteToWideChar reports an error if str is empty, don't report
|
||||
// them as invalid_utf8.
|
||||
if (*str == 0) {
|
||||
return;
|
||||
}
|
||||
int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
|
||||
if (length_needed == 0) {
|
||||
DWORD err = GetLastError();
|
||||
if (err == ERROR_NO_UNICODE_TRANSLATION) {
|
||||
invalid_utf8 = true;
|
||||
int len = strlen(str);
|
||||
printf("<");
|
||||
for (int i = 0; i < len; ++i) {
|
||||
if (i > 0) {
|
||||
printf(" ");
|
||||
}
|
||||
printf("%02x", (uint8_t) str[i]);
|
||||
}
|
||||
printf(">");
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
|
||||
}
|
||||
|
||||
LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
|
||||
GGML_ASSERT(wstr);
|
||||
|
||||
MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed);
|
||||
WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL);
|
||||
|
||||
free(wstr);
|
||||
#else
|
||||
// TODO: reporting invalid_utf8 would be useful on non-Windows too.
|
||||
// printf will silently just write bad unicode.
|
||||
printf("%s", str);
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int raw_argc, char ** raw_argv) {
|
||||
const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
|
||||
const int argc = argv.size();
|
||||
|
||||
if (argc <= 1) {
|
||||
print_usage_information(argv[0].c_str(), stderr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const char * model_path = argv[1];
|
||||
const char * prompt = argv[2];
|
||||
//////
|
||||
// Read out all the command line arguments.
|
||||
//////
|
||||
|
||||
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
|
||||
// variables where to put any arguments we see.
|
||||
bool printing_ids = false;
|
||||
bool no_bos = false;
|
||||
bool disable_logging = false;
|
||||
const char * model_path = NULL;
|
||||
const char * prompt_path = NULL;
|
||||
const char * prompt_arg = NULL;
|
||||
|
||||
// track which arguments were explicitly given
|
||||
// used for sanity checking down the line
|
||||
bool model_path_set = false;
|
||||
bool prompt_path_set = false;
|
||||
bool prompt_set = false;
|
||||
bool stdin_set = false;
|
||||
|
||||
int iarg = 1;
|
||||
for (; iarg < argc; ++iarg) {
|
||||
std::string arg{argv[iarg]};
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
print_usage_information(argv[0].c_str(), stdout);
|
||||
return 0;
|
||||
}
|
||||
else if (arg == "--ids") {
|
||||
printing_ids = true;
|
||||
}
|
||||
else if (arg == "-m" || arg == "--model") {
|
||||
if (model_path_set) {
|
||||
fprintf(stderr, "Error: -m or --model specified multiple times.\n");
|
||||
return 1;
|
||||
}
|
||||
model_path = argv[++iarg].c_str();
|
||||
model_path_set = true;
|
||||
}
|
||||
else if (arg == "--no-bos") {
|
||||
no_bos = true;
|
||||
}
|
||||
else if (arg == "-p" || arg == "--prompt") {
|
||||
if (prompt_set) {
|
||||
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
|
||||
return 1;
|
||||
}
|
||||
prompt_arg = argv[++iarg].c_str();
|
||||
prompt_set = true;
|
||||
}
|
||||
else if (arg == "-f" || arg == "--file") {
|
||||
if (prompt_path_set) {
|
||||
fprintf(stderr, "Error: -f or --file specified multiple times.\n");
|
||||
return 1;
|
||||
}
|
||||
prompt_path = argv[++iarg].c_str();
|
||||
prompt_path_set = true;
|
||||
}
|
||||
else if (arg == "--stdin") {
|
||||
stdin_set = true;
|
||||
}
|
||||
else if (arg == "--log-disable") {
|
||||
disable_logging = true;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
//////
|
||||
// Sanity check the command line arguments.
|
||||
//////
|
||||
|
||||
// Check that we have the required stuff set.
|
||||
if (model_path_set && model_path == NULL) {
|
||||
fprintf(stderr, "Error: --model requires an argument.\n");
|
||||
return 1;
|
||||
}
|
||||
if (!model_path_set) {
|
||||
fprintf(stderr, "Error: must specify --model.\n");
|
||||
return 1;
|
||||
}
|
||||
if (prompt_path_set && prompt_path == NULL) {
|
||||
fprintf(stderr, "Error: --file requires an argument.\n");
|
||||
return 1;
|
||||
}
|
||||
if (prompt_set && prompt_arg == NULL) {
|
||||
fprintf(stderr, "Error: --prompt requires an argument.\n");
|
||||
return 1;
|
||||
}
|
||||
const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
|
||||
if (prompts_set > 1) {
|
||||
fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
|
||||
return 1;
|
||||
}
|
||||
// Must have some prompt.
|
||||
if (prompts_set == 0) {
|
||||
fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
GGML_ASSERT(model_path);
|
||||
GGML_ASSERT(prompt_path || prompt_arg || stdin_set);
|
||||
|
||||
//////
|
||||
// Figure out where will the prompt come from.
|
||||
//////
|
||||
|
||||
std::string prompt;
|
||||
if (prompt_path_set) {
|
||||
bool success = false;
|
||||
prompt = read_prompt_from_file(prompt_path, success);
|
||||
if (!success) {
|
||||
return 1;
|
||||
}
|
||||
} else if (prompt_set) {
|
||||
prompt = prompt_arg;
|
||||
} else {
|
||||
GGML_ASSERT(stdin_set);
|
||||
// we read stdin *after* loading model (early exit if model cannot
|
||||
// be loaded, which can be a nicer user experience)
|
||||
}
|
||||
|
||||
//////
|
||||
// Start actually doing the tokenizing stuff.
|
||||
//////
|
||||
|
||||
#ifdef LOG_DISABLE_LOGS
|
||||
disable_logging = true;
|
||||
#endif
|
||||
|
||||
if (disable_logging) {
|
||||
llama_log_set(llama_log_callback_null, NULL);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.vocab_only = true;
|
||||
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
||||
if (!model) {
|
||||
fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
if (!ctx) {
|
||||
fprintf(stderr, "Error: could not create context.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// read entire prompt from stdin?
|
||||
if (stdin_set) {
|
||||
GGML_ASSERT(!prompt_path_set && !prompt_set);
|
||||
|
||||
std::stringstream stdin_buffer;
|
||||
stdin_buffer << std::cin.rdbuf();
|
||||
if (std::cin.fail()) {
|
||||
fprintf(stderr, "Error: could not read the entire standard input.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
prompt = stdin_buffer.str();
|
||||
}
|
||||
|
||||
const bool model_wants_add_bos = llama_should_add_bos_token(model);
|
||||
const bool add_bos = model_wants_add_bos && !no_bos;
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||
|
||||
tokens = ::llama_tokenize(model, prompt, true, true);
|
||||
if (printing_ids) {
|
||||
printf("[");
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) tokens.size(); i++) {
|
||||
if (printing_ids) {
|
||||
printf("%d\n", tokens[i]);
|
||||
if (i > 0) {
|
||||
printf(", ");
|
||||
}
|
||||
printf("%d", tokens[i]);
|
||||
} else {
|
||||
printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
|
||||
bool invalid_utf8 = false;
|
||||
printf("%6d -> '", tokens[i]);
|
||||
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
|
||||
if (invalid_utf8) {
|
||||
printf("' (utf-8 decode failure)\n");
|
||||
} else {
|
||||
printf("'\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (printing_ids) {
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
// silence valgrind
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
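
As a rough usage sketch for the rewritten tokenizer example above — the binary name and model path are assumptions that depend on how and where the examples were built, while the flags are the ones defined in main() above:

    # print each token ID together with its text piece
    ./tokenize -m ./models/model.gguf -p "Hello world"
    # print only numerical token IDs, as a Python-style list, reading the prompt from stdin
    ./tokenize -m ./models/model.gguf --stdin --ids < prompt.txt
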
|
||||
|
||||
@@ -341,7 +341,8 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd/n_head, n_head, n_batch);
|
||||
struct ggml_tensor * t16;
|
||||
if (enable_flash_attn) {
|
||||
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
|
||||
GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
|
||||
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd/n_head, N, n_head, n_batch);
|
||||
} else {
|
||||
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
|
||||
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
|
||||
|
||||
flake.lock (generated; 12 changed lines)
@@ -5,11 +5,11 @@
|
||||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1714641030,
|
||||
"narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
|
||||
"lastModified": 1715865404,
|
||||
"narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
|
||||
"rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
@@ -20,11 +20,11 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1714635257,
|
||||
"narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
|
||||
"lastModified": 1716509168,
|
||||
"narHash": "sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
|
||||
"rev": "bfb7a882678e518398ce9a31a881538679f6f092",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
||||
@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
|
||||
// QK = number of values after dequantization
|
||||
// QK_K = super-block size
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
#define QK_K 64
|
||||
#define K_SCALE_SIZE 4
|
||||
#else
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
#endif // GGML_QKK_64
|
||||
|
||||
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
|
||||
// QR = QK / number of values before dequantization
|
||||
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
|
||||
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
||||
#define QR4_NL 2
|
||||
|
||||
#if QK_K == 64
|
||||
#define QI4_XS QI4_NL
|
||||
#define QR4_XS QR4_NL
|
||||
#else
|
||||
#define QI4_XS (QK_K / (4*QR4_XS))
|
||||
#define QR4_XS 8
|
||||
#endif
|
||||
|
||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
||||
|
||||
@@ -228,15 +218,6 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
|
||||
// weight is represented as x = a * q
|
||||
// 16 blocks of 16 elements each
|
||||
// Effectively 3.4375 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
uint8_t scales[2];
|
||||
ggml_half d; // super-block scale
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
uint8_t hmask[QK_K/8]; // quants - high bit
|
||||
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
||||
@@ -244,20 +225,11 @@ typedef struct {
|
||||
ggml_half d; // super-block scale
|
||||
} block_q3_K;
|
||||
static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 4-bit quantization
|
||||
// 8 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 4.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
ggml_half d[2]; // super-block scales/mins
|
||||
uint8_t scales[2]; // 4-bit block scales/mins
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
union {
|
||||
struct {
|
||||
@@ -270,21 +242,11 @@ typedef struct {
|
||||
uint8_t qs[QK_K/2]; // 4--bit quants
|
||||
} block_q4_K;
|
||||
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
||||
#endif
|
||||
|
||||
// 5-bit quantization
|
||||
// 8 blocks of 32 elements each
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 5.5 bits per weight
|
||||
#ifdef GGML_QKK_64
|
||||
typedef struct {
|
||||
ggml_half d; // super-block scale
|
||||
int8_t scales[QK_K/16]; // 8-bit block scales
|
||||
uint8_t qh[QK_K/8]; // quants, high bit
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
||||
#else
|
||||
typedef struct {
|
||||
union {
|
||||
struct {
|
||||
@@ -298,7 +260,6 @@ typedef struct {
|
||||
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
||||
} block_q5_K;
|
||||
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
||||
#endif
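
A quick way to sanity-check the "effectively N bits per weight" comments above is to recompute them from the struct layouts; the sketch below assumes the QK_K == 256 variants shown in this hunk and a 2-byte ggml_half:

    // standalone check of the super-block sizes quoted above (not part of ggml-common.h)
    #include <cstdio>

    int main() {
        const int QK_K = 256, K_SCALE_SIZE = 12, half = 2;          // sizes in bytes
        const int q3_K = half   + QK_K/4 + QK_K/8 + 12;             // 110 bytes
        const int q4_K = 2*half + K_SCALE_SIZE + QK_K/2;            // 144 bytes
        const int q5_K = 2*half + K_SCALE_SIZE + QK_K/2 + QK_K/8;   // 176 bytes
        printf("q3_K %.4f  q4_K %.1f  q5_K %.1f bits per weight\n",
               8.0*q3_K/QK_K, 8.0*q4_K/QK_K, 8.0*q5_K/QK_K);        // 3.4375, 4.5, 5.5
        return 0;
    }
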
|
||||
|
||||
// 6-bit quantization
|
||||
// weight is represented as x = a * q
|
||||
@@ -356,11 +317,7 @@ typedef struct {
|
||||
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
||||
|
||||
// 3.4375 bpw
|
||||
#if QK_K == 64
|
||||
#define IQ3S_N_SCALE 2
|
||||
#else
|
||||
#define IQ3S_N_SCALE QK_K/64
|
||||
#endif
|
||||
typedef struct {
|
||||
ggml_half d;
|
||||
uint8_t qs[QK_K/4];
|
||||
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
|
||||
typedef struct {
|
||||
uint8_t qs[QK_K/8]; // grid index, low 8 bits
|
||||
uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
|
||||
#if QK_K == 64
|
||||
ggml_half d;
|
||||
#endif
|
||||
uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
|
||||
} block_iq1_m;
|
||||
#if QK_K == 64
|
||||
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
|
||||
#else
|
||||
static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
|
||||
#endif
|
||||
|
||||
// Used by IQ1_M quants
|
||||
typedef union {
|
||||
@@ -406,9 +356,6 @@ typedef struct {
|
||||
} block_iq4_nl;
|
||||
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
||||
|
||||
#if QK_K == 64
|
||||
#define block_iq4_xs block_iq4_nl
|
||||
#else
|
||||
typedef struct {
|
||||
ggml_half d;
|
||||
uint16_t scales_h;
|
||||
@@ -416,7 +363,6 @@ typedef struct {
|
||||
uint8_t qs[QK_K/2];
|
||||
} block_iq4_xs;
|
||||
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
||||
#endif
|
||||
|
||||
#endif // GGML_COMMON_DECL
|
||||
#endif // GGML_COMMON_DECL
|
||||
|
||||
ggml-cuda.cu (33 changed lines)
@@ -119,6 +119,20 @@ int ggml_cuda_get_device() {
|
||||
return id;
|
||||
}
|
||||
|
||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||
ggml_cuda_set_device(device);
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
|
||||
auto res = hipMallocManaged(ptr, size);
|
||||
if (res == hipSuccess) {
|
||||
// if error we "need" to know why...
|
||||
CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
||||
}
|
||||
return res;
|
||||
#else
|
||||
return cudaMalloc(ptr, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
@@ -271,7 +285,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
ggml_cuda_set_device(device);
|
||||
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
||||
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
@@ -537,7 +551,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
|
||||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||
|
||||
void * dev_ptr;
|
||||
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
||||
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
cudaGetLastError();
|
||||
@@ -798,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
|
||||
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||
ggml_cuda_set_device(id);
|
||||
char * buf;
|
||||
CUDA_CHECK(cudaMalloc(&buf, size));
|
||||
CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
|
||||
|
||||
// set padding to 0 to avoid possible NaN values
|
||||
if (size > original_size) {
|
||||
@@ -2510,9 +2524,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
|
||||
bool use_cuda_graph = true;
|
||||
bool cuda_graph_update_required = false;
|
||||
// pointer to CUDA cpy kernel, which is required to identify
|
||||
// vector of pointers to CUDA cpy kernels, which are required to identify
|
||||
// kernel parameters which need updated in the graph for each token
|
||||
void * ggml_cuda_cpy_fn_ptr = nullptr;
|
||||
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
|
||||
|
||||
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
||||
@@ -2588,9 +2602,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
if (node->op == GGML_OP_CPY) {
|
||||
// store the copy op parameter which changes with each token.
|
||||
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||
if (ggml_cuda_cpy_fn_ptr == nullptr) {
|
||||
// store a pointer to the copy op CUDA kernel to identify it later
|
||||
ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
// store a pointer to each copy op CUDA kernel to identify it later
|
||||
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2720,7 +2735,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||
if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
|
||||
int k = 0;
|
||||
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||
if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
|
||||
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
|
||||
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
|
||||
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
|
||||
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
|
||||
|
||||
@@ -79,13 +79,8 @@
|
||||
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
||||
#define cudaHostUnregister hipHostUnregister
|
||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||
#ifdef GGML_HIP_UMA
|
||||
#define cudaMalloc hipMallocManaged
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
|
||||
#else
|
||||
#define cudaMalloc hipMalloc
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||
#endif
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpyAsync hipMemcpyAsync
|
||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||
|
||||
@@ -131,7 +131,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||
const block_q2_K * x = (const block_q2_K *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t n = tid/32;
|
||||
const int64_t l = tid - 32*n;
|
||||
const int64_t is = 8*n + l/16;
|
||||
@@ -145,17 +144,6 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
|
||||
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||
y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
|
||||
#else
|
||||
const int64_t is = tid/16; // 0 or 1
|
||||
const int64_t il = tid%16; // 0...15
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
||||
float dall = __low2half(x[i].dm);
|
||||
float dmin = __high2half(x[i].dm);
|
||||
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -164,7 +152,6 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
|
||||
const int64_t i = blockIdx.x;
|
||||
const block_q3_K * x = (const block_q3_K *) vx;
|
||||
|
||||
#if QK_K == 256
|
||||
const int64_t r = threadIdx.x/4;
|
||||
const int64_t tid = r/2;
|
||||
const int64_t is0 = r%2;
|
||||
@@ -188,31 +175,8 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t
|
||||
const uint8_t * hm = x[i].hmask;
|
||||
|
||||
for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
|
||||
#else
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t is = tid/16; // 0 or 1
|
||||
const int64_t il = tid%16; // 0...15
|
||||
const int64_t im = il/8; // 0...1
|
||||
const int64_t in = il%8; // 0...7
|
||||
|
||||
dst_t * y = yy + i*QK_K + 16*is + il;
|
||||
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
const uint8_t h = x[i].hmask[in] >> (2*is + im);
|
||||
const float d = (float)x[i].d;
|
||||
|
||||
if (is == 0) {
|
||||
y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
||||
y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
||||
} else {
|
||||
y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
|
||||
y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
|
||||
if (j < 4) {
|
||||
d = q[j] & 63; m = q[j + 4] & 63;
|
||||
@@ -221,7 +185,6 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
|
||||
m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
@@ -229,7 +192,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 32 threads
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/8;
|
||||
@@ -253,15 +215,6 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
|
||||
y[l + 0] = d1 * (q[l] & 0xF) - m1;
|
||||
y[l +32] = d2 * (q[l] >> 4) - m2;
|
||||
}
|
||||
#else
|
||||
const int64_t tid = threadIdx.x;
|
||||
const uint8_t * q = x[i].qs;
|
||||
dst_t * y = yy + i*QK_K;
|
||||
const float d = (float)x[i].dm[0];
|
||||
const float m = (float)x[i].dm[1];
|
||||
y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
|
||||
y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -270,7 +223,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
|
||||
#if QK_K == 256
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t il = tid/16; // il is in 0...3
|
||||
@@ -297,18 +249,6 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
|
||||
hm <<= 1;
|
||||
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
|
||||
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
|
||||
#else
|
||||
const int64_t tid = threadIdx.x;
|
||||
const uint8_t q = x[i].qs[tid];
|
||||
const int64_t im = tid/8; // 0...3
|
||||
const int64_t in = tid%8; // 0...7
|
||||
const int64_t is = tid/16; // 0 or 1
|
||||
const uint8_t h = x[i].qh[in] >> im;
|
||||
const float d = x[i].d;
|
||||
dst_t * y = yy + i*QK_K + tid;
|
||||
y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
|
||||
y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16));
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -316,7 +256,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
||||
const block_q6_K * x = (const block_q6_K *) vx;
|
||||
|
||||
const int64_t i = blockIdx.x;
|
||||
#if QK_K == 256
|
||||
|
||||
// assume 64 threads - this is very slightly better than the one below
|
||||
const int64_t tid = threadIdx.x;
|
||||
@@ -336,24 +275,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
|
||||
y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
|
||||
y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
||||
#else
|
||||
|
||||
// assume 32 threads
|
||||
const int64_t tid = threadIdx.x;
|
||||
const int64_t ip = tid/16; // 0 or 1
|
||||
const int64_t il = tid - 16*ip; // 0...15
|
||||
|
||||
dst_t * y = yy + i*QK_K + 16*ip + il;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
const uint8_t ql = x[i].ql[16*ip + il];
|
||||
const uint8_t qh = x[i].qh[il] >> (2*ip);
|
||||
const int8_t * sc = x[i].scales;
|
||||
|
||||
y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
|
||||
y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -363,7 +284,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
||||
const block_iq2_xxs * x = (const block_iq2_xxs *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -374,10 +294,6 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
|
||||
const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -387,7 +303,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||
const block_iq2_xs * x = (const block_iq2_xs *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -396,10 +311,6 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = ksigns_iq2xs[q2[il] >> 9];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -409,7 +320,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
||||
const block_iq2_s * x = (const block_iq2_s *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -417,10 +327,6 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
|
||||
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
|
||||
const uint8_t signs = x[i].qs[QK_K/8+4*ib+il];
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -430,7 +336,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||
const block_iq3_xxs * x = (const block_iq3_xxs *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -445,10 +350,6 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -458,7 +359,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
||||
const block_iq3_s * x = (const block_iq3_s *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -471,10 +371,6 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
|
||||
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
||||
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -484,7 +380,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
||||
const block_iq1_s * x = (const block_iq1_s *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -497,10 +392,6 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -510,7 +401,6 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
||||
const block_iq1_m * x = (const block_iq1_m *) vx;
|
||||
|
||||
const int64_t tid = threadIdx.x;
|
||||
#if QK_K == 256
|
||||
const int64_t il = tid/8; // 0...3
|
||||
const int64_t ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
@@ -527,13 +417,8 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * (q[j] + delta);
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
|
||||
@@ -550,10 +435,8 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
|
||||
y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf];
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#if QK_K != 64
|
||||
template<typename dst_t>
|
||||
static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
|
||||
const int64_t i = blockIdx.x;
|
||||
@@ -570,7 +453,6 @@ static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst
|
||||
y[j+16] = d * kvalues_iq4nl[q4[j] >> 4];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) {
|
||||
@@ -592,21 +474,13 @@ static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half *
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q2_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q3_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -632,21 +506,13 @@ static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q5_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = k / QK_K;
|
||||
#if QK_K == 256
|
||||
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename dst_t>
|
||||
@@ -700,11 +566,7 @@ static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t
|
||||
template<typename dst_t>
|
||||
static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
|
||||
const int nb = (k + QK_K - 1) / QK_K;
|
||||
#if QK_K == 64
|
||||
dequantize_block_iq4_nl<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#else
|
||||
dequantize_block_iq4_xs<<<nb, 32, 0, stream>>>(vx, y);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
|
||||
@@ -22,7 +22,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
||||
|
||||
@@ -71,37 +70,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||
tmp += dall * sum1 - dmin * sum2;
|
||||
|
||||
}
|
||||
#else
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
||||
const int offset = tid * K_QUANTS_PER_ITERATION;
|
||||
|
||||
uint32_t uaux[2];
|
||||
const uint8_t * d = (const uint8_t *)uaux;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + offset;
|
||||
const uint8_t * q = x[i].qs + offset;
|
||||
const uint32_t * s = (const uint32_t *)x[i].scales;
|
||||
|
||||
uaux[0] = s[0] & 0x0f0f0f0f;
|
||||
uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
|
||||
|
||||
const float2 dall = __half22float2(x[i].dm);
|
||||
|
||||
float sum1 = 0, sum2 = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
const uint8_t ql = q[l];
|
||||
sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
|
||||
+ y[l+16] * d[1] * ((ql >> 2) & 3)
|
||||
+ y[l+32] * d[2] * ((ql >> 4) & 3)
|
||||
+ y[l+48] * d[3] * ((ql >> 6) & 3);
|
||||
sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
|
||||
}
|
||||
tmp += dall.x * sum1 - dall.y * sum2;
|
||||
}
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
@@ -123,8 +91,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const uint16_t kmask1 = 0x0303;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
|
||||
@@ -175,34 +141,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
||||
tmp += d * sum;
|
||||
|
||||
}
|
||||
#else
|
||||
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3
|
||||
const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14
|
||||
const int in = offset/8; // 0 or 1
|
||||
const int im = offset%8; // 0...7
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + offset;
|
||||
const uint8_t * q = x[i].qs + offset;
|
||||
const uint8_t * s = x[i].scales;
|
||||
|
||||
const float dall = (float)x[i].d;
|
||||
|
||||
float sum = 0;
|
||||
for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
|
||||
const uint8_t hl = x[i].hmask[im+l] >> in;
|
||||
const uint8_t ql = q[l];
|
||||
sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
|
||||
+ y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
|
||||
+ y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
|
||||
+ y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
|
||||
}
|
||||
tmp += sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
@@ -221,7 +159,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||
|
||||
const block_q4_K * x = (const block_q4_K *)vx + ib0;
|
||||
|
||||
#if QK_K == 256
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
@@ -306,36 +243,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||
#endif
|
||||
|
||||
}
|
||||
#else
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
||||
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
|
||||
uint16_t aux16[2];
|
||||
const uint8_t * s = (const uint8_t *)aux16;
|
||||
|
||||
float tmp = 0;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
const uint8_t * q = x[i].qs + step;
|
||||
const float * y = yy + i*QK_K + step;
|
||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||
aux16[0] = a[0] & 0x0f0f;
|
||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||
const float d = (float)x[i].dm[0];
|
||||
const float m = (float)x[i].dm[1];
|
||||
float sum = 0.f;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
|
||||
+ y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
|
||||
+ y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3])
|
||||
+ y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]);
|
||||
}
|
||||
tmp += sum;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
@@ -355,7 +262,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
#if QK_K == 256
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
const uint16_t kmask2 = 0x0f0f;
|
||||
const uint16_t kmask3 = 0xc0c0;
|
||||
@@ -426,30 +332,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
||||
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
||||
}
|
||||
|
||||
#else
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
const int im = step/8;
|
||||
const int in = step%8;
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
const uint8_t * q = x[i].qs + step;
|
||||
const int8_t * s = x[i].scales;
|
||||
const float * y = yy + i*QK_K + step;
|
||||
const float d = x[i].d;
|
||||
float sum = 0.f;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
const uint8_t h = x[i].qh[in+j] >> im;
|
||||
sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
|
||||
+ y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
|
||||
+ y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16))
|
||||
+ y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16));
|
||||
}
|
||||
tmp += sum;
|
||||
}
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
|
||||
@@ -470,8 +352,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
|
||||
const block_q6_K * x = (const block_q6_K *)vx + ib0;
|
||||
|
||||
#if QK_K == 256
|
||||
|
||||
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
||||
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
|
||||
|
||||
@@ -526,37 +406,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7
|
||||
const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3
|
||||
|
||||
const int step = tid * K_QUANTS_PER_ITERATION;
|
||||
|
||||
float tmp = 0; // partial sum for thread in warp
|
||||
|
||||
for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
|
||||
|
||||
const float * y = yy + i * QK_K + step;
|
||||
const uint8_t * ql = x[i].ql + step;
|
||||
const uint8_t * qh = x[i].qh + step;
|
||||
const int8_t * s = x[i].scales;
|
||||
|
||||
const float d = x[i+0].d;
|
||||
|
||||
float sum = 0;
|
||||
for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
|
||||
sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
|
||||
+ y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
|
||||
+ y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32)
|
||||
+ y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32);
|
||||
}
|
||||
tmp += sum;
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
|
||||
|
||||
@@ -826,11 +826,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
|
||||
const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||
|
||||
#if QK_K == 256
|
||||
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
|
||||
#else
|
||||
x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
|
||||
#endif
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
@@ -933,9 +929,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||
|
||||
const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
|
||||
|
||||
#if QK_K == 256
|
||||
x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
|
||||
#endif
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
|
||||
@@ -712,7 +712,6 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
||||
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
|
||||
#ifndef GGML_QKK_64
|
||||
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
||||
|
||||
int v[2];
|
||||
@@ -754,58 +753,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||
}
|
||||
|
||||
return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
|
||||
|
||||
#else
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
|
||||
|
||||
float sumf_d = 0.0f;
|
||||
float sumf_m = 0.0f;
|
||||
|
||||
uint16_t aux16[2];
|
||||
const uint8_t * s = (const uint8_t *)aux16;
|
||||
|
||||
const uint16_t * a = (const uint16_t *)bq4_K->scales;
|
||||
aux16[0] = a[0] & 0x0f0f;
|
||||
aux16[1] = (a[0] >> 4) & 0x0f0f;
|
||||
|
||||
const float dall = bq4_K->dm[0];
|
||||
const float dmin = bq4_K->dm[1];
|
||||
|
||||
const float d8_1 = __low2float(bq8_1[0].ds);
|
||||
const float d8_2 = __low2float(bq8_1[1].ds);
|
||||
|
||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
||||
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
||||
|
||||
const int * q4 = (const int *)bq4_K->qs + (iqs/2);
|
||||
const int v1 = q4[0];
|
||||
const int v2 = q4[4];
|
||||
|
||||
const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
|
||||
const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
|
||||
const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
|
||||
const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
|
||||
|
||||
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
|
||||
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
|
||||
|
||||
return dall * sumf_d - dmin * sumf_m;
|
||||
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
|
||||
#ifndef GGML_QKK_64
|
||||
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
||||
|
||||
int vl[2];
|
||||
@@ -847,48 +799,6 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
||||
}
|
||||
|
||||
return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
|
||||
|
||||
#else
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const block_q5_K * bq5_K = (const block_q5_K *) vbq;
|
||||
|
||||
const int8_t * s = bq5_K->scales;
|
||||
|
||||
const float d = bq5_K->d;
|
||||
|
||||
const float d8_1 = __low2half(bq8_1[0].ds);
|
||||
const float d8_2 = __low2half(bq8_1[1].ds);
|
||||
|
||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||
const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
|
||||
const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
|
||||
|
||||
const int * ql = (const int *)bq5_K->qs + (iqs/2);
|
||||
const int vl1 = ql[0];
|
||||
const int vl2 = ql[4];
|
||||
|
||||
const int step = 4 * (iqs/2); // 0, 4, 8, 12
const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
const int in = step%8; // 0, 4, 0, 4
const int vh = (*((const int *)(bq5_K->qh + in))) >> im;

const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);

const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
                   + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);

return d * sumf_d;

#else
NO_DEVICE_CODE;
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A

#endif
}

static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
@@ -919,7 +829,6 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(

static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if QK_K == 256
const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq;

#if QR2_XXS == 8
@@ -960,15 +869,11 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
}
return d * (sumi1 + sumi2);
#endif
#else
NO_DEVICE_CODE;
#endif
}

static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq;

const int ib32 = iqs;
@@ -1002,17 +907,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
GGML_UNUSED(ksigns64);
NO_DEVICE_CODE;
#endif
#else
GGML_UNUSED(ksigns64);
NO_DEVICE_CODE;
#endif
}

// TODO
static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
const block_iq2_s * bq2 = (const block_iq2_s *) vbq;

const int ib32 = iqs;
@@ -1048,16 +948,11 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1(
GGML_UNUSED(ksigns64);
NO_DEVICE_CODE;
#endif
#else
GGML_UNUSED(ksigns64);
NO_DEVICE_CODE;
#endif
}

static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq;

const int ib32 = iqs;
@@ -1082,16 +977,12 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
#else
NO_DEVICE_CODE;
#endif
#else
NO_DEVICE_CODE;
#endif
}

// TODO: don't use lookup table for signs
static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
#if QK_K == 256
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;

const int ib32 = iqs;
@@ -1114,14 +1005,10 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
#else
NO_DEVICE_CODE;
#endif
#else
NO_DEVICE_CODE;
#endif
}

static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if QK_K == 256
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;

const int ib32 = iqs;
@@ -1149,14 +1036,10 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
const float d = d1q * __low2float (bq8_1[ib32].ds);
const float m = d1q * __high2float(bq8_1[ib32].ds);
return d * sumi + m * delta;
#else
NO_DEVICE_CODE;
#endif
}

static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
#if QK_K == 256
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;

const int ib32 = iqs;
@@ -1192,9 +1075,6 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds);
return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1));
#else
NO_DEVICE_CODE;
#endif
}

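Note on the iq1_m block above: each of the four 16-bit sc[] words keeps four bits of the shared scale in its top nibble, and the scale.u16 = ... expression stitches those nibbles into one 16-bit (fp16) value. A scalar C sketch of that bit gathering, for reference only; the helper name is ours, not from the code base:

#include <stdint.h>

// Gather the top nibble of each 16-bit scale word into one 16-bit value,
// mirroring the scale.u16 = ... expression used by the iq1_m kernels above.
static uint16_t iq1m_gather_scale(const uint16_t sc[4]) {
    return (uint16_t)(( sc[0] >> 12)             /* bits  0..3  */
                    | ((sc[1] >>  8) & 0x00f0)   /* bits  4..7  */
                    | ((sc[2] >>  4) & 0x0f00)   /* bits  8..11 */
                    | ( sc[3]        & 0xf000)); /* bits 12..15 */
}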
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -1250,9 +1130,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1(
static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

#if QK_K == 256
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics

const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq;
const uint8_t * values = (const uint8_t *)kvalues_iq4nl;

@@ -1270,10 +1148,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1(
sumi2 = __dp4a(v2, q8[j+4], sumi2);
}
return d * (sumi1 + sumi2);

#else
NO_DEVICE_CODE;
#endif
#else
return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs);
#endif

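The vec_dot_* kernels above all funnel into the __dp4a integer intrinsic (hence the MIN_CC_DP4A guards): it multiplies four packed 8-bit lanes pairwise and adds the results onto an accumulator. A scalar C sketch of the signed variant, as a reference only; dp4a_ref is an illustrative name, not part of the code base:

#include <stdint.h>

// Scalar reference for the signed __dp4a(a, b, c): treat a and b as four
// packed signed 8-bit lanes, multiply lane-wise and accumulate onto c.
static int32_t dp4a_ref(int32_t a, int32_t b, int32_t c) {
    int32_t sum = c;
    for (int i = 0; i < 4; ++i) {
        const int8_t ai = (int8_t)((a >> (8*i)) & 0xff);
        const int8_t bi = (int8_t)((b >> (8*i)) & 0xff);
        sum += (int32_t)ai * (int32_t)bi;
    }
    return sum;
}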
@@ -144,6 +144,10 @@ extern "C" {
#endif
#endif

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#endif

// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t

85 ggml-metal.m
@@ -35,6 +35,10 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_ROW,
GGML_METAL_KERNEL_TYPE_DIV,
GGML_METAL_KERNEL_TYPE_DIV_ROW,
GGML_METAL_KERNEL_TYPE_REPEAT_F32,
GGML_METAL_KERNEL_TYPE_REPEAT_F16,
GGML_METAL_KERNEL_TYPE_REPEAT_I32,
GGML_METAL_KERNEL_TYPE_REPEAT_I16,
GGML_METAL_KERNEL_TYPE_SCALE,
GGML_METAL_KERNEL_TYPE_SCALE_4,
GGML_METAL_KERNEL_TYPE_CLAMP,
@@ -184,9 +188,9 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
@@ -381,10 +385,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
// dictionary of preprocessor macros
NSMutableDictionary * prep = [NSMutableDictionary dictionary];

#ifdef GGML_QKK_64
prep[@"GGML_QKK_64"] = @(1);
#endif

MTLCompileOptions* options = [MTLCompileOptions new];
options.preprocessorMacros = prep;

@@ -489,6 +489,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I16, repeat_i16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
@@ -638,9 +642,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
@@ -750,6 +754,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_ACC:
case GGML_OP_MUL:
case GGML_OP_DIV:
case GGML_OP_REPEAT:
case GGML_OP_SCALE:
case GGML_OP_CLAMP:
case GGML_OP_SQR:
@@ -774,6 +779,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_LEAKY_RELU:
return true;
case GGML_OP_FLASH_ATTN_EXT:
if (op->src[0]->ne[0] == 256) {
return false;
}
return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
@@ -980,8 +988,6 @@ static enum ggml_status ggml_metal_graph_compute(
switch (dst->op) {
case GGML_OP_CONCAT:
{
const int64_t nb = ne00;

id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;

[encoder setComputePipelineState:pipeline];
@@ -1012,7 +1018,6 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];

const int nth = MIN(1024, ne0);

@@ -1022,11 +1027,14 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_MUL:
case GGML_OP_DIV:
{
GGML_ASSERT(src0t == GGML_TYPE_F32);
GGML_ASSERT(src1t == GGML_TYPE_F32);

const size_t offs = 0;

bool bcast_row = false;

int64_t nb = ne00;
int64_t nb = ne00; // used by the "row" kernels

id<MTLComputePipelineState> pipeline = nil;

@@ -1095,6 +1103,42 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
}
} break;
case GGML_OP_REPEAT:
{
id<MTLComputePipelineState> pipeline;

switch (src0t) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
default: GGML_ASSERT(false);
}

[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];

const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);

[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ACC:
{
GGML_ASSERT(src0t == GGML_TYPE_F32);
@@ -1773,11 +1817,7 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif
}
else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -2018,12 +2058,7 @@ static enum ggml_status ggml_metal_graph_compute(
{
nth0 = 4;
nth1 = 16;
#if QK_K == 64
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline;
#else
pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline;
#endif

} break;
default:
{
@@ -2088,11 +2123,7 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
}
else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif
}
else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -2590,7 +2621,7 @@ static enum ggml_status ggml_metal_graph_compute(
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
default:
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
@@ -2603,7 +2634,7 @@ static enum ggml_status ggml_metal_graph_compute(

switch (ne00) {
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
default:
{
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);

449 ggml-metal.metal
@@ -168,6 +168,53 @@ kernel void kernel_div(
}
}

template<typename T>
kernel void kernel_repeat(
device const char * src0,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i3 = tgpig.z;
const int64_t i2 = tgpig.y;
const int64_t i1 = tgpig.x;

const int64_t i03 = i3 % ne03;
const int64_t i02 = i2 % ne02;
const int64_t i01 = i1 % ne01;

device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
device char * dst_ptr = dst + i3*nb3 + i2*nb2 + i1*nb1 ;

for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
const int i00 = i0 % ne00;
*((device T *)(dst_ptr + i0*nb0)) = *((device T *)(src0_ptr + i00*nb00));
}
}

typedef decltype(kernel_repeat<float>) kernel_repeat_t;

template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat<half>;
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;

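A plain C sketch of the indexing scheme kernel_repeat uses: every destination coordinate is mapped back to the source by taking it modulo the source extent, so the source tensor is tiled as many times as needed. The function name and calling convention here are illustrative; ne*/nb* follow the usual ggml meaning of extents and byte strides, with dimension 0 innermost.

#include <stdint.h>
#include <string.h>

// CPU reference for broadcast-style repeat over 4 dimensions, mirroring the
// "i % ne" modulo indexing of kernel_repeat above.
static void repeat_ref(const char * src, char * dst,
                       const int64_t  ne_src[4], const uint64_t nb_src[4],
                       const int64_t  ne_dst[4], const uint64_t nb_dst[4],
                       size_t type_size) {
    for (int64_t i3 = 0; i3 < ne_dst[3]; ++i3)
    for (int64_t i2 = 0; i2 < ne_dst[2]; ++i2)
    for (int64_t i1 = 0; i1 < ne_dst[1]; ++i1)
    for (int64_t i0 = 0; i0 < ne_dst[0]; ++i0) {
        const char * s = src + (i3 % ne_src[3])*nb_src[3] + (i2 % ne_src[2])*nb_src[2]
                             + (i1 % ne_src[1])*nb_src[1] + (i0 % ne_src[0])*nb_src[0];
        char       * d = dst +  i3*nb_dst[3] + i2*nb_dst[2] + i1*nb_dst[1] + i0*nb_dst[0];
        memcpy(d, s, type_size); // copy one element of the (possibly tiled) source
    }
}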
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_add_row(
@@ -2418,7 +2465,7 @@ template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>;
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>;
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>;
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
//template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;

template<int64_t D, int64_t Q = 1, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
kernel void kernel_flash_attn_ext_vec_f16(
@@ -2696,7 +2743,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
}

template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>;
template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
//template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;

kernel void kernel_cpy_f16_f16(
device const half * src0,
@@ -3386,7 +3433,6 @@ void kernel_mul_mv_q2_K_f32_impl(
|
||||
|
||||
const int step = sizeof(block_q2_K) * nb;
|
||||
|
||||
#if QK_K == 256
|
||||
const int ix = tiisg/8; // 0...3
|
||||
const int it = tiisg%8; // 0...7
|
||||
const int iq = it/4; // 0 or 1
|
||||
@@ -3438,57 +3484,6 @@ void kernel_mul_mv_q2_K_f32_impl(
|
||||
|
||||
y4 += 4 * QK_K;
|
||||
}
|
||||
#else
|
||||
const int ix = tiisg/2; // 0...15
|
||||
const int it = tiisg%2; // 0...1
|
||||
|
||||
device const float * y4 = y + ix * QK_K + 8 * it;
|
||||
|
||||
for (int ib = ix; ib < nb; ib += 16) {
|
||||
|
||||
float4 sumy = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0];
|
||||
yl[i+ 8] = y4[i+16]; sumy[1] += yl[i+ 8];
|
||||
yl[i+16] = y4[i+32]; sumy[2] += yl[i+16];
|
||||
yl[i+24] = y4[i+48]; sumy[3] += yl[i+24];
|
||||
}
|
||||
|
||||
device const uint8_t * sc = (device const uint8_t *)x[ib].scales;
|
||||
device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
|
||||
device const half * dh = &x[ib].d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
float4 acc1 = {0.f, 0.f, 0.f, 0.f};
|
||||
float4 acc2 = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
acc1[0] += yl[i+ 0] * (qs[i/2] & 0x0003);
|
||||
acc2[0] += yl[i+ 1] * (qs[i/2] & 0x0300);
|
||||
acc1[1] += yl[i+ 8] * (qs[i/2] & 0x000c);
|
||||
acc2[1] += yl[i+ 9] * (qs[i/2] & 0x0c00);
|
||||
acc1[2] += yl[i+16] * (qs[i/2] & 0x0030);
|
||||
acc2[2] += yl[i+17] * (qs[i/2] & 0x3000);
|
||||
acc1[3] += yl[i+24] * (qs[i/2] & 0x00c0);
|
||||
acc2[3] += yl[i+25] * (qs[i/2] & 0xc000);
|
||||
}
|
||||
|
||||
float dall = dh[0];
|
||||
float dmin = dh[1];
|
||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc2[0]) * (sc[0] & 0xF) * 1.f/ 1.f +
|
||||
(acc1[1] + 1.f/256.f * acc2[1]) * (sc[1] & 0xF) * 1.f/ 4.f +
|
||||
(acc1[2] + 1.f/256.f * acc2[2]) * (sc[2] & 0xF) * 1.f/16.f +
|
||||
(acc1[3] + 1.f/256.f * acc2[3]) * (sc[3] & 0xF) * 1.f/64.f) -
|
||||
dmin * (sumy[0] * (sc[0] >> 4) + sumy[1] * (sc[1] >> 4) + sumy[2] * (sc[2] >> 4) + sumy[3] * (sc[3] >> 4));
|
||||
|
||||
qs += step/2;
|
||||
sc += step;
|
||||
dh += step/2;
|
||||
}
|
||||
|
||||
y4 += 16 * QK_K;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
@@ -3526,7 +3521,6 @@ kernel void kernel_mul_mv_q2_K_f32(
|
||||
kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
void kernel_mul_mv_q3_K_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -3685,84 +3679,6 @@ void kernel_mul_mv_q3_K_f32_impl(
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
void kernel_mul_mv_q3_K_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
const int64_t im = tgpig.z;
|
||||
|
||||
const int row = 2 * r0 + sgitg;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_q3_K * x = (device const block_q3_K *) src0 + row*nb + offset0;
|
||||
device const float * yy = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
const int ix = tiisg/4;
|
||||
const int il = 4 * (tiisg%4);// 0, 4, 8, 12
|
||||
const int iq = il/8; // 0, 0, 1, 1
|
||||
const int in = il%8; // 0, 4, 0, 4
|
||||
|
||||
float2 sum = {0.f, 0.f};
|
||||
|
||||
for (int i = ix; i < nb; i += 8) {
|
||||
|
||||
const float d_all = (float)(x[i].d);
|
||||
|
||||
device const uint16_t * q = (device const uint16_t *)(x[i].qs + il);
|
||||
device const uint16_t * h = (device const uint16_t *)(x[i].hmask + in);
|
||||
device const uint16_t * s = (device const uint16_t *)(x[i].scales);
|
||||
device const float * y = yy + i * QK_K + il;
|
||||
|
||||
const float d1 = d_all * ((int32_t)(s[0] & 0x000F) - 8);
|
||||
const float d2 = d_all * ((int32_t)(s[0] & 0x00F0) - 128) * 1.f/64.f;
|
||||
const float d3 = d_all * ((int32_t)(s[0] & 0x0F00) - 2048) * 1.f/4096.f;
|
||||
const float d4 = d_all * ((int32_t)(s[0] & 0xF000) - 32768) * 1.f/262144.f;
|
||||
|
||||
for (int l = 0; l < 4; l += 2) {
|
||||
const uint16_t hm = h[l/2] >> iq;
|
||||
sum[0] += y[l+ 0] * d1 * ((int32_t)(q[l/2] & 0x0003) - ((hm & 0x0001) ? 0 : 4))
|
||||
+ y[l+16] * d2 * ((int32_t)(q[l/2] & 0x000c) - ((hm & 0x0004) ? 0 : 16))
|
||||
+ y[l+32] * d3 * ((int32_t)(q[l/2] & 0x0030) - ((hm & 0x0010) ? 0 : 64))
|
||||
+ y[l+48] * d4 * ((int32_t)(q[l/2] & 0x00c0) - ((hm & 0x0040) ? 0 : 256));
|
||||
sum[1] += y[l+ 1] * d1 * ((int32_t)(q[l/2] & 0x0300) - ((hm & 0x0100) ? 0 : 1024))
|
||||
+ y[l+17] * d2 * ((int32_t)(q[l/2] & 0x0c00) - ((hm & 0x0400) ? 0 : 4096))
|
||||
+ y[l+33] * d3 * ((int32_t)(q[l/2] & 0x3000) - ((hm & 0x1000) ? 0 : 16384))
|
||||
+ y[l+49] * d4 * ((int32_t)(q[l/2] & 0xc000) - ((hm & 0x4000) ? 0 : 65536));
|
||||
}
|
||||
|
||||
}
|
||||
const float sumf = sum[0] + sum[1] * 1.f/256.f;
|
||||
|
||||
const float tot = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + row] = tot;
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
[[host_name("kernel_mul_mv_q3_K_f32")]]
|
||||
kernel void kernel_mul_mv_q3_K_f32(
|
||||
@@ -3792,7 +3708,6 @@ kernel void kernel_mul_mv_q3_K_f32(
|
||||
kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg);
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
void kernel_mul_mv_q4_K_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -3906,103 +3821,6 @@ void kernel_mul_mv_q4_K_f32_impl(
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
void kernel_mul_mv_q4_K_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant uint & r2,
|
||||
constant uint & r3,
|
||||
threadgroup int8_t * shared_values [[threadgroup(0)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
const int ix = tiisg/4; // 0...7
|
||||
const int it = tiisg%4; // 0...3
|
||||
|
||||
const int nb = ne00/QK_K;
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
const int first_row = r0 * N_DST;
|
||||
const int ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%ne12;
|
||||
const uint i13 = im/ne12;
|
||||
|
||||
const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
|
||||
|
||||
device const block_q4_K * x = (device const block_q4_K *) src0 + ib_row + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
|
||||
float yl[8];
|
||||
float yh[8];
|
||||
float sumf[N_DST]={0.f}, all_sum;
|
||||
|
||||
const int step = sizeof(block_q4_K) * nb / 2;
|
||||
|
||||
device const float * y4 = y + ix * QK_K + 8 * it;
|
||||
|
||||
uint16_t sc16[4];
|
||||
|
||||
for (int ib = ix; ib < nb; ib += 8) {
|
||||
|
||||
float2 sumy = {0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i] = y4[i+ 0]; sumy[0] += yl[i];
|
||||
yh[i] = y4[i+32]; sumy[1] += yh[i];
|
||||
}
|
||||
|
||||
device const uint16_t * sc = (device const uint16_t *)x[ib].scales;
|
||||
device const uint16_t * qs = (device const uint16_t *)x[ib].qs + 4 * it;
|
||||
device const half * dh = x[ib].d;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
sc16[0] = sc[0] & 0x000f;
|
||||
sc16[1] = sc[0] & 0x0f00;
|
||||
sc16[2] = sc[0] & 0x00f0;
|
||||
sc16[3] = sc[0] & 0xf000;
|
||||
|
||||
float2 acc1 = {0.f, 0.f};
|
||||
float2 acc2 = {0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
acc1[0] += yl[i+0] * (qs[i/2] & 0x000F);
|
||||
acc1[1] += yl[i+1] * (qs[i/2] & 0x0F00);
|
||||
acc2[0] += yh[i+0] * (qs[i/2] & 0x00F0);
|
||||
acc2[1] += yh[i+1] * (qs[i/2] & 0xF000);
|
||||
}
|
||||
|
||||
float dall = dh[0];
|
||||
float dmin = dh[1];
|
||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc16[0] +
|
||||
(acc2[0] + 1.f/256.f * acc2[1]) * sc16[1] * 1.f/4096.f) -
|
||||
dmin * 1.f/16.f * (sumy[0] * sc16[2] + sumy[1] * sc16[3] * 1.f/256.f);
|
||||
|
||||
qs += step;
|
||||
sc += step;
|
||||
dh += step;
|
||||
}
|
||||
|
||||
y4 += 8 * QK_K;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = simd_sum(sumf[row]);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
[[host_name("kernel_mul_mv_q4_K_f32")]]
|
||||
kernel void kernel_mul_mv_q4_K_f32(
|
||||
@@ -4070,8 +3888,6 @@ void kernel_mul_mv_q5_K_f32_impl(
|
||||
|
||||
const int step = sizeof(block_q5_K) * nb;
|
||||
|
||||
#if QK_K == 256
|
||||
#
|
||||
float yl[16], yh[16];
|
||||
|
||||
const uint16_t kmask1 = 0x3f3f;
|
||||
@@ -4154,54 +3970,6 @@ void kernel_mul_mv_q5_K_f32_impl(
|
||||
y1 += 4 * QK_K;
|
||||
|
||||
}
|
||||
#else
|
||||
float yl[8], yh[8];
|
||||
|
||||
const int il = 4 * (tiisg/8); // 0, 4, 8, 12
|
||||
const int ix = tiisg%8;
|
||||
const int iq = il/8; // 0, 0, 1, 1
|
||||
const int in = il%8; // 0, 4, 0, 4
|
||||
|
||||
device const float * y = yy + ix*QK_K + il;
|
||||
|
||||
for (int i = ix; i < nb; i += 8) {
|
||||
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
yl[l+0] = y[l+ 0];
|
||||
yl[l+4] = y[l+16];
|
||||
yh[l+0] = y[l+32];
|
||||
yh[l+4] = y[l+48];
|
||||
}
|
||||
|
||||
device const half * dh = &x[i].d;
|
||||
device const uint8_t * q = x[i].qs + il;
|
||||
device const uint8_t * h = x[i].qh + in;
|
||||
device const int8_t * s = x[i].scales;
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
|
||||
const float d = dh[0];
|
||||
|
||||
float2 acc = {0.f, 0.f};
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t hl = h[l] >> iq;
|
||||
acc[0] += yl[l+0] * s[0] * ((int16_t)(q[l+ 0] & 0x0F) - (hl & 0x01 ? 0 : 16))
|
||||
+ yl[l+4] * s[1] * ((int16_t)(q[l+16] & 0x0F) - (hl & 0x04 ? 0 : 16));
|
||||
acc[1] += yh[l+0] * s[2] * ((int16_t)(q[l+ 0] & 0xF0) - (hl & 0x10 ? 0 : 256))
|
||||
+ yh[l+4] * s[3] * ((int16_t)(q[l+16] & 0xF0) - (hl & 0x40 ? 0 : 256));
|
||||
}
|
||||
sumf[row] += d * (acc[0] + 1.f/16.f * acc[1]);
|
||||
|
||||
q += step;
|
||||
h += step;
|
||||
s += step;
|
||||
dh += step/2;
|
||||
|
||||
}
|
||||
|
||||
y += 8 * QK_K;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int row = 0; row < 2; ++row) {
|
||||
const float tot = simd_sum(sumf[row]);
|
||||
@@ -4280,7 +4048,6 @@ void kernel_mul_mv_q6_K_f32_impl(
|
||||
|
||||
float sumf = 0;
|
||||
|
||||
#if QK_K == 256
|
||||
const int tid = tiisg/2;
|
||||
const int ix = tiisg%2;
|
||||
const int ip = tid/8; // 0 or 1
|
||||
@@ -4316,30 +4083,6 @@ void kernel_mul_mv_q6_K_f32_impl(
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
const int ix = tiisg/4;
|
||||
const int il = 4*(tiisg%4);
|
||||
|
||||
for (int i = ix; i < nb; i += 8) {
|
||||
device const float * y = yy + i * QK_K + il;
|
||||
device const uint8_t * ql = x[i].ql + il;
|
||||
device const uint8_t * qh = x[i].qh + il;
|
||||
device const int8_t * s = x[i].scales;
|
||||
|
||||
const float d = x[i].d;
|
||||
|
||||
float4 sums = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
sums[0] += y[l+ 0] * ((int8_t)((ql[l+ 0] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
|
||||
sums[1] += y[l+16] * ((int8_t)((ql[l+16] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
|
||||
sums[2] += y[l+32] * ((int8_t)((ql[l+ 0] >> 4) | ((qh[l] & kmask3) >> 0)) - 32);
|
||||
sums[3] += y[l+48] * ((int8_t)((ql[l+16] >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
|
||||
}
|
||||
sumf += d * (sums[0] * s[0] + sums[1] * s[1] + sums[2] * s[2] + sums[3] * s[3]);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
const float tot = simd_sum(sumf);
|
||||
if (tiisg == 0) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + row] = tot;
|
||||
@@ -5173,9 +4916,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||
|
||||
device const float * y4 = y + 32 * ix;
|
||||
|
||||
#if QK_K != 64
|
||||
iq1m_scale_t scale;
|
||||
#endif
|
||||
|
||||
for (int ib32 = ix; ib32 < nb32; ib32 += 32) {
|
||||
|
||||
@@ -5196,10 +4937,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||
device const uint16_t * sc = (device const uint16_t *)xr->scales;
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
|
||||
#if QK_K != 64
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
#endif
|
||||
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
||||
constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700)));
|
||||
@@ -5215,14 +4953,9 @@ void kernel_mul_mv_iq1_m_f32_impl(
|
||||
}
|
||||
const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
#if QK_K == 64
|
||||
const float d = (float) *((device const half *)(sc - 1));
|
||||
sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) +
|
||||
(sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1));
|
||||
#else
|
||||
|
||||
sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) +
|
||||
(sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1));
|
||||
#endif
|
||||
|
||||
sc += nb*sizeof(block_iq1_m)/2;
|
||||
qs += nb*sizeof(block_iq1_m);
|
||||
@@ -5334,7 +5067,6 @@ void kernel_mul_mv_iq4_nl_f32_impl(
|
||||
}
|
||||
}
|
||||
|
||||
#if QK_K != 64
|
||||
void kernel_mul_mv_iq4_xs_f32_impl(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
@@ -5429,7 +5161,6 @@ void kernel_mul_mv_iq4_xs_f32_impl(
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
[[host_name("kernel_mul_mv_iq1_s_f32")]]
|
||||
kernel void kernel_mul_mv_iq1_s_f32(
|
||||
@@ -5542,11 +5273,7 @@ kernel void kernel_mul_mv_iq4_xs_f32(
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
|
||||
#if QK_K == 64
|
||||
kernel_mul_mv_iq4_nl_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
#else
|
||||
kernel_mul_mv_iq4_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg);
|
||||
#endif
|
||||
}
|
||||
|
||||
//============================= templates and their specializations =============================
|
||||
@@ -5672,10 +5399,9 @@ void dequantize_q2_K(device const block_q2_K *xb, short il, thread type4x4 & reg
|
||||
float dl, ml;
|
||||
uint8_t sc = xb->scales[il];
|
||||
|
||||
#if QK_K == 256
|
||||
q = q + 32*(il/8) + 16*(il&1);
|
||||
il = (il/2)%4;
|
||||
#endif
|
||||
|
||||
half coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
uchar mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
dl = d * (sc & 0xF) * coef, ml = min * (sc >> 4);
|
||||
@@ -5691,7 +5417,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
|
||||
device const uint8_t * h = (device const uint8_t *)xb->hmask;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
#if QK_K == 256
|
||||
q = q + 32 * (il/8) + 16 * (il&1);
|
||||
h = h + 16 * (il&1);
|
||||
uint8_t m = 1 << (il/2);
|
||||
@@ -5712,17 +5437,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
|
||||
}
|
||||
#else
|
||||
float kcoef = il&1 ? 1.f/16.f : 1.f;
|
||||
uint16_t kmask = il&1 ? 0xF0 : 0x0F;
|
||||
float dl = d_all * ((scales[il/2] & kmask) * kcoef - 8);
|
||||
float coef = il>1 ? (il>2 ? 1/64.h : 1/16.h) : (il>0 ? 1/4.h : 1.h);
|
||||
uint8_t mask = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
uint8_t m = 1<<(il*2);
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = coef * dl * ((q[i] & mask) - ((h[i%8] & (m * (1 + i/8))) ? 0 : 4.f/coef));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uchar2 get_scale_min_k4_just2(int j, int k, device const uchar * q) {
|
||||
@@ -5734,7 +5448,6 @@ template <typename type4x4>
|
||||
void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg) {
|
||||
device const uchar * q = xb->qs;
|
||||
|
||||
#if QK_K == 256
|
||||
short is = (il/4) * 2;
|
||||
q = q + (il/4) * 32 + 16 * (il&1);
|
||||
il = il & 3;
|
||||
@@ -5743,16 +5456,7 @@ void dequantize_q4_K(device const block_q4_K *xb, short il, thread type4x4 & reg
|
||||
const float min = xb->dmin;
|
||||
const float dl = d * sc[0];
|
||||
const float ml = min * sc[1];
|
||||
#else
|
||||
(void) get_scale_min_k4_just2;
|
||||
|
||||
q = q + 16 * (il&1);
|
||||
device const uint8_t * s = xb->scales;
|
||||
device const half2 * dh = (device const half2 *)xb->d;
|
||||
const float2 d = (float2)dh[0];
|
||||
const float dl = il<2 ? d[0] * (s[0]&0xF) : d[0] * (s[1]&0xF)/16.h;
|
||||
const float ml = il<2 ? d[1] * (s[0]>>4) : d[1] * (s[1]>>4);
|
||||
#endif
|
||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * (q[i] & mask) - ml;
|
||||
@@ -5764,7 +5468,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
|
||||
device const uint8_t * q = xb->qs;
|
||||
device const uint8_t * qh = xb->qh;
|
||||
|
||||
#if QK_K == 256
|
||||
short is = (il/4) * 2;
|
||||
q = q + 32 * (il/4) + 16 * (il&1);
|
||||
qh = qh + 16 * (il&1);
|
||||
@@ -5781,17 +5484,6 @@ void dequantize_q5_K(device const block_q5_K *xb, short il, thread type4x4 & reg
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = dl * ((q[i] & mask) + (qh[i] & ul ? qh_val : 0)) - ml;
|
||||
}
|
||||
#else
|
||||
q = q + 16 * (il&1);
|
||||
device const int8_t * s = xb->scales;
|
||||
const float dl = xb->d * s[il];
|
||||
uint8_t m = 1<<(il*2);
|
||||
const float coef = il<2 ? 1.f : 1.f/16.f;
|
||||
const ushort mask = il<2 ? 0x0F : 0xF0;
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
reg[i/4][i%4] = coef * dl * ((q[i] & mask) - (qh[i%8] & (m*(1+i/8)) ? 0.f : 16.f/coef));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename type4x4>
|
||||
@@ -5801,15 +5493,11 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg
|
||||
device const uint8_t * qh = (device const uint8_t *)xb->qh;
|
||||
device const int8_t * scales = (device const int8_t *)xb->scales;
|
||||
|
||||
#if QK_K == 256
|
||||
ql = ql + 64*(il/8) + 32*((il/2)&1) + 16*(il&1);
|
||||
qh = qh + 32*(il/8) + 16*(il&1);
|
||||
float sc = scales[(il%2) + 2 * ((il/2))];
|
||||
il = (il/2) & 3;
|
||||
#else
|
||||
ql = ql + 16 * (il&1);
|
||||
float sc = scales[il];
|
||||
#endif
|
||||
|
||||
const uint16_t kmask1 = il>1 ? (il>2 ? 192 : 48) : (il>0 ? 12 : 3);
|
||||
const uint16_t kmask2 = il>1 ? 0xF0 : 0x0F;
|
||||
const float coef = il>1 ? 1.f/16.f : 1.f;
|
||||
@@ -5966,20 +5654,15 @@ void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 &
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
device const uint16_t * sc = (device const uint16_t *)xb->scales;
|
||||
#if QK_K == 64
|
||||
const float d = xb->d;
|
||||
#else
|
||||
|
||||
iq1m_scale_t scale;
|
||||
scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
|
||||
const float d = scale.f16;
|
||||
#endif
|
||||
|
||||
device const uint8_t * qs = xb->qs + 4*ib32 + 2*il;
|
||||
device const uint8_t * qh = xb->qh + 2*ib32 + il;
|
||||
#if QK_K == 64
|
||||
const float dl = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1);
|
||||
#else
|
||||
|
||||
const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1);
|
||||
#endif
|
||||
const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA);
|
||||
constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700)));
|
||||
@@ -6009,9 +5692,6 @@ void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4
|
||||
|
||||
template <typename type4x4>
|
||||
void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4 & reg) {
|
||||
#if QK_K == 64
|
||||
dequantize_iq4_nl(xb, il, reg);
|
||||
#else
|
||||
// il is 0...15 for QK_K = 256 => index of block of 32 is il/2
|
||||
const int ib32 = il/2;
|
||||
il = il%2;
|
||||
@@ -6028,7 +5708,6 @@ void dequantize_iq4_xs(device const block_iq4_xs * xb, short il, thread type4x4
|
||||
reg[i][2] = d * kvalues_iq4nl_f[q8[2]];
|
||||
reg[i][3] = d * kvalues_iq4nl_f[q8[3]];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread float4x4 &)>
|
||||
@@ -6533,11 +6212,7 @@ kernel void kernel_mul_mm_id(
|
||||
sgitg);
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
#define QK_NL 16
|
||||
#else
|
||||
#define QK_NL 4
|
||||
#endif
|
||||
|
||||
//
|
||||
// get rows
|
||||
@@ -6577,11 +6252,7 @@ template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_r
|
||||
template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
#if QK_K == 64
|
||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, 2, dequantize_iq4_xs>;
|
||||
#else
|
||||
template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
#endif
|
||||
|
||||
//
|
||||
// matrix-matrix multiplication
|
||||
@@ -6609,11 +6280,7 @@ template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_m
|
||||
template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
#if QK_K == 64
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_nl, 2, dequantize_iq4_xs>;
|
||||
#else
|
||||
template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
#endif
|
||||
|
||||
//
|
||||
// indirect matrix-matrix multiplication
|
||||
@@ -6641,11 +6308,7 @@ template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel
|
||||
template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s, QK_NL, dequantize_iq1_s>;
|
||||
template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m, QK_NL, dequantize_iq1_m>;
|
||||
template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl, 2, dequantize_iq4_nl>;
|
||||
#if QK_K == 64
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, 2, dequantize_iq4_xs>;
|
||||
#else
|
||||
template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs, QK_NL, dequantize_iq4_xs>;
|
||||
#endif
|
||||
|
||||
//
|
||||
// matrix-vector multiplication
|
||||
@@ -6854,7 +6517,5 @@ template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t
|
||||
template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq3_s_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq2_s_f32_impl>>;
|
||||
template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_nl_f32_impl>>;
|
||||
#if QK_K != 64
|
||||
template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id<mmv_fn<kernel_mul_mv_iq4_xs_f32_impl>>;
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
#include "ggml.h"
#include "ggml.h"
#include "ggml-opencl.h"
#include "ggml-backend-impl.h"

2740 ggml-quants.c (file diff suppressed because it is too large)
567 ggml-sycl.cpp (file diff suppressed because it is too large)
698 ggml.c
@@ -871,22 +871,14 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
},
|
||||
[GGML_TYPE_IQ4_XS] = {
|
||||
.type_name = "iq4_xs",
|
||||
#if QK_K == 64
|
||||
.blck_size = QK4_NL,
|
||||
#else
|
||||
.blck_size = QK_K,
|
||||
#endif
|
||||
.type_size = sizeof(block_iq4_xs),
|
||||
.is_quantized = true,
|
||||
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
||||
.from_float = quantize_row_iq4_xs,
|
||||
.from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
|
||||
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
||||
#if QK_K == 64
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#else
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
#endif
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q8_K] = {
|
||||
@@ -2678,9 +2670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"ARGSORT",
|
||||
"LEAKY_RELU",
|
||||
|
||||
"FLASH_ATTN",
|
||||
"FLASH_ATTN_EXT",
|
||||
"FLASH_FF",
|
||||
"FLASH_ATTN_BACK",
|
||||
"SSM_CONV",
|
||||
"SSM_SCAN",
|
||||
@@ -2706,7 +2696,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
||||
"CROSS_ENTROPY_LOSS_BACK",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
|
||||
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
@@ -2768,9 +2758,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"argsort(x)",
|
||||
"leaky_relu(x)",
|
||||
|
||||
"flash_attn(x)",
|
||||
"flash_attn_ext(x)",
|
||||
"flash_ff(x)",
|
||||
"flash_attn_back(x)",
|
||||
"ssm_conv(x)",
|
||||
"ssm_scan(x)",
|
||||
@@ -2796,7 +2784,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"cross_entropy_loss_back(x,y)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
|
||||
static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
@@ -6956,38 +6944,6 @@ struct ggml_tensor * ggml_top_k(
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_flash_attn
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
bool masked) {
|
||||
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
||||
// TODO: check if vT can be multiplied by (k*qT)
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (q->grad || k->grad || v->grad) {
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
//struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, q->ne);
|
||||
|
||||
int32_t t = masked ? 1 : 0;
|
||||
ggml_set_op_params(result, &t, sizeof(t));
|
||||
|
||||
result->op = GGML_OP_FLASH_ATTN;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = q;
|
||||
result->src[1] = k;
|
||||
result->src[2] = v;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_flash_attn_ext
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn_ext(
|
||||
@@ -7047,38 +7003,6 @@ void ggml_flash_attn_ext_set_prec(
|
||||
ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
|
||||
}
|
||||
|
||||
// ggml_flash_ff
|
||||
|
||||
struct ggml_tensor * ggml_flash_ff(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b0,
|
||||
struct ggml_tensor * b1,
|
||||
struct ggml_tensor * c0,
|
||||
struct ggml_tensor * c1) {
|
||||
GGML_ASSERT(ggml_can_mul_mat(b0, a));
|
||||
// TODO: more checks
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
//struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, a->ne);
|
||||
|
||||
result->op = GGML_OP_FLASH_FF;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b0;
|
||||
result->src[2] = b1;
|
||||
result->src[3] = c0;
|
||||
result->src[4] = c1;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_flash_attn_back
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn_back(
|
||||
@@ -7088,6 +7012,8 @@ struct ggml_tensor * ggml_flash_attn_back(
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * d,
|
||||
bool masked) {
|
||||
GGML_ASSERT(false && "TODO: adapt to ggml_flash_attn_ext() changes");
|
||||
|
||||
GGML_ASSERT(ggml_can_mul_mat(k, q));
|
||||
// TODO: check if vT can be multiplied by (k*qT)
|
||||
|
||||
@@ -15717,400 +15643,6 @@ static void ggml_compute_forward_argsort(
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_flash_attn
|
||||
|
||||
static void ggml_compute_forward_flash_attn_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const bool masked,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * q = dst->src[0];
|
||||
const struct ggml_tensor * k = dst->src[1];
|
||||
const struct ggml_tensor * v = dst->src[2];
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t D = neq0;
|
||||
const int64_t N = neq1;
|
||||
const int64_t P = nek1 - N;
|
||||
const int64_t M = P + N;
|
||||
|
||||
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
||||
|
||||
GGML_ASSERT(ne0 == D);
|
||||
GGML_ASSERT(ne1 == N);
|
||||
GGML_ASSERT(P >= 0);
|
||||
|
||||
GGML_ASSERT(nbq0 == sizeof(float));
|
||||
GGML_ASSERT(nbk0 == sizeof(float));
|
||||
GGML_ASSERT(nbv0 == sizeof(float));
|
||||
|
||||
GGML_ASSERT(neq0 == D);
|
||||
GGML_ASSERT(nek0 == D);
|
||||
GGML_ASSERT(nev1 == D);
|
||||
|
||||
GGML_ASSERT(neq1 == N);
|
||||
GGML_ASSERT(nek1 == N + P);
|
||||
GGML_ASSERT(nev1 == D);
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by q rows using ggml_vec_dot_f32
|
||||
|
||||
// total rows in q
|
||||
const int nr = neq1*neq2*neq3;
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
const float scale = 1.0f/sqrtf(D);
|
||||
|
||||
//printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// q indices
|
||||
const int iq3 = ir/(neq2*neq1);
|
||||
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
||||
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
||||
|
||||
float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
|
||||
|
||||
for (int i = M; i < Mup; ++i) {
|
||||
S[i] = -INFINITY;
|
||||
}
|
||||
|
||||
const int64_t masked_begin = masked ? (P + iq1 + 1) : M;
|
||||
for (int64_t ic = 0; ic < masked_begin; ++ic) {
|
||||
// k indices
|
||||
const int ik3 = iq3;
|
||||
const int ik2 = iq2 % nek2;
|
||||
const int ik1 = ic;
|
||||
|
||||
// S indices
|
||||
const int i1 = ik1;
|
||||
|
||||
ggml_vec_dot_f32(neq0,
|
||||
S + i1, 0,
|
||||
(float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||
(float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||
}
|
||||
|
||||
// scale
|
||||
ggml_vec_scale_f32(masked_begin, S, scale);
|
||||
|
||||
for (int64_t i = masked_begin; i < M; i++) {
|
||||
S[i] = -INFINITY;
|
||||
}
|
||||
|
||||
// softmax
|
||||
// exclude known -INF S[..] values from max and loop
|
||||
// dont forget to set their SW values to zero
|
||||
{
|
||||
float max = -INFINITY;
|
||||
ggml_vec_max_f32(masked_begin, &max, S);
|
||||
|
||||
ggml_float sum = 0.0;
|
||||
{
|
||||
#ifdef GGML_SOFT_MAX_ACCELERATE
|
||||
max = -max;
|
||||
vDSP_vsadd(S, 1, &max, S, 1, Mup);
|
||||
vvexpf(S, S, &Mup);
|
||||
ggml_vec_sum_f32(Mup, &sum, S);
|
||||
#else
|
||||
sum = ggml_vec_soft_max_f32(Mup, S, S, max);
|
||||
#endif
|
||||
}
|
||||
|
||||
assert(sum > 0.0);
|
||||
|
||||
sum = 1.0/sum;
|
||||
ggml_vec_scale_f32(masked_begin, S, sum);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < masked_begin; ++i) {
|
||||
assert(!isnan(S[i]));
|
||||
assert(!isinf(S[i]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int64_t ic = 0; ic < nev1; ++ic) {
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// v indices
|
||||
const int iv2 = iq2 % nev2;
|
||||
const int iv3 = iq3;
|
||||
|
||||
ggml_vec_dot_f32(masked_begin,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||
(float *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||
S, 0, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_flash_attn_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
const bool masked,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * q = dst->src[0];
|
||||
const struct ggml_tensor * k = dst->src[1];
|
||||
const struct ggml_tensor * v = dst->src[2];
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t D = neq0;
|
||||
const int64_t N = neq1;
|
||||
const int64_t P = nek1 - N;
|
||||
const int64_t M = P + N;
|
||||
|
||||
const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
|
||||
|
||||
GGML_ASSERT(ne0 == D);
|
||||
GGML_ASSERT(ne1 == N);
|
||||
GGML_ASSERT(P >= 0);
|
||||
|
||||
GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
|
||||
|
||||
GGML_ASSERT(neq0 == D);
|
||||
GGML_ASSERT(nek0 == D);
|
||||
GGML_ASSERT(nev1 == D);
|
||||
|
||||
GGML_ASSERT(neq1 == N);
|
||||
GGML_ASSERT(nek1 == N + P);
|
||||
GGML_ASSERT(nev1 == D);
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by q rows using ggml_vec_dot_f32
|
||||
|
||||
// total rows in q
|
||||
const int nr = neq1*neq2*neq3;
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
const float scale = 1.0f/sqrtf(D);
|
||||
|
||||
//printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// q indices
|
||||
const int iq3 = ir/(neq2*neq1);
|
||||
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
|
||||
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
|
||||
|
||||
float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
|
||||
|
||||
for (int i = M; i < Mup; ++i) {
|
||||
S[i] = -INFINITY;
|
||||
}
|
||||
|
||||
if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
|
||||
for (int64_t ic = 0; ic < nek1; ++ic) {
|
||||
// k indices
|
||||
const int ik3 = iq3;
|
||||
const int ik2 = iq2 % nek2;
|
||||
const int ik1 = ic;
|
||||
|
||||
// S indices
|
||||
const int i1 = ik1;
|
||||
|
||||
ggml_vec_dot_f16(neq0,
|
||||
S + i1, 0,
|
||||
(ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), 0,
|
||||
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)), 0, 1);
|
||||
}
|
||||
} else {
|
||||
for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
|
||||
// k indices
|
||||
const int ik3 = iq3;
|
||||
const int ik2 = iq2 % nek2;
|
||||
const int ik1 = ic;
|
||||
|
||||
// S indices
|
||||
const int i1 = ik1;
|
||||
|
||||
ggml_vec_dot_f16_unroll(neq0, nbk1,
|
||||
S + i1,
|
||||
((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
|
||||
(ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
|
||||
}
|
||||
}
|
||||
|
||||
// scale
|
||||
ggml_vec_scale_f32(nek1, S, scale);
|
||||
|
||||
if (masked) {
|
||||
for (int64_t i = P; i < M; i++) {
|
||||
if (i > P + iq1) {
|
||||
S[i] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// softmax
|
||||
// todo: exclude known -INF S[..] values from max and loop, assuming their results to be zero.
|
||||
// dont forget to set their S values to zero
|
||||
{
|
||||
float max = -INFINITY;
|
||||
ggml_vec_max_f32(M, &max, S);
|
||||
|
||||
ggml_float sum = 0.0;
|
||||
{
|
||||
#ifdef GGML_SOFT_MAX_ACCELERATE
|
||||
max = -max;
|
||||
vDSP_vsadd(S, 1, &max, S, 1, Mup);
|
||||
vvexpf(S, S, &Mup);
|
||||
ggml_vec_sum_f32(Mup, &sum, S);
|
||||
#else
|
||||
sum = ggml_vec_soft_max_f32(Mup, S, S, max);
|
||||
#endif
|
||||
}
|
||||
|
||||
assert(sum > 0.0);
|
||||
|
||||
sum = 1.0/sum;
|
||||
ggml_vec_scale_f32(M, S, sum);
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < M; ++i) {
|
||||
assert(!isnan(S[i]));
|
||||
assert(!isinf(S[i]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
|
||||
|
||||
for (int64_t i = 0; i < M; i++) {
|
||||
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
||||
}
|
||||
|
||||
// todo: exclude known zero S[..] values from dot (reducing nev0 and increasing begin of v and S16).
|
||||
if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
|
||||
for (int64_t ic = 0; ic < nev1; ++ic) {
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// v indices
|
||||
const int iv2 = iq2 % nev2;
|
||||
const int iv3 = iq3;
|
||||
|
||||
ggml_vec_dot_f16(nev0,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||
(ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)), 0,
|
||||
S16, 0, 1);
|
||||
}
|
||||
} else {
|
||||
for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
|
||||
// dst indices
|
||||
const int i1 = iq1;
|
||||
const int i2 = iq2;
|
||||
const int i3 = iq3;
|
||||
|
||||
// v indices
|
||||
const int iv2 = iq2 % nev2;
|
||||
const int iv3 = iq3;
|
||||
|
||||
ggml_vec_dot_f16_unroll(nev0, nbv1,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
|
||||
((char *) v->data + ( ic*nbv1 + iv2*nbv2 + iv3*nbv3)),
|
||||
S16);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_flash_attn(
        const struct ggml_compute_params * params,
        const bool masked,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * q = dst->src[0];

    switch (q->type) {
        case GGML_TYPE_F16:
            {
                ggml_compute_forward_flash_attn_f16(params, masked, dst);
            } break;
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_flash_attn_f32(params, masked, dst);
            } break;
        default:
            {
                GGML_ASSERT(false);
            } break;
    }
}

// ggml_compute_forward_flash_attn_ext

static void ggml_compute_forward_flash_attn_ext_f16(
@@ -16344,165 +15876,6 @@ static void ggml_compute_forward_flash_attn_ext(
    }
}

// ggml_compute_forward_flash_ff

static void ggml_compute_forward_flash_ff_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * a = dst->src[0]; // F16
|
||||
const struct ggml_tensor * b0 = dst->src[1]; // F16 fc_w
|
||||
const struct ggml_tensor * b1 = dst->src[2]; // F32 fc_b
|
||||
const struct ggml_tensor * c0 = dst->src[3]; // F16 proj_w
|
||||
const struct ggml_tensor * c1 = dst->src[4]; // F32 proj_b
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_LOCALS(int64_t, nea, a, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nba, a, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, neb0, b0, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbb0, b0, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, neb1, b1, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbb1, b1, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nec0, c0, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbc0, c0, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, nec1, c1, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nbc1, c1, nb)
|
||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
|
||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t D = nea0;
|
||||
//const int64_t N = nea1;
|
||||
const int64_t M = neb01;
|
||||
|
||||
GGML_ASSERT(ne0 == nea0);
|
||||
GGML_ASSERT(ne1 == nea1);
|
||||
GGML_ASSERT(ne2 == nea2);
|
||||
|
||||
GGML_ASSERT(nba0 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbb10 == sizeof(float));
|
||||
GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nbc10 == sizeof(float));
|
||||
|
||||
GGML_ASSERT(neb00 == D);
|
||||
GGML_ASSERT(neb01 == M);
|
||||
GGML_ASSERT(neb10 == M);
|
||||
GGML_ASSERT(neb11 == 1);
|
||||
|
||||
GGML_ASSERT(nec00 == M);
|
||||
GGML_ASSERT(nec01 == D);
|
||||
GGML_ASSERT(nec10 == D);
|
||||
GGML_ASSERT(nec11 == 1);
|
||||
|
||||
// dst cannot be transposed or permuted
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
GGML_ASSERT(nb0 <= nb1);
|
||||
GGML_ASSERT(nb1 <= nb2);
|
||||
GGML_ASSERT(nb2 <= nb3);
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_TYPE_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// parallelize by a rows using ggml_vec_dot_f32
|
||||
|
||||
// total rows in a
|
||||
const int nr = nea1*nea2*nea3;
|
||||
|
||||
// rows per thread
|
||||
const int dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
// a indices
|
||||
const int ia3 = ir/(nea2*nea1);
|
||||
const int ia2 = (ir - ia3*nea2*nea1)/nea1;
|
||||
const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);
|
||||
|
||||
float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
|
||||
|
||||
for (int64_t ic = 0; ic < neb01; ++ic) {
|
||||
// b0 indices
|
||||
const int ib03 = ia3;
|
||||
const int ib02 = ia2;
|
||||
const int ib01 = ic;
|
||||
|
||||
// S indices
|
||||
const int i1 = ib01;
|
||||
|
||||
ggml_vec_dot_f16(nea0,
|
||||
S + i1, 0,
|
||||
(ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), 0,
|
||||
(ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3)), 0, 1);
|
||||
}
|
||||
|
||||
ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
|
||||
//ggml_vec_gelu_f32(neb01, S, S);
|
||||
|
||||
ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
|
||||
|
||||
for (int64_t i = 0; i < M; i++) {
|
||||
S16[i] = GGML_FP32_TO_FP16(S[i]);
|
||||
}
|
||||
|
||||
ggml_vec_gelu_f16(neb01, S16, S16);
|
||||
|
||||
{
|
||||
// dst indices
|
||||
const int i1 = ia1;
|
||||
const int i2 = ia2;
|
||||
const int i3 = ia3;
|
||||
|
||||
for (int64_t ic = 0; ic < nec01; ++ic) {
|
||||
|
||||
ggml_vec_dot_f16(neb01,
|
||||
(float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), 0,
|
||||
(ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), 0,
|
||||
S16, 0, 1);
|
||||
}
|
||||
|
||||
ggml_vec_add_f32(nec01,
|
||||
(float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
|
||||
(float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
|
||||
(float *) c1->data);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_flash_ff(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * b0 = dst->src[1];

    switch (b0->type) {
        case GGML_TYPE_F16:
            {
                ggml_compute_forward_flash_ff_f16(params, dst);
            } break;
        case GGML_TYPE_F32:
            {
                GGML_ASSERT(false); // TODO
            } break;
        default:
            {
                GGML_ASSERT(false);
            } break;
    }
}

// ggml_compute_forward_flash_attn_back

static void ggml_compute_forward_flash_attn_back_f32(
@@ -18073,21 +17446,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_leaky_relu(params, tensor);
            } break;
        case GGML_OP_FLASH_ATTN:
            {
                const int32_t t = ggml_get_op_params_i32(tensor, 0);
                GGML_ASSERT(t == 0 || t == 1);
                const bool masked = t != 0;
                ggml_compute_forward_flash_attn(params, masked, tensor);
            } break;
        case GGML_OP_FLASH_ATTN_EXT:
            {
                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
            } break;
        case GGML_OP_FLASH_FF:
            {
                ggml_compute_forward_flash_ff(params, tensor);
            } break;
        case GGML_OP_FLASH_ATTN_BACK:
            {
                int32_t t = ggml_get_op_params_i32(tensor, 0);
@@ -19094,7 +18456,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
struct ggml_tensor * flash_grad = NULL;
|
||||
@@ -19148,10 +18509,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||
zero_table);
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
GGML_ASSERT(false); // not supported
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_BACK:
|
||||
{
|
||||
GGML_ASSERT(false); // not supported
|
||||
@@ -19838,15 +19195,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_BACK:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
@@ -20243,40 +19595,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
||||
cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02*ne03;
|
||||
cur += sizeof(ggml_fp16_t)*ne10*ne11*ne12;
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN:
|
||||
{
|
||||
const int64_t ne11 = ggml_up(node->src[1]->ne[1], GGML_SOFT_MAX_UNROLL);
|
||||
|
||||
if (node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
||||
} else if (node->src[1]->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
||||
} else if (node->src[1]->type == GGML_TYPE_BF16) {
|
||||
cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
{
|
||||
const int64_t ne00 = node->src[0]->ne[0]; // D
|
||||
|
||||
cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
|
||||
} break;
|
||||
case GGML_OP_FLASH_FF:
|
||||
{
|
||||
if (node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
||||
} else if (node->src[1]->type == GGML_TYPE_F16) {
|
||||
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
||||
} else if (node->src[1]->type == GGML_TYPE_BF16) {
|
||||
cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
|
||||
cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_FLASH_ATTN_BACK:
|
||||
{
|
||||
const int64_t D = node->src[0]->ne[0];
|
||||
@@ -22117,11 +21441,7 @@ size_t ggml_quantize_chunk(
|
||||
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
#if QK_K == 64
|
||||
case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
#else
|
||||
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
||||
#endif
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
@@ -23422,6 +22742,16 @@ int ggml_cpu_has_neon(void) {
#endif
}

int ggml_cpu_has_sve(void) {
#if defined(__ARM_FEATURE_SVE)
    // TODO: Currently, SVE 256 bit is only supported.
    GGML_ASSERT(svcntb() == QK8_0);
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_arm_fma(void) {
#if defined(__ARM_FEATURE_FMA)
    return 1;
|
||||
|
||||
19
ggml.h
@@ -481,9 +481,7 @@ extern "C" {
        GGML_OP_ARGSORT,
        GGML_OP_LEAKY_RELU,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_ATTN_EXT,
        GGML_OP_FLASH_FF,
        GGML_OP_FLASH_ATTN_BACK,
        GGML_OP_SSM_CONV,
        GGML_OP_SSM_SCAN,
@@ -1761,13 +1759,6 @@ extern "C" {
            struct ggml_tensor * a,
            int k);

    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor * q,
            struct ggml_tensor * k,
            struct ggml_tensor * v,
            bool masked);

    #define GGML_KQ_MASK_PAD 32

    // q: [n_embd, n_batch, n_head, 1]
@@ -1788,6 +1779,7 @@ extern "C" {
            struct ggml_tensor * a,
            enum ggml_prec prec);

    // TODO: needs to be adapted to ggml_flash_attn_ext
    GGML_API struct ggml_tensor * ggml_flash_attn_back(
            struct ggml_context * ctx,
            struct ggml_tensor * q,
@@ -1796,14 +1788,6 @@ extern "C" {
            struct ggml_tensor * d,
            bool masked);

    GGML_API struct ggml_tensor * ggml_flash_ff(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b0,
            struct ggml_tensor * b1,
            struct ggml_tensor * c0,
            struct ggml_tensor * c1);

    GGML_API struct ggml_tensor * ggml_ssm_conv(
            struct ggml_context * ctx,
            struct ggml_tensor * s,
@@ -2420,6 +2404,7 @@ extern "C" {
    GGML_API int ggml_cpu_has_avx512_bf16(void);
    GGML_API int ggml_cpu_has_fma (void);
    GGML_API int ggml_cpu_has_neon (void);
    GGML_API int ggml_cpu_has_sve (void);
    GGML_API int ggml_cpu_has_arm_fma (void);
    GGML_API int ggml_cpu_has_metal (void);
    GGML_API int ggml_cpu_has_f16c (void);
|
||||
@@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum):
|
||||
COMMAND_R = auto()
|
||||
DBRX = auto()
|
||||
OLMO = auto()
|
||||
ARCTIC = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -167,6 +168,7 @@ class MODEL_TENSOR(IntEnum):
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
FFN_ACT = auto()
|
||||
FFN_NORM_EXP = auto()
|
||||
FFN_GATE_EXP = auto()
|
||||
FFN_DOWN_EXP = auto()
|
||||
FFN_UP_EXP = auto()
|
||||
@@ -218,6 +220,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
MODEL_ARCH.DBRX: "dbrx",
|
||||
MODEL_ARCH.OLMO: "olmo",
|
||||
MODEL_ARCH.ARCTIC: "arctic",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@@ -251,6 +254,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
||||
@@ -732,6 +736,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.ARCTIC: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_GATE_INP,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_NORM_EXP,
|
||||
MODEL_TENSOR.FFN_GATE_EXP,
|
||||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
@@ -905,9 +930,8 @@ class GGUFValueType(IntEnum):
        raise ValueError(f"Unknown type: {type(val)}")


# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
QK_K = 256
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.F32: (1, 4),
    GGMLQuantizationType.F16: (1, 2),
|
||||
@@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
import numpy as np
import numpy.typing as npt

from .quants import quant_shape_to_byte_shape

if __name__ == "__main__":
    import sys
    from pathlib import Path
@@ -251,6 +253,7 @@ class GGUFReader:
            tensor_names.add(tensor_name)
            ggml_type = GGMLQuantizationType(raw_dtype[0])
            n_elems = int(np.prod(dims))
            np_dims = tuple(reversed(dims.tolist()))
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            data_offs = int(start_offs + offset_tensor[0])
@@ -279,6 +282,7 @@ class GGUFReader:
            else:
                item_count = n_bytes
                item_type = np.uint8
                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
            tensors.append(ReaderTensor(
                name = tensor_name,
                tensor_type = ggml_type,
@@ -286,7 +290,7 @@ class GGUFReader:
                n_elements = n_elems,
                n_bytes = n_bytes,
                data_offset = data_offs,
                data = self._get(data_offs, item_type, item_count),
                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                field = field,
            ))
        self.tensors = tensors
|
||||
@@ -13,7 +13,6 @@ from string import ascii_letters, digits
|
||||
import numpy as np
|
||||
|
||||
from .constants import (
|
||||
GGML_QUANT_SIZES,
|
||||
GGUF_DEFAULT_ALIGNMENT,
|
||||
GGUF_MAGIC,
|
||||
GGUF_VERSION,
|
||||
@@ -26,6 +25,8 @@ from .constants import (
|
||||
TokenType,
|
||||
)
|
||||
|
||||
from .quants import quant_shape_from_byte_shape
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -229,10 +230,7 @@ class GGUFWriter:
|
||||
else:
|
||||
dtype = raw_dtype
|
||||
if tensor_dtype == np.uint8:
|
||||
block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
|
||||
if tensor_shape[-1] % type_size != 0:
|
||||
raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
|
||||
tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
|
||||
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
|
||||
n_dims = len(tensor_shape)
|
||||
self.ti_data += self._pack("I", n_dims)
|
||||
for i in range(n_dims):
|
||||
|
||||
@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import Callable
from typing import Callable, Sequence

from numpy.typing import DTypeLike

@@ -9,6 +9,20 @@ from .lazy import LazyNumpyTensor
import numpy as np


def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    if shape[-1] % block_size != 0:
        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
    return (*shape[:-1], shape[-1] // block_size * type_size)


def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    if shape[-1] % type_size != 0:
        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
    return (*shape[:-1], shape[-1] // type_size * block_size)
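
The two helpers above are inverses of each other for block-aligned rows. A minimal usage sketch (not part of the change itself), using the F16 entry of GGML_QUANT_SIZES shown earlier (block size 1, type size 2) and assuming the package layout implied by the "from .quants import ..." / "from .constants import ..." lines elsewhere in this change:

from gguf.constants import GGMLQuantizationType
from gguf.quants import quant_shape_from_byte_shape, quant_shape_to_byte_shape

elem_shape = (4096, 32)   # logical tensor shape, in elements
byte_shape = quant_shape_to_byte_shape(elem_shape, GGMLQuantizationType.F16)
assert byte_shape == (4096, 64)   # 32 elements / block size 1 * type size 2 bytes
assert quant_shape_from_byte_shape(byte_shape, GGMLQuantizationType.F16) == elem_shape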
|
||||
|
||||
|
||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
    n = n.astype(np.float32, copy=False).view(np.int32)
|
||||
|
||||
@@ -244,6 +244,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||
"model.layers.{bid}.residual_mlp.w3", # arctic
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
@@ -272,6 +273,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
|
||||
"transformer.h.{bid}.mlp.linear_1", # refact
|
||||
"model.layers.{bid}.residual_mlp.w1", # arctic
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||
@@ -306,6 +308,7 @@ class TensorNameMap:
|
||||
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
||||
"model.layers.{bid}.residual_mlp.w2", # arctic
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
@@ -382,6 +385,18 @@ class TensorNameMap:
        ),
    }

    # architecture-specific block mappings
    arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
        MODEL_ARCH.ARCTIC: {
            MODEL_TENSOR.FFN_NORM: (
                "model.layers.{bid}.residual_layernorm",
            ),
            MODEL_TENSOR.FFN_NORM_EXP: (
                "model.layers.{bid}.post_attention_layernorm",
            ),
        },
    }
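
A hedged usage sketch (not part of the change) of how the Arctic-specific entries take effect once __init__ below merges them into block_mappings_cfg: the per-layer "residual_layernorm" checkpoint name resolves to the generic FFN_NORM tensor. The 35-layer count and the "blk.0.ffn_norm" name follow the Arctic hparams and tensor names added in llama.cpp in this same change; the exact lookup shown is an assumption based on the mapping dict built in __init__.

from gguf.constants import MODEL_ARCH, MODEL_TENSOR
from gguf.tensor_mapping import TensorNameMap

name_map = TensorNameMap(MODEL_ARCH.ARCTIC, 35)
tensor, gguf_name = name_map.mapping["model.layers.0.residual_layernorm"]
assert tensor == MODEL_TENSOR.FFN_NORM
assert gguf_name == "blk.0.ffn_norm"   # per-block name from TENSOR_NAMES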
|
||||
|
||||
    mapping: dict[str, tuple[MODEL_TENSOR, str]]

    def __init__(self, arch: MODEL_ARCH, n_blocks: int):
@@ -393,12 +408,14 @@ class TensorNameMap:
            self.mapping[tensor_name] = (tensor, tensor_name)
            for key in keys:
                self.mapping[key] = (tensor, tensor_name)
        if arch in self.arch_block_mappings_cfg:
            self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
        for bid in range(n_blocks):
            for tensor, keys in self.block_mappings_cfg.items():
                if tensor not in MODEL_TENSORS[arch]:
                    continue
                # TODO: make this configurable
                n_experts = 60
                n_experts = 128
                for xid in range(n_experts):
                    tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
                    self.mapping[tensor_name] = (tensor, tensor_name)
|
||||
@@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new

    for tensor in reader.tensors:
        total_bytes += tensor.n_bytes
        # Dimensions are written in reverse order, so flip them first
        shape = np.flipud(tensor.shape).tolist()
        writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)

    bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
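
The one-line change above works because GGUFReader (see the reshape added earlier in this change) now returns tensor data already shaped to np_dims, so the writer can take the numpy-order shape directly. A small sketch of the dimension-order convention involved; GGUF records dimensions fastest-varying first, i.e. reversed relative to the numpy shape:

import numpy as np

dims = [4096, 32000]                      # dimensions as recorded in the GGUF file
np_dims = tuple(reversed(dims))           # layout GGUFReader uses for tensor.data
assert np_dims == (32000, 4096)
assert np.flipud(np.array(dims)).tolist() == list(np_dims)   # the removed explicit flip was equivalent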
|
||||
|
||||
|
||||
588
llama.cpp
@@ -26,13 +26,9 @@
#ifdef GGML_USE_METAL
# include "ggml-metal.h"
#endif
#ifndef QK_K
# ifdef GGML_QKK_64
# define QK_K 64
# else
# define QK_K 256
# endif
#endif

// TODO: replace with ggml API call
#define QK_K 256

#ifdef __has_include
#if __has_include(<unistd.h>)
@@ -107,7 +103,7 @@
#endif

#define LLAMA_MAX_NODES 8192
#define LLAMA_MAX_EXPERTS 60
#define LLAMA_MAX_EXPERTS 128
|
||||
|
||||
//
|
||||
// logging
|
||||
@@ -225,6 +221,7 @@ enum llm_arch {
|
||||
LLM_ARCH_COMMAND_R,
|
||||
LLM_ARCH_DBRX,
|
||||
LLM_ARCH_OLMO,
|
||||
LLM_ARCH_ARCTIC,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -261,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_COMMAND_R, "command-r" },
|
||||
{ LLM_ARCH_DBRX, "dbrx" },
|
||||
{ LLM_ARCH_OLMO, "olmo" },
|
||||
{ LLM_ARCH_ARCTIC, "arctic" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -459,6 +457,7 @@ enum llm_tensor {
|
||||
LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
|
||||
LLM_TENSOR_FFN_GATE_EXP,
|
||||
LLM_TENSOR_FFN_UP_EXP,
|
||||
LLM_TENSOR_FFN_NORM_EXPS,
|
||||
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
||||
LLM_TENSOR_FFN_GATE_EXPS,
|
||||
LLM_TENSOR_FFN_UP_EXPS,
|
||||
@@ -1036,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_ARCTIC,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
|
||||
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
|
||||
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
|
||||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -1696,17 +1717,24 @@ static llama_state g_state;
|
||||
// available llama models
|
||||
enum e_model {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_14M,
|
||||
MODEL_17M,
|
||||
MODEL_22M,
|
||||
MODEL_33M,
|
||||
MODEL_70M,
|
||||
MODEL_109M,
|
||||
MODEL_137M,
|
||||
MODEL_160M,
|
||||
MODEL_335M,
|
||||
MODEL_410M,
|
||||
MODEL_0_5B,
|
||||
MODEL_1B,
|
||||
MODEL_1_4B,
|
||||
MODEL_2B,
|
||||
MODEL_2_8B,
|
||||
MODEL_3B,
|
||||
MODEL_4B,
|
||||
MODEL_6_9B,
|
||||
MODEL_7B,
|
||||
MODEL_8B,
|
||||
MODEL_12B,
|
||||
@@ -1729,6 +1757,7 @@ enum e_model {
|
||||
MODEL_8x7B,
|
||||
MODEL_8x22B,
|
||||
MODEL_16x12B,
|
||||
MODEL_10B_128x3_66B,
|
||||
};
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
@@ -1738,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
|
||||
struct llama_hparams {
|
||||
bool vocab_only;
|
||||
bool rope_finetuned;
|
||||
bool use_par_res;
|
||||
|
||||
uint32_t n_vocab;
|
||||
uint32_t n_ctx_train; // context size the model was trained on
|
||||
@@ -1903,6 +1933,7 @@ struct llama_layer {
|
||||
struct ggml_tensor * ffn_norm_b;
|
||||
struct ggml_tensor * layer_out_norm;
|
||||
struct ggml_tensor * layer_out_norm_b;
|
||||
struct ggml_tensor * ffn_norm_exps;
|
||||
|
||||
// ff
|
||||
struct ggml_tensor * ffn_gate; // w1
|
||||
@@ -2479,7 +2510,6 @@ static bool llama_kv_cache_init(
|
||||
static bool llama_kv_cache_find_slot(
|
||||
struct llama_kv_cache & cache,
|
||||
const struct llama_batch & batch) {
|
||||
const uint32_t n_ctx = cache.size;
|
||||
const uint32_t n_tokens = batch.n_tokens;
|
||||
|
||||
if (cache.recurrent) {
|
||||
@@ -2530,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
|
||||
}
|
||||
// otherwise, one cell per token.
|
||||
|
||||
if (n_tokens > n_ctx) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
|
||||
if (n_tokens > cache.size) {
|
||||
LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t n_tested = 0;
|
||||
|
||||
while (true) {
|
||||
if (cache.head + n_tokens > n_ctx) {
|
||||
n_tested += n_ctx - cache.head;
|
||||
if (cache.head + n_tokens > cache.size) {
|
||||
n_tested += cache.size - cache.head;
|
||||
cache.head = 0;
|
||||
continue;
|
||||
}
|
||||
@@ -2558,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_tested >= n_ctx) {
|
||||
if (n_tested >= cache.size) {
|
||||
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
|
||||
return false;
|
||||
}
|
||||
@@ -3778,40 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||
|
||||
static const char * llama_model_type_name(e_model type) {
|
||||
switch (type) {
|
||||
case MODEL_17M: return "17M";
|
||||
case MODEL_22M: return "22M";
|
||||
case MODEL_33M: return "33M";
|
||||
case MODEL_109M: return "109M";
|
||||
case MODEL_137M: return "137M";
|
||||
case MODEL_335M: return "335M";
|
||||
case MODEL_0_5B: return "0.5B";
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_2B: return "2B";
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_4B: return "4B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_8B: return "8B";
|
||||
case MODEL_12B: return "12B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_14B: return "14B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_20B: return "20B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_35B: return "35B";
|
||||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
case MODEL_314B: return "314B";
|
||||
case MODEL_SMALL: return "0.1B";
|
||||
case MODEL_MEDIUM: return "0.4B";
|
||||
case MODEL_LARGE: return "0.8B";
|
||||
case MODEL_XL: return "1.5B";
|
||||
case MODEL_A2_7B: return "A2.7B";
|
||||
case MODEL_8x7B: return "8x7B";
|
||||
case MODEL_8x22B: return "8x22B";
|
||||
case MODEL_16x12B: return "16x12B";
|
||||
default: return "?B";
|
||||
case MODEL_14M: return "14M";
|
||||
case MODEL_17M: return "17M";
|
||||
case MODEL_22M: return "22M";
|
||||
case MODEL_33M: return "33M";
|
||||
case MODEL_70M: return "70M";
|
||||
case MODEL_109M: return "109M";
|
||||
case MODEL_137M: return "137M";
|
||||
case MODEL_160M: return "160M";
|
||||
case MODEL_335M: return "335M";
|
||||
case MODEL_410M: return "410M";
|
||||
case MODEL_0_5B: return "0.5B";
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_1_4B: return "1.4B";
|
||||
case MODEL_2B: return "2B";
|
||||
case MODEL_2_8B: return "2.8B";
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_4B: return "4B";
|
||||
case MODEL_6_9B: return "6.9B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_8B: return "8B";
|
||||
case MODEL_12B: return "12B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_14B: return "14B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_20B: return "20B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_35B: return "35B";
|
||||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
case MODEL_314B: return "314B";
|
||||
case MODEL_SMALL: return "0.1B";
|
||||
case MODEL_MEDIUM: return "0.4B";
|
||||
case MODEL_LARGE: return "0.8B";
|
||||
case MODEL_XL: return "1.5B";
|
||||
case MODEL_A2_7B: return "A2.7B";
|
||||
case MODEL_8x7B: return "8x7B";
|
||||
case MODEL_8x22B: return "8x22B";
|
||||
case MODEL_16x12B: return "16x12B";
|
||||
case MODEL_10B_128x3_66B: return "10B+128x3.66B";
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4287,6 +4325,65 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
|
||||
switch (hparams.n_layer) {
|
||||
case 6:
|
||||
switch (hparams.n_ff) {
|
||||
case 512: model.type = e_model::MODEL_14M; break;
|
||||
case 2048: model.type = e_model::MODEL_70M; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
} break;
|
||||
case 12:
|
||||
switch (hparams.n_ff) {
|
||||
case 3072: model.type = e_model::MODEL_160M; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
} break;
|
||||
case 16:
|
||||
switch (hparams.n_ff) {
|
||||
case 8192: model.type = e_model::MODEL_1B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
} break;
|
||||
case 24:
|
||||
switch (hparams.n_ff) {
|
||||
case 4096: model.type = e_model::MODEL_410M; break;
|
||||
case 8192: model.type = e_model::MODEL_1_4B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
} break;
|
||||
case 32:
|
||||
switch (hparams.n_ff) {
|
||||
case 10240: model.type = e_model::MODEL_2_8B; break;
|
||||
case 16384: model.type = e_model::MODEL_6_9B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
} break;
|
||||
case 36:
|
||||
switch (hparams.n_ff) {
|
||||
case 20480: model.type = e_model::MODEL_12B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
} break;
|
||||
case 44:
|
||||
switch (hparams.n_ff) {
|
||||
case 24576: model.type = e_model::MODEL_20B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
} break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_ARCTIC:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
if (hparams.n_expert == 128) {
|
||||
switch (hparams.n_layer) {
|
||||
case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} else {
|
||||
model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -4496,6 +4593,9 @@ static void llm_load_vocab(
|
||||
} else if (
|
||||
tokenizer_pre == "dbrx") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
||||
} else if (
|
||||
tokenizer_pre == "smaug-bpe") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
@@ -6038,6 +6138,81 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
||||
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
||||
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
||||
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_ARCTIC:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (model.output == NULL) {
|
||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
|
||||
|
||||
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
||||
layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
|
||||
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
||||
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
||||
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -6627,7 +6802,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
|
||||
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
|
||||
|
||||
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
||||
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
||||
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
|
||||
}
|
||||
|
||||
@@ -6636,7 +6811,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
|
||||
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
||||
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
@@ -10565,6 +10740,274 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_gptneox() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
||||
}
|
||||
|
||||
// ffn
|
||||
if (hparams.use_par_res) {
|
||||
// attention and ffn are computed in parallel
|
||||
// x = x + attn(ln1(x)) + ffn(ln2(x))
|
||||
|
||||
struct ggml_tensor * attn_out = cur;
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].ffn_norm,
|
||||
model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
inpL = ggml_add(ctx0, cur, attn_out);
|
||||
cb(inpL, "l_out", il);
|
||||
} else {
|
||||
// attention and ffn are computed sequentially
|
||||
// x = x + attn(ln1(x))
|
||||
// x = x + ffn(ln2(x))
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm,
|
||||
model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(inpL, "l_out", il);
|
||||
}
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.output_norm,
|
||||
model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_arctic() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
n_tokens = n_outputs;
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(ffn_out, "ffn_out", il);
|
||||
|
||||
// MoE
|
||||
cur = llm_build_norm(ctx0, inpSA, hparams,
|
||||
model.layers[il].ffn_norm_exps, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "ffn_norm_exps", il);
|
||||
|
||||
cur = llm_build_moe_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_gate_inp,
|
||||
model.layers[il].ffn_up_exps,
|
||||
model.layers[il].ffn_gate_exps,
|
||||
model.layers[il].ffn_down_exps,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SILU, true,
|
||||
cb, il);
|
||||
cb(cur, "ffn_moe_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_out);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
||||
if (layer_dir != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, layer_dir);
|
||||
}
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
||||
@@ -10775,6 +11218,14 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_olmo();
|
||||
} break;
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
{
|
||||
result = llm.build_gptneox();
|
||||
} break;
|
||||
case LLM_ARCH_ARCTIC:
|
||||
{
|
||||
result = llm.build_arctic();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -12064,6 +12515,7 @@ struct llm_tokenizer_bpe {
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
||||
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
// same as llama3
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
@@ -14308,8 +14760,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
||||
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
||||
(qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
||||
if (qs.model.type == MODEL_70B) {
|
||||
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
||||
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
||||
@@ -15769,7 +16219,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
// these models do not use RoPE
|
||||
case LLM_ARCH_GPT2:
|
||||
case LLM_ARCH_GPTJ:
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
case LLM_ARCH_MPT:
|
||||
case LLM_ARCH_REFACT:
|
||||
case LLM_ARCH_BLOOM:
|
||||
@@ -15789,6 +16238,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_XVERSE:
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
case LLM_ARCH_OLMO:
|
||||
case LLM_ARCH_ARCTIC:
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
// the pairs of head values are offset by n_rot/2
|
||||
@@ -15805,6 +16255,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_PHI3:
|
||||
case LLM_ARCH_GEMMA:
|
||||
case LLM_ARCH_STARCODER2:
|
||||
case LLM_ARCH_GPTNEOX:
|
||||
return LLAMA_ROPE_TYPE_NEOX;
|
||||
|
||||
// all model arches should be listed explicitly here
|
||||
@@ -17183,6 +17634,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
|
||||
ctx->cparams.n_threads_batch = n_threads_batch;
|
||||
}
|
||||
|
||||
uint32_t llama_n_threads(struct llama_context * ctx) {
|
||||
return ctx->cparams.n_threads;
|
||||
}
|
||||
|
||||
uint32_t llama_n_threads_batch(struct llama_context * ctx) {
|
||||
return ctx->cparams.n_threads_batch;
|
||||
}
|
||||
|
||||
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
|
||||
ctx->abort_callback = abort_callback;
|
||||
ctx->abort_callback_data = abort_callback_data;
|
||||
@@ -17406,6 +17865,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
||||
);
|
||||
}
|
||||
|
||||
bool llama_token_is_control(const struct llama_model * model, llama_token token) {
|
||||
return llama_is_control_token(model->vocab, token);
|
||||
}
|
||||
|
||||
llama_token llama_token_bos(const struct llama_model * model) {
|
||||
return model->vocab.special_bos_id;
|
||||
}
|
||||
@@ -17617,6 +18080,15 @@ static int32_t llama_chat_apply_template_internal(
|
||||
}
|
||||
}
|
||||
// llama2 templates seem to not care about "add_generation_prompt"
|
||||
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
|
||||
// Phi 3
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
|
||||
// zephyr template
|
||||
for (auto message : chat) {
|
||||
@@ -17749,15 +18221,6 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
||||
}
|
||||
} else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
|
||||
// Phi 3
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|assistant|>\n";
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
@@ -17882,6 +18345,7 @@ const char * llama_print_system_info(void) {
|
||||
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
|
||||
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
|
||||
18
llama.h
@@ -85,6 +85,7 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
@@ -264,6 +265,8 @@ extern "C" {
         bool check_tensors; // validate model tensor data
     };
 
+    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+    // https://github.com/ggerganov/llama.cpp/pull/7544
     struct llama_context_params {
         uint32_t seed;  // RNG seed, -1 for random
         uint32_t n_ctx; // text context, 0 = from model
@@ -290,14 +293,14 @@ extern "C" {
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
 
-        enum ggml_type type_k; // data type for K cache
-        enum ggml_type type_v; // data type for V cache
+        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
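A sketch of how these [EXPERIMENTAL] fields are typically set when creating a context; the chosen cache types, context size and model handle are illustrative, not recommendations:

#include "llama.h"

// Sketch: enabling the experimental flash attention path and a quantized K cache.
// `model` is assumed to have been loaded elsewhere with llama_load_model_from_file().
struct llama_context * make_ctx(struct llama_model * model) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx      = 4096;
    cparams.flash_attn = true;           // [EXPERIMENTAL]
    cparams.type_k     = GGML_TYPE_Q8_0; // [EXPERIMENTAL] quantized K cache
    cparams.type_v     = GGML_TYPE_F16;  // V cache kept at f16
    return llama_new_context_with_model(model, cparams);
}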
@@ -759,6 +762,12 @@ extern "C" {
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
 
+    // Get the number of threads used for generation of a single token.
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+
+    // Get the number of threads used for prompt and batch processing (multiple tokens).
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+
     // Set whether to use causal attention or not
     // If set to true, the model will only attend to the past tokens
     LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
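A small sketch pairing the new getters with the existing setter, for example to drop to a single thread temporarily and then restore the previous configuration; the surrounding scenario is illustrative:

#include "llama.h"

// Sketch: save, change, and restore the thread configuration of a context.
void run_single_threaded_section(struct llama_context * ctx) {
    const uint32_t n_prev       = llama_n_threads(ctx);
    const uint32_t n_prev_batch = llama_n_threads_batch(ctx);

    llama_set_n_threads(ctx, 1, 1); // e.g. while the rest of the app needs the cores

    // ... do some low-priority decoding here ...

    llama_set_n_threads(ctx, n_prev, n_prev_batch);
}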
@@ -817,6 +826,9 @@ extern "C" {
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
     LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
 
+    // Identify if Token Id is a control token or a render-able token
+    LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
@@ -49,8 +49,14 @@ int main(void) {
         "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
         // Llama-3
         "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
-        // Phi-3
-        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}"
+        //Phi-3-mini
+        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+        //Phi-3-small
+        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
+        //Phi-3-medium
+        "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
+        //Phi-3-vision
+        "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}"
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B
@@ -79,8 +85,14 @@ int main(void) {
         "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
         // Llama 3
         "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
-        // Phi 3
-        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-mini
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-small
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-medium
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
+        //Phi-3-vision
+        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;
@@ -1515,90 +1515,50 @@ int main(int argc, const char ** argv) {
         }
 
         // flash_attn f32
-        {
-            srand(seed);
-            const int nargs = 3;
+        // TODO: adapt to ggml_flash_attn_ext() changes
+        //{
+        //    srand(seed);
+        //    const int nargs = 3;
 
-            int64_t ne2[4];
+        //    int64_t ne2[4];
 
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
+        //    get_random_dims(ne2, 4);
+        //    int64_t D = ne2[0];
+        //    int64_t N = ne2[1];
+        //    int64_t M = ne2[2] + N;
+        //    int64_t B = ne2[3];
 
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int max_nrep = (ndims >= 3) ? 2 : 1;
-                    for (int nrep = 1; nrep < max_nrep; ++nrep) {
-                        int64_t neq[4] = { D, N, B*nrep, ne[3] };
-                        int64_t nek[4] = { D, M, B, ne[3] };
-                        int64_t nev[4] = { M, D, B, ne[3] };
-                        if (ndims == 2) {
-                            neq[2] = 1; neq[3] = 1;
-                            nek[2] = 1; nek[3] = 1;
-                            nev[2] = 1; nev[3] = 1;
-                        } else if (ndims == 3) {
-                            neq[3] = 1;
-                            nek[3] = 1;
-                            nev[3] = 1;
-                        }
-                        x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                        x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                        x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                        ggml_set_param(ctx0, x[0]);
-                        ggml_set_param(ctx0, x[1]);
-                        ggml_set_param(ctx0, x[2]);
+        //    for (int masked = 0; masked <= 1; ++masked) {
+        //        for (int ndims = 2; ndims <= 4; ++ndims) {
+        //            int max_nrep = (ndims >= 3) ? 2 : 1;
+        //            for (int nrep = 1; nrep < max_nrep; ++nrep) {
+        //                int64_t neq[4] = { D, N, B*nrep, ne[3] };
+        //                int64_t nek[4] = { D, M, B, ne[3] };
+        //                int64_t nev[4] = { M, D, B, ne[3] };
+        //                if (ndims == 2) {
+        //                    neq[2] = 1; neq[3] = 1;
+        //                    nek[2] = 1; nek[3] = 1;
+        //                    nev[2] = 1; nev[3] = 1;
+        //                } else if (ndims == 3) {
+        //                    neq[3] = 1;
+        //                    nek[3] = 1;
+        //                    nev[3] = 1;
+        //                }
+        //                x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
+        //                x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
+        //                x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
+        //                ggml_set_param(ctx0, x[0]);
+        //                ggml_set_param(ctx0, x[1]);
+        //                ggml_set_param(ctx0, x[2]);
 
-                        struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
+        //                struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
 
-                        check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                    }
-                }
-            }
-        }
+        //                check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
+        //            }
+        //        }
+        //    }
+        //}
-        // flash_attn f16, not yet fully implemented
-        if(0)
-        {
-            srand(seed);
-            const int nargs = 3;
-
-            int64_t ne2[4];
-
-            get_random_dims(ne2, 4);
-            int64_t D = ne2[0];
-            int64_t N = ne2[1];
-            int64_t M = ne2[2] + N;
-            int64_t B = ne2[3];
-
-            for (int masked = 0; masked <= 1; ++masked) {
-                for (int ndims = 2; ndims <= 4; ++ndims) {
-                    int64_t neq[4] = { D, N, B, ne[3] };
-                    int64_t nek[4] = { D, M, B, ne[3] };
-                    int64_t nev[4] = { M, D, B, ne[3] };
-                    if (ndims == 2) {
-                        neq[2] = 1; neq[3] = 1;
-                        nek[2] = 1; nek[3] = 1;
-                        nev[2] = 1; nev[3] = 1;
-                    } else if (ndims == 3) {
-                        neq[3] = 1;
-                        nek[3] = 1;
-                        nev[3] = 1;
-                    }
-                    x[0] = get_random_tensor_f16(ctx0, ndims, neq, -0.1250f, 0.1250f);
-                    x[1] = get_random_tensor_f16(ctx0, ndims, nek, -0.1250f, 0.1250f);
-                    x[2] = get_random_tensor_f16(ctx0, ndims, nev, -0.1250f, 0.1250f);
-                    ggml_set_param(ctx0, x[0]);
-                    ggml_set_param(ctx0, x[1]);
-                    ggml_set_param(ctx0, x[2]);
-
-                    struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
-
-                    check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
-                }
-            }
-        }
 
         ggml_free(ctx0);
     }