Compare commits


4 Commits

Author           SHA1        Message                                                      Date
Georgi Gerganov  ab1b75166f  Merge branch 'master' into gg/ggml_scale (ggml-ci)           2023-12-21 22:35:11 +02:00
Georgi Gerganov  b784f881c3  tests : fix test-grad0 (ggml-ci)                             2023-12-21 21:34:31 +02:00
Georgi Gerganov  36c3f41f66  ggml : fix CPU implementation                                2023-12-21 21:02:23 +02:00
Georgi Gerganov  199f6bdc46  ggml : change ggml_scale to take a float instead of tensor  2023-12-21 20:50:24 +02:00
10 changed files with 23 additions and 87 deletions
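
For orientation, the central change on this branch is the `ggml_scale()` signature (commit 199f6bdc46: "ggml : change ggml_scale to take a float instead of tensor"). The sketch below contrasts the old and new call sites; the context setup and tensor names are illustrative only and are not taken from the diffs that follow.

```c
// Minimal sketch of the ggml_scale() signature change; the surrounding
// context/tensor setup is illustrative, not part of the compared diffs.
#include "ggml.h"

int main(void) {
    struct ggml_init_params iparams = {
        .mem_size   = 16 * 1024 * 1024,   // small scratch arena for this example
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(iparams);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);

    // before this branch: the scale factor had to be wrapped in a 1-element tensor
    // struct ggml_tensor * y = ggml_scale(ctx, x, ggml_new_f32(ctx, 0.5f));

    // after this branch: the scale factor is passed directly as a float
    struct ggml_tensor * y = ggml_scale(ctx, x, 0.5f);

    (void) y;
    ggml_free(ctx);
    return 0;
}
```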

.github/workflows/docker.yml

@@ -52,36 +52,6 @@ jobs:
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}
# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: false
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Determine tag name
id: tag
shell: bash
run: |
BUILD_NUMBER="$(git rev-list --count HEAD)"
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
else
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
fi
- name: Build and push Docker image (versioned)
if: github.event_name == 'push'
uses: docker/build-push-action@v4
@@ -89,7 +59,7 @@ jobs:
context: .
push: true
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
- name: Build and push Docker image (tagged)
@@ -98,5 +68,5 @@ jobs:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}" , "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

Makefile

@@ -452,9 +452,6 @@ ifdef LLAMA_HIPBLAS
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
@@ -609,7 +606,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)

README.md

@@ -123,7 +123,6 @@ as the main playground for developing new features for the [ggml](https://github
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
**UI:**
@@ -440,13 +439,7 @@ Building the program with BLAS support may lead to some performance improvements
&& cmake --build build -- -j 16
```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
- Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
```
However, this hurts performance for non-integrated GPUs.
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash
set PATH=%HIP_PATH%\bin;%PATH%

examples/gguf/CMakeLists.txt

@@ -1,5 +1,5 @@
set(TARGET gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/gguf/gguf.cpp

@@ -1,4 +1,5 @@
#include "ggml.h"
#include "llama.h"
#include <cstdio>
#include <cinttypes>

ggml.c

@@ -15335,8 +15335,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
const int n_past = ((int32_t *) tensor->op_params)[0];
src0->grad =
ggml_add_or_set(ctx, src0->grad,
/* ggml_diag_mask_inf_impl() shouldn't be here */
/* ref: https://github.com/ggerganov/llama.cpp/pull/4203#discussion_r1412377992 */
ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false),
zero_table);
}

ggml.h

@@ -484,8 +484,7 @@ extern "C" {
enum ggml_log_level {
GGML_LOG_LEVEL_ERROR = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_INFO = 4,
GGML_LOG_LEVEL_DEBUG = 5
GGML_LOG_LEVEL_INFO = 4
};
// ggml object

gguf-py/README.md

@@ -3,7 +3,7 @@
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
(GGML Universal File) format.
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py)
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
as an example for its usage.
## Installation

llama.cpp

@@ -2372,8 +2372,7 @@ struct llama_model_loader {
}
}
// Returns false if cancelled by progress_callback
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
size_t size_data = 0;
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@@ -2405,9 +2404,7 @@ struct llama_model_loader {
GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
if (progress_callback) {
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
return false;
}
progress_callback((float) size_done / size_data, progress_callback_user_data);
}
const size_t offs = file_offset(ggml_get_name(cur));
@@ -2469,11 +2466,8 @@ struct llama_model_loader {
}
if (progress_callback) {
// Even though the model is done loading, we still honor
// cancellation since we need to free allocations.
return progress_callback(1.0f, progress_callback_user_data);
progress_callback(1.0f, progress_callback_user_data);
}
return true;
}
};
@@ -3050,8 +3044,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
}
// Returns false if cancelled by progress_callback
static bool llm_load_tensors(
static void llm_load_tensors(
llama_model_loader & ml,
llama_model & model,
int n_gpu_layers,
@@ -3729,20 +3722,16 @@ static bool llm_load_tensors(
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
}
if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
return false;
}
ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
model.mapping = std::move(ml.mapping);
// loading time will be recalculated after the first eval, so
// we take page faults deferred by mmap() into consideration
model.t_load_us = ggml_time_us() - model.t_start_us;
return true;
}
// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
try {
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
@@ -3760,21 +3749,19 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
if (params.vocab_only) {
LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
return 0;
return true;
}
if (!llm_load_tensors(
llm_load_tensors(
ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data
)) {
return -2;
}
);
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
return -1;
return false;
}
return 0;
return true;
}
//
@@ -9154,18 +9141,11 @@ struct llama_model * llama_load_model_from_file(
LLAMA_LOG_INFO("\n");
}
}
return true;
};
}
int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
if (status == -1) {
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
} else if (status == -2) {
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
}
if (!llama_model_load(path_model, *model, params)) {
LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
delete model;
return nullptr;
}

llama.h

@@ -127,7 +127,7 @@ extern "C" {
bool sorted;
} llama_token_data_array;
typedef bool (*llama_progress_callback)(float progress, void *ctx);
typedef void (*llama_progress_callback)(float progress, void *ctx);
// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
@@ -180,9 +180,7 @@ extern "C" {
int32_t main_gpu; // the GPU that is used for scratch and small tensors
const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted.
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
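
As a usage reference for the hunk above, here is a minimal sketch of a cancelling progress callback written against the `bool`-returning `llama_progress_callback` variant (returning `false` aborts the load, as the comment states). The model path, the cancellation flag, and the surrounding `main()` are illustrative assumptions, not part of the diff.

```c
// Sketch: a progress callback that can cancel model loading.
// Assumes the bool-returning llama_progress_callback shown above.
#include <stdbool.h>
#include <stdio.h>

#include "llama.h"

static volatile bool g_cancel = false; // e.g. flipped by a SIGINT handler

static bool on_progress(float progress, void * ctx) {
    (void) ctx;
    fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
    return !g_cancel; // returning false aborts the load immediately
}

int main(void) {
    struct llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback           = on_progress;
    mparams.progress_callback_user_data = NULL;

    // "model.gguf" is a placeholder path for this sketch
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "\nload failed or was cancelled\n");
        return 1;
    }

    llama_free_model(model);
    return 0;
}
```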