Compare commits


2 Commits

Author SHA1 Message Date
Brian    284870c868    Merge branch 'master' into fix-convert-modelname    2024-05-14 15:05:49 +10:00
Jared Van Bortel    941de11759    convert : get general.name from model dir, not its parent    2024-02-20 11:17:16 -05:00
29 changed files with 149 additions and 1660 deletions

View File

@@ -340,36 +340,6 @@ jobs:
cd build
ctest -L main --verbose
ubuntu-latest-cmake-rpc:
runs-on: ubuntu-latest
continue-on-error: true
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake -DLLAMA_RPC=ON ..
cmake --build . --config Release -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose
ubuntu-22-cmake-vulkan:
runs-on: ubuntu-22.04
@@ -693,8 +663,6 @@ jobs:
strategy:
matrix:
include:
- build: 'rpc'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'noavx'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2'

View File

@@ -123,7 +123,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
@@ -495,17 +494,6 @@ if (LLAMA_MPI)
endif()
endif()
if (LLAMA_RPC)
add_compile_definitions(GGML_USE_RPC)
if (WIN32)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
endif()
set(GGML_HEADERS_RPC ggml-rpc.h)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
@@ -1188,7 +1176,6 @@ add_library(ggml OBJECT
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}

View File

@@ -1060,14 +1060,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.rpc_servers = argv[i];
return true;
}
if (arg == "--no-mmap") {
params.use_mmap = false;
return true;
@@ -1565,7 +1557,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
}
printf(" --rpc SERVERS comma separated list of RPC servers\n");
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
printf(" -gan N, --grp-attn-n N\n");
@@ -1839,7 +1830,6 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;

View File

@@ -82,7 +82,6 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;

View File

@@ -1109,7 +1109,7 @@ class OutputFile:
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = str(params.path_model.parent).split("/")[-1]
name = params.path_model.name
elif params.n_ctx == 4096:
# Heuristic detection of LLaMA v2 model
name = "LLaMA v2"

View File

@@ -49,7 +49,4 @@ else()
add_subdirectory(server)
endif()
add_subdirectory(export-lora)
if (LLAMA_RPC)
add_subdirectory(rpc)
endif()
endif()

View File

@@ -7,6 +7,8 @@ android {
namespace = "com.example.llama"
compileSdk = 34
ndkVersion = "26.1.10909125"
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
@@ -18,6 +20,17 @@ android {
vectorDrawables {
useSupportLibrary = true
}
ndk {
// Add NDK properties if wanted, e.g.
// abiFilters += listOf("arm64-v8a")
}
externalNativeBuild {
cmake {
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf()
arguments += listOf()
}
}
}
buildTypes {
@@ -42,6 +55,17 @@ android {
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
externalNativeBuild {
cmake {
path = file("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
}
dependencies {
@@ -54,7 +78,6 @@ dependencies {
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
implementation(project(":llama"))
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")

View File

@@ -37,7 +37,7 @@ FetchContent_MakeAvailable(llama)
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this

View File

@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
llama_model_params model_params = llama_model_default_params();
auto path_to_model = env->GetStringUTFChars(filename, 0);
@@ -101,13 +101,13 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
auto model = reinterpret_cast<llama_model *>(jmodel);
if (!model) {
@@ -139,25 +139,25 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
llama_free(reinterpret_cast<llama_context *>(context));
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
llama_backend_free();
}
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
llama_log_set(log_callback, NULL);
}
extern "C"
JNIEXPORT jstring JNICALL
Java_android_llama_cpp_LLamaAndroid_bench_1model(
Java_com_example_llama_Llm_bench_1model(
JNIEnv *env,
jobject,
jlong context_pointer,
@@ -271,13 +271,13 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
@@ -313,19 +313,19 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
llama_backend_init();
}
extern "C"
JNIEXPORT jstring JNICALL
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
return env->NewStringUTF(llama_print_system_info());
}
extern "C"
JNIEXPORT jint JNICALL
Java_android_llama_cpp_LLamaAndroid_completion_1init(
Java_com_example_llama_Llm_completion_1init(
JNIEnv *env,
jobject,
jlong context_pointer,
@@ -376,7 +376,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
extern "C"
JNIEXPORT jstring JNICALL
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
Java_com_example_llama_Llm_completion_1loop(
JNIEnv * env,
jobject,
jlong context_pointer,
@@ -438,6 +438,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}

View File

@@ -1,4 +1,4 @@
package android.llama.cpp
package com.example.llama
import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import kotlin.concurrent.thread
class LLamaAndroid {
class Llm {
private val tag: String? = this::class.simpleName
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@@ -165,8 +165,8 @@ class LLamaAndroid {
}
// Enforce only one instance of Llm.
private val _instance: LLamaAndroid = LLamaAndroid()
private val _instance: Llm = Llm()
fun instance(): LLamaAndroid = _instance
fun instance(): Llm = _instance
}
}

View File

@@ -1,6 +1,5 @@
package com.example.llama
import android.llama.cpp.LLamaAndroid
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
@@ -10,7 +9,7 @@ import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
companion object {
@JvmStatic
private val NanosPerSecond = 1_000_000_000.0
@@ -29,7 +28,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
viewModelScope.launch {
try {
llamaAndroid.unload()
llm.unload()
} catch (exc: IllegalStateException) {
messages += exc.message!!
}
@@ -45,7 +44,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
messages += ""
viewModelScope.launch {
llamaAndroid.send(text)
llm.send(text)
.catch {
Log.e(tag, "send() failed", it)
messages += it.message!!
@@ -58,7 +57,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
viewModelScope.launch {
try {
val start = System.nanoTime()
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
val warmupResult = llm.bench(pp, tg, pl, nr)
val end = System.nanoTime()
messages += warmupResult
@@ -71,7 +70,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
return@launch
}
messages += llamaAndroid.bench(512, 128, 1, 3)
messages += llm.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc)
messages += exc.message!!
@@ -82,7 +81,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
fun load(pathToModel: String) {
viewModelScope.launch {
try {
llamaAndroid.load(pathToModel)
llm.load(pathToModel)
messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc)

View File

@@ -2,5 +2,4 @@
plugins {
id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
id("com.android.library") version "8.2.0" apply false
}

View File

@@ -1 +0,0 @@
/build

View File

@@ -1,21 +0,0 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile

View File

@@ -1,24 +0,0 @@
package android.llama.cpp
import androidx.test.platform.app.InstrumentationRegistry
import androidx.test.ext.junit.runners.AndroidJUnit4
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.Assert.*
/**
* Instrumented test, which will execute on an Android device.
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
@RunWith(AndroidJUnit4::class)
class ExampleInstrumentedTest {
@Test
fun useAppContext() {
// Context of the app under test.
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
assertEquals("android.llama.cpp.test", appContext.packageName)
}
}

View File

@@ -1,4 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
</manifest>

View File

@@ -1,49 +0,0 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
include(FetchContent)
FetchContent_Declare(
llama
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
GIT_TAG master
)
# Also provides "common"
FetchContent_MakeAvailable(llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)

View File

@@ -1,17 +0,0 @@
package android.llama.cpp
import org.junit.Test
import org.junit.Assert.*
/**
* Example local unit test, which will execute on the development machine (host).
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
class ExampleUnitTest {
@Test
fun addition_isCorrect() {
assertEquals(4, 2 + 2)
}
}

View File

@@ -15,4 +15,3 @@ dependencyResolutionManagement {
rootProject.name = "LlamaAndroid"
include(":app")
include(":llama")

View File

@@ -1,2 +0,0 @@
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)

View File

@@ -1,74 +0,0 @@
## Overview
The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:
```mermaid
flowchart TD
rpcb---|TCP|srva
rpcb---|TCP|srvb
rpcb-.-|TCP|srvn
subgraph hostn[Host N]
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
end
subgraph hostb[Host B]
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
end
subgraph hosta[Host A]
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
end
subgraph host[Main Host]
ggml[llama.cpp]---rpcb[RPC backend]
end
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
```
Each host can run a different backend, e.g. one with CUDA and another with Metal.
You can also run multiple `rpc-server` instances on the same host, each with a different backend.
## Usage
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
For example, to build the CUDA backend with RPC support:
```bash
mkdir build-rpc-cuda
cd build-rpc-cuda
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
cmake --build . --config Release
```
Then, start the `rpc-server` with the backend:
```bash
$ bin/rpc-server 0.0.0.0 50052
create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
Starting RPC server on 0.0.0.0:50052
```
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
```
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
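For instance, a minimal sketch (device indices and ports are illustrative) of running one `rpc-server` per GPU on a single machine:
```bash
# Illustrative only: one rpc-server per CUDA device, each on its own port
CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052 &
CUDA_VISIBLE_DEVICES=1 bin/rpc-server 0.0.0.0 50053 &
```
Both endpoints can then be listed in the `--rpc` option on the main host, as shown below.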
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
```bash
mkdir build-rpc
cd build-rpc
cmake .. -DLLAMA_RPC=ON
cmake --build . --config Release
```
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
```bash
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
```

View File

@@ -1,70 +0,0 @@
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include "ggml-rpc.h"
#include <string>
#include <stdio.h>
static ggml_backend_t create_backend() {
ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
backend = ggml_backend_cuda_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#elif GGML_USE_METAL
fprintf(stderr, "%s: using Metal backend\n", __func__);
backend = ggml_backend_metal_init();
if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
#endif
// if there aren't GPU Backends fallback to CPU backend
if (!backend) {
fprintf(stderr, "%s: using CPU backend\n", __func__);
backend = ggml_backend_cpu_init();
}
return backend;
}
static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
#ifdef GGML_USE_CUDA
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#else
// TODO: implement for other backends
*free_mem = 1;
*total_mem = 1;
#endif
}
int main(int argc, char * argv[]) {
if (argc < 3) {
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
return 1;
}
const char * host = argv[1];
int port = std::stoi(argv[2]);
if (port <= 0 || port > 65535) {
fprintf(stderr, "Invalid port number: %d\n", port);
return 1;
}
ggml_backend_t backend = create_backend();
if (!backend) {
fprintf(stderr, "Failed to create backend\n");
return 1;
}
printf("Starting RPC server on %s:%d\n", host, port);
size_t free_mem, total_mem;
get_backend_memory(&free_mem, &total_mem);
std::string endpoint = std::string(host) + ":" + std::to_string(port);
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend);
return 0;
}

View File

@@ -48,7 +48,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- `--path`: Path from which to serve static files. Default: disabled
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
- `--embedding`: Enable embedding extraction. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
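A minimal request sketch against the OAI-compatible `/v1/embeddings` endpoint mentioned above, assuming the server listens on the default `localhost:8080` and was started with embeddings enabled:
```bash
# Illustrative request; adjust host, port and input as needed
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"input": "Hello, world"}'
```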

File diff suppressed because it is too large

View File

@@ -1,24 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_RPC_MAX_SERVERS 16
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
#ifdef __cplusplus
}
#endif

llama.cpp (310 changed lines)
View File

@@ -7,10 +7,6 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"
#ifdef GGML_USE_RPC
# include "ggml-rpc.h"
#endif
#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
@@ -1689,6 +1685,91 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
GGML_UNUSED(host_buffer);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_METAL
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_cpu(true);
}
return buft;
GGML_UNUSED(gpu);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
}
#endif
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(fallback_gpu);
}
return buft;
GGML_UNUSED(tensor_split);
}
static size_t llama_get_device_count() {
#if defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
return 1;
#endif
}
static size_t llama_get_device_memory(int device) {
#if defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#else
return 1;
GGML_UNUSED(device);
#endif
}
//
// globals
//
@@ -2129,8 +2210,6 @@ struct llama_model {
int main_gpu;
int n_gpu_layers;
std::vector<std::string> rpc_servers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
@@ -2274,104 +2353,6 @@ struct llama_context {
#endif
};
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_RPC
std::string endpoint = model.rpc_servers[gpu];
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
#elif defined(GGML_USE_METAL)
buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
buft = ggml_backend_cuda_buffer_type(gpu);
#elif defined(GGML_USE_VULKAN)
buft = ggml_backend_vk_buffer_type(gpu);
#elif defined(GGML_USE_SYCL)
buft = ggml_backend_sycl_buffer_type(gpu);
#elif defined(GGML_USE_CLBLAST)
buft = ggml_backend_opencl_buffer_type();
#elif defined(GGML_USE_KOMPUTE)
buft = ggml_backend_kompute_buffer_type(gpu);
if (buft == nullptr) {
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_cpu(true);
}
return buft;
GGML_UNUSED(model);
GGML_UNUSED(gpu);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
ggml_backend_buffer_type_t buft = nullptr;
#ifdef GGML_USE_CUDA
if (ggml_backend_cuda_get_device_count() > 1) {
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
}
#endif
#ifdef GGML_USE_SYCL
if (ggml_backend_sycl_get_device_count() > 1) {
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
}
#endif
if (buft == nullptr) {
buft = llama_default_buffer_type_offload(model, fallback_gpu);
}
return buft;
GGML_UNUSED(tensor_split);
}
static size_t llama_get_device_count(const llama_model & model) {
#if defined(GGML_USE_RPC)
return model.rpc_servers.size();
#elif defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
return ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
return ggml_backend_vk_get_device_count();
#else
return 1;
#endif
GGML_UNUSED(model);
}
static size_t llama_get_device_memory(const llama_model & model, int device) {
#if defined(GGML_USE_RPC)
size_t total;
size_t free;
std::string endpoint = model.rpc_servers[device];
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
return free;
#elif defined(GGML_USE_CUDA)
size_t total;
size_t free;
ggml_backend_cuda_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_SYCL)
size_t total;
size_t free;
ggml_backend_sycl_get_device_memory(device, &free, &total);
return free;
#elif defined(GGML_USE_VULKAN)
size_t total;
size_t free;
ggml_backend_vk_get_device_memory(device, &free, &total);
return free;
#else
return 1;
#endif
GGML_UNUSED(model);
GGML_UNUSED(device);
}
//
// kv cache helpers
//
@@ -4810,13 +4791,13 @@ static bool llm_load_tensors(
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
// calculate the split points
int device_count = llama_get_device_count(model);
int device_count = llama_get_device_count();
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
std::vector<float> splits(device_count);
if (all_zero) {
// default split, by free memory
for (int i = 0; i < device_count; ++i) {
splits[i] = llama_get_device_memory(model, i);
splits[i] = llama_get_device_memory(i);
}
} else {
std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4836,35 +4817,35 @@ static bool llm_load_tensors(
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
}
// assign the output layer
if (n_gpu_layers > n_layer) {
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
}
} else {
ggml_backend_buffer_type_t split_buft;
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
} else {
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
split_buft = llama_default_buffer_type_offload(model, main_gpu);
split_buft = llama_default_buffer_type_offload(main_gpu);
}
// assign the repeating layers
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
model.buft_layer[i] = {
split_buft,
llama_default_buffer_type_offload(model, main_gpu)
llama_default_buffer_type_offload(main_gpu)
};
}
// assign the output layer
if (n_gpu_layers > n_layer) {
model.buft_output = {
split_buft,
llama_default_buffer_type_offload(model, main_gpu)
llama_default_buffer_type_offload(main_gpu)
};
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
@@ -13201,58 +13182,6 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
return rejects;
}
static bool llama_grammar_detect_left_recursion(
const std::vector<std::vector<llama_grammar_element>> & rules,
size_t rule_index,
std::vector<bool> * rules_visited,
std::vector<bool> * rules_in_progress,
std::vector<bool> * rules_may_be_empty) {
if ((*rules_in_progress)[rule_index]) {
return true;
}
(*rules_in_progress)[rule_index] = true;
const std::vector<llama_grammar_element> & rule = rules[rule_index];
// First check if the rule might produce the empty string. This could be done combined with the second
// step but it's more readable as two steps.
bool at_rule_start = true;
for (size_t i = 0; i < rule.size(); i++) {
if (llama_grammar_is_end_of_sequence(&rule[i])) {
if (at_rule_start) {
(*rules_may_be_empty)[rule_index] = true;
break;
}
at_rule_start = true;
} else {
at_rule_start = false;
}
}
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
// be empty)
bool recurse_into_nonterminal = true;
for (size_t i = 0; i < rule.size(); i++) {
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
return true;
}
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
recurse_into_nonterminal = false;
}
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
recurse_into_nonterminal = true;
} else {
recurse_into_nonterminal = false;
}
}
(*rules_in_progress)[rule_index] = false;
(*rules_visited)[rule_index] = true;
return false;
}
//
// grammar - external
//
@@ -13272,19 +13201,6 @@ struct llama_grammar * llama_grammar_init(
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
}
// Check for left recursion
std::vector<bool> rules_visited(n_rules);
std::vector<bool> rules_in_progress(n_rules);
std::vector<bool> rules_may_be_empty(n_rules);
for (size_t i = 0; i < n_rules; i++) {
if (rules_visited[i]) {
continue;
}
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
}
}
// loop over alternates of start rule to build initial stacks
std::vector<std::vector<const llama_grammar_element *>> stacks;
pos = vec_rules[start_rule_index].data();
@@ -13307,9 +13223,6 @@ struct llama_grammar * llama_grammar_init(
}
} while (true);
// Important: vec_rules has to be moved here, not copied, because stacks contains
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
// then the pointers would be invalidated when the local vec_rules goes out of scope.
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
}
@@ -15409,7 +15322,6 @@ struct llama_model_params llama_model_default_params() {
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
/*.rpc_servers =*/ nullptr,
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.kv_overrides =*/ nullptr,
@@ -15480,9 +15392,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
}
size_t llama_max_devices(void) {
#if defined(GGML_USE_RPC)
return GGML_RPC_MAX_SERVERS;
#elif defined(GGML_USE_METAL)
#if defined(GGML_USE_METAL)
return 1;
#elif defined(GGML_USE_CUDA)
return GGML_CUDA_MAX_DEVICES;
@@ -15505,7 +15415,7 @@ bool llama_supports_mlock(void) {
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
@@ -15568,17 +15478,7 @@ struct llama_model * llama_load_model_from_file(
return true;
};
}
if (params.rpc_servers != nullptr) {
// split the servers set them into model->rpc_servers
std::string servers(params.rpc_servers);
size_t pos = 0;
while ((pos = servers.find(",")) != std::string::npos) {
std::string server = servers.substr(0, pos);
model->rpc_servers.push_back(server);
servers.erase(0, pos + 1);
}
model->rpc_servers.push_back(servers);
}
int status = llama_model_load(path_model, *model, params);
GGML_ASSERT(status <= 0);
if (status < 0) {
@@ -15725,17 +15625,7 @@ struct llama_context * llama_new_context_with_model(
if (!hparams.vocab_only) {
// initialize backends
#if defined(GGML_USE_RPC)
for (auto & server : model->rpc_servers) {
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
#elif defined(GGML_USE_METAL)
#ifdef GGML_USE_METAL
if (model->n_gpu_layers > 0) {
ctx->backend_metal = ggml_backend_metal_init();
if (ctx->backend_metal == nullptr) {
@@ -15891,11 +15781,7 @@ struct llama_context * llama_new_context_with_model(
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
bool pipeline_parallel =
llama_get_device_count(*model) > 1 &&
model->n_gpu_layers > (int)model->hparams.n_layer &&
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
params.offload_kqv;
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
#ifndef GGML_USE_CUDA
// pipeline parallelism requires support for async compute and events
// currently this is only implemented in the CUDA backend

View File

@@ -242,9 +242,6 @@ extern "C" {
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;
// comma separated list of RPC servers to use for offloading
const char * rpc_servers;
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted.

View File

@@ -28,19 +28,6 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
return grammar;
}
static bool test_build_grammar_fails(const std::string & grammar_str) {
fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
bool grammar_fails = false;
try {
build_grammar(grammar_str);
fprintf(stderr, " ❌ Expected build failure, but succeeded\n");
} catch (const std::exception & err) {
grammar_fails = true;
fprintf(stdout, " ✅︎\n");
}
return grammar_fails;
}
static bool match_string(const std::string & input, llama_grammar* grammar) {
auto decoded = decode_utf8(input, {});
@@ -333,38 +320,6 @@ number ::= [0-9]+)""";
fprintf(stderr, " ✅︎ Passed\n");
}
static void test_failure_left_recursion() {
fprintf(stderr, "⚫ Testing left recursion detection:\n");
// Test simple left recursion detection
const std::string simple_str = R"""(root ::= "a" | root "a")""";
assert(test_build_grammar_fails(simple_str));
// Test more complicated left recursion detection
const std::string medium_str = R"""(
root ::= asdf
asdf ::= "a" | asdf "a"
)""";
assert(test_build_grammar_fails(medium_str));
// Test even more complicated left recursion detection
const std::string hard_str = R"""(
root ::= asdf
asdf ::= "a" | foo "b"
foo ::= "c" | asdf "d" | "e")""";
assert(test_build_grammar_fails(hard_str));
// Test yet even more complicated left recursion detection
const std::string hardest_str = R"""(
root ::= asdf
asdf ::= "a" | foo "b"
foo ::= "c" | empty asdf "d" | "e"
empty ::= "blah" | )""";
assert(test_build_grammar_fails(hardest_str));
fprintf(stderr, " ✅︎ Passed\n");
}
int main() {
fprintf(stdout, "Running grammar integration tests...\n");
test_simple_grammar();
@@ -372,7 +327,6 @@ int main() {
test_quantifiers();
test_failure_missing_root();
test_failure_missing_reference();
test_failure_left_recursion();
fprintf(stdout, "All tests passed.\n");
return 0;
}