mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00
Compare commits
2 Commits
b2877
...
fix-conver
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
284870c868 | ||
|
|
941de11759 |
32
.github/workflows/build.yml
vendored
32
.github/workflows/build.yml
vendored
@@ -340,36 +340,6 @@ jobs:
|
||||
cd build
|
||||
ctest -L main --verbose
|
||||
|
||||
ubuntu-latest-cmake-rpc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_RPC=ON ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest -L main --verbose
|
||||
|
||||
ubuntu-22-cmake-vulkan:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
@@ -693,8 +663,6 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'rpc'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'noavx'
|
||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
|
||||
@@ -123,7 +123,6 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
||||
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
|
||||
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_RPC "llama: use RPC" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
|
||||
@@ -495,17 +494,6 @@ if (LLAMA_MPI)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_RPC)
|
||||
add_compile_definitions(GGML_USE_RPC)
|
||||
|
||||
if (WIN32)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ws2_32)
|
||||
endif()
|
||||
|
||||
set(GGML_HEADERS_RPC ggml-rpc.h)
|
||||
set(GGML_SOURCES_RPC ggml-rpc.cpp)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast)
|
||||
if (CLBlast_FOUND)
|
||||
@@ -1188,7 +1176,6 @@ add_library(ggml OBJECT
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
|
||||
${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE}
|
||||
|
||||
@@ -1060,14 +1060,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
#endif // GGML_USE_CUDA_SYCL_VULKAN
|
||||
return true;
|
||||
}
|
||||
if (arg == "--rpc") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
return true;
|
||||
}
|
||||
params.rpc_servers = argv[i];
|
||||
return true;
|
||||
}
|
||||
if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
return true;
|
||||
@@ -1565,7 +1557,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
}
|
||||
printf(" --rpc SERVERS comma separated list of RPC servers\n");
|
||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||
printf(" -gan N, --grp-attn-n N\n");
|
||||
@@ -1839,7 +1830,6 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
||||
if (params.n_gpu_layers != -1) {
|
||||
mparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
mparams.rpc_servers = params.rpc_servers.c_str();
|
||||
mparams.main_gpu = params.main_gpu;
|
||||
mparams.split_mode = params.split_mode;
|
||||
mparams.tensor_split = params.tensor_split;
|
||||
|
||||
@@ -82,7 +82,6 @@ struct gpt_params {
|
||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
float defrag_thold = -1.0f; // KV cache defragmentation threshold
|
||||
std::string rpc_servers = ""; // comma separated list of RPC servers
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||
void * cb_eval_user_data = nullptr;
|
||||
|
||||
@@ -1109,7 +1109,7 @@ class OutputFile:
|
||||
if metadata is not None and metadata.name is not None:
|
||||
name = metadata.name
|
||||
elif params.path_model is not None:
|
||||
name = str(params.path_model.parent).split("/")[-1]
|
||||
name = params.path_model.name
|
||||
elif params.n_ctx == 4096:
|
||||
# Heuristic detection of LLaMA v2 model
|
||||
name = "LLaMA v2"
|
||||
|
||||
@@ -49,7 +49,4 @@ else()
|
||||
add_subdirectory(server)
|
||||
endif()
|
||||
add_subdirectory(export-lora)
|
||||
if (LLAMA_RPC)
|
||||
add_subdirectory(rpc)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -7,6 +7,8 @@ android {
|
||||
namespace = "com.example.llama"
|
||||
compileSdk = 34
|
||||
|
||||
ndkVersion = "26.1.10909125"
|
||||
|
||||
defaultConfig {
|
||||
applicationId = "com.example.llama"
|
||||
minSdk = 33
|
||||
@@ -18,6 +20,17 @@ android {
|
||||
vectorDrawables {
|
||||
useSupportLibrary = true
|
||||
}
|
||||
ndk {
|
||||
// Add NDK properties if wanted, e.g.
|
||||
// abiFilters += listOf("arm64-v8a")
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
@@ -42,6 +55,17 @@ android {
|
||||
composeOptions {
|
||||
kotlinCompilerExtensionVersion = "1.5.1"
|
||||
}
|
||||
packaging {
|
||||
resources {
|
||||
excludes += "/META-INF/{AL2.0,LGPL2.1}"
|
||||
}
|
||||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path = file("src/main/cpp/CMakeLists.txt")
|
||||
version = "3.22.1"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
@@ -54,7 +78,6 @@ dependencies {
|
||||
implementation("androidx.compose.ui:ui-graphics")
|
||||
implementation("androidx.compose.ui:ui-tooling-preview")
|
||||
implementation("androidx.compose.material3:material3")
|
||||
implementation(project(":llama"))
|
||||
testImplementation("junit:junit:4.13.2")
|
||||
androidTestImplementation("androidx.test.ext:junit:1.1.5")
|
||||
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
|
||||
|
||||
@@ -37,7 +37,7 @@ FetchContent_MakeAvailable(llama)
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
auto path_to_model = env->GetStringUTFChars(filename, 0);
|
||||
@@ -101,13 +101,13 @@ Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring fi
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
|
||||
llama_free_model(reinterpret_cast<llama_model *>(model));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
|
||||
auto model = reinterpret_cast<llama_model *>(jmodel);
|
||||
|
||||
if (!model) {
|
||||
@@ -139,25 +139,25 @@ Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmo
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
|
||||
llama_free(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
|
||||
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
|
||||
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
|
||||
llama_log_set(log_callback, NULL);
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||
Java_com_example_llama_Llm_bench_1model(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -271,13 +271,13 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
|
||||
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jlong JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
|
||||
|
||||
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
|
||||
|
||||
@@ -313,19 +313,19 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
|
||||
llama_backend_init();
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
|
||||
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
|
||||
return env->NewStringUTF(llama_print_system_info());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jint JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
Java_com_example_llama_Llm_completion_1init(
|
||||
JNIEnv *env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -376,7 +376,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT jstring JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
||||
Java_com_example_llama_Llm_completion_1loop(
|
||||
JNIEnv * env,
|
||||
jobject,
|
||||
jlong context_pointer,
|
||||
@@ -438,6 +438,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
|
||||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
|
||||
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package android.llama.cpp
|
||||
package com.example.llama
|
||||
|
||||
import android.util.Log
|
||||
import kotlinx.coroutines.CoroutineDispatcher
|
||||
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
|
||||
import java.util.concurrent.Executors
|
||||
import kotlin.concurrent.thread
|
||||
|
||||
class LLamaAndroid {
|
||||
class Llm {
|
||||
private val tag: String? = this::class.simpleName
|
||||
|
||||
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
|
||||
@@ -165,8 +165,8 @@ class LLamaAndroid {
|
||||
}
|
||||
|
||||
// Enforce only one instance of Llm.
|
||||
private val _instance: LLamaAndroid = LLamaAndroid()
|
||||
private val _instance: Llm = Llm()
|
||||
|
||||
fun instance(): LLamaAndroid = _instance
|
||||
fun instance(): Llm = _instance
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,5 @@
|
||||
package com.example.llama
|
||||
|
||||
import android.llama.cpp.LLamaAndroid
|
||||
import android.util.Log
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
@@ -10,7 +9,7 @@ import androidx.lifecycle.viewModelScope
|
||||
import kotlinx.coroutines.flow.catch
|
||||
import kotlinx.coroutines.launch
|
||||
|
||||
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
|
||||
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
|
||||
companion object {
|
||||
@JvmStatic
|
||||
private val NanosPerSecond = 1_000_000_000.0
|
||||
@@ -29,7 +28,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
|
||||
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llamaAndroid.unload()
|
||||
llm.unload()
|
||||
} catch (exc: IllegalStateException) {
|
||||
messages += exc.message!!
|
||||
}
|
||||
@@ -45,7 +44,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
|
||||
messages += ""
|
||||
|
||||
viewModelScope.launch {
|
||||
llamaAndroid.send(text)
|
||||
llm.send(text)
|
||||
.catch {
|
||||
Log.e(tag, "send() failed", it)
|
||||
messages += it.message!!
|
||||
@@ -58,7 +57,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
val start = System.nanoTime()
|
||||
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
|
||||
val warmupResult = llm.bench(pp, tg, pl, nr)
|
||||
val end = System.nanoTime()
|
||||
|
||||
messages += warmupResult
|
||||
@@ -71,7 +70,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
|
||||
return@launch
|
||||
}
|
||||
|
||||
messages += llamaAndroid.bench(512, 128, 1, 3)
|
||||
messages += llm.bench(512, 128, 1, 3)
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "bench() failed", exc)
|
||||
messages += exc.message!!
|
||||
@@ -82,7 +81,7 @@ class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instan
|
||||
fun load(pathToModel: String) {
|
||||
viewModelScope.launch {
|
||||
try {
|
||||
llamaAndroid.load(pathToModel)
|
||||
llm.load(pathToModel)
|
||||
messages += "Loaded $pathToModel"
|
||||
} catch (exc: IllegalStateException) {
|
||||
Log.e(tag, "load() failed", exc)
|
||||
|
||||
@@ -2,5 +2,4 @@
|
||||
plugins {
|
||||
id("com.android.application") version "8.2.0" apply false
|
||||
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
|
||||
id("com.android.library") version "8.2.0" apply false
|
||||
}
|
||||
|
||||
1
examples/llama.android/llama/.gitignore
vendored
1
examples/llama.android/llama/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
/build
|
||||
21
examples/llama.android/llama/proguard-rules.pro
vendored
21
examples/llama.android/llama/proguard-rules.pro
vendored
@@ -1,21 +0,0 @@
|
||||
# Add project specific ProGuard rules here.
|
||||
# You can control the set of applied configuration files using the
|
||||
# proguardFiles setting in build.gradle.
|
||||
#
|
||||
# For more details, see
|
||||
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||
|
||||
# If your project uses WebView with JS, uncomment the following
|
||||
# and specify the fully qualified class name to the JavaScript interface
|
||||
# class:
|
||||
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||
# public *;
|
||||
#}
|
||||
|
||||
# Uncomment this to preserve the line number information for
|
||||
# debugging stack traces.
|
||||
#-keepattributes SourceFile,LineNumberTable
|
||||
|
||||
# If you keep the line number information, uncomment this to
|
||||
# hide the original source file name.
|
||||
#-renamesourcefileattribute SourceFile
|
||||
@@ -1,24 +0,0 @@
|
||||
package android.llama.cpp
|
||||
|
||||
import androidx.test.platform.app.InstrumentationRegistry
|
||||
import androidx.test.ext.junit.runners.AndroidJUnit4
|
||||
|
||||
import org.junit.Test
|
||||
import org.junit.runner.RunWith
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Instrumented test, which will execute on an Android device.
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
@RunWith(AndroidJUnit4::class)
|
||||
class ExampleInstrumentedTest {
|
||||
@Test
|
||||
fun useAppContext() {
|
||||
// Context of the app under test.
|
||||
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
|
||||
assertEquals("android.llama.cpp.test", appContext.packageName)
|
||||
}
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
|
||||
|
||||
</manifest>
|
||||
@@ -1,49 +0,0 @@
|
||||
# For more information about using CMake with Android Studio, read the
|
||||
# documentation: https://d.android.com/studio/projects/add-native-code.html.
|
||||
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
|
||||
|
||||
# Sets the minimum CMake version required for this project.
|
||||
cmake_minimum_required(VERSION 3.22.1)
|
||||
|
||||
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
|
||||
# Since this is the top level CMakeLists.txt, the project name is also accessible
|
||||
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
|
||||
# build script scope).
|
||||
project("llama-android")
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(
|
||||
llama
|
||||
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
|
||||
GIT_TAG master
|
||||
)
|
||||
|
||||
# Also provides "common"
|
||||
FetchContent_MakeAvailable(llama)
|
||||
|
||||
# Creates and names a library, sets it as either STATIC
|
||||
# or SHARED, and provides the relative paths to its source code.
|
||||
# You can define multiple libraries, and CMake builds them for you.
|
||||
# Gradle automatically packages shared libraries with your APK.
|
||||
#
|
||||
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
|
||||
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
|
||||
# is preferred for the same purpose.
|
||||
#
|
||||
# In order to load a library into your app from Java/Kotlin, you must call
|
||||
# System.loadLibrary() and pass the name of the library defined here;
|
||||
# for GameActivity/NativeActivity derived applications, the same library name must be
|
||||
# used in the AndroidManifest.xml file.
|
||||
add_library(${CMAKE_PROJECT_NAME} SHARED
|
||||
# List C/C++ source files with relative paths to this CMakeLists.txt.
|
||||
llama-android.cpp)
|
||||
|
||||
# Specifies libraries CMake should link to your target library. You
|
||||
# can link libraries from various origins, such as libraries defined in this
|
||||
# build script, prebuilt third-party libraries, or Android system libraries.
|
||||
target_link_libraries(${CMAKE_PROJECT_NAME}
|
||||
# List libraries link to the target library
|
||||
llama
|
||||
common
|
||||
android
|
||||
log)
|
||||
@@ -1,17 +0,0 @@
|
||||
package android.llama.cpp
|
||||
|
||||
import org.junit.Test
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Example local unit test, which will execute on the development machine (host).
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
class ExampleUnitTest {
|
||||
@Test
|
||||
fun addition_isCorrect() {
|
||||
assertEquals(4, 2 + 2)
|
||||
}
|
||||
}
|
||||
@@ -15,4 +15,3 @@ dependencyResolutionManagement {
|
||||
|
||||
rootProject.name = "LlamaAndroid"
|
||||
include(":app")
|
||||
include(":llama")
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
add_executable(rpc-server rpc-server.cpp)
|
||||
target_link_libraries(rpc-server PRIVATE ggml llama)
|
||||
@@ -1,74 +0,0 @@
|
||||
## Overview
|
||||
|
||||
The `rpc-server` allows running `ggml` backend on a remote host.
|
||||
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
|
||||
This can be used for distributed LLM inference with `llama.cpp` in the following way:
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
rpcb---|TCP|srva
|
||||
rpcb---|TCP|srvb
|
||||
rpcb-.-|TCP|srvn
|
||||
subgraph hostn[Host N]
|
||||
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph hostb[Host B]
|
||||
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph hosta[Host A]
|
||||
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
|
||||
end
|
||||
subgraph host[Main Host]
|
||||
ggml[llama.cpp]---rpcb[RPC backend]
|
||||
end
|
||||
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
|
||||
```
|
||||
|
||||
Each host can run a different backend, e.g. one with CUDA and another with Metal.
|
||||
You can also run multiple `rpc-server` instances on the same host, each with a different backend.
|
||||
|
||||
## Usage
|
||||
|
||||
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
|
||||
For example, to build the CUDA backend with RPC support:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc-cuda
|
||||
cd build-rpc-cuda
|
||||
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Then, start the `rpc-server` with the backend:
|
||||
|
||||
```bash
|
||||
$ bin/rpc-server 0.0.0.0 50052
|
||||
create_backend: using CUDA backend
|
||||
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
|
||||
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
|
||||
ggml_cuda_init: found 1 CUDA devices:
|
||||
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes
|
||||
Starting RPC server on 0.0.0.0:50052
|
||||
```
|
||||
|
||||
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
|
||||
```bash
|
||||
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052
|
||||
```
|
||||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||
|
||||
|
||||
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
|
||||
|
||||
```bash
|
||||
mkdir build-rpc
|
||||
cd build-rpc
|
||||
cmake .. -DLLAMA_RPC=ON
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
|
||||
|
||||
```bash
|
||||
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
|
||||
```
|
||||
@@ -1,70 +0,0 @@
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#include "ggml-rpc.h"
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
|
||||
static ggml_backend_t create_backend() {
|
||||
ggml_backend_t backend = NULL;
|
||||
#ifdef GGML_USE_CUDA
|
||||
fprintf(stderr, "%s: using CUDA backend\n", __func__);
|
||||
backend = ggml_backend_cuda_init(0); // init device 0
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
|
||||
}
|
||||
#elif GGML_USE_METAL
|
||||
fprintf(stderr, "%s: using Metal backend\n", __func__);
|
||||
backend = ggml_backend_metal_init();
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
|
||||
}
|
||||
#endif
|
||||
|
||||
// if there aren't GPU Backends fallback to CPU backend
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: using CPU backend\n", __func__);
|
||||
backend = ggml_backend_cpu_init();
|
||||
}
|
||||
return backend;
|
||||
}
|
||||
|
||||
static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
|
||||
#ifdef GGML_USE_CUDA
|
||||
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
|
||||
#else
|
||||
// TODO: implement for other backends
|
||||
*free_mem = 1;
|
||||
*total_mem = 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int argc, char * argv[]) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
const char * host = argv[1];
|
||||
int port = std::stoi(argv[2]);
|
||||
if (port <= 0 || port > 65535) {
|
||||
fprintf(stderr, "Invalid port number: %d\n", port);
|
||||
return 1;
|
||||
}
|
||||
ggml_backend_t backend = create_backend();
|
||||
if (!backend) {
|
||||
fprintf(stderr, "Failed to create backend\n");
|
||||
return 1;
|
||||
}
|
||||
printf("Starting RPC server on %s:%d\n", host, port);
|
||||
size_t free_mem, total_mem;
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
std::string endpoint = std::string(host) + ":" + std::to_string(port);
|
||||
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
|
||||
ggml_backend_free(backend);
|
||||
return 0;
|
||||
}
|
||||
@@ -48,7 +48,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
|
||||
- `--path`: Path from which to serve static files. Default: disabled
|
||||
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
|
||||
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
|
||||
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
|
||||
- `--embedding`: Enable embedding extraction. Default: disabled
|
||||
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`
|
||||
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
|
||||
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
|
||||
1023
ggml-rpc.cpp
1023
ggml-rpc.cpp
File diff suppressed because it is too large
Load Diff
24
ggml-rpc.h
24
ggml-rpc.h
@@ -1,24 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
// backend API
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
||||
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||
|
||||
GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
310
llama.cpp
310
llama.cpp
@@ -7,10 +7,6 @@
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef GGML_USE_RPC
|
||||
# include "ggml-rpc.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
# include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
@@ -1689,6 +1685,91 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
|
||||
GGML_UNUSED(host_buffer);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
buft = ggml_backend_metal_buffer_type();
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
buft = ggml_backend_vk_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(gpu);
|
||||
if (buft == nullptr) {
|
||||
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
buft = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
return buft;
|
||||
|
||||
GGML_UNUSED(gpu);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
if (ggml_backend_cuda_get_device_count() > 1) {
|
||||
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
if (ggml_backend_sycl_get_device_count() > 1) {
|
||||
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
buft = llama_default_buffer_type_offload(fallback_gpu);
|
||||
}
|
||||
return buft;
|
||||
|
||||
GGML_UNUSED(tensor_split);
|
||||
}
|
||||
|
||||
static size_t llama_get_device_count() {
|
||||
#if defined(GGML_USE_CUDA)
|
||||
return ggml_backend_cuda_get_device_count();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
return ggml_backend_sycl_get_device_count();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
return ggml_backend_vk_get_device_count();
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
static size_t llama_get_device_memory(int device) {
|
||||
#if defined(GGML_USE_CUDA)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_vk_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#else
|
||||
return 1;
|
||||
GGML_UNUSED(device);
|
||||
#endif
|
||||
}
|
||||
|
||||
//
|
||||
// globals
|
||||
//
|
||||
@@ -2129,8 +2210,6 @@ struct llama_model {
|
||||
int main_gpu;
|
||||
int n_gpu_layers;
|
||||
|
||||
std::vector<std::string> rpc_servers;
|
||||
|
||||
// gguf metadata
|
||||
std::unordered_map<std::string, std::string> gguf_kv;
|
||||
|
||||
@@ -2274,104 +2353,6 @@ struct llama_context {
|
||||
#endif
|
||||
};
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#ifdef GGML_USE_RPC
|
||||
std::string endpoint = model.rpc_servers[gpu];
|
||||
buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
|
||||
#elif defined(GGML_USE_METAL)
|
||||
buft = ggml_backend_metal_buffer_type();
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
buft = ggml_backend_cuda_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
buft = ggml_backend_vk_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
buft = ggml_backend_sycl_buffer_type(gpu);
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
buft = ggml_backend_opencl_buffer_type();
|
||||
#elif defined(GGML_USE_KOMPUTE)
|
||||
buft = ggml_backend_kompute_buffer_type(gpu);
|
||||
if (buft == nullptr) {
|
||||
LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
buft = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
return buft;
|
||||
GGML_UNUSED(model);
|
||||
GGML_UNUSED(gpu);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
|
||||
ggml_backend_buffer_type_t buft = nullptr;
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
if (ggml_backend_cuda_get_device_count() > 1) {
|
||||
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
if (ggml_backend_sycl_get_device_count() > 1) {
|
||||
buft = ggml_backend_sycl_split_buffer_type(tensor_split);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (buft == nullptr) {
|
||||
buft = llama_default_buffer_type_offload(model, fallback_gpu);
|
||||
}
|
||||
return buft;
|
||||
|
||||
GGML_UNUSED(tensor_split);
|
||||
}
|
||||
|
||||
static size_t llama_get_device_count(const llama_model & model) {
|
||||
#if defined(GGML_USE_RPC)
|
||||
return model.rpc_servers.size();
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
return ggml_backend_cuda_get_device_count();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
return ggml_backend_sycl_get_device_count();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
return ggml_backend_vk_get_device_count();
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
GGML_UNUSED(model);
|
||||
}
|
||||
|
||||
static size_t llama_get_device_memory(const llama_model & model, int device) {
|
||||
#if defined(GGML_USE_RPC)
|
||||
size_t total;
|
||||
size_t free;
|
||||
std::string endpoint = model.rpc_servers[device];
|
||||
ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_cuda_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_sycl_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
size_t total;
|
||||
size_t free;
|
||||
ggml_backend_vk_get_device_memory(device, &free, &total);
|
||||
return free;
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
GGML_UNUSED(model);
|
||||
GGML_UNUSED(device);
|
||||
}
|
||||
|
||||
//
|
||||
// kv cache helpers
|
||||
//
|
||||
@@ -4810,13 +4791,13 @@ static bool llm_load_tensors(
|
||||
|
||||
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
||||
// calculate the split points
|
||||
int device_count = llama_get_device_count(model);
|
||||
int device_count = llama_get_device_count();
|
||||
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
||||
std::vector<float> splits(device_count);
|
||||
if (all_zero) {
|
||||
// default split, by free memory
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
splits[i] = llama_get_device_memory(model, i);
|
||||
splits[i] = llama_get_device_memory(i);
|
||||
}
|
||||
} else {
|
||||
std::copy(tensor_split, tensor_split + device_count, splits.begin());
|
||||
@@ -4836,35 +4817,35 @@ static bool llm_load_tensors(
|
||||
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
||||
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
||||
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
|
||||
model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
|
||||
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
|
||||
}
|
||||
// assign the output layer
|
||||
if (n_gpu_layers > n_layer) {
|
||||
int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
|
||||
model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
|
||||
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
|
||||
} else {
|
||||
model.buft_output = llama_default_buffer_type_cpu(true);
|
||||
}
|
||||
} else {
|
||||
ggml_backend_buffer_type_t split_buft;
|
||||
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
|
||||
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
||||
} else {
|
||||
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
|
||||
split_buft = llama_default_buffer_type_offload(model, main_gpu);
|
||||
split_buft = llama_default_buffer_type_offload(main_gpu);
|
||||
}
|
||||
// assign the repeating layers
|
||||
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
||||
model.buft_layer[i] = {
|
||||
split_buft,
|
||||
llama_default_buffer_type_offload(model, main_gpu)
|
||||
llama_default_buffer_type_offload(main_gpu)
|
||||
};
|
||||
}
|
||||
// assign the output layer
|
||||
if (n_gpu_layers > n_layer) {
|
||||
model.buft_output = {
|
||||
split_buft,
|
||||
llama_default_buffer_type_offload(model, main_gpu)
|
||||
llama_default_buffer_type_offload(main_gpu)
|
||||
};
|
||||
} else {
|
||||
model.buft_output = llama_default_buffer_type_cpu(true);
|
||||
@@ -13201,58 +13182,6 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
|
||||
return rejects;
|
||||
}
|
||||
|
||||
static bool llama_grammar_detect_left_recursion(
|
||||
const std::vector<std::vector<llama_grammar_element>> & rules,
|
||||
size_t rule_index,
|
||||
std::vector<bool> * rules_visited,
|
||||
std::vector<bool> * rules_in_progress,
|
||||
std::vector<bool> * rules_may_be_empty) {
|
||||
if ((*rules_in_progress)[rule_index]) {
|
||||
return true;
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = true;
|
||||
|
||||
const std::vector<llama_grammar_element> & rule = rules[rule_index];
|
||||
|
||||
// First check if the rule might produce the empty string. This could be done combined with the second
|
||||
// step but it's more readable as two steps.
|
||||
bool at_rule_start = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
if (at_rule_start) {
|
||||
(*rules_may_be_empty)[rule_index] = true;
|
||||
break;
|
||||
}
|
||||
at_rule_start = true;
|
||||
} else {
|
||||
at_rule_start = false;
|
||||
}
|
||||
}
|
||||
|
||||
// Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
|
||||
// be empty)
|
||||
bool recurse_into_nonterminal = true;
|
||||
for (size_t i = 0; i < rule.size(); i++) {
|
||||
if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
|
||||
if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
|
||||
return true;
|
||||
}
|
||||
if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
} else if (llama_grammar_is_end_of_sequence(&rule[i])) {
|
||||
recurse_into_nonterminal = true;
|
||||
} else {
|
||||
recurse_into_nonterminal = false;
|
||||
}
|
||||
}
|
||||
|
||||
(*rules_in_progress)[rule_index] = false;
|
||||
(*rules_visited)[rule_index] = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// grammar - external
|
||||
//
|
||||
@@ -13272,19 +13201,6 @@ struct llama_grammar * llama_grammar_init(
|
||||
vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
|
||||
}
|
||||
|
||||
// Check for left recursion
|
||||
std::vector<bool> rules_visited(n_rules);
|
||||
std::vector<bool> rules_in_progress(n_rules);
|
||||
std::vector<bool> rules_may_be_empty(n_rules);
|
||||
for (size_t i = 0; i < n_rules; i++) {
|
||||
if (rules_visited[i]) {
|
||||
continue;
|
||||
}
|
||||
if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
|
||||
throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
|
||||
}
|
||||
}
|
||||
|
||||
// loop over alternates of start rule to build initial stacks
|
||||
std::vector<std::vector<const llama_grammar_element *>> stacks;
|
||||
pos = vec_rules[start_rule_index].data();
|
||||
@@ -13307,9 +13223,6 @@ struct llama_grammar * llama_grammar_init(
|
||||
}
|
||||
} while (true);
|
||||
|
||||
// Important: vec_rules has to be moved here, not copied, because stacks contains
|
||||
// pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
|
||||
// then the pointers would be invalidated when the local vec_rules goes out of scope.
|
||||
return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
|
||||
}
|
||||
|
||||
@@ -15409,7 +15322,6 @@ struct llama_model_params llama_model_default_params() {
|
||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
||||
/*.main_gpu =*/ 0,
|
||||
/*.tensor_split =*/ nullptr,
|
||||
/*.rpc_servers =*/ nullptr,
|
||||
/*.progress_callback =*/ nullptr,
|
||||
/*.progress_callback_user_data =*/ nullptr,
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
@@ -15480,9 +15392,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
||||
}
|
||||
|
||||
size_t llama_max_devices(void) {
|
||||
#if defined(GGML_USE_RPC)
|
||||
return GGML_RPC_MAX_SERVERS;
|
||||
#elif defined(GGML_USE_METAL)
|
||||
#if defined(GGML_USE_METAL)
|
||||
return 1;
|
||||
#elif defined(GGML_USE_CUDA)
|
||||
return GGML_CUDA_MAX_DEVICES;
|
||||
@@ -15505,7 +15415,7 @@ bool llama_supports_mlock(void) {
|
||||
|
||||
bool llama_supports_gpu_offload(void) {
|
||||
#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
|
||||
defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
return true;
|
||||
#else
|
||||
@@ -15568,17 +15478,7 @@ struct llama_model * llama_load_model_from_file(
|
||||
return true;
|
||||
};
|
||||
}
|
||||
if (params.rpc_servers != nullptr) {
|
||||
// split the servers set them into model->rpc_servers
|
||||
std::string servers(params.rpc_servers);
|
||||
size_t pos = 0;
|
||||
while ((pos = servers.find(",")) != std::string::npos) {
|
||||
std::string server = servers.substr(0, pos);
|
||||
model->rpc_servers.push_back(server);
|
||||
servers.erase(0, pos + 1);
|
||||
}
|
||||
model->rpc_servers.push_back(servers);
|
||||
}
|
||||
|
||||
int status = llama_model_load(path_model, *model, params);
|
||||
GGML_ASSERT(status <= 0);
|
||||
if (status < 0) {
|
||||
@@ -15725,17 +15625,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
|
||||
if (!hparams.vocab_only) {
|
||||
// initialize backends
|
||||
#if defined(GGML_USE_RPC)
|
||||
for (auto & server : model->rpc_servers) {
|
||||
ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
}
|
||||
#elif defined(GGML_USE_METAL)
|
||||
#ifdef GGML_USE_METAL
|
||||
if (model->n_gpu_layers > 0) {
|
||||
ctx->backend_metal = ggml_backend_metal_init();
|
||||
if (ctx->backend_metal == nullptr) {
|
||||
@@ -15891,11 +15781,7 @@ struct llama_context * llama_new_context_with_model(
|
||||
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
||||
|
||||
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
|
||||
bool pipeline_parallel =
|
||||
llama_get_device_count(*model) > 1 &&
|
||||
model->n_gpu_layers > (int)model->hparams.n_layer &&
|
||||
model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
|
||||
params.offload_kqv;
|
||||
bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER;
|
||||
#ifndef GGML_USE_CUDA
|
||||
// pipeline parallelism requires support for async compute and events
|
||||
// currently this is only implemented in the CUDA backend
|
||||
|
||||
3
llama.h
3
llama.h
@@ -242,9 +242,6 @@ extern "C" {
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
|
||||
const float * tensor_split;
|
||||
|
||||
// comma separated list of RPC servers to use for offloading
|
||||
const char * rpc_servers;
|
||||
|
||||
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
|
||||
// If the provided progress_callback returns true, model loading continues.
|
||||
// If it returns false, model loading is immediately aborted.
|
||||
|
||||
@@ -28,19 +28,6 @@ static llama_grammar* build_grammar(const std::string & grammar_str) {
|
||||
return grammar;
|
||||
}
|
||||
|
||||
static bool test_build_grammar_fails(const std::string & grammar_str) {
|
||||
fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
|
||||
bool grammar_fails = false;
|
||||
try {
|
||||
build_grammar(grammar_str);
|
||||
fprintf(stderr, " ❌ Expected build failure, but succeeded\n");
|
||||
} catch (const std::exception & err) {
|
||||
grammar_fails = true;
|
||||
fprintf(stdout, " ✅︎\n");
|
||||
}
|
||||
return grammar_fails;
|
||||
}
|
||||
|
||||
static bool match_string(const std::string & input, llama_grammar* grammar) {
|
||||
auto decoded = decode_utf8(input, {});
|
||||
|
||||
@@ -333,38 +320,6 @@ number ::= [0-9]+)""";
|
||||
fprintf(stderr, " ✅︎ Passed\n");
|
||||
}
|
||||
|
||||
static void test_failure_left_recursion() {
|
||||
fprintf(stderr, "⚫ Testing left recursion detection:\n");
|
||||
|
||||
// Test simple left recursion detection
|
||||
const std::string simple_str = R"""(root ::= "a" | root "a")""";
|
||||
assert(test_build_grammar_fails(simple_str));
|
||||
|
||||
// Test more complicated left recursion detection
|
||||
const std::string medium_str = R"""(
|
||||
root ::= asdf
|
||||
asdf ::= "a" | asdf "a"
|
||||
)""";
|
||||
assert(test_build_grammar_fails(medium_str));
|
||||
|
||||
// Test even more complicated left recursion detection
|
||||
const std::string hard_str = R"""(
|
||||
root ::= asdf
|
||||
asdf ::= "a" | foo "b"
|
||||
foo ::= "c" | asdf "d" | "e")""";
|
||||
assert(test_build_grammar_fails(hard_str));
|
||||
|
||||
// Test yet even more complicated left recursion detection
|
||||
const std::string hardest_str = R"""(
|
||||
root ::= asdf
|
||||
asdf ::= "a" | foo "b"
|
||||
foo ::= "c" | empty asdf "d" | "e"
|
||||
empty ::= "blah" | )""";
|
||||
assert(test_build_grammar_fails(hardest_str));
|
||||
|
||||
fprintf(stderr, " ✅︎ Passed\n");
|
||||
}
|
||||
|
||||
int main() {
|
||||
fprintf(stdout, "Running grammar integration tests...\n");
|
||||
test_simple_grammar();
|
||||
@@ -372,7 +327,6 @@ int main() {
|
||||
test_quantifiers();
|
||||
test_failure_missing_root();
|
||||
test_failure_missing_reference();
|
||||
test_failure_left_recursion();
|
||||
fprintf(stdout, "All tests passed.\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user