Compare commits

..

21 Commits

Author SHA1 Message Date
Georgi Gerganov
0492363137 mpi : fix after master merge 2023-07-09 22:23:04 +03:00
Georgi Gerganov
81c5ddd532 Merge branch 'mpi' into refactor-mpi 2023-07-09 22:20:14 +03:00
Evan Miller
03cc12be0d [mpi] continue-on-error: true 2023-07-09 15:10:43 -04:00
Evan Miller
4a9a4748e9 Add OpenMPI to GH action 2023-07-09 15:05:58 -04:00
Evan Miller
0f557c2ac4 Merge branch 'master' into mpi 2023-07-09 15:02:19 -04:00
Georgi Gerganov
9da9d26c70 mpi : minor 2023-07-09 18:38:32 +03:00
Georgi Gerganov
beadbf3380 mpi : fix inference 2023-07-09 18:26:20 +03:00
Georgi Gerganov
ef37dd14e7 mpi : fix output tensor after MPI compute (still not working) 2023-07-09 17:01:08 +03:00
Georgi Gerganov
c717c5185f mpi : various fixes - communication now works but results are wrong 2023-07-09 16:40:16 +03:00
Georgi Gerganov
01abb3b3b9 mpi : move all MPI logic into ggml-mpi
Not tested yet
2023-07-09 16:04:27 +03:00
Georgi Gerganov
e339d35579 mpi : add names for layer inputs + prep ggml_mpi_graph_compute() 2023-07-09 14:42:36 +03:00
Georgi Gerganov
3232db628c mpi : trying to move more MPI stuff into ggml-mpi (WIP) (#2099) 2023-07-09 14:08:53 +03:00
Evan Miller
ef61acfbf5 Add info to README 2023-07-07 09:02:23 -04:00
Evan Miller
55207ba2b8 Add GH workflow, fix test 2023-07-06 21:40:18 -04:00
Evan Miller
1f0a2cfeda Update CMakeLists.txt 2023-07-06 21:25:34 -04:00
Evan Miller
06a239343c PR comments 2023-07-06 20:18:41 -04:00
Evan Miller
32deabfdc8 Merge branch 'master' into mpi 2023-07-06 19:04:50 -04:00
Evan Miller
042c5b278f wrap includes 2023-07-04 00:13:20 -04:00
Evan Miller
668ba5fe0b fixes 2023-07-04 00:09:02 -04:00
Evan Miller
d05ca74dd8 fix warnings, update README 2023-07-03 23:53:43 -04:00
Evan Miller
f85785f650 MPI support, first cut 2023-07-03 21:51:05 -04:00
7 changed files with 86 additions and 106 deletions

1
.gitignore vendored
View File

@@ -20,7 +20,6 @@ build-static/
build-cublas/
build-opencl/
build-metal/
build-mpi/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/

View File

@@ -321,11 +321,6 @@ if (LLAMA_MPI)
set(c_flags ${c_flags} -Wno-cast-qual)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
# Even if you're only using the C header, C++ programs may bring in MPI
# C++ functions, so more linkage is needed
if (MPI_CXX_FOUND)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
endif()
else()
message(WARNING "MPI not found")
endif()

View File

@@ -450,7 +450,6 @@ void ggml_metal_graph_compute(
//}
switch (dst->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_TRANSPOSE:

View File

@@ -56,7 +56,7 @@ void ggml_mpi_eval_init(
MPI_Bcast(n_threads, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
if (t == NULL) {
fprintf(stderr, "%s: tensor %s not found\n", __func__, name);
@@ -73,39 +73,13 @@ static int ggml_graph_get_node_idx(struct ggml_cgraph * gf, const char * name) {
return -1;
}
static void ggml_mpi_tensor_send(struct ggml_tensor * t, int mpi_rank_dst) {
MPI_Datatype mpi_type;
switch (t->type) {
case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
default: GGML_ASSERT(false && "not implemented");
}
const int retval = MPI_Send(t->data, ggml_nelements(t), mpi_type, mpi_rank_dst, 0, MPI_COMM_WORLD);
GGML_ASSERT(retval == MPI_SUCCESS);
}
static void ggml_mpi_tensor_recv(struct ggml_tensor * t, int mpi_rank_src) {
MPI_Datatype mpi_type;
switch (t->type) {
case GGML_TYPE_I32: mpi_type = MPI_INT32_T; break;
case GGML_TYPE_F32: mpi_type = MPI_FLOAT; break;
default: GGML_ASSERT(false && "not implemented");
}
MPI_Status status; UNUSED(status);
const int retval = MPI_Recv(t->data, ggml_nelements(t), mpi_type, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
GGML_ASSERT(retval == MPI_SUCCESS);
}
// TODO: there are many improvements that can be done to this implementation
void ggml_mpi_graph_compute_pre(
void ggml_mpi_graph_compute(
struct ggml_mpi_context * ctx_mpi,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
int n_layers) {
int n_layers,
int n_threads) {
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
@@ -135,19 +109,41 @@ void ggml_mpi_graph_compute_pre(
// node 0: [(n-1) * n_per_node, n_nodes)
//
if (mpi_rank > 0) {
if (mpi_rank == 1) {
// the first node (1) receives the input tokens from the main node (0)
ggml_mpi_tensor_recv(inp_tokens, 0);
} else {
// recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
ggml_mpi_tensor_recv(inp0, mpi_rank - 1);
if (mpi_rank == 1) { // the first node receives the input tokens from the main node
MPI_Status status; UNUSED(status);
const int mpi_rank_src = mpi_rank - 1;
const int retval = MPI_Recv(inp_tokens->data, ggml_nelements(inp_tokens), MPI_INT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
GGML_ASSERT(retval == MPI_SUCCESS);
} else { // recv input data for each node into the "inp0" tensor (i.e. the first node in the compute graph)
MPI_Status status; UNUSED(status);
const int mpi_rank_src = mpi_rank - 1;
//printf("%s: node %d: waiting for %d elements from %d\n", __func__, mpi_rank, (int) ggml_nelements(inp0), mpi_rank_src);
const int retval = MPI_Recv(inp0->data, ggml_nelements(inp0), MPI_FLOAT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
GGML_ASSERT(retval == MPI_SUCCESS);
}
} else if (mpi_size > 1) {
// node 0 sends the input tokens to node 1
ggml_mpi_tensor_send(inp_tokens, 1);
{
const int mpi_rank_dst = mpi_rank + 1;
const int retval = MPI_Send(inp_tokens->data, ggml_nelements(inp_tokens), MPI_INT, mpi_rank_dst, 0, MPI_COMM_WORLD);
GGML_ASSERT(retval == MPI_SUCCESS);
}
// recv the output data from the last node
ggml_mpi_tensor_recv(inp0, mpi_size - 1);
{
MPI_Status status; UNUSED(status);
const int mpi_rank_src = mpi_size - 1;
//fprintf(stderr, "%s: node %d: waiting for %d elements from %d\n", __func__, mpi_rank, (int) ggml_nelements(inp0), mpi_rank_src);
const int retval = MPI_Recv(inp0->data, ggml_nelements(inp0), MPI_FLOAT, mpi_rank_src, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
GGML_ASSERT(retval == MPI_SUCCESS);
}
}
{
@@ -198,19 +194,20 @@ void ggml_mpi_graph_compute_pre(
//fprintf(stderr, "%s: node %d: processing %d nodes [%d, %d)\n", __func__, mpi_rank, gf->n_nodes, il0, il1);
}
}
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers) {
UNUSED(n_layers);
ggml_graph_compute_with_ctx(ctx, gf, n_threads);
const int mpi_rank = ctx_mpi->rank;
const int mpi_size = ctx_mpi->size;
//fprintf(stderr, "%s: node %d: done\n", __func__, mpi_rank);
// send the output data to the next node
if (mpi_rank > 0) {
ggml_mpi_tensor_send(gf->nodes[gf->n_nodes - 1], (mpi_rank + 1) % mpi_size);
struct ggml_tensor * output = gf->nodes[gf->n_nodes - 1];
const int mpi_rank_dst = (mpi_rank + 1) % mpi_size;
//fprintf(stderr, "%s: node %d: sending %d elements to node %d\n", __func__, mpi_rank, ggml_nelements(output), mpi_rank_dst);
const int retval = MPI_Send(output->data, ggml_nelements(output), MPI_FLOAT, mpi_rank_dst, 0, MPI_COMM_WORLD);
GGML_ASSERT(retval == MPI_SUCCESS);
}
}

View File

@@ -24,15 +24,12 @@ void ggml_mpi_eval_init(
int * n_past,
int * n_threads);
void ggml_mpi_graph_compute_pre(
void ggml_mpi_graph_compute(
struct ggml_mpi_context * ctx_mpi,
struct ggml_context * ctx,
struct ggml_cgraph * gf,
int n_layers);
void ggml_mpi_graph_compute_post(
struct ggml_mpi_context * ctx_mpi,
struct ggml_cgraph * gf,
int n_layers);
int n_layers,
int n_threads);
#ifdef __cplusplus
}

View File

@@ -1632,10 +1632,6 @@ static bool llama_eval_internal(
// run the computation
ggml_build_forward_expand(&gf, cur);
#if GGML_USE_MPI
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
#endif
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
@@ -1660,19 +1656,14 @@ static bool llama_eval_internal(
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
}
#elif GGML_USE_MPI
ggml_mpi_graph_compute(lctx.ctx_mpi, ctx0, &gf, n_layer, n_threads);
cur = gf.nodes[gf.n_nodes - 1];
#else
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
#endif
#if GGML_USE_MPI
ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
#endif
// update kv token count
lctx.kv_self.n = n_past + N;
struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
if (cgraph_fname) {
ggml_graph_export(&gf, cgraph_fname);
}
@@ -1688,26 +1679,38 @@ static bool llama_eval_internal(
// ggml_graph_dump_dot(&gf, NULL, "llama.dot");
//}
// extract logits
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
// update kv token count
lctx.kv_self.n = n_past + N;
#ifdef GGML_USE_MPI
if (ggml_mpi_rank(lctx.ctx_mpi) == 0) {
#else
{
auto & logits_out = lctx.logits;
#endif
// extract logits
{
auto & logits_out = lctx.logits;
if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
}
}
}
// extract embeddings
if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding;
// extract embeddings
if (!lctx.embedding.empty()) {
auto & embedding_out = lctx.embedding;
embedding_out.resize(n_embd);
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
embedding_out.resize(n_embd);
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
}
}
if (mem_per_token == 0) {
@@ -2454,14 +2457,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else {
new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
bool convert_incompatible_tensor = false;
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
int nx = tensor.ne.at(0);
int ny = tensor.ne.at(1);
if (nx % QK_K != 0 || ny % QK_K != 0) {
fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
convert_incompatible_tensor = true;
fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
fprintf(stderr, "This is required to be able to use k-quants for now!\n");
fprintf(stderr, "========================================================================================\n\n");
throw std::runtime_error("Unsupported tensor size encountered\n");
}
}
if (tensor.name == "output.weight") {
@@ -2489,17 +2493,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
}
if (convert_incompatible_tensor) {
if (tensor.name == "output.weight") {
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
fprintf(stderr, "F16 will be used for this tensor instead.\n");
} else if (tensor.name == "tok_embeddings.weight") {
new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
} else {
throw std::runtime_error("Unsupported tensor size encountered\n");
}
}
#endif
float * f32_data;

View File

@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
llama_init_backend(false);
// load the vocab
{
@@ -99,7 +99,7 @@ int main(int argc, char **argv) {
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
llama_finalize_backend();
return 0;
}