Revert "llama : add llama_memory_can_rm_suffix()"

This reverts commit d30e59b62a.
2026-02-05 13:53:23 +02:00 · 2026-02-04 13:11:50 +02:00
parent d30e59b62a
commit 1f8d0c848b
14 changed files with 11 additions and 49 deletions
--- a/include/llama.h
+++ b/include/llama.h
@@ -760,9 +760,6 @@ extern "C" {
    // Check if the memory supports shifting
    LLAMA_API bool llama_memory_can_shift(llama_memory_t mem);

-    // Check if the memory supports removing the last tokens in the sequence
-    LLAMA_API bool llama_memory_can_rm_suffix(llama_memory_t mem);
-
    //
    // State / sessions
    //
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -3360,14 +3360,6 @@ bool llama_memory_can_shift(llama_memory_t mem) {
    return mem->get_can_shift();
 }

-bool llama_memory_can_rm_suffix(llama_memory_t mem) {
-    if (!mem) {
-        return false;
-    }
-
-    return mem->get_can_rm_suffix();
-}
-
 // llama state API

 // deprecated
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@@ -221,10 +221,6 @@ bool llama_kv_cache_iswa::get_can_shift() const {
    return kv_base->get_size() == kv_swa->get_size();
 }

-bool llama_kv_cache_iswa::get_can_rm_suffix() const {
-    return kv_base->get_can_rm_suffix() && kv_swa->get_can_rm_suffix();
-}
-
 void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
    if ((flags & LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY) == 0) {
        kv_base->state_write(io, seq_id, flags);
--- a/src/llama-kv-cache-iswa.h
+++ b/src/llama-kv-cache-iswa.h
@@ -43,8 +43,7 @@ public:

    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;

-    bool get_can_shift()     const override;
-    bool get_can_rm_suffix() const override;
+    bool get_can_shift() const override;

    void clear(bool data) override;

--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -977,10 +977,6 @@ bool llama_kv_cache::get_can_shift() const {
    return true;
 }

-bool llama_kv_cache::get_can_rm_suffix() const {
-    return true;
-}
-
 uint32_t llama_kv_cache::get_size() const {
    const auto & cells = v_cells[seq_to_stream[0]];

--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -123,8 +123,7 @@ public:

    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;

-    bool get_can_shift()     const override;
-    bool get_can_rm_suffix() const override;
+    bool get_can_shift() const override;

    void clear(bool data) override;

--- a/src/llama-memory-hybrid-iswa.cpp
+++ b/src/llama-memory-hybrid-iswa.cpp
@@ -126,11 +126,8 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * l
 }

 bool llama_memory_hybrid_iswa::get_can_shift() const {
-    return mem_attn->get_can_shift() && mem_recr->get_can_shift();
-}
-
-bool llama_memory_hybrid_iswa::get_can_rm_suffix() const {
-    return mem_attn->get_can_rm_suffix() && mem_recr->get_can_rm_suffix();
+    // Shifting is trivially supported for recurrent
+    return mem_attn->get_can_shift();
 }

 void llama_memory_hybrid_iswa::clear(bool data) {
--- a/src/llama-memory-hybrid-iswa.h
+++ b/src/llama-memory-hybrid-iswa.h
@@ -55,8 +55,7 @@ public:

    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;

-    bool get_can_shift()     const override;
-    bool get_can_rm_suffix() const override;
+    bool get_can_shift() const override;

    void clear(bool data) override;

--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -120,11 +120,8 @@ llama_memory_context_ptr llama_memory_hybrid::init_update(llama_context * lctx,
 }

 bool llama_memory_hybrid::get_can_shift() const {
-    return mem_attn->get_can_shift() && mem_recr->get_can_shift();
-}
-
-bool llama_memory_hybrid::get_can_rm_suffix() const {
-    return mem_attn->get_can_rm_suffix() && mem_recr->get_can_rm_suffix();
+    // Shifting is trivially supported for recurrent
+    return mem_attn->get_can_shift();
 }

 void llama_memory_hybrid::clear(bool data) {
--- a/src/llama-memory-hybrid.h
+++ b/src/llama-memory-hybrid.h
@@ -55,8 +55,7 @@ public:

    llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) override;

-    bool get_can_shift()     const override;
-    bool get_can_rm_suffix() const override;
+    bool get_can_shift() const override;

    void clear(bool data) override;

--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -665,10 +665,6 @@ bool llama_memory_recurrent::get_can_shift() const {
    return true;
 }

-bool llama_memory_recurrent::get_can_rm_suffix() const {
-    return false;
-}
-
 size_t llama_memory_recurrent::total_size() const {
    size_t size = 0;
    for (const auto & [_, buf] : ctxs_bufs) {
--- a/src/llama-memory-recurrent.h
+++ b/src/llama-memory-recurrent.h
@@ -58,9 +58,7 @@ public:
    // find a contiguous slot of memory cells and emplace the ubatch there
    bool find_slot(const llama_ubatch & ubatch);

-    bool get_can_shift()     const override;
-    bool get_can_rm_suffix() const override;
-
+    bool get_can_shift() const override;

    // state write/load

--- a/src/llama-memory.h
+++ b/src/llama-memory.h
@@ -91,8 +91,7 @@ struct llama_memory_i {
    virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0;

    // getters
-    virtual bool get_can_shift()     const = 0;
-    virtual bool get_can_rm_suffix() const = 0;
+    virtual bool get_can_shift() const = 0;

    //
    // ops
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -752,7 +752,7 @@ private:
            slot.prompt.tokens.has_mtmd = mctx != nullptr;

            // try speculative decoding
-            if (llama_memory_can_rm_suffix(llama_get_memory(ctx))) {
+            {
                slot.spec = common_speculative_init(params_base.speculative, slot.ctx);
                if (slot.spec) {
                    if (mctx) {
@@ -763,8 +763,6 @@ private:
                } else {
                    SLT_INF(slot, "%s", "speculative decoding context not initialized\n");
                }
-            } else {
-                SLT_WRN(slot, "%s", "speculative decoding not supported by this context (no memory_rm_suffix support)\n");
            }

            SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);