Compare commits

...

8 Commits
b8909 ... b8917

Author SHA1 Message Date
Georgi Gerganov
017f090442 jinja : remove unused header (#22310) 2026-04-24 11:01:46 +03:00
Georgi Gerganov
ffdd983fb8 server : fix swa-full logic (#22288) 2026-04-24 10:17:37 +03:00
Yes You Can Have Your Own
793d0a7931 server: rename debug tags to match --cache-idle-slots naming (#22292) 2026-04-24 09:28:44 +03:00
Mengsheng Wu
8bc492ebb4 hexagon: add SOLVE_TRI op (#21974)
* hexagon: add SOLVE_TRI op

* ggml: fix TODO description for solve_tri

* hexagon: rm unused variable/function warnings

* hexagon: chunk vs batch processingfor better thread utilization

* hexagon: vectorize partial f32 loads

* hexagon: move HVX f32 add/sub/mul wrappers to hvx-base.h

---------

Co-authored-by: Todor Boinovski <todorb@qti.qualcomm.com>
2026-04-23 18:39:13 -07:00
Chen Yuan
e5f070a1dc fix(shader): handle the buffer aliasing for rms fuse (#22266) 2026-04-23 16:32:59 -07:00
Ethan Turner
fa0b8a70a8 cli: Remove redundant local sampling variables (#20429) (#22264)
This change implements the third requested change in issue 20429.
Because defaults.sampling contains the reasoning budget token count and
the reasoning budget message, it's not necessary to assign them to
struct variables.
2026-04-24 00:53:23 +02:00
Max Krasnyansky
5d2b52d80d hexagon: add support for basic and extended Op profiling (#22269)
* hexagon: restore HTP_OPMASK_QUEUE

* hexagon: honor OPMASK_SKIP_COMPUTE in hmx-matmul

* hex-prof: restore op profiling

* hex-prof: enable PMU

* hexagon: simplify and improve op-queuing with full profiling support

Add separate profile descriptors.

* hexagon: remove opsync and rename opmask into opstage

opsync is no longer needed since the profiler is fully async now.
opmask name was confusing and opstage is more accurate.

* hexagon: refactor opbatch queue handling

* hexagon: add iface hooks for enabling profiler from the host

Also move all the PMU setup stuff out of the hex-utils since it's not inteded for normal use.

* hexagon: make profiler mode configurable

On older devices getting PMU counters is expensive so it's now optional.

* hexagon: add support for setting profiler pmu events from env

* hexagon: simplify profiler output (no need to print buffs, etc)

* hexagon: simplify pmu counter formating

* hexagon: add a simple profile post-proc tool

* hex-prof: add support for reading logs from stdin

* hexagon: document GGML_HEXAGON_PROFILE

* hex-prof: update default width for dims field

* hex-prof: fix linter warnings and errors

* Update ggml/src/ggml-hexagon/htp/htp-ops.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update scripts/snapdragon/ggml-hexagon-profile.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Trivikram Reddy <tamarnat@qti.qualcomm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-04-23 14:17:21 -07:00
Shreya Jain
187a456370 Enable testing on Snapdragon devices (#21051)
* Add the tests that we want to run on external CI

* remove extra files

* Fixes python issues, reove the deadlock on CI

* remove unecessary changes

* use override to ty.toml

* fix pre-commit and try tests with secret in external repo not upstream

* skip if key is unavailable

* Fix feedback

* switch hexagon to snapdragon

* cleanup

* fix secrets

* remove the copyrights at the top of the files
2026-04-23 13:08:10 -07:00
40 changed files with 1818 additions and 373 deletions

View File

@@ -0,0 +1,113 @@
name: CI (snapdragon)
on:
workflow_dispatch:
push:
branches:
- master
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
pull_request:
types: [opened, synchronize, reopened]
paths:
- '.github/workflows/build-and-test-snapdragon.yml'
- 'ggml/include/ggml-hexagon.h'
- 'ggml/src/ggml-hexagon/**'
- 'docs/backend/snapdragon/**'
- 'scripts/snapdragon/**'
- 'CMakePresets.json'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
android-ndk-snapdragon:
runs-on: ubuntu-latest
container:
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
defaults:
run:
shell: bash
steps:
- name: Clone
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build Llama.CPP for Snapdragon Android
id: build_llama_cpp_snapdragon_android
run: |
cp docs/backend/snapdragon/CMakeUserPresets.json .
cmake --preset arm64-android-snapdragon-release -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Upload Llama.CPP Snapdragon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-adb/llama.cpp
check-secret:
runs-on: ubuntu-latest
outputs:
has-key: ${{ steps.check.outputs.has-key }}
steps:
- id: check
run: echo "has-key=${{ secrets.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
test-snapdragon-qdc:
name: Test on QDC Android Device (${{ matrix.device }})
needs: [android-ndk-snapdragon, check-secret]
if: needs.check-secret.outputs.has-key == 'true'
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
device: [SM8750, SM8650, SM8850]
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Download build artifact
uses: actions/download-artifact@v4
with:
name: llama-cpp-android-arm64-snapdragon
path: pkg-snapdragon/
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.x'
cache: pip
- name: Install QDC SDK wheel
run: |
curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
unzip qdc_sdk.zip -d qdc_sdk
pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
- name: Run QDC tests (${{ matrix.device }})
run: |
python scripts/snapdragon/qdc/run_qdc_jobs.py \
--test all \
--pkg-dir pkg-snapdragon/llama.cpp \
--model-url "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
--device ${{ matrix.device }}
env:
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
- name: Cleanup
if: always()
run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip

View File

@@ -1,26 +1,24 @@
name: CI (android)
on:
workflow_dispatch: # allows manual triggering
workflow_dispatch:
push:
branches:
- master
paths: [
'.github/workflows/build-android.yml',
'**/CMakeLists.txt',
'**/.cmake',
'**/*.h',
'**/*.hpp',
'**/*.c',
'**/*.cpp'
]
paths:
- '.github/workflows/build-android.yml'
- '**/CMakeLists.txt'
- '**/.cmake'
- '**/*.h'
- '**/*.hpp'
- '**/*.c'
- '**/*.cpp'
pull_request:
types: [opened, synchronize, reopened]
paths: [
'.github/workflows/build-android.yml',
'examples/llama.android/**'
]
paths:
- '.github/workflows/build-android.yml'
- 'examples/llama.android/**'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@@ -67,35 +65,24 @@ jobs:
defaults:
run:
shell: bash
strategy:
matrix:
include:
- build: 'arm64-cpu'
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
- build: 'arm64-snapdragon'
defines: '--preset arm64-android-snapdragon-release'
steps:
- name: Clone
id: checkout
uses: actions/checkout@v6
with:
fetch-depth: 0
lfs: false
- name: Build Llama.CPP for Hexagon Android
id: build_llama_cpp_hexagon_android
- name: Build
id: ndk_build
run: |
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
cp docs/backend/snapdragon/CMakeUserPresets.json .
fi
cmake ${{ matrix.defines }} -B build
cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
cmake --build build
cmake --install build --prefix pkg-adb/llama.cpp
- name: Upload Llama.CPP Hexagon Android Build Artifact
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
- name: Upload Android Build Artifact
if: ${{ always() && steps.ndk_build.outcome == 'success' }}
uses: actions/upload-artifact@v6
with:
name: llama-cpp-android-${{ matrix.build }}
name: llama-cpp-android-arm64-cpu
path: pkg-adb/llama.cpp

View File

@@ -1,4 +1,3 @@
#include "log.h"
#include "value.h"
#include "runtime.h"
#include "caps.h"

View File

@@ -249,18 +249,27 @@ build: 6a8cf8914 (6733)
```
- `GGML_HEXAGON_PROFILE=1`
Generates a host-side profile for the ggml-hexagon Ops.
Enables Op profiling:
- `GGML_HEXAGON_OPMASK=0x0`
Allows enabling specific stages of the processing pipeline:
- `1` Basic profile with per-op `usecs` and `cycles` counters
- `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
- `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report.
Examples:
`GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
- `GGML_HEXAGON_OPSTAGE=0x0`
Allows enabling specific stages of the Op processing pipeline:
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
- `0x2` Enable Op Compute (MUL_MAT, etc.)
Examples:
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
`GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
`GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
- `GGML_HEXAGON_OPFILTER=regex`
Allows filtering (disabling) Ops that match the regex pattern:

View File

@@ -12,9 +12,12 @@
#include <cstddef>
#include <stdexcept>
#include <string>
#include <sstream>
#include <iomanip>
#include <unordered_set>
#include <unordered_map>
#include <regex>
#include <queue>
#ifdef _WIN32
# include <sal.h>
@@ -41,18 +44,26 @@
#include "htp_iface.h"
#include "htp-drv.h"
using intvec = std::vector<int>;
using uintvec = std::vector<unsigned int>;
using u32vec = std::vector<uint32_t>;
static size_t opt_ndev = 1;
static size_t opt_nhvx = 0; // use all
static int opt_arch = 0; // autodetect
static int opt_etm = 0;
static int opt_verbose = 0;
static int opt_profile = 0;
static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
static int opt_hostbuf = 1; // hostbuf ON by default
static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
// Default PMU events, if profiling with PMU (mode=2) is enabled
// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
// Enable all stages by default
static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE;
static int opt_opsync = 0; // synchronous ops
static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
static int opt_opbatch = 1024; // max number of ops in a batch
static int opt_opqueue = 16; // max number of pending batches
static std::regex* opt_opfilter = NULL; // regex of ops to not claim
@@ -104,19 +115,26 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
}
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
if (!opt_profile) return;
op_desc desc(op);
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
char pmu_str[256] = "";
if (opt_profile > 1) {
static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
}
// ** backend sessions
struct ggml_hexagon_opbatch;
struct ggml_hexagon_opshm;
struct ggml_hexagon_opqueue;
struct ggml_hexagon_session {
std::string name;
@@ -132,8 +150,8 @@ struct ggml_hexagon_session {
bool valid_iface;
std::atomic<int> op_pending;
ggml_hexagon_opbatch *op_batch;
ggml_hexagon_opshm *op_shm;
ggml_hexagon_opbatch* op_batch;
ggml_hexagon_opqueue* op_queue;
ggml_backend_buffer_type buffer_type = {};
ggml_backend_buffer_type repack_buffer_type = {};
@@ -1521,65 +1539,14 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
// Backend session implementation
struct ggml_hexagon_opshm {
ggml_hexagon_shared_buffer *sbuf;
std::vector<bool> block_mask;
size_t block_size;
uint8_t * base() const { return this->sbuf->base; }
int fd() const { return this->sbuf->fd; }
size_t n_blocks() const { return this->block_mask.size(); }
ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = max_batch;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
block_mask.resize(max_pending, true);
block_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops;
sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */);
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n",
sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending);
}
}
~ggml_hexagon_opshm() {
delete sbuf;
}
uint8_t * allocate() {
auto it = std::find(block_mask.begin(), block_mask.end(), true);
if (it == block_mask.end())
return nullptr;
unsigned int i = std::distance(block_mask.begin(), it);
uint8_t* addr = sbuf->base + (i * block_size);
block_mask[i] = false;
HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
return addr;
}
void release(uint8_t * addr) {
int i = (addr - sbuf->base) / block_size;
block_mask[i] = true;
HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
}
};
struct ggml_hexagon_opbatch {
const char* name;
ggml_hexagon_session* sess;
std::vector<htp_buf_desc> buffers;
std::vector<htp_tensor> tensors;
std::vector<htp_op_desc> ops;
std::vector<const ggml_tensor*> ops; // pointers to original ops
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
std::vector<htp_tensor> h_tens; // htp tensor descriptors
std::vector<htp_op_desc> h_ops; // htp op descriptors
std::unordered_map<int, int> b_map; // buffer fd to index
std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
@@ -1606,19 +1573,21 @@ struct ggml_hexagon_opbatch {
d_map.clear();
}
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) {
name = sess->c_name();
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) {
this->sess = sess;
n_bufs_max = HTP_OP_MAX_BUFS;
n_ops_max = max_batch;
n_ops_max = batch_size;
n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
b_vmem_max = HTP_OP_MAX_VMEM;
buffers.resize(n_bufs_max);
tensors.resize(n_tens_max);
ops.resize(n_ops_max);
h_bufs.resize(n_bufs_max);
h_tens.resize(n_tens_max);
h_ops.resize(n_ops_max);
b_map.reserve(n_bufs_max);
t_map.reserve(n_tens_max);
d_map.reserve(n_tens_max);
@@ -1640,7 +1609,7 @@ struct ggml_hexagon_opbatch {
b_map.insert({sbuf->fd, bi});
htp_buf_desc &b = buffers[bi];
htp_buf_desc &b = h_bufs[bi];
b.base = (uint64_t) sbuf->base;
b.fd = sbuf->fd;
b.size = sbuf->size;
@@ -1664,7 +1633,7 @@ struct ggml_hexagon_opbatch {
// First lookup by tensor data
auto range = d_map.equal_range(t->data);
for (auto it = range.first; it != range.second; ++it) {
htp_tensor * h = &tensors[it->second];
htp_tensor * h = &h_tens[it->second];
if (same_shape(h, t)) { return it->second; }
}
@@ -1682,7 +1651,7 @@ struct ggml_hexagon_opbatch {
uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
size_t t_size = ggml_nbytes(t);
htp_tensor &h = tensors[ti];
htp_tensor &h = h_tens[ti];
h.bi = add_buffer(sbuf);
h.data = t_offset;
h.size = t_size;
@@ -1737,65 +1706,170 @@ struct ggml_hexagon_opbatch {
// assumes that fit_op() was called first and returned true
void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
// Add new op
htp_op_desc &o = ops[n_ops++];
unsigned int n = n_ops++;
GGML_ASSERT(n_ops <= n_ops_max);
ops[n] = t;
htp_op_desc &o = h_ops[n];
memcpy(&o.params, &t->op_params, sizeof(t->op_params));
o.opcode = opcode;
o.flags = 0;
if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
}
ggml_hexagon_dump_op_exec(name, t, o.flags);
ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
}
o.dst = add_tensor(t);
}
};
size_t flush(uint8_t * mem_addr, size_t mem_size) {
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
struct ggml_hexagon_opqueue {
// Shared buffer for storing batches
ggml_hexagon_shared_buffer *shm_buf;
size_t shm_blk_size;
const size_t b_size = sizeof(htp_buf_desc) * n_bufs;
const size_t t_size = sizeof(htp_tensor) * n_tens;
const size_t o_size = sizeof(htp_op_desc) * n_ops;
using opvec = std::vector<const ggml_tensor*>;
const size_t m_size = b_size + t_size + o_size;
GGML_ASSERT(m_size <= mem_size);
std::queue<unsigned int> done; // completed batch ids
std::vector<opvec> op_cache; // per batch op cache
std::vector<uint64_t> start_usec; // per batch start time
uint8_t * b_ptr = (uint8_t *) mem_addr;
uint8_t * t_ptr = (uint8_t *) b_ptr + b_size;
uint8_t * o_ptr = (uint8_t *) t_ptr + t_size;
ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
size_t n_bufs = HTP_OP_MAX_BUFS;
size_t n_ops = batch_size;
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
memcpy(b_ptr, (void *) buffers.data(), b_size);
memcpy(t_ptr, (void *) tensors.data(), t_size);
memcpy(o_ptr, (void *) ops.data(), o_size);
shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
sizeof(htp_tensor) * n_tensors +
sizeof(htp_op_desc) * n_ops +
sizeof(htp_prof_desc) * n_ops;
HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n",
name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size);
shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
op_cache.resize(depth);
start_usec.resize(depth, 0);
// init done queue
for (unsigned int i = 0; i < depth; i++) { done.push(i); }
if (opt_verbose) {
GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
}
}
~ggml_hexagon_opqueue() {
delete shm_buf;
}
// push new batch
bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
if (done.empty()) { return false; }
req.id = done.front(); done.pop(); // batch id
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
op_cache[req.id] = op_batch->ops;
start_usec[req.id] = ggml_time_us();
const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
dbuf.fd = shm_buf->fd;
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
dbuf.size = b_size + t_size + o_size + p_size;
GGML_ASSERT(dbuf.size <= shm_blk_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * b_ptr = m_ptr; m_ptr += b_size;
uint8_t * t_ptr = m_ptr; m_ptr += t_size;
uint8_t * o_ptr = m_ptr;
memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
b_size, t_size, o_size, (size_t) dbuf.size);
op_batch->reset();
if (opt_verbose > 1) {
htp_buf_desc *b = (htp_buf_desc*) b_ptr;
for (unsigned int i=0; i < n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i,
for (unsigned int i=0; i < req.n_bufs; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
b[i].fd, (void *) b[i].base, (size_t) b[i].size);
}
htp_tensor *t = (htp_tensor*) t_ptr;
for (unsigned int i=0; i < n_tens; i++) {
for (unsigned int i=0; i < req.n_tensors; i++) {
GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
name, i, t[i].bi, t[i].data, t[i].size,
shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
(size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
}
}
reset();
return true;
}
return m_size;
void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
GGML_ASSERT(rsp.id < op_cache.size());
done.push(rsp.id);
const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
const size_t m_size = b_size + t_size + o_size + p_size;
GGML_ASSERT(m_size <= shm_blk_size);
HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
(size_t) dbuf.size, b_size, t_size, o_size);
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
if (opt_profile && rsp.n_ops > 0) {
auto & ops = op_cache[rsp.id];
uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
uint32_t htp_usec = 0;
GGML_ASSERT(rsp.n_ops <= ops.size());
const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
for (uint32_t i = 0; i < rsp.n_ops; i++) {
htp_usec += pd[i].usecs;
ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
}
GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
}
}
};
@@ -1824,17 +1898,12 @@ void ggml_hexagon_session::flush_pending(bool all) {
GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
}
op_shm->release((uint8_t*) dbuf.ptr);
if (rsp.status != HTP_STATUS_OK) {
GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
// TODO: handle errors
}
// FIXME: profile will be per opreq
// this->prof_usecs = rsp.prof_usecs;
// this->prof_cycles = rsp.prof_cycles;
// this->prof_pkts = rsp.prof_pkts;
op_queue->pop(rsp, dbuf);
this->op_pending--; // atomic dec
@@ -1845,28 +1914,17 @@ void ggml_hexagon_session::flush_pending(bool all) {
void ggml_hexagon_session::flush_batch() {
if (op_batch->empty()) { return; }
htp_opbatch_req req;
req.n_bufs = op_batch->n_bufs;
req.n_tensors = op_batch->n_tens;
req.n_ops = op_batch->n_ops;
htp_opbatch_req req {};
dspqueue_buffer dbuf{};
dspqueue_buffer dbuf;
dbuf.fd = op_shm->fd();
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
dbuf.ptr = op_shm->allocate();
if (!dbuf.ptr) {
if (!op_queue->push(req, dbuf, op_batch)) {
flush_pending(false);
dbuf.ptr = op_shm->allocate();
op_queue->push(req, dbuf, op_batch);
}
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base();
dbuf.size = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size);
// Bump pending flag (cleared in the session::flush once we get the response)
this->op_pending++; // atomic inc
HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
if (err != 0) {
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
@@ -2016,25 +2074,33 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
}
if (opt_etm) {
err = htp_iface_enable_etm(this->handle);
err = htp_iface_etm(this->handle, 1);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
// Start the DSP-side service. We need to pass the queue ID to the
// DSP in a FastRPC call; the DSP side will import the queue and start
// listening for packets in a callback.
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
}
}
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
// Start processing op batch requests
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
}
this->valid_iface = true;
// Allocate buffers and state for op batching
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
this->op_shm = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue);
}
void ggml_hexagon_session::release() noexcept(true) {
@@ -2043,7 +2109,7 @@ void ggml_hexagon_session::release() noexcept(true) {
int err;
delete this->op_batch;
delete this->op_shm;
delete this->op_queue;
// Stop the DSP-side service and close the queue
if (this->valid_iface) {
@@ -2054,12 +2120,20 @@ void ggml_hexagon_session::release() noexcept(true) {
}
if (opt_etm) {
err = htp_iface_disable_etm(this->handle);
err = htp_iface_etm(this->handle, 0);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
}
}
if (opt_profile) {
htp_iface_pmu_conf pmu_conf{};
err = htp_iface_profiler(this->handle, 0, &pmu_conf);
if (err != 0) {
GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
}
}
if (this->valid_queue) {
err = dspqueue_close(queue);
if (err != 0) {
@@ -2077,7 +2151,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
repack_buffer_type.device = dev;
op_batch = nullptr;
op_shm = nullptr;
op_queue = nullptr;
try {
allocate(dev_id);
@@ -2619,6 +2693,39 @@ static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess
return true;
}
static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
const struct ggml_tensor * src0 = op->src[0]; // A
const struct ggml_tensor * src1 = op->src[1]; // B
const struct ggml_tensor * dst = op; // X
if (!src0 || !src1) {
return false;
}
if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
return false;
}
if (src0->ne[0] != src0->ne[1]) {
return false;
}
if (src0->ne[1] != src1->ne[1]) {
return false;
}
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
return false;
}
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
return false;
}
GGML_UNUSED(sess);
return true;
}
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
return sess->c_name();
@@ -2657,7 +2764,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
case GGML_OP_FILL: return HTP_OP_FILL;
case GGML_OP_DIAG: return HTP_OP_DIAG;
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(t)) {
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
@@ -2698,7 +2805,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
for (int i = 0; i < graph->n_nodes; ++i) {
ggml_tensor * n = graph->nodes[i];
if (op_is_compute(n)) {
if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
sess->enqueue_op(op_remap_to_htp(n), n);
}
}
@@ -3203,6 +3310,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
supp = ggml_hexagon_supported_diag(sess, op);
break;
case GGML_OP_SOLVE_TRI:
supp = ggml_hexagon_supported_solve_tri(sess, op);
break;
default:
break;
}
@@ -3338,6 +3449,26 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
return NULL;
}
template<typename T> std::vector<T> str_to_vec(const char* str) {
std::stringstream ss(str);
std::vector<T> v;
std::string t;
while (std::getline(ss, t, ',')) {
v.push_back(std::stoul(t, nullptr, 0));
}
return v;
}
template<typename T, int BASE=10> std::string vec_to_str(std::vector<T> v) {
std::stringstream ss;
ss << std::setbase(BASE) << std::showbase;
for (auto i : v) { ss << i << ','; }
auto str = ss.str(); str.pop_back(); // drop last comma
return str;
}
static void ggml_hexagon_init(ggml_backend_reg * reg) {
// Basic sanity checks to make sure definitions match
static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
@@ -3351,8 +3482,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC");
const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER");
@@ -3365,19 +3495,30 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
auto RE_ICASE = std::regex_constants::icase;
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
opt_opsync = str_opsync ? atoi(str_opsync) : opt_opsync;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_profile = str_profile ? atoi(str_profile) : 0;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
opt_etm = str_etm ? atoi(str_etm) : 0;
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
if (str_profile) {
opt_pmu_evt = [&]() -> std::vector<uint32_t> {
auto v = str_to_vec<uint32_t>(str_profile);
switch (v.size()) {
case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
case 8: opt_profile = 2; return v; // mode with custom pmu events
default: opt_profile = 0; return {}; // garbage input
}}();
if (opt_profile == 1) opt_pmu_evt = {};
GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
}
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
opt_ndev = GGML_HEXAGON_MAX_SESSIONS;

View File

@@ -36,6 +36,7 @@ add_library(${HTP_LIB} SHARED
cumsum-ops.c
fill-ops.c
diag-ops.c
solve-tri-ops.c
)
target_compile_definitions(${HTP_LIB} PRIVATE

View File

@@ -4,6 +4,7 @@
#include <stdbool.h>
#include <stdint.h>
#include <qurt_memory.h>
#include <qurt.h>
#include "hexagon_types.h"
#include "hexagon_protos.h"
@@ -100,4 +101,31 @@ static inline void hex_pause() {
asm volatile(" pause(#255)\n");
}
#ifndef HEX_NUM_PMU_COUNTERS
#define HEX_NUM_PMU_COUNTERS 8
#endif
// Snapshot all HEX_NUM_PMU_COUNTERS (8) PMU counters into `counters`.
// On HVX arch >= 79 the user-mode per-thread PMU registers (upmucnt0..7)
// are read directly via inline asm; older targets fall back to the QuRT
// qurt_pmu_get() accessors.
// NOTE(review): assumes `counters` has room for 8 entries — confirm at
// all call sites.
static inline void hex_get_pmu(uint32_t counters[]) {
#if __HVX_ARCH__ >= 79
    asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
    asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
    asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
    asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
    asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
    asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
    asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
    asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
#else
    counters[0] = qurt_pmu_get(QURT_PMUCNT0);
    counters[1] = qurt_pmu_get(QURT_PMUCNT1);
    counters[2] = qurt_pmu_get(QURT_PMUCNT2);
    counters[3] = qurt_pmu_get(QURT_PMUCNT3);
    counters[4] = qurt_pmu_get(QURT_PMUCNT4);
    counters[5] = qurt_pmu_get(QURT_PMUCNT5);
    counters[6] = qurt_pmu_get(QURT_PMUCNT6);
    counters[7] = qurt_pmu_get(QURT_PMUCNT7);
    // qurt_pmu_get_pmucnt(counters);
#endif
}
#endif /* HEX_UTILS_H */

View File

@@ -10,6 +10,7 @@
#include <dspqueue.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>
#define HTP_MAX_NTHREADS 10
#define HTP_MAX_MMAPS 16
@@ -66,7 +67,9 @@ struct htp_context {
int thread_id;
int thread_prio;
int hmx_enabled;
bool hmx_enabled;
bool etm;
uint32_t profiler;
uint8_t * vtcm_base;
size_t vtcm_size;
@@ -100,5 +103,6 @@ int op_ssm_conv(struct htp_ops_context * octx);
int op_cumsum(struct htp_ops_context * octx);
int op_fill(struct htp_ops_context * octx);
int op_diag(struct htp_ops_context * octx);
int op_solve_tri(struct htp_ops_context * octx);
#endif /* HTP_CTX_H */

View File

@@ -42,9 +42,9 @@ enum htp_data_type {
// Mask to enable various stages of the Ops.
// Used for debugging and profiling.
enum htp_op_mask {
HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP)
HTP_OPMASK_COMPUTE = (1 << 1), // Enable Compute
enum htp_op_stage {
HTP_OPSTAGE_QUEUE = (1 << 0), // Enable Queueing (ie calls into NPU)
HTP_OPSTAGE_COMPUTE = (1 << 1), // Enable Compute
};
// Do not reorder first 4 (used as an index)
@@ -82,7 +82,7 @@ enum htp_op_code {
HTP_OP_CUMSUM,
HTP_OP_FILL,
HTP_OP_DIAG,
HTP_OP_SOLVE_TRI,
HTP_OP_INVALID
};
@@ -137,27 +137,45 @@ struct htp_op_desc {
int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices
uint16_t dst; // Output tensor index
};
// the rest is filled in-place by the NPU
uint32_t prof_usecs; // Number of usec per request
uint32_t prof_cycles; // Number of cycles per request
uint32_t prof_pkts; // Number of instruction packets per request
uint32_t unused;
enum htp_profiler_mode {
HTP_PROF_DISABLED = 0,
HTP_PROF_BASIC = 1,
HTP_PROF_PMU = 2,
};
#define HTP_PROF_PMU_NCNT 8
// Profile descriptor
struct htp_prof_desc {
uint32_t opcode; // GGML/HTP Op
uint32_t usecs; // Number of usec
uint32_t cycles; // Number of cycles
uint32_t pad; // Unused
uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters
};
struct htp_opbatch_req {
uint32_t id; // Batch id
uint32_t n_bufs; // Number of buffers
uint32_t n_tensors; // Number of tensors
uint32_t n_ops; // Number of ops
uint32_t flags; // unused
uint32_t pad; // unused
// struct htp_buf_desc bufs[]; -- dspqueue buf 0
// struct htp_tensor tensors[]; -- dspqueue buf 0
// struct htp_op_desc ops[]; -- dspqueue buf 0
};
struct htp_opbatch_rsp {
uint32_t id; // Batch id
uint32_t status; // HTP_STATUS_...
// struct htp_op_req ops[]; -- dspqueue buf 0
uint32_t n_bufs; // Number of buffers
uint32_t n_tensors; // Number of tensors
uint32_t n_ops; // Number of op profile descriptors
uint32_t pad; // unused
// struct htp_prof_desc profs[]; -- dspqueue buf 0
};
#endif /* HTP_OPS_H */

View File

@@ -6,13 +6,17 @@
#include "AEEStdDef.idl"
#include "remote.idl"
struct htp_iface_pmu_conf {
uint32 events[8];
};
interface htp_iface : remote_handle64 {
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
AEEResult stop();
AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned);
AEEResult munmap(in uint32 fd);
AEEResult enable_etm();
AEEResult disable_etm();
AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
AEEResult etm(in uint32 enable);
};
#endif /* HTP_IDL */

View File

@@ -256,6 +256,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b));
}
static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
}
static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b));
}
static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b));
}
#else
static inline HVX_Vector hvx_vec_add_f16_f16(HVX_Vector a, HVX_Vector b)
@@ -273,6 +285,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
return Q6_Vhf_vmpy_VhfVhf(a, b);
}
static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_vadd_VsfVsf(a, b);
}
static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_vsub_VsfVsf(a, b);
}
static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
return Q6_Vsf_vmpy_VsfVsf(a, b);
}
#endif // __HVX_ARCH__ < 79
#endif /* HVX_BASE_H */

View File

@@ -27,6 +27,7 @@
#include "htp-ctx.h"
#include "htp-ops.h"
#include "htp-ops.h"
#include "htp_iface.h"
#include "worker-pool.h"
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
@@ -103,6 +104,54 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
return AEE_SUCCESS;
}
// Toggle user-mode ETM tracing on the DSP via the HAP API.
// Returns the HAP error code unchanged (0 on success); logs failures,
// distinguishing the "API not available on this target" case.
AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {
    int rc;
    if (enable) {
        rc = HAP_user_etm_enable();
    } else {
        rc = HAP_user_etm_disable();
    }

    if (rc) {
        if (rc == AEE_EVERSIONNOTSUPPORT) {
            FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
        } else {
            FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", rc);
        }
    }
    return rc;
}
// Configure the on-DSP op profiler.
// mode: HTP_PROF_DISABLED / HTP_PROF_BASIC / HTP_PROF_PMU.
// In PMU mode the 8 requested event IDs from `pmu_conf` are programmed
// into the QuRT PMU config registers and the counters are enabled.
// The mode is stored in the context for the packet-processing loop.
AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
    struct htp_context * ctx = (struct htp_context *) handle;
    if (!ctx) {
        return AEE_EBADPARM;
    }

    if (mode == HTP_PROF_PMU) {
        const uint32_t* events = pmu_conf->events;

        // Pack 4 event IDs (low 8 bits) into each 32-bit config register:
        // events[0..3] -> evtcfg, events[4..7] -> evtcfg1.
        uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
        for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
            evtcfg |= ((events[i + 0] & 0xFF) << (i * 8));
            evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));
        }

        // For events >255 pack high 2 bits of all 8 event IDs into cfg register
        // 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
        for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
            cfg |= (((events[i] >> 8) & 3) << (i * 2));
        }

        FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);

        // Configure PMU registers (CFG first, then the event selectors),
        // then start counting.
        qurt_pmu_set(QURT_PMUCFG, cfg);
        qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);
        qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
        qurt_pmu_enable(1);
    }

    // Remember the mode so per-op profiling knows what to record.
    ctx->profiler = mode;
    return AEE_SUCCESS;
}
AEEResult htp_iface_close(remote_handle64 handle) {
struct htp_context * ctx = (struct htp_context *) handle;
@@ -129,35 +178,19 @@ AEEResult htp_iface_close(remote_handle64 handle) {
}
}
if (ctx->profiler) {
qurt_pmu_enable(1);
}
if (ctx->etm) {
HAP_user_etm_disable();
}
free(ctx);
return AEE_SUCCESS;
}
AEEResult htp_iface_enable_etm(remote_handle64 handle) {
int err = HAP_user_etm_enable();
if (err) {
if (err == AEE_EVERSIONNOTSUPPORT) {
FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
} else {
FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
}
}
return err;
}
AEEResult htp_iface_disable_etm(remote_handle64 handle) {
int err = HAP_user_etm_disable();
if (err) {
if (err == AEE_EVERSIONNOTSUPPORT) {
FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
} else {
FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
}
}
return err;
}
AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) {
AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
@@ -204,7 +237,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t
return AEE_ENOMEMORY;
}
AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
struct htp_context * ctx = (struct htp_context *) handle;
if (!ctx) {
return AEE_EBADPARM;
@@ -434,19 +467,39 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
struct profile_data {
uint64_t usecs;
uint64_t cycles;
uint64_t pkts;
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
};
static inline void profile_start(struct profile_data * d) {
d->usecs = HAP_perf_get_qtimer_count();
d->cycles = hex_get_cycles();
d->pkts = hex_get_pktcnt();
static inline void profile_start(uint32_t mode, struct profile_data * d) {
switch (mode) {
case HTP_PROF_PMU:
hex_get_pmu(d->pmu_counters);
// fallthrough
case HTP_PROF_BASIC:
d->usecs = HAP_perf_get_qtimer_count();
d->cycles = hex_get_cycles();
break;
default:
break;
}
}
static inline void profile_stop(struct profile_data * d) {
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
d->cycles = hex_get_cycles() - d->cycles;
d->pkts = hex_get_pktcnt() - d->pkts;
static inline void profile_stop(uint32_t mode, struct profile_data * d) {
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
switch (mode) {
case HTP_PROF_PMU:
hex_get_pmu(pmu_counters);
for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
}
// fallthrough
case HTP_PROF_BASIC:
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
d->cycles = hex_get_cycles() - d->cycles;
break;
default:
break;
}
}
static int execute_op(struct htp_ops_context * octx) {
@@ -520,6 +573,9 @@ static int execute_op(struct htp_ops_context * octx) {
case HTP_OP_DIAG:
return op_diag(octx);
case HTP_OP_SOLVE_TRI:
return op_solve_tri(octx);
case HTP_OP_INVALID:
break;
@@ -726,30 +782,33 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
continue;
}
// Reset poll count for valid requests
poll_count = DSPQUEUE_POLL_COUNT;
const uint32_t n_bufs = req.n_bufs;
const uint32_t n_tens = req.n_tensors;
const uint32_t n_ops = req.n_ops;
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
if (dbuf.size < b_size + t_size + o_size) {
if (dbuf.size < b_size + t_size + o_size + p_size) {
FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
break;
}
// Reset poll count for valid requests
poll_count = DSPQUEUE_POLL_COUNT;
uint8_t * m_ptr = dbuf.ptr;
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr;
FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u",
FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
// Setup descriptor pointers
uint8_t * m_ptr = dbuf.ptr;
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size;
struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
prep_op_bufs(ctx, bufs, n_bufs);
prep_tensors(ctx, bufs, tens, n_tens);
@@ -760,22 +819,34 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
for (uint32_t i=0; i < n_ops; i++) {
struct profile_data prof;
profile_start(&prof);
profile_start(ctx->profiler, &prof);
proc_op_req(octx, tens, i, &ops[i]);
profile_stop(&prof);
ops[i].prof_usecs = prof.usecs;
ops[i].prof_cycles = prof.cycles;
ops[i].prof_pkts = prof.pkts;
profile_stop(ctx->profiler, &prof);
if (ctx->profiler) {
pds[i].opcode = ops[i].opcode;
pds[i].usecs = prof.usecs;
pds[i].cycles = prof.cycles;
for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
pds[i].pmu[j] = prof.pmu_counters[j];
}
}
}
// dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
struct htp_opbatch_rsp rsp;
rsp.status = HTP_STATUS_OK; // FIXME
rsp.id = req.id;
rsp.status = HTP_STATUS_OK;
rsp.n_bufs = n_bufs;
rsp.n_tensors = n_tens;
rsp.n_ops = n_ops;
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
if (err != 0) {
FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);

View File

@@ -3017,6 +3017,10 @@ int op_matmul(struct htp_ops_context * octx) {
const int act_stride = (int)(src1->nb[1] / sizeof(float));
const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16));
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
return HTP_STATUS_OK;
}
if (src0->type == HTP_TYPE_F16) {
if (is_batched) {
hmx_matmul_w16a32_batched_params_t batch_params = {

View File

@@ -0,0 +1,267 @@
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
#include <HAP_farf.h>
#include <HAP_perf.h>
#include <string.h>
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "htp-ctx.h"
#include "htp-ops.h"
#include "hvx-types.h"
#include "hvx-utils.h"
// Shared work description handed to the solve-tri worker threads.
struct htp_solve_tri_context {
    struct htp_ops_context * octx;  // op context: tensors, flags, thread count
    uint32_t jobs_per_thread;       // ceil(total_jobs / n_threads)
    uint32_t total_jobs;            // batches (batch mode) or batch*chunk pairs (chunk mode)
    uint32_t k_chunks;              // number of column chunks per batch
    uint32_t col_block;             // columns per chunk (VLEN_FP32)
};
// Scalar fallback for one row segment of the forward substitution:
// for each column c in [col0, col0 + coln) compute
//   X[row][c] = (B[row][c] - sum_{t < row} A[row][t] * X[t][c]) * inv_diag
// where inv_diag is the precomputed reciprocal of A's diagonal element,
// X is row-major with k columns, A_row/B_row point at row `row` of A/B.
static inline void solve_tri_row_scalar(const float * A_row,
                                        const float * B_row,
                                        float *       X,
                                        uint32_t      row,
                                        uint32_t      k,
                                        uint32_t      col0,
                                        uint32_t      coln,
                                        float         inv_diag) {
    const uint32_t col_end = col0 + coln;
    for (uint32_t c = col0; c < col_end; c++) {
        // Dot product of the row of A with the already-solved rows of X.
        float acc = 0.0f;
        for (uint32_t r = 0; r < row; r++) {
            acc += A_row[r] * X[r * k + c];
        }
        X[row * k + c] = (B_row[c] - acc) * inv_diag;
    }
}
// Load n (< VLEN_FP32) floats from src into an HVX vector, zeroing the
// remaining lanes via a predicate mask over the first n*4 bytes.
// NOTE(review): this performs a full unaligned vector load and masks
// afterwards, so it reads past src + n; assumes the over-read stays in
// mapped memory (common HVX idiom) — confirm buffers are padded.
static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) {
    HVX_Vector v = *((const HVX_UVector *) src);
    HVX_VectorPred mask = Q6_Q_vsetq2_R(n * sizeof(float));
    return Q6_V_vmux_QVV(mask, v, Q6_V_vzero());
}
// HVX path for one row segment of the forward substitution: processes
// coln (<= VLEN_FP32) consecutive columns starting at col0 in a single
// vector, accumulating the dot product over previously solved rows for
// all columns at once.
static inline void solve_tri_row_hvx(const float * A_row,
                                     const float * B_row,
                                     float *       X,
                                     uint32_t      row,
                                     uint32_t      k,
                                     uint32_t      col0,
                                     uint32_t      coln,
                                     float         inv_diag) {
    // Full vectors can use a plain unaligned load; partial tails need the
    // masked load so out-of-range lanes contribute zero to the sum.
    const bool full = (coln == VLEN_FP32);

    // sum_v[c] = sum_{t < row} A_row[t] * X[t][col0 + c]
    HVX_Vector sum_v = Q6_V_vzero();
    for (uint32_t t = 0; t < row; ++t) {
        const float a = A_row[t];
        const float * x_row_col = X + t * k + col0;

        HVX_Vector x_v = full ? *((const HVX_UVector *) x_row_col) : hvx_load_partial_f32(x_row_col, coln);
        HVX_Vector a_v = hvx_vec_splat_f32(a);
        sum_v = hvx_vec_add_f32_f32(sum_v, hvx_vec_mul_f32_f32(x_v, a_v));
    }

    const float * b_row_col = B_row + col0;
    float * x_out_col = X + row * k + col0;

    // X[row][col] = (B[row][col] - sum) * inv_diag, written back as a
    // partial unaligned store of coln * sizeof(float) bytes.
    HVX_Vector b_v = full ? *((const HVX_UVector *) b_row_col) : hvx_load_partial_f32(b_row_col, coln);
    HVX_Vector inv_diag_v = hvx_vec_splat_f32(inv_diag);
    HVX_Vector out_v = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(b_v, sum_v), inv_diag_v);

    hvx_vec_store_u((void *) x_out_col, coln * sizeof(float), out_v);
}
// Batch-level thread: each job is one full batch.
// Thread `ith` of `nth` solves a contiguous slice of batches; each batch
// is a complete n-row forward substitution X = A^-1 * B.
static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) {
    struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
    struct htp_ops_context * octx = sctx->octx;

    const struct htp_tensor * src0 = octx->src[0]; // A
    const struct htp_tensor * src1 = octx->src[1]; // B
    const struct htp_tensor * dst = octx->dst;     // X

    const uint32_t n = src0->ne[0];    // A is n x n
    const uint32_t k = src1->ne[0];    // columns of B / X
    const uint32_t ne02 = src0->ne[2]; // batches along dim 2

    const uint32_t col_block = VLEN_FP32;
    const uint32_t k_full = (k / col_block) * col_block; // columns covered by full vectors

    // [start_batch, end_batch): this thread's slice of the flat batch index.
    const uint32_t start_batch = sctx->jobs_per_thread * ith;
    const uint32_t end_batch = MIN(start_batch + sctx->jobs_per_thread, sctx->total_jobs);

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    for (uint32_t batch = start_batch; batch < end_batch; ++batch) {
        // Decompose the flat batch index into (i03, i02) and locate the
        // per-batch planes via the byte strides nb[2]/nb[3].
        const uint32_t i03 = batch / ne02;
        const uint32_t i02 = batch - i03 * ne02;

        const float * A_batch =
            (const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
        const float * B_batch =
            (const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
        float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);

        // Forward substitution: row `row` only reads rows < row of X.
        for (uint32_t row = 0; row < n; ++row) {
            const float diag = A_batch[row * n + row];
            const float inv_diag = 1.0f / diag; // NOTE(review): no zero-diagonal guard

            const float * A_row = A_batch + row * n;
            const float * B_row = B_batch + row * k;

            // Full-width vector chunks first ...
            uint32_t col0 = 0;
            for (; col0 < k_full; col0 += col_block) {
                solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, col_block, inv_diag);
            }
            // ... then the ragged tail: HVX for >= 8 columns, scalar otherwise.
            if (col0 < k) {
                const uint32_t coln = k - col0;
                if (coln >= 8) {
                    solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
                } else {
                    solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
                }
            }
        }
    }

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "solve-tri-batch %d/%d: A=(%ux%u) B=(%ux%u) batch %u:%u usec %u\n",
         ith, nth, n, n, k, n, start_batch, end_batch,
         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
// Chunk-level thread: each job is one (batch, col_chunk) pair.
// Used when there are too few batches to occupy all threads: column
// chunks of X are independent of each other (row `row` of X only reads
// earlier rows of the SAME columns), so they can be solved in parallel.
static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) {
    struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
    struct htp_ops_context * octx = sctx->octx;

    const struct htp_tensor * src0 = octx->src[0]; // A
    const struct htp_tensor * src1 = octx->src[1]; // B
    const struct htp_tensor * dst = octx->dst;     // X

    const uint32_t n = src0->ne[0];    // A is n x n
    const uint32_t k = src1->ne[0];    // columns of B / X
    const uint32_t ne02 = src0->ne[2]; // batches along dim 2

    // [start_job, end_job): this thread's slice of (batch, chunk) jobs.
    const uint32_t start_job = sctx->jobs_per_thread * ith;
    const uint32_t end_job = MIN(start_job + sctx->jobs_per_thread, sctx->total_jobs);

    uint64_t t1, t2;
    t1 = HAP_perf_get_qtimer_count();

    for (uint32_t job = start_job; job < end_job; ++job) {
        // job -> (batch, chunk) -> (i03, i02, column range)
        const uint32_t batch = job / sctx->k_chunks;
        const uint32_t chunk = job - batch * sctx->k_chunks;
        const uint32_t i03 = batch / ne02;
        const uint32_t i02 = batch - i03 * ne02;
        const uint32_t col0 = chunk * sctx->col_block;
        const uint32_t coln = MIN(sctx->col_block, k - col0); // last chunk may be ragged

        const float * A_batch =
            (const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
        const float * B_batch =
            (const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
        float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);

        // HVX pays off only for reasonably wide chunks.
        const bool use_hvx = (coln >= 8);

        for (uint32_t row = 0; row < n; ++row) {
            const float diag = A_batch[row * n + row];
            const float inv_diag = 1.0f / diag; // NOTE(review): no zero-diagonal guard

            const float * A_row = A_batch + row * n;
            const float * B_row = B_batch + row * k;

            if (use_hvx) {
                solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
            } else {
                solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
            }
        }
    }

    t2 = HAP_perf_get_qtimer_count();

    FARF(HIGH, "solve-tri-chunk %d/%d: A=(%ux%u) B=(%ux%u) job %u:%u usec %u\n",
         ith, nth, n, n, k, n, start_job, end_job,
         (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
int op_solve_tri(struct htp_ops_context * octx) {
const struct htp_tensor * src0 = octx->src[0]; // A
const struct htp_tensor * src1 = octx->src[1]; // B
const struct htp_tensor * dst = octx->dst; // X
if (src0->type != HTP_TYPE_F32 || src1->type != HTP_TYPE_F32 || dst->type != HTP_TYPE_F32) {
return HTP_STATUS_NO_SUPPORT;
}
// left=true, lower=true, uni=false only
if (src0->ne[0] != src0->ne[1]) {
return HTP_STATUS_INVAL_PARAMS;
}
if (src0->ne[1] != src1->ne[1]) {
return HTP_STATUS_INVAL_PARAMS;
}
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
return HTP_STATUS_INVAL_PARAMS;
}
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] ||
dst->ne[3] != src1->ne[3]) {
return HTP_STATUS_INVAL_PARAMS;
}
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
return HTP_STATUS_OK;
}
const uint32_t k = src1->ne[0];
const uint32_t col_block = VLEN_FP32;
const uint32_t k_chunks = (k + col_block - 1) / col_block;
const uint32_t total_batches = src0->ne[2] * src0->ne[3];
const bool batched = total_batches >= (uint32_t) octx->n_threads;
FARF(HIGH, "solve-tri: (%ux%ux%ux%u) x (%ux%ux%ux%u) -> (%ux%ux%ux%u) : batched %d\n",
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], batched);
if (batched) {
// Batch-level parallelism
const uint32_t n_threads = MIN((uint32_t) octx->n_threads, total_batches);
struct htp_solve_tri_context sctx = {
.octx = octx,
.jobs_per_thread = (total_batches + n_threads - 1) / n_threads,
.total_jobs = total_batches,
.k_chunks = k_chunks,
.col_block = col_block,
};
worker_pool_run_func(octx->ctx->worker_pool, solve_tri_batch_thread_f32, &sctx, n_threads);
} else {
// Chunk-level parallelism
const uint32_t total_jobs = total_batches * k_chunks;
const uint32_t n_threads = MIN((uint32_t) octx->n_threads, MAX(total_jobs, 1));
struct htp_solve_tri_context sctx = {
.octx = octx,
.jobs_per_thread = (total_jobs + n_threads - 1) / n_threads,
.total_jobs = total_jobs,
.k_chunks = k_chunks,
.col_block = col_block,
};
worker_pool_run_func(octx->ctx->worker_pool, solve_tri_chunk_thread_f32, &sctx, n_threads);
}
return HTP_STATUS_OK;
}

View File

@@ -197,11 +197,12 @@ struct ggml_webgpu_row_norm_pipeline_key_hash {
/** RMS_NORM + MUL **/
struct ggml_webgpu_rms_norm_mul_pipeline_key {
bool inplace;
bool src_overlap;
bool inplace; // rn_src == dst
bool overlap; // mul_src == dst
bool src_overlap; // rn_src == mul_src
bool operator==(const ggml_webgpu_rms_norm_mul_pipeline_key & other) const {
return inplace == other.inplace && src_overlap == other.src_overlap;
return inplace == other.inplace && overlap == other.overlap && src_overlap == other.src_overlap;
}
};
@@ -209,6 +210,7 @@ struct ggml_webgpu_rms_norm_mul_pipeline_key_hash {
size_t operator()(const ggml_webgpu_rms_norm_mul_pipeline_key & key) const {
size_t seed = 0;
ggml_webgpu_hash_combine(seed, key.inplace);
ggml_webgpu_hash_combine(seed, key.overlap);
ggml_webgpu_hash_combine(seed, key.src_overlap);
return seed;
}
@@ -556,7 +558,7 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_
const size_t q_tile = context.sg_mat_m;
const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
size_t bytes_per_kv = 0;
size_t bytes_per_kv = 0;
if (!key.kv_direct) {
bytes_per_kv += std::max(key.head_dim_qk, key.head_dim_v);
}
@@ -1878,6 +1880,7 @@ class ggml_webgpu_shader_lib {
webgpu_pipeline get_rms_norm_mul_pipeline(const ggml_webgpu_shader_lib_context & context) {
ggml_webgpu_rms_norm_mul_pipeline_key key = {};
key.inplace = context.inplace;
key.overlap = context.overlap;
key.src_overlap = context.src_overlap;
auto it = rms_norm_mul_pipelines.find(key);
@@ -1892,6 +1895,9 @@ class ggml_webgpu_shader_lib {
if (key.inplace) {
defines.push_back("INPLACE");
variant += "_inplace";
} else if (key.overlap) {
defines.push_back("OVERLAP");
variant += "_overlap";
} else if (key.src_overlap) {
defines.push_back("SRC_OVERLAP");
variant += "_src_overlap";

View File

@@ -2071,8 +2071,9 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context
GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1");
}
bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
bool overlap = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
(ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst));
bool inplace = ggml_webgpu_tensor_equal(rn_src, dst);
bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src);
uint32_t offset_merged_rn_src = 0;
@@ -2116,7 +2117,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context
std::vector<wgpu::BindGroupEntry> entries;
if (inplace) {
if (inplace || overlap) {
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src));
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src));
} else if (src_overlap) {
@@ -2136,6 +2137,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
shader_lib_ctx.inplace = inplace;
shader_lib_ctx.overlap = overlap;
shader_lib_ctx.src_overlap = src_overlap;
webgpu_pipeline pipeline = ctx->shader_lib->get_rms_norm_mul_pipeline(shader_lib_ctx);

View File

@@ -1,4 +1,4 @@
#ifdef INPLACE
#ifdef OVERLAP
@group(0) @binding(0)
var<storage, read_write> rn_src: array<f32>;
@@ -13,6 +13,21 @@ fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32)
mul_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
}
#elif INPLACE
@group(0) @binding(0)
var<storage, read_write> rn_src: array<f32>;
@group(0) @binding(1)
var<storage, read_write> mul_src: array<f32>;
@group(0) @binding(2)
var<uniform> params: Params;
fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
rn_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
}
#elif SRC_OVERLAP
@group(0) @binding(0)

View File

@@ -23,10 +23,10 @@ verbose=
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View File

@@ -28,10 +28,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View File

@@ -28,10 +28,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View File

@@ -37,10 +37,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View File

@@ -25,10 +25,10 @@ sched=
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
profile=
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
opmask=
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
nhvx=
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"

View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
import sys
import os
import re
import argparse
import statistics
import logging
from collections import defaultdict
# Mapping of cli-friendly names to (internal_data_key, Display Header, numeric_sort_key)
# String columns sort by their display value; numeric columns sort via the
# parallel "_sort_*" fields, which hold raw numbers next to the formatted
# strings so sorting is not done on text.
COL_MAP = {
    "op": ("op", "Op", "op"),
    "dims": ("dims", "Dims", "dims"),
    "dtypes": ("dtypes", "DTypes", "dtypes"),
    "count": ("count", "Count", "_sort_count"),
    "max-usec": ("max_usec", "Max usec", "_sort_max_usec"),
    "avg-usec": ("avg_usec", "Avg usec", "_sort_avg_usec"),
    "max-cycles": ("max_cycles", "Max Cycles", "_sort_max_cycles"),
    "avg-cycles": ("avg_cycles", "Avg Cycles", "_sort_avg_cycles"),
    "max-pmu": ("max_pmu", "Max PMU", "_sort_max_pmu"),
    "avg-pmu": ("avg_pmu", "Avg PMU", "_sort_avg_pmu"),
}
# One "profile-op" log line emitted by the hexagon backend, roughly:
#   profile-op <OP>: ... : <dims> : <dtypes> : ... usec <n> cycles <n> [pmu [a, b, ...]]
op_pattern = re.compile(
    r"profile-op\s+(?P<op_name>[A-Z_0-9]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
)

logger = logging.getLogger("ggml-hexagon-profile")


def parse_log(file_path, pmu_index=None):
    """Parse profile-op records from a log file.

    Args:
        file_path: path of the log file, or "-" to read from stdin.
        pmu_index: optional index into each record's PMU counter list;
            when given, the counter at that index is stored as 'pmu_val'
            (None if absent or malformed).

    Returns:
        List of dicts with keys: name, dims, types, usec, cycles, pmu_val.
        Exits the process with status 1 if the file does not exist.
    """
    try:
        if file_path != "-":
            f = open(file_path, 'r', encoding='utf-8', errors='ignore')
        else:
            f = os.fdopen(0, 'r', encoding='utf-8', errors='ignore')
    except FileNotFoundError:
        logger.error(f"file '{file_path}' not found.")
        sys.exit(1)

    all_ops = []
    # 'with' guarantees the handle is closed even if parsing raises
    # mid-loop (the original leaked the handle on exceptions).
    with f:
        for line in f:
            match = op_pattern.search(line)
            if not match:
                continue

            pmu_raw = match.group('pmu')
            pmu_val = None
            if pmu_raw and pmu_index is not None:
                try:
                    pmu_list = [int(x.strip()) for x in pmu_raw.split(',')]
                    if len(pmu_list) > pmu_index:
                        pmu_val = pmu_list[pmu_index]
                except (ValueError, IndexError):
                    pmu_val = None

            all_ops.append({
                'name': match.group('op_name'),
                'dims': match.group('dims').strip(),
                'types': match.group('types').strip(),
                'usec': int(match.group('usec')),
                'cycles': int(match.group('cycles')),
                'pmu_val': pmu_val,
            })
    return all_ops
def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None):
    """Group parsed op records and print a Markdown-style table via the logger.

    Args:
        ops: list of records produced by parse_log().
        top_n: keep only the first N groups after sorting.
        width_overrides: {column_name: width}; a width of 0 hides the column.
        sort_col: cli-friendly key into COL_MAP selecting the sort column.
        pmu_name: display name for the PMU columns; the two PMU columns are
            only shown when this is set.
    """
    if not ops:
        logger.info("No valid records found.")
        return
    # Aggregate identical (op, dims, dtypes) invocations into one row.
    grouped = defaultdict(list)
    for op in ops:
        key = (op['name'], op['dims'], op['types'])
        grouped[key].append(op)
    group_stats = []
    for (name, dims, types), group_ops in grouped.items():
        usecs = [o['usec'] for o in group_ops]
        cycles = [o['cycles'] for o in group_ops]
        pmu_vals = [o['pmu_val'] for o in group_ops if o['pmu_val'] is not None]
        group_stats.append({
            # Pre-formatted display strings for the table cells
            'op': name,
            'dims': dims,
            'dtypes': types,
            'count': str(len(group_ops)),
            'max_usec': str(max(usecs)),
            'avg_usec': f"{statistics.mean(usecs):.2f}",
            'max_cycles': str(max(cycles)),
            'avg_cycles': f"{statistics.mean(cycles):.2f}",
            'max_pmu': str(max(pmu_vals)) if pmu_vals else "0",
            'avg_pmu': f"{statistics.mean(pmu_vals):.2f}" if pmu_vals else "0.00",
            # Numeric values for accurate sorting
            '_sort_count': len(group_ops),
            '_sort_max_usec': max(usecs),
            '_sort_avg_usec': statistics.mean(usecs),
            '_sort_max_cycles': max(cycles),
            '_sort_avg_cycles': statistics.mean(cycles),
            '_sort_max_pmu': max(pmu_vals) if pmu_vals else 0,
            '_sort_avg_pmu': statistics.mean(pmu_vals) if pmu_vals else 0
        })
    # Sorting logic
    actual_sort_key = COL_MAP[sort_col][2]
    # We sort numeric fields descending, strings (op/dims) ascending
    is_numeric = actual_sort_key.startswith("_") or actual_sort_key == "count"
    sorted_groups = sorted(group_stats, key=lambda x: x[actual_sort_key], reverse=is_numeric)[:top_n]
    # Define initial column order
    active_cols = ["op", "dims", "dtypes"]
    if pmu_name:
        active_cols += ["max-pmu", "avg-pmu"]
    active_cols += ["max-usec", "avg-usec", "max-cycles", "avg-cycles", "count"]
    final_headers, final_keys, final_widths = [], [], []
    for col_name in active_cols:
        data_key, header_text, _ = COL_MAP[col_name]
        if "pmu" in col_name and pmu_name:
            # Show the user-supplied counter name instead of the generic "PMU".
            header_text = header_text.replace("PMU", pmu_name)
        # Natural width fits the widest cell or the header, whichever is larger.
        natural_width = max([len(row[data_key]) for row in sorted_groups] + [len(header_text)])
        target_width = width_overrides.get(col_name, natural_width)
        if target_width == 0:
            # Width override of 0 drops the column entirely.
            continue
        final_headers.append(header_text)
        final_keys.append(data_key)
        final_widths.append(target_width)
    # Print Report
    logger.info(f"\n# Profile Report (Top {top_n} Ops sorted by {sort_col})\n")
    header_line = "| " + " | ".join(f"{h:<{final_widths[i]}}" for i, h in enumerate(final_headers)) + " |"
    sep_line = "| " + " | ".join("-" * final_widths[i] for i in range(len(final_headers))) + " |"
    logger.info(header_line)
    logger.info(sep_line)
    for group in sorted_groups:
        row_vals = []
        for i, key in enumerate(final_keys):
            val = group[key]
            if len(val) > final_widths[i]:
                # Truncate overlong cells with an ellipsis to keep columns aligned.
                val = val[:final_widths[i] - 3] + "..."
            row_vals.append(f"{val:<{final_widths[i]}}")
        logger.info("| " + " | ".join(row_vals) + " |")
def main():
    """CLI entry point: parse arguments, read the log, print the report."""
    ap = argparse.ArgumentParser(description="Post-process Op profile info.")
    ap.add_argument("logfile")
    ap.add_argument("-n", "--top", type=int, default=100)
    ap.add_argument("--sort", type=str, default="max-usec", choices=list(COL_MAP.keys()))
    ap.add_argument("--pmu-index", type=int)
    ap.add_argument("--pmu-name", type=str)
    ap.add_argument("--width", action='append', default=['dims:40'], help="Override column width, e.g. --width dims:50")
    args = ap.parse_args()

    logging.basicConfig(level=logging.INFO, format='%(message)s')

    # Sorting by a PMU column is meaningless without knowing which counter to read.
    if "pmu" in args.sort and args.pmu_index is None:
        logger.error(f"Cannot sort by '{args.sort}' without --pmu-index.")
        sys.exit(1)

    # Later "col:width" specs overwrite earlier ones, so user-supplied
    # overrides win over the built-in 'dims:40' default.
    width_overrides = {}
    for spec in (args.width or []):
        try:
            col, value = spec.split(':')
            width_overrides[col.lower()] = int(value)
        except ValueError:
            logger.warning(f"Invalid width format '{spec}'")

    pmu_label = None
    if args.pmu_index is not None:
        pmu_label = args.pmu_name or f"#{args.pmu_index}"

    records = parse_log(args.logfile, pmu_index=args.pmu_index)
    generate_report(records, args.top, width_overrides, args.sort, pmu_name=pmu_label)


if __name__ == "__main__":
    main()

View File

@@ -1 +0,0 @@
This directory includes pytest based scripts for running CI jobs on Qualcomm Device Cloud (QDC).

View File

@@ -8,12 +8,9 @@ iniconfig==2.1.0
outcome==1.3.0.post0
packaging==25.0
pluggy==1.6.0
Pygments==2.19.2
PySocks==1.7.1
pytest==8.4.2
pytest-dependency==0.6.0
selenium==4.36.0
setuptools==80.9.0
sniffio==1.3.1
sortedcontainers==2.4.0
tomli==2.3.0

View File

@@ -0,0 +1,401 @@
"""Run llama.cpp Hexagon Android tests in a single QDC Appium job.
Bundles test scripts into one artifact and submits a single QDC job:
1. run_bench_tests_posix.py — llama-completion and llama-bench on CPU / GPU / NPU
(from scripts/snapdragon/qdc/)
Results are written to $GITHUB_STEP_SUMMARY when set (GitHub Actions).
Prerequisites:
pip install /path/to/qualcomm_device_cloud_sdk*.whl
Required environment variables:
QDC_API_KEY API key from QDC UI -> Users -> Settings -> API Keys
Usage:
python run_qdc_jobs.py \\
--pkg-dir pkg-snapdragon/llama.cpp \\
--model-url https://.../Llama-3.2-1B-Instruct-Q4_0.gguf \\
--device SM8750
"""
from __future__ import annotations
import argparse
import logging
import os
import re
import shutil
import sys
import tempfile
import time
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from pathlib import Path
from qualcomm_device_cloud_sdk.api import qdc_api # ty: ignore[unresolved-import]
from qualcomm_device_cloud_sdk.logging import configure_logging # ty: ignore[unresolved-import]
from qualcomm_device_cloud_sdk.models import ArtifactType, JobMode, JobState, JobSubmissionParameter, JobType, TestFramework # ty: ignore[unresolved-import]
configure_logging(level=logging.INFO, handlers=[logging.StreamHandler()])
log = logging.getLogger(__name__)
POLL_INTERVAL = 30
JOB_TIMEOUT = 3600
LOG_UPLOAD_TIMEOUT = 600
CAPACITY_TIMEOUT = 1800
CAPACITY_POLL = 60
MAX_CONCURRENT_JOBS = 5
TERMINAL_STATES = {JobState.COMPLETED, JobState.CANCELED}
NON_TERMINAL_STATES = {JobState.DISPATCHED, JobState.RUNNING, JobState.SETUP, JobState.SUBMITTED}
_SCRIPTS_DIR = Path(__file__).parent
_TESTS_DIR = _SCRIPTS_DIR / "tests"
_RUN_BENCH = _TESTS_DIR / "run_bench_tests_posix.py"
_RUN_BACKEND_OPS = _TESTS_DIR / "run_backend_ops_posix.py"
_UTILS = _TESTS_DIR / "utils.py"
_CONFTEST = _TESTS_DIR / "conftest.py"
_REQUIREMENTS = _SCRIPTS_DIR / "requirements.txt"
_PYTEST_LINE_RE = re.compile(
r"(?:[\w/]+\.py::)?(?:\w+::)?([\w\[\].-]+)\s+(PASSED|FAILED|ERROR|SKIPPED)"
)
_EXCLUDED_LOGS = {"qdc_android_whole_host-000.log", "qdc_kernel_host-000.log"}
_NON_TERMINAL_STATE_VALUES = {s.value for s in NON_TERMINAL_STATES}
@dataclass
class JobResult:
    # Overall verdict: QDC job reached COMPLETED and all parsed tests passed.
    passed: bool
    # {test_name: passed} parsed from JUnit XML (or pytest stdout fallback).
    tests: dict[str, bool] = field(default_factory=dict)
    # {log_filename: content} for .log files pulled from the device.
    raw_logs: dict[str, str] = field(default_factory=dict)
    # {test_name: failure message/traceback} for failed tests.
    failure_details: dict[str, str] = field(default_factory=dict)
def build_artifact_zip(
    pkg_dir: Path,
    stage_dir: Path,
    *,
    test_mode: str = "bench",
    model_url: str | None = None,
) -> Path:
    """Bundle everything into a single QDC artifact zip.

    Zip structure (extracted by QDC to /qdc/appium/ on the runner):
        llama_cpp_bundle/          installed package (adb pushed to /data/local/tmp/)
        tests/
            utils.py               shared helpers (paths, run_adb_command, …)
            conftest.py            shared pytest fixtures (driver)
            test_bench_posix.py    bench + cli tests (<<MODEL_URL>> substituted)
            AND/OR
            test_backend_ops_posix.py   test-backend-ops -b HTP0
        requirements.txt

    Args:
        pkg_dir: installed llama.cpp package directory (contains bin/ and lib/).
        stage_dir: empty staging directory; the zip is created inside it.
        test_mode: "bench", "backend-ops" or "all".
        model_url: model download URL; required for "bench"/"all".

    Returns:
        Path to the created artifact zip.
    """
    shutil.copytree(pkg_dir, stage_dir / "llama_cpp_bundle")
    tests_dir = stage_dir / "tests"
    tests_dir.mkdir()
    shutil.copy(_UTILS, tests_dir / "utils.py")
    shutil.copy(_CONFTEST, tests_dir / "conftest.py")
    if test_mode in ("bench", "all"):
        assert model_url is not None, "--model-url is required for bench/all test modes"
        # Substitute the placeholder so the on-device script can curl the model.
        (tests_dir / "test_bench_posix.py").write_text(
            _RUN_BENCH.read_text().replace("<<MODEL_URL>>", model_url)
        )
    if test_mode in ("backend-ops", "all"):
        shutil.copy(_RUN_BACKEND_OPS, tests_dir / "test_backend_ops_posix.py")
    shutil.copy(_REQUIREMENTS, stage_dir / "requirements.txt")
    # Force a JUnit XML report so results can be parsed after the job finishes.
    (stage_dir / "pytest.ini").write_text("[pytest]\naddopts = --junitxml=results.xml\n")
    zip_base = str(stage_dir / "artifact")
    shutil.make_archive(zip_base, "zip", stage_dir)
    return Path(f"{zip_base}.zip")
def wait_for_job(client, job_id: str, timeout: int) -> str:
    """Poll the job status until it reaches a terminal state.

    Returns:
        The final raw status string, lowercased.

    Raises:
        TimeoutError: if the job is still non-terminal after `timeout` seconds.
        NOTE(review): elapsed only counts the sleeps, not API round-trips, so
        the real wall-clock timeout is somewhat longer — confirm acceptable.
    """
    elapsed = 0
    while elapsed < timeout:
        raw = qdc_api.get_job_status(client, job_id)
        try:
            status = JobState(raw)
        except ValueError:
            # Unknown state string — keep it as-is; it can never be terminal.
            status = raw
        if status in TERMINAL_STATES:
            return raw.lower()
        log.info("Job %s: %s", job_id, raw)
        time.sleep(POLL_INTERVAL)
        elapsed += POLL_INTERVAL
    raise TimeoutError(f"Job {job_id} did not finish within {timeout}s")
def wait_for_log_upload(client, job_id: str) -> None:
    """Block until QDC reports device logs as uploaded, or give up after LOG_UPLOAD_TIMEOUT."""
    elapsed = 0
    while elapsed <= LOG_UPLOAD_TIMEOUT:
        status = (qdc_api.get_job_log_upload_status(client, job_id) or "").lower()
        if status in {"completed", "failed"}:
            # Either way the upload is over; callers fetch whatever exists.
            return
        log.info("Waiting for log upload (status=%s) ...", status)
        time.sleep(POLL_INTERVAL)
        elapsed += POLL_INTERVAL
    # Non-fatal: log collection proceeds with whatever was uploaded.
    log.warning("Timed out waiting for log upload after %ds", LOG_UPLOAD_TIMEOUT)
def wait_for_capacity(client, max_jobs: int = MAX_CONCURRENT_JOBS) -> None:
    """Block until the user's active (non-terminal) QDC job count is below max_jobs."""
    elapsed = 0
    while elapsed < CAPACITY_TIMEOUT:
        jobs_page = qdc_api.get_jobs_list(client, page_number=0, page_size=50)
        if jobs_page is None:
            # Fail open: a listing error must not block CI forever.
            log.warning("Could not retrieve job list; proceeding without capacity check")
            return
        items = getattr(jobs_page, "data", []) or []
        # NOTE(review): only the first page (50 jobs) is inspected — assumed to
        # cover all active jobs; verify against account concurrency limits.
        active = sum(1 for j in items if getattr(j, "state", None) in _NON_TERMINAL_STATE_VALUES)
        if active < max_jobs:
            log.info("Active QDC jobs: %d / %d — proceeding", active, max_jobs)
            return
        log.info("Active QDC jobs: %d / %d — waiting %ds ...", active, max_jobs, CAPACITY_POLL)
        time.sleep(CAPACITY_POLL)
        elapsed += CAPACITY_POLL
    # Fail open here as well — submission is attempted anyway.
    log.warning("Capacity wait timed out after %ds; proceeding anyway", CAPACITY_TIMEOUT)
def _parse_junit_xml(content: str) -> tuple[dict[str, bool], dict[str, str]]:
try:
root = ET.fromstring(content)
except ET.ParseError:
return {}, {}
results: dict[str, bool] = {}
failures: dict[str, str] = {}
for tc in root.iter("testcase"):
name = tc.get("name", "")
if classname := tc.get("classname", ""):
name = f"{classname}.{name}"
failure_el = tc.find("failure")
if failure_el is None:
failure_el = tc.find("error")
results[name] = failure_el is None
if failure_el is not None:
parts = [failure_el.get("message", ""), failure_el.text or ""]
failures[name] = "\n".join(p for p in parts if p).strip()
return results, failures
def _parse_pytest_output(content: str) -> dict[str, bool]:
results: dict[str, bool] = {}
for m in _PYTEST_LINE_RE.finditer(content):
results[m.group(1)] = m.group(2) == "PASSED"
return results
def fetch_logs_and_parse_tests(
    client, job_id: str
) -> tuple[dict[str, bool], dict[str, str], dict[str, str]]:
    """Returns (test_results, raw_logs, failure_details).

    Downloads every log archive for the job into one temp dir, then walks it:
      - .xml files are parsed as JUnit reports (authoritative results),
      - .log files are echoed to our log, kept verbatim, and scraped for
        PASSED/FAILED lines as a fallback used only when no XML results exist.
    """
    log_files = qdc_api.get_job_log_files(client, job_id)
    if not log_files:
        log.warning("No log files returned for job %s", job_id)
        return {}, {}, {}
    test_results: dict[str, bool] = {}
    pytest_fallback: dict[str, bool] = {}
    raw_logs: dict[str, str] = {}
    failure_details: dict[str, str] = {}
    with tempfile.TemporaryDirectory() as tmpdir:
        for lf in log_files:
            log.info("Downloading log file: %s", lf.filename)
            # Each archive overwrites log.zip; extracted contents accumulate in tmpdir.
            zip_path = os.path.join(tmpdir, "log.zip")
            qdc_api.download_job_log_files(client, lf.filename, zip_path)
            try:
                shutil.unpack_archive(zip_path, tmpdir, "zip")
            except Exception as e:
                # Best-effort: a corrupt archive must not abort result collection.
                log.warning("Could not unpack %s as zip: %s", lf.filename, e)
        for root_dir, _, files in os.walk(tmpdir):
            for fname in sorted(files):
                fpath = os.path.join(root_dir, fname)
                content = Path(fpath).read_text(errors="replace")
                if fname.endswith(".xml"):
                    results, failures = _parse_junit_xml(content)
                    test_results.update(results)
                    failure_details.update(failures)
                elif fname.endswith(".log"):
                    if fname in _EXCLUDED_LOGS:
                        # Platform host logs are huge and not test-relevant.
                        continue
                    log.info("--- %s ---", fname)
                    log.info("%s", content)
                    raw_logs[fname] = content
                    pytest_fallback.update(_parse_pytest_output(content))
    return (test_results if test_results else pytest_fallback), raw_logs, failure_details
def write_summary(result: JobResult, title: str = "QDC Test Results") -> None:
summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
if not summary_path:
return
icon = "" if result.passed else ""
lines = [
f"## {title}\n",
f"Overall: {icon} {'PASSED' if result.passed else 'FAILED'}\n",
]
reportable = {n: ok for n, ok in result.tests.items() if "test_install" not in n}
if reportable:
lines += ["| Test | Result |", "| ---- | ------ |"]
for name, ok in reportable.items():
lines.append(f"| `{name}` | {'' if ok else ''} |")
passed_n = sum(1 for v in reportable.values() if v)
failed_n = sum(1 for v in reportable.values() if not v)
lines += ["", f"**{passed_n} passed, {failed_n} failed**"]
else:
lines.append("_No per-test data available._")
failed_names = [n for n, ok in reportable.items() if not ok]
if failed_names:
lines += ["", "### Failures"]
for name in failed_names:
detail = result.failure_details.get(name)
if detail:
lines += [
f"<details><summary><code>{name}</code></summary>",
"",
"```",
detail,
"```",
"",
"</details>",
]
if result.raw_logs:
lines += ["", "### Raw Logs"]
for fname, content in sorted(result.raw_logs.items()):
lines += [
f"<details><summary>{fname}</summary>",
"",
"```",
content.rstrip(),
"```",
"",
"</details>",
]
with open(summary_path, "a") as f:
f.write("\n".join(lines) + "\n")
def parse_args() -> argparse.Namespace:
    """Parse CLI arguments.

    Exits with an argparse error when --model-url is missing for a test
    mode that needs it (bench / all).
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--pkg-dir", required=True, type=Path,
                   help="Installed llama.cpp package directory (contains bin/ and lib/)")
    p.add_argument("--model-url",
                   help="Direct URL to the GGUF model file (required for --test bench)")
    p.add_argument("--device", required=True,
                   help="QDC chipset name, e.g. SM8750")
    p.add_argument("--test", choices=["bench", "backend-ops", "all"], default="bench",
                   help="Test suite to run (default: bench)")
    p.add_argument("--job-timeout", type=int, default=JOB_TIMEOUT, metavar="SECONDS",
                   help=f"Max seconds to wait for job completion (default: {JOB_TIMEOUT})")
    args = p.parse_args()
    # Bench tests download the model on-device, so a URL is mandatory there.
    if args.test in ("bench", "all") and not args.model_url:
        p.error("--model-url is required when --test bench or --test all")
    return args
def main() -> int:
    """Build the artifact, submit one QDC job, wait for it, and summarize results.

    Returns:
        Process exit code: 0 on success, 1 on any setup failure, job failure,
        or failed test.
    """
    args = parse_args()
    api_key = os.environ.get("QDC_API_KEY")
    if not api_key:
        log.error("QDC_API_KEY environment variable must be set")
        return 1
    if not args.pkg_dir.is_dir():
        log.error("--pkg-dir %s does not exist", args.pkg_dir)
        return 1
    client = qdc_api.get_public_api_client_using_api_key(
        api_key_header=api_key,
        app_name_header="llama-cpp-ci",
        on_behalf_of_header="llama-cpp-ci",
        client_type_header="Python",
    )
    # Resolve the chipset name (e.g. SM8750) to a QDC target id.
    target_id = qdc_api.get_target_id(client, args.device)
    if target_id is None:
        log.error("Could not find QDC target for device %r", args.device)
        return 1
    with tempfile.TemporaryDirectory() as tmpdir:
        log.info("Building artifact ...")
        zip_path = build_artifact_zip(
            args.pkg_dir, Path(tmpdir),
            test_mode=args.test, model_url=args.model_url,
        )
        log.info("Uploading artifact (%d MB) ...", zip_path.stat().st_size // 1_000_000)
        artifact_id = qdc_api.upload_file(client, str(zip_path), ArtifactType.TESTSCRIPT)
        if artifact_id is None:
            log.error("Artifact upload failed")
            return 1
        # Throttle: stay under the account's concurrent-job limit.
        wait_for_capacity(client)
        job_id = qdc_api.submit_job(
            public_api_client=client,
            target_id=target_id,
            job_name="llama.cpp Hexagon tests",
            external_job_id=None,
            job_type=JobType.AUTOMATED,
            job_mode=JobMode.APPLICATION,
            # QDC expects minutes; keep at least 1.
            timeout=max(1, args.job_timeout // 60),
            test_framework=TestFramework.APPIUM,
            entry_script=None,
            job_artifacts=[artifact_id],
            monkey_events=None,
            monkey_session_timeout=None,
            job_parameters=[JobSubmissionParameter.WIFIENABLED],
        )
        if job_id is None:
            log.error("Job submission failed")
            return 1
        log.info("Job submitted: %s (device=%s)", job_id, args.device)
        try:
            job_status = wait_for_job(client, job_id, timeout=args.job_timeout)
        except TimeoutError as e:
            log.error("%s", e)
            write_summary(JobResult(passed=False, tests={}), title=f"QDC Job Timed Out ({args.device})")
            return 1
        log.info("Job %s finished: %s", job_id, job_status)
        wait_for_log_upload(client, job_id)
        tests, raw_logs, failure_details = fetch_logs_and_parse_tests(client, job_id)
        # Pass requires the job to have COMPLETED and, when any per-test data
        # was recovered, every test to have passed.
        passed = job_status == JobState.COMPLETED.value.lower()
        if tests:
            passed = passed and all(tests.values())
        if not passed:
            log.error("Job did not complete successfully or tests failed (status=%s)", job_status)
        result = JobResult(passed=passed, tests=tests, raw_logs=raw_logs, failure_details=failure_details)
        if args.test == "backend-ops":
            title = f"Backend Ops — HTP0 ({args.device})"
        elif args.test == "all":
            title = f"QDC Tests ({args.device})"
        else:
            title = f"QDC Test Results ({args.device})"
        write_summary(result, title=title)
        return 0 if passed else 1


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,20 @@
"""Shared pytest fixtures for QDC on-device test runners."""
import os
import pytest
from appium import webdriver
from utils import options, write_qdc_log
@pytest.fixture(scope="session", autouse=True)
def driver():
    # One Appium session per test session; the QDC runner hosts the server locally.
    return webdriver.Remote(command_executor="http://127.0.0.1:4723/wd/hub", options=options)


def pytest_sessionfinish(session, exitstatus):
    # Push the JUnit XML into QDC's collected logs so the host-side job
    # (run_qdc_jobs.py) can parse per-test results after the run.
    xml_path = getattr(session.config.option, "xmlpath", None) or "results.xml"
    if os.path.exists(xml_path):
        with open(xml_path) as f:
            write_qdc_log("results.xml", f.read())

View File

@@ -0,0 +1,41 @@
"""
On-device test-backend-ops runner for llama.cpp (HTP0 backend).
Executed by QDC's Appium test framework on the QDC runner.
The runner has ADB access to the allocated device.
"""
import os
import sys
import pytest
from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log
@pytest.fixture(scope="session", autouse=True)
def install(driver):
    # Depends on the Appium `driver` fixture so a device session exists first.
    push_bundle_if_needed(f"{BIN_PATH}/test-backend-ops")


@pytest.mark.parametrize("type_a", ["mxfp4", "fp16", "q4_0"])
def test_backend_ops_htp0(type_a):
    """Run test-backend-ops MUL_MAT on the HTP0 backend for one src0 type."""
    cmd = f"{CMD_PREFIX} GGML_HEXAGON_HOSTBUF=0 GGML_HEXAGON_EXPERIMENTAL=1 {BIN_PATH}/test-backend-ops -b HTP0 -o MUL_MAT"
    if type_a == "q4_0":
        # Filter to q4_0 cases while excluding one specific shape from the run.
        cmd += r' -p "^(?=.*type_a=q4_0)(?!.*type_b=f32,m=576,n=512,k=576).*$"'
    else:
        cmd += f" -p type_a={type_a}"
    result = run_adb_command(
        cmd,
        check=False,
    )
    # Always persist the device output, pass or fail, for the host-side summary.
    write_qdc_log(f"backend_ops_{type_a}.log", result.stdout or "")
    assert result.returncode == 0, f"test-backend-ops type_a={type_a} failed (exit {result.returncode})"


if __name__ == "__main__":
    # Standalone execution outside pytest collection; mirror conftest's
    # results.xml upload so QDC still collects the report.
    ret = pytest.main(["-s", "--junitxml=results.xml", os.path.realpath(__file__)])
    if os.path.exists("results.xml"):
        with open("results.xml") as f:
            write_qdc_log("results.xml", f.read())
    sys.exit(ret)

View File

@@ -0,0 +1,76 @@
"""
On-device bench and completion test runner for llama.cpp (CPU, GPU, NPU backends).
Executed by QDC's Appium test framework on the QDC runner.
The runner has ADB access to the allocated device.
Placeholders replaced at artifact creation time by run_qdc_jobs.py:
<<MODEL_URL>> Direct URL to the GGUF model file (downloaded on-device via curl)
"""
import os
import subprocess
import sys
import pytest
from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log
MODEL_PATH = "/data/local/tmp/model.gguf"
PROMPT = "What is the capital of France?"
CLI_OPTS = "--batch-size 128 -n 128 -no-cnv --seed 42"
@pytest.fixture(scope="session", autouse=True)
def install(driver):
    """Push the llama.cpp bundle and download the model once per session."""
    push_bundle_if_needed(f"{BIN_PATH}/llama-cli")
    # Skip model download if already present
    check = subprocess.run(
        ["adb", "shell", f"ls {MODEL_PATH}"],
        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    if check.returncode != 0:
        # <<MODEL_URL>> is substituted by run_qdc_jobs.py at artifact build time.
        run_adb_command(f'curl -L -J --output {MODEL_PATH} "<<MODEL_URL>>"')


@pytest.mark.parametrize("device,extra_flags", [
    pytest.param("none", "-ctk q8_0 -ctv q8_0", id="cpu"),
    pytest.param("GPUOpenCL", "", id="gpu"),
    pytest.param("HTP0", "-ctk q8_0 -ctv q8_0", id="npu"),
])
def test_llama_completion(device, extra_flags):
    """One llama-completion generation per backend; fails on non-zero exit."""
    result = run_adb_command(
        f'{CMD_PREFIX} {BIN_PATH}/llama-completion'
        f' -m {MODEL_PATH} --device {device} -ngl 99 -t 4 {CLI_OPTS} {extra_flags} -fa on'
        f' -p "{PROMPT}"',
        check=False,
    )
    # NOTE(review): logs use the raw device name here but the friendly name
    # below for bench — intentional? confirm before unifying.
    write_qdc_log(f"llama_completion_{device}.log", result.stdout or "")
    assert result.returncode == 0, f"llama-completion {device} failed (exit {result.returncode})"


# Map --device values to short names for bench log files.
_DEVICE_LOG_NAME = {"none": "cpu", "GPUOpenCL": "gpu", "HTP0": "htp"}


@pytest.mark.parametrize("device", [
    pytest.param("none", id="cpu"),
    pytest.param("GPUOpenCL", id="gpu"),
    pytest.param("HTP0", id="npu"),
])
def test_llama_bench(device):
    """llama-bench smoke run per backend (128 prompt / 32 gen tokens)."""
    result = run_adb_command(
        f"{CMD_PREFIX} {BIN_PATH}/llama-bench"
        f" -m {MODEL_PATH} --device {device} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32",
        check=False,
    )
    write_qdc_log(f"llama_bench_{_DEVICE_LOG_NAME[device]}.log", result.stdout or "")
    assert result.returncode == 0, f"llama-bench {device} failed (exit {result.returncode})"


if __name__ == "__main__":
    # Standalone execution outside pytest collection; upload results.xml manually.
    ret = pytest.main(["-s", "--junitxml=results.xml", os.path.realpath(__file__)])
    if os.path.exists("results.xml"):
        with open("results.xml") as f:
            write_qdc_log("results.xml", f.read())
    sys.exit(ret)

View File

@@ -1,63 +0,0 @@
import pytest
import subprocess
import sys
tmp_path='/data/local/tmp'
pkg_path=f'{tmp_path}/llama.cpp'
lib_path=f'{pkg_path}/lib'
bin_path=f'{pkg_path}/bin'
model='../gguf/Llama-3.2-1B-Instruct-Q4_0.gguf'
cli_pref=f'cd {pkg_path} && LD_LIBRARY_PATH={lib_path} ADSP_LIBRARY_PATH={lib_path} {bin_path}'
def run_cmd(cmd):
p = subprocess.run(cmd, text = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
sys.stdout.write(p.stdout)
assert(p.returncode == 0)
@pytest.mark.dependency()
def test_install():
run_cmd(['adb', 'push', 'llama.cpp', f'{tmp_path}'])
run_cmd(['adb', 'shell', f'chmod 755 {bin_path}/*'])
## Basic cli tests
def run_llama_cli(dev, opts):
prompt='what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**.'
opts = '--batch-size 128 -n 128 -no-cnv --seed 42 ' + opts
run_cmd(['adb', 'shell', f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 -t 4 {opts} -p "{prompt}"'])
@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_cpu():
run_llama_cli('none', '-ctk q8_0 -ctv q8_0 -fa on')
@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_gpu():
run_llama_cli('GPUOpenCL', '-fa on')
@pytest.mark.dependency(depends=['test_install'])
def test_llama_cli_npu():
run_llama_cli('HTP0', '-ctk q8_0 -ctv q8_0 -fa on')
## Basic bench tests
def run_llama_bench(dev):
run_cmd(['adb', 'shell', f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32'])
@pytest.mark.dependency(depends=['test_install'])
def test_llama_bench_cpu():
run_llama_bench('none')
def test_llama_bench_gpu():
run_llama_bench('GPUOpenCL')
def test_llama_bench_npu():
run_llama_bench('HTP0')

View File

@@ -0,0 +1,93 @@
"""Shared helpers for QDC on-device test runners."""
import logging
import os
import subprocess
import tempfile
from appium.options.common import AppiumOptions
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# On-device paths
# ---------------------------------------------------------------------------
BUNDLE_PATH = "/data/local/tmp/llama_cpp_bundle"   # where the package is adb-pushed
QDC_LOGS_PATH = "/data/local/tmp/QDC_logs"         # QDC collects log files from here
LIB_PATH = f"{BUNDLE_PATH}/lib"
BIN_PATH = f"{BUNDLE_PATH}/bin"
# Shell prologue for every on-device command: library search paths plus
# executable bits (adb push does not preserve the +x mode).
ENV_PREFIX = (
    f"export LD_LIBRARY_PATH={LIB_PATH} && "
    f"export ADSP_LIBRARY_PATH={LIB_PATH} && "
    f"chmod +x {BIN_PATH}/* &&"
)
CMD_PREFIX = f"cd {BUNDLE_PATH} && {ENV_PREFIX}"
# ---------------------------------------------------------------------------
# Appium session options
# ---------------------------------------------------------------------------
options = AppiumOptions()
options.set_capability("automationName", "UiAutomator2")
options.set_capability("platformName", "Android")
# NOTE(review): provided by the QDC runner environment; None when unset — confirm.
options.set_capability("deviceName", os.getenv("ANDROID_DEVICE_VERSION"))
# ---------------------------------------------------------------------------
# ADB helpers
# ---------------------------------------------------------------------------
def run_adb_command(cmd: str, *, check: bool = True) -> subprocess.CompletedProcess:
    """Run `cmd` on the device via `adb shell` with reliable exit-code capture.

    Args:
        cmd: shell command line executed on the device.
        check: when True, assert that the on-device exit code is 0.

    Returns:
        CompletedProcess whose returncode is the on-device exit code (when the
        sentinel was found) and whose stdout has the sentinel line stripped.
    """
    # Append exit-code sentinel because `adb shell` doesn't reliably propagate
    # the on-device exit code (older ADB versions always return 0).
    raw = subprocess.run(
        ["adb", "shell", f"{cmd}; echo __RC__:$?"],
        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    stdout = raw.stdout
    returncode = raw.returncode
    if stdout:
        lines = stdout.rstrip("\n").split("\n")
        if lines and lines[-1].startswith("__RC__:"):
            try:
                # len("__RC__:") == 7 — slice off the sentinel prefix.
                returncode = int(lines[-1][7:])
                stdout = "\n".join(lines[:-1]) + "\n"
            except ValueError:
                # Malformed sentinel: keep adb's own return code/stdout.
                pass
    log.info("%s", stdout)
    result = subprocess.CompletedProcess(raw.args, returncode, stdout=stdout)
    if check:
        # AssertionError makes the enclosing pytest test fail with the code.
        assert returncode == 0, f"Command failed (exit {returncode})"
    return result
def write_qdc_log(filename: str, content: str) -> None:
    """Push content as a log file to QDC_LOGS_PATH on the device for QDC log collection.

    Args:
        filename: name the log file should have on the device.
        content: full text to store.
    """
    subprocess.run(
        ["adb", "shell", f"mkdir -p {QDC_LOGS_PATH}"],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    # Stage the content in a local temp file, then push it to the device.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f:
        f.write(content)
        tmp_path = f.name
    try:
        # Bug fix: push to the requested filename — previously the `filename`
        # argument was ignored and every log overwrote one fixed device file.
        subprocess.run(
            ["adb", "push", tmp_path, f"{QDC_LOGS_PATH}/{filename}"],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        )
    finally:
        os.unlink(tmp_path)
def push_bundle_if_needed(check_binary: str) -> None:
    """Push llama_cpp_bundle to the device if check_binary is not already present."""
    # `ls` on a missing path exits non-zero — a cheap on-device existence probe.
    result = subprocess.run(
        ["adb", "shell", f"ls {check_binary}"],
        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    if result.returncode != 0:
        # /qdc/appium/ is where QDC extracts the uploaded artifact on the runner.
        subprocess.run(
            ["adb", "push", "/qdc/appium/llama_cpp_bundle/", "/data/local/tmp"],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        )

View File

@@ -21,11 +21,11 @@ if ($null -ne $env:V) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

View File

@@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

View File

@@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

View File

@@ -34,11 +34,11 @@ if ($null -ne $env:SCHED) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

View File

@@ -31,11 +31,11 @@ if ($null -ne $env:SCHED) {
}
if ($null -ne $env:PROF) {
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
$env:GGML_HEXAGON_PROFILE=$env:PROF
}
if ($null -ne $env:OPMASK) {
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
if ($null -ne $env:OPSTAGE) {
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
}
if ($null -ne $env:NHVX) {

View File

@@ -59,8 +59,6 @@ struct cli_context {
std::vector<raw_buffer> input_files;
task_params defaults;
bool verbose_prompt;
int reasoning_budget = -1;
std::string reasoning_budget_message;
// thread for showing "loading" animation
std::atomic<bool> loading_show;
@@ -77,8 +75,6 @@ struct cli_context {
// defaults.return_progress = true; // TODO: show progress
verbose_prompt = params.verbose_prompt;
reasoning_budget = params.sampling.reasoning_budget_tokens;
reasoning_budget_message = params.sampling.reasoning_budget_message;
}
std::string generate_completion(result_timings & out_timings) {
@@ -106,7 +102,7 @@ struct cli_context {
const llama_vocab * vocab = llama_model_get_vocab(
llama_get_model(ctx_server.get_llama_context()));
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens;
task.params.sampling.generation_prompt = chat_params.generation_prompt;
if (!chat_params.thinking_start_tag.empty()) {
@@ -116,7 +112,7 @@ struct cli_context {
task.params.sampling.reasoning_budget_end =
common_tokenize(vocab, chat_params.thinking_end_tag, false, true);
task.params.sampling.reasoning_budget_forced =
common_tokenize(vocab, reasoning_budget_message + chat_params.thinking_end_tag, false, true);
common_tokenize(vocab, defaults.sampling.reasoning_budget_message + chat_params.thinking_end_tag, false, true);
}
rd.post_task({std::move(task)});

View File

@@ -675,6 +675,10 @@ private:
int32_t n_ctx; // total context for all clients / slots
// set to llama_model_n_swa(model)
// if swa_full is enabled, this is set to 0 to simulate a non-SWA model
int32_t n_swa;
// slots / clients
std::vector<server_slot> slots;
@@ -719,7 +723,7 @@ private:
return;
}
SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n");
slot.prompt_save(*prompt_cache);
slot.prompt_clear(false);
prompt_cache->update();
@@ -854,6 +858,8 @@ private:
}
}
n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
// Necessary similarity of prompt for slot selection
slot_prompt_similarity = params_base.slot_prompt_similarity;
@@ -996,7 +1002,7 @@ private:
params_base.cache_idle_slots = false;
} else {
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
}
}
@@ -2415,9 +2421,6 @@ private:
llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
// note: when n_swa == 0, the model does not use SWA
const auto n_swa = std::max(0, llama_model_n_swa(model));
// the largest pos_min required for a checkpoint to be useful
const auto pos_min_thold = std::max(0, pos_next - n_swa);
@@ -2589,10 +2592,10 @@ private:
// make a checkpoint of the parts of the memory that cannot be rolled back.
// checkpoints are created only if:
// - the model does not support partial sequence removal
// - the model uses SWA and we are not using `swa_full`
// - the model uses SWA (and we are not using `swa_full`)
do_checkpoint = do_checkpoint && (
(slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
(llama_model_n_swa(model) > 0 && !params_base.swa_full));
(n_swa > 0));
bool has_mtmd = false;

View File

@@ -48,7 +48,7 @@ def test_clear_and_restore():
log = LogReader(server.log_path)
# verify feature is enabled
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" in log.drain()
res = server.make_request("POST", "/completion", data={
"prompt": LONG_PROMPT,
@@ -59,7 +59,7 @@ def test_clear_and_restore():
original_prompt_n = res.body["timings"]["prompt_n"]
# Slot 0 is the only slot with KV — should NOT be cleared
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()
# Launching slot 1 clears idle slot 0
res = server.make_request("POST", "/completion", data={
@@ -68,7 +68,7 @@ def test_clear_and_restore():
"cache_prompt": True,
})
assert res.status_code == 200
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOT__" in log.drain()
# Re-send same prompt — should restore from cache-ram
res = server.make_request("POST", "/completion", data={
@@ -86,7 +86,7 @@ def test_clear_and_restore():
"cache_prompt": True,
})
assert res.status_code == 200
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()
def test_disabled_with_flag():
@@ -96,7 +96,7 @@ def test_disabled_with_flag():
log = LogReader(server.log_path)
# Feature should not be enabled
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" not in log.drain()
res = server.make_request("POST", "/completion", data={
"prompt": LONG_PROMPT,
@@ -112,4 +112,4 @@ def test_disabled_with_flag():
"cache_prompt": True,
})
assert res.status_code == 200
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()

View File

@@ -1,5 +1,5 @@
[environment]
extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests"]
extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests", "./scripts/snapdragon/qdc/tests"]
python-version = "3.10"
[rules]
@@ -13,6 +13,7 @@ exclude = [
[[overrides]]
include = [
"./tools/server/tests/**",
"./scripts/snapdragon/qdc/tests/**",
]
[overrides.rules]