mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-30 16:47:31 +03:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
017f090442 | ||
|
|
ffdd983fb8 | ||
|
|
793d0a7931 | ||
|
|
8bc492ebb4 | ||
|
|
e5f070a1dc | ||
|
|
fa0b8a70a8 | ||
|
|
5d2b52d80d | ||
|
|
187a456370 |
113
.github/workflows/build-and-test-snapdragon.yml
vendored
Normal file
113
.github/workflows/build-and-test-snapdragon.yml
vendored
Normal file
@@ -0,0 +1,113 @@
|
||||
name: CI (snapdragon)
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- '.github/workflows/build-and-test-snapdragon.yml'
|
||||
- 'ggml/include/ggml-hexagon.h'
|
||||
- 'ggml/src/ggml-hexagon/**'
|
||||
- 'docs/backend/snapdragon/**'
|
||||
- 'scripts/snapdragon/**'
|
||||
- 'CMakePresets.json'
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths:
|
||||
- '.github/workflows/build-and-test-snapdragon.yml'
|
||||
- 'ggml/include/ggml-hexagon.h'
|
||||
- 'ggml/src/ggml-hexagon/**'
|
||||
- 'docs/backend/snapdragon/**'
|
||||
- 'scripts/snapdragon/**'
|
||||
- 'CMakePresets.json'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
android-ndk-snapdragon:
|
||||
runs-on: ubuntu-latest
|
||||
container:
|
||||
image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3'
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: false
|
||||
|
||||
- name: Build Llama.CPP for Snapdragon Android
|
||||
id: build_llama_cpp_snapdragon_android
|
||||
run: |
|
||||
cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
cmake --preset arm64-android-snapdragon-release -B build
|
||||
cmake --build build
|
||||
cmake --install build --prefix pkg-adb/llama.cpp
|
||||
|
||||
- name: Upload Llama.CPP Snapdragon Android Build Artifact
|
||||
if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: llama-cpp-android-arm64-snapdragon
|
||||
path: pkg-adb/llama.cpp
|
||||
|
||||
check-secret:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
has-key: ${{ steps.check.outputs.has-key }}
|
||||
steps:
|
||||
- id: check
|
||||
run: echo "has-key=${{ secrets.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
test-snapdragon-qdc:
|
||||
name: Test on QDC Android Device (${{ matrix.device }})
|
||||
needs: [android-ndk-snapdragon, check-secret]
|
||||
if: needs.check-secret.outputs.has-key == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
device: [SM8750, SM8650, SM8850]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Download build artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: llama-cpp-android-arm64-snapdragon
|
||||
path: pkg-snapdragon/
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.x'
|
||||
cache: pip
|
||||
|
||||
- name: Install QDC SDK wheel
|
||||
run: |
|
||||
curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
|
||||
unzip qdc_sdk.zip -d qdc_sdk
|
||||
pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
|
||||
|
||||
- name: Run QDC tests (${{ matrix.device }})
|
||||
run: |
|
||||
python scripts/snapdragon/qdc/run_qdc_jobs.py \
|
||||
--test all \
|
||||
--pkg-dir pkg-snapdragon/llama.cpp \
|
||||
--model-url "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \
|
||||
--device ${{ matrix.device }}
|
||||
env:
|
||||
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip
|
||||
49
.github/workflows/build-android.yml
vendored
49
.github/workflows/build-android.yml
vendored
@@ -1,26 +1,24 @@
|
||||
name: CI (android)
|
||||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: [
|
||||
'.github/workflows/build-android.yml',
|
||||
'**/CMakeLists.txt',
|
||||
'**/.cmake',
|
||||
'**/*.h',
|
||||
'**/*.hpp',
|
||||
'**/*.c',
|
||||
'**/*.cpp'
|
||||
]
|
||||
paths:
|
||||
- '.github/workflows/build-android.yml'
|
||||
- '**/CMakeLists.txt'
|
||||
- '**/.cmake'
|
||||
- '**/*.h'
|
||||
- '**/*.hpp'
|
||||
- '**/*.c'
|
||||
- '**/*.cpp'
|
||||
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: [
|
||||
'.github/workflows/build-android.yml',
|
||||
'examples/llama.android/**'
|
||||
]
|
||||
paths:
|
||||
- '.github/workflows/build-android.yml'
|
||||
- 'examples/llama.android/**'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
@@ -67,35 +65,24 @@ jobs:
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'arm64-cpu'
|
||||
defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF'
|
||||
- build: 'arm64-snapdragon'
|
||||
defines: '--preset arm64-android-snapdragon-release'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
lfs: false
|
||||
|
||||
- name: Build Llama.CPP for Hexagon Android
|
||||
id: build_llama_cpp_hexagon_android
|
||||
- name: Build
|
||||
id: ndk_build
|
||||
run: |
|
||||
if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then
|
||||
cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
fi
|
||||
cmake ${{ matrix.defines }} -B build
|
||||
cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build
|
||||
cmake --build build
|
||||
cmake --install build --prefix pkg-adb/llama.cpp
|
||||
|
||||
- name: Upload Llama.CPP Hexagon Android Build Artifact
|
||||
if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }}
|
||||
- name: Upload Android Build Artifact
|
||||
if: ${{ always() && steps.ndk_build.outcome == 'success' }}
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: llama-cpp-android-${{ matrix.build }}
|
||||
name: llama-cpp-android-arm64-cpu
|
||||
path: pkg-adb/llama.cpp
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
#include "log.h"
|
||||
#include "value.h"
|
||||
#include "runtime.h"
|
||||
#include "caps.h"
|
||||
|
||||
@@ -249,18 +249,27 @@ build: 6a8cf8914 (6733)
|
||||
```
|
||||
|
||||
- `GGML_HEXAGON_PROFILE=1`
|
||||
Generates a host-side profile for the ggml-hexagon Ops.
|
||||
Enables Op profiling:
|
||||
|
||||
- `GGML_HEXAGON_OPMASK=0x0`
|
||||
Allows enabling specific stages of the processing pipeline:
|
||||
- `1` Basic profile with per-op `usecs` and `cycles` counters
|
||||
- `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
|
||||
- `0x1,...,0x8` Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
|
||||
|
||||
The logging output can be either saved into a file for post-processing or it can be piped directly into the post-processing tool to generate the report.
|
||||
Examples:
|
||||
|
||||
`GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
|
||||
|
||||
- `GGML_HEXAGON_OPSTAGE=0x0`
|
||||
Allows enabling specific stages of the Op processing pipeline:
|
||||
|
||||
- `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
|
||||
- `0x2` Enable Op Compute (MUL_MAT, etc.)
|
||||
|
||||
Examples:
|
||||
|
||||
`GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out
|
||||
`GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
|
||||
`GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled
|
||||
`GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default)
|
||||
|
||||
- `GGML_HEXAGON_OPFILTER=regex`
|
||||
Allows filtering (disabling) Ops that match the regex pattern:
|
||||
|
||||
@@ -12,9 +12,12 @@
|
||||
#include <cstddef>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
#include <regex>
|
||||
#include <queue>
|
||||
|
||||
#ifdef _WIN32
|
||||
# include <sal.h>
|
||||
@@ -41,18 +44,26 @@
|
||||
#include "htp_iface.h"
|
||||
#include "htp-drv.h"
|
||||
|
||||
using intvec = std::vector<int>;
|
||||
using uintvec = std::vector<unsigned int>;
|
||||
using u32vec = std::vector<uint32_t>;
|
||||
|
||||
static size_t opt_ndev = 1;
|
||||
static size_t opt_nhvx = 0; // use all
|
||||
static int opt_arch = 0; // autodetect
|
||||
static int opt_etm = 0;
|
||||
static int opt_verbose = 0;
|
||||
static int opt_profile = 0;
|
||||
static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu)
|
||||
static int opt_hostbuf = 1; // hostbuf ON by default
|
||||
static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only
|
||||
|
||||
// Default PMU events, if profiling with PMU (mode=2) is enabled
|
||||
// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html
|
||||
// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html
|
||||
static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C };
|
||||
|
||||
// Enable all stages by default
|
||||
static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE;
|
||||
static int opt_opsync = 0; // synchronous ops
|
||||
static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE;
|
||||
static int opt_opbatch = 1024; // max number of ops in a batch
|
||||
static int opt_opqueue = 16; // max number of pending batches
|
||||
static std::regex* opt_opfilter = NULL; // regex of ops to not claim
|
||||
@@ -104,19 +115,26 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct
|
||||
}
|
||||
|
||||
static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op,
|
||||
uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) {
|
||||
uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) {
|
||||
if (!opt_profile) return;
|
||||
|
||||
op_desc desc(op);
|
||||
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(),
|
||||
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs,
|
||||
op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec);
|
||||
|
||||
char pmu_str[256] = "";
|
||||
if (opt_profile > 1) {
|
||||
static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters");
|
||||
sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]",
|
||||
pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]);
|
||||
}
|
||||
|
||||
GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(),
|
||||
ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str);
|
||||
}
|
||||
|
||||
// ** backend sessions
|
||||
|
||||
struct ggml_hexagon_opbatch;
|
||||
struct ggml_hexagon_opshm;
|
||||
struct ggml_hexagon_opqueue;
|
||||
|
||||
struct ggml_hexagon_session {
|
||||
std::string name;
|
||||
@@ -132,8 +150,8 @@ struct ggml_hexagon_session {
|
||||
bool valid_iface;
|
||||
|
||||
std::atomic<int> op_pending;
|
||||
ggml_hexagon_opbatch *op_batch;
|
||||
ggml_hexagon_opshm *op_shm;
|
||||
ggml_hexagon_opbatch* op_batch;
|
||||
ggml_hexagon_opqueue* op_queue;
|
||||
|
||||
ggml_backend_buffer_type buffer_type = {};
|
||||
ggml_backend_buffer_type repack_buffer_type = {};
|
||||
@@ -1521,65 +1539,14 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf
|
||||
|
||||
// Backend session implementation
|
||||
|
||||
struct ggml_hexagon_opshm {
|
||||
ggml_hexagon_shared_buffer *sbuf;
|
||||
|
||||
std::vector<bool> block_mask;
|
||||
size_t block_size;
|
||||
|
||||
uint8_t * base() const { return this->sbuf->base; }
|
||||
int fd() const { return this->sbuf->fd; }
|
||||
size_t n_blocks() const { return this->block_mask.size(); }
|
||||
|
||||
ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) {
|
||||
size_t n_bufs = HTP_OP_MAX_BUFS;
|
||||
size_t n_ops = max_batch;
|
||||
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
|
||||
|
||||
block_mask.resize(max_pending, true);
|
||||
|
||||
block_size = sizeof(htp_buf_desc) * n_bufs +
|
||||
sizeof(htp_tensor) * n_tensors +
|
||||
sizeof(htp_op_desc) * n_ops;
|
||||
|
||||
sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */);
|
||||
|
||||
if (opt_verbose) {
|
||||
GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n",
|
||||
sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending);
|
||||
}
|
||||
}
|
||||
|
||||
~ggml_hexagon_opshm() {
|
||||
delete sbuf;
|
||||
}
|
||||
|
||||
uint8_t * allocate() {
|
||||
auto it = std::find(block_mask.begin(), block_mask.end(), true);
|
||||
if (it == block_mask.end())
|
||||
return nullptr;
|
||||
|
||||
unsigned int i = std::distance(block_mask.begin(), it);
|
||||
uint8_t* addr = sbuf->base + (i * block_size);
|
||||
block_mask[i] = false;
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
|
||||
return addr;
|
||||
}
|
||||
|
||||
void release(uint8_t * addr) {
|
||||
int i = (addr - sbuf->base) / block_size;
|
||||
block_mask[i] = true;
|
||||
HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr);
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_hexagon_opbatch {
|
||||
const char* name;
|
||||
ggml_hexagon_session* sess;
|
||||
|
||||
std::vector<htp_buf_desc> buffers;
|
||||
std::vector<htp_tensor> tensors;
|
||||
std::vector<htp_op_desc> ops;
|
||||
std::vector<const ggml_tensor*> ops; // pointers to original ops
|
||||
|
||||
std::vector<htp_buf_desc> h_bufs; // htp buffer descriptors
|
||||
std::vector<htp_tensor> h_tens; // htp tensor descriptors
|
||||
std::vector<htp_op_desc> h_ops; // htp op descriptors
|
||||
|
||||
std::unordered_map<int, int> b_map; // buffer fd to index
|
||||
std::unordered_map<const ggml_tensor*, int> t_map; // tensor ptr to index
|
||||
@@ -1606,19 +1573,21 @@ struct ggml_hexagon_opbatch {
|
||||
d_map.clear();
|
||||
}
|
||||
|
||||
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) {
|
||||
name = sess->c_name();
|
||||
ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) {
|
||||
this->sess = sess;
|
||||
|
||||
n_bufs_max = HTP_OP_MAX_BUFS;
|
||||
n_ops_max = max_batch;
|
||||
n_ops_max = batch_size;
|
||||
n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS;
|
||||
|
||||
b_vmem_max = HTP_OP_MAX_VMEM;
|
||||
|
||||
buffers.resize(n_bufs_max);
|
||||
tensors.resize(n_tens_max);
|
||||
ops.resize(n_ops_max);
|
||||
|
||||
h_bufs.resize(n_bufs_max);
|
||||
h_tens.resize(n_tens_max);
|
||||
h_ops.resize(n_ops_max);
|
||||
|
||||
b_map.reserve(n_bufs_max);
|
||||
t_map.reserve(n_tens_max);
|
||||
d_map.reserve(n_tens_max);
|
||||
@@ -1640,7 +1609,7 @@ struct ggml_hexagon_opbatch {
|
||||
|
||||
b_map.insert({sbuf->fd, bi});
|
||||
|
||||
htp_buf_desc &b = buffers[bi];
|
||||
htp_buf_desc &b = h_bufs[bi];
|
||||
b.base = (uint64_t) sbuf->base;
|
||||
b.fd = sbuf->fd;
|
||||
b.size = sbuf->size;
|
||||
@@ -1664,7 +1633,7 @@ struct ggml_hexagon_opbatch {
|
||||
// First lookup by tensor data
|
||||
auto range = d_map.equal_range(t->data);
|
||||
for (auto it = range.first; it != range.second; ++it) {
|
||||
htp_tensor * h = &tensors[it->second];
|
||||
htp_tensor * h = &h_tens[it->second];
|
||||
if (same_shape(h, t)) { return it->second; }
|
||||
}
|
||||
|
||||
@@ -1682,7 +1651,7 @@ struct ggml_hexagon_opbatch {
|
||||
uint64_t t_offset = (uint8_t *) t->data - sbuf->base;
|
||||
size_t t_size = ggml_nbytes(t);
|
||||
|
||||
htp_tensor &h = tensors[ti];
|
||||
htp_tensor &h = h_tens[ti];
|
||||
h.bi = add_buffer(sbuf);
|
||||
h.data = t_offset;
|
||||
h.size = t_size;
|
||||
@@ -1737,65 +1706,170 @@ struct ggml_hexagon_opbatch {
|
||||
// assumes that fit_op() was called first and returned true
|
||||
void add_op(htp_op_code opcode, const struct ggml_tensor * t) {
|
||||
// Add new op
|
||||
htp_op_desc &o = ops[n_ops++];
|
||||
|
||||
unsigned int n = n_ops++;
|
||||
GGML_ASSERT(n_ops <= n_ops_max);
|
||||
|
||||
ops[n] = t;
|
||||
|
||||
htp_op_desc &o = h_ops[n];
|
||||
memcpy(&o.params, &t->op_params, sizeof(t->op_params));
|
||||
o.opcode = opcode;
|
||||
o.flags = 0;
|
||||
|
||||
if (!(opt_opmask & HTP_OPMASK_COMPUTE)) {
|
||||
if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) {
|
||||
o.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
|
||||
}
|
||||
|
||||
ggml_hexagon_dump_op_exec(name, t, o.flags);
|
||||
ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags);
|
||||
|
||||
for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) {
|
||||
o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff;
|
||||
}
|
||||
o.dst = add_tensor(t);
|
||||
}
|
||||
};
|
||||
|
||||
size_t flush(uint8_t * mem_addr, size_t mem_size) {
|
||||
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
|
||||
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
|
||||
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
|
||||
struct ggml_hexagon_opqueue {
|
||||
// Shared buffer for storing batches
|
||||
ggml_hexagon_shared_buffer *shm_buf;
|
||||
size_t shm_blk_size;
|
||||
|
||||
const size_t b_size = sizeof(htp_buf_desc) * n_bufs;
|
||||
const size_t t_size = sizeof(htp_tensor) * n_tens;
|
||||
const size_t o_size = sizeof(htp_op_desc) * n_ops;
|
||||
using opvec = std::vector<const ggml_tensor*>;
|
||||
|
||||
const size_t m_size = b_size + t_size + o_size;
|
||||
GGML_ASSERT(m_size <= mem_size);
|
||||
std::queue<unsigned int> done; // completed batch ids
|
||||
std::vector<opvec> op_cache; // per batch op cache
|
||||
std::vector<uint64_t> start_usec; // per batch start time
|
||||
|
||||
uint8_t * b_ptr = (uint8_t *) mem_addr;
|
||||
uint8_t * t_ptr = (uint8_t *) b_ptr + b_size;
|
||||
uint8_t * o_ptr = (uint8_t *) t_ptr + t_size;
|
||||
ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) {
|
||||
size_t n_bufs = HTP_OP_MAX_BUFS;
|
||||
size_t n_ops = batch_size;
|
||||
size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS;
|
||||
|
||||
memcpy(b_ptr, (void *) buffers.data(), b_size);
|
||||
memcpy(t_ptr, (void *) tensors.data(), t_size);
|
||||
memcpy(o_ptr, (void *) ops.data(), o_size);
|
||||
shm_blk_size = sizeof(htp_buf_desc) * n_bufs +
|
||||
sizeof(htp_tensor) * n_tensors +
|
||||
sizeof(htp_op_desc) * n_ops +
|
||||
sizeof(htp_prof_desc) * n_ops;
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n",
|
||||
name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size);
|
||||
shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */);
|
||||
|
||||
op_cache.resize(depth);
|
||||
start_usec.resize(depth, 0);
|
||||
|
||||
// init done queue
|
||||
for (unsigned int i = 0; i < depth; i++) { done.push(i); }
|
||||
|
||||
if (opt_verbose) {
|
||||
GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n",
|
||||
sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size);
|
||||
}
|
||||
}
|
||||
|
||||
~ggml_hexagon_opqueue() {
|
||||
delete shm_buf;
|
||||
}
|
||||
|
||||
// push new batch
|
||||
bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) {
|
||||
static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8");
|
||||
static_assert(sizeof(htp_opbatch_rsp) % 8 == 0, "sizeof(htp_opbatch_rsp) must be multiple of 8");
|
||||
static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8");
|
||||
static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8");
|
||||
static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8");
|
||||
static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8");
|
||||
|
||||
if (done.empty()) { return false; }
|
||||
|
||||
req.id = done.front(); done.pop(); // batch id
|
||||
req.n_bufs = op_batch->n_bufs;
|
||||
req.n_tensors = op_batch->n_tens;
|
||||
req.n_ops = op_batch->n_ops;
|
||||
|
||||
op_cache[req.id] = op_batch->ops;
|
||||
start_usec[req.id] = ggml_time_us();
|
||||
|
||||
const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs;
|
||||
const size_t t_size = sizeof(htp_tensor) * req.n_tensors;
|
||||
const size_t o_size = sizeof(htp_op_desc) * req.n_ops;
|
||||
const size_t p_size = sizeof(htp_prof_desc) * req.n_ops;
|
||||
|
||||
dbuf.ptr = shm_buf->base + (req.id * shm_blk_size);
|
||||
dbuf.fd = shm_buf->fd;
|
||||
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
|
||||
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base;
|
||||
dbuf.size = b_size + t_size + o_size + p_size;
|
||||
|
||||
GGML_ASSERT(dbuf.size <= shm_blk_size);
|
||||
|
||||
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
|
||||
uint8_t * b_ptr = m_ptr; m_ptr += b_size;
|
||||
uint8_t * t_ptr = m_ptr; m_ptr += t_size;
|
||||
uint8_t * o_ptr = m_ptr;
|
||||
|
||||
memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size);
|
||||
memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size);
|
||||
memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n",
|
||||
shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem,
|
||||
b_size, t_size, o_size, (size_t) dbuf.size);
|
||||
|
||||
op_batch->reset();
|
||||
|
||||
if (opt_verbose > 1) {
|
||||
htp_buf_desc *b = (htp_buf_desc*) b_ptr;
|
||||
for (unsigned int i=0; i < n_bufs; i++) {
|
||||
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i,
|
||||
for (unsigned int i=0; i < req.n_bufs; i++) {
|
||||
GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i,
|
||||
b[i].fd, (void *) b[i].base, (size_t) b[i].size);
|
||||
}
|
||||
htp_tensor *t = (htp_tensor*) t_ptr;
|
||||
for (unsigned int i=0; i < n_tens; i++) {
|
||||
for (unsigned int i=0; i < req.n_tensors; i++) {
|
||||
GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n",
|
||||
name, i, t[i].bi, t[i].data, t[i].size,
|
||||
shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size,
|
||||
(size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]);
|
||||
}
|
||||
}
|
||||
|
||||
reset();
|
||||
return true;
|
||||
}
|
||||
|
||||
return m_size;
|
||||
void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) {
|
||||
GGML_ASSERT(rsp.id < op_cache.size());
|
||||
|
||||
done.push(rsp.id);
|
||||
|
||||
const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs;
|
||||
const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors;
|
||||
const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops;
|
||||
const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops;
|
||||
|
||||
const size_t m_size = b_size + t_size + o_size + p_size;
|
||||
GGML_ASSERT(m_size <= shm_blk_size);
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n",
|
||||
shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops,
|
||||
(size_t) dbuf.size, b_size, t_size, o_size);
|
||||
|
||||
uint8_t * m_ptr = (uint8_t*) dbuf.ptr;
|
||||
uint8_t * p_ptr = m_ptr + (b_size + t_size + o_size);
|
||||
|
||||
if (opt_profile && rsp.n_ops > 0) {
|
||||
auto & ops = op_cache[rsp.id];
|
||||
|
||||
uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id];
|
||||
uint32_t htp_usec = 0;
|
||||
|
||||
GGML_ASSERT(rsp.n_ops <= ops.size());
|
||||
|
||||
const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr;
|
||||
for (uint32_t i = 0; i < rsp.n_ops; i++) {
|
||||
htp_usec += pd[i].usecs;
|
||||
ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu);
|
||||
}
|
||||
|
||||
GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n",
|
||||
shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -1824,17 +1898,12 @@ void ggml_hexagon_session::flush_pending(bool all) {
|
||||
GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs);
|
||||
}
|
||||
|
||||
op_shm->release((uint8_t*) dbuf.ptr);
|
||||
|
||||
if (rsp.status != HTP_STATUS_OK) {
|
||||
GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status));
|
||||
// TODO: handle errors
|
||||
}
|
||||
|
||||
// FIXME: profile will be per opreq
|
||||
// this->prof_usecs = rsp.prof_usecs;
|
||||
// this->prof_cycles = rsp.prof_cycles;
|
||||
// this->prof_pkts = rsp.prof_pkts;
|
||||
op_queue->pop(rsp, dbuf);
|
||||
|
||||
this->op_pending--; // atomic dec
|
||||
|
||||
@@ -1845,28 +1914,17 @@ void ggml_hexagon_session::flush_pending(bool all) {
|
||||
void ggml_hexagon_session::flush_batch() {
|
||||
if (op_batch->empty()) { return; }
|
||||
|
||||
htp_opbatch_req req;
|
||||
req.n_bufs = op_batch->n_bufs;
|
||||
req.n_tensors = op_batch->n_tens;
|
||||
req.n_ops = op_batch->n_ops;
|
||||
htp_opbatch_req req {};
|
||||
dspqueue_buffer dbuf{};
|
||||
|
||||
dspqueue_buffer dbuf;
|
||||
dbuf.fd = op_shm->fd();
|
||||
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
|
||||
dbuf.ptr = op_shm->allocate();
|
||||
if (!dbuf.ptr) {
|
||||
if (!op_queue->push(req, dbuf, op_batch)) {
|
||||
flush_pending(false);
|
||||
dbuf.ptr = op_shm->allocate();
|
||||
op_queue->push(req, dbuf, op_batch);
|
||||
}
|
||||
|
||||
dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base();
|
||||
dbuf.size = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size);
|
||||
|
||||
// Bump pending flag (cleared in the session::flush once we get the response)
|
||||
this->op_pending++; // atomic inc
|
||||
|
||||
HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size);
|
||||
|
||||
int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT);
|
||||
if (err != 0) {
|
||||
GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err);
|
||||
@@ -2016,25 +2074,33 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
|
||||
}
|
||||
|
||||
if (opt_etm) {
|
||||
err = htp_iface_enable_etm(this->handle);
|
||||
err = htp_iface_etm(this->handle, 1);
|
||||
if (err != 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err);
|
||||
}
|
||||
}
|
||||
|
||||
// Start the DSP-side service. We need to pass the queue ID to the
|
||||
// DSP in a FastRPC call; the DSP side will import the queue and start
|
||||
// listening for packets in a callback.
|
||||
if (opt_profile) {
|
||||
htp_iface_pmu_conf pmu_conf{};
|
||||
std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events);
|
||||
|
||||
err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf);
|
||||
if (err != 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err);
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate buffers and state for op batching
|
||||
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
|
||||
this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue);
|
||||
|
||||
// Start processing op batch requests
|
||||
err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx);
|
||||
if (err != 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err);
|
||||
throw std::runtime_error("ggml-hex: iface start failed (see log for details)");
|
||||
}
|
||||
this->valid_iface = true;
|
||||
|
||||
// Allocate buffers and state for op batching
|
||||
this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch);
|
||||
this->op_shm = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue);
|
||||
}
|
||||
|
||||
void ggml_hexagon_session::release() noexcept(true) {
|
||||
@@ -2043,7 +2109,7 @@ void ggml_hexagon_session::release() noexcept(true) {
|
||||
int err;
|
||||
|
||||
delete this->op_batch;
|
||||
delete this->op_shm;
|
||||
delete this->op_queue;
|
||||
|
||||
// Stop the DSP-side service and close the queue
|
||||
if (this->valid_iface) {
|
||||
@@ -2054,12 +2120,20 @@ void ggml_hexagon_session::release() noexcept(true) {
|
||||
}
|
||||
|
||||
if (opt_etm) {
|
||||
err = htp_iface_disable_etm(this->handle);
|
||||
err = htp_iface_etm(this->handle, 0);
|
||||
if (err != 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err);
|
||||
}
|
||||
}
|
||||
|
||||
if (opt_profile) {
|
||||
htp_iface_pmu_conf pmu_conf{};
|
||||
err = htp_iface_profiler(this->handle, 0, &pmu_conf);
|
||||
if (err != 0) {
|
||||
GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err);
|
||||
}
|
||||
}
|
||||
|
||||
if (this->valid_queue) {
|
||||
err = dspqueue_close(queue);
|
||||
if (err != 0) {
|
||||
@@ -2077,7 +2151,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n
|
||||
repack_buffer_type.device = dev;
|
||||
|
||||
op_batch = nullptr;
|
||||
op_shm = nullptr;
|
||||
op_queue = nullptr;
|
||||
|
||||
try {
|
||||
allocate(dev_id);
|
||||
@@ -2619,6 +2693,39 @@ static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0]; // A
|
||||
const struct ggml_tensor * src1 = op->src[1]; // B
|
||||
const struct ggml_tensor * dst = op; // X
|
||||
|
||||
if (!src0 || !src1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src0->ne[0] != src0->ne[1]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src0->ne[1] != src1->ne[1]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
|
||||
auto sess = static_cast<ggml_hexagon_session *>(backend->context);
|
||||
return sess->c_name();
|
||||
@@ -2657,7 +2764,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
|
||||
case GGML_OP_CUMSUM: return HTP_OP_CUMSUM;
|
||||
case GGML_OP_FILL: return HTP_OP_FILL;
|
||||
case GGML_OP_DIAG: return HTP_OP_DIAG;
|
||||
|
||||
case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(t)) {
|
||||
case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU;
|
||||
@@ -2698,7 +2805,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
|
||||
for (int i = 0; i < graph->n_nodes; ++i) {
|
||||
ggml_tensor * n = graph->nodes[i];
|
||||
if (op_is_compute(n)) {
|
||||
if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) {
|
||||
sess->enqueue_op(op_remap_to_htp(n), n);
|
||||
}
|
||||
}
|
||||
@@ -3203,6 +3310,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_diag(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
supp = ggml_hexagon_supported_solve_tri(sess, op);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@@ -3338,6 +3449,26 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons
|
||||
return NULL;
|
||||
}
|
||||
|
||||
template<typename T> std::vector<T> str_to_vec(const char* str) {
|
||||
std::stringstream ss(str);
|
||||
std::vector<T> v;
|
||||
std::string t;
|
||||
|
||||
while (std::getline(ss, t, ',')) {
|
||||
v.push_back(std::stoul(t, nullptr, 0));
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
template<typename T, int BASE=10> std::string vec_to_str(std::vector<T> v) {
|
||||
std::stringstream ss;
|
||||
ss << std::setbase(BASE) << std::showbase;
|
||||
for (auto i : v) { ss << i << ','; }
|
||||
auto str = ss.str(); str.pop_back(); // drop last comma
|
||||
return str;
|
||||
}
|
||||
|
||||
static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
||||
// Basic sanity checks to make sure definitions match
|
||||
static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0,
|
||||
@@ -3351,8 +3482,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
||||
|
||||
const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE");
|
||||
const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF");
|
||||
const char * str_opmask = getenv("GGML_HEXAGON_OPMASK");
|
||||
const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC");
|
||||
const char * str_opstage = getenv("GGML_HEXAGON_OPSTAGE");
|
||||
const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH");
|
||||
const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE");
|
||||
const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER");
|
||||
@@ -3365,19 +3495,30 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) {
|
||||
|
||||
auto RE_ICASE = std::regex_constants::icase;
|
||||
|
||||
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
|
||||
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
|
||||
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
|
||||
opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask;
|
||||
opt_opsync = str_opsync ? atoi(str_opsync) : opt_opsync;
|
||||
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
|
||||
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
|
||||
opt_profile = str_profile ? atoi(str_profile) : 0;
|
||||
opt_etm = str_etm ? atoi(str_etm) : 0;
|
||||
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
|
||||
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
|
||||
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
|
||||
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
|
||||
opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL;
|
||||
opt_verbose = str_verbose ? atoi(str_verbose) : 0;
|
||||
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
|
||||
opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage;
|
||||
opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch;
|
||||
opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue;
|
||||
opt_etm = str_etm ? atoi(str_etm) : 0;
|
||||
opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx;
|
||||
opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx;
|
||||
opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev;
|
||||
opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf;
|
||||
|
||||
if (str_profile) {
|
||||
opt_pmu_evt = [&]() -> std::vector<uint32_t> {
|
||||
auto v = str_to_vec<uint32_t>(str_profile);
|
||||
switch (v.size()) {
|
||||
case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events
|
||||
case 8: opt_profile = 2; return v; // mode with custom pmu events
|
||||
default: opt_profile = 0; return {}; // garbage input
|
||||
}}();
|
||||
if (opt_profile == 1) opt_pmu_evt = {};
|
||||
GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile,
|
||||
vec_to_str<uint32_t, 16>(opt_pmu_evt).c_str());
|
||||
}
|
||||
|
||||
if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) {
|
||||
opt_ndev = GGML_HEXAGON_MAX_SESSIONS;
|
||||
|
||||
@@ -36,6 +36,7 @@ add_library(${HTP_LIB} SHARED
|
||||
cumsum-ops.c
|
||||
fill-ops.c
|
||||
diag-ops.c
|
||||
solve-tri-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <qurt_memory.h>
|
||||
#include <qurt.h>
|
||||
|
||||
#include "hexagon_types.h"
|
||||
#include "hexagon_protos.h"
|
||||
@@ -100,4 +101,31 @@ static inline void hex_pause() {
|
||||
asm volatile(" pause(#255)\n");
|
||||
}
|
||||
|
||||
#ifndef HEX_NUM_PMU_COUNTERS
|
||||
#define HEX_NUM_PMU_COUNTERS 8
|
||||
#endif
|
||||
|
||||
static inline void hex_get_pmu(uint32_t counters[]) {
|
||||
#if __HVX_ARCH__ >= 79
|
||||
asm volatile("%0 = upmucnt0" : "=r"(counters[0]));
|
||||
asm volatile("%0 = upmucnt1" : "=r"(counters[1]));
|
||||
asm volatile("%0 = upmucnt2" : "=r"(counters[2]));
|
||||
asm volatile("%0 = upmucnt3" : "=r"(counters[3]));
|
||||
asm volatile("%0 = upmucnt4" : "=r"(counters[4]));
|
||||
asm volatile("%0 = upmucnt5" : "=r"(counters[5]));
|
||||
asm volatile("%0 = upmucnt6" : "=r"(counters[6]));
|
||||
asm volatile("%0 = upmucnt7" : "=r"(counters[7]));
|
||||
#else
|
||||
counters[0] = qurt_pmu_get(QURT_PMUCNT0);
|
||||
counters[1] = qurt_pmu_get(QURT_PMUCNT1);
|
||||
counters[2] = qurt_pmu_get(QURT_PMUCNT2);
|
||||
counters[3] = qurt_pmu_get(QURT_PMUCNT3);
|
||||
counters[4] = qurt_pmu_get(QURT_PMUCNT4);
|
||||
counters[5] = qurt_pmu_get(QURT_PMUCNT5);
|
||||
counters[6] = qurt_pmu_get(QURT_PMUCNT6);
|
||||
counters[7] = qurt_pmu_get(QURT_PMUCNT7);
|
||||
// qurt_pmu_get_pmucnt(counters);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* HEX_UTILS_H */
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <dspqueue.h>
|
||||
#include <stdatomic.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#define HTP_MAX_NTHREADS 10
|
||||
#define HTP_MAX_MMAPS 16
|
||||
@@ -66,7 +67,9 @@ struct htp_context {
|
||||
int thread_id;
|
||||
int thread_prio;
|
||||
|
||||
int hmx_enabled;
|
||||
bool hmx_enabled;
|
||||
bool etm;
|
||||
uint32_t profiler;
|
||||
|
||||
uint8_t * vtcm_base;
|
||||
size_t vtcm_size;
|
||||
@@ -100,5 +103,6 @@ int op_ssm_conv(struct htp_ops_context * octx);
|
||||
int op_cumsum(struct htp_ops_context * octx);
|
||||
int op_fill(struct htp_ops_context * octx);
|
||||
int op_diag(struct htp_ops_context * octx);
|
||||
int op_solve_tri(struct htp_ops_context * octx);
|
||||
|
||||
#endif /* HTP_CTX_H */
|
||||
|
||||
@@ -42,9 +42,9 @@ enum htp_data_type {
|
||||
|
||||
// Mask to enable various stages of the Ops.
|
||||
// Used for debugging and profiling.
|
||||
enum htp_op_mask {
|
||||
HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP)
|
||||
HTP_OPMASK_COMPUTE = (1 << 1), // Enable Compute
|
||||
enum htp_op_stage {
|
||||
HTP_OPSTAGE_QUEUE = (1 << 0), // Enable Queueing (ie calls into NPU)
|
||||
HTP_OPSTAGE_COMPUTE = (1 << 1), // Enable Compute
|
||||
};
|
||||
|
||||
// Do not reorder first 4 (used as an index)
|
||||
@@ -82,7 +82,7 @@ enum htp_op_code {
|
||||
HTP_OP_CUMSUM,
|
||||
HTP_OP_FILL,
|
||||
HTP_OP_DIAG,
|
||||
|
||||
HTP_OP_SOLVE_TRI,
|
||||
HTP_OP_INVALID
|
||||
};
|
||||
|
||||
@@ -137,27 +137,45 @@ struct htp_op_desc {
|
||||
int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm
|
||||
uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices
|
||||
uint16_t dst; // Output tensor index
|
||||
};
|
||||
|
||||
// the rest is filled in-place by the NPU
|
||||
uint32_t prof_usecs; // Number of usec per request
|
||||
uint32_t prof_cycles; // Number of cycles per request
|
||||
uint32_t prof_pkts; // Number of instruction packets per request
|
||||
uint32_t unused;
|
||||
enum htp_profiler_mode {
|
||||
HTP_PROF_DISABLED = 0,
|
||||
HTP_PROF_BASIC = 1,
|
||||
HTP_PROF_PMU = 2,
|
||||
};
|
||||
|
||||
#define HTP_PROF_PMU_NCNT 8
|
||||
|
||||
// Profile descriptor
|
||||
struct htp_prof_desc {
|
||||
uint32_t opcode; // GGML/HTP Op
|
||||
uint32_t usecs; // Number of usec
|
||||
uint32_t cycles; // Number of cycles
|
||||
uint32_t pad; // Unused
|
||||
uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters
|
||||
};
|
||||
|
||||
struct htp_opbatch_req {
|
||||
uint32_t id; // Batch id
|
||||
uint32_t n_bufs; // Number of buffers
|
||||
uint32_t n_tensors; // Number of tensors
|
||||
uint32_t n_ops; // Number of ops
|
||||
uint32_t flags; // unused
|
||||
uint32_t pad; // unused
|
||||
// struct htp_buf_desc bufs[]; -- dspqueue buf 0
|
||||
// struct htp_tensor tensors[]; -- dspqueue buf 0
|
||||
// struct htp_op_desc ops[]; -- dspqueue buf 0
|
||||
};
|
||||
|
||||
struct htp_opbatch_rsp {
|
||||
uint32_t id; // Batch id
|
||||
uint32_t status; // HTP_STATUS_...
|
||||
// struct htp_op_req ops[]; -- dspqueue buf 0
|
||||
uint32_t n_bufs; // Number of buffers
|
||||
uint32_t n_tensors; // Number of tensors
|
||||
uint32_t n_ops; // Number of op profile descriptors
|
||||
uint32_t pad; // unused
|
||||
// struct htp_prof_desc profs[]; -- dspqueue buf 0
|
||||
};
|
||||
|
||||
#endif /* HTP_OPS_H */
|
||||
|
||||
@@ -6,13 +6,17 @@
|
||||
#include "AEEStdDef.idl"
|
||||
#include "remote.idl"
|
||||
|
||||
struct htp_iface_pmu_conf {
|
||||
uint32 events[8];
|
||||
};
|
||||
|
||||
interface htp_iface : remote_handle64 {
|
||||
AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx);
|
||||
AEEResult stop();
|
||||
AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned);
|
||||
AEEResult munmap(in uint32 fd);
|
||||
AEEResult enable_etm();
|
||||
AEEResult disable_etm();
|
||||
AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu);
|
||||
AEEResult etm(in uint32 enable);
|
||||
};
|
||||
|
||||
#endif /* HTP_IDL */
|
||||
|
||||
@@ -256,6 +256,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
|
||||
return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b));
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b));
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline HVX_Vector hvx_vec_add_f16_f16(HVX_Vector a, HVX_Vector b)
|
||||
@@ -273,6 +285,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b)
|
||||
return Q6_Vhf_vmpy_VhfVhf(a, b);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vsf_vadd_VsfVsf(a, b);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vsf_vsub_VsfVsf(a, b);
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vsf_vmpy_VsfVsf(a, b);
|
||||
}
|
||||
|
||||
#endif // __HVX_ARCH__ < 79
|
||||
|
||||
#endif /* HVX_BASE_H */
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "htp-ops.h"
|
||||
#include "htp_iface.h"
|
||||
#include "worker-pool.h"
|
||||
|
||||
AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
@@ -103,6 +104,54 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) {
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) {
|
||||
int err = enable ? HAP_user_etm_enable() : HAP_user_etm_disable();
|
||||
if (err) {
|
||||
if (err == AEE_EVERSIONNOTSUPPORT) {
|
||||
FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
|
||||
} else {
|
||||
FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
|
||||
struct htp_context * ctx = (struct htp_context *) handle;
|
||||
if (!ctx) {
|
||||
return AEE_EBADPARM;
|
||||
}
|
||||
|
||||
if (mode == HTP_PROF_PMU) {
|
||||
const uint32_t* events = pmu_conf->events;
|
||||
|
||||
// Pack 4 event IDs (low 8 bits) into each 32-bit config register
|
||||
uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
|
||||
for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
|
||||
evtcfg |= ((events[i + 0] & 0xFF) << (i * 8));
|
||||
evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));
|
||||
}
|
||||
|
||||
// For events >255 pack high 2 bits of all 8 event IDs into cfg register
|
||||
// 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
|
||||
for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
|
||||
cfg |= (((events[i] >> 8) & 3) << (i * 2));
|
||||
}
|
||||
|
||||
FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);
|
||||
|
||||
// Configure PMU registers
|
||||
qurt_pmu_set(QURT_PMUCFG, cfg);
|
||||
qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);
|
||||
qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
|
||||
qurt_pmu_enable(1);
|
||||
}
|
||||
|
||||
ctx->profiler = mode;
|
||||
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
AEEResult htp_iface_close(remote_handle64 handle) {
|
||||
struct htp_context * ctx = (struct htp_context *) handle;
|
||||
|
||||
@@ -129,35 +178,19 @@ AEEResult htp_iface_close(remote_handle64 handle) {
|
||||
}
|
||||
}
|
||||
|
||||
if (ctx->profiler) {
|
||||
qurt_pmu_enable(1);
|
||||
}
|
||||
|
||||
if (ctx->etm) {
|
||||
HAP_user_etm_disable();
|
||||
}
|
||||
|
||||
free(ctx);
|
||||
return AEE_SUCCESS;
|
||||
}
|
||||
|
||||
AEEResult htp_iface_enable_etm(remote_handle64 handle) {
|
||||
int err = HAP_user_etm_enable();
|
||||
if (err) {
|
||||
if (err == AEE_EVERSIONNOTSUPPORT) {
|
||||
FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
|
||||
} else {
|
||||
FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
AEEResult htp_iface_disable_etm(remote_handle64 handle) {
|
||||
int err = HAP_user_etm_disable();
|
||||
if (err) {
|
||||
if (err == AEE_EVERSIONNOTSUPPORT) {
|
||||
FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
|
||||
} else {
|
||||
FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) {
|
||||
AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) {
|
||||
struct htp_context * ctx = (struct htp_context *) handle;
|
||||
if (!ctx) {
|
||||
return AEE_EBADPARM;
|
||||
@@ -204,7 +237,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t
|
||||
return AEE_ENOMEMORY;
|
||||
}
|
||||
|
||||
AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
|
||||
AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
|
||||
struct htp_context * ctx = (struct htp_context *) handle;
|
||||
if (!ctx) {
|
||||
return AEE_EBADPARM;
|
||||
@@ -434,19 +467,39 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
|
||||
struct profile_data {
|
||||
uint64_t usecs;
|
||||
uint64_t cycles;
|
||||
uint64_t pkts;
|
||||
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
|
||||
};
|
||||
|
||||
static inline void profile_start(struct profile_data * d) {
|
||||
d->usecs = HAP_perf_get_qtimer_count();
|
||||
d->cycles = hex_get_cycles();
|
||||
d->pkts = hex_get_pktcnt();
|
||||
static inline void profile_start(uint32_t mode, struct profile_data * d) {
|
||||
switch (mode) {
|
||||
case HTP_PROF_PMU:
|
||||
hex_get_pmu(d->pmu_counters);
|
||||
// fallthrough
|
||||
case HTP_PROF_BASIC:
|
||||
d->usecs = HAP_perf_get_qtimer_count();
|
||||
d->cycles = hex_get_cycles();
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void profile_stop(struct profile_data * d) {
|
||||
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
|
||||
d->cycles = hex_get_cycles() - d->cycles;
|
||||
d->pkts = hex_get_pktcnt() - d->pkts;
|
||||
static inline void profile_stop(uint32_t mode, struct profile_data * d) {
|
||||
uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
|
||||
switch (mode) {
|
||||
case HTP_PROF_PMU:
|
||||
hex_get_pmu(pmu_counters);
|
||||
for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
|
||||
d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i];
|
||||
}
|
||||
// fallthrough
|
||||
case HTP_PROF_BASIC:
|
||||
d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs);
|
||||
d->cycles = hex_get_cycles() - d->cycles;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int execute_op(struct htp_ops_context * octx) {
|
||||
@@ -520,6 +573,9 @@ static int execute_op(struct htp_ops_context * octx) {
|
||||
case HTP_OP_DIAG:
|
||||
return op_diag(octx);
|
||||
|
||||
case HTP_OP_SOLVE_TRI:
|
||||
return op_solve_tri(octx);
|
||||
|
||||
case HTP_OP_INVALID:
|
||||
break;
|
||||
|
||||
@@ -726,30 +782,33 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Reset poll count for valid requests
|
||||
poll_count = DSPQUEUE_POLL_COUNT;
|
||||
|
||||
const uint32_t n_bufs = req.n_bufs;
|
||||
const uint32_t n_tens = req.n_tensors;
|
||||
const uint32_t n_ops = req.n_ops;
|
||||
|
||||
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
|
||||
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
|
||||
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
|
||||
const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs;
|
||||
const uint32_t t_size = sizeof(struct htp_tensor) * n_tens;
|
||||
const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops;
|
||||
const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops;
|
||||
|
||||
if (dbuf.size < b_size + t_size + o_size) {
|
||||
if (dbuf.size < b_size + t_size + o_size + p_size) {
|
||||
FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size);
|
||||
break;
|
||||
}
|
||||
|
||||
// Reset poll count for valid requests
|
||||
poll_count = DSPQUEUE_POLL_COUNT;
|
||||
|
||||
uint8_t * m_ptr = dbuf.ptr;
|
||||
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
|
||||
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
|
||||
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr;
|
||||
|
||||
FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u",
|
||||
FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id,
|
||||
n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size);
|
||||
|
||||
// Setup descriptor pointers
|
||||
uint8_t * m_ptr = dbuf.ptr;
|
||||
struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size;
|
||||
struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size;
|
||||
struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size;
|
||||
struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr;
|
||||
|
||||
prep_op_bufs(ctx, bufs, n_bufs);
|
||||
prep_tensors(ctx, bufs, tens, n_tens);
|
||||
|
||||
@@ -760,22 +819,34 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
|
||||
for (uint32_t i=0; i < n_ops; i++) {
|
||||
struct profile_data prof;
|
||||
profile_start(&prof);
|
||||
|
||||
profile_start(ctx->profiler, &prof);
|
||||
|
||||
proc_op_req(octx, tens, i, &ops[i]);
|
||||
|
||||
profile_stop(&prof);
|
||||
ops[i].prof_usecs = prof.usecs;
|
||||
ops[i].prof_cycles = prof.cycles;
|
||||
ops[i].prof_pkts = prof.pkts;
|
||||
profile_stop(ctx->profiler, &prof);
|
||||
|
||||
if (ctx->profiler) {
|
||||
pds[i].opcode = ops[i].opcode;
|
||||
pds[i].usecs = prof.usecs;
|
||||
pds[i].cycles = prof.cycles;
|
||||
for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) {
|
||||
pds[i].pmu[j] = prof.pmu_counters[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0);
|
||||
|
||||
struct htp_opbatch_rsp rsp;
|
||||
rsp.status = HTP_STATUS_OK; // FIXME
|
||||
rsp.id = req.id;
|
||||
rsp.status = HTP_STATUS_OK;
|
||||
rsp.n_bufs = n_bufs;
|
||||
rsp.n_tensors = n_tens;
|
||||
rsp.n_ops = n_ops;
|
||||
|
||||
dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT;
|
||||
|
||||
err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE);
|
||||
if (err != 0) {
|
||||
FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err);
|
||||
|
||||
@@ -3017,6 +3017,10 @@ int op_matmul(struct htp_ops_context * octx) {
|
||||
const int act_stride = (int)(src1->nb[1] / sizeof(float));
|
||||
const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16));
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
if (src0->type == HTP_TYPE_F16) {
|
||||
if (is_batched) {
|
||||
hmx_matmul_w16a32_batched_params_t batch_params = {
|
||||
|
||||
267
ggml/src/ggml-hexagon/htp/solve-tri-ops.c
Normal file
267
ggml/src/ggml-hexagon/htp/solve-tri-ops.c
Normal file
@@ -0,0 +1,267 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_perf.h>
|
||||
#include <string.h>
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hvx-types.h"
|
||||
#include "hvx-utils.h"
|
||||
|
||||
struct htp_solve_tri_context {
|
||||
struct htp_ops_context * octx;
|
||||
uint32_t jobs_per_thread;
|
||||
uint32_t total_jobs;
|
||||
uint32_t k_chunks;
|
||||
uint32_t col_block;
|
||||
};
|
||||
|
||||
static inline void solve_tri_row_scalar(const float * A_row,
|
||||
const float * B_row,
|
||||
float * X,
|
||||
uint32_t row,
|
||||
uint32_t k,
|
||||
uint32_t col0,
|
||||
uint32_t coln,
|
||||
float inv_diag) {
|
||||
for (uint32_t col = col0; col < col0 + coln; ++col) {
|
||||
float sum = 0.0f;
|
||||
for (uint32_t t = 0; t < row; ++t) {
|
||||
sum += A_row[t] * X[t * k + col];
|
||||
}
|
||||
X[row * k + col] = (B_row[col] - sum) * inv_diag;
|
||||
}
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) {
|
||||
HVX_Vector v = *((const HVX_UVector *) src);
|
||||
HVX_VectorPred mask = Q6_Q_vsetq2_R(n * sizeof(float));
|
||||
return Q6_V_vmux_QVV(mask, v, Q6_V_vzero());
|
||||
}
|
||||
|
||||
static inline void solve_tri_row_hvx(const float * A_row,
|
||||
const float * B_row,
|
||||
float * X,
|
||||
uint32_t row,
|
||||
uint32_t k,
|
||||
uint32_t col0,
|
||||
uint32_t coln,
|
||||
float inv_diag) {
|
||||
const bool full = (coln == VLEN_FP32);
|
||||
|
||||
HVX_Vector sum_v = Q6_V_vzero();
|
||||
for (uint32_t t = 0; t < row; ++t) {
|
||||
const float a = A_row[t];
|
||||
const float * x_row_col = X + t * k + col0;
|
||||
|
||||
HVX_Vector x_v = full ? *((const HVX_UVector *) x_row_col) : hvx_load_partial_f32(x_row_col, coln);
|
||||
HVX_Vector a_v = hvx_vec_splat_f32(a);
|
||||
sum_v = hvx_vec_add_f32_f32(sum_v, hvx_vec_mul_f32_f32(x_v, a_v));
|
||||
}
|
||||
|
||||
const float * b_row_col = B_row + col0;
|
||||
float * x_out_col = X + row * k + col0;
|
||||
|
||||
HVX_Vector b_v = full ? *((const HVX_UVector *) b_row_col) : hvx_load_partial_f32(b_row_col, coln);
|
||||
HVX_Vector inv_diag_v = hvx_vec_splat_f32(inv_diag);
|
||||
|
||||
HVX_Vector out_v = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(b_v, sum_v), inv_diag_v);
|
||||
hvx_vec_store_u((void *) x_out_col, coln * sizeof(float), out_v);
|
||||
}
|
||||
|
||||
// Batch-level thread: each job is one full batch.
|
||||
static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
|
||||
struct htp_ops_context * octx = sctx->octx;
|
||||
|
||||
const struct htp_tensor * src0 = octx->src[0]; // A
|
||||
const struct htp_tensor * src1 = octx->src[1]; // B
|
||||
const struct htp_tensor * dst = octx->dst; // X
|
||||
|
||||
const uint32_t n = src0->ne[0];
|
||||
const uint32_t k = src1->ne[0];
|
||||
|
||||
const uint32_t ne02 = src0->ne[2];
|
||||
|
||||
const uint32_t col_block = VLEN_FP32;
|
||||
const uint32_t k_full = (k / col_block) * col_block;
|
||||
|
||||
const uint32_t start_batch = sctx->jobs_per_thread * ith;
|
||||
const uint32_t end_batch = MIN(start_batch + sctx->jobs_per_thread, sctx->total_jobs);
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
for (uint32_t batch = start_batch; batch < end_batch; ++batch) {
|
||||
const uint32_t i03 = batch / ne02;
|
||||
const uint32_t i02 = batch - i03 * ne02;
|
||||
|
||||
const float * A_batch =
|
||||
(const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
|
||||
const float * B_batch =
|
||||
(const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
|
||||
float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);
|
||||
|
||||
for (uint32_t row = 0; row < n; ++row) {
|
||||
const float diag = A_batch[row * n + row];
|
||||
const float inv_diag = 1.0f / diag;
|
||||
const float * A_row = A_batch + row * n;
|
||||
const float * B_row = B_batch + row * k;
|
||||
|
||||
uint32_t col0 = 0;
|
||||
for (; col0 < k_full; col0 += col_block) {
|
||||
solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, col_block, inv_diag);
|
||||
}
|
||||
|
||||
if (col0 < k) {
|
||||
const uint32_t coln = k - col0;
|
||||
if (coln >= 8) {
|
||||
solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
|
||||
} else {
|
||||
solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "solve-tri-batch %d/%d: A=(%ux%u) B=(%ux%u) batch %u:%u usec %u\n",
|
||||
ith, nth, n, n, k, n, start_batch, end_batch,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// Chunk-level thread: each job is one (batch, col_chunk) pair.
|
||||
static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data;
|
||||
struct htp_ops_context * octx = sctx->octx;
|
||||
|
||||
const struct htp_tensor * src0 = octx->src[0]; // A
|
||||
const struct htp_tensor * src1 = octx->src[1]; // B
|
||||
const struct htp_tensor * dst = octx->dst; // X
|
||||
|
||||
const uint32_t n = src0->ne[0];
|
||||
const uint32_t k = src1->ne[0];
|
||||
|
||||
const uint32_t ne02 = src0->ne[2];
|
||||
|
||||
const uint32_t start_job = sctx->jobs_per_thread * ith;
|
||||
const uint32_t end_job = MIN(start_job + sctx->jobs_per_thread, sctx->total_jobs);
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
for (uint32_t job = start_job; job < end_job; ++job) {
|
||||
const uint32_t batch = job / sctx->k_chunks;
|
||||
const uint32_t chunk = job - batch * sctx->k_chunks;
|
||||
|
||||
const uint32_t i03 = batch / ne02;
|
||||
const uint32_t i02 = batch - i03 * ne02;
|
||||
|
||||
const uint32_t col0 = chunk * sctx->col_block;
|
||||
const uint32_t coln = MIN(sctx->col_block, k - col0);
|
||||
|
||||
const float * A_batch =
|
||||
(const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]);
|
||||
const float * B_batch =
|
||||
(const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]);
|
||||
float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]);
|
||||
|
||||
const bool use_hvx = (coln >= 8);
|
||||
|
||||
for (uint32_t row = 0; row < n; ++row) {
|
||||
const float diag = A_batch[row * n + row];
|
||||
const float inv_diag = 1.0f / diag;
|
||||
|
||||
const float * A_row = A_batch + row * n;
|
||||
const float * B_row = B_batch + row * k;
|
||||
|
||||
if (use_hvx) {
|
||||
solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
|
||||
} else {
|
||||
solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "solve-tri-chunk %d/%d: A=(%ux%u) B=(%ux%u) job %u:%u usec %u\n",
|
||||
ith, nth, n, n, k, n, start_job, end_job,
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
int op_solve_tri(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * src0 = octx->src[0]; // A
|
||||
const struct htp_tensor * src1 = octx->src[1]; // B
|
||||
const struct htp_tensor * dst = octx->dst; // X
|
||||
|
||||
if (src0->type != HTP_TYPE_F32 || src1->type != HTP_TYPE_F32 || dst->type != HTP_TYPE_F32) {
|
||||
return HTP_STATUS_NO_SUPPORT;
|
||||
}
|
||||
|
||||
// left=true, lower=true, uni=false only
|
||||
if (src0->ne[0] != src0->ne[1]) {
|
||||
return HTP_STATUS_INVAL_PARAMS;
|
||||
}
|
||||
if (src0->ne[1] != src1->ne[1]) {
|
||||
return HTP_STATUS_INVAL_PARAMS;
|
||||
}
|
||||
if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
|
||||
return HTP_STATUS_INVAL_PARAMS;
|
||||
}
|
||||
if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] ||
|
||||
dst->ne[3] != src1->ne[3]) {
|
||||
return HTP_STATUS_INVAL_PARAMS;
|
||||
}
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
const uint32_t k = src1->ne[0];
|
||||
|
||||
const uint32_t col_block = VLEN_FP32;
|
||||
const uint32_t k_chunks = (k + col_block - 1) / col_block;
|
||||
const uint32_t total_batches = src0->ne[2] * src0->ne[3];
|
||||
const bool batched = total_batches >= (uint32_t) octx->n_threads;
|
||||
|
||||
FARF(HIGH, "solve-tri: (%ux%ux%ux%u) x (%ux%ux%ux%u) -> (%ux%ux%ux%u) : batched %d\n",
|
||||
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
|
||||
src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], batched);
|
||||
|
||||
if (batched) {
|
||||
// Batch-level parallelism
|
||||
const uint32_t n_threads = MIN((uint32_t) octx->n_threads, total_batches);
|
||||
|
||||
struct htp_solve_tri_context sctx = {
|
||||
.octx = octx,
|
||||
.jobs_per_thread = (total_batches + n_threads - 1) / n_threads,
|
||||
.total_jobs = total_batches,
|
||||
.k_chunks = k_chunks,
|
||||
.col_block = col_block,
|
||||
};
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, solve_tri_batch_thread_f32, &sctx, n_threads);
|
||||
} else {
|
||||
// Chunk-level parallelism
|
||||
const uint32_t total_jobs = total_batches * k_chunks;
|
||||
const uint32_t n_threads = MIN((uint32_t) octx->n_threads, MAX(total_jobs, 1));
|
||||
|
||||
struct htp_solve_tri_context sctx = {
|
||||
.octx = octx,
|
||||
.jobs_per_thread = (total_jobs + n_threads - 1) / n_threads,
|
||||
.total_jobs = total_jobs,
|
||||
.k_chunks = k_chunks,
|
||||
.col_block = col_block,
|
||||
};
|
||||
|
||||
worker_pool_run_func(octx->ctx->worker_pool, solve_tri_chunk_thread_f32, &sctx, n_threads);
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
@@ -197,11 +197,12 @@ struct ggml_webgpu_row_norm_pipeline_key_hash {
|
||||
/** RMS_NORM + MUL **/
|
||||
|
||||
struct ggml_webgpu_rms_norm_mul_pipeline_key {
|
||||
bool inplace;
|
||||
bool src_overlap;
|
||||
bool inplace; // rn_src == dst
|
||||
bool overlap; // mul_src == dst
|
||||
bool src_overlap; // rn_src == mul_src
|
||||
|
||||
bool operator==(const ggml_webgpu_rms_norm_mul_pipeline_key & other) const {
|
||||
return inplace == other.inplace && src_overlap == other.src_overlap;
|
||||
return inplace == other.inplace && overlap == other.overlap && src_overlap == other.src_overlap;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -209,6 +210,7 @@ struct ggml_webgpu_rms_norm_mul_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_rms_norm_mul_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.inplace);
|
||||
ggml_webgpu_hash_combine(seed, key.overlap);
|
||||
ggml_webgpu_hash_combine(seed, key.src_overlap);
|
||||
return seed;
|
||||
}
|
||||
@@ -556,7 +558,7 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_
|
||||
const size_t q_tile = context.sg_mat_m;
|
||||
const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
|
||||
2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
|
||||
size_t bytes_per_kv = 0;
|
||||
size_t bytes_per_kv = 0;
|
||||
if (!key.kv_direct) {
|
||||
bytes_per_kv += std::max(key.head_dim_qk, key.head_dim_v);
|
||||
}
|
||||
@@ -1878,6 +1880,7 @@ class ggml_webgpu_shader_lib {
|
||||
webgpu_pipeline get_rms_norm_mul_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_rms_norm_mul_pipeline_key key = {};
|
||||
key.inplace = context.inplace;
|
||||
key.overlap = context.overlap;
|
||||
key.src_overlap = context.src_overlap;
|
||||
|
||||
auto it = rms_norm_mul_pipelines.find(key);
|
||||
@@ -1892,6 +1895,9 @@ class ggml_webgpu_shader_lib {
|
||||
if (key.inplace) {
|
||||
defines.push_back("INPLACE");
|
||||
variant += "_inplace";
|
||||
} else if (key.overlap) {
|
||||
defines.push_back("OVERLAP");
|
||||
variant += "_overlap";
|
||||
} else if (key.src_overlap) {
|
||||
defines.push_back("SRC_OVERLAP");
|
||||
variant += "_src_overlap";
|
||||
|
||||
@@ -2071,8 +2071,9 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context
|
||||
GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1");
|
||||
}
|
||||
|
||||
bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
|
||||
bool overlap = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) ||
|
||||
(ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst));
|
||||
bool inplace = ggml_webgpu_tensor_equal(rn_src, dst);
|
||||
bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src);
|
||||
|
||||
uint32_t offset_merged_rn_src = 0;
|
||||
@@ -2116,7 +2117,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context
|
||||
|
||||
std::vector<wgpu::BindGroupEntry> entries;
|
||||
|
||||
if (inplace) {
|
||||
if (inplace || overlap) {
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src));
|
||||
entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src));
|
||||
} else if (src_overlap) {
|
||||
@@ -2136,6 +2137,7 @@ static std::optional<webgpu_encoded_op> ggml_webgpu_rms_norm_mul(webgpu_context
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {};
|
||||
shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
|
||||
shader_lib_ctx.inplace = inplace;
|
||||
shader_lib_ctx.overlap = overlap;
|
||||
shader_lib_ctx.src_overlap = src_overlap;
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_rms_norm_mul_pipeline(shader_lib_ctx);
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#ifdef INPLACE
|
||||
#ifdef OVERLAP
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> rn_src: array<f32>;
|
||||
@@ -13,6 +13,21 @@ fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32)
|
||||
mul_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
|
||||
}
|
||||
|
||||
#elif INPLACE
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> rn_src: array<f32>;
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<storage, read_write> mul_src: array<f32>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
|
||||
rn_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
|
||||
}
|
||||
|
||||
#elif SRC_OVERLAP
|
||||
|
||||
@group(0) @binding(0)
|
||||
|
||||
@@ -23,10 +23,10 @@ verbose=
|
||||
[ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v"
|
||||
|
||||
profile=
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
|
||||
|
||||
opmask=
|
||||
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
|
||||
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
|
||||
|
||||
nhvx=
|
||||
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
|
||||
|
||||
@@ -28,10 +28,10 @@ sched=
|
||||
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
|
||||
|
||||
profile=
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
|
||||
|
||||
opmask=
|
||||
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
|
||||
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
|
||||
|
||||
nhvx=
|
||||
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
|
||||
|
||||
@@ -28,10 +28,10 @@ sched=
|
||||
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
|
||||
|
||||
profile=
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v"
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v"
|
||||
|
||||
opmask=
|
||||
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
|
||||
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
|
||||
|
||||
nhvx=
|
||||
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
|
||||
|
||||
@@ -37,10 +37,10 @@ sched=
|
||||
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
|
||||
|
||||
profile=
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
|
||||
|
||||
opmask=
|
||||
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
|
||||
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
|
||||
|
||||
nhvx=
|
||||
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
|
||||
|
||||
@@ -25,10 +25,10 @@ sched=
|
||||
[ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
|
||||
|
||||
profile=
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
|
||||
[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
|
||||
|
||||
opmask=
|
||||
[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
|
||||
[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
|
||||
|
||||
nhvx=
|
||||
[ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
|
||||
|
||||
188
scripts/snapdragon/ggml-hexagon-profile.py
Executable file
188
scripts/snapdragon/ggml-hexagon-profile.py
Executable file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import argparse
|
||||
import statistics
|
||||
import logging
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
# Mapping of cli-friendly names to (internal_data_key, Display Header, numeric_sort_key)
|
||||
COL_MAP = {
|
||||
"op": ("op", "Op", "op"),
|
||||
"dims": ("dims", "Dims", "dims"),
|
||||
"dtypes": ("dtypes", "DTypes", "dtypes"),
|
||||
"count": ("count", "Count", "_sort_count"),
|
||||
"max-usec": ("max_usec", "Max usec", "_sort_max_usec"),
|
||||
"avg-usec": ("avg_usec", "Avg usec", "_sort_avg_usec"),
|
||||
"max-cycles": ("max_cycles", "Max Cycles", "_sort_max_cycles"),
|
||||
"avg-cycles": ("avg_cycles", "Avg Cycles", "_sort_avg_cycles"),
|
||||
"max-pmu": ("max_pmu", "Max PMU", "_sort_max_pmu"),
|
||||
"avg-pmu": ("avg_pmu", "Avg PMU", "_sort_avg_pmu"),
|
||||
}
|
||||
|
||||
op_pattern = re.compile(
|
||||
r"profile-op\s+(?P<op_name>[A-Z_0-9]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
|
||||
)
|
||||
|
||||
logger = logging.getLogger("ggml-hexagon-profile")
|
||||
|
||||
|
||||
def parse_log(file_path, pmu_index=None):
|
||||
try:
|
||||
if file_path != "-":
|
||||
f = open(file_path, 'r', encoding='utf-8', errors='ignore')
|
||||
else:
|
||||
f = os.fdopen(0, 'r', encoding='utf-8', errors='ignore')
|
||||
except FileNotFoundError:
|
||||
logger.error(f"file '{file_path}' not found.")
|
||||
sys.exit(1)
|
||||
|
||||
all_ops = []
|
||||
for line in f:
|
||||
match = op_pattern.search(line)
|
||||
if not match: continue
|
||||
|
||||
pmu_raw = match.group('pmu')
|
||||
pmu_val = None
|
||||
if pmu_raw and pmu_index is not None:
|
||||
try:
|
||||
pmu_list = [int(x.strip()) for x in pmu_raw.split(',')]
|
||||
if len(pmu_list) > pmu_index:
|
||||
pmu_val = pmu_list[pmu_index]
|
||||
except (ValueError, IndexError):
|
||||
pmu_val = None
|
||||
|
||||
all_ops.append({
|
||||
'name': match.group('op_name'),
|
||||
'dims': match.group('dims').strip(),
|
||||
'types': match.group('types').strip(),
|
||||
'usec': int(match.group('usec')),
|
||||
'cycles': int(match.group('cycles')),
|
||||
'pmu_val': pmu_val
|
||||
})
|
||||
|
||||
f.close()
|
||||
|
||||
return all_ops
|
||||
|
||||
|
||||
def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None):
|
||||
if not ops:
|
||||
logger.info("No valid records found.")
|
||||
return
|
||||
|
||||
grouped = defaultdict(list)
|
||||
for op in ops:
|
||||
key = (op['name'], op['dims'], op['types'])
|
||||
grouped[key].append(op)
|
||||
|
||||
group_stats = []
|
||||
for (name, dims, types), group_ops in grouped.items():
|
||||
usecs = [o['usec'] for o in group_ops]
|
||||
cycles = [o['cycles'] for o in group_ops]
|
||||
pmu_vals = [o['pmu_val'] for o in group_ops if o['pmu_val'] is not None]
|
||||
|
||||
group_stats.append({
|
||||
'op': name,
|
||||
'dims': dims,
|
||||
'dtypes': types,
|
||||
'count': str(len(group_ops)),
|
||||
'max_usec': str(max(usecs)),
|
||||
'avg_usec': f"{statistics.mean(usecs):.2f}",
|
||||
'max_cycles': str(max(cycles)),
|
||||
'avg_cycles': f"{statistics.mean(cycles):.2f}",
|
||||
'max_pmu': str(max(pmu_vals)) if pmu_vals else "0",
|
||||
'avg_pmu': f"{statistics.mean(pmu_vals):.2f}" if pmu_vals else "0.00",
|
||||
# Numeric values for accurate sorting
|
||||
'_sort_count': len(group_ops),
|
||||
'_sort_max_usec': max(usecs),
|
||||
'_sort_avg_usec': statistics.mean(usecs),
|
||||
'_sort_max_cycles': max(cycles),
|
||||
'_sort_avg_cycles': statistics.mean(cycles),
|
||||
'_sort_max_pmu': max(pmu_vals) if pmu_vals else 0,
|
||||
'_sort_avg_pmu': statistics.mean(pmu_vals) if pmu_vals else 0
|
||||
})
|
||||
|
||||
# Sorting logic
|
||||
actual_sort_key = COL_MAP[sort_col][2]
|
||||
# We sort numeric fields descending, strings (op/dims) ascending
|
||||
is_numeric = actual_sort_key.startswith("_") or actual_sort_key == "count"
|
||||
sorted_groups = sorted(group_stats, key=lambda x: x[actual_sort_key], reverse=is_numeric)[:top_n]
|
||||
|
||||
# Define initial column order
|
||||
active_cols = ["op", "dims", "dtypes"]
|
||||
if pmu_name:
|
||||
active_cols += ["max-pmu", "avg-pmu"]
|
||||
active_cols += ["max-usec", "avg-usec", "max-cycles", "avg-cycles", "count"]
|
||||
|
||||
final_headers, final_keys, final_widths = [], [], []
|
||||
|
||||
for col_name in active_cols:
|
||||
data_key, header_text, _ = COL_MAP[col_name]
|
||||
if "pmu" in col_name and pmu_name:
|
||||
header_text = header_text.replace("PMU", pmu_name)
|
||||
|
||||
natural_width = max([len(row[data_key]) for row in sorted_groups] + [len(header_text)])
|
||||
target_width = width_overrides.get(col_name, natural_width)
|
||||
|
||||
if target_width == 0:
|
||||
continue
|
||||
|
||||
final_headers.append(header_text)
|
||||
final_keys.append(data_key)
|
||||
final_widths.append(target_width)
|
||||
|
||||
# Print Report
|
||||
logger.info(f"\n# Profile Report (Top {top_n} Ops sorted by {sort_col})\n")
|
||||
header_line = "| " + " | ".join(f"{h:<{final_widths[i]}}" for i, h in enumerate(final_headers)) + " |"
|
||||
sep_line = "| " + " | ".join("-" * final_widths[i] for i in range(len(final_headers))) + " |"
|
||||
logger.info(header_line)
|
||||
logger.info(sep_line)
|
||||
|
||||
for group in sorted_groups:
|
||||
row_vals = []
|
||||
for i, key in enumerate(final_keys):
|
||||
val = group[key]
|
||||
if len(val) > final_widths[i]:
|
||||
val = val[:final_widths[i] - 3] + "..."
|
||||
row_vals.append(f"{val:<{final_widths[i]}}")
|
||||
logger.info("| " + " | ".join(row_vals) + " |")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Post-process Op profile info.")
|
||||
parser.add_argument("logfile")
|
||||
parser.add_argument("-n", "--top", type=int, default=100)
|
||||
parser.add_argument("--sort", type=str, default="max-usec", choices=list(COL_MAP.keys()))
|
||||
parser.add_argument("--pmu-index", type=int)
|
||||
parser.add_argument("--pmu-name", type=str)
|
||||
parser.add_argument("--width", action='append', default=['dims:40'], help="Override column width, e.g. --width dims:50")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(message)s')
|
||||
|
||||
# Sort validation: can't sort by PMU if index isn't provided
|
||||
if "pmu" in args.sort and args.pmu_index is None:
|
||||
logger.error(f"Cannot sort by '{args.sort}' without --pmu-index.")
|
||||
sys.exit(1)
|
||||
|
||||
overrides = {}
|
||||
if args.width:
|
||||
for w in args.width:
|
||||
try:
|
||||
name, val = w.split(':')
|
||||
overrides[name.lower()] = int(val)
|
||||
except ValueError:
|
||||
logger.warning(f"Invalid width format '{w}'")
|
||||
|
||||
final_pmu_name = (args.pmu_name or f"#{args.pmu_index}") if args.pmu_index is not None else None
|
||||
ops = parse_log(args.logfile, pmu_index=args.pmu_index)
|
||||
generate_report(ops, args.top, overrides, args.sort, pmu_name=final_pmu_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1 +0,0 @@
|
||||
This directory includes pytest based scripts for running CI jobs on Qualcomm Device Cloud (QDC).
|
||||
@@ -8,12 +8,9 @@ iniconfig==2.1.0
|
||||
outcome==1.3.0.post0
|
||||
packaging==25.0
|
||||
pluggy==1.6.0
|
||||
Pygments==2.19.2
|
||||
PySocks==1.7.1
|
||||
pytest==8.4.2
|
||||
pytest-dependency==0.6.0
|
||||
selenium==4.36.0
|
||||
setuptools==80.9.0
|
||||
sniffio==1.3.1
|
||||
sortedcontainers==2.4.0
|
||||
tomli==2.3.0
|
||||
|
||||
401
scripts/snapdragon/qdc/run_qdc_jobs.py
Normal file
401
scripts/snapdragon/qdc/run_qdc_jobs.py
Normal file
@@ -0,0 +1,401 @@
|
||||
"""Run llama.cpp Hexagon Android tests in a single QDC Appium job.
|
||||
|
||||
Bundles test scripts into one artifact and submits a single QDC job:
|
||||
|
||||
1. run_bench_tests_posix.py — llama-cli and llama-bench on CPU / GPU / NPU
|
||||
(from scripts/snapdragon/qdc/)
|
||||
|
||||
Results are written to $GITHUB_STEP_SUMMARY when set (GitHub Actions).
|
||||
|
||||
Prerequisites:
|
||||
pip install /path/to/qualcomm_device_cloud_sdk*.whl
|
||||
|
||||
Required environment variables:
|
||||
QDC_API_KEY API key from QDC UI -> Users -> Settings -> API Keys
|
||||
|
||||
Usage:
|
||||
python run_qdc_jobs.py \\
|
||||
--pkg-dir pkg-snapdragon/llama.cpp \\
|
||||
--model-url https://.../Llama-3.2-1B-Instruct-Q4_0.gguf \\
|
||||
--device SM8750
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import xml.etree.ElementTree as ET
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
from qualcomm_device_cloud_sdk.api import qdc_api # ty: ignore[unresolved-import]
|
||||
from qualcomm_device_cloud_sdk.logging import configure_logging # ty: ignore[unresolved-import]
|
||||
from qualcomm_device_cloud_sdk.models import ArtifactType, JobMode, JobState, JobSubmissionParameter, JobType, TestFramework # ty: ignore[unresolved-import]
|
||||
|
||||
configure_logging(level=logging.INFO, handlers=[logging.StreamHandler()])
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
POLL_INTERVAL = 30
|
||||
JOB_TIMEOUT = 3600
|
||||
LOG_UPLOAD_TIMEOUT = 600
|
||||
CAPACITY_TIMEOUT = 1800
|
||||
CAPACITY_POLL = 60
|
||||
MAX_CONCURRENT_JOBS = 5
|
||||
TERMINAL_STATES = {JobState.COMPLETED, JobState.CANCELED}
|
||||
NON_TERMINAL_STATES = {JobState.DISPATCHED, JobState.RUNNING, JobState.SETUP, JobState.SUBMITTED}
|
||||
|
||||
_SCRIPTS_DIR = Path(__file__).parent
|
||||
_TESTS_DIR = _SCRIPTS_DIR / "tests"
|
||||
_RUN_BENCH = _TESTS_DIR / "run_bench_tests_posix.py"
|
||||
_RUN_BACKEND_OPS = _TESTS_DIR / "run_backend_ops_posix.py"
|
||||
_UTILS = _TESTS_DIR / "utils.py"
|
||||
_CONFTEST = _TESTS_DIR / "conftest.py"
|
||||
_REQUIREMENTS = _SCRIPTS_DIR / "requirements.txt"
|
||||
|
||||
_PYTEST_LINE_RE = re.compile(
|
||||
r"(?:[\w/]+\.py::)?(?:\w+::)?([\w\[\].-]+)\s+(PASSED|FAILED|ERROR|SKIPPED)"
|
||||
)
|
||||
_EXCLUDED_LOGS = {"qdc_android_whole_host-000.log", "qdc_kernel_host-000.log"}
|
||||
_NON_TERMINAL_STATE_VALUES = {s.value for s in NON_TERMINAL_STATES}
|
||||
|
||||
|
||||
@dataclass
|
||||
class JobResult:
|
||||
passed: bool
|
||||
tests: dict[str, bool] = field(default_factory=dict)
|
||||
raw_logs: dict[str, str] = field(default_factory=dict)
|
||||
failure_details: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
def build_artifact_zip(
|
||||
pkg_dir: Path,
|
||||
stage_dir: Path,
|
||||
*,
|
||||
test_mode: str = "bench",
|
||||
model_url: str | None = None,
|
||||
) -> Path:
|
||||
"""Bundle everything into a single QDC artifact zip.
|
||||
|
||||
Zip structure (extracted by QDC to /qdc/appium/ on the runner):
|
||||
llama_cpp_bundle/ installed package (adb pushed to /data/local/tmp/)
|
||||
tests/
|
||||
utils.py shared helpers (paths, run_adb_command, …)
|
||||
conftest.py shared pytest fixtures (driver)
|
||||
test_bench_posix.py bench + cli tests (<<MODEL_URL>> substituted)
|
||||
AND/OR
|
||||
test_backend_ops_posix.py test-backend-ops -b HTP0
|
||||
requirements.txt
|
||||
"""
|
||||
shutil.copytree(pkg_dir, stage_dir / "llama_cpp_bundle")
|
||||
|
||||
tests_dir = stage_dir / "tests"
|
||||
tests_dir.mkdir()
|
||||
|
||||
shutil.copy(_UTILS, tests_dir / "utils.py")
|
||||
shutil.copy(_CONFTEST, tests_dir / "conftest.py")
|
||||
|
||||
if test_mode in ("bench", "all"):
|
||||
assert model_url is not None, "--model-url is required for bench/all test modes"
|
||||
(tests_dir / "test_bench_posix.py").write_text(
|
||||
_RUN_BENCH.read_text().replace("<<MODEL_URL>>", model_url)
|
||||
)
|
||||
if test_mode in ("backend-ops", "all"):
|
||||
shutil.copy(_RUN_BACKEND_OPS, tests_dir / "test_backend_ops_posix.py")
|
||||
|
||||
shutil.copy(_REQUIREMENTS, stage_dir / "requirements.txt")
|
||||
(stage_dir / "pytest.ini").write_text("[pytest]\naddopts = --junitxml=results.xml\n")
|
||||
|
||||
zip_base = str(stage_dir / "artifact")
|
||||
shutil.make_archive(zip_base, "zip", stage_dir)
|
||||
return Path(f"{zip_base}.zip")
|
||||
|
||||
|
||||
def wait_for_job(client, job_id: str, timeout: int) -> str:
|
||||
elapsed = 0
|
||||
while elapsed < timeout:
|
||||
raw = qdc_api.get_job_status(client, job_id)
|
||||
try:
|
||||
status = JobState(raw)
|
||||
except ValueError:
|
||||
status = raw
|
||||
if status in TERMINAL_STATES:
|
||||
return raw.lower()
|
||||
log.info("Job %s: %s", job_id, raw)
|
||||
time.sleep(POLL_INTERVAL)
|
||||
elapsed += POLL_INTERVAL
|
||||
raise TimeoutError(f"Job {job_id} did not finish within {timeout}s")
|
||||
|
||||
|
||||
def wait_for_log_upload(client, job_id: str) -> None:
|
||||
elapsed = 0
|
||||
while elapsed <= LOG_UPLOAD_TIMEOUT:
|
||||
status = (qdc_api.get_job_log_upload_status(client, job_id) or "").lower()
|
||||
if status in {"completed", "failed"}:
|
||||
return
|
||||
log.info("Waiting for log upload (status=%s) ...", status)
|
||||
time.sleep(POLL_INTERVAL)
|
||||
elapsed += POLL_INTERVAL
|
||||
log.warning("Timed out waiting for log upload after %ds", LOG_UPLOAD_TIMEOUT)
|
||||
|
||||
|
||||
def wait_for_capacity(client, max_jobs: int = MAX_CONCURRENT_JOBS) -> None:
|
||||
"""Block until the user's active (non-terminal) QDC job count is below max_jobs."""
|
||||
elapsed = 0
|
||||
while elapsed < CAPACITY_TIMEOUT:
|
||||
jobs_page = qdc_api.get_jobs_list(client, page_number=0, page_size=50)
|
||||
if jobs_page is None:
|
||||
log.warning("Could not retrieve job list; proceeding without capacity check")
|
||||
return
|
||||
items = getattr(jobs_page, "data", []) or []
|
||||
active = sum(1 for j in items if getattr(j, "state", None) in _NON_TERMINAL_STATE_VALUES)
|
||||
if active < max_jobs:
|
||||
log.info("Active QDC jobs: %d / %d — proceeding", active, max_jobs)
|
||||
return
|
||||
log.info("Active QDC jobs: %d / %d — waiting %ds ...", active, max_jobs, CAPACITY_POLL)
|
||||
time.sleep(CAPACITY_POLL)
|
||||
elapsed += CAPACITY_POLL
|
||||
log.warning("Capacity wait timed out after %ds; proceeding anyway", CAPACITY_TIMEOUT)
|
||||
|
||||
|
||||
def _parse_junit_xml(content: str) -> tuple[dict[str, bool], dict[str, str]]:
|
||||
try:
|
||||
root = ET.fromstring(content)
|
||||
except ET.ParseError:
|
||||
return {}, {}
|
||||
results: dict[str, bool] = {}
|
||||
failures: dict[str, str] = {}
|
||||
for tc in root.iter("testcase"):
|
||||
name = tc.get("name", "")
|
||||
if classname := tc.get("classname", ""):
|
||||
name = f"{classname}.{name}"
|
||||
failure_el = tc.find("failure")
|
||||
if failure_el is None:
|
||||
failure_el = tc.find("error")
|
||||
results[name] = failure_el is None
|
||||
if failure_el is not None:
|
||||
parts = [failure_el.get("message", ""), failure_el.text or ""]
|
||||
failures[name] = "\n".join(p for p in parts if p).strip()
|
||||
return results, failures
|
||||
|
||||
|
||||
def _parse_pytest_output(content: str) -> dict[str, bool]:
|
||||
results: dict[str, bool] = {}
|
||||
for m in _PYTEST_LINE_RE.finditer(content):
|
||||
results[m.group(1)] = m.group(2) == "PASSED"
|
||||
return results
|
||||
|
||||
|
||||
def fetch_logs_and_parse_tests(
|
||||
client, job_id: str
|
||||
) -> tuple[dict[str, bool], dict[str, str], dict[str, str]]:
|
||||
"""Returns (test_results, raw_logs, failure_details)."""
|
||||
log_files = qdc_api.get_job_log_files(client, job_id)
|
||||
if not log_files:
|
||||
log.warning("No log files returned for job %s", job_id)
|
||||
return {}, {}, {}
|
||||
|
||||
test_results: dict[str, bool] = {}
|
||||
pytest_fallback: dict[str, bool] = {}
|
||||
raw_logs: dict[str, str] = {}
|
||||
failure_details: dict[str, str] = {}
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
for lf in log_files:
|
||||
log.info("Downloading log file: %s", lf.filename)
|
||||
zip_path = os.path.join(tmpdir, "log.zip")
|
||||
qdc_api.download_job_log_files(client, lf.filename, zip_path)
|
||||
try:
|
||||
shutil.unpack_archive(zip_path, tmpdir, "zip")
|
||||
except Exception as e:
|
||||
log.warning("Could not unpack %s as zip: %s", lf.filename, e)
|
||||
|
||||
for root_dir, _, files in os.walk(tmpdir):
|
||||
for fname in sorted(files):
|
||||
fpath = os.path.join(root_dir, fname)
|
||||
content = Path(fpath).read_text(errors="replace")
|
||||
if fname.endswith(".xml"):
|
||||
results, failures = _parse_junit_xml(content)
|
||||
test_results.update(results)
|
||||
failure_details.update(failures)
|
||||
elif fname.endswith(".log"):
|
||||
if fname in _EXCLUDED_LOGS:
|
||||
continue
|
||||
log.info("--- %s ---", fname)
|
||||
log.info("%s", content)
|
||||
raw_logs[fname] = content
|
||||
pytest_fallback.update(_parse_pytest_output(content))
|
||||
|
||||
return (test_results if test_results else pytest_fallback), raw_logs, failure_details
|
||||
|
||||
|
||||
def write_summary(result: JobResult, title: str = "QDC Test Results") -> None:
|
||||
summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
|
||||
if not summary_path:
|
||||
return
|
||||
|
||||
icon = "✅" if result.passed else "❌"
|
||||
|
||||
lines = [
|
||||
f"## {title}\n",
|
||||
f"Overall: {icon} {'PASSED' if result.passed else 'FAILED'}\n",
|
||||
]
|
||||
reportable = {n: ok for n, ok in result.tests.items() if "test_install" not in n}
|
||||
if reportable:
|
||||
lines += ["| Test | Result |", "| ---- | ------ |"]
|
||||
for name, ok in reportable.items():
|
||||
lines.append(f"| `{name}` | {'✅' if ok else '❌'} |")
|
||||
passed_n = sum(1 for v in reportable.values() if v)
|
||||
failed_n = sum(1 for v in reportable.values() if not v)
|
||||
lines += ["", f"**{passed_n} passed, {failed_n} failed**"]
|
||||
else:
|
||||
lines.append("_No per-test data available._")
|
||||
|
||||
failed_names = [n for n, ok in reportable.items() if not ok]
|
||||
if failed_names:
|
||||
lines += ["", "### Failures"]
|
||||
for name in failed_names:
|
||||
detail = result.failure_details.get(name)
|
||||
if detail:
|
||||
lines += [
|
||||
f"<details><summary><code>{name}</code></summary>",
|
||||
"",
|
||||
"```",
|
||||
detail,
|
||||
"```",
|
||||
"",
|
||||
"</details>",
|
||||
]
|
||||
|
||||
if result.raw_logs:
|
||||
lines += ["", "### Raw Logs"]
|
||||
for fname, content in sorted(result.raw_logs.items()):
|
||||
lines += [
|
||||
f"<details><summary>{fname}</summary>",
|
||||
"",
|
||||
"```",
|
||||
content.rstrip(),
|
||||
"```",
|
||||
"",
|
||||
"</details>",
|
||||
]
|
||||
|
||||
with open(summary_path, "a") as f:
|
||||
f.write("\n".join(lines) + "\n")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser(
|
||||
description=__doc__,
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
p.add_argument("--pkg-dir", required=True, type=Path,
|
||||
help="Installed llama.cpp package directory (contains bin/ and lib/)")
|
||||
p.add_argument("--model-url",
|
||||
help="Direct URL to the GGUF model file (required for --test bench)")
|
||||
p.add_argument("--device", required=True,
|
||||
help="QDC chipset name, e.g. SM8750")
|
||||
p.add_argument("--test", choices=["bench", "backend-ops", "all"], default="bench",
|
||||
help="Test suite to run (default: bench)")
|
||||
p.add_argument("--job-timeout", type=int, default=JOB_TIMEOUT, metavar="SECONDS",
|
||||
help=f"Max seconds to wait for job completion (default: {JOB_TIMEOUT})")
|
||||
args = p.parse_args()
|
||||
if args.test in ("bench", "all") and not args.model_url:
|
||||
p.error("--model-url is required when --test bench or --test all")
|
||||
return args
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
|
||||
api_key = os.environ.get("QDC_API_KEY")
|
||||
if not api_key:
|
||||
log.error("QDC_API_KEY environment variable must be set")
|
||||
return 1
|
||||
if not args.pkg_dir.is_dir():
|
||||
log.error("--pkg-dir %s does not exist", args.pkg_dir)
|
||||
return 1
|
||||
|
||||
client = qdc_api.get_public_api_client_using_api_key(
|
||||
api_key_header=api_key,
|
||||
app_name_header="llama-cpp-ci",
|
||||
on_behalf_of_header="llama-cpp-ci",
|
||||
client_type_header="Python",
|
||||
)
|
||||
|
||||
target_id = qdc_api.get_target_id(client, args.device)
|
||||
if target_id is None:
|
||||
log.error("Could not find QDC target for device %r", args.device)
|
||||
return 1
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
log.info("Building artifact ...")
|
||||
zip_path = build_artifact_zip(
|
||||
args.pkg_dir, Path(tmpdir),
|
||||
test_mode=args.test, model_url=args.model_url,
|
||||
)
|
||||
log.info("Uploading artifact (%d MB) ...", zip_path.stat().st_size // 1_000_000)
|
||||
artifact_id = qdc_api.upload_file(client, str(zip_path), ArtifactType.TESTSCRIPT)
|
||||
|
||||
if artifact_id is None:
|
||||
log.error("Artifact upload failed")
|
||||
return 1
|
||||
|
||||
wait_for_capacity(client)
|
||||
|
||||
job_id = qdc_api.submit_job(
|
||||
public_api_client=client,
|
||||
target_id=target_id,
|
||||
job_name="llama.cpp Hexagon tests",
|
||||
external_job_id=None,
|
||||
job_type=JobType.AUTOMATED,
|
||||
job_mode=JobMode.APPLICATION,
|
||||
timeout=max(1, args.job_timeout // 60),
|
||||
test_framework=TestFramework.APPIUM,
|
||||
entry_script=None,
|
||||
job_artifacts=[artifact_id],
|
||||
monkey_events=None,
|
||||
monkey_session_timeout=None,
|
||||
job_parameters=[JobSubmissionParameter.WIFIENABLED],
|
||||
)
|
||||
if job_id is None:
|
||||
log.error("Job submission failed")
|
||||
return 1
|
||||
log.info("Job submitted: %s (device=%s)", job_id, args.device)
|
||||
|
||||
try:
|
||||
job_status = wait_for_job(client, job_id, timeout=args.job_timeout)
|
||||
except TimeoutError as e:
|
||||
log.error("%s", e)
|
||||
write_summary(JobResult(passed=False, tests={}), title=f"QDC Job Timed Out ({args.device})")
|
||||
return 1
|
||||
log.info("Job %s finished: %s", job_id, job_status)
|
||||
|
||||
wait_for_log_upload(client, job_id)
|
||||
tests, raw_logs, failure_details = fetch_logs_and_parse_tests(client, job_id)
|
||||
|
||||
passed = job_status == JobState.COMPLETED.value.lower()
|
||||
if tests:
|
||||
passed = passed and all(tests.values())
|
||||
if not passed:
|
||||
log.error("Job did not complete successfully or tests failed (status=%s)", job_status)
|
||||
|
||||
result = JobResult(passed=passed, tests=tests, raw_logs=raw_logs, failure_details=failure_details)
|
||||
if args.test == "backend-ops":
|
||||
title = f"Backend Ops — HTP0 ({args.device})"
|
||||
elif args.test == "all":
|
||||
title = f"QDC Tests ({args.device})"
|
||||
else:
|
||||
title = f"QDC Test Results ({args.device})"
|
||||
write_summary(result, title=title)
|
||||
|
||||
return 0 if passed else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
20
scripts/snapdragon/qdc/tests/conftest.py
Normal file
20
scripts/snapdragon/qdc/tests/conftest.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Shared pytest fixtures for QDC on-device test runners."""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from appium import webdriver
|
||||
|
||||
from utils import options, write_qdc_log
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def driver():
    """Session-wide Appium driver connecting to the local Appium hub.

    QDC's Appium framework requires an active session for the duration of the
    run; autouse keeps it alive for every test module.
    """
    hub_url = "http://127.0.0.1:4723/wd/hub"
    return webdriver.Remote(command_executor=hub_url, options=options)
|
||||
|
||||
|
||||
def pytest_sessionfinish(session, exitstatus):
    """After the pytest session ends, upload the JUnit XML report to the device.

    The report is pushed via write_qdc_log so QDC's log collection picks it up
    alongside the other job logs. Nothing is uploaded if no report was written.
    """
    report_path = getattr(session.config.option, "xmlpath", None) or "results.xml"
    if not os.path.exists(report_path):
        return
    with open(report_path) as report:
        write_qdc_log("results.xml", report.read())
|
||||
41
scripts/snapdragon/qdc/tests/run_backend_ops_posix.py
Normal file
41
scripts/snapdragon/qdc/tests/run_backend_ops_posix.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""
|
||||
On-device test-backend-ops runner for llama.cpp (HTP0 backend).
|
||||
|
||||
Executed by QDC's Appium test framework on the QDC runner.
|
||||
The runner has ADB access to the allocated device.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def install(driver):
    """Ensure the llama.cpp bundle (with test-backend-ops) is on the device.

    Depends on the session `driver` fixture so the Appium session exists
    before any ADB work starts.
    """
    push_bundle_if_needed(f"{BIN_PATH}/test-backend-ops")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("type_a", ["mxfp4", "fp16", "q4_0"])
def test_backend_ops_htp0(type_a):
    """Run test-backend-ops MUL_MAT on the HTP0 backend for one src0 type."""
    # q4_0 uses a look-ahead regex that excludes one known-problematic shape;
    # the other types filter on type_a directly.
    if type_a == "q4_0":
        type_filter = r' -p "^(?=.*type_a=q4_0)(?!.*type_b=f32,m=576,n=512,k=576).*$"'
    else:
        type_filter = f" -p type_a={type_a}"

    command = (
        f"{CMD_PREFIX} GGML_HEXAGON_HOSTBUF=0 GGML_HEXAGON_EXPERIMENTAL=1 "
        f"{BIN_PATH}/test-backend-ops -b HTP0 -o MUL_MAT"
    ) + type_filter

    outcome = run_adb_command(command, check=False)
    # Preserve the full device-side output for QDC log collection.
    write_qdc_log(f"backend_ops_{type_a}.log", outcome.stdout or "")
    assert outcome.returncode == 0, f"test-backend-ops type_a={type_a} failed (exit {outcome.returncode})"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Direct-execution entry point for the QDC runner: run this file under
    # pytest with a JUnit report, then upload the report to the device.
    exit_code = pytest.main(["-s", "--junitxml=results.xml", os.path.realpath(__file__)])
    if os.path.exists("results.xml"):
        with open("results.xml") as report:
            write_qdc_log("results.xml", report.read())
    sys.exit(exit_code)
|
||||
76
scripts/snapdragon/qdc/tests/run_bench_tests_posix.py
Normal file
76
scripts/snapdragon/qdc/tests/run_bench_tests_posix.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""
|
||||
On-device bench and completion test runner for llama.cpp (CPU, GPU, NPU backends).
|
||||
|
||||
Executed by QDC's Appium test framework on the QDC runner.
|
||||
The runner has ADB access to the allocated device.
|
||||
|
||||
Placeholders replaced at artifact creation time by run_qdc_jobs.py:
|
||||
<<MODEL_URL>> Direct URL to the GGUF model file (downloaded on-device via curl)
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log
|
||||
|
||||
MODEL_PATH = "/data/local/tmp/model.gguf"
|
||||
PROMPT = "What is the capital of France?"
|
||||
CLI_OPTS = "--batch-size 128 -n 128 -no-cnv --seed 42"
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def install(driver):
    """Push the llama.cpp bundle and download the test model once per session."""
    push_bundle_if_needed(f"{BIN_PATH}/llama-cli")

    # Skip the (large) model download if it is already on the device.
    probe = subprocess.run(
        ["adb", "shell", f"ls {MODEL_PATH}"],
        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    if probe.returncode != 0:
        # <<MODEL_URL>> is substituted at artifact-creation time by run_qdc_jobs.py.
        run_adb_command(f'curl -L -J --output {MODEL_PATH} "<<MODEL_URL>>"')
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device,extra_flags", [
    pytest.param("none", "-ctk q8_0 -ctv q8_0", id="cpu"),
    pytest.param("GPUOpenCL", "", id="gpu"),
    pytest.param("HTP0", "-ctk q8_0 -ctv q8_0", id="npu"),
])
def test_llama_completion(device, extra_flags):
    """Run a short llama-completion prompt on one backend; assert exit code 0."""
    command = (
        f'{CMD_PREFIX} {BIN_PATH}/llama-completion'
        f' -m {MODEL_PATH} --device {device} -ngl 99 -t 4 {CLI_OPTS} {extra_flags} -fa on'
        f' -p "{PROMPT}"'
    )
    completion = run_adb_command(command, check=False)
    # Preserve the full device-side output for QDC log collection.
    write_qdc_log(f"llama_completion_{device}.log", completion.stdout or "")
    assert completion.returncode == 0, f"llama-completion {device} failed (exit {completion.returncode})"
|
||||
|
||||
|
||||
_DEVICE_LOG_NAME = {"none": "cpu", "GPUOpenCL": "gpu", "HTP0": "htp"}
|
||||
|
||||
|
||||
@pytest.mark.parametrize("device", [
    pytest.param("none", id="cpu"),
    pytest.param("GPUOpenCL", id="gpu"),
    pytest.param("HTP0", id="npu"),
])
def test_llama_bench(device):
    """Run a short llama-bench pass on one backend; assert exit code 0."""
    command = (
        f"{CMD_PREFIX} {BIN_PATH}/llama-bench"
        f" -m {MODEL_PATH} --device {device} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32"
    )
    bench = run_adb_command(command, check=False)
    # Log name uses the short backend alias (cpu/gpu/htp), not the device id.
    write_qdc_log(f"llama_bench_{_DEVICE_LOG_NAME[device]}.log", bench.stdout or "")
    assert bench.returncode == 0, f"llama-bench {device} failed (exit {bench.returncode})"
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Direct-execution entry point for the QDC runner: run this file under
    # pytest with a JUnit report, then upload the report to the device.
    exit_code = pytest.main(["-s", "--junitxml=results.xml", os.path.realpath(__file__)])
    if os.path.exists("results.xml"):
        with open("results.xml") as report:
            write_qdc_log("results.xml", report.read())
    sys.exit(exit_code)
|
||||
@@ -1,63 +0,0 @@
|
||||
import pytest
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
tmp_path='/data/local/tmp'
|
||||
pkg_path=f'{tmp_path}/llama.cpp'
|
||||
lib_path=f'{pkg_path}/lib'
|
||||
bin_path=f'{pkg_path}/bin'
|
||||
|
||||
model='../gguf/Llama-3.2-1B-Instruct-Q4_0.gguf'
|
||||
cli_pref=f'cd {pkg_path} && LD_LIBRARY_PATH={lib_path} ADSP_LIBRARY_PATH={lib_path} {bin_path}'
|
||||
|
||||
|
||||
def run_cmd(cmd):
|
||||
p = subprocess.run(cmd, text = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)
|
||||
sys.stdout.write(p.stdout)
|
||||
assert(p.returncode == 0)
|
||||
|
||||
|
||||
@pytest.mark.dependency()
|
||||
def test_install():
|
||||
run_cmd(['adb', 'push', 'llama.cpp', f'{tmp_path}'])
|
||||
run_cmd(['adb', 'shell', f'chmod 755 {bin_path}/*'])
|
||||
|
||||
|
||||
## Basic cli tests
|
||||
def run_llama_cli(dev, opts):
|
||||
prompt='what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**.'
|
||||
opts = '--batch-size 128 -n 128 -no-cnv --seed 42 ' + opts
|
||||
run_cmd(['adb', 'shell', f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 -t 4 {opts} -p "{prompt}"'])
|
||||
|
||||
|
||||
@pytest.mark.dependency(depends=['test_install'])
|
||||
def test_llama_cli_cpu():
|
||||
run_llama_cli('none', '-ctk q8_0 -ctv q8_0 -fa on')
|
||||
|
||||
|
||||
@pytest.mark.dependency(depends=['test_install'])
|
||||
def test_llama_cli_gpu():
|
||||
run_llama_cli('GPUOpenCL', '-fa on')
|
||||
|
||||
|
||||
@pytest.mark.dependency(depends=['test_install'])
|
||||
def test_llama_cli_npu():
|
||||
run_llama_cli('HTP0', '-ctk q8_0 -ctv q8_0 -fa on')
|
||||
|
||||
|
||||
## Basic bench tests
|
||||
def run_llama_bench(dev):
|
||||
run_cmd(['adb', 'shell', f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32'])
|
||||
|
||||
|
||||
@pytest.mark.dependency(depends=['test_install'])
|
||||
def test_llama_bench_cpu():
|
||||
run_llama_bench('none')
|
||||
|
||||
|
||||
def test_llama_bench_gpu():
|
||||
run_llama_bench('GPUOpenCL')
|
||||
|
||||
|
||||
def test_llama_bench_npu():
|
||||
run_llama_bench('HTP0')
|
||||
93
scripts/snapdragon/qdc/tests/utils.py
Normal file
93
scripts/snapdragon/qdc/tests/utils.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Shared helpers for QDC on-device test runners."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from appium.options.common import AppiumOptions
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# On-device paths
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
BUNDLE_PATH = "/data/local/tmp/llama_cpp_bundle"
|
||||
QDC_LOGS_PATH = "/data/local/tmp/QDC_logs"
|
||||
LIB_PATH = f"{BUNDLE_PATH}/lib"
|
||||
BIN_PATH = f"{BUNDLE_PATH}/bin"
|
||||
ENV_PREFIX = (
|
||||
f"export LD_LIBRARY_PATH={LIB_PATH} && "
|
||||
f"export ADSP_LIBRARY_PATH={LIB_PATH} && "
|
||||
f"chmod +x {BIN_PATH}/* &&"
|
||||
)
|
||||
CMD_PREFIX = f"cd {BUNDLE_PATH} && {ENV_PREFIX}"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Appium session options
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
options = AppiumOptions()
|
||||
options.set_capability("automationName", "UiAutomator2")
|
||||
options.set_capability("platformName", "Android")
|
||||
options.set_capability("deviceName", os.getenv("ANDROID_DEVICE_VERSION"))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# ADB helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def run_adb_command(cmd: str, *, check: bool = True) -> subprocess.CompletedProcess:
    """Run a shell command on the device via `adb shell`.

    A `__RC__:<code>` sentinel is appended to the command because `adb shell`
    doesn't reliably propagate the on-device exit code (older ADB versions
    always return 0). The sentinel line is stripped from the captured output
    and its value becomes the effective return code.

    When `check` is true, a nonzero effective return code fails the calling
    test via assert.
    """
    sentinel = "__RC__:"
    proc = subprocess.run(
        ["adb", "shell", f"{cmd}; echo __RC__:$?"],
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )

    output = proc.stdout
    rc = proc.returncode
    if output:
        out_lines = output.rstrip("\n").split("\n")
        if out_lines and out_lines[-1].startswith(sentinel):
            try:
                rc = int(out_lines[-1][len(sentinel):])
                output = "\n".join(out_lines[:-1]) + "\n"
            except ValueError:
                # Malformed sentinel — keep the raw output and ADB's exit code.
                pass

    log.info("%s", output)
    result = subprocess.CompletedProcess(proc.args, rc, stdout=output)
    if check:
        assert rc == 0, f"Command failed (exit {rc})"
    return result
|
||||
|
||||
|
||||
def write_qdc_log(filename: str, content: str) -> None:
    """Push content as a log file to QDC_LOGS_PATH on the device for QDC log collection.

    The content is written to a local temporary file and `adb push`ed to
    `{QDC_LOGS_PATH}/{filename}`. Best-effort: adb failures are swallowed so a
    log-upload problem never fails the test that produced the log.
    """
    subprocess.run(
        ["adb", "shell", f"mkdir -p {QDC_LOGS_PATH}"],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f:
        f.write(content)
        tmp_path = f.name
    try:
        subprocess.run(
            # Push under the caller-supplied name so each log keeps its
            # identity (previously the filename argument was ignored and every
            # upload overwrote the same destination file).
            ["adb", "push", tmp_path, f"{QDC_LOGS_PATH}/{filename}"],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        )
    finally:
        os.unlink(tmp_path)
|
||||
|
||||
|
||||
def push_bundle_if_needed(check_binary: str) -> None:
    """Push llama_cpp_bundle to the device if check_binary is not already present."""
    probe = subprocess.run(
        ["adb", "shell", f"ls {check_binary}"],
        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
    if probe.returncode == 0:
        return  # bundle already present on the device
    subprocess.run(
        ["adb", "push", "/qdc/appium/llama_cpp_bundle/", "/data/local/tmp"],
        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
    )
|
||||
@@ -21,11 +21,11 @@ if ($null -ne $env:V) {
|
||||
}
|
||||
|
||||
if ($null -ne $env:PROF) {
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
if ($null -ne $env:OPSTAGE) {
|
||||
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
|
||||
}
|
||||
|
||||
if ($null -ne $env:NHVX) {
|
||||
|
||||
@@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) {
|
||||
}
|
||||
|
||||
if ($null -ne $env:PROF) {
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
if ($null -ne $env:OPSTAGE) {
|
||||
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
|
||||
}
|
||||
|
||||
if ($null -ne $env:NHVX) {
|
||||
|
||||
@@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) {
|
||||
}
|
||||
|
||||
if ($null -ne $env:PROF) {
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
if ($null -ne $env:OPSTAGE) {
|
||||
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
|
||||
}
|
||||
|
||||
if ($null -ne $env:NHVX) {
|
||||
|
||||
@@ -34,11 +34,11 @@ if ($null -ne $env:SCHED) {
|
||||
}
|
||||
|
||||
if ($null -ne $env:PROF) {
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
if ($null -ne $env:OPSTAGE) {
|
||||
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
|
||||
}
|
||||
|
||||
if ($null -ne $env:NHVX) {
|
||||
|
||||
@@ -31,11 +31,11 @@ if ($null -ne $env:SCHED) {
|
||||
}
|
||||
|
||||
if ($null -ne $env:PROF) {
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1
|
||||
$env:GGML_HEXAGON_PROFILE=$env:PROF
|
||||
}
|
||||
|
||||
if ($null -ne $env:OPMASK) {
|
||||
$env:GGML_HEXAGON_OPMASK=$env:OPMASK
|
||||
if ($null -ne $env:OPSTAGE) {
|
||||
$env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE
|
||||
}
|
||||
|
||||
if ($null -ne $env:NHVX) {
|
||||
|
||||
@@ -59,8 +59,6 @@ struct cli_context {
|
||||
std::vector<raw_buffer> input_files;
|
||||
task_params defaults;
|
||||
bool verbose_prompt;
|
||||
int reasoning_budget = -1;
|
||||
std::string reasoning_budget_message;
|
||||
|
||||
// thread for showing "loading" animation
|
||||
std::atomic<bool> loading_show;
|
||||
@@ -77,8 +75,6 @@ struct cli_context {
|
||||
// defaults.return_progress = true; // TODO: show progress
|
||||
|
||||
verbose_prompt = params.verbose_prompt;
|
||||
reasoning_budget = params.sampling.reasoning_budget_tokens;
|
||||
reasoning_budget_message = params.sampling.reasoning_budget_message;
|
||||
}
|
||||
|
||||
std::string generate_completion(result_timings & out_timings) {
|
||||
@@ -106,7 +102,7 @@ struct cli_context {
|
||||
const llama_vocab * vocab = llama_model_get_vocab(
|
||||
llama_get_model(ctx_server.get_llama_context()));
|
||||
|
||||
task.params.sampling.reasoning_budget_tokens = reasoning_budget;
|
||||
task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens;
|
||||
task.params.sampling.generation_prompt = chat_params.generation_prompt;
|
||||
|
||||
if (!chat_params.thinking_start_tag.empty()) {
|
||||
@@ -116,7 +112,7 @@ struct cli_context {
|
||||
task.params.sampling.reasoning_budget_end =
|
||||
common_tokenize(vocab, chat_params.thinking_end_tag, false, true);
|
||||
task.params.sampling.reasoning_budget_forced =
|
||||
common_tokenize(vocab, reasoning_budget_message + chat_params.thinking_end_tag, false, true);
|
||||
common_tokenize(vocab, defaults.sampling.reasoning_budget_message + chat_params.thinking_end_tag, false, true);
|
||||
}
|
||||
|
||||
rd.post_task({std::move(task)});
|
||||
|
||||
@@ -675,6 +675,10 @@ private:
|
||||
|
||||
int32_t n_ctx; // total context for all clients / slots
|
||||
|
||||
// set to llama_model_n_swa(model)
|
||||
// if swa_full is enabled, this is set to 0 to simulate a non-SWA model
|
||||
int32_t n_swa;
|
||||
|
||||
// slots / clients
|
||||
std::vector<server_slot> slots;
|
||||
|
||||
@@ -719,7 +723,7 @@ private:
|
||||
return;
|
||||
}
|
||||
SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
|
||||
SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
|
||||
SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n");
|
||||
slot.prompt_save(*prompt_cache);
|
||||
slot.prompt_clear(false);
|
||||
prompt_cache->update();
|
||||
@@ -854,6 +858,8 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
|
||||
|
||||
// Necessary similarity of prompt for slot selection
|
||||
slot_prompt_similarity = params_base.slot_prompt_similarity;
|
||||
|
||||
@@ -996,7 +1002,7 @@ private:
|
||||
params_base.cache_idle_slots = false;
|
||||
} else {
|
||||
SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
|
||||
SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
|
||||
SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2415,9 +2421,6 @@ private:
|
||||
|
||||
llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
|
||||
|
||||
// note: when n_swa == 0, the model does not use SWA
|
||||
const auto n_swa = std::max(0, llama_model_n_swa(model));
|
||||
|
||||
// the largest pos_min required for a checkpoint to be useful
|
||||
const auto pos_min_thold = std::max(0, pos_next - n_swa);
|
||||
|
||||
@@ -2589,10 +2592,10 @@ private:
|
||||
// make a checkpoint of the parts of the memory that cannot be rolled back.
|
||||
// checkpoints are created only if:
|
||||
// - the model does not support partial sequence removal
|
||||
// - the model uses SWA and we are not using `swa_full`
|
||||
// - the model uses SWA (and we are not using `swa_full`)
|
||||
do_checkpoint = do_checkpoint && (
|
||||
(slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
|
||||
(llama_model_n_swa(model) > 0 && !params_base.swa_full));
|
||||
(n_swa > 0));
|
||||
|
||||
bool has_mtmd = false;
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ def test_clear_and_restore():
|
||||
log = LogReader(server.log_path)
|
||||
|
||||
# verify feature is enabled
|
||||
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain()
|
||||
assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" in log.drain()
|
||||
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT,
|
||||
@@ -59,7 +59,7 @@ def test_clear_and_restore():
|
||||
original_prompt_n = res.body["timings"]["prompt_n"]
|
||||
|
||||
# Slot 0 is the only slot with KV — should NOT be cleared
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
|
||||
assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()
|
||||
|
||||
# Launching slot 1 clears idle slot 0
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
@@ -68,7 +68,7 @@ def test_clear_and_restore():
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain()
|
||||
assert "__TEST_TAG_CACHE_IDLE_SLOT__" in log.drain()
|
||||
|
||||
# Re-send same prompt — should restore from cache-ram
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
@@ -86,7 +86,7 @@ def test_clear_and_restore():
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
|
||||
assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()
|
||||
|
||||
|
||||
def test_disabled_with_flag():
|
||||
@@ -96,7 +96,7 @@ def test_disabled_with_flag():
|
||||
log = LogReader(server.log_path)
|
||||
|
||||
# Feature should not be enabled
|
||||
assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain()
|
||||
assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" not in log.drain()
|
||||
|
||||
res = server.make_request("POST", "/completion", data={
|
||||
"prompt": LONG_PROMPT,
|
||||
@@ -112,4 +112,4 @@ def test_disabled_with_flag():
|
||||
"cache_prompt": True,
|
||||
})
|
||||
assert res.status_code == 200
|
||||
assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
|
||||
assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()
|
||||
|
||||
3
ty.toml
3
ty.toml
@@ -1,5 +1,5 @@
|
||||
[environment]
|
||||
extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests"]
|
||||
extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests", "./scripts/snapdragon/qdc/tests"]
|
||||
python-version = "3.10"
|
||||
|
||||
[rules]
|
||||
@@ -13,6 +13,7 @@ exclude = [
|
||||
[[overrides]]
|
||||
include = [
|
||||
"./tools/server/tests/**",
|
||||
"./scripts/snapdragon/qdc/tests/**",
|
||||
]
|
||||
|
||||
[overrides.rules]
|
||||
|
||||
Reference in New Issue
Block a user