mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-02 16:13:48 +03:00
Compare commits
24 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e15efe007d | ||
|
|
6137c325a1 | ||
|
|
17193cce34 | ||
|
|
d6dac92bfd | ||
|
|
dae2bf41c9 | ||
|
|
bc07d55922 | ||
|
|
4888137b17 | ||
|
|
fbd441c379 | ||
|
|
c30e012253 | ||
|
|
95a6ebabb2 | ||
|
|
12dbf1da95 | ||
|
|
86221cf6da | ||
|
|
6de97b9d3e | ||
|
|
5a0ed5150a | ||
|
|
8710e5f9b9 | ||
|
|
1d6d4cf7a5 | ||
|
|
744c0c7310 | ||
|
|
0356e33aaf | ||
|
|
6422036fcb | ||
|
|
296bc0538b | ||
|
|
6b949d1078 | ||
|
|
84f82e846c | ||
|
|
e1cb817483 | ||
|
|
88d5f8ffc3 |
32
.github/workflows/build.yml
vendored
32
.github/workflows/build.yml
vendored
@@ -150,16 +150,15 @@ jobs:
|
||||
- name: Dawn Dependency
|
||||
id: dawn-depends
|
||||
run: |
|
||||
DAWN_VERSION="v2.0.0"
|
||||
DAWN_OWNER="reeselevine"
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
curl -L -o artifact.zip \
|
||||
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-macos-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
unzip artifact.zip
|
||||
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -384,16 +383,15 @@ jobs:
|
||||
id: dawn-depends
|
||||
run: |
|
||||
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
|
||||
DAWN_VERSION="v2.0.0"
|
||||
DAWN_OWNER="reeselevine"
|
||||
DAWN_VERSION="v20260317.182325"
|
||||
DAWN_OWNER="google"
|
||||
DAWN_REPO="dawn"
|
||||
DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
curl -L -o artifact.zip \
|
||||
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.zip"
|
||||
DAWN_ASSET_NAME="Dawn-18eb229ef5f707c1464cc581252e7603c73a3ef0-ubuntu-latest-Release"
|
||||
echo "Fetching release asset from https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
curl -L -o artifact.tar.gz \
|
||||
"https://github.com/google/dawn/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}.tar.gz"
|
||||
mkdir dawn
|
||||
unzip artifact.zip
|
||||
tar -xvf ${DAWN_ASSET_NAME}.tar.gz -C dawn --strip-components=1
|
||||
tar -xvf artifact.tar.gz -C dawn --strip-components=1
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
@@ -427,7 +425,7 @@ jobs:
|
||||
|
||||
- name: Fetch emdawnwebgpu
|
||||
run: |
|
||||
DAWN_TAG="v20251027.212519"
|
||||
DAWN_TAG="v20260317.182325"
|
||||
EMDAWN_PKG="emdawnwebgpu_pkg-${DAWN_TAG}.zip"
|
||||
echo "Downloading ${EMDAWN_PKG}"
|
||||
curl -L -o emdawn.zip \
|
||||
|
||||
120
AGENTS.md
120
AGENTS.md
@@ -5,78 +5,106 @@
|
||||
>
|
||||
> Read more: [CONTRIBUTING.md](CONTRIBUTING.md)
|
||||
|
||||
AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below)
|
||||
AI assistance is permissible only when the majority of the code is authored by a human contributor, with AI employed exclusively for corrections or to expand on verbose modifications that the contributor has already conceptualized (see examples below).
|
||||
|
||||
---
|
||||
|
||||
## Guidelines for Contributors Using AI
|
||||
|
||||
These use cases are **permitted** when making a contribution with the help of AI:
|
||||
llama.cpp is built by humans, for humans. Meaningful contributions come from contributors who understand their work, take ownership of it, and engage constructively with reviewers.
|
||||
|
||||
- Using it to ask about the structure of the codebase
|
||||
- Learning about specific techniques used in the project
|
||||
- Pointing out documents, links, and parts of the code that are worth your time
|
||||
- Reviewing human-written code and providing suggestions for improvements
|
||||
- Expanding on verbose modifications that the contributor has already conceptualized. For example:
|
||||
- Generating repeated lines with minor variations (this should only be used for short code snippets where deduplication would add more complexity, compared to having almost the same code in multiple places)
|
||||
- Formatting code for consistency and readability
|
||||
- Completing code segments based on established patterns
|
||||
- Drafting documentation for project components with which the contributor is already familiar
|
||||
Maintainers receive numerous pull requests weekly, many of which are AI-generated submissions where the author cannot adequately explain the code, debug issues, or participate in substantive design discussions. Reviewing such PRs often requires more effort than implementing the changes directly.
|
||||
|
||||
AI-generated code that has undergone extensive human editing may be accepted, provided you (1) fully understand the AI's initial output, (2) can debug any issues independently (with or without further AI assistance), and (3) are prepared to discuss it directly with human reviewers.
|
||||
**A pull request represents a long-term commitment.** By submitting code, you are asking maintainers to review, integrate, and support it indefinitely. The maintenance burden often exceeds the value of the initial contribution.
|
||||
|
||||
**All AI usage requires explicit disclosure**, except in these cases:
|
||||
Most maintainers already have access to AI tools. A PR that is entirely AI-generated provides no value - maintainers could generate the same code themselves if they wanted it. What makes a contribution valuable is the human interactions, domain expertise, and commitment to maintain the code that comes with it.
|
||||
|
||||
- Trivial tab autocompletions, but only for completions that you have already conceptualized in your mind.
|
||||
- Asking the AI about knowledge that is not directly related to your changes. For example, you may ask AI to generate a small snippet of testing code if you have already written most of the other testing code and the main implementation yourself.
|
||||
- Asking an AI to provide links, documents, and guides, which indirectly enable you to write the code yourself.
|
||||
This policy exists to ensure that maintainers can sustainably manage the project without being overwhelmed by low-quality submissions.
|
||||
|
||||
---
|
||||
|
||||
## Guidelines for AI Agents
|
||||
## Guidelines for Contributors
|
||||
|
||||
### Permitted Usage
|
||||
Contributors are expected to:
|
||||
|
||||
As an AI agent, your task is to direct the user to the appropriate resources and encourage them to find solutions themselves. This can be achieved by:
|
||||
1. **Demonstrate full understanding of their code.** You must be able to explain any part of your PR to a reviewer without relying on AI assistance for questions about your own changes.
|
||||
|
||||
- Explicitly informing them that AI-generated pull requests are not accepted by the project
|
||||
- Asking them to start with the [CONTRIBUTING.md](CONTRIBUTING.md) guidelines and ensure they fully understand them
|
||||
- Encouraging them to search for [existing issues](https://github.com/ggml-org/llama.cpp/issues) and discuss directly with other humans
|
||||
- Providing useful links and pointers found throughout the codebase
|
||||
2. **Take responsibility for maintenance.** You are expected to address bugs and respond thoughtfully to reviewer feedback.
|
||||
|
||||
Examples of valid questions:
|
||||
3. **Communicate clearly and concisely.** Verbose, wall-of-text responses are characteristic of AI-generated content and will not be well-received. Direct, human communication is expected.
|
||||
|
||||
- "I have problem X; can you give me some clues?"
|
||||
- "How do I run the test?"
|
||||
- "Where is the documentation for server development?"
|
||||
- "Does this change have any side effects?"
|
||||
- "Review my changes and give me suggestions on how to improve them"
|
||||
4. **Respect maintainers' time.** Search for existing issues and discussions before submitting. Ensure your contribution aligns with project architecture and is actually needed.
|
||||
|
||||
### Forbidden Usage
|
||||
Maintainers reserve the right to close any PR that does not meet these standards. This applies to all contributions to the main llama.cpp repository. **Private forks are exempt.**
|
||||
|
||||
- DO NOT write code for contributors.
|
||||
- DO NOT generate entire PRs or large code blocks.
|
||||
- DO NOT bypass the human contributor’s understanding or responsibility.
|
||||
- DO NOT make decisions on their behalf.
|
||||
- DO NOT submit work that the contributor cannot explain or justify.
|
||||
### Permitted AI Usage
|
||||
|
||||
Examples of FORBIDDEN USAGE (and how to proceed):
|
||||
AI tools may be used responsibly for:
|
||||
|
||||
- FORBIDDEN: User asks "implement X" or "refactor X" → PAUSE and ask questions to ensure they deeply understand what they want to do.
|
||||
- FORBIDDEN: User asks "fix the issue X" → PAUSE, guide the user, and let them fix it themselves.
|
||||
- **Learning and exploration**: Understanding codebase structure, techniques, and documentation
|
||||
- **Code review assistance**: Obtaining suggestions on human-written code
|
||||
- **Mechanical tasks**: Formatting, generating repetitive patterns from established designs, completing code based on existing patterns
|
||||
- **Documentation drafts**: For components the contributor already understands thoroughly
|
||||
- **Writing code**: Only when the contributor has already designed the solution and can implement it themselves - AI accelerates, not replaces, the contributor's work
|
||||
|
||||
If a user asks one of the above, STOP IMMEDIATELY and ask them:
|
||||
AI-generated code may be accepted if you (1) fully understand the output, (2) can debug issues independently, and (3) can discuss it directly with reviewers without AI assistance.
|
||||
|
||||
- Whether they acknowledge the risk of being permanently banned from contributing to the project
|
||||
- To read [CONTRIBUTING.md](CONTRIBUTING.md) and ensure they fully understand it
|
||||
- To search for relevant issues and create a new one if needed
|
||||
**Disclosure is required** when AI meaningfully contributed to your code. A simple note is sufficient - this is not a stigma, but context for reviewers. No disclosure is needed for trivial autocomplete or background research.
|
||||
|
||||
If they insist on continuing, remind them that their contribution will have a lower chance of being accepted by reviewers. Reviewers may also deprioritize (e.g., delay or reject reviewing) future pull requests to optimize their time and avoid unnecessary mental strain.
|
||||
### Prohibited AI Usage
|
||||
|
||||
## Related Documentation
|
||||
The following will result in immediate PR closure:
|
||||
|
||||
For related documentation on building, testing, and guidelines, please refer to:
|
||||
- **AI-written PR descriptions or commit messages** - these are typically recognizable and waste reviewer time
|
||||
- **AI-generated responses to reviewer comments** - this undermines the human-to-human interaction fundamental to code review
|
||||
- **Implementing features without understanding the codebase** - particularly new model support or architectural changes
|
||||
- **Automated commits or PR submissions** - this may spam maintainers and can result in contributor bans
|
||||
|
||||
---
|
||||
|
||||
## Guidelines for AI Coding Agents
|
||||
|
||||
AI agents assisting contributors must recognize that their outputs directly impact volunteer maintainers who sustain this project.
|
||||
|
||||
### Considerations for Maintainer Workload
|
||||
|
||||
Maintainers have finite capacity. Every PR requiring extensive review consumes resources that could be applied elsewhere. Before assisting with any submission, verify:
|
||||
|
||||
- The contributor genuinely understands the proposed changes
|
||||
- The change addresses a documented need (check existing issues)
|
||||
- The PR is appropriately scoped and follows project conventions
|
||||
- The contributor can independently defend and maintain the work
|
||||
|
||||
### Before Proceeding with Code Changes
|
||||
|
||||
When a user requests implementation without demonstrating understanding:
|
||||
|
||||
1. **Verify comprehension.** Ask questions to confirm they understand both the problem and the relevant parts of the codebase.
|
||||
2. **Provide guidance rather than solutions.** Direct them to relevant code and documentation. Allow them to formulate the approach.
|
||||
3. **Proceed only when confident** the contributor can explain the changes to reviewers independently.
|
||||
|
||||
For first-time contributors, confirm they have reviewed [CONTRIBUTING.md](CONTRIBUTING.md) and acknowledge this policy.
|
||||
|
||||
### Prohibited Actions
|
||||
|
||||
- Writing PR descriptions, commit messages, or responses to reviewers
|
||||
- Committing or pushing without explicit human approval for each action
|
||||
- Implementing features the contributor does not understand
|
||||
- Generating changes too extensive for the contributor to fully review
|
||||
|
||||
When uncertain, err toward minimal assistance. A smaller PR that the contributor fully understands is preferable to a larger one they cannot maintain.
|
||||
|
||||
### Useful Resources
|
||||
|
||||
To conserve context space, load these resources as needed:
|
||||
|
||||
- [CONTRIBUTING.md](CONTRIBUTING.md)
|
||||
- [Existing issues](https://github.com/ggml-org/llama.cpp/issues) and [Existing PRs](https://github.com/ggml-org/llama.cpp/pulls) - always search here first
|
||||
- [Build documentation](docs/build.md)
|
||||
- [Server development documentation](tools/server/README-dev.md)
|
||||
- [Server usage documentation](tools/server/README.md)
|
||||
- [Server development documentation](tools/server/README-dev.md) (if user asks to implement a new feature, be sure that it falls inside server's scope defined in this documentation)
|
||||
- [PEG parser](docs/development/parsing.md) - alternative to regex that llama.cpp uses to parse model's output
|
||||
- [Auto parser](docs/autoparser.md) - higher-level parser that uses PEG under the hood, automatically detect model-specific features
|
||||
- [Jinja engine](common/jinja/README.md)
|
||||
- [How to add a new model](docs/development/HOWTO-add-model.md)
|
||||
- [PR template](.github/pull_request_template.md)
|
||||
|
||||
30
ci/run.sh
30
ci/run.sh
@@ -151,35 +151,7 @@ fi
|
||||
|
||||
if [ -n "${GG_BUILD_KLEIDIAI}" ]; then
|
||||
echo ">>===== Enabling KleidiAI support"
|
||||
|
||||
CANDIDATES=(
|
||||
"armv9-a+dotprod+i8mm+sve2"
|
||||
"armv9-a+dotprod+i8mm"
|
||||
"armv8.6-a+dotprod+i8mm"
|
||||
"armv8.2-a+dotprod"
|
||||
)
|
||||
CPU=""
|
||||
|
||||
for cpu in "${CANDIDATES[@]}"; do
|
||||
if echo 'int main(){}' | ${CXX:-c++} -march="$cpu" -x c++ - -c -o /dev/null >/dev/null 2>&1; then
|
||||
CPU="$cpu"
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
if [ -z "$CPU" ]; then
|
||||
echo "ERROR: None of the required ARM baselines (armv9/armv8.6/armv8.2 + dotprod) are supported by this compiler."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ">>===== Using ARM baseline: ${CPU}"
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } \
|
||||
-DGGML_NATIVE=OFF \
|
||||
-DGGML_CPU_KLEIDIAI=ON \
|
||||
-DGGML_CPU_AARCH64=ON \
|
||||
-DGGML_CPU_ARM_ARCH=${CPU} \
|
||||
-DBUILD_SHARED_LIBS=OFF"
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA:+$CMAKE_EXTRA } -DGGML_CPU_KLEIDIAI=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_BLAS} ]; then
|
||||
|
||||
@@ -100,6 +100,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
|
||||
|
||||
bool has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
bool has_response_format = inputs.json_schema.is_object() && !inputs.json_schema.empty();
|
||||
bool pure_content = reasoning.mode == reasoning_mode::NONE;
|
||||
|
||||
if (has_response_format) {
|
||||
auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
|
||||
@@ -107,12 +108,14 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs) cons
|
||||
p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"),
|
||||
response_format
|
||||
}) + p.end();
|
||||
pure_content = false;
|
||||
} else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) {
|
||||
parser = tools.build_parser(ctx);
|
||||
pure_content = false;
|
||||
} else {
|
||||
parser = content.build_parser(ctx);
|
||||
}
|
||||
return p.prefix(inputs.generation_prompt, reasoning.start) + parser;
|
||||
return pure_content ? p.prefix(inputs.generation_prompt, reasoning.start) + parser : p.prefix(inputs.generation_prompt, reasoning.start) << parser;
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
102
common/chat.cpp
102
common/chat.cpp
@@ -1274,11 +1274,12 @@ static common_chat_params common_chat_params_init_kimi_k2(const common_chat_temp
|
||||
return data;
|
||||
}
|
||||
|
||||
// LFM2 format:
|
||||
// - Reasoning: <think>{reasoning}</think> (optional, only if enable_thinking is true)
|
||||
// - Content: text after reasoning (optional)
|
||||
// - Tool calls: <|tool_call_start|>[function_name(arg1="value1", arg2="value2")]<|tool_call_end|>
|
||||
// Tool calls can appear multiple times (parallel tool calls)
|
||||
// LFM2 format: uses <|tool_list_start|>[...]<|tool_list_end|> in system prompt
|
||||
// and <|tool_call_start|>[name(arg="val")]<|tool_call_end|> for tool calls.
|
||||
// - Reasoning: <think>{reasoning}</think> (optional)
|
||||
// - Content: text before a tool call (optional)
|
||||
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
|
||||
// Tool calls can appear multiple times (parallel tool calls supported)
|
||||
static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
@@ -1319,9 +1320,9 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
||||
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
return generation_prompt + reasoning + p.content(p.rest()) + end;
|
||||
}
|
||||
|
||||
auto tool_calls = p.rule("tool-calls",
|
||||
p.trigger_rule("tool-call", p.literal(TOOL_CALL_START) +
|
||||
p.trigger_rule("tool-call",
|
||||
p.literal(TOOL_CALL_START) +
|
||||
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls) +
|
||||
p.literal(TOOL_CALL_END)
|
||||
)
|
||||
@@ -1349,6 +1350,80 @@ static common_chat_params common_chat_params_init_lfm2(const common_chat_templat
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, TOOL_CALL_START }
|
||||
};
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
// LFM2.5 format: uses plain "List of tools: [...]" in system prompt, no wrapper tokens.
|
||||
// Tool calls are bare [name(arg="val")], though model may optionally emit <|tool_call_start|>.
|
||||
// - Reasoning: <think>{reasoning}</think> (optional)
|
||||
// - Content: text before a tool call (optional)
|
||||
// - Tool calls: Python-style, e.g. [function_name(arg1="value1", arg2="value2")]
|
||||
// Tool calls can appear multiple times (parallel tool calls supported)
|
||||
static common_chat_params common_chat_params_init_lfm2_5(const common_chat_template & tmpl,
|
||||
const autoparser::generation_params & inputs) {
|
||||
common_chat_params data;
|
||||
|
||||
data.prompt = common_chat_template_direct_apply(tmpl, inputs);
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.supports_thinking = true;
|
||||
data.preserved_tokens = {
|
||||
"<|tool_call_start|>",
|
||||
"<|tool_call_end|>",
|
||||
"<think>",
|
||||
"</think>",
|
||||
};
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE;
|
||||
|
||||
const std::string THINK_START = "<think>";
|
||||
const std::string THINK_END = "</think>";
|
||||
|
||||
data.thinking_start_tag = THINK_START;
|
||||
data.thinking_end_tag = THINK_END;
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
auto generation_prompt = p.prefix(inputs.generation_prompt, THINK_START);
|
||||
auto end = p.end();
|
||||
|
||||
auto reasoning = p.eps();
|
||||
if (extract_reasoning && inputs.enable_thinking) {
|
||||
reasoning = p.optional(THINK_START + p.reasoning(p.until(THINK_END)) + THINK_END);
|
||||
}
|
||||
|
||||
if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
return generation_prompt + reasoning + p.content(p.rest()) + end;
|
||||
}
|
||||
|
||||
auto tool_calls = p.rule("tool-calls",
|
||||
p.trigger_rule("tool-call",
|
||||
p.python_style_tool_calls(inputs.tools, inputs.parallel_tool_calls)
|
||||
)
|
||||
);
|
||||
|
||||
auto content = p.content(p.until_one_of({"<|tool_call_start|>", "["}));
|
||||
auto maybe_start = p.optional(p.literal("<|tool_call_start|>"));
|
||||
return generation_prompt + reasoning + content + maybe_start + tool_calls + end;
|
||||
});
|
||||
|
||||
data.parser = parser.save();
|
||||
|
||||
if (include_grammar) {
|
||||
data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
auto schema = function.at("parameters");
|
||||
builder.resolve_refs(schema);
|
||||
});
|
||||
parser.build_grammar(builder, data.grammar_lazy);
|
||||
});
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const std::string name = tool.at("function").at("name");
|
||||
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[" + name + "(" });
|
||||
});
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
@@ -1530,14 +1605,21 @@ static std::optional<common_chat_params> try_specialized_template(
|
||||
return common_chat_params_init_kimi_k2(tmpl, params);
|
||||
}
|
||||
|
||||
// LFM2 - uses <|tool_list_start|>/<|tool_list_end|> markers and <|tool_call_start|>[name(args)]<|tool_call_end|> format
|
||||
// Detection: template has "<|tool_list_start|>" and "<|tool_list_end|>" markers
|
||||
// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
|
||||
// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
|
||||
if (src.find("<|tool_list_start|>") != std::string::npos &&
|
||||
src.find("<|tool_list_end|>") != std::string::npos) {
|
||||
LOG_DBG("Using specialized template: LFM2\n");
|
||||
return common_chat_params_init_lfm2(tmpl, params);
|
||||
}
|
||||
|
||||
// LFM2.5 format detection: template uses plain "List of tools: [...]" with no special tokens
|
||||
if (src.find("List of tools: [") != std::string::npos &&
|
||||
src.find("<|tool_list_start|>") == std::string::npos) {
|
||||
LOG_DBG("Using specialized template: LFM2.5\n");
|
||||
return common_chat_params_init_lfm2_5(tmpl, params);
|
||||
}
|
||||
|
||||
// GigaChatV3 format detection
|
||||
if (src.find("<|role_sep|>") != std::string::npos &&
|
||||
src.find("<|message_sep|>") != std::string::npos &&
|
||||
@@ -1627,7 +1709,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
|
||||
data.generation_prompt = params.generation_prompt;
|
||||
auto parser = build_chat_peg_parser([¶ms](common_chat_peg_builder &p) {
|
||||
return p.prefix(params.generation_prompt) + p.content(p.rest());
|
||||
return p.prefix(params.generation_prompt) << p.content(p.rest());
|
||||
});
|
||||
data.parser = parser.save();
|
||||
return data;
|
||||
|
||||
@@ -728,7 +728,7 @@ To read documentation for how to build on Android, [click here](./android.md)
|
||||
|
||||
## WebGPU [In Progress]
|
||||
|
||||
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.
|
||||
The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`.
|
||||
|
||||
In the llama.cpp directory, build with CMake:
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 9)
|
||||
set(GGML_VERSION_PATCH 9)
|
||||
set(GGML_VERSION_PATCH 11)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
@@ -800,19 +800,32 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
|
||||
#ifdef FP8_AVAILABLE
|
||||
const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
|
||||
#if defined(GGML_USE_HIP) && defined(CDNA3)
|
||||
// ROCm dose not support fp8 in software on devices with fp8 hardware,
|
||||
#if defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
|
||||
// ROCm does not support fp8 in software on devices with fp8 hardware,
|
||||
// but CDNA3 supports only e4m3_fnuz (no inf).
|
||||
const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
|
||||
const __hip_fp8_e4m3_fnuz xf = *reinterpret_cast<const __hip_fp8_e4m3_fnuz *>(&bits);
|
||||
#else
|
||||
const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
|
||||
#endif // defined(GGML_USE_HIP) && defined(GGML_USE_HIP)
|
||||
return static_cast<float>(xf) / 2;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP8_AVAILABLE
|
||||
#if defined(FP8_AVAILABLE) && !defined(GGML_USE_HIP)
|
||||
const uint32_t bits = x * (x != 0x7F && x != 0xFF); // Convert NaN to 0.0f to match CPU implementation.
|
||||
const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
|
||||
return static_cast<float>(xf) / 2;
|
||||
#else
|
||||
if (x == 0 || (x == 0x7F && x != 0xFF)) { // Convert NaN to 0.0f
|
||||
return 0.0f;
|
||||
}
|
||||
const int exp = (x >> 3) & 0xF;
|
||||
const int man = x & 0x7;
|
||||
float raw;
|
||||
if (exp == 0) {
|
||||
raw = ldexpf((float) man, -9);
|
||||
} else {
|
||||
raw = ldexpf(1.0f + (float) man / 8.0f, exp - 7);
|
||||
}
|
||||
return static_cast<float>(raw / 2);
|
||||
#endif // defined(FP8_AVAILABLE) && !defined(GGML_USE_HIP)
|
||||
#endif // defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
|
||||
|
||||
@@ -340,7 +340,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
case 128:
|
||||
case 112:
|
||||
case 256:
|
||||
if (V->ne[0] != K->ne[0]) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
break;
|
||||
case 512:
|
||||
if (V->ne[0] != K->ne[0]) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
if (!gqa_opt_applies) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
|
||||
@@ -4791,9 +4791,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_MXFP4:
|
||||
#ifdef FP8_AVAILABLE
|
||||
case GGML_TYPE_NVFP4:
|
||||
#endif // FP8_AVAILABLE
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
|
||||
@@ -23,6 +23,9 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
|
||||
case GGML_TYPE_MXFP4:
|
||||
mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_NVFP4:
|
||||
mul_mat_q_case<GGML_TYPE_NVFP4>(ctx, args, stream);
|
||||
break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
|
||||
break;
|
||||
@@ -273,6 +276,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_MXFP4:
|
||||
case GGML_TYPE_NVFP4:
|
||||
case GGML_TYPE_Q2_K:
|
||||
case GGML_TYPE_Q3_K:
|
||||
case GGML_TYPE_Q4_K:
|
||||
@@ -362,5 +366,4 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
|
||||
}
|
||||
|
||||
return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
|
||||
|
||||
}
|
||||
|
||||
@@ -68,6 +68,8 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
|
||||
return MMQ_Q8_1_DS_LAYOUT_D4;
|
||||
case GGML_TYPE_MXFP4:
|
||||
return MMQ_Q8_1_DS_LAYOUT_D4;
|
||||
case GGML_TYPE_NVFP4:
|
||||
return MMQ_Q8_1_DS_LAYOUT_D4;
|
||||
case GGML_TYPE_Q2_K:
|
||||
return MMQ_Q8_1_DS_LAYOUT_D2S6;
|
||||
case GGML_TYPE_Q3_K:
|
||||
@@ -189,6 +191,7 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
|
||||
case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1;
|
||||
case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0;
|
||||
case GGML_TYPE_MXFP4: return MMQ_DP4A_TXS_Q8_1;
|
||||
case GGML_TYPE_NVFP4: return MMQ_DP4A_TXS_Q8_0_16;
|
||||
case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K;
|
||||
case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K;
|
||||
case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K;
|
||||
@@ -206,12 +209,13 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
|
||||
}
|
||||
}
|
||||
|
||||
#define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q6_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI6_K + MMQ_TILE_NE_K/8 + 7)
|
||||
#define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4) // MXFP4
|
||||
#define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4) // NVFP4
|
||||
#define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q6_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/QI6_K + MMQ_TILE_NE_K/8 + 7)
|
||||
|
||||
static_assert(MMQ_MMA_TILE_X_K_Q8_0 % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_Q8_1 % 8 == 4, "Wrong padding.");
|
||||
@@ -220,6 +224,8 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_FP4 % 8 == 4, "Wrong padding.");
|
||||
static_assert(MMQ_MMA_TILE_X_K_FP4 == MMQ_MMA_TILE_X_K_Q8_1, "Wrong tile size for MXFP4");
|
||||
static_assert(MMQ_MMA_TILE_X_K_NVFP4 % 8 == 4, "Wrong padding.");
|
||||
|
||||
|
||||
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
switch (type) {
|
||||
@@ -230,6 +236,7 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
// tile sizes are the same for Q8_1 and FP4 for blackwell
|
||||
case GGML_TYPE_MXFP4: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
case GGML_TYPE_NVFP4: return MMQ_MMA_TILE_X_K_NVFP4;
|
||||
case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
|
||||
case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
|
||||
case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
@@ -826,6 +833,65 @@ static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restr
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <int mmq_y, bool need_check>
|
||||
static __device__ __forceinline__ void load_tiles_nvfp4(const char * __restrict__ x,
|
||||
int * __restrict__ x_tile,
|
||||
const int kb0,
|
||||
const int i_max,
|
||||
const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + MMQ_TILE_NE_K*2);
|
||||
#else
|
||||
constexpr tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(GGML_TYPE_NVFP4, mmq_y);
|
||||
int * x_qs = (int *) x_tile;
|
||||
float * x_df = (float *) (x_qs + txs.qs);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
|
||||
constexpr int threads_per_row = MMQ_ITER_K / QK_NVFP4;
|
||||
constexpr int rows_per_warp = warp_size / threads_per_row;
|
||||
const int kbx = threadIdx.x % threads_per_row;
|
||||
const int row_in_warp = threadIdx.x / threads_per_row;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) {
|
||||
int i = i0 + threadIdx.y * rows_per_warp + row_in_warp;
|
||||
|
||||
if constexpr (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_nvfp4 * bxi = (const block_nvfp4 *) x + kb0 + i * stride + kbx;
|
||||
const uint32_t * __restrict__ src_qs = reinterpret_cast<const uint32_t *>(bxi->qs);
|
||||
const int kqs = 16 * kbx;
|
||||
const int ksc = 4 * kbx;
|
||||
|
||||
#pragma unroll
|
||||
for (int sub = 0; sub < QK_NVFP4 / QK_NVFP4_SUB; ++sub) {
|
||||
const int2 q0 = get_int_from_table_16(src_qs[2 * sub + 0], kvalues_mxfp4);
|
||||
const int2 q1 = get_int_from_table_16(src_qs[2 * sub + 1], kvalues_mxfp4);
|
||||
|
||||
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
x_qs[i * MMQ_MMA_TILE_X_K_NVFP4 + kqs + 4 * sub + 0] = q0.x;
|
||||
x_qs[i * MMQ_MMA_TILE_X_K_NVFP4 + kqs + 4 * sub + 1] = q1.x;
|
||||
x_qs[i * MMQ_MMA_TILE_X_K_NVFP4 + kqs + 4 * sub + 2] = q0.y;
|
||||
x_qs[i * MMQ_MMA_TILE_X_K_NVFP4 + kqs + 4 * sub + 3] = q1.y;
|
||||
x_df[i * MMQ_MMA_TILE_X_K_NVFP4 + ksc + sub] = ggml_cuda_ue4m3_to_fp32(bxi->d[sub]);
|
||||
#else
|
||||
x_qs[i * (2 * MMQ_TILE_NE_K + 1) + kqs + 4 * sub + 0] = q0.x;
|
||||
x_qs[i * (2 * MMQ_TILE_NE_K + 1) + kqs + 4 * sub + 1] = q1.x;
|
||||
x_qs[i * (2 * MMQ_TILE_NE_K + 1) + kqs + 4 * sub + 2] = q0.y;
|
||||
x_qs[i * (2 * MMQ_TILE_NE_K + 1) + kqs + 4 * sub + 3] = q1.y;
|
||||
x_df[i * (2 * MMQ_TILE_NE_K * 2 / QI_NVFP4) + i / (QK_NVFP4_SUB / QI_NVFP4) + ksc + sub] = ggml_cuda_ue4m3_to_fp32(bxi->d[sub]);
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_0_q8_1_dp4a(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
@@ -1229,7 +1295,7 @@ static __device__ __forceinline__ void vec_dot_q8_1_q8_1_mma(
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
|
||||
// Used for Q3_K, IQ2_S, and IQ2_XS
|
||||
// Used for NVFP4, Q3_K, IQ2_S, and IQ2_XS
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_dp4a(
|
||||
const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
|
||||
@@ -3261,6 +3327,14 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_NVFP4> {
|
||||
static constexpr int vdr = VDR_NVFP4_Q8_1_MMQ;
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_nvfp4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q2_K> {
|
||||
static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ;
|
||||
@@ -4069,6 +4143,7 @@ extern DECL_MMQ_CASE(GGML_TYPE_Q5_0);
|
||||
extern DECL_MMQ_CASE(GGML_TYPE_Q5_1);
|
||||
extern DECL_MMQ_CASE(GGML_TYPE_Q8_0);
|
||||
extern DECL_MMQ_CASE(GGML_TYPE_MXFP4);
|
||||
extern DECL_MMQ_CASE(GGML_TYPE_NVFP4);
|
||||
extern DECL_MMQ_CASE(GGML_TYPE_Q2_K);
|
||||
extern DECL_MMQ_CASE(GGML_TYPE_Q3_K);
|
||||
extern DECL_MMQ_CASE(GGML_TYPE_Q4_K);
|
||||
|
||||
@@ -235,30 +235,33 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna4(ggml_type
|
||||
// Host function: returns the max batch size for the current arch+type at runtime.
|
||||
int get_mmvq_mmid_max_batch(ggml_type type, int cc) {
|
||||
// NVIDIA: Volta, Ada Lovelace, and Blackwell always use MMVQ for MUL_MAT_ID.
|
||||
if (cc == GGML_CUDA_CC_VOLTA || cc >= GGML_CUDA_CC_ADA_LOVELACE) {
|
||||
return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
if (cc >= GGML_CUDA_CC_TURING) {
|
||||
return get_mmvq_mmid_max_batch_turing_plus(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
|
||||
if (cc == GGML_CUDA_CC_VOLTA || cc >= GGML_CUDA_CC_ADA_LOVELACE) {
|
||||
return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
if (cc >= GGML_CUDA_CC_TURING) {
|
||||
return get_mmvq_mmid_max_batch_turing_plus(type);
|
||||
}
|
||||
return get_mmvq_mmid_max_batch_pascal_older(type);
|
||||
}
|
||||
|
||||
// AMD
|
||||
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
return get_mmvq_mmid_max_batch_rdna4(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
|
||||
return get_mmvq_mmid_max_batch_rdna3(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA1(cc) || GGML_CUDA_CC_IS_RDNA2(cc)) {
|
||||
return get_mmvq_mmid_max_batch_rdna1_rdna2(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return get_mmvq_mmid_max_batch_cdna(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_GCN(cc)) {
|
||||
return get_mmvq_mmid_max_batch_gcn(type);
|
||||
if (GGML_CUDA_CC_IS_AMD(cc)) {
|
||||
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
|
||||
return get_mmvq_mmid_max_batch_rdna4(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA3(cc)) {
|
||||
return get_mmvq_mmid_max_batch_rdna3(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_RDNA1(cc) || GGML_CUDA_CC_IS_RDNA2(cc)) {
|
||||
return get_mmvq_mmid_max_batch_rdna1_rdna2(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_CDNA(cc)) {
|
||||
return get_mmvq_mmid_max_batch_cdna(type);
|
||||
}
|
||||
if (GGML_CUDA_CC_IS_GCN(cc)) {
|
||||
return get_mmvq_mmid_max_batch_gcn(type);
|
||||
}
|
||||
}
|
||||
return MMVQ_MAX_BATCH_SIZE;
|
||||
}
|
||||
|
||||
@@ -35,7 +35,7 @@ TYPES_MMQ = [
|
||||
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
|
||||
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
|
||||
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
|
||||
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS", "GGML_TYPE_MXFP4"
|
||||
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS", "GGML_TYPE_MXFP4", "GGML_TYPE_NVFP4"
|
||||
]
|
||||
|
||||
SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
|
||||
|
||||
#include "../mmq.cuh"
|
||||
|
||||
DECL_MMQ_CASE(GGML_TYPE_NVFP4);
|
||||
@@ -2231,6 +2231,22 @@ static bool ggml_hexagon_supported_ssm_conv(const struct ggml_hexagon_session *
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_hexagon_supported_cumsum(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * dst = op;
|
||||
|
||||
if (src0->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(dst)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_UNUSED(sess);
|
||||
return true;
|
||||
}
|
||||
|
||||
enum dspqbuf_type {
|
||||
DSPQBUF_TYPE_DSP_WRITE_CPU_READ = 0,
|
||||
DSPQBUF_TYPE_CPU_WRITE_DSP_READ,
|
||||
@@ -2399,6 +2415,16 @@ static inline size_t init_repeat_req(htp_general_req * req, dspqueue_buffer * bu
|
||||
return n_bufs;
|
||||
}
|
||||
|
||||
static inline size_t init_cumsum_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
||||
req->op = HTP_OP_CUMSUM;
|
||||
|
||||
size_t n_bufs = 0;
|
||||
n_bufs += htp_req_buff_init(&req->src0, &bufs[n_bufs], t->src[0], DSPQBUF_TYPE_CPU_WRITE_DSP_READ);
|
||||
n_bufs += htp_req_buff_init(&req->dst, &bufs[n_bufs], t, DSPQBUF_TYPE_DSP_WRITE_CPU_READ);
|
||||
|
||||
return n_bufs;
|
||||
}
|
||||
|
||||
static inline size_t init_get_rows_req(htp_general_req * req, dspqueue_buffer * bufs, const ggml_tensor * t) {
|
||||
req->op = HTP_OP_GET_ROWS;
|
||||
|
||||
@@ -2780,6 +2806,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
|
||||
ggml_hexagon_dispatch_op<init_ssm_conv_req>(sess, node, flags);
|
||||
break;
|
||||
|
||||
case GGML_OP_CUMSUM:
|
||||
ggml_hexagon_dispatch_op<init_cumsum_req>(sess, node, flags);
|
||||
break;
|
||||
|
||||
default:
|
||||
GGML_ABORT("\nggml-hex: graph-compute %s is not supported\n", ggml_op_desc(node));
|
||||
}
|
||||
@@ -3254,6 +3284,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons
|
||||
supp = ggml_hexagon_supported_ssm_conv(sess, op);
|
||||
break;
|
||||
|
||||
case GGML_OP_CUMSUM:
|
||||
supp = ggml_hexagon_supported_cumsum(sess, op);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@ add_library(${HTP_LIB} SHARED
|
||||
repeat-ops.c
|
||||
argsort-ops.c
|
||||
ssm-conv.c
|
||||
cumsum-ops.c
|
||||
)
|
||||
|
||||
target_compile_definitions(${HTP_LIB} PRIVATE
|
||||
|
||||
267
ggml/src/ggml-hexagon/htp/cumsum-ops.c
Normal file
267
ggml/src/ggml-hexagon/htp/cumsum-ops.c
Normal file
@@ -0,0 +1,267 @@
|
||||
#pragma clang diagnostic ignored "-Wunused-variable"
|
||||
#pragma clang diagnostic ignored "-Wunused-function"
|
||||
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
||||
|
||||
#include <HAP_farf.h>
|
||||
#include <HAP_perf.h>
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
#include "htp-ctx.h"
|
||||
#include "htp-ops.h"
|
||||
#include "hvx-types.h"
|
||||
#include "hvx-utils.h"
|
||||
#include "hex-dma.h"
|
||||
|
||||
#define htp_cumsum_tensors_preamble \
|
||||
struct htp_tensor * restrict src0 = &octx->src0; \
|
||||
struct htp_tensor * restrict dst = &octx->dst; \
|
||||
\
|
||||
const uint32_t ne00 = src0->ne[0]; \
|
||||
const uint32_t ne01 = src0->ne[1]; \
|
||||
const uint32_t ne02 = src0->ne[2]; \
|
||||
const uint32_t ne03 = src0->ne[3]; \
|
||||
\
|
||||
const uint32_t ne0 = dst->ne[0]; \
|
||||
const uint32_t ne1 = dst->ne[1]; \
|
||||
const uint32_t ne2 = dst->ne[2]; \
|
||||
const uint32_t ne3 = dst->ne[3]; \
|
||||
\
|
||||
const uint32_t nb00 = src0->nb[0]; \
|
||||
const uint32_t nb01 = src0->nb[1]; \
|
||||
const uint32_t nb02 = src0->nb[2]; \
|
||||
const uint32_t nb03 = src0->nb[3]; \
|
||||
\
|
||||
const uint32_t nb0 = dst->nb[0]; \
|
||||
const uint32_t nb1 = dst->nb[1]; \
|
||||
const uint32_t nb2 = dst->nb[2]; \
|
||||
const uint32_t nb3 = dst->nb[3];
|
||||
|
||||
struct htp_cumsum_context {
|
||||
struct htp_ops_context * octx;
|
||||
size_t src_row_size;
|
||||
size_t dst_row_size;
|
||||
size_t src_row_size_aligned;
|
||||
size_t dst_row_size_aligned;
|
||||
uint32_t rows_per_thread;
|
||||
uint32_t total_rows;
|
||||
};
|
||||
|
||||
#define htp_cumsum_preamble \
|
||||
struct htp_cumsum_context * cctx = (struct htp_cumsum_context *) data; \
|
||||
struct htp_ops_context * octx = cctx->octx; \
|
||||
htp_cumsum_tensors_preamble; \
|
||||
dma_queue * dma_queue = octx->ctx->dma[ith];
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// HVX prefix scan helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#if __HVX_ARCH__ > 75
|
||||
static inline HVX_Vector hvx_cumsum_vadd(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vsf_vadd_VsfVsf(a, b);
|
||||
}
|
||||
#else
|
||||
static inline HVX_Vector hvx_cumsum_vadd(HVX_Vector a, HVX_Vector b) {
|
||||
return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b));
|
||||
}
|
||||
#endif // __HVX_ARCH__ > 75
|
||||
|
||||
static inline HVX_Vector hvx_prefix_scan_f32(HVX_Vector v, HVX_Vector carry_in) {
|
||||
const HVX_Vector zero = Q6_V_vsplat_R(0);
|
||||
|
||||
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 4));
|
||||
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 8));
|
||||
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 16));
|
||||
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 32));
|
||||
v = hvx_cumsum_vadd(v, Q6_V_vlalign_VVR(v, zero, 64));
|
||||
v = hvx_cumsum_vadd(v, carry_in);
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
static inline HVX_Vector hvx_splat_last_f32(HVX_Vector v) {
|
||||
return hvx_vec_repl4(Q6_V_vror_VR(v, 124));
|
||||
}
|
||||
|
||||
static inline void hvx_cumsum_row_f32(const float * restrict src, float * restrict dst, uint32_t n) {
|
||||
const uint32_t nvec = n / VLEN_FP32;
|
||||
const uint32_t nloe = n % VLEN_FP32;
|
||||
|
||||
HVX_Vector carry = Q6_V_vsplat_R(0);
|
||||
|
||||
for (uint32_t i = 0; i < nvec; i++) {
|
||||
HVX_Vector v = *((const HVX_UVector *) (src + i * VLEN_FP32));
|
||||
v = hvx_prefix_scan_f32(v, carry);
|
||||
hvx_vec_store_u(dst + i * VLEN_FP32, VLEN, v);
|
||||
carry = hvx_splat_last_f32(v);
|
||||
}
|
||||
|
||||
if (nloe) {
|
||||
float acc = hvx_vec_get_f32(carry);
|
||||
const float * src_tail = src + nvec * VLEN_FP32;
|
||||
float * dst_tail = dst + nvec * VLEN_FP32;
|
||||
for (uint32_t i = 0; i < nloe; i++) {
|
||||
acc += src_tail[i];
|
||||
dst_tail[i] = acc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per thread worker: Double-buffered DMA
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void cumsum_thread_f32_dma(unsigned int nth, unsigned int ith, void * data) {
|
||||
htp_cumsum_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint32_t ir0 = cctx->rows_per_thread * ith;
|
||||
const uint32_t ir1 = MIN(ir0 + cctx->rows_per_thread, cctx->total_rows);
|
||||
|
||||
if (ir0 >= ir1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t src_row_size = cctx->src_row_size;
|
||||
const size_t dst_row_size = cctx->dst_row_size;
|
||||
const size_t src_row_size_aligned = cctx->src_row_size_aligned;
|
||||
const size_t dst_row_size_aligned = cctx->dst_row_size_aligned;
|
||||
|
||||
const uint8_t * src_data = (const uint8_t *) src0->data;
|
||||
uint8_t * dst_data = (uint8_t *) dst->data;
|
||||
|
||||
uint8_t * src_spad = octx->src0_spad.data + (ith * src_row_size_aligned * 2);
|
||||
uint8_t * dst_spad = octx->dst_spad.data + (ith * dst_row_size_aligned * 2);
|
||||
|
||||
for (uint32_t ir = ir0, spad_idx = 0; ir < ir1 && spad_idx < 2; ir++, spad_idx++) {
|
||||
// Dummy dst writeback to establish queue ordering
|
||||
dma_queue_push_vtcm_to_ddr(dma_queue,
|
||||
dma_make_ptr(dst_data, dst_spad + (spad_idx * dst_row_size_aligned)),
|
||||
dst_row_size, dst_row_size_aligned, 0);
|
||||
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue,
|
||||
dma_make_ptr(src_spad + (spad_idx * src_row_size_aligned),
|
||||
src_data + (ir * src_row_size)),
|
||||
src_row_size_aligned, src_row_size, 1);
|
||||
}
|
||||
|
||||
for (uint32_t ir = ir0; ir < ir1; ir++) {
|
||||
float * dst_spad_row = (float *) dma_queue_pop(dma_queue).src;
|
||||
float * src_spad_row = (float *) dma_queue_pop(dma_queue).dst;
|
||||
|
||||
hvx_cumsum_row_f32(src_spad_row, dst_spad_row, ne00);
|
||||
|
||||
dma_queue_push_vtcm_to_ddr(dma_queue,
|
||||
dma_make_ptr(dst_data + (ir * dst_row_size), (uint8_t *) dst_spad_row),
|
||||
dst_row_size, dst_row_size_aligned, 1);
|
||||
|
||||
const uint32_t next_row = ir + 2;
|
||||
if (next_row < ir1) {
|
||||
dma_queue_push_ddr_to_vtcm(dma_queue,
|
||||
dma_make_ptr((uint8_t *) src_spad_row, src_data + (next_row * src_row_size)),
|
||||
src_row_size_aligned, src_row_size, 1);
|
||||
}
|
||||
}
|
||||
|
||||
dma_queue_flush(dma_queue);
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "cumsum-f32-dma %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
|
||||
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Per thread worker: Direct HVX (no DMA)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
static void cumsum_thread_f32(unsigned int nth, unsigned int ith, void * data) {
|
||||
htp_cumsum_preamble;
|
||||
|
||||
uint64_t t1, t2;
|
||||
t1 = HAP_perf_get_qtimer_count();
|
||||
|
||||
const uint8_t * src_data = (const uint8_t *) src0->data;
|
||||
uint8_t * dst_data = (uint8_t *) dst->data;
|
||||
|
||||
const uint32_t ir0 = cctx->rows_per_thread * ith;
|
||||
const uint32_t ir1 = MIN(ir0 + cctx->rows_per_thread, cctx->total_rows);
|
||||
|
||||
for (uint32_t ir = ir0; ir < ir1; ir++) {
|
||||
const float * restrict src_row = (const float *) (src_data + ir * cctx->src_row_size);
|
||||
float * restrict dst_row = (float *) (dst_data + ir * cctx->dst_row_size);
|
||||
hvx_cumsum_row_f32(src_row, dst_row, ne00);
|
||||
}
|
||||
|
||||
t2 = HAP_perf_get_qtimer_count();
|
||||
|
||||
FARF(HIGH, "cumsum-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n",
|
||||
ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0, ir1,
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
|
||||
(unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
|
||||
}
|
||||
|
||||
int op_cumsum_f32(struct htp_ops_context * octx) {
|
||||
const struct htp_tensor * src0 = &octx->src0;
|
||||
const struct htp_tensor * dst = &octx->dst;
|
||||
|
||||
if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) {
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
const uint32_t total_rows = src0->ne[1] * src0->ne[2] * src0->ne[3];
|
||||
const uint32_t n_threads = MIN(octx->n_threads, total_rows);
|
||||
|
||||
const size_t src_row_size = src0->nb[1];
|
||||
const size_t dst_row_size = dst->nb[1];
|
||||
const size_t src_row_size_aligned = hex_round_up(src_row_size, VLEN);
|
||||
const size_t dst_row_size_aligned = hex_round_up(dst_row_size, VLEN);
|
||||
|
||||
// 2 ping-pong buffers per thread for src and dst
|
||||
const size_t spad_per_thread = 2 * (src_row_size_aligned + dst_row_size_aligned);
|
||||
|
||||
octx->src0_spad.size_per_thread = src_row_size_aligned * 2;
|
||||
octx->dst_spad.size_per_thread = dst_row_size_aligned * 2;
|
||||
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
|
||||
octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
|
||||
octx->src0_spad.data = octx->ctx->vtcm_base;
|
||||
octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size;
|
||||
|
||||
struct htp_cumsum_context cctx = {
|
||||
.octx = octx,
|
||||
.src_row_size = src_row_size,
|
||||
.dst_row_size = dst_row_size,
|
||||
.src_row_size_aligned = src_row_size_aligned,
|
||||
.dst_row_size_aligned = dst_row_size_aligned,
|
||||
.rows_per_thread = (total_rows + n_threads - 1) / n_threads,
|
||||
.total_rows = total_rows,
|
||||
};
|
||||
|
||||
if (octx->ctx->vtcm_size < spad_per_thread * n_threads) {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, cumsum_thread_f32, &cctx, n_threads);
|
||||
} else {
|
||||
worker_pool_run_func(octx->ctx->worker_pool, cumsum_thread_f32_dma, &cctx, n_threads);
|
||||
}
|
||||
|
||||
return HTP_STATUS_OK;
|
||||
}
|
||||
|
||||
int op_cumsum(struct htp_ops_context * octx) {
|
||||
int err = HTP_STATUS_OK;
|
||||
struct htp_tensor * dst = &octx->dst;
|
||||
|
||||
switch (dst->type) {
|
||||
case HTP_TYPE_F32:
|
||||
err = op_cumsum_f32(octx);
|
||||
break;
|
||||
default:
|
||||
err = HTP_STATUS_NO_SUPPORT;
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
@@ -75,6 +75,7 @@ enum htp_op {
|
||||
HTP_OP_SUM_ROWS,
|
||||
HTP_OP_SSM_CONV,
|
||||
HTP_OP_REPEAT,
|
||||
HTP_OP_CUMSUM,
|
||||
INVALID
|
||||
};
|
||||
|
||||
|
||||
@@ -60,5 +60,6 @@ int op_cpy(struct htp_ops_context * octx);
|
||||
int op_repeat(struct htp_ops_context * octx);
|
||||
int op_argsort(struct htp_ops_context * octx);
|
||||
int op_ssm_conv(struct htp_ops_context * octx);
|
||||
int op_cumsum(struct htp_ops_context * octx);
|
||||
|
||||
#endif /* HTP_OPS_H */
|
||||
|
||||
@@ -16,8 +16,10 @@
|
||||
|
||||
#if __HVX_ARCH__ < 79
|
||||
#define HVX_OP_MUL_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b))
|
||||
#define HVX_OP_MUL_F16(a, b) Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b))
|
||||
#else
|
||||
#define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b)
|
||||
#define HVX_OP_MUL_F16(a, b) Q6_Vhf_vmpy_VhfVhf(a, b)
|
||||
#endif
|
||||
|
||||
// Compute div by scaler in f32. Requires first by expanding fp32 to fp16 and converting the result back to fp32.
|
||||
@@ -43,46 +45,67 @@ static inline HVX_Vector hvx_div_mul_f16_const_using_f32(HVX_Vector vec1_hf, HVX
|
||||
return res;
|
||||
}
|
||||
|
||||
#define hvx_div_scaler_f16_loop_body(dst_type, src_type, vec_store) \
|
||||
do { \
|
||||
dst_type * restrict vdst = (dst_type *) dst; \
|
||||
src_type * restrict vsrc = (src_type *) src; \
|
||||
HVX_Vector hf_one = Q6_Vh_vsplat_R(0x3C00); \
|
||||
\
|
||||
const uint32_t nvec = n / VLEN_FP16; \
|
||||
const uint32_t nloe = n % VLEN_FP16; \
|
||||
\
|
||||
uint32_t i = 0; \
|
||||
\
|
||||
_Pragma("unroll(4)") \
|
||||
for (; i < nvec; i++) { \
|
||||
HVX_Vector res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \
|
||||
vdst[i] = res; \
|
||||
} \
|
||||
if (nloe) { \
|
||||
HVX_Vector res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \
|
||||
vec_store((void *) &vdst[i], nloe * SIZEOF_FP16, res); \
|
||||
} \
|
||||
// Variant for <v79: Use pre-computed f16 reciprocal constant
|
||||
static inline HVX_Vector hvx_div_mul_f16_const_using_f16(HVX_Vector vec1_hf, HVX_Vector const_inv_hf) {
|
||||
// Multiply by pre-computed f16 reciprocal constant
|
||||
return HVX_OP_MUL_F16(vec1_hf, const_inv_hf);
|
||||
}
|
||||
|
||||
#define hvx_div_scaler_f16_loop_body(dst_type, src_type, vec_store) \
|
||||
do { \
|
||||
dst_type * restrict vdst = (dst_type *) dst; \
|
||||
src_type * restrict vsrc = (src_type *) src; \
|
||||
\
|
||||
HVX_Vector hf_one = Q6_Vh_vsplat_R(0x3C00); \
|
||||
\
|
||||
const uint32_t nvec = n / VLEN_FP16; \
|
||||
const uint32_t nloe = n % VLEN_FP16; \
|
||||
\
|
||||
uint32_t i = 0; \
|
||||
\
|
||||
_Pragma("unroll(4)") \
|
||||
for (; i < nvec; i++) { \
|
||||
HVX_Vector res; \
|
||||
if (__HVX_ARCH__ < 79) { \
|
||||
res = hvx_div_mul_f16_const_using_f16(vsrc[i], val_vec_f16); \
|
||||
} else { \
|
||||
res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \
|
||||
} \
|
||||
vdst[i] = res; \
|
||||
} \
|
||||
if (nloe) { \
|
||||
HVX_Vector res; \
|
||||
if (__HVX_ARCH__ < 79) { \
|
||||
res = hvx_div_mul_f16_const_using_f16(vsrc[i], val_vec_f16); \
|
||||
} else { \
|
||||
res = hvx_div_mul_f16_const_using_f32(vsrc[i], val_vec_f32, hf_one); \
|
||||
} \
|
||||
vec_store((void *) &vdst[i], nloe * SIZEOF_FP16, res); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
static inline void hvx_div_scalar_f16_aa(uint8_t * restrict dst, const uint8_t * restrict src, const _Float16 val, uint32_t n) {
|
||||
const HVX_Vector val_vec_f32 = hvx_vec_splat_f32(1.0f/((float)val));
|
||||
const HVX_Vector val_vec_f16 = hvx_vec_splat_f16(1.0f / val);
|
||||
assert((uintptr_t) dst % 128 == 0);
|
||||
assert((uintptr_t) src % 128 == 0);
|
||||
hvx_div_scaler_f16_loop_body(HVX_Vector, HVX_Vector, hvx_vec_store_a);
|
||||
}
|
||||
static inline void hvx_div_scalar_f16_au(uint8_t * restrict dst, const uint8_t * restrict src, const _Float16 val, uint32_t n) {
|
||||
const HVX_Vector val_vec_f32 = hvx_vec_splat_f32(1.0f/((float)val));
|
||||
const HVX_Vector val_vec_f16 = hvx_vec_splat_f16(1.0f / val);
|
||||
assert((uintptr_t) dst % 128 == 0);
|
||||
hvx_div_scaler_f16_loop_body(HVX_Vector, HVX_UVector, hvx_vec_store_a);
|
||||
}
|
||||
static inline void hvx_div_scalar_f16_ua(uint8_t * restrict dst, const uint8_t * restrict src, const _Float16 val, uint32_t n) {
|
||||
const HVX_Vector val_vec_f32 = hvx_vec_splat_f32(1.0f/((float)val));
|
||||
const HVX_Vector val_vec_f16 = hvx_vec_splat_f16(1.0f / val);
|
||||
assert((uintptr_t) src % 128 == 0);
|
||||
hvx_div_scaler_f16_loop_body(HVX_UVector, HVX_Vector, hvx_vec_store_u);
|
||||
}
|
||||
static inline void hvx_div_scalar_f16_uu(uint8_t * restrict dst, const uint8_t * restrict src, const _Float16 val, uint32_t n) {
|
||||
const HVX_Vector val_vec_f32 = hvx_vec_splat_f32(1.0f/((float)val));
|
||||
const HVX_Vector val_vec_f16 = hvx_vec_splat_f16(1.0f / val);
|
||||
hvx_div_scaler_f16_loop_body(HVX_UVector, HVX_UVector, hvx_vec_store_u);
|
||||
}
|
||||
|
||||
@@ -128,13 +151,25 @@ static inline HVX_Vector hvx_vec_div_f16_using_f32(HVX_Vector vec1, HVX_Vector v
|
||||
return recip;
|
||||
}
|
||||
|
||||
// Hybrid approach: f16 reciprocal for <v79, f32 precision for >=v79
|
||||
static inline HVX_Vector hvx_vec_hybrid_div_f16(HVX_Vector vec1, HVX_Vector vec2, HVX_Vector f32_nan_inf_mask, HVX_Vector f16_nan_inf_mask, HVX_Vector vec_hf_one_1_0) {
|
||||
#if __HVX_ARCH__ < 79
|
||||
// For older architectures, use f16 reciprocal to avoid NaN/-inf issues
|
||||
HVX_Vector vec2_inv = hvx_vec_inverse_f16_guard(vec2, f16_nan_inf_mask);
|
||||
return HVX_OP_MUL_F16(vec1, vec2_inv);
|
||||
#else
|
||||
return hvx_vec_div_f16_using_f32(vec1, vec2, f32_nan_inf_mask, vec_hf_one_1_0);
|
||||
#endif
|
||||
}
|
||||
|
||||
#define hvx_div_f16_loop_body(dst_type, src0_type, src1_type, vec_store) \
|
||||
do { \
|
||||
dst_type * restrict vdst = (dst_type *) dst; \
|
||||
src0_type * restrict vsrc0 = (src0_type *) src0; \
|
||||
src1_type * restrict vsrc1 = (src1_type *) src1; \
|
||||
\
|
||||
const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(0x7f800000); \
|
||||
const HVX_Vector f32_nan_inf_mask = Q6_V_vsplat_R(0x7f800000); \
|
||||
const HVX_Vector f16_nan_inf_mask = Q6_Vh_vsplat_R(0x7c00); \
|
||||
const HVX_Vector hf_one = Q6_Vh_vsplat_R(0x3C00); \
|
||||
\
|
||||
const uint32_t nvec = n / VLEN_FP16; \
|
||||
@@ -144,11 +179,15 @@ static inline HVX_Vector hvx_vec_div_f16_using_f32(HVX_Vector vec1, HVX_Vector v
|
||||
\
|
||||
_Pragma("unroll(4)") \
|
||||
for (; i < nvec; i++) { \
|
||||
HVX_Vector res = hvx_vec_div_f16_using_f32(vsrc0[i], vsrc1[i], nan_inf_mask, hf_one); \
|
||||
HVX_Vector res = hvx_vec_hybrid_div_f16(vsrc0[i], vsrc1[i], \
|
||||
f32_nan_inf_mask, f16_nan_inf_mask, \
|
||||
hf_one); \
|
||||
vdst[i] = res; \
|
||||
} \
|
||||
if (nloe) { \
|
||||
HVX_Vector res = hvx_vec_div_f16_using_f32(vsrc0[i], vsrc1[i], nan_inf_mask, hf_one); \
|
||||
HVX_Vector res = hvx_vec_hybrid_div_f16(vsrc0[i], vsrc1[i], \
|
||||
f32_nan_inf_mask, f16_nan_inf_mask, \
|
||||
hf_one); \
|
||||
vec_store((void *) &vdst[i], nloe * SIZEOF_FP16, res); \
|
||||
} \
|
||||
} while(0)
|
||||
@@ -247,5 +286,6 @@ HVX_DIV_DISPATCHER(hvx_div_f32)
|
||||
HVX_DIV_DISPATCHER(hvx_div_f16)
|
||||
|
||||
#undef HVX_OP_MUL_F32
|
||||
#undef HVX_OP_MUL_F16
|
||||
|
||||
#endif // HVX_DIV_H
|
||||
|
||||
@@ -860,6 +860,41 @@ static void proc_ssm_conv_req(struct htp_context * ctx, struct htp_general_req *
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
}
|
||||
|
||||
static void proc_cumsum_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
||||
struct dspqueue_buffer rsp_bufs[1];
|
||||
|
||||
// We've written to the output buffer, we'd also need to flush it
|
||||
rsp_bufs[0].fd = bufs[1].fd;
|
||||
rsp_bufs[0].ptr = bufs[1].ptr;
|
||||
rsp_bufs[0].offset = bufs[1].offset;
|
||||
rsp_bufs[0].size = bufs[1].size;
|
||||
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
||||
|
||||
// Setup Op context
|
||||
struct htp_ops_context octx = { 0 };
|
||||
octx.ctx = ctx;
|
||||
octx.src0 = req->src0;
|
||||
octx.dst = req->dst;
|
||||
octx.flags = req->flags;
|
||||
octx.op = req->op;
|
||||
octx.src0.data = (uint32_t) bufs[0].ptr;
|
||||
octx.dst.data = (uint32_t) bufs[1].ptr;
|
||||
octx.n_threads = ctx->n_threads;
|
||||
|
||||
struct profile_data prof;
|
||||
profile_start(&prof);
|
||||
|
||||
uint32_t rsp_status = HTP_STATUS_INTERNAL_ERR;
|
||||
if (vtcm_acquire(ctx) == AEE_SUCCESS) {
|
||||
rsp_status = op_cumsum(&octx);
|
||||
vtcm_release(ctx);
|
||||
}
|
||||
|
||||
profile_stop(&prof);
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
}
|
||||
|
||||
static void proc_activations_req(struct htp_context * ctx,
|
||||
struct htp_general_req * req,
|
||||
struct dspqueue_buffer * bufs,
|
||||
@@ -1474,6 +1509,14 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) {
|
||||
proc_ssm_conv_req(ctx, &req, bufs);
|
||||
break;
|
||||
|
||||
case HTP_OP_CUMSUM:
|
||||
if (n_bufs != 2) {
|
||||
FARF(ERROR, "Bad cumsum-req buffer list");
|
||||
continue;
|
||||
}
|
||||
proc_cumsum_req(ctx, &req, bufs);
|
||||
break;
|
||||
|
||||
default:
|
||||
FARF(ERROR, "Unknown Op %u", req.op);
|
||||
break;
|
||||
|
||||
@@ -67,34 +67,61 @@ static void hvx_fast_rms_norm_f32(const uint8_t * restrict src,
|
||||
uint8_t * restrict pad,
|
||||
const int num_elems,
|
||||
float epsilon) {
|
||||
(void)pad;
|
||||
|
||||
const HVX_Vector * restrict v_src = (HVX_Vector *) src;
|
||||
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
|
||||
|
||||
HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000);
|
||||
const int nvec = num_elems / VLEN_FP32; // number of full vectors
|
||||
const int nloe = num_elems % VLEN_FP32; // leftover elements
|
||||
|
||||
// Compute sum of squares for full vectors
|
||||
HVX_Vector sum_v = Q6_V_vsplat_R(0x00000000);
|
||||
HVX_Vector epsilon_v = hvx_vec_splat_f32(epsilon);
|
||||
|
||||
int step_of_1 = num_elems >> 5;
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < step_of_1; i++) {
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
HVX_Vector v1 = v_src[i];
|
||||
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
|
||||
}
|
||||
|
||||
sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v)); // replicated over all lanes
|
||||
// Handle tail elements using vectorized ops with masking
|
||||
if (nloe > 0) {
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
|
||||
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, v1);
|
||||
sum_v = Q6_Vqf32_vadd_Vqf32Vqf32(sum_v, v2);
|
||||
}
|
||||
|
||||
// Reduce HVX sum
|
||||
sum_v = hvx_vec_reduce_sum_f32(Q6_Vsf_equals_Vqf32(sum_v));
|
||||
|
||||
HVX_Vector t_v = hvx_vec_splat_f32((float) num_elems);
|
||||
HVX_Vector denom_v = hvx_vec_inverse_f32(t_v);
|
||||
HVX_Vector mean_v = Q6_Vqf32_vmpy_VsfVsf(sum_v, denom_v);
|
||||
HVX_Vector mean_epsilon_v = Q6_Vqf32_vadd_Vqf32Vsf(mean_v, epsilon_v);
|
||||
|
||||
// Scale full vectors
|
||||
HVX_Vector scale_v = hvx_vec_rsqrt_f32(Q6_Vsf_equals_Vqf32(mean_epsilon_v));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int i = 0; i < step_of_1; i++) {
|
||||
for (int i = 0; i < nvec; i++) {
|
||||
HVX_Vector v1 = v_src[i];
|
||||
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
|
||||
v_dst[i] = Q6_Vsf_equals_Vqf32(v2);
|
||||
v_dst[i] = Q6_Vsf_equals_Vqf32(v2);
|
||||
}
|
||||
|
||||
// Handle tail elements using vectorized ops with masking
|
||||
if (nloe > 0) {
|
||||
|
||||
HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe * 4);
|
||||
HVX_Vector v1 = Q6_V_vand_QV(bmask, v_src[nvec]);
|
||||
HVX_Vector v2 = Q6_Vqf32_vmpy_VsfVsf(v1, scale_v);
|
||||
HVX_Vector result = Q6_Vsf_equals_Vqf32(v2);
|
||||
|
||||
// Store with masking to avoid overwriting memory beyond the tensor
|
||||
hvx_vec_store_a(&v_dst[nvec], nloe * 4, result);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -9612,6 +9612,9 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
cl_mem B_image1d;
|
||||
cl_mem B_sub_buffer;
|
||||
cl_mem S_image1d;
|
||||
// for B transpose
|
||||
cl_mem B_image1d_trans = nullptr;
|
||||
cl_mem B_d = nullptr;
|
||||
|
||||
cl_mem D_image1d;
|
||||
cl_mem D_sub_buffer;
|
||||
@@ -9703,9 +9706,6 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
global_work_size[2] = 1;
|
||||
} else {
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
cl_mem B_image1d_trans = nullptr;
|
||||
// for B transpose
|
||||
cl_mem B_d = nullptr;
|
||||
int padding;
|
||||
|
||||
//how many extra elements beyond multiple of 8
|
||||
@@ -9800,6 +9800,12 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t
|
||||
CL_CHECK(clReleaseMemObject(S_image1d));
|
||||
CL_CHECK(clReleaseMemObject(D_sub_buffer));
|
||||
CL_CHECK(clReleaseMemObject(D_image1d));
|
||||
if (B_image1d_trans) {
|
||||
CL_CHECK(clReleaseMemObject(B_image1d_trans));
|
||||
}
|
||||
if (B_d) {
|
||||
CL_CHECK(clReleaseMemObject(B_d));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED(backend);
|
||||
GGML_UNUSED(src0);
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-sycl.h"
|
||||
#include "presets.hpp"
|
||||
#include "type.hpp"
|
||||
#include "sycl_hw.hpp"
|
||||
|
||||
namespace syclexp = sycl::ext::oneapi::experimental;
|
||||
@@ -965,4 +966,10 @@ static T block_reduce(T val, T * shared_vals, int block_size_template) {
|
||||
return val;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float ggml_sycl_ue4m3_to_fp32(uint8_t x) {
|
||||
const uint32_t bits = x * (x != 0x7F && x != 0xFF);
|
||||
const __nv_fp8_e4m3 xf = *reinterpret_cast<const __nv_fp8_e4m3 *>(&bits);
|
||||
return static_cast<float>(xf) / 2;
|
||||
}
|
||||
|
||||
#endif // GGML_SYCL_COMMON_HPP
|
||||
|
||||
@@ -482,6 +482,18 @@ static void dequantize_row_mxfp4_sycl(const void * vx, dst_t * y, const int64_t
|
||||
});
|
||||
}
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_row_nvfp4_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(k % QK_NVFP4 == 0);
|
||||
const int nb = k / QK_NVFP4;
|
||||
stream->parallel_for(
|
||||
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)),
|
||||
[=](sycl::nd_item<3> item_ct1) {
|
||||
dequantize_block_nvfp4(vx, y, k);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static void dequantize_block_nc(const void * __restrict__ vx, dst_t * __restrict__ y,
|
||||
const int64_t ne00, const int64_t ne01, const int64_t ne02,
|
||||
@@ -641,6 +653,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
return dequantize_row_iq4_nl_sycl;
|
||||
case GGML_TYPE_MXFP4:
|
||||
return dequantize_row_mxfp4_sycl;
|
||||
case GGML_TYPE_NVFP4:
|
||||
return dequantize_row_nvfp4_sycl;
|
||||
case GGML_TYPE_F32:
|
||||
return convert_unary_sycl<float>;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
@@ -648,6 +662,7 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
|
||||
return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
|
||||
#endif
|
||||
default:
|
||||
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
@@ -708,6 +723,8 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
return dequantize_row_iq4_nl_sycl;
|
||||
case GGML_TYPE_MXFP4:
|
||||
return dequantize_row_mxfp4_sycl;
|
||||
case GGML_TYPE_NVFP4:
|
||||
return dequantize_row_nvfp4_sycl;
|
||||
case GGML_TYPE_F16:
|
||||
return convert_unary_sycl<sycl::half>;
|
||||
#ifdef GGML_SYCL_HAS_BF16
|
||||
@@ -715,6 +732,7 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
|
||||
return convert_unary_sycl<sycl::ext::oneapi::bfloat16>;
|
||||
#endif
|
||||
default:
|
||||
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(type));
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -838,4 +838,36 @@ static void dequantize_block_mxfp4(const void * __restrict__ vx, dst_t * __restr
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename dst_t>
|
||||
static void dequantize_block_nvfp4(
|
||||
const void * __restrict__ vx,
|
||||
dst_t * __restrict__ yy,
|
||||
const int64_t ne) {
|
||||
auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
|
||||
const int64_t i = item_ct1.get_group(2);
|
||||
const int tid = item_ct1.get_local_id(2);
|
||||
|
||||
const int64_t base = i * QK_NVFP4;
|
||||
if (base >= ne) {
|
||||
return;
|
||||
}
|
||||
|
||||
const block_nvfp4 * x = (const block_nvfp4 *) vx;
|
||||
const block_nvfp4 & xb = x[i];
|
||||
|
||||
const int sub = tid / (QK_NVFP4_SUB / 2);
|
||||
const int j = tid % (QK_NVFP4_SUB / 2);
|
||||
|
||||
const float d = ggml_sycl_ue4m3_to_fp32(xb.d[sub]);
|
||||
const uint8_t q = xb.qs[sub * (QK_NVFP4_SUB / 2) + j];
|
||||
|
||||
const int64_t y0 = base + sub * QK_NVFP4_SUB + j;
|
||||
const int64_t y1 = y0 + QK_NVFP4_SUB / 2;
|
||||
|
||||
yy[y0] = ggml_sycl_cast<dst_t>(d * kvalues_mxfp4[q & 0x0F]);
|
||||
yy[y1] = ggml_sycl_cast<dst_t>(d * kvalues_mxfp4[q >> 4]);
|
||||
}
|
||||
|
||||
|
||||
#endif // GGML_SYCL_DEQUANTIZE_HPP
|
||||
|
||||
@@ -569,9 +569,15 @@ static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
|
||||
SYCL_CHECK(
|
||||
CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
|
||||
|
||||
SYCL_CHECK(CHECK_TRY_ERROR((*stream)
|
||||
.memset(ctx->dev_ptr, value, buffer->size)
|
||||
.wait()));
|
||||
constexpr size_t MAX_CHUNK = 2ULL << 30; // 2 GiB
|
||||
for (size_t off = 0; off < buffer->size; off += MAX_CHUNK) {
|
||||
size_t chunk = std::min(buffer->size - off, MAX_CHUNK);
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
(*stream)
|
||||
.memset(static_cast<char*>(ctx->dev_ptr) + off, value, chunk)
|
||||
.wait()
|
||||
));
|
||||
}
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
|
||||
@@ -613,6 +613,23 @@ static void mul_mat_vec_mxfp4_q8_1_sycl(const void * vx, const void * vy, float
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_vec_nvfp4_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
|
||||
dpct::queue_ptr stream) {
|
||||
GGML_ASSERT(ncols % QK_NVFP4 == 0);
|
||||
const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
|
||||
const sycl::range<3> block_nums(1, 1, block_num_y);
|
||||
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
||||
|
||||
{
|
||||
stream->submit([&](sycl::handler & cgh) {
|
||||
cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
|
||||
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
|
||||
mul_mat_vec_q<QK_NVFP4, QI_NVFP4, block_nvfp4, VDR_NVFP4_Q8_1_MMVQ, vec_dot_nvfp4_q8_1>(
|
||||
vx, vy, dst, ncols, nrows, item_ct1);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
|
||||
float *dst, const int ncols,
|
||||
@@ -1145,8 +1162,11 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens
|
||||
case GGML_TYPE_MXFP4:
|
||||
mul_mat_vec_mxfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
case GGML_TYPE_NVFP4:
|
||||
mul_mat_vec_nvfp4_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("fatal error");
|
||||
GGML_ABORT("fatal error: unsupport data type=%s\n", ggml_type_name(src0->type));
|
||||
}
|
||||
}
|
||||
GGML_UNUSED(src1);
|
||||
|
||||
112
ggml/src/ggml-sycl/type.hpp
Normal file
112
ggml/src/ggml-sycl/type.hpp
Normal file
@@ -0,0 +1,112 @@
|
||||
#pragma once
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
|
||||
inline uint8_t float_to_e4m3(float f)
|
||||
{
|
||||
if (sycl::isnan(f)) {
|
||||
return 0x7F; // Canonical NaN (positive)
|
||||
}
|
||||
|
||||
uint32_t bits = sycl::bit_cast<uint32_t>(f);
|
||||
uint32_t sign = (bits >> 31) & 0x1u;
|
||||
uint32_t exp = (bits >> 23) & 0xFFu;
|
||||
uint32_t mant = bits & 0x7FFFFFu;
|
||||
|
||||
// Zero
|
||||
if (exp == 0 && mant == 0) {
|
||||
return static_cast<uint8_t>(sign << 7);
|
||||
}
|
||||
|
||||
// Extract biased exponent and mantissa for FP8
|
||||
int e = static_cast<int>(exp) - 127; // true exponent (IEEE bias 127)
|
||||
uint32_t m = mant;
|
||||
|
||||
// Handle very large values → NaN (NVIDIA behavior for E4M3)
|
||||
if (e > 7) { // max exponent for E4M3 is 7 (biased 14)
|
||||
return static_cast<uint8_t>((sign << 7) | 0x7F);
|
||||
}
|
||||
|
||||
// Handle subnormals and normal numbers
|
||||
if (e < -6) { // smallest normal exponent is -6
|
||||
// Subnormal in FP8: shift mantissa right
|
||||
int shift = -6 - e;
|
||||
m = (m | 0x800000u) >> (shift + 1); // +1 because we lose the implicit 1 position
|
||||
if (shift > 23) m = 0;
|
||||
} else {
|
||||
// Normal number: adjust exponent bias from 127 to 7
|
||||
int new_exp = e + 7;
|
||||
m = (m >> 20) & 0x7u; // take top 3 mantissa bits (after implicit 1)
|
||||
m |= (static_cast<uint32_t>(new_exp) << 3);
|
||||
}
|
||||
|
||||
// Round-to-nearest-even (simple guard + round bit)
|
||||
// For better accuracy you can add sticky bit, but this is sufficient for most use cases
|
||||
uint32_t round_bit = (mant >> 19) & 0x1u; // bit after the 3 mantissa bits
|
||||
if (round_bit) {
|
||||
m += 1;
|
||||
// Carry into exponent if mantissa overflows
|
||||
if ((m & 0x8u) != 0) {
|
||||
m = (m & 0x7u) | ((m & 0x38u) << 1); // simple carry handling
|
||||
// If exponent overflows after carry → NaN
|
||||
if ((m >> 3) > 14) {
|
||||
return static_cast<uint8_t>((sign << 7) | 0x7F);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t result = static_cast<uint8_t>((sign << 7) | (m & 0x7F));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline float e4m3_to_float(uint8_t x)
|
||||
{
|
||||
if (x == 0) return 0.0f;
|
||||
|
||||
uint8_t sign = (x >> 7) & 0x1u;
|
||||
uint8_t exp = (x >> 3) & 0xFu;
|
||||
uint8_t mant = x & 0x7u;
|
||||
|
||||
// NaN (NVIDIA uses 0x7F / 0xFF as NaN)
|
||||
if (exp == 0xF && mant != 0) {
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
if (exp == 0xF) { // 0x7F or 0xFF treated as NaN
|
||||
return std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
|
||||
float val;
|
||||
|
||||
if (exp == 0) {
|
||||
// Subnormal
|
||||
val = mant * (1.0f / 8.0f) * sycl::pow(2.0f, -6.0f);
|
||||
} else {
|
||||
// Normal: implicit leading 1 + bias 7
|
||||
val = (1.0f + mant / 8.0f) * sycl::pow(2.0f, static_cast<float>(exp) - 7.0f);
|
||||
}
|
||||
|
||||
return sign ? -val : val;
|
||||
}
|
||||
|
||||
// The actual type definition
|
||||
struct __nv_fp8_e4m3 {
|
||||
uint8_t raw;
|
||||
|
||||
__nv_fp8_e4m3() = default;
|
||||
|
||||
explicit __nv_fp8_e4m3(float f) : raw(float_to_e4m3(f)) {}
|
||||
explicit __nv_fp8_e4m3(sycl::half h) : raw(float_to_e4m3(static_cast<float>(h))) {}
|
||||
|
||||
operator float() const { return e4m3_to_float(raw); }
|
||||
operator sycl::half() const { return static_cast<sycl::half>(static_cast<float>(*this)); }
|
||||
|
||||
// Allow direct access for vector loads/stores
|
||||
operator uint8_t&() { return raw; }
|
||||
operator uint8_t() const { return raw; }
|
||||
};
|
||||
|
||||
using __nv_fp8x2_e4m3 = sycl::vec<__nv_fp8_e4m3, 2>;
|
||||
using __nv_fp8x4_e4m3 = sycl::vec<__nv_fp8_e4m3, 4>;
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
|
||||
#include "dpct/helper.hpp"
|
||||
#include "ggml.h"
|
||||
#include "type.hpp"
|
||||
#include "quants.hpp"
|
||||
|
||||
typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
|
||||
@@ -31,6 +32,18 @@ static __dpct_inline__ int get_int_b1(const void * x, const int & i32) {
|
||||
return x32;
|
||||
}
|
||||
|
||||
static __dpct_inline__ int get_int_b2(const void * x, const int & i32) {
|
||||
const uint16_t * x16 = (const uint16_t *) x; // assume at least 2 byte alignment
|
||||
|
||||
int x32 = x16[2*i32 + 0] << 0;
|
||||
x32 |= x16[2*i32 + 1] << 16;
|
||||
|
||||
return x32;
|
||||
}
|
||||
|
||||
static __dpct_inline__ int get_int_b4(const void * x, const int & i32) {
|
||||
return ((const int *) x)[i32]; // assume at least 4 byte alignment
|
||||
}
|
||||
|
||||
static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
|
||||
const uint16_t* x16 =
|
||||
@@ -755,6 +768,35 @@ static __dpct_inline__ float vec_dot_mxfp4_q8_1(const void * __restrict__ vbq,
|
||||
return d * sumi;
|
||||
}
|
||||
|
||||
#define VDR_NVFP4_Q8_1_MMVQ 4
|
||||
#define VDR_NVFP4_Q8_1_MMQ 8
|
||||
|
||||
static __dpct_inline__ float vec_dot_nvfp4_q8_1(const void * __restrict__ vbq,
|
||||
const block_q8_1 * __restrict__ bq8_1,
|
||||
const int32_t & iqs) {
|
||||
const block_nvfp4 * bq4 = (const block_nvfp4 *) vbq;
|
||||
float sum = 0.0f;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < VDR_NVFP4_Q8_1_MMVQ/2; i++) {
|
||||
const int32_t iqs0 = iqs + 2*i;
|
||||
const int32_t iqs1 = iqs0 + 1;
|
||||
const int32_t is = iqs0 >> 1;
|
||||
const sycl::int2 v0 = get_int_from_table_16(get_int_b4(bq4->qs, iqs0), kvalues_mxfp4);
|
||||
const sycl::int2 v1 = get_int_from_table_16(get_int_b4(bq4->qs, iqs1), kvalues_mxfp4);
|
||||
const block_q8_1 * bq8 = bq8_1 + (is >> 1);
|
||||
const int32_t i8 = ((is & 1) << 2);
|
||||
|
||||
int sumi = ggml_sycl_dp4a(v0.x(), get_int_b4(bq8->qs, i8 + 0), 0);
|
||||
sumi = ggml_sycl_dp4a(v0.y(), get_int_b4(bq8->qs, i8 + 2), sumi);
|
||||
sumi = ggml_sycl_dp4a(v1.x(), get_int_b4(bq8->qs, i8 + 1), sumi);
|
||||
sumi = ggml_sycl_dp4a(v1.y(), get_int_b4(bq8->qs, i8 + 3), sumi);
|
||||
|
||||
const float d = ggml_sycl_ue4m3_to_fp32(bq4->d[is]) * (bq8->ds)[0];
|
||||
sum += d * float(sumi);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
static __dpct_inline__ float
|
||||
vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
|
||||
|
||||
45
models/templates/LFM2.5-Instruct.jinja
Normal file
45
models/templates/LFM2.5-Instruct.jinja
Normal file
@@ -0,0 +1,45 @@
|
||||
{{- bos_token -}}
|
||||
{%- set keep_past_thinking = keep_past_thinking | default(false) -%}
|
||||
{%- set ns = namespace(system_prompt="") -%}
|
||||
{%- if messages[0]["role"] == "system" -%}
|
||||
{%- set ns.system_prompt = messages[0]["content"] -%}
|
||||
{%- set messages = messages[1:] -%}
|
||||
{%- endif -%}
|
||||
{%- if tools -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
|
||||
{%- for tool in tools -%}
|
||||
{%- if tool is not string -%}
|
||||
{%- set tool = tool | tojson -%}
|
||||
{%- endif -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + tool -%}
|
||||
{%- if not loop.last -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + ", " -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- set ns.system_prompt = ns.system_prompt + "]" -%}
|
||||
{%- endif -%}
|
||||
{%- if ns.system_prompt -%}
|
||||
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
|
||||
{%- endif -%}
|
||||
{%- set ns.last_assistant_index = -1 -%}
|
||||
{%- for message in messages -%}
|
||||
{%- if message["role"] == "assistant" -%}
|
||||
{%- set ns.last_assistant_index = loop.index0 -%}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
{%- for message in messages -%}
|
||||
{{- "<|im_start|>" + message["role"] + "\n" -}}
|
||||
{%- set content = message["content"] -%}
|
||||
{%- if content is not string -%}
|
||||
{%- set content = content | tojson -%}
|
||||
{%- endif -%}
|
||||
{%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
|
||||
{%- if "</think>" in content -%}
|
||||
{%- set content = content.split("</think>")[-1] | trim -%}
|
||||
{%- endif -%}
|
||||
{%- endif -%}
|
||||
{{- content + "<|im_end|>\n" -}}
|
||||
{%- endfor -%}
|
||||
{%- if add_generation_prompt -%}
|
||||
{{- "<|im_start|>assistant\n" -}}
|
||||
{%- endif -%}
|
||||
118
models/templates/ibm-granite-granite-4.0.jinja
Normal file
118
models/templates/ibm-granite-granite-4.0.jinja
Normal file
@@ -0,0 +1,118 @@
|
||||
{%- set tools_system_message_prefix = 'You are a helpful assistant with access to the following tools. You may call one or more tools to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>' %}
|
||||
{%- set tools_system_message_suffix = '\n</tools>\n\nFor each tool call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.' %}
|
||||
{%- set documents_system_message_prefix = 'You are a helpful assistant with access to the following documents. You may use one or more documents to assist with the user query.\n\nYou are given a list of documents within <documents></documents> XML tags:\n<documents>' %}
|
||||
{%- set documents_system_message_suffix = '\n</documents>\n\nWrite the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.' %}
|
||||
{%- set g4_default_system_message = 'You are a helpful assistant. Please ensure responses are professional, accurate, and safe.' %}
|
||||
{%- if available_tools is defined and available_tools %}
|
||||
{%- set tools = available_tools %}
|
||||
{%- endif %}
|
||||
{%- set ns = namespace(tools_system_message=tools_system_message_prefix,
|
||||
documents_system_message=documents_system_message_prefix,
|
||||
default_system_message=g4_default_system_message,
|
||||
system_message=''
|
||||
) %}
|
||||
{%- if tools %}
|
||||
{%- for tool in tools %}
|
||||
{%- set ns.tools_system_message = ns.tools_system_message + '\n' + (tool | tojson) %}
|
||||
{%- endfor %}
|
||||
{%- set ns.tools_system_message = ns.tools_system_message + tools_system_message_suffix %}
|
||||
{%- else %}
|
||||
{%- set ns.tools_system_message = '' %}
|
||||
{%- endif %}
|
||||
{%- if documents %}
|
||||
{%- for document in documents %}
|
||||
{%- set ns.documents_system_message = ns.documents_system_message + '\n' + (document | tojson) %}
|
||||
{%- endfor %}
|
||||
{%- set ns.documents_system_message = ns.documents_system_message + documents_system_message_suffix %}
|
||||
{%- else %}
|
||||
{%- set ns.documents_system_message = '' %}
|
||||
{%- endif %}
|
||||
{%- if messages[0].role == 'system' %}
|
||||
{%- if messages[0].content is string %}
|
||||
{%- set ns.system_message = messages[0].content %}
|
||||
{%- elif messages[0].content is iterable %}
|
||||
{%- for entry in messages[0].content %}
|
||||
{%- if entry.type== 'text' %}
|
||||
{%- if ns.system_message != '' %}
|
||||
{%- set ns.system_message = ns.system_message + '\n' %}
|
||||
{%- endif %}
|
||||
{%- set ns.system_message = ns.system_message + entry.text %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{%- if tools and documents %}
|
||||
{%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message + '\n\n' + ns.documents_system_message %}
|
||||
{%- elif tools %}
|
||||
{%- set ns.system_message = ns.system_message + '\n\n' + ns.tools_system_message %}
|
||||
{%- elif documents %}
|
||||
{%- set ns.system_message = ns.system_message + '\n\n' + ns.documents_system_message %}
|
||||
{%- endif %}
|
||||
{%- else %}
|
||||
{%- if tools and documents %}
|
||||
{%- set ns.system_message = ns.tools_system_message + '\n\n' + ns.documents_system_message %}
|
||||
{%- elif tools %}
|
||||
{%- set ns.system_message = ns.tools_system_message %}
|
||||
{%- elif documents %}
|
||||
{%- set ns.system_message = ns.documents_system_message %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if ns.system_message %}
|
||||
{{- '<|start_of_role|>system<|end_of_role|>' + ns.system_message + '<|end_of_text|>\n' }}
|
||||
{%- else %}
|
||||
{{- '<|start_of_role|>system<|end_of_role|>' + ns.default_system_message + '<|end_of_text|>\n' }}
|
||||
{%- endif %}
|
||||
{%- for message in messages %}
|
||||
{%- set content = namespace(val='') %}
|
||||
{%- if message.content is string %}
|
||||
{%- set content.val = message.content %}
|
||||
{%- else %}
|
||||
{%- if message.content is iterable %}
|
||||
{%- for entry in message.content %}
|
||||
{%- if entry.type== 'text' %}
|
||||
{%- if content.val != '' %}
|
||||
{%- set content.val = content.val + '\n' %}
|
||||
{%- endif %}
|
||||
{%- set content.val = content.val + entry.text %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- if (message.role == 'user') or (message.role == 'system' and not loop.first) %}
|
||||
{{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val + '<|end_of_text|>\n' }}
|
||||
{%- elif message.role == 'assistant' %}
|
||||
{{- '<|start_of_role|>' + message.role + '<|end_of_role|>' + content.val }}
|
||||
{%- if message.tool_calls %}
|
||||
{%- for tool_call in message.tool_calls %}
|
||||
{%- if (loop.first and content.val) or (not loop.first) %}
|
||||
{{- '\n' }}
|
||||
{%- endif %}
|
||||
{%- if tool_call.function %}
|
||||
{%- set tool_call = tool_call.function %}
|
||||
{%- endif %}
|
||||
{{- '<tool_call>\n{"name": "' }}
|
||||
{{- tool_call.name }}
|
||||
{{- '", "arguments": ' }}
|
||||
{%- if tool_call.arguments is string %}
|
||||
{{- tool_call.arguments }}
|
||||
{%- else %}
|
||||
{{- tool_call.arguments | tojson }}
|
||||
{%- endif %}
|
||||
{{- '}\n</tool_call>' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{{- '<|end_of_text|>\n' }}
|
||||
{%- elif message.role == 'tool' %}
|
||||
{%- if loop.first or (messages[loop.index0 - 1].role != 'tool') %}
|
||||
{{- '<|start_of_role|>user<|end_of_role|>' }}
|
||||
{%- endif %}
|
||||
{{- '\n<tool_response>\n' }}
|
||||
{{- content.val }}
|
||||
{{- '\n</tool_response>' }}
|
||||
{%- if loop.last or (messages[loop.index0 + 1].role != 'tool') %}
|
||||
{{- '<|end_of_text|>\n' }}
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|start_of_role|>assistant<|end_of_role|>' }}
|
||||
{%- endif %}
|
||||
@@ -139,7 +139,11 @@ def main():
|
||||
'_ZL18flash_attn_ext_f16ILi96ELi96ELi4ELi8ELb0ELb0EEvPKcS1_S1_S1_S1_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS5_IjLj3EEiiiiiiiiiiiliiliiiiil',
|
||||
'_ZL18flash_attn_ext_vecILi128ELi2EL9ggml_type2ELS0_2ELb0EEvPKcS2_S2_S2_S2_PKiPfP15HIP_vector_typeIfLj2EEffffjfiS6_IjLj3EEiiiiiiiiiiiliiliiiiil',
|
||||
'_ZL9mul_mat_qIL9ggml_type10ELi16ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii',
|
||||
'_ZL9mul_mat_qIL9ggml_type12ELi128ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii'
|
||||
'_ZL9mul_mat_qIL9ggml_type12ELi128ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii',
|
||||
'_ZL9mul_mat_qIL9ggml_type40ELi112ELb0EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii',
|
||||
'_ZL9mul_mat_qIL9ggml_type40ELi112ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii',
|
||||
'_ZL9mul_mat_qIL9ggml_type40ELi128ELb0EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii',
|
||||
'_ZL9mul_mat_qIL9ggml_type40ELi128ELb1EEvPKcPKiS4_S4_PfS5_iiiiiiiiiiiiiiiii'
|
||||
}
|
||||
|
||||
functions = parse_log_file(log_file)
|
||||
|
||||
1135
scripts/server-test-function-call.py
Executable file
1135
scripts/server-test-function-call.py
Executable file
File diff suppressed because it is too large
Load Diff
@@ -1 +1 @@
|
||||
a04eea0761a85d18f3f504d6ab970c5c9dce705f
|
||||
49f84a924f6ea4fc2ef73dbbd8cc4d734b54bd6d
|
||||
|
||||
@@ -60,7 +60,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
|
||||
{ "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
|
||||
{ "exaone-moe", LLM_CHAT_TEMPLATE_EXAONE_MOE },
|
||||
{ "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
|
||||
{ "granite", LLM_CHAT_TEMPLATE_GRANITE },
|
||||
{ "granite", LLM_CHAT_TEMPLATE_GRANITE_3_X },
|
||||
{ "granite-4.0", LLM_CHAT_TEMPLATE_GRANITE_4_0 },
|
||||
{ "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
|
||||
{ "megrez", LLM_CHAT_TEMPLATE_MEGREZ },
|
||||
{ "yandex", LLM_CHAT_TEMPLATE_YANDEX },
|
||||
@@ -191,7 +192,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
|
||||
} else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) {
|
||||
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
|
||||
} else if (tmpl_contains("<|start_of_role|>")) {
|
||||
return LLM_CHAT_TEMPLATE_GRANITE;
|
||||
if (tmpl_contains("<tool_call>") || tmpl_contains("<tools>")) {
|
||||
return LLM_CHAT_TEMPLATE_GRANITE_4_0;
|
||||
}
|
||||
return LLM_CHAT_TEMPLATE_GRANITE_3_X;
|
||||
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
|
||||
return LLM_CHAT_TEMPLATE_GIGACHAT;
|
||||
} else if (tmpl_contains("<|role_start|>")) {
|
||||
@@ -617,8 +621,8 @@ int32_t llm_chat_apply_template(
|
||||
ss << "Assistant: " << trim(chat[i]->content) << "\n\n";
|
||||
}
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
|
||||
// IBM Granite template
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE_3_X) {
|
||||
// IBM Granite 3.x template
|
||||
for (const auto & message : chat) {
|
||||
std::string role(message->role);
|
||||
ss << "<|start_of_role|>" << role << "<|end_of_role|>";
|
||||
@@ -630,6 +634,20 @@ int32_t llm_chat_apply_template(
|
||||
if (add_ass) {
|
||||
ss << "<|start_of_role|>assistant<|end_of_role|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE_4_0) {
|
||||
// IBM Granite 4.0 template
|
||||
for (const auto & message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "assistant_tool_call") {
|
||||
ss << "<|start_of_role|>assistant<|end_of_role|><|tool_call|>";
|
||||
} else {
|
||||
ss << "<|start_of_role|>" << role << "<|end_of_role|>";
|
||||
}
|
||||
ss << message->content << "<|end_of_text|>\n";
|
||||
}
|
||||
if (add_ass) {
|
||||
ss << "<|start_of_role|>assistant<|end_of_role|>";
|
||||
}
|
||||
} else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
|
||||
// GigaChat template
|
||||
bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
|
||||
|
||||
@@ -39,7 +39,8 @@ enum llm_chat_template {
|
||||
LLM_CHAT_TEMPLATE_EXAONE_4,
|
||||
LLM_CHAT_TEMPLATE_EXAONE_MOE,
|
||||
LLM_CHAT_TEMPLATE_RWKV_WORLD,
|
||||
LLM_CHAT_TEMPLATE_GRANITE,
|
||||
LLM_CHAT_TEMPLATE_GRANITE_3_X,
|
||||
LLM_CHAT_TEMPLATE_GRANITE_4_0,
|
||||
LLM_CHAT_TEMPLATE_GIGACHAT,
|
||||
LLM_CHAT_TEMPLATE_MEGREZ,
|
||||
LLM_CHAT_TEMPLATE_YANDEX,
|
||||
|
||||
@@ -19,7 +19,7 @@
|
||||
|
||||
// dedup helpers
|
||||
|
||||
static ggml_tensor * build_kq_mask(
|
||||
static ggml_tensor * build_attn_inp_kq_mask(
|
||||
ggml_context * ctx,
|
||||
const llama_kv_cache_context * mctx,
|
||||
const llama_ubatch & ubatch,
|
||||
@@ -28,7 +28,11 @@ static ggml_tensor * build_kq_mask(
|
||||
const auto n_tokens = ubatch.n_tokens;
|
||||
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
|
||||
|
||||
return ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
ggml_tensor * res = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
|
||||
ggml_set_input(res);
|
||||
ggml_set_name(res, "attn_inp_kq_mask");
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static bool can_reuse_kq_mask(
|
||||
@@ -52,6 +56,21 @@ static bool can_reuse_kq_mask(
|
||||
|
||||
// impl
|
||||
|
||||
static ggml_tensor * ggml_mul_mat_aux(
|
||||
ggml_context * ctx,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * rot) {
|
||||
const auto n = rot->ne[0];
|
||||
|
||||
ggml_tensor * res;
|
||||
|
||||
res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
|
||||
res = ggml_mul_mat (ctx, rot, res);
|
||||
res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
|
||||
if (ubatch->token) {
|
||||
const int64_t n_tokens = ubatch->n_tokens;
|
||||
@@ -429,6 +448,14 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->set_input_v_idxs(self_v_idxs, ubatch);
|
||||
|
||||
mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
|
||||
|
||||
if (self_k_rot) {
|
||||
mctx->set_input_k_rot(self_k_rot);
|
||||
}
|
||||
|
||||
if (self_v_rot) {
|
||||
mctx->set_input_v_rot(self_v_rot);
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
|
||||
@@ -476,6 +503,14 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
|
||||
mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
|
||||
|
||||
mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
|
||||
if (self_k_rot) {
|
||||
mctx->get_base()->set_input_k_rot(self_k_rot);
|
||||
}
|
||||
|
||||
if (self_v_rot) {
|
||||
mctx->get_base()->set_input_v_rot(self_v_rot);
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
|
||||
@@ -532,6 +567,14 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
|
||||
|
||||
mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
|
||||
|
||||
if (inp_attn->self_k_rot) {
|
||||
mctx->get_attn()->set_input_k_rot(inp_attn->self_k_rot);
|
||||
}
|
||||
|
||||
if (inp_attn->self_v_rot) {
|
||||
mctx->get_attn()->set_input_v_rot(inp_attn->self_v_rot);
|
||||
}
|
||||
|
||||
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
||||
|
||||
if (inp_rs->s_copy) {
|
||||
@@ -630,6 +673,14 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
|
||||
attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
|
||||
}
|
||||
|
||||
if (inp_attn->self_k_rot) {
|
||||
attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot);
|
||||
}
|
||||
|
||||
if (inp_attn->self_v_rot) {
|
||||
attn_ctx->get_base()->set_input_v_rot(inp_attn->self_v_rot);
|
||||
}
|
||||
|
||||
const int64_t n_rs = mctx->get_recr()->get_n_rs();
|
||||
|
||||
if (inp_rs->s_copy) {
|
||||
@@ -2002,13 +2053,13 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
|
||||
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
|
||||
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
}
|
||||
|
||||
inp->self_k_rot = mctx_cur->build_input_k_rot(ctx0);
|
||||
inp->self_v_rot = mctx_cur->build_input_v_rot(ctx0);
|
||||
|
||||
return inp;
|
||||
}
|
||||
|
||||
@@ -2034,6 +2085,15 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
int il) const {
|
||||
GGML_ASSERT(v_mla == nullptr);
|
||||
|
||||
if (inp->self_k_rot) {
|
||||
q_cur = ggml_mul_mat_aux(ctx0, q_cur, inp->self_k_rot);
|
||||
k_cur = ggml_mul_mat_aux(ctx0, k_cur, inp->self_k_rot);
|
||||
}
|
||||
|
||||
if (inp->self_v_rot) {
|
||||
v_cur = ggml_mul_mat_aux(ctx0, v_cur, inp->self_v_rot);
|
||||
}
|
||||
|
||||
// these nodes are added to the graph together so that they are not reordered
|
||||
// by doing so, the number of splits in the graph is reduced
|
||||
// expand k later to enable rope fusion which directly writes into k-v cache
|
||||
@@ -2061,6 +2121,10 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (inp->self_v_rot) {
|
||||
cur = ggml_mul_mat_aux(ctx0, cur, inp->self_v_rot);
|
||||
}
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
|
||||
@@ -2090,9 +2154,7 @@ static std::unique_ptr<llm_graph_input_attn_k> build_attn_inp_k_impl(
|
||||
|
||||
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur, ubatch, cparams);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur, ubatch, cparams);
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
}
|
||||
|
||||
@@ -2171,6 +2233,18 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * v_mla,
|
||||
float kq_scale,
|
||||
int il) const {
|
||||
if (inp->self_k_rot) {
|
||||
q_cur = ggml_mul_mat_aux(ctx0, q_cur, inp->self_k_rot);
|
||||
if (k_cur) {
|
||||
k_cur = ggml_mul_mat_aux(ctx0, k_cur, inp->self_k_rot);
|
||||
}
|
||||
}
|
||||
if (inp->self_v_rot) {
|
||||
if (v_cur) {
|
||||
v_cur = ggml_mul_mat_aux(ctx0, v_cur, inp->self_v_rot);
|
||||
}
|
||||
}
|
||||
|
||||
// these nodes are added to the graph together so that they are not reordered
|
||||
// by doing so, the number of splits in the graph is reduced
|
||||
ggml_build_forward_expand(gf, q_cur);
|
||||
@@ -2211,6 +2285,10 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (inp->self_v_rot) {
|
||||
cur = ggml_mul_mat_aux(ctx0, cur, inp->self_v_rot);
|
||||
}
|
||||
|
||||
if (wo) {
|
||||
cur = build_lora_mm(wo, cur);
|
||||
}
|
||||
@@ -2293,12 +2371,8 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
||||
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = build_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
ggml_set_name(inp->self_kq_mask, "self_kq_mask");
|
||||
|
||||
inp->self_kq_mask = build_attn_inp_kq_mask(ctx0, mctx_cur->get_base(), ubatch, cparams);
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
ggml_set_name(inp->self_kq_mask_cnv, "self_kq_mask_cnv");
|
||||
}
|
||||
|
||||
{
|
||||
@@ -2307,14 +2381,13 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
|
||||
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask_swa = build_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams);
|
||||
ggml_set_input(inp->self_kq_mask_swa);
|
||||
ggml_set_name(inp->self_kq_mask_swa, "self_kq_mask_swa");
|
||||
|
||||
inp->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, mctx_cur->get_swa(), ubatch, cparams);
|
||||
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
||||
ggml_set_name(inp->self_kq_mask_swa_cnv, "self_kq_mask_swa_cnv");
|
||||
}
|
||||
|
||||
inp->self_k_rot = mctx_cur->get_base()->build_input_k_rot(ctx0);
|
||||
inp->self_v_rot = mctx_cur->get_base()->build_input_v_rot(ctx0);
|
||||
|
||||
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
|
||||
}
|
||||
|
||||
@@ -2473,9 +2546,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
|
||||
inp_attn->self_k_idxs = attn_ctx->get_base()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp_attn->self_v_idxs = attn_ctx->get_base()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp_attn->self_kq_mask = build_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams);
|
||||
ggml_set_input(inp_attn->self_kq_mask);
|
||||
|
||||
inp_attn->self_kq_mask = build_attn_inp_kq_mask(ctx0, attn_ctx->get_base(), ubatch, cparams);
|
||||
inp_attn->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask, GGML_TYPE_F16) : inp_attn->self_kq_mask;
|
||||
}
|
||||
|
||||
@@ -2483,9 +2554,7 @@ llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa()
|
||||
inp_attn->self_k_idxs_swa = attn_ctx->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp_attn->self_v_idxs_swa = attn_ctx->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp_attn->self_kq_mask_swa = build_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams);
|
||||
ggml_set_input(inp_attn->self_kq_mask_swa);
|
||||
|
||||
inp_attn->self_kq_mask_swa = build_attn_inp_kq_mask(ctx0, attn_ctx->get_swa(), ubatch, cparams);
|
||||
inp_attn->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp_attn->self_kq_mask_swa, GGML_TYPE_F16) : inp_attn->self_kq_mask_swa;
|
||||
}
|
||||
|
||||
|
||||
@@ -308,6 +308,10 @@ public:
|
||||
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
|
||||
// note: assumes v_rot^ == I
|
||||
ggml_tensor * self_k_rot = nullptr;
|
||||
ggml_tensor * self_v_rot = nullptr;
|
||||
|
||||
// note: these have to be copies because in order to be able to reuse a graph, its inputs
|
||||
// need to carry these parameters with them. otherwise, they can point to freed
|
||||
// llm_graph_params from a previous batch, causing stack-use-after-return
|
||||
@@ -384,6 +388,10 @@ public:
|
||||
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
|
||||
// note: using same rotation matrices for both base and swa cache
|
||||
ggml_tensor * self_k_rot = nullptr;
|
||||
ggml_tensor * self_v_rot = nullptr;
|
||||
|
||||
const llama_hparams hparams;
|
||||
const llama_cparams cparams;
|
||||
|
||||
|
||||
@@ -66,8 +66,9 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
|
||||
|
||||
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
|
||||
|
||||
// note: the SWA cache is never quantized because it is relatively small
|
||||
kv_swa = std::make_unique<llama_kv_cache>(
|
||||
model, type_k, type_v,
|
||||
model, GGML_TYPE_F16, GGML_TYPE_F16,
|
||||
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
|
||||
hparams.n_swa, hparams.swa_type, filter_swa, reuse);
|
||||
}
|
||||
|
||||
@@ -13,6 +13,65 @@
|
||||
#include <map>
|
||||
#include <stdexcept>
|
||||
|
||||
static bool ggml_is_power_of_2(int n) {
|
||||
return (n & (n - 1)) == 0;
|
||||
}
|
||||
|
||||
// orthonormal Walsh-Hadamard rotation matrix
|
||||
// note: res^2 == I
|
||||
static void ggml_gen_hadamard(ggml_tensor * tensor) {
|
||||
assert(tensor->type == GGML_TYPE_F32);
|
||||
|
||||
const int n = tensor->ne[0];
|
||||
|
||||
assert(ggml_is_power_of_2(n));
|
||||
assert(tensor->ne[1] == n);
|
||||
assert(tensor->ne[2] == 1);
|
||||
assert(tensor->ne[3] == 1);
|
||||
|
||||
std::vector<float> data_f32;
|
||||
|
||||
float * data = (float *) tensor->data;
|
||||
|
||||
if (tensor->type != GGML_TYPE_F32) {
|
||||
data_f32.resize(n*n);
|
||||
data = data_f32.data();
|
||||
}
|
||||
|
||||
data[0*n + 0] = 1.0 / sqrtf(n);
|
||||
|
||||
for (int s = 1; s < n; s *= 2) {
|
||||
for (int i = 0; i < s; i++) {
|
||||
for (int j = 0; j < s; j++) {
|
||||
const float val = data[i*n + j];
|
||||
|
||||
data[(i + s)*n + (j )] = val;
|
||||
data[(i )*n + (j + s)] = val;
|
||||
data[(i + s)*n + (j + s)] = -val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (tensor->type != GGML_TYPE_F32) {
|
||||
ggml_quantize_chunk(tensor->type, data, tensor->data, 0, 1, n*n, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
static ggml_tensor * ggml_mul_mat_aux(
|
||||
ggml_context * ctx,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * rot) {
|
||||
const auto n = rot->ne[0];
|
||||
|
||||
ggml_tensor * res;
|
||||
|
||||
res = ggml_reshape_2d(ctx, cur, n, ggml_nelements(cur)/n);
|
||||
res = ggml_mul_mat (ctx, rot, res);
|
||||
res = ggml_reshape_4d(ctx, res, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3]);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
//
|
||||
// llama_kv_cache
|
||||
//
|
||||
@@ -209,6 +268,48 @@ llama_kv_cache::llama_kv_cache(
|
||||
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
||||
}
|
||||
|
||||
const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE");
|
||||
const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? atoi(LLAMA_ATTN_ROT_DISABLE) : false;
|
||||
if (attn_rot_disable) {
|
||||
LLAMA_LOG_WARN("%s: attention rotation force disabled (LLAMA_ATTN_ROT_DISABLE)\n", __func__);
|
||||
}
|
||||
|
||||
attn_rot_k =
|
||||
!attn_rot_disable &&
|
||||
ggml_is_quantized(type_k) &&
|
||||
!hparams.is_n_embd_k_gqa_variable() &&
|
||||
hparams.n_embd_head_k() % 64 == 0;
|
||||
|
||||
attn_rot_v =
|
||||
!attn_rot_disable &&
|
||||
ggml_is_quantized(type_v) &&
|
||||
!hparams.is_n_embd_v_gqa_variable() &&
|
||||
hparams.n_embd_head_v() % 64 == 0;
|
||||
|
||||
LLAMA_LOG_INFO("%s: attn_rot_k = %d\n", __func__, attn_rot_k);
|
||||
LLAMA_LOG_INFO("%s: attn_rot_v = %d\n", __func__, attn_rot_v);
|
||||
|
||||
// pre-compute the haramard matrices and keep them in host memory
|
||||
// TODO: in the future, we can make copies in the backend buffers to avoid host -> device transfers
|
||||
if (attn_rot_k || attn_rot_v) {
|
||||
for (int64_t n = 64; n <= std::max(hparams.n_embd_head_k(), hparams.n_embd_head_v()); n *= 2) {
|
||||
attn_rot_hadamard[n] = std::vector<float>(n*n);
|
||||
|
||||
ggml_init_params params = {
|
||||
/* .mem_size = */ 1*ggml_tensor_overhead(),
|
||||
/* .mem_buffer = */ nullptr,
|
||||
/* .no_alloc = */ true,
|
||||
};
|
||||
|
||||
ggml_context_ptr ctx { ggml_init(params) };
|
||||
|
||||
ggml_tensor * tmp = ggml_new_tensor_2d(ctx.get(), GGML_TYPE_F32, n, n);
|
||||
tmp->data = attn_rot_hadamard[n].data();
|
||||
|
||||
ggml_gen_hadamard(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
|
||||
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
|
||||
}
|
||||
@@ -1004,6 +1105,14 @@ bool llama_kv_cache::get_has_shift() const {
|
||||
return result;
|
||||
}
|
||||
|
||||
ggml_type llama_kv_cache::type_k() const {
|
||||
return layers[0].k->type;
|
||||
}
|
||||
|
||||
ggml_type llama_kv_cache::type_v() const {
|
||||
return layers[0].v->type;
|
||||
}
|
||||
|
||||
uint32_t llama_kv_cache::get_n_kv(const slot_info & sinfo) const {
|
||||
uint32_t result = 0;
|
||||
|
||||
@@ -1189,6 +1298,47 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama
|
||||
return v_idxs;
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache::build_input_k_rot(ggml_context * ctx) const {
|
||||
ggml_tensor * res = nullptr;
|
||||
|
||||
if (attn_rot_k) {
|
||||
int nrot = 64;
|
||||
|
||||
// TODO: investigate if using the smallest rotation matrix is beneficial also for K (similar as for V)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4141323088
|
||||
do {
|
||||
nrot *= 2;
|
||||
} while (hparams.n_embd_head_k() % nrot == 0);
|
||||
nrot /= 2;
|
||||
|
||||
res = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
|
||||
ggml_set_input(res);
|
||||
ggml_set_name(res, "attn_inp_k_rot");
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache::build_input_v_rot(ggml_context * ctx) const {
|
||||
ggml_tensor * res = nullptr;
|
||||
|
||||
if (attn_rot_v) {
|
||||
int nrot = 64;
|
||||
// using smaller rotation matrices for V seems beneficial
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/21038#issuecomment-4146397570
|
||||
//do {
|
||||
// nrot *= 2;
|
||||
//} while (hparams.n_embd_head_v() % nrot == 0);
|
||||
//nrot /= 2;
|
||||
|
||||
res = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nrot, nrot);
|
||||
ggml_set_input(res);
|
||||
ggml_set_name(res, "attn_inp_v_rot");
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
|
||||
const uint32_t n_tokens = ubatch->n_tokens;
|
||||
GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
|
||||
@@ -1507,6 +1657,24 @@ void llama_kv_cache::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch
|
||||
}
|
||||
}
|
||||
|
||||
void llama_kv_cache::set_input_k_rot(ggml_tensor * dst) const {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
|
||||
|
||||
const auto n_rot = dst->ne[0];
|
||||
GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0]));
|
||||
|
||||
memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst));
|
||||
}
|
||||
|
||||
void llama_kv_cache::set_input_v_rot(ggml_tensor * dst) const {
|
||||
GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
|
||||
|
||||
const auto n_rot = dst->ne[0];
|
||||
GGML_ASSERT(attn_rot_hadamard.count(dst->ne[0]));
|
||||
|
||||
memcpy(dst->data, attn_rot_hadamard.at(n_rot).data(), ggml_nbytes(dst));
|
||||
}
|
||||
|
||||
size_t llama_kv_cache::total_size() const {
|
||||
size_t size = 0;
|
||||
|
||||
@@ -1542,6 +1710,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
||||
ggml_context * ctx,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * rot,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
@@ -1567,10 +1736,16 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
|
||||
// dequantize to f32 -> RoPE -> quantize back
|
||||
tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);
|
||||
|
||||
// rotate back
|
||||
tmp = ggml_mul_mat_aux(ctx, tmp, rot);
|
||||
|
||||
tmp = ggml_rope_ext(ctx, tmp,
|
||||
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
|
||||
|
||||
// rotate fwd
|
||||
tmp = ggml_mul_mat_aux(ctx, tmp, rot);
|
||||
|
||||
tmp = ggml_cpy(ctx, tmp, cur);
|
||||
} else {
|
||||
// we rotate only the first n_rot dimensions
|
||||
@@ -1591,6 +1766,9 @@ public:
|
||||
|
||||
ggml_tensor * k_shift; // I32 [kv_size*n_stream]
|
||||
|
||||
// note: assumes k_rot^2 == I
|
||||
ggml_tensor * k_rot = nullptr;
|
||||
|
||||
const llama_kv_cache * kv_self;
|
||||
};
|
||||
|
||||
@@ -1600,6 +1778,10 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) {
|
||||
if (k_shift) {
|
||||
kv_self->set_input_k_shift(k_shift);
|
||||
}
|
||||
|
||||
if (k_rot) {
|
||||
kv_self->set_input_k_rot(k_rot);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const {
|
||||
@@ -1611,6 +1793,8 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
||||
inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
|
||||
ggml_set_input(inp->k_shift);
|
||||
|
||||
inp->k_rot = build_input_k_rot(ctx);
|
||||
|
||||
const auto & cparams = lctx->get_cparams();
|
||||
|
||||
for (const auto & layer : layers) {
|
||||
@@ -1635,7 +1819,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
|
||||
ggml_row_size(layer.k->type, n_embd_k_gqa),
|
||||
ggml_row_size(layer.k->type, n_embd_nope));
|
||||
|
||||
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);
|
||||
ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, inp->k_rot, rope_factors, freq_base_l, freq_scale_l, il);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
}
|
||||
@@ -2239,6 +2423,14 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
|
||||
return n_kv;
|
||||
}
|
||||
|
||||
ggml_type llama_kv_cache_context::type_k() const {
|
||||
return kv->type_k();
|
||||
}
|
||||
|
||||
ggml_type llama_kv_cache_context::type_v() const {
|
||||
return kv->type_v();
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
|
||||
return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
|
||||
}
|
||||
@@ -2263,6 +2455,14 @@ ggml_tensor * llama_kv_cache_context::build_input_v_idxs(ggml_context * ctx, con
|
||||
return kv->build_input_v_idxs(ctx, ubatch);
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache_context::build_input_k_rot(ggml_context * ctx) const {
|
||||
return kv->build_input_k_rot(ctx);
|
||||
}
|
||||
|
||||
ggml_tensor * llama_kv_cache_context::build_input_v_rot(ggml_context * ctx) const {
|
||||
return kv->build_input_v_rot(ctx);
|
||||
}
|
||||
|
||||
void llama_kv_cache_context::set_input_k_shift(ggml_tensor * dst) const {
|
||||
kv->set_input_k_shift(dst);
|
||||
}
|
||||
@@ -2282,3 +2482,11 @@ void llama_kv_cache_context::set_input_kq_mask(ggml_tensor * dst, const llama_ub
|
||||
void llama_kv_cache_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
|
||||
kv->set_input_pos_bucket(dst, ubatch);
|
||||
}
|
||||
|
||||
void llama_kv_cache_context::set_input_k_rot(ggml_tensor * dst) const {
|
||||
kv->set_input_k_rot(dst);
|
||||
}
|
||||
|
||||
void llama_kv_cache_context::set_input_v_rot(ggml_tensor * dst) const {
|
||||
kv->set_input_v_rot(dst);
|
||||
}
|
||||
|
||||
@@ -152,6 +152,9 @@ public:
|
||||
|
||||
bool get_has_shift() const;
|
||||
|
||||
ggml_type type_k() const;
|
||||
ggml_type type_v() const;
|
||||
|
||||
//
|
||||
// graph_build API
|
||||
//
|
||||
@@ -191,6 +194,9 @@ public:
|
||||
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||
|
||||
ggml_tensor * build_input_k_rot(ggml_context * ctx) const;
|
||||
ggml_tensor * build_input_v_rot(ggml_context * ctx) const;
|
||||
|
||||
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
|
||||
void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
|
||||
|
||||
@@ -199,6 +205,9 @@ public:
|
||||
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
|
||||
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||
|
||||
void set_input_k_rot(ggml_tensor * dst) const;
|
||||
void set_input_v_rot(ggml_tensor * dst) const;
|
||||
|
||||
private:
|
||||
const llama_model & model;
|
||||
const llama_hparams & hparams;
|
||||
@@ -226,6 +235,13 @@ private:
|
||||
// SWA
|
||||
const uint32_t n_swa = 0;
|
||||
|
||||
// env: LLAMA_ATTN_ROT_DISABLE
|
||||
bool attn_rot_k = false;
|
||||
bool attn_rot_v = false;
|
||||
|
||||
// pre-computed hadamard martrices
|
||||
std::unordered_map<int64_t, std::vector<float>> attn_rot_hadamard;
|
||||
|
||||
// env: LLAMA_KV_CACHE_DEBUG
|
||||
int debug = 0;
|
||||
|
||||
@@ -262,6 +278,7 @@ private:
|
||||
ggml_context * ctx,
|
||||
ggml_tensor * cur,
|
||||
ggml_tensor * shift,
|
||||
ggml_tensor * rot,
|
||||
ggml_tensor * factors,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
@@ -328,6 +345,9 @@ public:
|
||||
|
||||
uint32_t get_n_kv() const;
|
||||
|
||||
ggml_type type_k() const;
|
||||
ggml_type type_v() const;
|
||||
|
||||
// get views of the current state of the cache
|
||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
||||
@@ -347,6 +367,9 @@ public:
|
||||
ggml_tensor * build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||
ggml_tensor * build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const;
|
||||
|
||||
ggml_tensor * build_input_k_rot(ggml_context * ctx) const;
|
||||
ggml_tensor * build_input_v_rot(ggml_context * ctx) const;
|
||||
|
||||
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||
void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||
|
||||
@@ -354,6 +377,9 @@ public:
|
||||
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
|
||||
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||
|
||||
void set_input_k_rot(ggml_tensor * dst) const;
|
||||
void set_input_v_rot(ggml_tensor * dst) const;
|
||||
|
||||
private:
|
||||
llama_memory_status status;
|
||||
|
||||
|
||||
@@ -73,9 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
||||
// for simplicity, we always use sequential equal split for now
|
||||
ubatch = balloc.split_equal(n_ubatch, true);
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_base()->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
||||
@@ -73,9 +73,9 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
|
||||
// if all tokens are output, split by sequence
|
||||
ubatch = balloc.split_seq(n_ubatch);
|
||||
} else {
|
||||
// TODO: non-sequential equal split can be done if using unified KV cache
|
||||
// for simplicity, we always use sequential equal split for now
|
||||
ubatch = balloc.split_equal(n_ubatch, true);
|
||||
// Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice)
|
||||
const bool unified = (mem_attn->get_n_stream() == 1);
|
||||
ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
}
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
|
||||
@@ -354,6 +354,7 @@ int main_automated_tests(void) {
|
||||
std::string bos_token = "";
|
||||
std::string eos_token = "";
|
||||
bool supported_with_jinja = true;
|
||||
std::vector<llama_chat_message> extra_conversation = {};
|
||||
};
|
||||
std::vector<TestCase> test_cases {
|
||||
{
|
||||
@@ -604,6 +605,26 @@ int main_automated_tests(void) {
|
||||
/* .expected_output_jinja= */ "<seed:bos>system\nYou are a helpful assistant<seed:eos><seed:bos>user\nHello<seed:eos><seed:bos>assistant\nHi there<seed:eos><seed:bos>user\nWho are you<seed:eos><seed:bos>assistant\nI am an assistant<seed:eos><seed:bos>user\nAnother question<seed:eos><seed:bos>assistant\n",
|
||||
/* .bos_token= */ "<seed:bos>",
|
||||
/* .eos_token= */ "<seed:eos>",
|
||||
},
|
||||
{
|
||||
/* .name= */ "ibm-granite/granite-3.x (tool call)",
|
||||
/* .template_str= */ "{%- for message in messages %}\n {%- if message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\\n' }}\n {%- else %}\n {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n {%- endif %}\n{%- endfor %}",
|
||||
/* .expected_output= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What is the weather?<|end_of_text|>\n<|start_of_role|>assistant_tool_call<|end_of_role|><|tool_call|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}]<|end_of_text|>\n<|start_of_role|>tool_response<|end_of_role|>{\"temperature\": 72}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>",
|
||||
/* .expected_output_jinja= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What is the weather?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|><|tool_call|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}]<|end_of_text|>\n<|start_of_role|>tool_response<|end_of_role|>{\"temperature\": 72}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>",
|
||||
/* .bos_token= */ "",
|
||||
/* .eos_token= */ "",
|
||||
/* .supported_with_jinja= */ true,
|
||||
/* .extra_conversation= */ {{"user", "What is the weather?"}, {"assistant_tool_call", "[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}]"}, {"tool_response", "{\"temperature\": 72}"}},
|
||||
},
|
||||
{
|
||||
/* .name= */ "ibm-granite/granite-4.0 (tool call)",
|
||||
/* .template_str= */ "{%- for message in messages %}\n {%- if message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\\n' }}\n {%- else %}\n {{- '<|start_of_role|>' + message['role'] + '<|end_of_role|>' + message['content'] + '<|end_of_text|>\\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|start_of_role|>assistant<|end_of_role|>' }}\n {%- endif %}\n{%- endfor %}\n{# <tool_call> <tools> #}",
|
||||
/* .expected_output= */ "<|start_of_role|>system<|end_of_role|>You are a helpful assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Hello<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Hi there<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Who are you<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|> I am an assistant <|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Another question<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>What is the weather?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|><|tool_call|><tool_call>\n{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}\n</tool_call><|end_of_text|>\n<|start_of_role|>tool_response<|end_of_role|>{\"temperature\": 72}<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>",
|
||||
/* .expected_output_jinja= */ "",
|
||||
/* .bos_token= */ "",
|
||||
/* .eos_token= */ "",
|
||||
/* .supported_with_jinja= */ true,
|
||||
/* .extra_conversation= */ {{"user", "What is the weather?"}, {"assistant_tool_call", "<tool_call>\n{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}\n</tool_call>"}, {"tool_response", "{\"temperature\": 72}"}},
|
||||
}
|
||||
};
|
||||
std::vector<char> formatted_chat(1024);
|
||||
@@ -627,11 +648,13 @@ int main_automated_tests(void) {
|
||||
|
||||
for (const auto & test_case : test_cases) {
|
||||
std::cout << "\n\n=== " << test_case.name << " ===\n\n";
|
||||
formatted_chat.resize(1024);
|
||||
auto conv = conversation;
|
||||
conv.insert(conv.end(), test_case.extra_conversation.begin(), test_case.extra_conversation.end());
|
||||
formatted_chat.resize(2048);
|
||||
res = llama_chat_apply_template(
|
||||
test_case.template_str.c_str(),
|
||||
conversation.data(),
|
||||
conversation.size(),
|
||||
conv.data(),
|
||||
conv.size(),
|
||||
add_generation_prompt,
|
||||
formatted_chat.data(),
|
||||
formatted_chat.size()
|
||||
@@ -658,11 +681,15 @@ int main_automated_tests(void) {
|
||||
}
|
||||
std::cout << "\n\n=== " << test_case.name << " (jinja) ===\n\n";
|
||||
try {
|
||||
auto msgs = messages;
|
||||
for (const auto & msg : test_case.extra_conversation) {
|
||||
msgs.push_back(simple_msg(msg.role, msg.content));
|
||||
}
|
||||
auto output = format_using_common(
|
||||
test_case.template_str,
|
||||
test_case.bos_token,
|
||||
test_case.eos_token,
|
||||
messages);
|
||||
msgs);
|
||||
auto expected_output = normalize_newlines(test_case.expected_output_jinja.empty() ? test_case.expected_output : test_case.expected_output_jinja);
|
||||
if (output != expected_output) {
|
||||
std::cout << "Template:```\n" << test_case.template_str << "\n```";
|
||||
|
||||
@@ -1929,6 +1929,22 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// .run();
|
||||
}
|
||||
|
||||
{
|
||||
// IBM Granite 4.0 (production template shared by h-tiny, h-small, micro)
|
||||
// Uses <tool_call> XML tags for tool calls, tools in system message
|
||||
auto tst = peg_tester("models/templates/ibm-granite-granite-4.0.jinja", detailed_debug);
|
||||
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
|
||||
"</tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.run();
|
||||
}
|
||||
|
||||
{
|
||||
// ByteDance-Seed-OSS (reasoning and tool calling model)
|
||||
auto tst = peg_tester("models/templates/ByteDance-Seed-OSS.jinja", detailed_debug);
|
||||
@@ -2712,6 +2728,67 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.run();
|
||||
}
|
||||
|
||||
// LFM2.5 tests - uses plain "List of tools: [...]" and bare [name(args)] without wrapper tokens
|
||||
{
|
||||
auto tst = peg_tester("models/templates/LFM2.5-Instruct.jinja", detailed_debug);
|
||||
|
||||
// Basic content only
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
|
||||
// Single tool call without reasoning
|
||||
tst.test("[special_function(arg1=1)]")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.run();
|
||||
|
||||
// Tool call with string argument
|
||||
tst.test("[get_time(city=\"XYZCITY\")]")
|
||||
.tools({ get_time_tool })
|
||||
.expect(message_with_tool_calls("get_time", "{\"city\":\"XYZCITY\"}"))
|
||||
.run();
|
||||
|
||||
// Tool call with reasoning (enable_thinking=true)
|
||||
tst.test("<think>I'm\nthinking</think>[special_function(arg1=1)]")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_thoughts)
|
||||
.run();
|
||||
|
||||
// Multiple tool calls (parallel)
|
||||
tst.test("[special_function(arg1=1), special_function_with_opt(arg1=1, arg2=2)]")
|
||||
.parallel_tool_calls(true)
|
||||
.tools({
|
||||
special_function_tool, special_function_tool_with_optional_param
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
// Tool call with content before tool call
|
||||
tst.test("Let me check the time.[get_time(city=\"Paris\")]")
|
||||
.tools({ get_time_tool })
|
||||
.expect(message_with_reasoning_content_and_multiple_tool_calls(
|
||||
"", "Let me check the time.", { { "get_time", "{\"city\":\"Paris\"}" } }
|
||||
))
|
||||
.run();
|
||||
|
||||
// Partial tool call (streaming)
|
||||
tst.test("[special_function(arg1=")
|
||||
.tools({ special_function_tool })
|
||||
.is_partial(true)
|
||||
.expect(simple_assist_msg("", "", "special_function", "{\"arg1\": "))
|
||||
.run();
|
||||
|
||||
// Tool call with empty arguments
|
||||
tst.test("[empty_args()]")
|
||||
.tools({ empty_args_tool })
|
||||
.expect(simple_assist_msg("", "", "empty_args", "{}"))
|
||||
.run();
|
||||
}
|
||||
|
||||
// Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
|
||||
// Format: <|tools_prefix|>[{"function_name": {...arguments...}}]<|tools_suffix|>
|
||||
{
|
||||
|
||||
@@ -143,7 +143,11 @@ bool server_http_context::init(const common_params & params) {
|
||||
"/v1/health",
|
||||
"/models",
|
||||
"/v1/models",
|
||||
"/api/tags"
|
||||
"/api/tags",
|
||||
"/",
|
||||
"/index.html",
|
||||
"/bundle.js",
|
||||
"/bundle.css",
|
||||
};
|
||||
|
||||
// If API key is not set, skip validation
|
||||
@@ -151,8 +155,8 @@ bool server_http_context::init(const common_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// If path is public or is static file, skip validation
|
||||
if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") {
|
||||
// If path is public or static file, skip validation
|
||||
if (public_endpoints.find(req.path) != public_endpoints.end()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -1196,6 +1196,10 @@ server_http_proxy::server_http_proxy(
|
||||
// disable Accept-Encoding to avoid compressed responses
|
||||
continue;
|
||||
}
|
||||
if (key == "Transfer-Encoding") {
|
||||
// the body is already decoded
|
||||
continue;
|
||||
}
|
||||
if (key == "Host" || key == "host") {
|
||||
bool is_default_port = (scheme == "https" && port == 443) || (scheme == "http" && port == 80);
|
||||
req.set_header(key, is_default_port ? host : host + ":" + std::to_string(port));
|
||||
|
||||
@@ -22,6 +22,15 @@ def test_access_public_endpoint(endpoint: str):
|
||||
assert "error" not in res.body
|
||||
|
||||
|
||||
def test_access_static_assets_without_api_key():
|
||||
"""Static web UI assets should not require API key authentication (issue #21229)"""
|
||||
global server
|
||||
server.start()
|
||||
for path in ["/", "/bundle.js", "/bundle.css"]:
|
||||
res = server.make_request("GET", path)
|
||||
assert res.status_code == 200, f"Expected 200 for {path}, got {res.status_code}"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("api_key", [None, "invalid-key"])
|
||||
def test_incorrect_api_key(api_key: str):
|
||||
global server
|
||||
|
||||
Reference in New Issue
Block a user