mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d298382ad9 | ||
|
|
32a28217f4 | ||
|
|
c429b33beb | ||
|
|
9146d36fe7 | ||
|
|
b9adcbbf92 | ||
|
|
9588f196b1 | ||
|
|
3cbd23ed88 | ||
|
|
00c6390793 |
14
.github/labeler.yml
vendored
14
.github/labeler.yml
vendored
@@ -1,5 +1,16 @@
|
||||
# https://github.com/actions/labeler
|
||||
|
||||
Kompute:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-kompute.h
|
||||
- ggml-kompute.cpp
|
||||
- README-kompute.md
|
||||
Apple Metal:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-metal.h
|
||||
- ggml-metal.cpp
|
||||
- README-metal.md
|
||||
SYCL:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
@@ -9,6 +20,7 @@ SYCL:
|
||||
Nvidia GPU:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- ggml-cuda.h
|
||||
- ggml-cuda/**
|
||||
Vulkan:
|
||||
- changed-files:
|
||||
|
||||
@@ -203,6 +203,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:
|
||||
|
||||
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
|
||||
|
||||
**Tools:**
|
||||
|
||||
- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
|
||||
|
||||
---
|
||||
|
||||
Here is a typical run using LLaMA v2 13B on M2 Ultra:
|
||||
|
||||
@@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
|
||||
params.interactive_specials = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--special") {
|
||||
params.special = true;
|
||||
return true;
|
||||
}
|
||||
if (arg == "--embedding") {
|
||||
params.embedding = true;
|
||||
return true;
|
||||
@@ -1362,6 +1366,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" --version show version and build info\n");
|
||||
printf(" -i, --interactive run in interactive mode\n");
|
||||
printf(" --special special tokens output enabled\n");
|
||||
printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
|
||||
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
||||
printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
|
||||
|
||||
@@ -146,6 +146,7 @@ struct gpt_params {
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
|
||||
bool special = false; // enable special token output
|
||||
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
|
||||
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
|
||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||
|
||||
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
|
||||
|
||||
params.custom_n_ctx = false;
|
||||
|
||||
params.use_flash = true;
|
||||
params.use_flash = false;
|
||||
params.use_checkpointing = true;
|
||||
|
||||
params.sample_start = "";
|
||||
|
||||
@@ -81,6 +81,7 @@ models = [
|
||||
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
|
||||
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
|
||||
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -473,6 +473,9 @@ class Model:
|
||||
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
|
||||
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
|
||||
res = "jina-v2-de"
|
||||
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
|
||||
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
|
||||
res = "smaug-bpe"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -2392,7 +2395,8 @@ class CommandR2Model(Model):
|
||||
|
||||
# max_position_embeddings = 8192 in config.json but model was actually
|
||||
# trained on 128k context length
|
||||
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
|
||||
# aya-23 models don't have model_max_length specified
|
||||
self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
|
||||
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
|
||||
|
||||
params.samples_start_after_nl = false;
|
||||
params.use_adam = true;
|
||||
params.use_flash = true;
|
||||
params.use_flash = false;
|
||||
params.use_scratch = true;
|
||||
|
||||
// only adam
|
||||
|
||||
@@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
|
||||
// display text
|
||||
if (input_echo && display) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
|
||||
printf("%s", token_str.c_str());
|
||||
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
|
||||
|
||||
// Console/Stream Output
|
||||
fprintf(stdout, "%s", token_str.c_str());
|
||||
|
||||
// Record Displayed Tokens To Log
|
||||
// Note: Generated tokens are created one by one hence this check
|
||||
if (embd.size() > 1) {
|
||||
// Incoming Requested Tokens
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
// Outgoing Generated Tokens
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// reset color to default if there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>SimpleChat (LlamaCPP, ...) </title>
|
||||
<title>SimpleChat LlamaCppEtal </title>
|
||||
<meta charset="UTF-8" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||
<meta name="message" content="Save Nature Save Earth" />
|
||||
@@ -30,20 +30,17 @@
|
||||
<hr>
|
||||
<div class="sameline">
|
||||
<label for="system-in">System</label>
|
||||
<input type="text" name="system" id="system-in" class="flex-grow"/>
|
||||
<input type="text" name="system" id="system-in" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"/>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
<div id="chat-div">
|
||||
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||
<p> Enter your text to the ai assistant below.</p>
|
||||
<p> Use shift+enter for inserting enter.</p>
|
||||
<p> Refresh the page to start over fresh.</p>
|
||||
<p> You need to have javascript enabled.</p>
|
||||
</div>
|
||||
|
||||
<hr>
|
||||
<div class="sameline">
|
||||
<textarea id="user-in" class="flex-grow" rows="3"></textarea>
|
||||
<textarea id="user-in" class="flex-grow" rows="3" placeholder="enter your query to the ai model here" ></textarea>
|
||||
<button id="user-btn">submit</button>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -14,11 +14,15 @@ own system prompts.
|
||||
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
|
||||
enough manner, in general.
|
||||
|
||||
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
|
||||
culling of old messages from the chat.
|
||||
Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
|
||||
console.
|
||||
|
||||
NOTE: It doesnt set any parameters other than temperature for now. However if someone wants they can update
|
||||
the js file as needed.
|
||||
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
|
||||
culling of old messages from the chat by default. However by enabling the sliding window chat logic, a crude
|
||||
form of old messages culling can be achieved.
|
||||
|
||||
NOTE: It doesnt set any parameters other than temperature and max_tokens for now. However if someone wants
|
||||
they can update the js file or equivalent member in gMe as needed.
|
||||
|
||||
|
||||
## usage
|
||||
@@ -43,11 +47,33 @@ next run this web front end in examples/server/public_simplechat
|
||||
### using the front end
|
||||
|
||||
Open this simple web front end from your local browser
|
||||
|
||||
* http://127.0.0.1:PORT/index.html
|
||||
|
||||
Once inside
|
||||
|
||||
* Select between chat and completion mode. By default it is set to chat mode.
|
||||
|
||||
* In completion mode
|
||||
* logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
|
||||
If the model requires any prefix wrt user role messages, then the end user has to
|
||||
explicitly add the needed prefix, when they enter their chat message.
|
||||
Similarly if the model requires any prefix to trigger assistant/ai-model response,
|
||||
then the end user needs to enter the same.
|
||||
This keeps the logic simple, while still giving flexibility to the end user to
|
||||
manage any templating/tagging requirement wrt their messages to the model.
|
||||
* the logic doesnt insert newline at the begining and end wrt the prompt message generated.
|
||||
However if the chat being sent to /completions end point has more than one role's message,
|
||||
then insert newline when moving from one role's message to the next role's message, so
|
||||
that it can be clearly identified/distinguished.
|
||||
* given that /completions endpoint normally doesnt add additional chat-templating of its
|
||||
own, the above ensures that end user can create a custom single/multi message combo with
|
||||
any tags/special-tokens related chat templating to test out model handshake. Or enduser
|
||||
can use it just for normal completion related/based query.
|
||||
|
||||
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
|
||||
Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
|
||||
responses with a suitable system prompt.
|
||||
* if chat.add_system_begin is used
|
||||
* you cant change the system prompt, after it is has been submitted once along with user query.
|
||||
* you cant set a system prompt, after you have submitted any user query
|
||||
@@ -55,27 +81,121 @@ Once inside
|
||||
* one can change the system prompt any time during chat, by changing the contents of system prompt.
|
||||
* inturn the updated/changed system prompt will be inserted into the chat session.
|
||||
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
|
||||
|
||||
* Enter your query and either press enter or click on the submit button.
|
||||
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
|
||||
|
||||
* Wait for the logic to communicate with the server and get the response.
|
||||
* the user is not allowed to enter any fresh query during this time.
|
||||
* the user input box will be disabled and a working message will be shown in it.
|
||||
|
||||
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
|
||||
|
||||
* Using NewChat one can start independent chat sessions.
|
||||
* two independent chat sessions are setup by default.
|
||||
|
||||
|
||||
## Devel note
|
||||
|
||||
### Reason behind this
|
||||
|
||||
The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
|
||||
by developers who may not be from web frontend background (so inturn may not be familiar with template /
|
||||
end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
|
||||
|
||||
And given that the idea is also to help explore/experiment for developers, some flexibility is provided
|
||||
to change behaviour easily using the devel-tools/console, for now. And skeletal logic has been implemented
|
||||
to explore some of the end points and ideas/implications around them.
|
||||
|
||||
|
||||
### General
|
||||
|
||||
Me/gMe consolidates the settings which control the behaviour into one object.
|
||||
One can see the current settings, as well as change/update them using browsers devel-tool/console.
|
||||
|
||||
bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
|
||||
communicating with the server or only sends the latest user query/message.
|
||||
|
||||
bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
|
||||
messages that get inserted into prompt field wrt /Completion endpoint.
|
||||
|
||||
chatRequestOptions - maintains the list of options/fields to send along with chat request,
|
||||
irrespective of whether /chat/completions or /completions endpoint.
|
||||
|
||||
If you want to add additional options/fields to send to the server/ai-model, and or
|
||||
modify the existing options value or remove them, for now you can update this global var
|
||||
using browser's development-tools/console.
|
||||
|
||||
iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
|
||||
This is disabled by default. However if enabled, then in addition to latest system message, only
|
||||
the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
|
||||
from the ai model will be sent to the ai-model, when querying for a new response. IE if enabled,
|
||||
only user messages after the latest system message/prompt will be considered.
|
||||
|
||||
This specified sliding window user message count also includes the latest user query.
|
||||
<0 : Send entire chat history to server
|
||||
0 : Send only the system message if any to the server
|
||||
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
||||
|
||||
|
||||
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
|
||||
implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||
some extent in a simple crude way.
|
||||
|
||||
|
||||
Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
|
||||
may not be visible. Also remember that just refreshing/reloading page in browser or for that
|
||||
matter clearing site data, dont directly override site caching in all cases. Worst case you may
|
||||
have to change port. Or in dev tools of browser, you may be able to disable caching fully.
|
||||
|
||||
|
||||
Concept of multiple chat sessions with different servers, as well as saving and restoring of
|
||||
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
|
||||
its instances relatively easily, however given the current goal of keeping this simple, it has
|
||||
not been added, for now.
|
||||
|
||||
|
||||
By switching between chat.add_system_begin/anytime, one can control whether one can change
|
||||
the system prompt, anytime during the conversation or only at the beginning.
|
||||
|
||||
|
||||
read_json_early, is to experiment with reading json response data early on, if available,
|
||||
so that user can be shown generated data, as and when it is being generated, rather than
|
||||
at the end when full data is available.
|
||||
|
||||
the server flow doesnt seem to be sending back data early, atleast for request (inc options)
|
||||
that is currently sent.
|
||||
|
||||
if able to read json data early on in future, as and when ai model is generating data, then
|
||||
this helper needs to indirectly update the chat div with the recieved data, without waiting
|
||||
for the overall data to be available.
|
||||
|
||||
|
||||
### Default setup
|
||||
|
||||
By default things are setup to try and make the user experience a bit better, if possible.
|
||||
However a developer when testing the server of ai-model may want to change these value.
|
||||
|
||||
Using iRecentUserMsgCnt reduce chat history context sent to the server/ai-model to be
|
||||
just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of
|
||||
full chat history. This way if there is any response with garbage/repeatation, it doesnt
|
||||
mess with things beyond the next question/request/query, in some ways.
|
||||
|
||||
Set max_tokens to 1024, so that a relatively large previous reponse doesnt eat up the space
|
||||
available wrt next query-response. However dont forget that the server when started should
|
||||
also be started with a model context size of 1k or more, to be on safe side.
|
||||
|
||||
The /completions endpoint of examples/server doesnt take max_tokens, instead it takes the
|
||||
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
|
||||
to /completions endpoint handling code on server side.
|
||||
|
||||
Frequency and presence penalty fields are set to 1.2 in the set of fields sent to server
|
||||
along with the user query. So that the model is partly set to try avoid repeating text in
|
||||
its response.
|
||||
|
||||
A end-user can change these behaviour by editing gMe from browser's devel-tool/console.
|
||||
|
||||
|
||||
## At the end
|
||||
|
||||
Also a thank you to all open source and open model developers, who strive for the common good.
|
||||
|
||||
@@ -48,6 +48,13 @@ button {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.ul1 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
.ul2 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
|
||||
* {
|
||||
margin: 0.6vmin;
|
||||
}
|
||||
|
||||
@@ -14,23 +14,86 @@ class ApiEP {
|
||||
}
|
||||
|
||||
let gUsageMsg = `
|
||||
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||
<p> Enter your text to the ai assistant below.</p>
|
||||
<p> Use shift+enter for inserting enter.</p>
|
||||
<p> Refresh the page to start over fresh.</p>
|
||||
<p class="role-system">Usage</p>
|
||||
<ul class="ul1">
|
||||
<li> Set system prompt above, to try control ai response charactersitic, if model supports same.</li>
|
||||
<ul class="ul2">
|
||||
<li> Completion mode normally wont have a system prompt.</li>
|
||||
</ul>
|
||||
<li> Enter your query to ai assistant below.</li>
|
||||
<ul class="ul2">
|
||||
<li> Completion mode doesnt insert user/role: prefix implicitly.</li>
|
||||
<li> Use shift+enter for inserting enter/newline.</li>
|
||||
</ul>
|
||||
<li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
|
||||
<ul class="ul2">
|
||||
<li> experiment iRecentUserMsgCnt, max_tokens, model ctxt window to expand</li>
|
||||
</ul>
|
||||
</ul>
|
||||
`;
|
||||
|
||||
/** @typedef {{role: string, content: string}[]} ChatMessages */
|
||||
|
||||
class SimpleChat {
|
||||
|
||||
constructor() {
|
||||
/**
|
||||
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
||||
* @type {{role: string, content: string}[]}
|
||||
* @type {ChatMessages}
|
||||
*/
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
clear() {
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recent chat messages.
|
||||
* If iRecentUserMsgCnt < 0
|
||||
* Then return the full chat history
|
||||
* Else
|
||||
* Return chat messages from latest going back till the last/latest system prompt.
|
||||
* While keeping track that the number of user queries/messages doesnt exceed iRecentUserMsgCnt.
|
||||
* @param {number} iRecentUserMsgCnt
|
||||
*/
|
||||
recent_chat(iRecentUserMsgCnt) {
|
||||
if (iRecentUserMsgCnt < 0) {
|
||||
return this.xchat;
|
||||
}
|
||||
if (iRecentUserMsgCnt == 0) {
|
||||
console.warn("WARN:SimpleChat:SC:RecentChat:iRecentUsermsgCnt of 0 means no user message/query sent");
|
||||
}
|
||||
/** @type{ChatMessages} */
|
||||
let rchat = [];
|
||||
let sysMsg = this.get_system_latest();
|
||||
if (sysMsg.length != 0) {
|
||||
rchat.push({role: Roles.System, content: sysMsg});
|
||||
}
|
||||
let iUserCnt = 0;
|
||||
let iStart = this.xchat.length;
|
||||
for(let i=this.xchat.length-1; i > this.iLastSys; i--) {
|
||||
if (iUserCnt >= iRecentUserMsgCnt) {
|
||||
break;
|
||||
}
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.User) {
|
||||
iStart = i;
|
||||
iUserCnt += 1;
|
||||
}
|
||||
}
|
||||
for(let i = iStart; i < this.xchat.length; i++) {
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.System) {
|
||||
continue;
|
||||
}
|
||||
rchat.push({role: msg.role, content: msg.content});
|
||||
}
|
||||
return rchat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an entry into xchat
|
||||
* @param {string} role
|
||||
@@ -57,7 +120,7 @@ class SimpleChat {
|
||||
div.replaceChildren();
|
||||
}
|
||||
let last = undefined;
|
||||
for(const x of this.xchat) {
|
||||
for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
let entry = document.createElement("p");
|
||||
entry.className = `role-${x.role}`;
|
||||
entry.innerText = `${x.role}: ${x.content}`;
|
||||
@@ -69,17 +132,21 @@ class SimpleChat {
|
||||
} else {
|
||||
if (bClear) {
|
||||
div.innerHTML = gUsageMsg;
|
||||
gMe.show_info(div);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint
|
||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
* The needed fields/options are picked from a global object.
|
||||
* Convert the json into string.
|
||||
* @param {Object} obj
|
||||
*/
|
||||
request_jsonstr(obj) {
|
||||
obj["temperature"] = 0.7;
|
||||
for(let k in gMe.chatRequestOptions) {
|
||||
obj[k] = gMe.chatRequestOptions[k];
|
||||
}
|
||||
return JSON.stringify(obj);
|
||||
}
|
||||
|
||||
@@ -88,18 +155,27 @@ class SimpleChat {
|
||||
*/
|
||||
request_messages_jsonstr() {
|
||||
let req = {
|
||||
messages: this.xchat,
|
||||
messages: this.recent_chat(gMe.iRecentUserMsgCnt),
|
||||
}
|
||||
return this.request_jsonstr(req);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string form of json object suitable for /completions
|
||||
* @param {boolean} bInsertStandardRolePrefix Insert "<THE_ROLE>: " as prefix wrt each role's message
|
||||
*/
|
||||
request_prompt_jsonstr() {
|
||||
request_prompt_jsonstr(bInsertStandardRolePrefix) {
|
||||
let prompt = "";
|
||||
for(const chat of this.xchat) {
|
||||
prompt += `${chat.role}: ${chat.content}\n`;
|
||||
let iCnt = 0;
|
||||
for(const chat of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
iCnt += 1;
|
||||
if (iCnt > 1) {
|
||||
prompt += "\n";
|
||||
}
|
||||
if (bInsertStandardRolePrefix) {
|
||||
prompt += `${chat.role}: `;
|
||||
}
|
||||
prompt += `${chat.content}`;
|
||||
}
|
||||
let req = {
|
||||
prompt: prompt,
|
||||
@@ -171,7 +247,6 @@ let gChatURL = {
|
||||
'chat': `${gBaseURL}/chat/completions`,
|
||||
'completion': `${gBaseURL}/completions`,
|
||||
}
|
||||
const gbCompletionFreshChatAlways = true;
|
||||
|
||||
|
||||
/**
|
||||
@@ -291,6 +366,8 @@ class MultiChatUI {
|
||||
// allow user to insert enter into their message using shift+enter.
|
||||
// while just pressing enter key will lead to submitting.
|
||||
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||
let value = this.elInUser.value;
|
||||
this.elInUser.value = value.substring(0,value.length-1);
|
||||
this.elBtnUser.click();
|
||||
ev.preventDefault();
|
||||
}
|
||||
@@ -321,6 +398,29 @@ class MultiChatUI {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try read json response early, if available.
|
||||
* @param {Response} resp
|
||||
*/
|
||||
async read_json_early(resp) {
|
||||
if (!resp.body) {
|
||||
throw Error("ERRR:SimpleChat:MCUI:ReadJsonEarly:No body...");
|
||||
}
|
||||
let tdUtf8 = new TextDecoder("utf-8");
|
||||
let rr = resp.body.getReader();
|
||||
let gotBody = "";
|
||||
while(true) {
|
||||
let { value: cur, done: done} = await rr.read();
|
||||
let curBody = tdUtf8.decode(cur);
|
||||
console.debug("DBUG:SC:PART:", curBody);
|
||||
gotBody += curBody;
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return JSON.parse(gotBody);
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle user query submit request, wrt specified chat session.
|
||||
* @param {string} chatId
|
||||
@@ -330,6 +430,14 @@ class MultiChatUI {
|
||||
|
||||
let chat = this.simpleChats[chatId];
|
||||
|
||||
// In completion mode, if configured, clear any previous chat history.
|
||||
// So if user wants to simulate a multi-chat based completion query,
|
||||
// they will have to enter the full thing, as a suitable multiline
|
||||
// user input/query.
|
||||
if ((apiEP == ApiEP.Completion) && (gMe.bCompletionFreshChatAlways)) {
|
||||
chat.clear();
|
||||
}
|
||||
|
||||
chat.add_system_anytime(this.elInSystem.value, chatId);
|
||||
|
||||
let content = this.elInUser.value;
|
||||
@@ -344,7 +452,7 @@ class MultiChatUI {
|
||||
if (apiEP == ApiEP.Chat) {
|
||||
theBody = chat.request_messages_jsonstr();
|
||||
} else {
|
||||
theBody = chat.request_prompt_jsonstr();
|
||||
theBody = chat.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
|
||||
}
|
||||
|
||||
this.elInUser.value = "working...";
|
||||
@@ -359,6 +467,7 @@ class MultiChatUI {
|
||||
});
|
||||
|
||||
let respBody = await resp.json();
|
||||
//let respBody = await this.read_json_early(resp);
|
||||
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
||||
let assistantMsg;
|
||||
if (apiEP == ApiEP.Chat) {
|
||||
@@ -376,13 +485,6 @@ class MultiChatUI {
|
||||
} else {
|
||||
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
||||
}
|
||||
// Purposefully clear at end rather than begin of this function
|
||||
// so that one can switch from chat to completion mode and sequece
|
||||
// in a completion mode with multiple user-assistant chat data
|
||||
// from before to be sent/occur once.
|
||||
if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
|
||||
chat.xchat.length = 0;
|
||||
}
|
||||
this.ui_reset_userinput();
|
||||
}
|
||||
|
||||
@@ -462,17 +564,66 @@ class MultiChatUI {
|
||||
}
|
||||
|
||||
|
||||
let gMuitChat;
|
||||
const gChatIds = [ "Default", "Other" ];
|
||||
class Me {
|
||||
|
||||
constructor() {
|
||||
this.defaultChatIds = [ "Default", "Other" ];
|
||||
this.multiChat = new MultiChatUI();
|
||||
this.bCompletionFreshChatAlways = true;
|
||||
this.bCompletionInsertStandardRolePrefix = false;
|
||||
this.iRecentUserMsgCnt = 2;
|
||||
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
this.chatRequestOptions = {
|
||||
"temperature": 0.7,
|
||||
"max_tokens": 1024,
|
||||
"frequency_penalty": 1.2,
|
||||
"presence_penalty": 1.2,
|
||||
"n_predict": 1024
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {HTMLDivElement} elDiv
|
||||
*/
|
||||
show_info(elDiv) {
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = "Settings (devel-tools-console gMe)";
|
||||
p.className = "role-system";
|
||||
elDiv.appendChild(p);
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = `bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `chatRequestOptions:${JSON.stringify(this.chatRequestOptions)}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** @type {Me} */
|
||||
let gMe;
|
||||
|
||||
function startme() {
|
||||
console.log("INFO:SimpleChat:StartMe:Starting...");
|
||||
gMuitChat = new MultiChatUI();
|
||||
for (let cid of gChatIds) {
|
||||
gMuitChat.new_chat_session(cid);
|
||||
gMe = new Me();
|
||||
for (let cid of gMe.defaultChatIds) {
|
||||
gMe.multiChat.new_chat_session(cid);
|
||||
}
|
||||
gMuitChat.setup_ui(gChatIds[0]);
|
||||
gMuitChat.show_sessions();
|
||||
gMe.multiChat.setup_ui(gMe.defaultChatIds[0], true);
|
||||
gMe.multiChat.show_sessions();
|
||||
}
|
||||
|
||||
document.addEventListener("DOMContentLoaded", startme);
|
||||
|
||||
@@ -4593,6 +4593,9 @@ static void llm_load_vocab(
|
||||
} else if (
|
||||
tokenizer_pre == "dbrx") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
||||
} else if (
|
||||
tokenizer_pre == "smaug-bpe") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
|
||||
} else {
|
||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
}
|
||||
@@ -12512,6 +12515,7 @@ struct llm_tokenizer_bpe {
|
||||
});
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
||||
case LLAMA_VOCAB_PRE_TYPE_SMAUG:
|
||||
word_collection = unicode_regex_split(text, {
|
||||
// same as llama3
|
||||
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
@@ -17861,6 +17865,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
||||
);
|
||||
}
|
||||
|
||||
bool llama_token_is_control(const struct llama_model * model, llama_token token) {
|
||||
return llama_is_control_token(model->vocab, token);
|
||||
}
|
||||
|
||||
llama_token llama_token_bos(const struct llama_model * model) {
|
||||
return model->vocab.special_bos_id;
|
||||
}
|
||||
|
||||
4
llama.h
4
llama.h
@@ -85,6 +85,7 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
@@ -823,6 +824,9 @@ extern "C" {
|
||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Identify if Token Id is a control token or a render-able token
|
||||
LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||
|
||||
Reference in New Issue
Block a user