Compare commits

...

3 Commits

Author SHA1 Message Date
Xuan Son Nguyen 902184dd3a
fix missing slash in `fs_get_cache_directory()` (#7503)
* fix missing slash in fs_get_cache_directory()

* use LOCALAPPDATA for fs_get_cache_directory()

* better code style
2024-05-25 13:30:59 +10:00
Mikko Juola 57684331fc
Make tokenize CLI tool have nicer command line arguments. (#6188)
* Make tokenizer.cpp CLI tool nicer.

Before this commit, tokenize was a simple CLI tool like this:

  tokenize MODEL_FILENAME PROMPT [--ids]

This simple tool loads the model, takes the prompt, and shows the tokens
llama.cpp is interpreting.

This changeset makes the tokenize tool more sophisticated, and more
useful for debugging and troubleshooting:

  tokenize [-m, --model MODEL_FILENAME]
           [--ids]
           [--stdin]
           [--prompt]
           [-f, --file]
           [--no-bos]
           [--log-disable]

It also behaves more nicely on Windows now, interpreting and rendering
Unicode from command-line arguments and pipes no matter what code page
the user has set on their terminal.

* style fix: strlen(str) == 0 --> *str == 0

* Simplify tokenize.cpp by getting rid of positional-style argument handling.

It must now be invoked with long --model, --prompt etc. arguments only.
Shortens the code.

* tokenize.cpp: iostream header no longer required

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: brian khuu <mofosyne@gmail.com>
2024-05-25 11:14:42 +10:00
compilade b83bab15a5
gguf-py : fix and simplify quantized shape round-trip (#7483)
* gguf-py : fix and simplify quantized shape round-trip

* gguf-py : remove unused import
2024-05-25 11:11:48 +10:00
7 changed files with 396 additions and 29 deletions

View File

@@ -1855,11 +1855,15 @@ bool fs_create_directory_with_parents(const std::string & path) {
 std::string fs_get_cache_directory() {
     std::string cache_directory = "";
+    auto ensure_trailing_slash = [](std::string p) {
+        // Make sure to add trailing slash
+        if (p.back() != DIRECTORY_SEPARATOR) {
+            p += DIRECTORY_SEPARATOR;
+        }
+        return p;
+    };
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
-        if (cache_directory.back() != DIRECTORY_SEPARATOR) {
-            cache_directory += DIRECTORY_SEPARATOR;
-        }
     } else {
 #ifdef __linux__
         if (std::getenv("XDG_CACHE_HOME")) {
@@ -1870,12 +1874,12 @@ std::string fs_get_cache_directory() {
 #elif defined(__APPLE__)
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
-        cache_directory = std::getenv("APPDATA");
+        cache_directory = std::getenv("LOCALAPPDATA");
 #endif // __linux__
+        cache_directory = ensure_trailing_slash(cache_directory);
         cache_directory += "llama.cpp";
-        cache_directory += DIRECTORY_SEPARATOR;
     }
-    return cache_directory;
+    return ensure_trailing_slash(cache_directory);
 }
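
The effect of the fix is easy to check. Previously the Windows branch concatenated the base directory and "llama.cpp" with no separator in between, yielding paths like "C:\Users\me\AppData\Roamingllama.cpp\". A rough Python rendering of the fixed logic, for illustration only (paths and env values are hypothetical; llama.cpp's DIRECTORY_SEPARATOR is '/' on POSIX and '\\' on Windows):

    SEP = "/"  # stand-in for DIRECTORY_SEPARATOR

    def ensure_trailing_slash(p: str) -> str:
        # mirrors the new C++ lambda: append the separator only when it is missing
        return p if p.endswith(SEP) else p + SEP

    def cache_directory(env: dict) -> str:
        if "LLAMA_CACHE" in env:
            cache = env["LLAMA_CACHE"]
        else:
            # platform base dir (XDG_CACHE_HOME branch shown); normalizing the
            # base *before* appending "llama.cpp" is the core of the fix
            cache = ensure_trailing_slash(env["XDG_CACHE_HOME"]) + "llama.cpp"
        return ensure_trailing_slash(cache)

    print(cache_directory({"XDG_CACHE_HOME": "/home/user/.cache"}))
    # -> /home/user/.cache/llama.cpp/
    print(cache_directory({"LLAMA_CACHE": "/tmp/llama-cache"}))
    # -> /tmp/llama-cache/

Normalizing in one place (the lambda) also covers the LLAMA_CACHE override, which previously had its own ad-hoc trailing-separator check.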

View File

@@ -313,11 +313,10 @@ class Model:
                 data = data.astype(np.float32)
                 data_qtype = gguf.GGMLQuantizationType.F32
 
-            block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
+            shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
             # reverse shape to make it similar to the internal ggml dimension order
-            shape_str = f"""{{{', '.join(str(n) for n in reversed(
-                (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
-            )}}}"""
+            shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
 
             # n_dims is implicit in the shape
             logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

View File

@@ -3,40 +3,390 @@
 #include <cmath>
 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <vector>
 
-int main(int argc, char ** argv) {
-    if (argc < 3 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <shellapi.h> // For CommandLineToArgvW
+#endif
+
+static void print_usage_information(const char * argv0, FILE * stream) {
+    fprintf(stream, "usage: %s [options]\n\n", argv0);
+    fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
+    fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
+    fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
+    fprintf(stream, "to control the behavior of the tokenizer.\n\n");
+    fprintf(stream, "  The possible options are:\n");
+    fprintf(stream, "\n");
+    fprintf(stream, "  -h, --help                           print this help and exit\n");
+    fprintf(stream, "  -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
+    fprintf(stream, "  --ids                                if given, only print numerical token IDs, and not token strings.\n");
+    fprintf(stream, "                                       The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
+    fprintf(stream, "  -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
+    fprintf(stream, "  -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
+    fprintf(stream, "  --stdin                              read prompt from standard input.\n");
+    fprintf(stream, "  --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
+    fprintf(stream, "  --log-disable                        disable logs. Makes stderr quiet when loading the model.\n");
+}
+
+static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) text;
+    (void) user_data;
+}
+
+static std::string read_prompt_from_file(const char * filepath, bool & success) {
+    success = false;
+
+    std::ifstream in(filepath, std::ios::binary);
+    if (!in) {
+        fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
+        return std::string();
+    }
+
+    // do not assume the file is seekable (e.g. /dev/stdin)
+    std::stringstream buffer;
+    buffer << in.rdbuf();
+    if (in.fail()) {
+        fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
+        return std::string();
+    }
+
+    success = true;
+    return buffer.str();
+}
+
+//
+// Function: ingest_args(...) -> vector<string>
+//
+// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
+// strings, as an STL vector<string>.
+//
+// In particular, it handles character encoding shenanigans on Windows.
+//
+// Note: raw_argc and raw_argv are not actually read at all on Windows.
+//       On Windows we call GetCommandLineW to get the arguments in wchar_t
+//       format, ignoring the regular argc/argv arguments to main().
+//
+// TODO: potential opportunity to roll common stuff into common/console.cpp
+//       in relation to Windows wchar_t shenanigans.
+static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
+    std::vector<std::string> argv;
+
+    // Handle Windows, if given non-ASCII arguments.
+    // We convert wchar_t arguments into UTF-8 char* on this platform.
+    // Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
+    // without throwing tantrums.
+#if defined(_WIN32)
+    int argc;
+    const LPWSTR cmdline_wargv = GetCommandLineW();
+    LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc);
+
+    // silence unused arg warnings
+    (void) raw_argc;
+    (void) raw_argv;
+
+    for (int i = 0; i < argc; ++i) {
+        int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL);
+        char * output_buf = (char *) calloc(length_needed+1, sizeof(char));
+        GGML_ASSERT(output_buf);
+        WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL);
+        output_buf[length_needed] = '\0';
+        argv.push_back(output_buf);
+        free(output_buf);
+    }
+
+    LocalFree((HLOCAL) wargv);
+#else
+    int argc = raw_argc;
+    for (int i = 0; i < argc; ++i) {
+        argv.push_back(raw_argv[i]);
+    }
+#endif
+
+    GGML_ASSERT((unsigned int) argc == argv.size());
+
+    return argv;
+}
+
+//
+// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
+//
+// writes a string to standard output; taking into account that on Windows
+// to display correctly you have to use special handling. Works even if the
+// user has not set a unicode code page on a Windows cmd.exe.
+//
+// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and
+// something human-readable is written instead.
+//
+// On non-Windows systems, simply printf()s the string.
+static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
+    invalid_utf8 = false;
+
+#if defined(_WIN32)
+    // Are we in a console?
+    HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
+    DWORD dwMode = 0;
+
+    // According to Microsoft docs:
+    // "WriteConsole fails if it is used with a standard handle that is redirected to a file."
+    // Also according to the docs, you can use GetConsoleMode to check for that.
+    if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
+        printf("%s", str);
+        return;
+    }
+
+    // MultiByteToWideChar reports an error if str is empty, don't report
+    // them as invalid_utf8.
+    if (*str == 0) {
+        return;
+    }
+
+    int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
+    if (length_needed == 0) {
+        DWORD err = GetLastError();
+        if (err == ERROR_NO_UNICODE_TRANSLATION) {
+            invalid_utf8 = true;
+            int len = strlen(str);
+            printf("<");
+            for (int i = 0; i < len; ++i) {
+                if (i > 0) {
+                    printf(" ");
+                }
+                printf("%02x", (uint8_t) str[i]);
+            }
+            printf(">");
+            return;
+        }
+        GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
+    }
+
+    LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
+    GGML_ASSERT(wstr);
+
+    MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed);
+    WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL);
+
+    free(wstr);
+#else
+    // TODO: reporting invalid_utf8 would be useful on non-Windows too.
+    // printf will silently just write bad unicode.
+    printf("%s", str);
+#endif
+}
+
+int main(int raw_argc, char ** raw_argv) {
+    const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
+    const int argc = argv.size();
+
+    if (argc <= 1) {
+        print_usage_information(argv[0].c_str(), stderr);
         return 1;
     }
 
-    const char * model_path = argv[1];
-    const char * prompt = argv[2];
+    //////
+    // Read out all the command line arguments.
+    //////
 
-    const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
+    // variables in which to put any arguments we see.
+    bool printing_ids = false;
+    bool no_bos = false;
+    bool disable_logging = false;
+    const char * model_path = NULL;
+    const char * prompt_path = NULL;
+    const char * prompt_arg = NULL;
+
+    // track which arguments were explicitly given
+    // used for sanity checking down the line
+    bool model_path_set = false;
+    bool prompt_path_set = false;
+    bool prompt_set = false;
+    bool stdin_set = false;
+
+    int iarg = 1;
+    for (; iarg < argc; ++iarg) {
+        std::string arg{argv[iarg]};
+        if (arg == "-h" || arg == "--help") {
+            print_usage_information(argv[0].c_str(), stdout);
+            return 0;
+        }
+        else if (arg == "--ids") {
+            printing_ids = true;
+        }
+        else if (arg == "-m" || arg == "--model") {
+            if (model_path_set) {
+                fprintf(stderr, "Error: -m or --model specified multiple times.\n");
+                return 1;
+            }
+            model_path = argv[++iarg].c_str();
+            model_path_set = true;
+        }
+        else if (arg == "--no-bos") {
+            no_bos = true;
+        }
+        else if (arg == "-p" || arg == "--prompt") {
+            if (prompt_set) {
+                fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
+                return 1;
+            }
+            prompt_arg = argv[++iarg].c_str();
+            prompt_set = true;
+        }
+        else if (arg == "-f" || arg == "--file") {
+            if (prompt_path_set) {
+                fprintf(stderr, "Error: -f or --file specified multiple times.\n");
+                return 1;
+            }
+            prompt_path = argv[++iarg].c_str();
+            prompt_path_set = true;
+        }
+        else if (arg == "--stdin") {
+            stdin_set = true;
+        }
+        else if (arg == "--log-disable") {
+            disable_logging = true;
+        }
+        else {
+            fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
+            return 1;
+        }
+    }
+
+    //////
+    // Sanity check the command line arguments.
+    //////
+
+    // Check that we have the required stuff set.
+    if (model_path_set && model_path == NULL) {
+        fprintf(stderr, "Error: --model requires an argument.\n");
+        return 1;
+    }
+    if (!model_path_set) {
+        fprintf(stderr, "Error: must specify --model.\n");
+        return 1;
+    }
+    if (prompt_path_set && prompt_path == NULL) {
+        fprintf(stderr, "Error: --file requires an argument.\n");
+        return 1;
+    }
+    if (prompt_set && prompt_arg == NULL) {
+        fprintf(stderr, "Error: --prompt requires an argument.\n");
+        return 1;
+    }
+
+    const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
+    if (prompts_set > 1) {
+        fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
+        return 1;
+    }
+    // Must have some prompt.
+    if (prompts_set == 0) {
+        fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
+        return 1;
+    }
+
+    GGML_ASSERT(model_path);
+    GGML_ASSERT(prompt_path || prompt_arg || stdin_set);
+
+    //////
+    // Figure out where the prompt will come from.
+    //////
+
+    std::string prompt;
+    if (prompt_path_set) {
+        bool success = false;
+        prompt = read_prompt_from_file(prompt_path, success);
+        if (!success) {
+            return 1;
+        }
+    } else if (prompt_set) {
+        prompt = prompt_arg;
+    } else {
+        GGML_ASSERT(stdin_set);
+        // we read stdin *after* loading the model (early exit if the model
+        // cannot be loaded, which can be a nicer user experience)
+    }
+
+    //////
+    // Start actually doing the tokenizing stuff.
+    //////
+
+#ifdef LOG_DISABLE_LOGS
+    disable_logging = true;
+#endif
+
+    if (disable_logging) {
+        llama_log_set(llama_log_callback_null, NULL);
+    }
 
     llama_backend_init();
 
     llama_model_params model_params = llama_model_default_params();
     model_params.vocab_only = true;
     llama_model * model = llama_load_model_from_file(model_path, model_params);
+    if (!model) {
+        fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
+        return 1;
+    }
 
     llama_context_params ctx_params = llama_context_default_params();
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+    if (!ctx) {
+        fprintf(stderr, "Error: could not create context.\n");
+        return 1;
+    }
+
+    // read entire prompt from stdin?
+    if (stdin_set) {
+        GGML_ASSERT(!prompt_path_set && !prompt_set);
+
+        std::stringstream stdin_buffer;
+        stdin_buffer << std::cin.rdbuf();
+        if (std::cin.fail()) {
+            fprintf(stderr, "Error: could not read the entire standard input.\n");
+            return 1;
+        }
+
+        prompt = stdin_buffer.str();
+    }
+
+    const bool model_wants_add_bos = llama_should_add_bos_token(model);
+    const bool add_bos = model_wants_add_bos && !no_bos;
 
     std::vector<llama_token> tokens;
-    tokens = ::llama_tokenize(model, prompt, true, true);
+    tokens = ::llama_tokenize(model, prompt, add_bos, true);
+
+    if (printing_ids) {
+        printf("[");
+    }
 
     for (int i = 0; i < (int) tokens.size(); i++) {
         if (printing_ids) {
-            printf("%d\n", tokens[i]);
+            if (i > 0) {
+                printf(", ");
+            }
+            printf("%d", tokens[i]);
         } else {
-            printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
+            bool invalid_utf8 = false;
+            printf("%6d -> '", tokens[i]);
+            write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
+            if (invalid_utf8) {
+                printf("' (utf-8 decode failure)\n");
+            } else {
+                printf("'\n");
+            }
         }
     }
+
+    if (printing_ids) {
+        printf("]\n");
+    }
+
+    // silence valgrind
+    llama_free(ctx);
+    llama_free_model(model);
 
     return 0;
 }
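
For reference, the reworked tool is driven entirely by long options; a few hypothetical invocations (binary, model, and prompt file names are placeholders):

  tokenize --model model.gguf --prompt "Hello world"
  tokenize --model model.gguf --file prompt.txt --ids --log-disable
  cat prompt.txt | tokenize --model model.gguf --stdin

With --ids the output is a single Python-parseable list such as [1, 2, 3]; without it, each token is printed on its own line as "    ID -> 'piece'", per the printf format in the loop above.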

View File

@@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
 import numpy as np
 import numpy.typing as npt
 
+from .quants import quant_shape_to_byte_shape
+
 if __name__ == "__main__":
     import sys
     from pathlib import Path
@@ -251,6 +253,7 @@ class GGUFReader:
             tensor_names.add(tensor_name)
             ggml_type = GGMLQuantizationType(raw_dtype[0])
             n_elems = int(np.prod(dims))
+            np_dims = tuple(reversed(dims.tolist()))
             block_size, type_size = GGML_QUANT_SIZES[ggml_type]
             n_bytes = n_elems * type_size // block_size
             data_offs = int(start_offs + offset_tensor[0])
@@ -279,6 +282,7 @@ class GGUFReader:
             else:
                 item_count = n_bytes
                 item_type = np.uint8
+                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
             tensors.append(ReaderTensor(
                 name = tensor_name,
                 tensor_type = ggml_type,
@@ -286,7 +290,7 @@ class GGUFReader:
                 n_elements = n_elems,
                 n_bytes = n_bytes,
                 data_offset = data_offs,
-                data = self._get(data_offs, item_type, item_count),
+                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                 field = field,
             ))
         self.tensors = tensors
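
Because ReaderTensor.data is now reshaped to np_dims up front, callers see the tensor's dimensions directly on the numpy array (with the last axis in bytes rather than elements for quantized types). A minimal sketch using the gguf-py API, assuming a local model file (path hypothetical):

    from gguf import GGUFReader

    reader = GGUFReader("model.gguf")  # hypothetical path
    for tensor in reader.tensors:
        # .data carries the reversed (numpy-order) dims; no manual flipping needed
        print(tensor.name, tensor.tensor_type.name, tensor.data.shape)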

View File

@@ -13,7 +13,6 @@ from string import ascii_letters, digits
 import numpy as np
 
 from .constants import (
-    GGML_QUANT_SIZES,
     GGUF_DEFAULT_ALIGNMENT,
     GGUF_MAGIC,
     GGUF_VERSION,
@@ -26,6 +25,8 @@ from .constants import (
     TokenType,
 )
 
+from .quants import quant_shape_from_byte_shape
+
 logger = logging.getLogger(__name__)
 
@@ -229,10 +230,7 @@ class GGUFWriter:
         else:
             dtype = raw_dtype
             if tensor_dtype == np.uint8:
-                block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
-                if tensor_shape[-1] % type_size != 0:
-                    raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
-                tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
+                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
         n_dims = len(tensor_shape)
         self.ti_data += self._pack("I", n_dims)
         for i in range(n_dims):

View File

@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Callable
+from typing import Callable, Sequence
 
 from numpy.typing import DTypeLike
@@ -9,6 +9,20 @@ from .lazy import LazyNumpyTensor
 import numpy as np
 
+def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % block_size != 0:
+        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
+    return (*shape[:-1], shape[-1] // block_size * type_size)
+
+
+def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % type_size != 0:
+        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
+    return (*shape[:-1], shape[-1] // type_size * block_size)
+
+
 # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
 def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
     n = n.astype(np.float32, copy=False).view(np.int32)
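
The two helpers invert each other along the last axis: one converts an element count per row into a byte count, the other converts back. A small worked check, assuming Q8_0's entry in GGML_QUANT_SIZES is (32, 34), i.e. blocks of 32 elements stored as 34 bytes (an fp16 scale plus 32 int8 values):

    from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType
    from gguf.quants import quant_shape_from_byte_shape, quant_shape_to_byte_shape

    qtype = GGMLQuantizationType.Q8_0
    block_size, type_size = GGML_QUANT_SIZES[qtype]        # assumed (32, 34)

    shape = (4096, 4096)                                   # element shape
    byte_shape = quant_shape_to_byte_shape(shape, qtype)   # 4096 // 32 * 34 -> (4096, 4352)
    assert quant_shape_from_byte_shape(byte_shape, qtype) == shape  # round-trip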

View File

@@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
     for tensor in reader.tensors:
         total_bytes += tensor.n_bytes
-        # Dimensions are written in reverse order, so flip them first
-        shape = np.flipud(tensor.shape).tolist()
-        writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
+        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
 
     bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)