Merge 7479388116 into 58062860af
commit e0298aa3c0
@@ -13,6 +13,7 @@ build*/
 models/*
 /llama-cli
+/llama-completion
 /llama-quantize
 arm_neon.h
@@ -44,6 +44,7 @@ body:
 - Documentation/Github
 - libllama (core library)
 - llama-cli
+- llama-completion
 - llama-server
 - llama-bench
 - llama-quantize
@@ -183,7 +183,8 @@ Add `ggml-ci` to commit message to trigger heavy CI workloads on the custom CI i
 
 ### Built Executables (in `build/bin/`)
 Primary tools:
-- **`llama-cli`**: Main inference tool
+- **`llama-cli`**: Main CLI tool
+- **`llama-completion`**: Legacy CLI tool
 - **`llama-server`**: OpenAI-compatible HTTP server
 - **`llama-quantize`**: Model quantization utility
 - **`llama-perplexity`**: Model evaluation tool
@@ -584,7 +584,7 @@ Command-line completion is available for some environments.
 
 #### Bash Completion
 ```bash
-$ build/bin/llama-cli --completion-bash > ~/.llama-completion.bash
+$ build/bin/llama-completion --completion-bash > ~/.llama-completion.bash
 $ source ~/.llama-completion.bash
 ```
 Optionally this can be added to your `.bashrc` or `.bash_profile` to load it
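A minimal sketch of the optional `.bashrc` step mentioned in the last context line above, assuming the completion script was written to `~/.llama-completion.bash` as in the hunk:

```bash
# Sketch: load the generated completion script in every new shell
# (path assumes the redirect target used in the hunk above)
if [ -f ~/.llama-completion.bash ]; then
    source ~/.llama-completion.bash
fi
```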
@@ -118,7 +118,7 @@ On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the
 *Notes:*
 
 - **Memory**
-  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-cli`.
+  - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/llama-completion`.
   - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
 
 - **Execution Unit (EU)**
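As a rough illustration of the memory note above, the reported buffer size can be filtered out of the run log; the flags and exact log label here are assumptions, the docs only quote `llm_load_tensors: buffer_size`:

```bash
# Sketch: run a single-token generation and grep the log for the model buffer size
./bin/llama-completion -m llama-2-7b.Q4_0.gguf -p "hi" -n 1 2>&1 | grep "llm_load_tensors"
```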
@@ -62,6 +62,7 @@ To generate an installable "package" simply use cmake --install:
 ...
 -- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-bench
 -- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-cli
+-- Installing: /workspace/pkg-adb/llama.cpp/bin/llama-completion
 ...
 ```
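The hunk context refers to `cmake --install`; a sketch of an invocation that would produce install output like the above, assuming the build directory is named `build` and the prefix matches the paths shown:

```bash
# Sketch: install the built binaries into a standalone package directory
cmake --install build --prefix /workspace/pkg-adb/llama.cpp
```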
@@ -53,7 +53,7 @@ M=gpt-oss-20b-Q4_0.gguf NDEV=4 D=HTP0,HTP1,HTP2,HTP3 P=surfing.txt scripts/snapd
 ...
 LD_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
 ADSP_LIBRARY_PATH=/data/local/tmp/llama.cpp/lib
-GGML_HEXAGON_NDEV=4 ./bin/llama-cli --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
+GGML_HEXAGON_NDEV=4 ./bin/llama-completion --no-mmap -m /data/local/tmp/llama.cpp/../gguf/gpt-oss-20b-Q4_0.gguf
   -t 4 --ctx-size 8192 --batch-size 128 -ctk q8_0 -ctv q8_0 -fa on -ngl 99 --device HTP0,HTP1,HTP2,HTP3 -no-cnv -f surfing.txt
 ...
 llama_model_loader: - type f32: 289 tensors
@@ -74,11 +74,11 @@ if __name__ == '__main__':
     for m in models:
         logging.info(f' - {m.hf_repo} / {m.hf_file}')
 
-    cli_path = os.environ.get(
+    completion_path = os.environ.get(
         'LLAMA_CLI_BIN_PATH',
         os.path.join(
             os.path.dirname(__file__),
-            '../build/bin/Release/llama-cli.exe' if os.name == 'nt' else '../build/bin/llama-cli'))
+            '../build/bin/Release/llama-completion.exe' if os.name == 'nt' else '../build/bin/llama-completion'))
 
     for m in models:
         if '<' in m.hf_repo or (m.hf_file is not None and '<' in m.hf_file):
@@ -86,9 +86,9 @@ if __name__ == '__main__':
         if m.hf_file is not None and '-of-' in m.hf_file:
             logging.warning(f'Skipping model at {m.hf_repo} / {m.hf_file} because it is a split file')
             continue
-        logging.info(f'Using llama-cli to ensure model {m.hf_repo}/{m.hf_file} was fetched')
+        logging.info(f'Using llama-completion to ensure model {m.hf_repo}/{m.hf_file} was fetched')
         cmd = [
-            cli_path,
+            completion_path,
             '-hfr', m.hf_repo,
             *([] if m.hf_file is None else ['-hff', m.hf_file]),
             '-n', '1',
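A usage sketch for the script changes above: the binary can still be overridden through `LLAMA_CLI_BIN_PATH`, which now falls back to the `llama-completion` binary. The script filename below is an assumption; it is not shown in the hunk:

```bash
# Sketch: point the model-fetching script at an explicit llama-completion binary
# (script path is assumed; only LLAMA_CLI_BIN_PATH appears in the diff)
LLAMA_CLI_BIN_PATH=build/bin/llama-completion python3 scripts/fetch_server_test_models.py
```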