diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index 83182c9700..db221b0b81 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -107,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"] # ENTRYPOINT ["/app/llama-server"] ### Target: light -# Lightweight image containing only llama-cli +# Lightweight image containing only llama-cli and llama-completion # ============================================================================== FROM base AS light diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index ef43d78cd2..6581187f32 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -23,11 +23,12 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH RUN echo "Building with static libs" && \ source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \ cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_BUILD_TESTS=OFF && \ - cmake --build build --config Release --target llama-cli + cmake --build build --config Release --target llama-cli && \ + cmake --build build --config Release --target llama-completion # TODO: use image with NNRT FROM ascendai/cann:$ASCEND_VERSION AS runtime -COPY --from=build /app/build/bin/llama-cli /llama-cli +COPY --from=build /app/build/bin/llama-cli /app/build/bin/llama-completion / ENV LC_ALL=C.utf8 diff --git a/.devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec index 3bbf4a4def..4d42a906b1 100644 --- a/.devops/llama-cpp-cuda.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -37,6 +37,7 @@ make -j GGML_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli +cp -p llama-completion %{buildroot}%{_bindir}/llama-cuda-completion cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple @@ -68,6 +69,7 @@ rm -rf %{_builddir}/* %files %{_bindir}/llama-cuda-cli +%{_bindir}/llama-cuda-completion %{_bindir}/llama-cuda-server %{_bindir}/llama-cuda-simple /usr/lib/systemd/system/llamacuda.service diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec index 45902dcf89..0a4f43058d 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/llama-cpp.srpm.spec @@ -39,6 +39,7 @@ make -j %install mkdir -p %{buildroot}%{_bindir}/ cp -p llama-cli %{buildroot}%{_bindir}/llama-cli +cp -p llama-completion %{buildroot}%{_bindir}/llama-completion cp -p llama-server %{buildroot}%{_bindir}/llama-server cp -p llama-simple %{buildroot}%{_bindir}/llama-simple @@ -70,6 +71,7 @@ rm -rf %{_builddir}/* %files %{_bindir}/llama-cli +%{_bindir}/llama-completion %{_bindir}/llama-server %{_bindir}/llama-simple /usr/lib/systemd/system/llama.service diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml index 1904e31fdc..e1bd08ddd2 100644 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -86,6 +86,7 @@ body: description: > If applicable, please copy and paste any relevant log output, including any generated text. This will be automatically formatted into code, so no need for backticks. + If you are encountering problems specifically with the `llama_params_fit` module, always upload `--verbose` logs as well. 
render: shell validations: required: false diff --git a/.github/workflows/server-webui.yml b/.github/workflows/server-webui.yml index f8a261eefa..544c4ad408 100644 --- a/.github/workflows/server-webui.yml +++ b/.github/workflows/server-webui.yml @@ -31,9 +31,10 @@ concurrency: cancel-in-progress: true jobs: - webui-setup: - name: WebUI Setup + webui-check: + name: WebUI Checks runs-on: ubuntu-latest + continue-on-error: true steps: - name: Checkout code uses: actions/checkout@v4 @@ -42,137 +43,66 @@ jobs: ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - name: Setup Node.js + id: node uses: actions/setup-node@v4 with: node-version: "22" cache: "npm" cache-dependency-path: "tools/server/webui/package-lock.json" - - name: Cache node_modules - uses: actions/cache@v4 - id: cache-node-modules - with: - path: tools/server/webui/node_modules - key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node-modules- - - name: Install dependencies - if: steps.cache-node-modules.outputs.cache-hit != 'true' + id: setup + if: ${{ steps.node.conclusion == 'success' }} run: npm ci working-directory: tools/server/webui - webui-check: - needs: webui-setup - name: WebUI Check - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: "22" - - - name: Restore node_modules cache - uses: actions/cache@v4 - with: - path: tools/server/webui/node_modules - key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node-modules- - - name: Run type checking + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run check working-directory: tools/server/webui - name: Run linting + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run lint working-directory: tools/server/webui - webui-build: - needs: webui-check - name: WebUI Build - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: "22" - - - name: Restore node_modules cache - uses: actions/cache@v4 - with: - path: tools/server/webui/node_modules - key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node-modules- - - name: Build application + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run build working-directory: tools/server/webui - webui-tests: - needs: webui-build - name: Run WebUI tests - permissions: - contents: read - - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: "22" - - - name: Restore node_modules cache - uses: actions/cache@v4 - with: - path: tools/server/webui/node_modules - key: ${{ runner.os }}-node-modules-${{ hashFiles('tools/server/webui/package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node-modules- - - name: Install Playwright browsers + id: playwright + 
if: ${{ always() && steps.setup.conclusion == 'success' }} run: npx playwright install --with-deps working-directory: tools/server/webui - name: Build Storybook + if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run build-storybook working-directory: tools/server/webui - name: Run Client tests + if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:client working-directory: tools/server/webui - - name: Run Server tests - run: npm run test:server + - name: Run Unit tests + if: ${{ always() && steps.playwright.conclusion == 'success' }} + run: npm run test:unit working-directory: tools/server/webui - name: Run UI tests + if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:ui -- --testTimeout=60000 working-directory: tools/server/webui - name: Run E2E tests + if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:e2e working-directory: tools/server/webui server-build: - needs: [webui-tests] runs-on: ubuntu-latest strategy: diff --git a/CODEOWNERS b/CODEOWNERS index 8a0c98c968..750096d9a1 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -32,7 +32,7 @@ /examples/export-docs/ @ggerganov /examples/gen-docs/ @ggerganov /examples/gguf/ @ggerganov -/examples/llama.android/ @ggerganov +/examples/llama.android/ @ggerganov @hanyin-arm @naco-siren /examples/llama.swiftui/ @ggerganov /examples/llama.vim @ggerganov /examples/lookahead/ @ggerganov diff --git a/README.md b/README.md index 5f2076d0a3..ed956bb02e 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) - Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi) - Go (no CGo needed): [hybridgroup/yzma](https://github.com/hybridgroup/yzma) +- Android: [llama.android](/examples/llama.android) diff --git a/SECURITY.md b/SECURITY.md index 9c86ae91b5..ae496f4e3d 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -68,3 +68,6 @@ Please disclose it as a private [security advisory](https://github.com/ggml-org/ Please note that using AI to identify vulnerabilities and generate reports is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before submitting the report. A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. 
+ +> [!IMPORTANT] +> For collaborators: if you are interested in helping out with reviewing privting security disclosures, please see: https://github.com/ggml-org/llama.cpp/discussions/18080 diff --git a/common/arg.cpp b/common/arg.cpp index 4c1b8493d3..03f430375b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -420,6 +420,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } }; + std::set seen_args; + for (int i = 1; i < argc; i++) { const std::string arg_prefix = "--"; @@ -430,6 +432,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context if (arg_to_options.find(arg) == arg_to_options.end()) { throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } + if (!seen_args.insert(arg).second) { + LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + } auto & tmp = arg_to_options[arg]; auto opt = *tmp.first; bool is_positive = tmp.second; @@ -750,6 +755,8 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map seen_args; + for (int i = 1; i < argc; i++) { const std::string arg_prefix = "--"; @@ -760,6 +767,9 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map(value, ',')) { + std::ifstream file(item); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str())); + } + params.in_files.push_back(item); } - params.in_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( @@ -1895,13 +1919,27 @@ common_params_context common_params_parser_init(common_params & params, llama_ex LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n"); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); - add_opt(common_arg( - {"-np", "--parallel"}, "N", - string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](common_params & params, int value) { - params.n_parallel = value; - } - ).set_env("LLAMA_ARG_N_PARALLEL")); + if (ex == LLAMA_EXAMPLE_SERVER) { + // this is to make sure this option appears in the server-specific section of the help message + add_opt(common_arg( + {"-np", "--parallel"}, "N", + string_format("number of server slots (default: %d, -1 = auto)", params.n_parallel), + [](common_params & params, int value) { + if (value == 0) { + throw std::invalid_argument("error: invalid value for n_parallel\n"); + } + params.n_parallel = value; + } + ).set_env("LLAMA_ARG_N_PARALLEL").set_examples({LLAMA_EXAMPLE_SERVER})); + } else { + add_opt(common_arg( + {"-np", "--parallel"}, "N", + string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](common_params & params, int value) { + params.n_parallel = value; + } + ).set_env("LLAMA_ARG_N_PARALLEL")); + } add_opt(common_arg( {"-ns", "--sequences"}, "N", string_format("number of sequences to decode (default: %d)", params.n_sequences), @@ -1950,9 +1988,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD")); add_opt(common_arg( {"--image", "--audio"}, "FILE", - "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n", + "path to an image or audio file. 
use with multimodal models, use comma-separated values for multiple files\n", [](common_params & params, const std::string & value) { - params.image.emplace_back(value); + for (const auto & item : string_split(value, ',')) { + params.image.emplace_back(item); + } } ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( @@ -2199,12 +2239,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( - {"--override-kv"}, "KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + {"--override-kv"}, "KEY=TYPE:VALUE,...", + "advanced option to override model metadata by key. to specify multiple overrides, either use comma-separated or repeat this argument.\n" + "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false,tokenizer.ggml.add_eos_token=bool:false", [](common_params & params, const std::string & value) { - if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); + std::vector kv_overrides; + + std::string current; + bool escaping = false; + + for (const char c : value) { + if (escaping) { + current.push_back(c); + escaping = false; + } else if (c == '\\') { + escaping = true; + } else if (c == ',') { + kv_overrides.push_back(current); + current.clear(); + } else { + current.push_back(c); + } + } + + if (escaping) { + current.push_back('\\'); + } + + kv_overrides.push_back(current); + + for (const auto & kv_override : kv_overrides) { + if (!string_parse_kv_override(kv_override.c_str(), params.kv_overrides)) { + throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", kv_override.c_str())); + } } } )); @@ -2218,33 +2285,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--lora"}, "FNAME", - "path to LoRA adapter (can be repeated to use multiple adapters)", + "path to LoRA adapter (use comma-separated values to load multiple adapters)", [](common_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0, "", "", nullptr }); + for (const auto & item : string_split(value, ',')) { + params.lora_adapters.push_back({ item, 1.0, "", "", nullptr }); + } } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( - {"--lora-scaled"}, "FNAME", "SCALE", - "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale), "", "", nullptr }); + {"--lora-scaled"}, "FNAME:SCALE,...", + "path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...)\n" + "note: use comma-separated values", + [](common_params & params, const std::string & value) { + for (const auto & item : string_split(value, ',')) { + auto parts = string_split(item, ':'); + if (parts.size() != 2) { + throw std::invalid_argument("lora-scaled format: FNAME:SCALE"); + } + params.lora_adapters.push_back({ parts[0], std::stof(parts[1]), "", "", nullptr }); + } } // we define this arg on 
both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--control-vector"}, "FNAME", - "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + "add a control vector\nnote: use comma-separated values to add multiple control vectors", [](common_params & params, const std::string & value) { - params.control_vectors.push_back({ 1.0f, value, }); + for (const auto & item : string_split(value, ',')) { + params.control_vectors.push_back({ 1.0f, item, }); + } } )); add_opt(common_arg( - {"--control-vector-scaled"}, "FNAME", "SCALE", + {"--control-vector-scaled"}, "FNAME:SCALE,...", "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.control_vectors.push_back({ std::stof(scale), fname }); + "note: use comma-separated values (format: FNAME:SCALE,...)", + [](common_params & params, const std::string & value) { + for (const auto & item : string_split(value, ',')) { + auto parts = string_split(item, ':'); + if (parts.size() != 2) { + throw std::invalid_argument("control-vector-scaled format: FNAME:SCALE"); + } + params.control_vectors.push_back({ std::stof(parts[1]), parts[0] }); + } } )); add_opt(common_arg( @@ -2334,13 +2418,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("HF_TOKEN")); add_opt(common_arg( {"--context-file"}, "FNAME", - "file to load context from (repeat to specify multiple files)", + "file to load context from (use comma-separated values to specify multiple files)", [](common_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + for (const auto & item : string_split(value, ',')) { + std::ifstream file(item, std::ios::binary); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", item.c_str())); + } + params.context_files.push_back(item); } - params.context_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( @@ -2531,6 +2617,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.api_prefix = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); + add_opt(common_arg( + {"--webui-config"}, "JSON", + "JSON that provides default WebUI settings (overrides WebUI defaults)", + [](common_params & params, const std::string & value) { + params.webui_config_json = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG")); + add_opt(common_arg( + {"--webui-config-file"}, "PATH", + "JSON file that provides default WebUI settings (overrides WebUI defaults)", + [](common_params & params, const std::string & value) { + params.webui_config_json = read_file(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE")); add_opt(common_arg( {"--webui"}, {"--no-webui"}, diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index 74a7b6a46d..1bcba9cd86 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -4,9 +4,14 @@ using json = nlohmann::json; -static std::string_view 
trim_trailing_space(std::string_view sv) { +static std::string_view trim_trailing_space(std::string_view sv, int max = -1) { + int count = 0; while (!sv.empty() && std::isspace(static_cast(sv.back()))) { + if (max != -1 && count <= max) { + break; + } sv.remove_suffix(1); + count++; } return sv; } @@ -93,7 +98,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) { if (is_arg_string && current_tool) { // Serialize to JSON, but exclude the end quote - std::string dumped = json(node.text).dump(); + std::string dumped = json(trim_trailing_space(node.text)).dump(); current_tool->arguments += dumped.substr(0, dumped.size() - 1); needs_closing_quote = true; } @@ -101,6 +106,7 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) { if (is_arg_close && current_tool) { if (needs_closing_quote) { current_tool->arguments += "\""; + needs_closing_quote = false; } } @@ -109,6 +115,10 @@ void common_chat_peg_constructed_mapper::map(const common_peg_ast_node & node) { } if (is_tool_close && current_tool) { + if (needs_closing_quote) { + current_tool->arguments += "\""; + needs_closing_quote = false; + } current_tool->arguments += "}"; } } diff --git a/common/chat.cpp b/common/chat.cpp index c371edaa5a..0a426f4478 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -711,6 +711,25 @@ static void foreach_function(const json & tools, const std::function & fn) { + if (!function.contains("parameters") || !function.at("parameters").is_object()) { + return; + } + const auto & params = function.at("parameters"); + if (!params.contains("properties") || !params.at("properties").is_object()) { + return; + } + const auto & props = params.at("properties"); + std::set required; + if (params.contains("required") && params.at("required").is_array()) { + params.at("required").get_to(required); + } + for (const auto & [name, prop] : props.items()) { + bool is_required = (required.find(name) != required.end()); + fn(name, prop, is_required); + } +} + static std::string apply( const common_chat_template & tmpl, const struct templates_params & inputs, @@ -1409,6 +1428,123 @@ static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_ return data; } +static common_chat_params common_chat_params_init_nemotron_v3(const common_chat_template & tmpl, const struct templates_params & inputs) { + common_chat_params data; + + data.prompt = apply(tmpl, inputs); + data.format = COMMON_CHAT_FORMAT_PEG_CONSTRUCTED; + + // Handle thinking tags appropriately based on inputs.enable_thinking + if (string_ends_with(data.prompt, "\n")) { + if (!inputs.enable_thinking) { + data.prompt += ""; + } else { + data.thinking_forced_open = true; + } + } + + data.preserved_tokens = { + "", + "", + "", + "", + }; + + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); + auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; + auto include_grammar = true; + + auto parser = build_chat_peg_constructed_parser([&](auto & p) { + auto reasoning = p.eps(); + if (inputs.enable_thinking && extract_reasoning) { + auto reasoning_content = p.reasoning(p.until("")) + ("" | p.end()); + if (data.thinking_forced_open) { + reasoning = reasoning_content; + } + } + + // Response format parser + if (inputs.json_schema.is_object() && !inputs.json_schema.empty()) { + return reasoning << p.content(p.schema(p.json(), "response-format", inputs.json_schema)); + } + + // Tool call parser + if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) { + 
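+        // The rules assembled below roughly follow this shape: one alternative per declared
+        // tool, each matching the tool's open/close markers plus a sequence of per-parameter
+        // rules. Parameters whose schema resolves to a string are captured as raw text
+        // (arg_string); all others are constrained by their JSON schema. Required parameters
+        // must appear exactly once, optional ones at most once, and the model may omit the
+        // closing marker of an argument.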
auto tool_choice = p.choice(); + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + std::string name = function.at("name"); + auto parameters = function.at("parameters"); + + auto schema_info = common_schema_info(); + schema_info.resolve_refs(parameters); + + auto tool_open = "\n"; + auto tool_close = p.literal("\n"); + auto args = p.sequence(); + auto arg_string = p.rule("xml-arg-string", p.until_one_of({ + "\n", + "\n" + })); + + foreach_parameter(function, [&](const auto & param_name, const json & param_schema, bool is_required) { + auto rule_name = "tool-" + name + "-arg-" + param_name; + + auto arg_open = "\n"; + auto arg_close = p.literal("\n"); + auto arg_value = p.eps(); + + if (schema_info.resolves_to_string(param_schema)) { + arg_value = p.tool_arg_string_value(arg_string) + "\n"; + } else { + arg_value = p.tool_arg_json_value(p.schema(p.json(), rule_name + "-schema", param_schema)); + } + + // Model may or my not close with + auto arg_rule = p.rule(rule_name, p.tool_arg_open(arg_open) + arg_value + p.optional(p.tool_arg_close(arg_close))); + args += p.repeat(arg_rule, /* min = */ is_required ? 1 : 0, /* max = */ 1); + }); + + tool_choice |= p.rule("tool-" + name, p.tool_open(tool_open) + args + p.tool_close(tool_close)); + }); + + auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0; + auto max_calls = inputs.parallel_tool_calls ? -1 : 1; + auto tool_call = p.rule("tool-call", "\n" + tool_choice + "" + p.space()); + auto tool_calls = p.trigger_rule("tool-call-root", p.repeat(tool_call, /* min = */ min_calls, /* max = */ max_calls)); + + return reasoning << p.content(p.until("")) << tool_calls; + } + + // Content only parser + include_grammar = false; + return reasoning << p.content(p.rest()); + }); + + data.parser = parser.save(); + + if (include_grammar) { + data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO; + + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + auto schema = function.at("parameters"); + builder.resolve_refs(schema); + }); + parser.build_grammar(builder, data.grammar_lazy); + }); + + data.grammar_triggers = { + {COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ""} + }; + } + + return data; +} + + static common_chat_params common_chat_params_init_apertus(const common_chat_template & tmpl, const struct templates_params & inputs) { common_chat_params data; @@ -2534,6 +2670,10 @@ static common_chat_params common_chat_templates_apply_jinja( src.find("") != std::string::npos && src.find("") != std::string::npos) { + return common_chat_params_init_nemotron_v3(tmpl, params); + } return common_chat_params_init_qwen3_coder_xml(tmpl, params); } diff --git a/common/common.cpp b/common/common.cpp index 8e4490ba71..7a89f16250 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1093,7 +1093,7 @@ common_init_result::common_init_result(common_params & params) : auto cparams = common_context_params_to_llama(params); if (params.fit_params) { - LOG_INF("%s: fitting params to device memory, to report bugs during this step use -fit off (or --verbose if you can't)\n", __func__); + LOG_INF("%s: fitting params to device memory, for bugs during this step try to reproduce them with -fit off, or provide --verbose logs if the bug only occurs with -fit on\n", __func__); llama_params_fit(params.model.path.c_str(), &mparams, &cparams, params.tensor_split, 
params.tensor_buft_overrides.data(), params.fit_params_target, params.fit_params_min_ctx, params.verbosity >= 4 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); diff --git a/common/common.h b/common/common.h index 2896dc388f..431bc6f3dc 100644 --- a/common/common.h +++ b/common/common.h @@ -486,8 +486,11 @@ struct common_params { std::map default_template_kwargs; + // webui configs + bool webui = true; + std::string webui_config_json; + // "advanced" endpoints are disabled by default for better security - bool webui = true; bool endpoint_slots = true; bool endpoint_props = false; // only control POST requests, not GET bool endpoint_metrics = false; diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index c3b4e5d9dc..2f67c74d79 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -305,8 +305,9 @@ static std::string format_literal(const std::string & literal) { std::string gbnf_format_literal(const std::string & literal) { return format_literal(literal); } -class SchemaConverter { +class common_schema_converter { private: + friend class common_schema_info; friend std::string build_grammar(const std::function & cb, const common_grammar_options & options); std::function _fetch_json; bool _dotall; @@ -729,7 +730,7 @@ private: } public: - SchemaConverter( + common_schema_converter( const std::function & fetch_json, bool dotall) : _fetch_json(fetch_json), _dotall(dotall) @@ -990,6 +991,134 @@ public: } }; +// common_schema_info implementation (pimpl) + +common_schema_info::common_schema_info() + : impl_(std::make_unique( + [](const std::string &) { return json(); }, + false)) {} + +common_schema_info::~common_schema_info() = default; + +common_schema_info::common_schema_info(common_schema_info &&) noexcept = default; +common_schema_info & common_schema_info::operator=(common_schema_info &&) noexcept = default; + +void common_schema_info::resolve_refs(nlohmann::ordered_json & schema) { + impl_->resolve_refs(schema, ""); +} + +// Determines if a JSON schema can resolve to a string type through any path. +// Some models emit raw string values rather than JSON-encoded strings for string parameters. +// If any branch of the schema (via oneOf, anyOf, $ref, etc.) permits a string, this returns +// true, allowing callers to handle the value as a raw string for simplicity. 
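+// For example, schemas like { "type": "string" }, { "type": ["string", "null"] } or
+// { "anyOf": [ { "type": "integer" }, { "type": "string" } ] } all resolve to a string,
+// while a plain { "type": "integer" } does not.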
+bool common_schema_info::resolves_to_string(const nlohmann::ordered_json & schema) { + std::unordered_set visited_refs; + + std::function check = [&](const json & s) -> bool { + if (!s.is_object()) { + return false; + } + + // Handle $ref + if (s.contains("$ref")) { + const std::string & ref = s["$ref"]; + if (visited_refs.find(ref) != visited_refs.end()) { + // Circular reference, assume not a string to be safe + return false; + } + visited_refs.insert(ref); + auto it = impl_->_refs.find(ref); + if (it != impl_->_refs.end()) { + return check(it->second); + } + return false; + } + + // Check type field + if (s.contains("type")) { + const json & schema_type = s["type"]; + if (schema_type.is_string()) { + if (schema_type == "string") { + return true; + } + } else if (schema_type.is_array()) { + // Type can be an array like ["string", "null"] + for (const auto & t : schema_type) { + if (t == "string") { + return true; + } + } + } + } + + // Check oneOf/anyOf - if any alternative can be a string + if (s.contains("oneOf")) { + for (const auto & alt : s["oneOf"]) { + if (check(alt)) { + return true; + } + } + } + if (s.contains("anyOf")) { + for (const auto & alt : s["anyOf"]) { + if (check(alt)) { + return true; + } + } + } + + // Check allOf - all components must be compatible with string type + if (s.contains("allOf")) { + bool all_string = true; + for (const auto & component : s["allOf"]) { + if (!check(component)) { + all_string = false; + break; + } + } + if (all_string) { + return true; + } + } + + // Check const - if the constant value is a string + if (s.contains("const")) { + if (s["const"].is_string()) { + return true; + } + } + + // Check enum - if any enum value is a string + if (s.contains("enum")) { + for (const auto & val : s["enum"]) { + if (val.is_string()) { + return true; + } + } + } + + // String-specific keywords imply string type + if (s.contains("pattern") || s.contains("minLength") || s.contains("maxLength")) { + return true; + } + + // Check format - many formats imply string + if (s.contains("format")) { + const std::string & fmt = s["format"]; + if (fmt == "date" || fmt == "time" || fmt == "date-time" || + fmt == "uri" || fmt == "email" || fmt == "hostname" || + fmt == "ipv4" || fmt == "ipv6" || fmt == "uuid" || + fmt.find("uuid") == 0) { + return true; + } + } + + return false; + }; + + return check(schema); +} + std::string json_schema_to_grammar(const json & schema, bool force_gbnf) { #ifdef LLAMA_USE_LLGUIDANCE if (!force_gbnf) { @@ -1006,7 +1135,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) { } std::string build_grammar(const std::function & cb, const common_grammar_options & options) { - SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall); + common_schema_converter converter([&](const std::string &) { return json(); }, options.dotall); common_grammar_builder builder { /* .add_rule = */ [&](const std::string & name, const std::string & rule) { return converter._add_rule(name, rule); diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h index c89ab7f997..240d642311 100644 --- a/common/json-schema-to-grammar.h +++ b/common/json-schema-to-grammar.h @@ -3,11 +3,31 @@ #include #include +#include #include std::string json_schema_to_grammar(const nlohmann::ordered_json & schema, bool force_gbnf = false); +class common_schema_converter; + +// Probes a JSON schema to extract information about its structure and type constraints. 
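+// Typical use (sketch, mirroring common/chat.cpp; the "properties"/"name" keys below are
+// illustrative only): resolve $refs once on the parameter object, then query each property:
+//
+//   common_schema_info info;
+//   info.resolve_refs(parameters);
+//   bool is_raw_string = info.resolves_to_string(parameters.at("properties").at("name"));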
+class common_schema_info { + std::unique_ptr impl_; + + public: + common_schema_info(); + ~common_schema_info(); + + common_schema_info(const common_schema_info &) = delete; + common_schema_info & operator=(const common_schema_info &) = delete; + common_schema_info(common_schema_info &&) noexcept; + common_schema_info & operator=(common_schema_info &&) noexcept; + + void resolve_refs(nlohmann::ordered_json & schema); + bool resolves_to_string(const nlohmann::ordered_json & schema); +}; + struct common_grammar_builder { std::function add_rule; std::function add_schema; diff --git a/common/peg-parser.cpp b/common/peg-parser.cpp index dec99e1820..f2fc84500f 100644 --- a/common/peg-parser.cpp +++ b/common/peg-parser.cpp @@ -425,7 +425,7 @@ struct parser_executor { if (result.need_more_input()) { // Propagate - need to know what child would match before negating - return result; + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos); } // Child failed, so negation succeeds diff --git a/common/sampling.cpp b/common/sampling.cpp index aefc596443..48e8addc23 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -104,10 +104,9 @@ struct ring_buffer { struct common_sampler { common_params_sampling params; + struct llama_sampler * grmr; struct llama_sampler * chain; - bool grammar; - ring_buffer prev; std::vector cur; @@ -184,15 +183,14 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co lparams.no_perf = params.no_perf; + llama_sampler * grmr = nullptr; llama_sampler * chain = llama_sampler_chain_init(lparams); - bool grammar = false; std::vector samplers; if (params.grammar.compare(0, 11, "%llguidance") == 0) { #ifdef LLAMA_USE_LLGUIDANCE - samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str())); - grammar = true; + grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()); #else GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); #endif // LLAMA_USE_LLGUIDANCE @@ -241,15 +239,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co if (!params.grammar.empty()) { if (params.grammar_lazy) { - samplers.push_back( - llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", - trigger_patterns_c.data(), trigger_patterns_c.size(), - trigger_tokens.data(), trigger_tokens.size())); + grmr = llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", + trigger_patterns_c.data(), trigger_patterns_c.size(), + trigger_tokens.data(), trigger_tokens.size()); } else { - samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root")); + grmr = llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"); } - - grammar = true; } } @@ -320,8 +315,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co auto * result = new common_sampler { /* .params = */ params, + /* .grmr = */ grmr, /* .chain = */ chain, - /* .grammar = */ grammar, /* .prev = */ ring_buffer(std::max(32, params.n_prev)), /* .cur = */ {}, /* .cur_p = */ {}, @@ -332,6 +327,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co void common_sampler_free(struct common_sampler * gsmpl) { if (gsmpl) { + llama_sampler_free(gsmpl->grmr); llama_sampler_free(gsmpl->chain); delete gsmpl; @@ -341,25 +337,12 @@ void common_sampler_free(struct common_sampler * gsmpl) { void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { const auto tm = 
gsmpl->tm(); - if (gsmpl->grammar) { - const int n_smpl = llama_sampler_chain_n(gsmpl->chain); - - for (int i = 0; i < n_smpl; i++) { - auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); - - // the grammar sampler is always the first one - if (i == 0) { - if (accept_grammar) { - llama_sampler_accept(smpl, token); - } - } else { - llama_sampler_accept(smpl, token); - } - } - } else { - llama_sampler_accept(gsmpl->chain, token); + if (gsmpl->grmr && accept_grammar) { + llama_sampler_accept(gsmpl->grmr, token); } + llama_sampler_accept(gsmpl->chain, token); + gsmpl->prev.push_back(token); } @@ -370,8 +353,8 @@ void common_sampler_reset(struct common_sampler * gsmpl) { struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { return new common_sampler { /* .params = */ gsmpl->params, + /* .grmr = */ llama_sampler_clone(gsmpl->grmr), /* .chain = */ llama_sampler_clone(gsmpl->chain), - /* .grammar = */ gsmpl->grammar, /* .prev = */ gsmpl->prev, /* .cur = */ gsmpl->cur, /* .cur_p = */ gsmpl->cur_p, @@ -427,7 +410,7 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) { return gsmpl->chain; } -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) { +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { llama_synchronize(ctx); // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations @@ -435,6 +418,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co llama_token id = LLAMA_TOKEN_NULL; + auto & grmr = gsmpl->grmr; auto & chain = gsmpl->chain; auto & cur_p = gsmpl->cur_p; // initialized by set_logits @@ -457,6 +441,36 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co gsmpl->set_logits(ctx, idx); + if (grammar_first) { + llama_sampler_apply(grmr, &cur_p); + } + + llama_sampler_apply(chain, &cur_p); + + id = cur_p.data[cur_p.selected].id; + + if (grammar_first) { + return id; + } + + // check if it the sampled token fits the grammar (grammar-based rejection sampling) + { + llama_token_data single_token_data = { id, 1.0f, 0.0f }; + llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; + + llama_sampler_apply(grmr, &single_token_data_array); + + const bool is_valid = single_token_data_array.data[0].logit != -INFINITY; + if (is_valid) { + return id; + } + } + + // resampling: + // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain + gsmpl->set_logits(ctx, idx); + + llama_sampler_apply(grmr, &cur_p); llama_sampler_apply(chain, &cur_p); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); @@ -466,7 +480,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co return id; } -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft) { +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first) { GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); std::vector result; @@ -474,7 +488,7 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample size_t i = 0; for (; i < draft.size(); 
i++) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]); + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); common_sampler_accept(gsmpl, id, true); @@ -486,7 +500,7 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample } if (i == draft.size()) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]); + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); common_sampler_accept(gsmpl, id, true); @@ -496,13 +510,13 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample return result; } -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) { +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { std::vector idxs(draft.size() + 1); for (size_t i = 0; i < idxs.size(); ++i) { idxs[i] = i; } - return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft); + return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first); } uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { diff --git a/common/sampling.h b/common/sampling.h index ace5d3d020..c7101032f2 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -57,7 +57,10 @@ struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl); // - check if the token fits the grammar (if any) // - if not: resample by first applying the grammar constraints and then sampling again (slower path) // -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx); +// if grammar_first is true, the grammar is applied before the samplers (slower) +// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar +// +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); // generalized version of common_sampler_sample // @@ -75,10 +78,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co // // returns at least 1 token, up to idxs.size() // -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft); +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first = false); // assume idxs == [ 0, 1, 2, ..., draft.size() ] -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft); +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); diff --git a/common/speculative.cpp b/common/speculative.cpp index 1e12383ae6..3e83b0964c 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft( for (int i = 0; i < params.n_draft; ++i) { common_batch_clear(batch); - common_sampler_sample(smpl, ctx_dft, 0); + common_sampler_sample(smpl, ctx_dft, 0, true); const auto * cur_p = common_sampler_get_candidates(smpl, true); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 01b15e4b9a..86fe0b5f17 
100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -189,10 +189,10 @@ class ModelBase: return tensors prefix = "model" if not self.is_mistral_format else "consolidated" - part_names: set[str] = set(ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")) + part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") is_safetensors: bool = len(part_names) > 0 if not is_safetensors: - part_names = set(ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")) + part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") tensor_names_from_index: set[str] = set() @@ -209,7 +209,8 @@ class ModelBase: if weight_map is None or not isinstance(weight_map, dict): raise ValueError(f"Can't load 'weight_map' from {index_name!r}") tensor_names_from_index.update(weight_map.keys()) - part_names |= set(weight_map.values()) + part_dict: dict[str, None] = dict.fromkeys(weight_map.values(), None) + part_names = sorted(part_dict.keys()) else: weight_map = {} else: @@ -862,6 +863,14 @@ class TextModel(ModelBase): logger.warning(f"Unknown RoPE type: {rope_type}") logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}") + if "mrope_section" in self.rope_parameters: + mrope_section = self.rope_parameters["mrope_section"] + # Pad to 4 dimensions [time, height, width, extra] + while len(mrope_section) < 4: + mrope_section.append(0) + self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) + logger.info(f"gguf: mrope sections: {mrope_section[:4]}") + if (rope_theta := rope_params.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) logger.info(f"gguf: rope theta = {rope_theta}") @@ -1830,7 +1839,7 @@ class MmprojModel(ModelBase): def tensor_force_quant(self, name, new_name, bid, n_dims): del bid, name, n_dims # unused - if ".patch_embd.weight" in new_name: + if ".patch_embd.weight" in new_name or ".patch_merger.weight" in new_name: return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32 return False @@ -3739,9 +3748,6 @@ class Qwen2VLModel(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() - mrope_section = self.hparams["rope_scaling"]["mrope_section"] - mrope_section += [0] * max(0, 4 - len(mrope_section)) - self.gguf_writer.add_rope_dimension_sections(mrope_section) def set_vocab(self): try: @@ -4377,6 +4383,30 @@ class Qwen3VLVisionModel(MmprojModel): return super().modify_tensors(data_torch, name, bid) +@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration") +class Glm4VVisionModel(Qwen3VLVisionModel): + def set_gguf_parameters(self): + MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters + assert self.hparams_vision is not None + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + + hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower() + if hidden_act == "gelu": + self.gguf_writer.add_vision_use_gelu(True) + elif hidden_act == "silu": + self.gguf_writer.add_vision_use_silu(True) + + rms_norm_eps = self.hparams_vision.get("rms_norm_eps", 1e-5) + self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("model.visual."): + name = name.replace("model.visual.", "visual.") + if name.startswith("visual.merger."): + return [(self.map_tensor_name(name), data_torch)] + 
return super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Qwen3VLForConditionalGeneration") class Qwen3VLTextModel(Qwen3Model): model_arch = gguf.MODEL_ARCH.QWEN3VL @@ -4385,20 +4415,6 @@ class Qwen3VLTextModel(Qwen3Model): super().set_gguf_parameters() # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL - text_config = self.hparams.get("text_config", {}) - # rope_scaling is deprecated in V5, use rope_parameters instead - rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {} - - if rope_scaling.get("mrope_section"): - # mrope_section contains [time, height, width] dimensions - mrope_section = rope_scaling["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - - logger.info(f"MRoPE sections: {mrope_section[:4]}") - vision_config = self.hparams.get("vision_config", {}) deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) @@ -4417,22 +4433,6 @@ class Qwen3VLMoeTextModel(Qwen3MoeModel): def set_gguf_parameters(self): super().set_gguf_parameters() - - # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL - text_config = self.hparams.get("text_config", {}) - # rope_scaling is deprecated in V5, use rope_parameters instead - rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {} - - if rope_scaling.get("mrope_section"): - # mrope_section contains [time, height, width] dimensions - mrope_section = rope_scaling["mrope_section"] - # Pad to 4 dimensions [time, height, width, extra] - while len(mrope_section) < 4: - mrope_section.append(0) - self.gguf_writer.add_rope_dimension_sections(mrope_section[:4]) - - logger.info(f"MRoPE sections: {mrope_section[:4]}") - vision_config = self.hparams.get("vision_config", {}) deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", [])) self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num) @@ -7795,6 +7795,15 @@ class JaisModel(TextModel): @ModelBase.register("Glm4ForCausalLM", "Glm4vForConditionalGeneration") class Glm4Model(TextModel): model_arch = gguf.MODEL_ARCH.GLM4 + use_mrope = False + partial_rotary_factor = 0.5 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 0.5) + if "mrope_section" in self.rope_parameters: + self.use_mrope = True + logger.info("Q/K weight will need to be permuted for M-RoPE") def set_vocab(self): from transformers import AutoTokenizer @@ -7816,17 +7825,49 @@ class Glm4Model(TextModel): super().set_gguf_parameters() if (rope_dim := self.hparams.get("head_dim")) is None: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.partial_rotary_factor)) + + @staticmethod + def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, partial_rotary_factor: float) -> Tensor: + orig_shape = weights.shape + if len(orig_shape) == 1: + weights = weights.unsqueeze(1) # [out_dim, 1] + if len(weights.shape) != 2: + raise ValueError("Only 1D and 2D tensors are supported.") + n_effective_heads = weights.shape[0] // head_dim + if n_head_kv is not None and 
n_effective_heads != n_head: + if n_effective_heads != n_head_kv: + raise AssertionError(f"Mismatch in effective heads: computed {n_effective_heads}, expected {n_head} or {n_head_kv}") + rotary_dim = int(head_dim * partial_rotary_factor) + if rotary_dim % 2 != 0: + raise ValueError("rotary_dim must be even.") + reshaped = weights.reshape(n_effective_heads, head_dim, -1) + rot_part = reshaped[:, :rotary_dim, :] + non_rot_part = reshaped[:, rotary_dim:, :] + permuted_rot = torch.cat((rot_part[:, ::2, :], rot_part[:, 1::2, :]), dim=1) + combined = torch.cat((permuted_rot, non_rot_part), dim=1) + result = combined.reshape(weights.shape) + return result if len(orig_shape) != 1 else result.squeeze(1) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part of Glm4v return [] elif name.startswith("model.language_model."): name = name.replace("language_model.", "") # for Glm4v + if self.use_mrope: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + n_embd = self.hparams["hidden_size"] + head_dim = n_embd // n_head + # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor) return super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Glm4MoeForCausalLM") +@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") class Glm4MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.GLM4_MOE @@ -7893,6 +7934,7 @@ class Glm4MoeModel(TextModel): _experts: list[dict[str, Tensor]] | None = None + # note: unlike GLM4V non-MoE, we don't need to permute Q/K here since GLM4V_MOE uses Neox ordering already def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: diff --git a/docs/android.md b/docs/android.md index d2a835653f..28b966ffc7 100644 --- a/docs/android.md +++ b/docs/android.md @@ -1,6 +1,26 @@ # Android +## Build with Android Studio + +Import the `examples/llama.android` directory into Android Studio, then perform a Gradle sync and build the project. +![Project imported into Android Studio](./android/imported-into-android-studio.png) + +This Android binding supports hardware acceleration up to `SME2` for **Arm** and `AMX` for **x86-64** CPUs on Android and ChromeOS devices. +It automatically detects the host's hardware to load compatible kernels. As a result, it runs seamlessly on both the latest premium devices and older devices that may lack modern CPU features or have limited RAM, without requiring any manual configuration. + +A minimal Android app frontend is included to showcase the binding’s core functionalities: +1. **Parse GGUF metadata** via `GgufMetadataReader` from either a `ContentResolver` provided `Uri` or a local `File`. +2. **Obtain a `TierDetection` or `InferenceEngine`** instance through the high-level facade APIs. +3. **Send a raw user prompt** for automatic template formatting, prefill, and decoding. Then collect the generated tokens in a Kotlin `Flow`. 
+ +For a production-ready experience that leverages advanced features such as system prompts and benchmarks, check out [Arm AI Chat](https://play.google.com/store/apps/details?id=com.arm.aichat) on Google Play. +This project is made possible through a collaborative effort by Arm's **CT-ML**, **CE-ML** and **STE** groups: + +| ![Home screen](./android/arm-ai-chat-home-screen.png) | ![System prompt](./android/system-prompt-setup.png) | !["Haiku"](./android/chat-with-system-prompt-haiku.png) | +|:------------------------------------------------------:|:----------------------------------------------------:|:--------------------------------------------------------:| +| Home screen | System prompt | "Haiku" | + ## Build on Android using Termux [Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid. diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 02a72a9d51..f44458ed3b 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -103,6 +103,8 @@ SYCL backend supports Intel GPU Family: - Intel Built-in Arc GPU - Intel iGPU in Core CPU (11th Generation Core CPU and newer, refer to [oneAPI supported GPU](https://www.intel.com/content/www/us/en/developer/articles/system-requirements/intel-oneapi-base-toolkit-system-requirements.html#inpage-nav-1-1)). +On older Intel GPUs, you may try [OpenCL](/docs/backend/OPENCL.md) although the performance is not optimal, and some GPUs may not support OpenCL nor have any GPGPU capabilities. + #### Verified devices | Intel GPU | Status | Verified Model | diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md index 9d1452e3f0..b6870f6e25 100644 --- a/docs/development/HOWTO-add-model.md +++ b/docs/development/HOWTO-add-model.md @@ -97,7 +97,7 @@ The model params and tensors layout must be defined in `llama.cpp` source files: 1. Define a new `llm_arch` enum value in `src/llama-arch.h`. 2. In `src/llama-arch.cpp`: - Add the architecture name to the `LLM_ARCH_NAMES` map. - - Add the tensor mappings to the `LLM_TENSOR_NAMES` map. + - Add the list of model tensors to `llm_get_tensor_names` (you may also need to update `LLM_TENSOR_NAMES`) 3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`. 4. If the model has a RoPE operation, add a case for the architecture in `llama_model_rope_type` function in `src/llama-model.cpp`. diff --git a/docs/docker.md b/docs/docker.md index b9e5015396..a3b263497c 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,9 +7,9 @@ ## Images We have three Docker images available for this project: -1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) -2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) -3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) +1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. 
(platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) +2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the `llama-cli` and `llama-completion` executables. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) +3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the `llama-server` executable. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`) Additionally, there the following images, similar to the above: @@ -44,13 +44,15 @@ docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-o On completion, you are ready to play! ```bash -docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf +docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run-legacy -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512 ``` or with a light image: ```bash -docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 +docker run -v /path/to/models:/models --entrypoint /app/llama-cli ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf +docker run -v /path/to/models:/models --entrypoint /app/llama-completion ghcr.io/ggml-org/llama.cpp:light -m /models/32B/ggml-model-q8_0.gguf -no-cnv -p "Building a mobile app can be done in 15 steps:" -n 512 ``` or with a server image: @@ -59,6 +61,8 @@ or with a server image: docker run -v /path/to/models:/models -p 8080:8080 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8080 --host 0.0.0.0 -n 512 ``` +In the above examples, `--entrypoint /app/llama-cli` is specified for clarity, but you can safely omit it since it's the default entrypoint in the container. + ## Docker With CUDA Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container. @@ -80,9 +84,9 @@ The defaults are: The resulting images, are essentially the same as the non-CUDA images: -1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. -2. `local/llama.cpp:light-cuda`: This image only includes the main executable file. -3. `local/llama.cpp:server-cuda`: This image only includes the server executable file. +1. `local/llama.cpp:full-cuda`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. +2. `local/llama.cpp:light-cuda`: This image only includes the `llama-cli` and `llama-completion` executables. +3. `local/llama.cpp:server-cuda`: This image only includes the `llama-server` executable. ## Usage @@ -114,9 +118,9 @@ The defaults are: The resulting images, are essentially the same as the non-MUSA images: -1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. -2. `local/llama.cpp:light-musa`: This image only includes the main executable file. -3. `local/llama.cpp:server-musa`: This image only includes the server executable file. +1. 
`local/llama.cpp:full-musa`: This image includes both the `llama-cli` and `llama-completion` executables and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. +2. `local/llama.cpp:light-musa`: This image only includes the `llama-cli` and `llama-completion` executables. +3. `local/llama.cpp:server-musa`: This image only includes the `llama-server` executable. ## Usage diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index e9f7bf9313..dc76c4cf53 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -48,7 +48,7 @@ static void write_table(std::ofstream & file, std::vector & opts) } } -static void export_md(std::string fname, llama_example ex) { +static void export_md(std::string fname, llama_example ex, std::string name) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); common_params params; @@ -72,13 +72,14 @@ static void export_md(std::string fname, llama_example ex) { write_table(file, common_options); file << "\n\n**Sampling params**\n\n"; write_table(file, sparam_options); - file << "\n\n**Example-specific params**\n\n"; + file << "\n\n**" << name << "-specific params**\n\n"; write_table(file, specific_options); } int main(int, char **) { - export_md("autogen-main.md", LLAMA_EXAMPLE_COMPLETION); - export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER); + // TODO: add CLI + export_md("autogen-completion.md", LLAMA_EXAMPLE_COMPLETION, "Tool"); + export_md("autogen-server.md", LLAMA_EXAMPLE_SERVER, "Server"); return 0; } diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts index 8d1b37195e..3524fe39c4 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/llama.android/app/build.gradle.kts @@ -1,16 +1,18 @@ plugins { - id("com.android.application") - id("org.jetbrains.kotlin.android") + alias(libs.plugins.android.application) + alias(libs.plugins.jetbrains.kotlin.android) } android { namespace = "com.example.llama" - compileSdk = 34 + compileSdk = 36 defaultConfig { - applicationId = "com.example.llama" + applicationId = "com.example.llama.aichat" + minSdk = 33 - targetSdk = 34 + targetSdk = 36 + versionCode = 1 versionName = "1.0" @@ -21,8 +23,17 @@ android { } buildTypes { + debug { + isMinifyEnabled = true + isShrinkResources = true + proguardFiles( + getDefaultProguardFile("proguard-android.txt"), + "proguard-rules.pro" + ) + } release { - isMinifyEnabled = false + isMinifyEnabled = true + isShrinkResources = true proguardFiles( getDefaultProguardFile("proguard-android-optimize.txt"), "proguard-rules.pro" @@ -36,30 +47,15 @@ android { kotlinOptions { jvmTarget = "1.8" } - buildFeatures { - compose = true - } - composeOptions { - kotlinCompilerExtensionVersion = "1.5.1" - } } dependencies { + implementation(libs.bundles.androidx) + implementation(libs.material) - implementation("androidx.core:core-ktx:1.12.0") - implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2") - implementation("androidx.activity:activity-compose:1.8.2") - implementation(platform("androidx.compose:compose-bom:2023.08.00")) - implementation("androidx.compose.ui:ui") - implementation("androidx.compose.ui:ui-graphics") - implementation("androidx.compose.ui:ui-tooling-preview") - implementation("androidx.compose.material3:material3") - implementation(project(":llama")) - testImplementation("junit:junit:4.13.2") - androidTestImplementation("androidx.test.ext:junit:1.1.5") - 
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") - androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00")) - androidTestImplementation("androidx.compose.ui:ui-test-junit4") - debugImplementation("androidx.compose.ui:ui-tooling") - debugImplementation("androidx.compose.ui:ui-test-manifest") + implementation(project(":lib")) + + testImplementation(libs.junit) + androidTestImplementation(libs.androidx.junit) + androidTestImplementation(libs.androidx.espresso.core) } diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro index f1b424510d..358020d2d2 100644 --- a/examples/llama.android/app/proguard-rules.pro +++ b/examples/llama.android/app/proguard-rules.pro @@ -19,3 +19,11 @@ # If you keep the line number information, uncomment this to # hide the original source file name. #-renamesourcefileattribute SourceFile + +-keep class com.arm.aichat.* { *; } +-keep class com.arm.aichat.gguf.* { *; } + +-assumenosideeffects class android.util.Log { + public static int v(...); + public static int d(...); +} diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml index 41a358a299..8f7c606b41 100644 --- a/examples/llama.android/app/src/main/AndroidManifest.xml +++ b/examples/llama.android/app/src/main/AndroidManifest.xml @@ -1,24 +1,21 @@ - - - + + android:exported="true"> diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt deleted file mode 100644 index 78c231ae55..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt +++ /dev/null @@ -1,119 +0,0 @@ -package com.example.llama - -import android.app.DownloadManager -import android.net.Uri -import android.util.Log -import androidx.compose.material3.Button -import androidx.compose.material3.Text -import androidx.compose.runtime.Composable -import androidx.compose.runtime.getValue -import androidx.compose.runtime.mutableDoubleStateOf -import androidx.compose.runtime.mutableStateOf -import androidx.compose.runtime.remember -import androidx.compose.runtime.rememberCoroutineScope -import androidx.compose.runtime.setValue -import androidx.core.database.getLongOrNull -import androidx.core.net.toUri -import kotlinx.coroutines.delay -import kotlinx.coroutines.launch -import java.io.File - -data class Downloadable(val name: String, val source: Uri, val destination: File) { - companion object { - @JvmStatic - private val tag: String? 
= this::class.qualifiedName - - sealed interface State - data object Ready: State - data class Downloading(val id: Long): State - data class Downloaded(val downloadable: Downloadable): State - data class Error(val message: String): State - - @JvmStatic - @Composable - fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) { - var status: State by remember { - mutableStateOf( - if (item.destination.exists()) Downloaded(item) - else Ready - ) - } - var progress by remember { mutableDoubleStateOf(0.0) } - - val coroutineScope = rememberCoroutineScope() - - suspend fun waitForDownload(result: Downloading, item: Downloadable): State { - while (true) { - val cursor = dm.query(DownloadManager.Query().setFilterById(result.id)) - - if (cursor == null) { - Log.e(tag, "dm.query() returned null") - return Error("dm.query() returned null") - } - - if (!cursor.moveToFirst() || cursor.count < 1) { - cursor.close() - Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?") - return Ready - } - - val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR) - val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES) - val sofar = cursor.getLongOrNull(pix) ?: 0 - val total = cursor.getLongOrNull(tix) ?: 1 - cursor.close() - - if (sofar == total) { - return Downloaded(item) - } - - progress = (sofar * 1.0) / total - - delay(1000L) - } - } - - fun onClick() { - when (val s = status) { - is Downloaded -> { - viewModel.load(item.destination.path) - } - - is Downloading -> { - coroutineScope.launch { - status = waitForDownload(s, item) - } - } - - else -> { - item.destination.delete() - - val request = DownloadManager.Request(item.source).apply { - setTitle("Downloading model") - setDescription("Downloading model: ${item.name}") - setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI) - setDestinationUri(item.destination.toUri()) - } - - viewModel.log("Saving ${item.name} to ${item.destination.path}") - Log.i(tag, "Saving ${item.name} to ${item.destination.path}") - - val id = dm.enqueue(request) - status = Downloading(id) - onClick() - } - } - } - - Button(onClick = { onClick() }, enabled = status !is Downloading) { - when (status) { - is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%") - is Downloaded -> Text("Load ${item.name}") - is Ready -> Text("Download ${item.name}") - is Error -> Text("Download ${item.name}") - } - } - } - - } -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt index 9da04f7d3c..52c5dc2154 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt @@ -1,154 +1,257 @@ package com.example.llama -import android.app.ActivityManager -import android.app.DownloadManager -import android.content.ClipData -import android.content.ClipboardManager import android.net.Uri import android.os.Bundle -import android.os.StrictMode -import android.os.StrictMode.VmPolicy -import android.text.format.Formatter -import androidx.activity.ComponentActivity -import androidx.activity.compose.setContent -import androidx.activity.viewModels -import androidx.compose.foundation.layout.Box -import androidx.compose.foundation.layout.Column -import androidx.compose.foundation.layout.Row -import androidx.compose.foundation.layout.fillMaxSize -import 
androidx.compose.foundation.layout.padding -import androidx.compose.foundation.lazy.LazyColumn -import androidx.compose.foundation.lazy.items -import androidx.compose.foundation.lazy.rememberLazyListState -import androidx.compose.material3.Button -import androidx.compose.material3.LocalContentColor -import androidx.compose.material3.MaterialTheme -import androidx.compose.material3.OutlinedTextField -import androidx.compose.material3.Surface -import androidx.compose.material3.Text -import androidx.compose.runtime.Composable -import androidx.compose.ui.Modifier -import androidx.compose.ui.unit.dp -import androidx.core.content.getSystemService -import com.example.llama.ui.theme.LlamaAndroidTheme +import android.util.Log +import android.widget.EditText +import android.widget.TextView +import android.widget.Toast +import androidx.activity.enableEdgeToEdge +import androidx.activity.result.contract.ActivityResultContracts +import androidx.appcompat.app.AppCompatActivity +import androidx.lifecycle.lifecycleScope +import androidx.recyclerview.widget.LinearLayoutManager +import androidx.recyclerview.widget.RecyclerView +import com.arm.aichat.AiChat +import com.arm.aichat.InferenceEngine +import com.arm.aichat.gguf.GgufMetadata +import com.arm.aichat.gguf.GgufMetadataReader +import com.google.android.material.floatingactionbutton.FloatingActionButton +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.flow.onCompletion +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext import java.io.File +import java.io.FileOutputStream +import java.io.InputStream +import java.util.UUID -class MainActivity( - activityManager: ActivityManager? = null, - downloadManager: DownloadManager? = null, - clipboardManager: ClipboardManager? = null, -): ComponentActivity() { - private val tag: String? = this::class.simpleName +class MainActivity : AppCompatActivity() { - private val activityManager by lazy { activityManager ?: getSystemService()!! } - private val downloadManager by lazy { downloadManager ?: getSystemService()!! } - private val clipboardManager by lazy { clipboardManager ?: getSystemService()!! } + // Android views + private lateinit var ggufTv: TextView + private lateinit var messagesRv: RecyclerView + private lateinit var userInputEt: EditText + private lateinit var userActionFab: FloatingActionButton - private val viewModel: MainViewModel by viewModels() + // Arm AI Chat inference engine + private lateinit var engine: InferenceEngine - // Get a MemoryInfo object for the device's current memory status. - private fun availableMemory(): ActivityManager.MemoryInfo { - return ActivityManager.MemoryInfo().also { memoryInfo -> - activityManager.getMemoryInfo(memoryInfo) - } - } + // Conversation states + private var isModelReady = false + private val messages = mutableListOf() + private val lastAssistantMsg = StringBuilder() + private val messageAdapter = MessageAdapter(messages) override fun onCreate(savedInstanceState: Bundle?) 
{ super.onCreate(savedInstanceState) + enableEdgeToEdge() + setContentView(R.layout.activity_main) - StrictMode.setVmPolicy( - VmPolicy.Builder(StrictMode.getVmPolicy()) - .detectLeakedClosableObjects() - .build() - ) + // Find views + ggufTv = findViewById(R.id.gguf) + messagesRv = findViewById(R.id.messages) + messagesRv.layoutManager = LinearLayoutManager(this) + messagesRv.adapter = messageAdapter + userInputEt = findViewById(R.id.user_input) + userActionFab = findViewById(R.id.fab) - val free = Formatter.formatFileSize(this, availableMemory().availMem) - val total = Formatter.formatFileSize(this, availableMemory().totalMem) - - viewModel.log("Current memory: $free / $total") - viewModel.log("Downloads directory: ${getExternalFilesDir(null)}") - - val extFilesDir = getExternalFilesDir(null) - - val models = listOf( - Downloadable( - "Phi-2 7B (Q4_0, 1.6 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), - File(extFilesDir, "phi-2-q4_0.gguf"), - ), - Downloadable( - "TinyLlama 1.1B (f16, 2.2 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), - File(extFilesDir, "tinyllama-1.1-f16.gguf"), - ), - Downloadable( - "Phi 2 DPO (Q3_K_M, 1.48 GiB)", - Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), - File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf") - ), - ) - - setContent { - LlamaAndroidTheme { - // A surface container using the 'background' color from the theme - Surface( - modifier = Modifier.fillMaxSize(), - color = MaterialTheme.colorScheme.background - ) { - MainCompose( - viewModel, - clipboardManager, - downloadManager, - models, - ) - } + // Arm AI Chat initialization + lifecycleScope.launch(Dispatchers.Default) { + engine = AiChat.getInferenceEngine(applicationContext) + } + // Upon CTA button tapped + userActionFab.setOnClickListener { + if (isModelReady) { + // If model is ready, validate input and send to engine + handleUserInput() + } else { + // Otherwise, prompt user to select a GGUF metadata on the device + getContent.launch(arrayOf("*/*")) } } } -} -@Composable -fun MainCompose( - viewModel: MainViewModel, - clipboard: ClipboardManager, - dm: DownloadManager, - models: List -) { - Column { - val scrollState = rememberLazyListState() + private val getContent = registerForActivityResult( + ActivityResultContracts.OpenDocument() + ) { uri -> + Log.i(TAG, "Selected file uri:\n $uri") + uri?.let { handleSelectedModel(it) } + } - Box(modifier = Modifier.weight(1f)) { - LazyColumn(state = scrollState) { - items(viewModel.messages) { - Text( - it, - style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current), - modifier = Modifier.padding(16.dp) - ) + /** + * Handles the file Uri from [getContent] result + */ + private fun handleSelectedModel(uri: Uri) { + // Update UI states + userActionFab.isEnabled = false + userInputEt.hint = "Parsing GGUF..." 
+ ggufTv.text = "Parsing metadata from selected file \n$uri" + + lifecycleScope.launch(Dispatchers.IO) { + // Parse GGUF metadata + Log.i(TAG, "Parsing GGUF metadata...") + contentResolver.openInputStream(uri)?.use { + GgufMetadataReader.create().readStructuredMetadata(it) + }?.let { metadata -> + // Update UI to show GGUF metadata to user + Log.i(TAG, "GGUF parsed: \n$metadata") + withContext(Dispatchers.Main) { + ggufTv.text = metadata.toString() } - } - } - OutlinedTextField( - value = viewModel.message, - onValueChange = { viewModel.updateMessage(it) }, - label = { Text("Message") }, - ) - Row { - Button({ viewModel.send() }) { Text("Send") } - Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") } - Button({ viewModel.clear() }) { Text("Clear") } - Button({ - viewModel.messages.joinToString("\n").let { - clipboard.setPrimaryClip(ClipData.newPlainText("", it)) - } - }) { Text("Copy") } - } - Column { - for (model in models) { - Downloadable.Button(viewModel, dm, model) + // Ensure the model file is available + val modelName = metadata.filename() + FILE_EXTENSION_GGUF + contentResolver.openInputStream(uri)?.use { input -> + ensureModelFile(modelName, input) + }?.let { modelFile -> + loadModel(modelName, modelFile) + + withContext(Dispatchers.Main) { + isModelReady = true + userInputEt.hint = "Type and send a message!" + userInputEt.isEnabled = true + userActionFab.setImageResource(R.drawable.outline_send_24) + userActionFab.isEnabled = true + } + } } } } + + /** + * Prepare the model file within app's private storage + */ + private suspend fun ensureModelFile(modelName: String, input: InputStream) = + withContext(Dispatchers.IO) { + File(ensureModelsDirectory(), modelName).also { file -> + // Copy the file into local storage if not yet done + if (!file.exists()) { + Log.i(TAG, "Start copying file to $modelName") + withContext(Dispatchers.Main) { + userInputEt.hint = "Copying file..." + } + + FileOutputStream(file).use { input.copyTo(it) } + Log.i(TAG, "Finished copying file to $modelName") + } else { + Log.i(TAG, "File already exists $modelName") + } + } + } + + /** + * Load the model file from the app private storage + */ + private suspend fun loadModel(modelName: String, modelFile: File) = + withContext(Dispatchers.IO) { + Log.i(TAG, "Loading model $modelName") + withContext(Dispatchers.Main) { + userInputEt.hint = "Loading model..." 
+ } + engine.loadModel(modelFile.path) + } + + /** + * Validate and send the user message into [InferenceEngine] + */ + private fun handleUserInput() { + userInputEt.text.toString().also { userMsg -> + if (userMsg.isEmpty()) { + Toast.makeText(this, "Input message is empty!", Toast.LENGTH_SHORT).show() + } else { + userInputEt.text = null + userActionFab.isEnabled = false + + // Update message states + messages.add(Message(UUID.randomUUID().toString(), userMsg, true)) + lastAssistantMsg.clear() + messages.add(Message(UUID.randomUUID().toString(), lastAssistantMsg.toString(), false)) + + lifecycleScope.launch(Dispatchers.Default) { + engine.sendUserPrompt(userMsg) + .onCompletion { + withContext(Dispatchers.Main) { + userActionFab.isEnabled = true + } + }.collect { token -> + val messageCount = messages.size + check(messageCount > 0 && !messages[messageCount - 1].isUser) + + messages.removeAt(messageCount - 1).copy( + content = lastAssistantMsg.append(token).toString() + ).let { messages.add(it) } + + withContext(Dispatchers.Main) { + messageAdapter.notifyItemChanged(messages.size - 1) + } + } + } + } + } + } + + /** + * Run a benchmark with the model file + */ + private suspend fun runBenchmark(modelName: String, modelFile: File) = + withContext(Dispatchers.Default) { + Log.i(TAG, "Start benchmarking $modelName") + withContext(Dispatchers.Main) { + userInputEt.hint = "Running benchmark..." + } + engine.bench( + pp=BENCH_PROMPT_PROCESSING_TOKENS, + tg=BENCH_TOKEN_GENERATION_TOKENS, + pl=BENCH_SEQUENCE, + nr=BENCH_REPETITION + ).let { result -> + messages.add(Message(UUID.randomUUID().toString(), result, false)) + withContext(Dispatchers.Main) { + messageAdapter.notifyItemChanged(messages.size - 1) + } + } + } + + /** + * Create the `models` directory if it does not exist.
+ */ + private fun ensureModelsDirectory() = + File(filesDir, DIRECTORY_MODELS).also { + if (it.exists() && !it.isDirectory) { it.delete() } + if (!it.exists()) { it.mkdir() } + } + + companion object { + private val TAG = MainActivity::class.java.simpleName + + private const val DIRECTORY_MODELS = "models" + private const val FILE_EXTENSION_GGUF = ".gguf" + + private const val BENCH_PROMPT_PROCESSING_TOKENS = 512 + private const val BENCH_TOKEN_GENERATION_TOKENS = 128 + private const val BENCH_SEQUENCE = 1 + private const val BENCH_REPETITION = 3 + } +} + +fun GgufMetadata.filename() = when { + basic.name != null -> { + basic.name?.let { name -> + basic.sizeLabel?.let { size -> + "$name-$size" + } ?: name + } + } + architecture?.architecture != null -> { + architecture?.architecture?.let { arch -> + basic.uuid?.let { uuid -> + "$arch-$uuid" + } ?: "$arch-${System.currentTimeMillis()}" + } + } + else -> { + "model-${System.currentTimeMillis().toHexString()}" + } } diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt deleted file mode 100644 index 45ac29938f..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt +++ /dev/null @@ -1,105 +0,0 @@ -package com.example.llama - -import android.llama.cpp.LLamaAndroid -import android.util.Log -import androidx.compose.runtime.getValue -import androidx.compose.runtime.mutableStateOf -import androidx.compose.runtime.setValue -import androidx.lifecycle.ViewModel -import androidx.lifecycle.viewModelScope -import kotlinx.coroutines.flow.catch -import kotlinx.coroutines.launch - -class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() { - companion object { - @JvmStatic - private val NanosPerSecond = 1_000_000_000.0 - } - - private val tag: String? = this::class.simpleName - - var messages by mutableStateOf(listOf("Initializing...")) - private set - - var message by mutableStateOf("") - private set - - override fun onCleared() { - super.onCleared() - - viewModelScope.launch { - try { - llamaAndroid.unload() - } catch (exc: IllegalStateException) { - messages += exc.message!! - } - } - } - - fun send() { - val text = message - message = "" - - // Add to messages console. - messages += text - messages += "" - - viewModelScope.launch { - llamaAndroid.send(text) - .catch { - Log.e(tag, "send() failed", it) - messages += it.message!! - } - .collect { messages = messages.dropLast(1) + (messages.last() + it) } - } - } - - fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) { - viewModelScope.launch { - try { - val start = System.nanoTime() - val warmupResult = llamaAndroid.bench(pp, tg, pl, nr) - val end = System.nanoTime() - - messages += warmupResult - - val warmup = (end - start).toDouble() / NanosPerSecond - messages += "Warm up time: $warmup seconds, please wait..." - - if (warmup > 5.0) { - messages += "Warm up took too long, aborting benchmark" - return@launch - } - - messages += llamaAndroid.bench(512, 128, 1, 3) - } catch (exc: IllegalStateException) { - Log.e(tag, "bench() failed", exc) - messages += exc.message!! - } - } - } - - fun load(pathToModel: String) { - viewModelScope.launch { - try { - llamaAndroid.load(pathToModel) - messages += "Loaded $pathToModel" - } catch (exc: IllegalStateException) { - Log.e(tag, "load() failed", exc) - messages += exc.message!! 
- } - } - } - - fun updateMessage(newMessage: String) { - message = newMessage - } - - fun clear() { - messages = listOf() - } - - fun log(message: String) { - messages += message - } -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt b/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt new file mode 100644 index 0000000000..0439f96441 --- /dev/null +++ b/examples/llama.android/app/src/main/java/com/example/llama/MessageAdapter.kt @@ -0,0 +1,51 @@ +package com.example.llama + +import android.view.LayoutInflater +import android.view.View +import android.view.ViewGroup +import android.widget.TextView +import androidx.recyclerview.widget.RecyclerView + +data class Message( + val id: String, + val content: String, + val isUser: Boolean +) + +class MessageAdapter( + private val messages: List +) : RecyclerView.Adapter() { + + companion object { + private const val VIEW_TYPE_USER = 1 + private const val VIEW_TYPE_ASSISTANT = 2 + } + + override fun getItemViewType(position: Int): Int { + return if (messages[position].isUser) VIEW_TYPE_USER else VIEW_TYPE_ASSISTANT + } + + override fun onCreateViewHolder(parent: ViewGroup, viewType: Int): RecyclerView.ViewHolder { + val layoutInflater = LayoutInflater.from(parent.context) + return if (viewType == VIEW_TYPE_USER) { + val view = layoutInflater.inflate(R.layout.item_message_user, parent, false) + UserMessageViewHolder(view) + } else { + val view = layoutInflater.inflate(R.layout.item_message_assistant, parent, false) + AssistantMessageViewHolder(view) + } + } + + override fun onBindViewHolder(holder: RecyclerView.ViewHolder, position: Int) { + val message = messages[position] + if (holder is UserMessageViewHolder || holder is AssistantMessageViewHolder) { + val textView = holder.itemView.findViewById(R.id.msg_content) + textView.text = message.content + } + } + + override fun getItemCount(): Int = messages.size + + class UserMessageViewHolder(view: View) : RecyclerView.ViewHolder(view) + class AssistantMessageViewHolder(view: View) : RecyclerView.ViewHolder(view) +} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt deleted file mode 100644 index 40c30e8d97..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt +++ /dev/null @@ -1,11 +0,0 @@ -package com.example.llama.ui.theme - -import androidx.compose.ui.graphics.Color - -val Purple80 = Color(0xFFD0BCFF) -val PurpleGrey80 = Color(0xFFCCC2DC) -val Pink80 = Color(0xFFEFB8C8) - -val Purple40 = Color(0xFF6650a4) -val PurpleGrey40 = Color(0xFF625b71) -val Pink40 = Color(0xFF7D5260) diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt deleted file mode 100644 index e742220a8d..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt +++ /dev/null @@ -1,70 +0,0 @@ -package com.example.llama.ui.theme - -import android.app.Activity -import android.os.Build -import androidx.compose.foundation.isSystemInDarkTheme -import androidx.compose.material3.MaterialTheme -import androidx.compose.material3.darkColorScheme -import androidx.compose.material3.dynamicDarkColorScheme -import androidx.compose.material3.dynamicLightColorScheme -import androidx.compose.material3.lightColorScheme -import androidx.compose.runtime.Composable -import 
androidx.compose.runtime.SideEffect -import androidx.compose.ui.graphics.toArgb -import androidx.compose.ui.platform.LocalContext -import androidx.compose.ui.platform.LocalView -import androidx.core.view.WindowCompat - -private val DarkColorScheme = darkColorScheme( - primary = Purple80, - secondary = PurpleGrey80, - tertiary = Pink80 -) - -private val LightColorScheme = lightColorScheme( - primary = Purple40, - secondary = PurpleGrey40, - tertiary = Pink40 - - /* Other default colors to override - background = Color(0xFFFFFBFE), - surface = Color(0xFFFFFBFE), - onPrimary = Color.White, - onSecondary = Color.White, - onTertiary = Color.White, - onBackground = Color(0xFF1C1B1F), - onSurface = Color(0xFF1C1B1F), - */ -) - -@Composable -fun LlamaAndroidTheme( - darkTheme: Boolean = isSystemInDarkTheme(), - // Dynamic color is available on Android 12+ - dynamicColor: Boolean = true, - content: @Composable () -> Unit -) { - val colorScheme = when { - dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { - val context = LocalContext.current - if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) - } - - darkTheme -> DarkColorScheme - else -> LightColorScheme - } - val view = LocalView.current - if (!view.isInEditMode) { - SideEffect { - val window = (view.context as Activity).window - window.statusBarColor = colorScheme.primary.toArgb() - WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme - } - } - - MaterialTheme( - colorScheme = colorScheme, - typography = Typography, - content = content - ) -} diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt deleted file mode 100644 index 0b87946ca3..0000000000 --- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt +++ /dev/null @@ -1,34 +0,0 @@ -package com.example.llama.ui.theme - -import androidx.compose.material3.Typography -import androidx.compose.ui.text.TextStyle -import androidx.compose.ui.text.font.FontFamily -import androidx.compose.ui.text.font.FontWeight -import androidx.compose.ui.unit.sp - -// Set of Material typography styles to start with -val Typography = Typography( - bodyLarge = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Normal, - fontSize = 16.sp, - lineHeight = 24.sp, - letterSpacing = 0.5.sp - ) - /* Other default text styles to override - titleLarge = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Normal, - fontSize = 22.sp, - lineHeight = 28.sp, - letterSpacing = 0.sp - ), - labelSmall = TextStyle( - fontFamily = FontFamily.Default, - fontWeight = FontWeight.Medium, - fontSize = 11.sp, - lineHeight = 16.sp, - letterSpacing = 0.5.sp - ) - */ -) diff --git a/examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml b/examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml new file mode 100644 index 0000000000..f90c3db458 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/bg_assistant_message.xml @@ -0,0 +1,4 @@ + + + + diff --git a/examples/llama.android/app/src/main/res/drawable/bg_user_message.xml b/examples/llama.android/app/src/main/res/drawable/bg_user_message.xml new file mode 100644 index 0000000000..3ca7daefec --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/bg_user_message.xml @@ -0,0 +1,4 @@ + + + + diff --git 
a/examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml b/examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml new file mode 100644 index 0000000000..f58b501e3b --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/outline_folder_open_24.xml @@ -0,0 +1,10 @@ + + + diff --git a/examples/llama.android/app/src/main/res/drawable/outline_send_24.xml b/examples/llama.android/app/src/main/res/drawable/outline_send_24.xml new file mode 100644 index 0000000000..712adc00c4 --- /dev/null +++ b/examples/llama.android/app/src/main/res/drawable/outline_send_24.xml @@ -0,0 +1,11 @@ + + + diff --git a/examples/llama.android/app/src/main/res/layout/activity_main.xml b/examples/llama.android/app/src/main/res/layout/activity_main.xml new file mode 100644 index 0000000000..bf6ef35925 --- /dev/null +++ b/examples/llama.android/app/src/main/res/layout/activity_main.xml @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/examples/llama.android/app/src/main/res/layout/item_message_assistant.xml b/examples/llama.android/app/src/main/res/layout/item_message_assistant.xml new file mode 100644 index 0000000000..b7fb500393 --- /dev/null +++ b/examples/llama.android/app/src/main/res/layout/item_message_assistant.xml @@ -0,0 +1,15 @@ + + + + + diff --git a/examples/llama.android/app/src/main/res/layout/item_message_user.xml b/examples/llama.android/app/src/main/res/layout/item_message_user.xml new file mode 100644 index 0000000000..fe871f12fa --- /dev/null +++ b/examples/llama.android/app/src/main/res/layout/item_message_user.xml @@ -0,0 +1,15 @@ + + + + + diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml index 7a9d314e29..36059fc799 100644 --- a/examples/llama.android/app/src/main/res/values/strings.xml +++ b/examples/llama.android/app/src/main/res/values/strings.xml @@ -1,3 +1,3 @@ - LlamaAndroid + AI Chat basic sample diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml index 8a24fda566..2e4fdad72e 100644 --- a/examples/llama.android/app/src/main/res/values/themes.xml +++ b/examples/llama.android/app/src/main/res/values/themes.xml @@ -1,5 +1,10 @@ - + +