diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e6555eb479..7db8552865 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -173,7 +173,15 @@ jobs: name: llama-bin-macos-x64.zip ubuntu-cpu-cmake: - runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - build: 'x64' + os: ubuntu-22.04 + - build: 'arm64' + os: ubuntu-22.04-arm + + runs-on: ${{ matrix.os }} steps: - name: Clone @@ -239,14 +247,14 @@ jobs: run: | cp LICENSE ./build/bin/ cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp - zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* + zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip - name: llama-bin-ubuntu-x64.zip + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip + name: llama-bin-ubuntu-${{ matrix.build }}.zip ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -459,6 +467,7 @@ jobs: run: | cmake -B build -S . \ -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ + -DGGML_HIP_ROCWMMA_FATTN=ON \ -DGGML_HIP=ON cmake --build build --config Release -j $(nproc) @@ -468,6 +477,7 @@ jobs: cmake -B build2 -S . \ -DCMAKE_C_COMPILER=hipcc \ -DCMAKE_CXX_COMPILER=hipcc \ + -DGGML_HIP_ROCWMMA_FATTN=ON \ -DGGML_HIP=ON cmake --build build2 --config Release -j $(nproc) @@ -666,6 +676,35 @@ jobs: -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + macOS-latest-cmake-visionos: + runs-on: macos-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Dependencies + id: depends + continue-on-error: true + run: | + brew update + + - name: Build + id: cmake_build + run: | + sysctl -a + cmake -B build -G Xcode \ + -DGGML_METAL_USE_BF16=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + -DCMAKE_SYSTEM_NAME=visionOS \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \ + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO + macOS-latest-swift: runs-on: macos-latest @@ -702,12 +741,11 @@ jobs: -DLLAMA_BUILD_SERVER=OFF \ -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - sudo cmake --install build --config Release - name: xcodebuild for swift package id: xcodebuild run: | - xcodebuild -scheme llama-Package -destination "${{ matrix.destination }}" + ./build-xcframework.sh windows-msys2: runs-on: windows-latest @@ -765,7 +803,7 @@ jobs: env: OPENBLAS_VERSION: 0.3.23 SDE_VERSION: 9.33.0-2024-01-07 - VULKAN_VERSION: 1.3.261.1 + VULKAN_VERSION: 1.4.304.1 strategy: matrix: @@ -1195,6 +1233,11 @@ jobs: id: checkout uses: actions/checkout@v4 + - name: Clone rocWMMA repository + id: clone_rocwmma + run: | + git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + - name: Install id: depends run: | @@ -1224,8 +1267,10 @@ jobs: cmake -G "Unix Makefiles" -B build -S . ` -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` + -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` -DCMAKE_BUILD_TYPE=Release ` -DGGML_HIP=ON ` + -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} @@ -1244,6 +1289,11 @@ jobs: with: fetch-depth: 0 + - name: Clone rocWMMA repository + id: clone_rocwmma + run: | + git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1 + - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 with: @@ -1273,8 +1323,10 @@ jobs: cmake -G "Unix Makefiles" -B build -S . ` -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` + -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/rocwmma/library/include/" ` -DCMAKE_BUILD_TYPE=Release ` -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` + -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_HIP=ON ` -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} @@ -1313,6 +1365,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Build id: cmake_build @@ -1328,15 +1382,40 @@ jobs: -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - sudo cmake --install build --config Release - name: xcodebuild for swift package id: xcodebuild run: | - xcodebuild -scheme llama-Package -destination 'generic/platform=iOS' + ./build-xcframework.sh - name: Build Xcode project - run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build + run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + zip --symlinks -r llama-${{ steps.tag.outputs.name }}-xcframework.zip build-apple/llama.xcframework + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-xcframework.zip + name: llama-${{ steps.tag.outputs.name }}-xcframework android-build: runs-on: ubuntu-latest diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 3a29107d0e..092c928bdf 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -161,6 +161,8 @@ jobs: - name: Tests id: server_integration_tests if: ${{ matrix.sanitizer == '' }} + env: + GITHUB_ACTIONS: "true" run: | cd examples/server/tests ./tests.sh diff --git a/.gitignore b/.gitignore index 56b5ac2c18..2c67ad7f7c 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,8 @@ lcov-report/ tags .build/ build* +release +debug !build-info.cmake !build-info.cpp.in !build-info.sh diff --git a/AUTHORS b/AUTHORS index 6796b29413..0af9f44ad4 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,4 @@ -# date: Tue Feb 4 13:04:05 EET 2025 +# date: Sat Mar 8 18:23:52 EET 2025 # this file is auto-generated by scripts/gen-authors.sh 0cc4m @@ -8,10 +8,12 @@ 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> 44670 <44670@users.noreply.github.com> 65a <10104049+65a@users.noreply.github.com> +708-145 <40387547+708-145@users.noreply.github.com> AN Long AT Aarni Koskela Aaron Miller +Aaron Teo <57927438+taronaeo@users.noreply.github.com> Aaryaman Vasishta Abheek Gulati Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> @@ -20,6 +22,7 @@ Adithya Balaji AdithyanI Adrian Adrian Hesketh +Adrian Kretz Adrien Gallouët Adrien Gallouët Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com> @@ -28,15 +31,18 @@ AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> AidanBeltonS Aisuko Akarshan Biswas +Akarshan Biswas Akarshan Biswas Al Mochkin <14274697+amochkin@users.noreply.github.com> Albert Jin Alberto <57916483+albbus-stack@users.noreply.github.com> Alberto Cabrera Pérez Alberto Cabrera Pérez +Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com> Alex Alex Azarov Alex Azarov +Alex Brooks Alex Klinkhamer Alex Klinkhamer Alex Nguyen @@ -67,6 +73,7 @@ Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com> Andy Salerno Andy Tai Anthony Van de Gejuchte +Antoine Viallon Antonis Makropoulos Arik Poznanski Armen Kaleshian @@ -83,6 +90,7 @@ Atsushi Tatsuma Austin <77757836+teleprint-me@users.noreply.github.com> AustinMroz BADR +BB-fat <45072480+BB-fat@users.noreply.github.com> Bach Le Bailey Chittle <39804642+bachittle@users.noreply.github.com> BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com> @@ -101,6 +109,7 @@ Bert Wagner Billel Mokeddem Bingan <70050083+binganao@users.noreply.github.com> Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com> +Bodhi <3882561+BodhiHu@users.noreply.github.com> Bodo Graumann Bono Lv Borislav Stanimirov @@ -128,6 +137,7 @@ CentricStorm Chad Brewbaker Changyeon Kim Chao Jiang +Charles Duffy Charles Xu <63788048+chaxu01@users.noreply.github.com> Charles Xu Chen Xi @@ -139,12 +149,14 @@ Chris Kuehl Christian Demsar Christian Demsar Christian Falch <875252+chrfalch@users.noreply.github.com> +Christian Fillion Christian Kastner Christian Kögler Christian Köhnenkamp Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Christopher Nielsen <62156882+mascguy@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com> +Clauszy Clint Herron Conrad Kramer Corentin REGAL @@ -163,6 +175,7 @@ Daniel Hiltgen Daniel Illescas Romero Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Daniele <57776841+daniandtheweb@users.noreply.github.com> +Danny Milosavljevic DannyDaemonic Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> Dave @@ -170,6 +183,7 @@ Dave Airlie Dave Airlie Dave Della Costa David Friehs +David Huang <1969802+hjc4869@users.noreply.github.com> David Kennedy David Pflug David Renshaw @@ -236,6 +250,7 @@ Felix Finn Voorhees Firat FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com> +Florent BENOIT Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> Francisco Melo <43780565+francis2tm@users.noreply.github.com> @@ -254,6 +269,7 @@ Gary Mulder Gavin Zhao Genkagaku.GPT Georgi Gerganov +Gian-Carlo Pascutto Gilad S Gilad S. <7817232+giladgd@users.noreply.github.com> Giuseppe Scrivano @@ -267,7 +283,9 @@ Guspan Tanadi <36249910+guspan-tanadi@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> Haggai Nuchi Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> +Hale Chan Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com> +Han Yin HanishKVC Haohui Mai Haoxiang Fei @@ -278,6 +296,7 @@ Haus1 Henk Poley Henri Vasserman Henrik Forstén +Henry Linjamäki Herman Semenov Hesen Peng HimariO @@ -307,6 +326,7 @@ Ivan Ivan Filipov <159561759+vanaka11@users.noreply.github.com> Ivan Komarov Ivan Stepanov +JC <43374599+MrSMlT@users.noreply.github.com> JFLFY2255 JH23X <165871467+JH23X@users.noreply.github.com> Jack Mousseau @@ -325,6 +345,7 @@ Jan Ploski Jannis Schönleber Jared Van Bortel Jared Van Bortel +Jason C.H Jason McCartney Jason Stillerman Jean-Christophe Hoelt @@ -342,6 +363,7 @@ Jiahao Li Jian Liao JidongZhang-THU <1119708529@qq.com> Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com> +Jinyang He Jiří Podivín <66251151+jpodivin@users.noreply.github.com> Jiří Sejkora Joan Fontanals @@ -379,6 +401,7 @@ Justine Tunney Juuso Alasuutari KASR Kamil Tomšík +Kante Yin Karol Kontny <82021046+kkontny@users.noreply.github.com> Karsten Weiss Karthick @@ -419,6 +442,7 @@ LoganDark Loïc Carrère LostRuins <39025047+LostRuins@users.noreply.github.com> LostRuins Concedo <39025047+LostRuins@users.noreply.github.com> +Lucas Moura Belo Luciano Luo Tian Lyle Dean @@ -463,6 +487,7 @@ Matthew Tejo Matvey Soloviev Max Krasnyansky Max Krasnyansky +Maxim Evtush <154841002+maximevtush@users.noreply.github.com> Maxime <672982+maximegmd@users.noreply.github.com> Maximilian Winter Meng Zhang @@ -494,6 +519,7 @@ Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Mohammadreza Hendiani Mohammadreza Hendiani Molly Sophia +MoonRide303 <130458190+MoonRide303@users.noreply.github.com> MorganRO8 <47795945+MorganRO8@users.noreply.github.com> Murilo Santana Musab Gultekin @@ -524,6 +550,7 @@ Nikolas <127742645+nneubacher@users.noreply.github.com> Nindaleth Nuno OSecret <135510162+OLSecret@users.noreply.github.com> +Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com> Oleksandr Nikitin Oleksii Maryshchenko Olivier Chafik @@ -533,6 +560,7 @@ PAB Pablo Duboue Pascal Patry Patrice Ferlet +Patrick Peng Paul Tsochantaris Pavel Zloi Pavol Rusnak @@ -549,6 +577,7 @@ Pieter Ouwerkerk Plamen Minev Prashant Vithule <119530321+Vithulep@users.noreply.github.com> Przemysław Pawełczyk +PureJourney Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> Qingyou Meng Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com> @@ -564,14 +593,17 @@ Rand Xie Randall Fitzgerald Random Fly Reinforce-II +Rémy O Rémy Oudompheng Ren Xuancheng Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> Reza Kakhki +Reza Rahemtola <49811529+RezaRahemtola@users.noreply.github.com> RhinoDevel Riccardo Orlando Riceball LEE Rich Dougherty +Richard Richard Kiss Richard Roberson Rick G <26732651+TheFlipbook@users.noreply.github.com> @@ -588,6 +620,7 @@ Robert Sung-wook Shin Robey Holderith Robyn Roger Meier +Rohanjames1997 Roland <14355895+rbur0425@users.noreply.github.com> Romain Biessy Romain D <90720+Artefact2@users.noreply.github.com> @@ -610,6 +643,7 @@ Ryan Landay Ryder Wishart Ryuei Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> +SAMI SRHMorris <69468379+SRHMorris@users.noreply.github.com> SXX SakuraUmi @@ -634,6 +668,8 @@ Shane A Shangning Xu <32517059+xushangning@users.noreply.github.com> Shankar Shanshan Shen <467638484@qq.com> +Shelby Jenkins <47464908+ShelbyJenkins@users.noreply.github.com> +Sheldon Robinson Shijie <821898965@qq.com> Shintarou Okada Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> @@ -713,18 +749,24 @@ Victor Nogueira Victor Z. Peng Viet-Anh NGUYEN (Andrew) Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> +Vitali Lovich +Vivian Vlad Vladimir Vladimir Malyutin +Vladimir Vuksanovic <109677816+vvuksanovic@users.noreply.github.com> Vladimir Zorin VoidIsVoid <343750470@qq.com> Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> +Wagner Bruna Wang Qin <37098874+wangqin0@users.noreply.github.com> Wang Ran (汪然) WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> Weird Constructor +Weizhao Ouyang Welby Seely Wentai Zhang +Wilken Gottwalt <12194808+wgottwalt@users.noreply.github.com> WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com> William Tambellini William Tambellini @@ -816,6 +858,8 @@ chaihahaha chiranko <96988916+chiranko@users.noreply.github.com> clibdev <52199778+clibdev@users.noreply.github.com> clyang +cmdr2 +cmdr2 cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com> codezjx coezbek @@ -835,6 +879,7 @@ deepdiffuser <112834445+deepdiffuser@users.noreply.github.com> devojony <61173062+devojony@users.noreply.github.com> ditsuke divinity76 +dm4 dm4 dotpy314 <33351922+dotpy314@users.noreply.github.com> drbh @@ -849,6 +894,7 @@ fairydreaming <166155368+fairydreaming@users.noreply.github.com> fengerhu1 <2748250768@qq.com> fj-y-saito <85871716+fj-y-saito@users.noreply.github.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com> +fxzjshm <11426482+fxzjshm@users.noreply.github.com> github-actions[bot] gliptic gn64 @@ -873,6 +919,7 @@ hydai iSma iacore <74560659+iacore@users.noreply.github.com> icppWorld <124377669+icppWorld@users.noreply.github.com> +igardev <49397134+igardev@users.noreply.github.com> igarnier intelmatt <61025942+intelmatt@users.noreply.github.com> iohub @@ -880,6 +927,7 @@ issixx <46835150+issixx@users.noreply.github.com> jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jameswu2014 <545426914@qq.com> +jason_w jdomke <28772296+jdomke@users.noreply.github.com> jiahao su jiez <373447296@qq.com> @@ -891,6 +939,7 @@ jon-chuang <9093549+jon-chuang@users.noreply.github.com> jp-x-g jukofyork <69222624+jukofyork@users.noreply.github.com> junchao-loongson <68935141+junchao-loongson@users.noreply.github.com> +junchao-zhao <68935141+junchao-loongson@users.noreply.github.com> jwj7140 <32943891+jwj7140@users.noreply.github.com> k.h.lai kaizau @@ -925,6 +974,7 @@ ltoniazzi <61414566+ltoniazzi@users.noreply.github.com> luoyu-intel m3ndax maddes8cht <55592906+maddes8cht@users.noreply.github.com> +magicse mahorozte <41834471+mahorozte@users.noreply.github.com> makomk manikbhandari @@ -935,6 +985,7 @@ matt23654 matteo mdrokz mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> +midnight minarchist mj-shifu <77107165+mj-shifu@users.noreply.github.com> mmyjona @@ -958,10 +1009,12 @@ omahs <73983677+omahs@users.noreply.github.com> oobabooga <112222186+oobabooga@users.noreply.github.com> opparco ostix360 <55257054+ostix360@users.noreply.github.com> +pascal-lc <49066376+pascal-lc@users.noreply.github.com> pculliton peidaqi pengxin99 perserk +petterreinholdtsen piDack <104877312+piDack@users.noreply.github.com> pmysl postmasters @@ -983,6 +1036,7 @@ semidark serhii-nakon <57632032+serhii-nakon@users.noreply.github.com> sharpHL <132747147+sharpHL@users.noreply.github.com> shibe2 +simon886212 <37953122+simon886212@users.noreply.github.com> singularity <12184989+singularity-s0@users.noreply.github.com> sjinzh sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com> @@ -1000,10 +1054,12 @@ tarcey tc-mb <157115220+tc-mb@users.noreply.github.com> texmex76 <40733439+texmex76@users.noreply.github.com> thement <40525767+thement@users.noreply.github.com> +theraininsky <76763719+theraininsky@users.noreply.github.com> thewh1teagle <61390950+thewh1teagle@users.noreply.github.com> tjohnman toyer <2042519524@qq.com> tslmy +tv1wnd <55383215+tv1wnd@users.noreply.github.com> ubik2 uint256_t uint256_t @@ -1014,6 +1070,7 @@ valiray <133289098+valiray@users.noreply.github.com> vb vik viric +vmobilis <75476228+vmobilis@users.noreply.github.com> vodkaslime <646329483@qq.com> vvhg1 <94630311+vvhg1@users.noreply.github.com> vxiiduu <73044267+vxiiduu@users.noreply.github.com> @@ -1028,6 +1085,8 @@ wzy <32936898+Freed-Wu@users.noreply.github.com> xaedes xaedes xctan +xiaobing318 <71554036+xiaobing318@users.noreply.github.com> +xiaofei xloem <0xloem@gmail.com> yangli2 ymcki <84055651+ymcki@users.noreply.github.com> diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b2a1845e5..23cfbce5ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,8 @@ else() set(LLAMA_STANDALONE OFF) endif() +option(LLAMA_USE_SYSTEM_GGML "Use system libggml" OFF) + if (EMSCRIPTEN) set(BUILD_SHARED_LIBS_DEFAULT OFF) @@ -145,7 +147,13 @@ endif() # 3rd-party # -if (NOT TARGET ggml) +if (LLAMA_USE_SYSTEM_GGML) + message(STATUS "Using system-provided libggml, skipping ggml build") + find_package(ggml REQUIRED) + add_library(ggml ALIAS ggml::ggml) +endif() + +if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML) add_subdirectory(ggml) # ... otherwise assume ggml is added by a parent CMakeLists.txt endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8fab0de6f1..e68ff92445 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,7 +39,7 @@ _(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_ -- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code +- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` (from clang-tools v15+) to format the added code - For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines) - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices - Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ diff --git a/Makefile b/Makefile index 69cab9c667..1f9455eff0 100644 --- a/Makefile +++ b/Makefile @@ -680,6 +680,10 @@ ifdef GGML_CUDA_CCBIN MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN) endif # GGML_CUDA_CCBIN +ifdef GGML_CUDA_NO_FA + MK_NVCCFLAGS += -DGGML_CUDA_NO_FA +endif # GGML_CUDA_NO_FA + ifdef GGML_CUDA_FA_ALL_QUANTS MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS endif # GGML_CUDA_FA_ALL_QUANTS @@ -800,6 +804,10 @@ ifdef GGML_CUDA_NO_PEER_COPY HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY +ifdef GGML_CUDA_NO_FA + HIPFLAGS += -DGGML_CUDA_NO_FA +endif # GGML_CUDA_NO_FA + OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML_EXT += $(OBJ_CUDA_TMPL) @@ -828,7 +836,7 @@ ifdef GGML_MUSA else MUSA_PATH ?= /opt/musa endif - MUSA_ARCHITECTURES ?= 21;22 + MUSA_ARCHITECTURES ?= 21;22;31 MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib @@ -876,6 +884,10 @@ ifdef GGML_CUDA_NO_PEER_COPY MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY endif # GGML_CUDA_NO_PEER_COPY +ifdef GGML_CUDA_NO_FA + MUSAFLAGS += -DGGML_CUDA_NO_FA +endif # GGML_CUDA_NO_FA + ifdef GGML_CUDA_FA_ALL_QUANTS MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS endif # GGML_CUDA_FA_ALL_QUANTS diff --git a/Package.swift b/Package.swift deleted file mode 100644 index 01c996d242..0000000000 --- a/Package.swift +++ /dev/null @@ -1,19 +0,0 @@ -// swift-tools-version:5.5 - -import PackageDescription - -let package = Package( - name: "llama", - platforms: [ - .macOS(.v12), - .iOS(.v14), - .watchOS(.v4), - .tvOS(.v14) - ], - products: [ - .library(name: "llama", targets: ["llama"]), - ], - targets: [ - .systemLibrary(name: "llama", pkgConfig: "llama"), - ] -) diff --git a/README.md b/README.md index f70c8ae1ed..1eec944f27 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml) -[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) +[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml) Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ @@ -25,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) - **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode -- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639 +- Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669 @@ -157,6 +157,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp) - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) +- Delphi [Embarcadero/llama-cpp-delphi](https://github.com/Embarcadero/llama-cpp-delphi) @@ -171,6 +172,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) - [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0) - [janhq/jan](https://github.com/janhq/jan) (AGPL) +- [johnbean393/Sidekick](https://github.com/johnbean393/Sidekick) (MIT) - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0) - [KodiBot](https://github.com/firatkiral/kodibot) (GPL) - [llama.vim](https://github.com/ggml-org/llama.vim) (MIT) @@ -219,7 +221,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server - [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale - +- [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
diff --git a/Sources/llama/llama.h b/Sources/llama/llama.h deleted file mode 100644 index 41725880ed..0000000000 --- a/Sources/llama/llama.h +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once - -#include - diff --git a/Sources/llama/module.modulemap b/Sources/llama/module.modulemap deleted file mode 100644 index d010555b1c..0000000000 --- a/Sources/llama/module.modulemap +++ /dev/null @@ -1,5 +0,0 @@ -module llama [system] { - header "llama.h" - link "llama" - export * -} diff --git a/build-xcframework.sh b/build-xcframework.sh new file mode 100755 index 0000000000..2ce3939c43 --- /dev/null +++ b/build-xcframework.sh @@ -0,0 +1,519 @@ +#!/bin/bash +# +# Options +IOS_MIN_OS_VERSION=16.4 +MACOS_MIN_OS_VERSION=13.3 +VISIONOS_MIN_OS_VERSION=1.0 +TVOS_MIN_OS_VERSION=16.4 + +BUILD_SHARED_LIBS=OFF +LLAMA_BUILD_EXAMPLES=OFF +LLAMA_BUILD_TESTS=OFF +LLAMA_BUILD_SERVER=OFF +GGML_METAL=ON +GGML_METAL_EMBED_LIBRARY=ON +GGML_BLAS_DEFAULT=ON +GGML_METAL_USE_BF16=ON +GGML_OPENMP=OFF + +COMMON_C_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" +COMMON_CXX_FLAGS="-Wno-macro-redefined -Wno-shorten-64-to-32 -Wno-unused-command-line-argument -g" + +# Common options for all builds +COMMON_CMAKE_ARGS=( + -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_REQUIRED=NO + -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGN_IDENTITY="" + -DCMAKE_XCODE_ATTRIBUTE_CODE_SIGNING_ALLOWED=NO + -DCMAKE_XCODE_ATTRIBUTE_DEBUG_INFORMATION_FORMAT="dwarf-with-dsym" + -DCMAKE_XCODE_ATTRIBUTE_GCC_GENERATE_DEBUGGING_SYMBOLS=YES + -DCMAKE_XCODE_ATTRIBUTE_COPY_PHASE_STRIP=NO + -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} + -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES} + -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS} + -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER} + -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY} + -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT} + -DGGML_METAL=${GGML_METAL} + -DGGML_METAL_USE_BF16=${GGML_METAL_USE_BF16} + -DGGML_NATIVE=OFF + -DGGML_OPENMP=${GGML_OPENMP} +) + +check_required_tool() { + local tool=$1 + local install_message=$2 + + if ! command -v $tool &> /dev/null; then + echo "Error: $tool is required but not found." + echo "$install_message" + exit 1 + fi +} +echo "Checking for required tools..." +check_required_tool "cmake" "Please install CMake 3.28.0 or later (brew install cmake)" +check_required_tool "xcodebuild" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" +check_required_tool "libtool" "Please install libtool which should be available with Xcode Command Line Tools (CLT). Make sure Xcode CLT is installed (xcode-select --install)" +check_required_tool "dsymutil" "Please install Xcode and Xcode Command Line Tools (xcode-select --install)" + +set -e + +## Clean up previous builds +rm -rf build-apple +rm -rf build-ios-sim +rm -rf build-ios-device +rm -rf build-macos +rm -rf build-visionos +rm -rf build-visionos-sim +rm -rf build-tvos-sim +rm -rf build-tvos-device + +# Setup the xcframework build directory structure +setup_framework_structure() { + local build_dir=$1 + local min_os_version=$2 + local platform=$3 # "ios", "macos", "visionos", or "tvos" + local framework_name="llama" + + echo "Creating ${platform}-style framework structure for ${build_dir}" + + if [[ "$platform" == "macos" ]]; then + # macOS versioned structure uses versioned directories + mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Headers + mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Modules + mkdir -p ${build_dir}/framework/${framework_name}.framework/Versions/A/Resources + + # Create symbolic links + ln -sf A ${build_dir}/framework/${framework_name}.framework/Versions/Current + ln -sf Versions/Current/Headers ${build_dir}/framework/${framework_name}.framework/Headers + ln -sf Versions/Current/Modules ${build_dir}/framework/${framework_name}.framework/Modules + ln -sf Versions/Current/Resources ${build_dir}/framework/${framework_name}.framework/Resources + ln -sf Versions/Current/${framework_name} ${build_dir}/framework/${framework_name}.framework/${framework_name} + + # Set header and module paths + local header_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Headers/ + local module_path=${build_dir}/framework/${framework_name}.framework/Versions/A/Modules/ + else + # iOS/VisionOS/tvOS use a flat structure + mkdir -p ${build_dir}/framework/${framework_name}.framework/Headers + mkdir -p ${build_dir}/framework/${framework_name}.framework/Modules + + # Remove any existing structure to ensure clean build + rm -rf ${build_dir}/framework/${framework_name}.framework/Versions + + # Set header and module paths + local header_path=${build_dir}/framework/${framework_name}.framework/Headers/ + local module_path=${build_dir}/framework/${framework_name}.framework/Modules/ + fi + + # Copy all required headers (common for all platforms) + cp include/llama.h ${header_path} + cp ggml/include/ggml.h ${header_path} + cp ggml/include/ggml-alloc.h ${header_path} + cp ggml/include/ggml-backend.h ${header_path} + cp ggml/include/ggml-metal.h ${header_path} + cp ggml/include/ggml-cpu.h ${header_path} + cp ggml/include/ggml-blas.h ${header_path} + cp ggml/include/gguf.h ${header_path} + + # Create module map (common for all platforms) + cat > ${module_path}module.modulemap << EOF +framework module llama { + header "llama.h" + header "ggml.h" + header "ggml-alloc.h" + header "ggml-backend.h" + header "ggml-metal.h" + header "ggml-cpu.h" + header "ggml-blas.h" + header "gguf.h" + + link "c++" + link framework "Accelerate" + link framework "Metal" + link framework "Foundation" + + export * +} +EOF + + # Platform-specific settings for Info.plist + local platform_name="" + local sdk_name="" + local supported_platform="" + + case "$platform" in + "ios") + platform_name="iphoneos" + sdk_name="iphoneos${min_os_version}" + supported_platform="iPhoneOS" + local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" + local device_family=' UIDeviceFamily + + 1 + 2 + ' + ;; + "macos") + platform_name="macosx" + sdk_name="macosx${min_os_version}" + supported_platform="MacOSX" + local plist_path="${build_dir}/framework/${framework_name}.framework/Versions/A/Resources/Info.plist" + local device_family="" + ;; + "visionos") + platform_name="xros" + sdk_name="xros${min_os_version}" + supported_platform="XRPlatform" + local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" + local device_family="" + ;; + "tvos") + platform_name="appletvos" + sdk_name="appletvos${min_os_version}" + supported_platform="AppleTVOS" + local plist_path="${build_dir}/framework/${framework_name}.framework/Info.plist" + local device_family=' UIDeviceFamily + + 3 + ' + ;; + esac + + # Create Info.plist + cat > ${plist_path} << EOF + + + + + CFBundleDevelopmentRegion + en + CFBundleExecutable + llama + CFBundleIdentifier + org.ggml.llama + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + llama + CFBundlePackageType + FMWK + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + MinimumOSVersion + ${min_os_version} + CFBundleSupportedPlatforms + + ${supported_platform} + ${device_family} + DTPlatformName + ${platform_name} + DTSDKName + ${sdk_name} + + +EOF +} + +# Create dynamic libraries from static libraries. +combine_static_libraries() { + local build_dir="$1" + local release_dir="$2" + local platform="$3" # "ios", "macos", "visionos", or "tvos" + local is_simulator="$4" + local base_dir="$(pwd)" + local framework_name="llama" + + # Determine output path based on platform + local output_lib="" + if [[ "$platform" == "macos" ]]; then + # macOS uses versioned structure + output_lib="${build_dir}/framework/${framework_name}.framework/Versions/A/${framework_name}" + else + # iOS, visionOS, and tvOS use a directory flat structure + output_lib="${build_dir}/framework/${framework_name}.framework/${framework_name}" + fi + + local libs=( + "${base_dir}/${build_dir}/src/${release_dir}/libllama.a" + "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml.a" + "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-base.a" + "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a" + "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a" + "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a" + ) + + # Create temporary directory for processing + local temp_dir="${base_dir}/${build_dir}/temp" + mkdir -p "${temp_dir}" + + # Since we have multiple architectures libtool will find object files that do not + # match the target architecture. We suppress these warnings. + libtool -static -o "${temp_dir}/combined.a" "${libs[@]}" 2> /dev/null + + # Determine SDK, architectures, and install_name based on platform and simulator flag. + local sdk="" + local archs="" + local min_version_flag="" + local install_name="" + + case "$platform" in + "ios") + if [[ "$is_simulator" == "true" ]]; then + sdk="iphonesimulator" + archs="arm64 x86_64" + min_version_flag="-mios-simulator-version-min=${IOS_MIN_OS_VERSION}" + else + sdk="iphoneos" + archs="arm64" + min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}" + fi + install_name="@rpath/llama.framework/llama" + ;; + "macos") + sdk="macosx" + archs="arm64 x86_64" + min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}" + install_name="@rpath/llama.framework/Versions/Current/llama" + ;; + "visionos") + if [[ "$is_simulator" == "true" ]]; then + sdk="xrsimulator" + archs="arm64 x86_64" + min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}-simulator" + else + sdk="xros" + archs="arm64" + min_version_flag="-mtargetos=xros${VISIONOS_MIN_OS_VERSION}" + fi + # Use flat structure for visionOS, same as iOS + install_name="@rpath/llama.framework/llama" + ;; + "tvos") + if [[ "$is_simulator" == "true" ]]; then + sdk="appletvsimulator" + archs="arm64 x86_64" + min_version_flag="-mtvos-simulator-version-min=${TVOS_MIN_OS_VERSION}" + else + sdk="appletvos" + archs="arm64" + min_version_flag="-mtvos-version-min=${TVOS_MIN_OS_VERSION}" + fi + install_name="@rpath/llama.framework/llama" + ;; + esac + + # Build architecture flags + local arch_flags="" + for arch in $archs; do + arch_flags+=" -arch $arch" + done + + # Create dynamic library + echo "Creating dynamic library for ${platform}." + xcrun -sdk $sdk clang++ -dynamiclib \ + -isysroot $(xcrun --sdk $sdk --show-sdk-path) \ + $arch_flags \ + $min_version_flag \ + -Wl,-force_load,"${temp_dir}/combined.a" \ + -framework Foundation -framework Metal -framework Accelerate \ + -install_name "$install_name" \ + -o "${base_dir}/${output_lib}" + + # Platform-specific post-processing for device builds + if [[ "$is_simulator" == "false" ]]; then + if command -v vtool &>/dev/null; then + case "$platform" in + "ios") + echo "Marking binary as a framework binary for iOS..." + vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \ + -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" + ;; + "visionos") + echo "Marking binary as a framework binary for visionOS..." + vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \ + -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" + ;; + "tvos") + echo "Marking binary as a framework binary for tvOS..." + vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \ + -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}" + ;; + esac + else + echo "Warning: vtool not found. Binary may not pass App Store validation." + fi + fi + + echo "Creating properly formatted dSYM..." + # Create a separate directory for dSYMs for all platforms + mkdir -p "${base_dir}/${build_dir}/dSYMs" + + # iOS and visionOS style dSYM (flat structure) + if [[ "$platform" == "ios" || "$platform" == "visionos" || "$platform" == "tvos" ]]; then + # Generate dSYM in the dSYMs directory + xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" + + # Create a copy of the binary that will be stripped + cp "${base_dir}/${output_lib}" "${temp_dir}/binary_to_strip" + + # Strip debug symbols from the copy + xcrun strip -S "${temp_dir}/binary_to_strip" -o "${temp_dir}/stripped_lib" + + # Replace the original with the stripped version + mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" + else + # macOS style dSYM + # First strip debug info to a separate file + xcrun strip -S "${base_dir}/${output_lib}" -o "${temp_dir}/stripped_lib" + + # Generate dSYM in the dSYMs directory + xcrun dsymutil "${base_dir}/${output_lib}" -o "${base_dir}/${build_dir}/dSYMs/llama.dSYM" + + # Replace original binary with stripped version + mv "${temp_dir}/stripped_lib" "${base_dir}/${output_lib}" + fi + + # Remove any automatically generated dSYM files in the framework structure as they will + # otherwise case Invalid Bundle Structure validation errors. + if [ -d "${base_dir}/${output_lib}.dSYM" ]; then + echo "Removing generated dSYM file in framework structure: ${base_dir}/${output_lib}.dSYM" + rm -rf "${base_dir}/${output_lib}.dSYM" + fi + + # Clean up + rm -rf "${temp_dir}" +} + +echo "Building for iOS simulator..." +cmake -B build-ios-sim -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ + -DIOS=ON \ + -DCMAKE_SYSTEM_NAME=iOS \ + -DCMAKE_OSX_SYSROOT=iphonesimulator \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-ios-sim --config Release -- -quiet + +echo "Building for iOS devices..." +cmake -B build-ios-device -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${IOS_MIN_OS_VERSION} \ + -DCMAKE_OSX_SYSROOT=iphoneos \ + -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-ios-device --config Release -- -quiet + +echo "Building for macOS..." +cmake -B build-macos -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${MACOS_MIN_OS_VERSION} \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-macos --config Release -- -quiet + +echo "Building for visionOS..." +cmake -B build-visionos -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ + -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DCMAKE_SYSTEM_NAME=visionOS \ + -DCMAKE_OSX_SYSROOT=xros \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \ + -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-visionos --config Release -- -quiet + +echo "Building for visionOS simulator..." +cmake -B build-visionos-sim -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${VISIONOS_MIN_OS_VERSION} \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ + -DCMAKE_SYSTEM_NAME=visionOS \ + -DCMAKE_OSX_SYSROOT=xrsimulator \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \ + -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-visionos-sim --config Release -- -quiet + +# Add tvOS builds (might need the same u_int definitions as watchOS and visionOS) +echo "Building for tvOS simulator..." +cmake -B build-tvos-sim -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ + -DCMAKE_SYSTEM_NAME=tvOS \ + -DCMAKE_OSX_SYSROOT=appletvsimulator \ + -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \ + -DGGML_METAL=ON \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvsimulator \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-tvos-sim --config Release -- -quiet + +echo "Building for tvOS devices..." +cmake -B build-tvos-device -G Xcode \ + "${COMMON_CMAKE_ARGS[@]}" \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=${TVOS_MIN_OS_VERSION} \ + -DCMAKE_SYSTEM_NAME=tvOS \ + -DCMAKE_OSX_SYSROOT=appletvos \ + -DCMAKE_OSX_ARCHITECTURES="arm64" \ + -DGGML_METAL=ON \ + -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=appletvos \ + -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ + -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ + -S . +cmake --build build-tvos-device --config Release -- -quiet + +# Setup frameworks and copy binaries and headers +echo "Setting up framework structures..." +setup_framework_structure "build-ios-sim" ${IOS_MIN_OS_VERSION} "ios" +setup_framework_structure "build-ios-device" ${IOS_MIN_OS_VERSION} "ios" +setup_framework_structure "build-macos" ${MACOS_MIN_OS_VERSION} "macos" +setup_framework_structure "build-visionos" ${VISIONOS_MIN_OS_VERSION} "visionos" +setup_framework_structure "build-visionos-sim" ${VISIONOS_MIN_OS_VERSION} "visionos" +setup_framework_structure "build-tvos-sim" ${TVOS_MIN_OS_VERSION} "tvos" +setup_framework_structure "build-tvos-device" ${TVOS_MIN_OS_VERSION} "tvos" + +# Create dynamic libraries from static libraries +echo "Creating dynamic libraries from static libraries..." +combine_static_libraries "build-ios-sim" "Release-iphonesimulator" "ios" "true" +combine_static_libraries "build-ios-device" "Release-iphoneos" "ios" "false" +combine_static_libraries "build-macos" "Release" "macos" "false" +combine_static_libraries "build-visionos" "Release-xros" "visionos" "false" +combine_static_libraries "build-visionos-sim" "Release-xrsimulator" "visionos" "true" +combine_static_libraries "build-tvos-sim" "Release-appletvsimulator" "tvos" "true" +combine_static_libraries "build-tvos-device" "Release-appletvos" "tvos" "false" + +# Create XCFramework with correct debug symbols paths +echo "Creating XCFramework..." +xcodebuild -create-xcframework \ + -framework $(pwd)/build-ios-sim/framework/llama.framework \ + -debug-symbols $(pwd)/build-ios-sim/dSYMs/llama.dSYM \ + -framework $(pwd)/build-ios-device/framework/llama.framework \ + -debug-symbols $(pwd)/build-ios-device/dSYMs/llama.dSYM \ + -framework $(pwd)/build-macos/framework/llama.framework \ + -debug-symbols $(pwd)/build-macos/dSYMS/llama.dSYM \ + -framework $(pwd)/build-visionos/framework/llama.framework \ + -debug-symbols $(pwd)/build-visionos/dSYMs/llama.dSYM \ + -framework $(pwd)/build-visionos-sim/framework/llama.framework \ + -debug-symbols $(pwd)/build-visionos-sim/dSYMs/llama.dSYM \ + -framework $(pwd)/build-tvos-device/framework/llama.framework \ + -debug-symbols $(pwd)/build-tvos-device/dSYMs/llama.dSYM \ + -framework $(pwd)/build-tvos-sim/framework/llama.framework \ + -debug-symbols $(pwd)/build-tvos-sim/dSYMs/llama.dSYM \ + -output $(pwd)/build-apple/llama.xcframework diff --git a/ci/run.sh b/ci/run.sh index 77c32ce005..9fc19c89d8 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -352,10 +352,10 @@ function gg_run_open_llama_7b_v2 { (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state--model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" diff --git a/cmake/common.cmake b/cmake/common.cmake index 0f54871e41..a5bb787f15 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -1,3 +1,5 @@ +include("ggml/cmake/common.cmake") + function(llama_add_compile_flags) if (LLAMA_FATAL_WARNINGS) if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") diff --git a/common/arg.cpp b/common/arg.cpp index 3c169b5b5f..b6bfe6f89b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -764,7 +764,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", - string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), + string_format( + ex == LLAMA_EXAMPLE_MAIN || ex == LLAMA_EXAMPLE_INFILL + ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)" + : "number of tokens to predict (default: %d, -1 = infinity)", + params.n_predict), [](common_params & params, int value) { params.n_predict = value; } @@ -813,13 +817,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(common_arg( {"-p", "--prompt"}, "PROMPT", - ex == LLAMA_EXAMPLE_MAIN - ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" - : "prompt to start generation with", + "prompt to start generation with; for system message, use -sys", [](common_params & params, const std::string & value) { params.prompt = value; } ).set_excludes({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-sys", "--system-prompt"}, "PROMPT", + "system prompt to use with model (if applicable, depending on chat template)", + [](common_params & params, const std::string & value) { + params.system_prompt = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), @@ -844,6 +853,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } } ).set_excludes({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-sysf", "--system-prompt-file"}, "FNAME", + "a file containing the system prompt (default: none)", + [](common_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); + } + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.system_prompt)); + if (!params.system_prompt.empty() && params.system_prompt.back() == '\n') { + params.system_prompt.pop_back(); + } + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", @@ -944,6 +967,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; } ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(common_arg( + {"-st", "--single-turn"}, + "run conversation for a single turn only, then exit when done\n" + "will not be interactive if first turn is predefined with --prompt\n" + "(default: false)", + [](common_params & params) { + params.single_turn = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-i", "--interactive"}, string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), @@ -1853,18 +1885,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-o", "--output", "--output-file"}, "FNAME", - string_format("output file (default: '%s')", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? params.lora_outfile.c_str() - : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR - ? params.cvector_outfile.c_str() - : params.out_file.c_str()), + string_format("output file (default: '%s')", params.out_file.c_str()), [](common_params & params, const std::string & value) { params.out_file = value; - params.cvector_outfile = value; - params.lora_outfile = value; } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_TTS})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), @@ -2447,6 +2472,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.vocoder.use_guide_tokens = true; } ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--tts-speaker-file"}, "FNAME", + "speaker file path for audio generation", + [](common_params & params, const std::string & value) { + params.vocoder.speaker_file = value; + } + ).set_examples({LLAMA_EXAMPLE_TTS})); // model-specific add_opt(common_arg( @@ -2550,5 +2582,43 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"--fim-qwen-7b-spec"}, + string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.speculative.n_gpu_layers = 99; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + + add_opt(common_arg( + {"--fim-qwen-14b-spec"}, + string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF"; + params.hf_file = "qwen2.5-coder-14b-q8_0.gguf"; + params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.speculative.n_gpu_layers = 99; + params.port = 8012; + params.n_gpu_layers = 99; + params.flash_attn = true; + params.n_ubatch = 1024; + params.n_batch = 1024; + params.n_ctx = 0; + params.n_cache_reuse = 256; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + return ctx_arg; } diff --git a/common/chat.cpp b/common/chat.cpp index 9ebe4c5784..62ca26ad76 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -60,7 +60,9 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa } msg.role = message.at("role"); - if (message.contains("content")) { + auto has_content = message.contains("content"); + auto has_tool_calls = message.contains("tool_calls"); + if (has_content) { const auto & content = message.at("content"); if (content.is_string()) { msg.content = content; @@ -81,19 +83,8 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa } else if (!content.is_null()) { throw std::runtime_error("Invalid 'content' type: expected string or array, got " + content.dump() + " (ref: https://github.com/ggml-org/llama.cpp/issues/8367)"); } - } else { - throw std::runtime_error("Expected 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)"); } - if (message.contains("reasoning_content")) { - msg.reasoning_content = message.at("reasoning_content"); - } - if (message.contains("name")) { - msg.tool_name = message.at("name"); - } - if (message.contains("tool_call_id")) { - msg.tool_call_id = message.at("tool_call_id"); - } - if (message.contains("tool_calls")) { + if (has_tool_calls) { for (const auto & tool_call : message.at("tool_calls")) { common_chat_tool_call tc; if (!tool_call.contains("type")) { @@ -118,6 +109,18 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa msg.tool_calls.push_back(tc); } } + if (!has_content && !has_tool_calls) { + throw std::runtime_error("Expected 'content' or 'tool_calls' (ref: https://github.com/ggml-org/llama.cpp/issues/8367 & https://github.com/ggml-org/llama.cpp/issues/12279)"); + } + if (message.contains("reasoning_content")) { + msg.reasoning_content = message.at("reasoning_content"); + } + if (message.contains("name")) { + msg.tool_name = message.at("name"); + } + if (message.contains("tool_call_id")) { + msg.tool_call_id = message.at("tool_call_id"); + } msgs.push_back(msg); } @@ -442,6 +445,7 @@ std::string common_chat_format_name(common_chat_format format) { case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2"; case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1"; case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro"; + case COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING: return "Hermes 2 Pro (extract reasoning)"; case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B"; case COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING: return "Command R7B (extract reasoning)"; default: @@ -449,12 +453,6 @@ std::string common_chat_format_name(common_chat_format format) { } } -const common_grammar_options grammar_options { - /* .dotall = */ false, - /* .compact_spaces = */ false, - // /* .compact_spaces = */ true, -}; - static bool parse_json(std::string::const_iterator & it, const std::string::const_iterator & end, json & out) { // // https://json.nlohmann.me/features/parsing/sax_interface/ struct json_error_locator : public nlohmann::json_sax { @@ -500,6 +498,34 @@ static bool parse_json(std::string::const_iterator & it, const std::string::cons } } +static bool parse_literal(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) { + auto expected_it = expected.begin(); + auto tmp_it = it; + while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) { + ++tmp_it; + ++expected_it; + } + if (expected_it == expected.end()) { + it = tmp_it; + return true; + } + return false; +} + +static std::optional parse_pattern(std::string::const_iterator & it, const std::string::const_iterator & end, const std::regex & expected) { + std::smatch match; + if (std::regex_match(it, end, match, expected)) { + it = match.suffix().first; + return match; + } + return std::nullopt; +} + +static void consume_spaces(std::string::const_iterator & it, const std::string::const_iterator & end) { + while (it != end && std::isspace(*it)) { + ++it; + } +} /** * Takes a prefix regex that must have 1 group to capture the function name, a closing suffix, and expects json parameters in between. @@ -509,7 +535,8 @@ static common_chat_msg parse_json_tool_calls( const std::string& input, const std::optional & trigger_opt, const std::regex & function_regex, - const std::regex & close_regex) { + const std::regex & close_regex, + bool allow_raw_python = false) { std::smatch match; common_chat_msg result; @@ -540,14 +567,19 @@ static common_chat_msg parse_json_tool_calls( it = rit->suffix().first; json arguments; - if (!parse_json(it, end, arguments)) { + if (parse_json(it, end, arguments)) { + if (!std::regex_search(it, end, match, close_regex)) { + throw std::runtime_error("Malformed input, missing closing pattern: " + input); + } + it = match.suffix().first; + result.tool_calls.push_back({name, arguments.is_string() ? arguments.get() : arguments.dump(), /* id= */ ""}); + } else { + if (allow_raw_python && name == "python") { + result.tool_calls.push_back({name, json({{"code", std::string(it, end)}}).dump(), /* id= */ ""}); + break; + } throw std::runtime_error("Failed to parse json tool call arguments: " + input); } - if (!std::regex_search(it, end, match, close_regex)) { - throw std::runtime_error("Malformed input, missing closing pattern: " + input); - } - it = match.suffix().first; - result.tool_calls.push_back({name, arguments.is_string() ? arguments.get() : arguments.dump(), /* id= */ ""}); } if (!result.tool_calls.empty()) { @@ -559,29 +591,29 @@ static common_chat_msg parse_json_tool_calls( return result; } +static common_chat_tool_call process_tool_call(const json & tool_call) { + const auto & arguments = tool_call.at("arguments"); + return { + /* .name = */ tool_call.at("name"), + /* .arguments = */ arguments.is_string() ? arguments.get() : arguments.dump(), + /* .id = */ tool_call.contains("id") ? tool_call.at("id") : "", + }; +} static common_chat_msg parse_prefixed_json_tool_call_array(const std::string& input, const std::string & prefix, size_t rstrip_prefix = 0) { auto content_end = input.find(prefix); size_t tc_start = std::string::npos; common_chat_msg result; result.role = "assistant"; - const auto process_tool_calls = [&](const json & tool_calls) { - for (const auto & tool_call : tool_calls) { - const auto & arguments = tool_call.at("arguments"); - result.tool_calls.push_back({ - tool_call.at("name"), - arguments.is_string() ? arguments.get() : arguments.dump(), - tool_call.contains("id") ? tool_call.at("id") : "", - }); - } - }; if (content_end == std::string::npos) { result.content = input; } else { tc_start = content_end + prefix.size() - rstrip_prefix; result.content = input.substr(0, content_end); auto tool_calls = json::parse(input.substr(tc_start)); - process_tool_calls(tool_calls); + for (const auto & tool_call : tool_calls) { + result.tool_calls.emplace_back(process_tool_call(tool_call)); + } } return result; } @@ -700,7 +732,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp data.grammar_lazy = false; data.grammar = build_grammar([&](const common_grammar_builder & builder) { builder.add_schema("root", schema); - }, grammar_options); + }); auto tweaked_messages = common_chat_template::add_system( inputs.messages, @@ -770,8 +802,11 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat schema["maxItems"] = 1; } builder.add_rule("root", "\"[TOOL_CALLS]\" " + builder.add_schema("tool_calls", schema)); - }, grammar_options); - data.grammar_triggers.push_back({"[TOOL_CALLS]", /* .at_start = */ true}); + }); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "[TOOL_CALLS]"}); + data.preserved_tokens = { + "[TOOL_CALLS]", + }; data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO; return data; @@ -813,14 +848,18 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ schema["maxItems"] = 1; } builder.add_rule("root", "\"<|START_ACTION|>\" " + builder.add_schema("tool_calls", schema) + " \"<|END_ACTION|>\""); - }, grammar_options); - data.grammar_triggers.push_back({"<|START_ACTION|>", /* .at_start = */ false}); + }); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + "<|START_ACTION|>", + }); data.preserved_tokens = { + "<|START_ACTION|>", + "<|END_ACTION|>", "<|START_RESPONSE|>", "<|END_RESPONSE|>", "<|START_THINKING|>", "<|END_THINKING|>", - "<|END_ACTION|>", }; auto adjusted_messages = json::array(); for (const auto & msg : inputs.messages) { @@ -840,9 +879,9 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_ return data; } static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool extract_reasoning) { - static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|>)([\\s\\S\\n\\r]*)"); - static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>"); - static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>"); + static const std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S]*?)<\\|END_THINKING\\|>)([\\s\\S]*)"); + static const std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S]*?)<\\|END_ACTION\\|>"); + static const std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S]*?)<\\|END_RESPONSE\\|>"); std::smatch match; @@ -945,23 +984,23 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com builder.add_rule( name + "-call", "\"{\" space " - "( \"\\\"type\\\":\" space \"\\\"function\\\",\" space )? " - "\"\\\"name\\\": \\\"" + name + "\\\", \\\"parameters\\\": \" " + - builder.add_schema(name + "-args", parameters) + - " \"}\"")); - data.grammar_triggers.push_back({"{\"name\": \"" + name + "\"", /* .at_start = */ true}); + "( \"\\\"type\\\"\" space \":\" space \"\\\"function\\\"\" space \",\" space )? " + " \"\\\"name\\\"\" space \":\" space \"\\\"" + name + "\\\"\" space \",\" space " + " \"\\\"parameters\\\"\" space \":\" space " + builder.add_schema(name + "-args", parameters) + " " + "\"}\" space")); + }); + // Small models may hallucinate function names so we match anything (*at the start*) that looks like the JSON of a function call, regardless of the name. + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, + "\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"", // + name + "\"[\\s\\S]*", }); - data.grammar_triggers.push_back({"{\"name\":", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"name\":", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\"type\": \"function\"", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true}); - data.grammar_triggers.push_back({"{\n \"type\": \"function\"", /* .at_start = */ true}); if (!builtin_tools.empty()) { - data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"}); + data.preserved_tokens.push_back("<|python_tag|>"); } + // Allow a few empty lines on top of the usual constrained json schema space rule. builder.add_rule("root", string_join(tool_rules, " | ")); - }, grammar_options); + }); data.additional_stops.push_back("<|eom_id|>"); data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, { {"tools_in_user_message", false}, @@ -974,33 +1013,33 @@ static common_chat_params common_chat_params_init_llama_3_1_tool_calls(const com } static common_chat_msg common_chat_parse_llama_3_1(const std::string & input, bool with_builtin_tools = false) { // TODO: tighten & simplify the parser, don't accept leading text context. - static std::regex function_regex("\\{[\\s\\n\\r]*(?:\"type\"[\\s\\n\\r]*:[\\s\\n\\r]*\"function\"[\\s\\n\\r]*,[\\s\\n\\r]*|[\\s\\n\\r]*)\"name\"[\\s\\n\\r]*:[\\s\\n\\r]*\"([^\"]+)\"[\\s\\n\\r]*,[\\s\\n\\r]*\"parameters\": "); - static std::regex close_regex("\\}"); - static std::regex builtin_call_regex("<\\|python_tag\\|>([^.(]+)\\.call\\((.*)\\)"); + static const std::regex function_regex( + "\\s*\\{\\s*(?:\"type\"\\s*:\\s*\"function\"\\s*,\\s*)?\"name\"\\s*:\\s*\"([^\"]+)\"\\s*,\\s*\"parameters\"\\s*: "); + static const std::regex close_regex("\\}\\s*"); + static const std::regex builtin_call_regex("<\\|python_tag\\|>\\s*([^.(]+)\\s*\\.\\s*call\\s*\\(\\s*([\\w]+)\\s*=\\s*([\\s\\S]*?)\\)"); if (with_builtin_tools) { std::smatch match; if (std::regex_match(input, match, builtin_call_regex)) { - auto name = match[1].str(); - auto raw_args = match[2].str(); + try { + auto name = match[1].str(); + auto arg_name = match[2].str(); + auto arg_value_str = match[3].str(); + auto arg_value = json::parse(arg_value_str); - // TODO: if/when builtin tools start accepting more than 1 argument, use parse_json for real parsing. - auto it_eq = raw_args.find('='); - auto arg_name = raw_args.substr(0, it_eq); - auto arg_value_str = raw_args.substr(it_eq + 1); - auto arg_value = json::parse(arg_value_str); - - common_chat_msg msg; - msg.role = "assistant"; - msg.content = match.prefix().str(); - msg.tool_calls.push_back({ - /* .name = */ name, - /* .arguments = */ (json { - {arg_name, arg_value}, - }).dump(), - /* .id = */ "", - }); - return msg; + common_chat_msg msg; + msg.role = "assistant"; + msg.tool_calls.push_back({ + /* .name = */ name, + /* .arguments = */ (json { + {arg_name, arg_value}, + }).dump(), + /* .id = */ "", + }); + return msg; + } catch (const std::exception & e) { + LOG_WRN("Failed to parse builtin tool call arguments (%s): %s", e.what(), input.c_str()); + } } } return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); @@ -1017,10 +1056,10 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ std::string name = function.at("name"); auto parameters = function.at("parameters"); builder.resolve_refs(parameters); - auto args_rule = builder.add_schema(name + "-args", parameters); tool_rules.push_back(builder.add_rule(name + "-call", "\"<|tool▁call▁begin|>function<|tool▁sep|>" + name + "\\n" - "```json\\n\" " + args_rule + " \"```<|tool▁call▁end|>\"")); + "```json\\n\" " + builder.add_schema(name + "-args", parameters) + " " + "\"```<|tool▁call▁end|>\"")); }); // Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag, // so we accept common variants (then it's all constrained) @@ -1029,18 +1068,20 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ "(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " " "\"<|tool▁calls▁end|>\"" " space"); - data.grammar_triggers.push_back({"<|tool▁calls▁begin|>", /* .at_start = */ false}); - data.grammar_triggers.push_back({"<|tool_calls_begin|>", /* .at_start = */ false}); - data.grammar_triggers.push_back({"<|tool calls begin|>", /* .at_start = */ false}); - data.grammar_triggers.push_back({"<|tool\\_calls\\_begin|>", /* .at_start = */ false}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool▁calls▁begin|>"}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool_calls_begin|>"}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool calls begin|>"}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|tool\\_calls\\_begin|>"}); data.preserved_tokens = { "", "", + "<|tool▁calls▁begin|>", + "<|tool▁call▁begin|>", "<|tool▁sep|>", - "<|tool▁calls▁end|", "<|tool▁call▁end|>", + "<|tool▁calls▁end|", }; - }, grammar_options); + }); } auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); @@ -1065,34 +1106,42 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_ data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING : COMMON_CHAT_FORMAT_DEEPSEEK_R1; return data; } -static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) { - static std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n"); - static std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>"); - static std::regex reasoning_content_regex("((?:)?([\\s\\S\\r\\n]*?))?([\\s\\S\\r\\n]*)"); - static std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>"); - common_chat_msg msg; - msg.role = "assistant"; +static common_chat_msg handle_think_tag_prelude(const std::string & input, bool extract_reasoning, const std::function & rest_parser) { std::smatch match; + static const std::regex reasoning_content_regex("((?:)?([\\s\\S\\r\\n]*?))?([\\s\\S\\r\\n]*)"); if (std::regex_match(input, match, reasoning_content_regex)) { - std::string rest; + auto rest = match[3].str(); + auto msg = rest_parser(rest); + auto reasoning_content = string_strip(match[2].str()); if (extract_reasoning) { - msg.reasoning_content = string_strip(match[2].str()); - } else { - msg.content = match[1].str(); + msg.reasoning_content = reasoning_content; + } else if (!reasoning_content.empty()) { + std::ostringstream content; + content << "" << reasoning_content << "" << msg.content; + msg.content = content.str(); } - rest = match[3].str(); + return msg; + } + return rest_parser(input); +} +static common_chat_msg common_chat_parse_deepseek_r1(const std::string & input, bool extract_reasoning) { + return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) { + static const std::regex function_regex("<|tool▁call▁begin|>function<|tool▁sep|>([^\n]+)\n```json\n"); + static const std::regex close_regex("```[\\s\\r\\n]*<|tool▁call▁end|>"); + static const std::regex tool_calls_regex("[\\s\\r\\n]*(?:<|tool▁calls▁begin|>|<|tool_calls_begin|>|<|tool calls begin|>|<|tool\\\\_calls\\\\_begin|>)([\\s\\S\\r\\n]*?)<|tool▁calls▁end|>"); - if (std::regex_search(rest, match, tool_calls_regex)) { + common_chat_msg msg; + msg.role = "assistant"; + std::smatch match; + if (std::regex_search(input, match, tool_calls_regex)) { auto tool_calls = match[1].str(); auto msg2 = parse_json_tool_calls(tool_calls, std::nullopt, function_regex, close_regex); msg.tool_calls = std::move(msg2.tool_calls); } else { - msg.content += std::string(rest.begin() + rest.find_first_not_of(" \r\n"), rest.end()); + msg.content = input; } - } else { - msg.content = input; - } - return msg; + return msg; + }); } static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) { @@ -1129,8 +1178,11 @@ static common_chat_params common_chat_params_init_firefunction_v2(const common_c schema["maxItems"] = 1; } builder.add_rule("root", "\" functools\"? " + builder.add_schema("tool_calls", schema)); - }, grammar_options); - data.grammar_triggers.push_back({" functools[", /* .at_start = */ false}); + }); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, " functools["}); + data.preserved_tokens = { + " functools[", + }; data.format = COMMON_CHAT_FORMAT_FIREFUNCTION_V2; } else { data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; @@ -1158,11 +1210,28 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ auto parameters = function.at("parameters"); builder.resolve_refs(parameters); auto args_rule = builder.add_schema(name + "-args", parameters); - first_tool_rules.push_back(builder.add_rule(name + "-call", "\"" + name + "\\n\" " + args_rule)); + first_tool_rules.push_back(builder.add_rule(name + "-call", "( \"assistant<|end_header_id|>\\n\" )? \"" + name + "\\n\" " + args_rule)); subsequent_tool_rules.push_back(builder.add_rule(name + "-call2", "\">>>" + name + "\\n\" " + args_rule)); - data.grammar_triggers.push_back({name, /* .at_start = */ true}); - data.grammar_triggers.push_back({">>>" + name, /* .at_start = */ false}); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, + regex_escape(name + "\n"), + }); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, + regex_escape("assistant<|end_header_id|>\n" + name + "\n"), + }); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + regex_escape(">>>" + name + "\n"), + }); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + ">>>assistant<|end_header_id|>\n" + name, + }); }); + data.preserved_tokens = { + "<|end_header_id|>", + }; auto first_rule = first_tool_rules.empty() ? "" : builder.add_rule("first_tool_call", string_join(first_tool_rules, " | ")) + " space"; if (inputs.parallel_tool_calls) { auto subsequent_rule = builder.add_rule("subsequent_tool_call", string_join(subsequent_tool_rules, " | ")) + " space"; @@ -1171,34 +1240,20 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_ builder.add_rule("root", first_rule); } - }, grammar_options); + }); } return data; } -static bool consume(std::string::const_iterator & it, const std::string::const_iterator & end, const std::string & expected) { - auto expected_it = expected.begin(); - auto tmp_it = it; - while (tmp_it != end && expected_it != expected.end() && *tmp_it == *expected_it) { - ++tmp_it; - ++expected_it; - } - if (expected_it == expected.end()) { - it = tmp_it; - return true; - } - return false; -} - static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & input) { - static std::regex function_regex(R"((?:>>>)?(\w+)\n)"); - static std::regex close_regex(R"($|(?=>>>))"); + static const std::regex function_regex(R"((?:>>>)?(?:assistant<|end_header_id|>\n)?(\w+)\n)"); + static const std::regex close_regex(R"($|(?=>>>))"); std::string content; auto it = input.begin(); const auto end = input.end(); - if (consume(it, end, "all\n")) { + if (parse_literal(it, end, "all\n")) { std::smatch match; if (std::regex_search(it, end, match, function_regex)) { auto fun_it = match.prefix().second; @@ -1213,7 +1268,7 @@ static common_chat_msg common_chat_parse_functionary_v3_2(const std::string & in } // TODO: tighten & simplify. try { - auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex); + auto res = parse_json_tool_calls(std::string(it, end), std::nullopt, function_regex, close_regex, /* allow_raw_python= */ true); res.content = content + res.content; return res; } catch (const std::exception & e) { @@ -1266,12 +1321,13 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con }); if (has_raw_python) { tool_rules.push_back(builder.add_rule("python-call", "\"<|python_tag|>\" .*")); - data.grammar_triggers.push_back({"<|python_tag|>", /* .at_start = */ false}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<|python_tag|>"}); + data.preserved_tokens.push_back("<|python_tag|>"); } auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " space"; builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call); - data.grammar_triggers.push_back({"([\s\S\n]*)$)"); + static const std::regex python_tag_regex(R"(<\|python_tag\|>([\s\S\n]*)$)"); std::smatch match; if (std::regex_search(input, match, python_tag_regex)) { auto code = match[1].str(); @@ -1294,8 +1350,8 @@ static common_chat_msg common_chat_parse_functionary_v3_1_llama_3_1(const std::s }); return msg; } - static std::regex function_regex(R"()"); - static std::regex close_regex(R"()"); + static const std::regex function_regex(R"()"); + static const std::regex close_regex(R"()"); // TODO: tighten & simplify. return parse_json_tool_calls(input, std::nullopt, function_regex, close_regex); } @@ -1306,6 +1362,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { std::vector tool_rules; + std::vector tool_call_alts; foreach_function(inputs.tools, [&](const json & tool) { const auto & function = tool.at("function"); std::string name = function.at("name"); @@ -1319,68 +1376,187 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat }}, {"required", json::array({"name", "arguments"})}, })); + tool_call_alts.push_back(builder.add_rule( + name + "-function-tag", + "\"\" space " + + builder.add_schema(name + "-args", parameters) + " " + "\"\" space")); + + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + "", + }); + auto escaped_name = regex_escape(name); + data.grammar_triggers.push_back({ + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, + "\" space " + builder.add_rule("tool_call", string_join(tool_rules, " | ")) + " \"\" space"; + auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space"); + std::vector alt_tags { + any_tool_call, + "\"\" space " + any_tool_call + " \"\"", + // The rest is just to accommodate common "good bad" outputs. + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + "\"\" space " + any_tool_call + " \"\"", + }; + auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space"); + tool_call_alts.push_back(wrappable_tool_call); + tool_call_alts.push_back( + "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space "); + auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | ")); builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call); - data.grammar_triggers.push_back({"", /* .at_start = */ false}); - data.preserved_tokens = { "" }; - }, grammar_options); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ""}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "|||)?\\s*\\{\\s*\"", //name\"\\s*:\\s*\"" + escaped_name + "\"", + }); + data.preserved_tokens = { + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "```", + "```json", + "```xml", + }; + }); data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt); - data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO; + data.format = inputs.extract_reasoning ? COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING : COMMON_CHAT_FORMAT_HERMES_2_PRO; return data; } -static common_chat_msg common_chat_parse_hermes_2_pro(const std::string & input) { - try { - std::regex start_pattern(R"([\n\s]*)"); - std::regex middle_pattern(R"([\n\s]*[\n\s]*)"); - std::regex end_pattern(R"([\n\s]*[\n\s]*$)"); +static common_chat_msg common_chat_parse_hermes_2_pro(const std::string& input, bool extract_reasoning) { + return handle_think_tag_prelude(input, extract_reasoning, [](const std::string & input) { + static const std::regex open_regex( + "(?:" + "(```(?:xml|json)?\\n\\s*)?" // match 1 (block_start) + "(" // match 2 (open_tag) + "|" + "|" + "|" + "|" + "|" + "|" + "|" + ")?" + "(\\s*\\{\\s*\"name\"\\s*:[\\s\\S]*)" // match 3 (named tool call + rest) + ")" + "|" + "(?:]+)>" // match 4 (function name) + "|)" // match 5 (function name again) + "([\\s\\S]*)" // match 6 (function arguments + rest)})" + ); - common_chat_msg msg; - msg.role = "assistant"; + try { + common_chat_msg msg; + msg.role = "assistant"; - auto end = input.end(); - std::sregex_iterator rend; - std::sregex_iterator rit(input.begin(), end, start_pattern); - if (rit == rend) { + std::string::const_iterator it = input.begin(); + const std::string::const_iterator end = input.end(); + std::smatch match; + + while (it != end) { + if (std::regex_search(it, end, match, open_regex)) { + // Add content before the match + msg.content += std::string(it, match[0].first); + + auto block_start = match[1].str(); + std::string block_end = block_start.empty() ? "" : "```"; + + auto open_tag = match[2].str(); + std::string close_tag; + + if (match[3].matched) { + close_tag = open_tag.empty() ? "" : ""; + // Start parsing from after the opening tags + auto json_it = match[6].first; + json arguments; + if (parse_json(json_it, end, arguments)) { + msg.tool_calls.emplace_back(process_tool_call({ + {"name", function_name}, + {"arguments", arguments}, + })); + it = json_it; // Move iterator past parsed JSON + + // Handle close tags + consume_spaces(it, end); + if (!close_tag.empty() && !parse_literal(it, end, close_tag)) { + throw std::runtime_error("Failed to parse closing tag"); + } + consume_spaces(it, end); + if (!block_end.empty() && !parse_literal(it, end, block_end)) { + throw std::runtime_error("Failed to parse block end"); + } + consume_spaces(it, end); + } else { + // Not a valid tool call, treat as content + msg.content += std::string(match[0].first, match[0].second); + it = match[0].second; + } + } + } else { + // Add remaining content + msg.content += std::string(it, end); + break; + } + } + return msg; + } catch (const std::exception & e) { + LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what()); + common_chat_msg msg; + msg.role = "assistant"; msg.content = input; return msg; } - - msg.content = rit->prefix(); - - auto it = rit->suffix().first; - while (it != end) { - json call; - if (!parse_json(it, end, call)) { - throw std::runtime_error("Failed to parse json tool call"); - } - const auto & arguments = call.at("arguments"); - msg.tool_calls.push_back({ - call.at("name"), - arguments.dump(), - // arguments.is_string() ? arguments.get() : arguments.dump(), - /* id= */ "", - }); - rit = {it, end, middle_pattern}; - if (rit != rend) { - it = rit->suffix().first; - } else { - rit = {it, end, end_pattern}; - if (rit == rend) { - throw std::runtime_error("Malformed input, missing "); - } - break; - } - } - return msg; - } catch (const std::exception & e) { - LOG_ERR("Failed to parse hermes 2 pro input: %s\n", e.what()); - common_chat_msg msg; - msg.role = "assistant"; - msg.content = input; - return msg; - } + }); } static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) { @@ -1445,6 +1621,11 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_command_r7b(tmpl, params); } + // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) + if (src.find("") != std::string::npos && params.json_schema.is_null()) { + return common_chat_params_init_hermes_2_pro(tmpl, params); + } + // Use generic handler when mixing tools + JSON schema. // TODO: support that mix in handlers below. if ((params.tools.is_array() && params.json_schema.is_object())) { @@ -1466,11 +1647,6 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_without_tools(tmpl, params); } - // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools) - if (src.find("") != std::string::npos) { - return common_chat_params_init_hermes_2_pro(tmpl, params); - } - // Functionary v3.1 (w/ tools) if (src.find("<|start_header_id|>") != std::string::npos && src.find(" @@ -483,6 +482,11 @@ void string_replace_all(std::string & s, const std::string & search, const std:: s = std::move(builder); } +std::string regex_escape(const std::string & s) { + static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]"); + return std::regex_replace(s, special_chars, "\\$0"); +} + std::string string_join(const std::vector & values, const std::string & separator) { std::ostringstream result; for (size_t i = 0; i < values.size(); ++i) { @@ -951,8 +955,8 @@ struct common_init_result common_init_from_params(common_params & params) { return iparams; } - if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) { - LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__); + if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) { + LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__); params.ctx_shift = false; } @@ -1029,6 +1033,8 @@ struct common_init_result common_init_from_params(common_params & params) { if (params.warmup) { LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); + llama_set_warmup(lctx, true); + std::vector tmp; llama_token bos = llama_vocab_bos(vocab); llama_token eos = llama_vocab_eos(vocab); @@ -1056,9 +1062,10 @@ struct common_init_result common_init_from_params(common_params & params) { if (llama_model_has_decoder(model)) { llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch))); } - llama_kv_cache_clear(lctx); + llama_kv_self_clear(lctx); llama_synchronize(lctx); llama_perf_context_reset(lctx); + llama_set_warmup(lctx, false); } iparams.model.reset(model); @@ -2026,3 +2033,25 @@ common_control_vector_data common_control_vector_load(const std::vector +json common_grammar_trigger::to_json() const { + json out { + {"type", (int) type}, + {"value", value}, + }; + if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + out["token"] = (int) token; + } + return out; +} + +template <> +common_grammar_trigger common_grammar_trigger::from_json(const json & in) { + common_grammar_trigger out; + out.type = (common_grammar_trigger_type) in.at("type").get(); + out.value = in.at("value").get(); + if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + out.token = (llama_token) in.at("token").get(); + } + return out; +} diff --git a/common/common.h b/common/common.h index efe8e7f796..1c0f199774 100644 --- a/common/common.h +++ b/common/common.h @@ -110,9 +110,21 @@ enum common_conversation_mode { COMMON_CONVERSATION_MODE_AUTO = 2, }; +enum common_grammar_trigger_type { + COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN, + COMMON_GRAMMAR_TRIGGER_TYPE_WORD, + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, + COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START, +}; + struct common_grammar_trigger { - std::string word; - bool at_start; + common_grammar_trigger_type type; + std::string value; + llama_token token = LLAMA_TOKEN_NULL; + + // T can only be nlohmann::ordered_json + template T to_json() const; + template static common_grammar_trigger from_json(const T & in); }; // sampling parameters @@ -163,8 +175,7 @@ struct common_params_sampling { std::string grammar; // optional BNF-like grammar to constrain sampling bool grammar_lazy = false; - std::vector grammar_trigger_words; // optional trigger words to trigger lazy grammar - std::vector grammar_trigger_tokens; // optional trigger tokens to trigger lazy grammar and print trigger special tokens. + std::vector grammar_triggers; // optional triggers (for lazy grammars) std::set preserved_tokens; std::vector logit_bias; // logit biases to apply @@ -200,6 +211,8 @@ struct common_params_vocoder { std::string model = ""; // model path // NOLINT std::string model_url = ""; // model url to download // NOLINT + std::string speaker_file = ""; // speaker file path // NOLINT + bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT }; @@ -261,6 +274,7 @@ struct common_params { std::string hf_repo = ""; // HF repo // NOLINT std::string hf_file = ""; // HF file // NOLINT std::string prompt = ""; // NOLINT + std::string system_prompt = ""; // NOLINT std::string prompt_file = ""; // store the external prompt file name // NOLINT std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT std::string input_prefix = ""; // string to prefix user inputs with // NOLINT @@ -325,6 +339,8 @@ struct common_params { bool warmup = true; // warmup run bool check_tensors = false; // validate tensor data + bool single_turn = false; // single turn chat conversation + ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V @@ -391,8 +407,6 @@ struct common_params { int32_t i_pos = -1; // position of the passkey in the junk text // imatrix params - std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file - int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations int32_t i_chunk = 0; // start processing from this chunk @@ -404,16 +418,16 @@ struct common_params { int n_pca_batch = 100; int n_pca_iterations = 1000; dimre_method cvector_dimre_method = DIMRE_METHOD_PCA; - std::string cvector_outfile = "control_vector.gguf"; std::string cvector_positive_file = "examples/cvector-generator/positive.txt"; std::string cvector_negative_file = "examples/cvector-generator/negative.txt"; bool spm_infill = false; // suffix/prefix/middle pattern for infill - std::string lora_outfile = "ggml-lora-merged-f16.gguf"; - // batched-bench params bool batched_bench_output_jsonl = false; + + // common params + std::string out_file; // output filename for all example programs }; // call once at the start of a program if it uses libcommon @@ -453,6 +467,8 @@ std::string string_repeat(const std::string & str, size_t n); void string_replace_all(std::string & s, const std::string & search, const std::string & replace); +std::string regex_escape(const std::string & s); + template static std::vector string_split(const std::string & str, char delim) { static_assert(!std::is_same::value, "Please use the specialized version for std::string"); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 3ebcc3d9fb..9067982257 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -264,7 +264,7 @@ static void _build_min_max_int(int min_value, int max_value, std::stringstream & throw std::runtime_error("At least one of min_value or max_value must be set"); } -const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}"; +const std::string SPACE_RULE = "| \" \" | \"\\n\"{1,2} [ \\t]{0,20}"; struct BuiltinRule { std::string content; @@ -764,11 +764,10 @@ private: public: SchemaConverter( const std::function & fetch_json, - bool dotall, - bool compact_spaces) + bool dotall) : _fetch_json(fetch_json), _dotall(dotall) { - _rules["space"] = compact_spaces ? "\" \"?" : SPACE_RULE; + _rules["space"] = SPACE_RULE; } void resolve_refs(json & schema, const std::string & url) { @@ -1007,7 +1006,7 @@ std::string json_schema_to_grammar(const json & schema, bool force_gbnf) { } std::string build_grammar(const std::function & cb, const common_grammar_options & options) { - SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall, options.compact_spaces); + SchemaConverter converter([&](const std::string &) { return json(); }, options.dotall); common_grammar_builder builder { /* .add_rule = */ [&](const std::string & name, const std::string & rule) { return converter._add_rule(name, rule); diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h index 62a3b0a447..4613f5d9f9 100644 --- a/common/json-schema-to-grammar.h +++ b/common/json-schema-to-grammar.h @@ -16,7 +16,6 @@ struct common_grammar_builder { struct common_grammar_options { bool dotall = false; - bool compact_spaces = false; }; std::string build_grammar(const std::function & cb, const common_grammar_options & options = {}); diff --git a/common/minja/minja.hpp b/common/minja/minja.hpp index c58dd66e06..fa4c34d6e4 100644 --- a/common/minja/minja.hpp +++ b/common/minja/minja.hpp @@ -1378,13 +1378,27 @@ struct ArgumentsExpression { } }; -static std::string strip(const std::string & s) { - auto start = s.find_first_not_of(" \t\n\r"); +static std::string strip(const std::string & s, const std::string & chars = "", bool left = true, bool right = true) { + auto charset = chars.empty() ? " \t\n\r" : chars; + auto start = left ? s.find_first_not_of(charset) : 0; if (start == std::string::npos) return ""; - auto end = s.find_last_not_of(" \t\n\r"); + auto end = right ? s.find_last_not_of(charset) : s.size() - 1; return s.substr(start, end - start + 1); } +static std::vector split(const std::string & s, const std::string & sep) { + std::vector result; + size_t start = 0; + size_t end = s.find(sep); + while (end != std::string::npos) { + result.push_back(s.substr(start, end - start)); + start = end + sep.length(); + end = s.find(sep, start); + } + result.push_back(s.substr(start)); + return result; +} + static std::string capitalize(const std::string & s) { if (s.empty()) return s; auto result = s; @@ -1467,8 +1481,26 @@ public: } else if (obj.is_string()) { auto str = obj.get(); if (method->get_name() == "strip") { - vargs.expectArgs("strip method", {0, 0}, {0, 0}); - return Value(strip(str)); + vargs.expectArgs("strip method", {0, 1}, {0, 0}); + auto chars = vargs.args.empty() ? "" : vargs.args[0].get(); + return Value(strip(str, chars)); + } else if (method->get_name() == "lstrip") { + vargs.expectArgs("lstrip method", {0, 1}, {0, 0}); + auto chars = vargs.args.empty() ? "" : vargs.args[0].get(); + return Value(strip(str, chars, /* left= */ true, /* right= */ false)); + } else if (method->get_name() == "rstrip") { + vargs.expectArgs("rstrip method", {0, 1}, {0, 0}); + auto chars = vargs.args.empty() ? "" : vargs.args[0].get(); + return Value(strip(str, chars, /* left= */ false, /* right= */ true)); + } else if (method->get_name() == "split") { + vargs.expectArgs("split method", {1, 1}, {0, 0}); + auto sep = vargs.args[0].get(); + auto parts = split(str, sep); + Value result = Value::array(); + for (const auto& part : parts) { + result.push_back(Value(part)); + } + return result; } else if (method->get_name() == "capitalize") { vargs.expectArgs("capitalize method", {0, 0}, {0, 0}); return Value(capitalize(str)); diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index a057ae45f5..d1a4d84c40 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -7,6 +7,7 @@ #include #include #include +#include void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { diff --git a/common/sampling.cpp b/common/sampling.cpp index 37a0d9c85a..baf22066dc 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -4,6 +4,7 @@ #include #include +#include // the ring buffer works similarly to std::deque, but with a fixed capacity // TODO: deduplicate with llama-impl.h @@ -159,16 +160,53 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); #endif // LLAMA_USE_LLGUIDANCE } else { - std::vector trigger_words; - trigger_words.reserve(params.grammar_trigger_words.size()); - for (const auto & str : params.grammar_trigger_words) { - trigger_words.push_back(str.word.c_str()); + std::vector patterns_at_start; + std::vector patterns_anywhere; + std::vector trigger_tokens; + for (const auto & trigger : params.grammar_triggers) { + switch (trigger.type) { + case COMMON_GRAMMAR_TRIGGER_TYPE_WORD: + { + const auto & word = trigger.value; + patterns_anywhere.push_back(regex_escape(word)); + break; + } + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN: + case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START: + { + const auto & pattern = trigger.value; + (trigger.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_START ? patterns_at_start : patterns_anywhere).push_back(pattern); + break; + } + case COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN: + { + const auto token = trigger.token; + trigger_tokens.push_back(token); + break; + } + default: + GGML_ASSERT(false && "unknown trigger type"); + } + } + + std::vector trigger_patterns; + if (!patterns_at_start.empty()) { + trigger_patterns.push_back("^(" + string_join(patterns_at_start, "|") + ")[\\s\\S]*"); + } + if (!patterns_anywhere.empty()) { + trigger_patterns.push_back("^[\\s\\S]*?(" + string_join(patterns_anywhere, "|") + ")[\\s\\S]*"); + } + + std::vector trigger_patterns_c; + trigger_patterns_c.reserve(trigger_patterns.size()); + for (const auto & regex : trigger_patterns) { + trigger_patterns_c.push_back(regex.c_str()); } grmr = params.grammar_lazy - ? llama_sampler_init_grammar_lazy(vocab, params.grammar.c_str(), "root", - trigger_words.data(), trigger_words.size(), - params.grammar_trigger_tokens.data(), params.grammar_trigger_tokens.size()) + ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", + trigger_patterns_c.data(), trigger_patterns_c.size(), + trigger_tokens.data(), trigger_tokens.size()) : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"); } diff --git a/common/speculative.cpp b/common/speculative.cpp index b1fff27a55..ccad70fa9e 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -5,6 +5,7 @@ #include "sampling.h" #include +#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 @@ -172,7 +173,7 @@ llama_tokens common_speculative_gen_draft( result.reserve(params.n_draft); if (reuse_n == 0) { - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); prompt.clear(); } else { @@ -191,14 +192,14 @@ llama_tokens common_speculative_gen_draft( } if (reuse_i > 0) { - llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); - llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); + llama_kv_self_seq_rm (ctx, 0, 0, reuse_i); + llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i); prompt.erase(prompt.begin(), prompt.begin() + reuse_i); } if (reuse_n < (int) prompt.size()) { - llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1); + llama_kv_self_seq_rm (ctx, 0, reuse_n, -1); prompt.erase(prompt.begin() + reuse_n, prompt.end()); } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8b7c75d85a..d21edce16b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -180,7 +180,8 @@ class Model: extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}") + raise ValueError(f"Missing or incomplete model files: {missing_files}\n" + f"Missing tensors: {missing}") else: raise ValueError("Mismatch between weight map and model parts for tensor names:\n" f"Missing tensors: {missing}\n" @@ -528,6 +529,8 @@ class Model: reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() + added_tokens_decoder = tokenizer.added_tokens_decoder + for i in range(vocab_size): if i not in reverse_vocab: tokens.append(f"[PAD{i}]") @@ -537,13 +540,13 @@ class Model: if token in added_vocab: # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not tokenizer.added_tokens_decoder[i].normalized: + if not added_tokens_decoder[i].normalized: previous_token = token token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) if previous_token != token: logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): + if added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: # NOTE: this was added for Gemma. @@ -699,6 +702,9 @@ class Model: if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B res = "deepseek-r1-qwen" + if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e": + # ref: https://huggingface.co/Xenova/gpt-4o + res = "gpt-4o" if res is None: logger.warning("\n") @@ -858,6 +864,9 @@ class Model: for token_id, token_data in added_tokens_decoder.items(): token_id = int(token_id) token: str = token_data["content"] + if token_id >= vocab_size: + logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token.encode("utf-8"): logger.warning(f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token!r}') @@ -902,6 +911,40 @@ class Model: special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_rwkv_world(self): + assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() + vocab_size = self.hparams.get("vocab_size", 65536) + + tokens: list[bytes] = [''.encode("utf-8")] + toktypes: list[int] = [gguf.TokenType.CONTROL] + + with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: + lines = f.readlines() + for line in lines: + parts = line.split(' ') + assert len(parts) >= 3 + token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) + token = token.encode("utf-8") if isinstance(token, str) else token + assert isinstance(token, bytes) + assert len(token) == token_len + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" + tokens.append(token_text.encode("utf-8")) + toktypes.append(gguf.TokenType.NORMAL) + remainder = vocab_size - len(tokens) + assert remainder >= 0 + for i in range(len(tokens), vocab_size): + tokens.append(f"[PAD{i}]".encode("utf-8")) + toktypes.append(gguf.TokenType.UNUSED) + + self.gguf_writer.add_tokenizer_model("rwkv") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.chat_template = "rwkv-world" + # hack: Add '\n\n' as the EOT token to make it chat normally + special_vocab._set_special_token("eot", 261) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab_size: int): tokenizer_path = Path(sys.path[0]) / "models" / f"ggml-vocab-{model_name}.gguf" logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") @@ -1059,13 +1102,6 @@ class BloomModel(Model): tensors.append((self.map_tensor_name(name), data_torch)) - if name == "word_embeddings.weight": - assert self.tensor_names is not None - - # TODO: tie them at runtime, don't duplicate in the model file - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - return tensors @@ -1707,6 +1743,25 @@ class LlamaModel(Model): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("Mistral3ForConditionalGeneration") +class Mistral3Model(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + + # we need to merge the text_config into the root level of hparams + def __init__(self, *args, **kwargs): + hparams = Model.load_hparams(kwargs["dir_model"]) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + kwargs["hparams"] = hparams + super().__init__(*args, **kwargs) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + name = name.replace("language_model.", "") + if "multi_modal_projector" in name or "vision_tower" in name: + return [] + return super().modify_tensors(data_torch, name, bid) + + @Model.register("DeciLMForCausalLM") class DeciModel(Model): model_arch = gguf.MODEL_ARCH.DECI @@ -2364,10 +2419,6 @@ class GPT2Model(Model): tensors.append((new_name, data_torch)) - # note: GPT2 output is tied to (same as) wte in original model - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - return tensors @@ -2512,7 +2563,8 @@ class Phi3MiniModel(Model): rms_eps = self.find_hparam(["rms_norm_eps"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head self.gguf_writer.add_context_length(max_pos_embds) self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds) @@ -2536,7 +2588,8 @@ class Phi3MiniModel(Model): n_head = self.find_hparam(["num_attention_heads", "n_head"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rope_dims = n_embd // n_head + rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + rope_dims = int(rot_pct * n_embd) // n_head # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(['rope_scaling'], True) @@ -2565,7 +2618,7 @@ class Phi3MiniModel(Model): raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: - raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}.') yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), torch.tensor(long_factors, dtype=torch.float32)) yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) @@ -2695,21 +2748,26 @@ class CodeShellModel(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) + _has_tok_embd = False + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) + new_name = self.map_tensor_name(name) - tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] + # assuming token_embd.weight is seen before output.weight + if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + # even though the tensor file(s) does not contain the word embeddings they are still in the weight map + if self.tensor_names and "transformer.wte.weight" in self.tensor_names: + logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied") + self.tensor_names.remove("transformer.wte.weight") + elif new_name == tok_embd_name: + self._has_tok_embd = True - if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - assert self.tensor_names is not None - - if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): - # copy tok_embd.weight to output.weight - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - - return tensors + return [(new_name, data_torch)] @Model.register("InternLM2ForCausalLM") @@ -3317,6 +3375,83 @@ class Gemma2Model(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(Model): + model_arch = gguf.MODEL_ARCH.GEMMA3 + has_vision: bool = False + + # we need to merge the text_config into the root level of hparams + def __init__(self, *args, **kwargs): + hparams = Model.load_hparams(kwargs["dir_model"]) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + kwargs["hparams"] = hparams + super().__init__(*args, **kwargs) + if "vision_config" in hparams: + logger.info("Has vision encoder, but it will be ignored") + self.has_vision = True + + def write(self): + super().write() + if self.has_vision: + logger.info("NOTE: this script only convert the language model to GGUF") + logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py") + + def set_vocab(self): + self._set_vocab_sentencepiece() + + self.gguf_writer.add_add_space_prefix(False) + + def set_gguf_parameters(self): + hparams = self.hparams + block_count = hparams["num_hidden_layers"] + + # some default values are not specified in the hparams + self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072)) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) + self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers + # both attn_logit_softcapping and final_logit_softcapping are removed in Gemma3 + assert hparams.get("attn_logit_softcapping") is None + assert hparams.get("final_logit_softcapping") is None + self.gguf_writer.add_sliding_window(hparams["sliding_window"]) + self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) + if hparams.get("rope_scaling") is not None: + assert hparams["rope_scaling"]["rope_type"] == "linear" + # important: this rope_scaling is only applied for global layers, and not used by 1B model + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("language_model."): + name = name.replace("language_model.", "") + elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later + # ignore vision tensors + return [] + + # remove OOV (out-of-vocabulary) rows in token_embd + if "embed_tokens.weight" in name: + vocab = self._create_vocab_sentencepiece() + tokens = vocab[0] + data_torch = data_torch[:len(tokens)] + + # ref code in Gemma3RMSNorm + # output = output * (1.0 + self.weight.float()) + if name.endswith("norm.weight"): + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + @Model.register("Starcoder2ForCausalLM") class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 @@ -3327,38 +3462,7 @@ class Rwkv6Model(Model): model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): - assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file() - vocab_size = self.hparams.get("vocab_size", 65536) - - tokens: list[bytes] = [''.encode("utf-8")] - toktypes: list[int] = [gguf.TokenType.CONTROL] - - with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f: - lines = f.readlines() - for line in lines: - parts = line.split(' ') - assert len(parts) >= 3 - token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1]) - token = token.encode("utf-8") if isinstance(token, str) else token - assert isinstance(token, bytes) - assert len(token) == token_len - token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" - tokens.append(token_text.encode("utf-8")) - toktypes.append(gguf.TokenType.NORMAL) - remainder = vocab_size - len(tokens) - assert remainder >= 0 - for i in range(len(tokens), vocab_size): - tokens.append(f"[PAD{i}]".encode("utf-8")) - toktypes.append(gguf.TokenType.UNUSED) - - self.gguf_writer.add_tokenizer_model("rwkv") - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) - special_vocab.chat_template = "rwkv-world" - # hack: Add '\n\n' as the EOT token to make it chat normally - special_vocab._set_special_token("eot", 261) - special_vocab.add_to_gguf(self.gguf_writer) + self._set_vocab_rwkv_world() def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] @@ -3480,6 +3584,168 @@ class RWKV6Qwen2Model(Rwkv6Model): yield (new_name, data) +@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(Model): + model_arch = gguf.MODEL_ARCH.RWKV7 + + def set_vocab(self): + self._set_vocab_rwkv_world() + + def calc_lora_rank(self, hidden_size, exponent, multiplier): + return max(1, round(hidden_size ** exponent * multiplier / 32)) * 32 + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + try: + head_size = self.hparams["head_size"] + layer_norm_eps = self.hparams["layer_norm_epsilon"] + except KeyError: + head_size = self.hparams["head_dim"] + layer_norm_eps = self.hparams["norm_eps"] + hidden_size = self.hparams["hidden_size"] + intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else (hidden_size * 4) + + # ICLR: In-Context-Learning-Rate + try: + lora_rank_decay = self.hparams["lora_rank_decay"] if self.hparams["lora_rank_decay"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["lora_rank_iclr"] if self.hparams["lora_rank_iclr"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["lora_rank_value_residual_mix"] if self.hparams["lora_rank_value_residual_mix"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["lora_rank_gate"] if self.hparams["lora_rank_gate"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + except KeyError: + lora_rank_decay = self.hparams["decay_low_rank_dim"] if self.hparams["decay_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_iclr = self.hparams["a_low_rank_dim"] if self.hparams["a_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.8) + lora_rank_value_residual_mix = self.hparams["v_low_rank_dim"] if self.hparams["v_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.5, 1.3) + lora_rank_gate = self.hparams["gate_low_rank_dim"] if self.hparams["gate_low_rank_dim"] is not None else self.calc_lora_rank(hidden_size, 0.8, 0.6) + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_eps(layer_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + lerp_weights: dict[int, dict[str, Tensor]] = {} + lora_needs_transpose: bool = True + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # unify tensor names here to make life easier + name = name.replace("blocks", "layers").replace("ffn", "feed_forward") + name = name.replace("self_attn", "attention").replace("attn", "attention") + name = name.replace("time_mixer.", "") + # lora layer names in fla-hub's impl + if "_lora.lora" in name: + self.lora_needs_transpose = False + name = name.replace("_lora.lora.0.weight", "1.weight") + name = name.replace("_lora.lora.2.weight", "2.weight") + name = name.replace("_lora.lora.2.bias", "0.weight") + + name = name.replace("feed_forward_norm", "ln2") + name = name.replace("g_norm", "ln_x") + + if "attention.v" in name and "value" not in self.map_tensor_name(name) and bid == 0: + # some models have dummy v0/v1/v2 on first layer while others don't + # ignore them all since they are not used + return + + wkv_has_gate = self.hparams.get("wkv_has_gate", True) + lerp_list = ["r", "w", "k", "v", "a", "g"] if wkv_has_gate else ["r", "w", "k", "v", "a"] + + if bid is not None and "attention.x_" in name: + if "attention.x_x" in name: + # already concatenated + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = data_torch.reshape(len(lerp_list), 1, 1, -1) + yield (new_name, data) + else: + try: + self.lerp_weights[bid][name] = data_torch + except KeyError: + self.lerp_weights[bid] = {name: data_torch} + if all(f"model.layers.{bid}.attention.x_{i}" in self.lerp_weights[bid].keys() for i in lerp_list): + new_name = f"blk.{bid}.time_mix_lerp_fused.weight" + data = torch.stack([self.lerp_weights[bid][f"model.layers.{bid}.attention.x_{i}"] for i in lerp_list], dim=0) + yield (new_name, data) + return + else: + data_torch = data_torch.squeeze() + new_name = self.map_tensor_name(name) + + if not (new_name.endswith(".weight") or new_name.endswith(".bias")): + new_name += ".weight" + + if self.lora_needs_transpose and any( + new_name.endswith(t) for t in [ + "time_mix_w1.weight", "time_mix_w2.weight", + "time_mix_a1.weight", "time_mix_a2.weight", + "time_mix_v1.weight", "time_mix_v2.weight", + "time_mix_g1.weight", "time_mix_g2.weight", + ] + ): + data_torch = data_torch.transpose(0, 1) + + if 'r_k' in new_name: + data_torch = data_torch.flatten() + + if bid == 0 and "time_mix_a" in new_name: + # dummy v0/v1/v2 on first layer + # easist way to make llama happy + yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch) + + yield (new_name, data_torch) + + +@Model.register("RwkvHybridForCausalLM") +class ARwkv7Model(Rwkv7Model): + model_arch = gguf.MODEL_ARCH.ARWKV7 + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + hidden_size = self.hparams["hidden_size"] + head_size = self.hparams["head_size"] + rms_norm_eps = self.hparams["rms_norm_eps"] + intermediate_size = self.hparams["intermediate_size"] + wkv_has_gate = self.hparams["wkv_has_gate"] + assert self.hparams["wkv_version"] == 7 + + # ICLR: In-Context-Learning-Rate + lora_rank_decay = 64 + lora_rank_iclr = 64 + lora_rank_value_residual_mix = 32 + lora_rank_gate = 128 if wkv_has_gate else 0 + + # RWKV isn't context limited + self.gguf_writer.add_context_length(1048576) + self.gguf_writer.add_embedding_length(hidden_size) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_wkv_head_size(head_size) + self.gguf_writer.add_decay_lora_rank(lora_rank_decay) + self.gguf_writer.add_iclr_lora_rank(lora_rank_iclr) + self.gguf_writer.add_value_residual_mix_lora_rank(lora_rank_value_residual_mix) + self.gguf_writer.add_gate_lora_rank(lora_rank_gate) + self.gguf_writer.add_feed_forward_length(intermediate_size) + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_token_shift_count(1) + + # required by llama.cpp, unused + self.gguf_writer.add_head_count(0) + + @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index fa4989a80c..07d3ce0e4e 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -109,6 +109,7 @@ models = [ {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"}, {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"}, {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"}, + {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", }, ] @@ -131,6 +132,10 @@ def download_model(model): files = ["config.json", "tokenizer.json", "tokenizer_config.json"] + if name == "gpt-4o": + # Xenova/gpt-4o is tokenizer-only, it does not contain config.json + files = ["tokenizer.json", "tokenizer_config.json"] + if tokt == TOKENIZER_TYPE.SPM: files.append("tokenizer.model") diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 0cb39e7927..f97e696aaa 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -42,6 +42,16 @@ The following release is verified with good quality: ## News +- 2025.2 + - Optimize MUL_MAT Q4_0 on Intel GPU for all dGPUs and built-in GPUs since MTL. Increase the performance of LLM (llama-2-7b.Q4_0.gguf) 21%-87% on Intel GPUs (MTL, ARL-H, Arc, Flex, PVC). + |GPU|Base tokens/s|Increased tokens/s|Percent| + |-|-|-|-| + |PVC 1550|39|73|+87%| + |Flex 170|39|50|+28%| + |Arc770|42|55|+30%| + |MTL|13|16|+23%| + |ARL-H|14|17|+21%| + - 2024.11 - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer. @@ -97,8 +107,8 @@ SYCL backend supports Intel GPU Family: | Intel Data Center Max Series | Support | Max 1550, 1100 | | Intel Data Center Flex Series | Support | Flex 170 | | Intel Arc Series | Support | Arc 770, 730M, Arc A750 | -| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake | -| Intel iGPU | Support | iGPU in 13700k, i5-1250P, i7-1260P, i7-1165G7 | +| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake, Arrow Lake | +| Intel iGPU | Support | iGPU in 13700k,iGPU in 13400, i5-1250P, i7-1260P, i7-1165G7 | *Notes:* @@ -227,6 +237,15 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB cmake --build buildWithCublas --config Release ``` +**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target: + +```sh +git clone https://github.com/oneapi-src/oneDNN.git +cd oneDNN +cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake --build build-nvidia --config Release +``` + - **Adding support to AMD GPUs** **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit. @@ -317,10 +336,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl # Option 2: Use FP16 -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl # build all binary cmake --build build --config Release -j -v @@ -650,8 +669,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512 |--------------------|---------------------------------------|---------------------------------------------| | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better perforemance than FP16 on quantized model| | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. | -| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | +| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. | | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | @@ -660,8 +680,11 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | Name | Value | Function | |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | +| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase | +| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | + ## Known Issues - `Split-mode:[row]` is not supported. diff --git a/docs/build.md b/docs/build.md index b3ecf043d7..2e3975c145 100644 --- a/docs/build.md +++ b/docs/build.md @@ -197,29 +197,53 @@ The following compilation options are also available to tweak performance: ## MUSA -This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa). +This provides GPU acceleration using a Moore Threads GPU. Make sure to have the [MUSA SDK](https://developer.mthreads.com/musa/musa-sdk) installed. -- Using `CMake`: +#### Download directly from Moore Threads - ```bash - cmake -B build -DGGML_MUSA=ON - cmake --build build --config Release +You may find the official downloads here: [Moore Threads developer site](https://developer.mthreads.com/sdk/download/musa). + +### Compilation + +```bash +cmake -B build -DGGML_MUSA=ON +cmake --build build --config Release +``` + +#### Override Compute Capability Specifications + +By default, all supported compute capabilities are enabled. To customize this behavior, you can specify the `MUSA_ARCHITECTURES` option in the CMake command: + +```bash +cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21" +``` + +This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time. + +#### Compilation options + +Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet. + +- For static builds, add `-DBUILD_SHARED_LIBS=OFF` and `-DCMAKE_POSITION_INDEPENDENT_CODE=ON`: ``` - - For static build: - - ```bash cmake -B build -DGGML_MUSA=ON \ -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON cmake --build build --config Release ``` -The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used. +### Runtime MUSA environmental variables + +You may set the [musa environmental variables](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) at runtime. + +```bash +# Use `MUSA_VISIBLE_DEVICES` to hide the first compute device. +MUSA_VISIBLE_DEVICES="-0" ./build/bin/llama-server --model /srv/models/llama.gguf +``` + +### Unified Memory The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted. -Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet. - ## HIP This provides GPU acceleration on HIP-supported AMD GPUs. @@ -235,6 +259,12 @@ You can download it from your Linux distro's package manager or from here: [ROCm On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). + To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system. + + The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager. + + As an alternative, you can manually install the library by cloning it from the official [GitHub repository](https://github.com/ROCm/rocWMMA), checkout the corresponding version tag (e.g. `rocm-6.2.4`) and set `-DCMAKE_CXX_FLAGS="-I/library/include/"` in CMake. This also works under Windows despite not officially supported by AMD. + Note that if you get the following error: ``` clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library diff --git a/docs/function-calling.md b/docs/function-calling.md new file mode 100644 index 0000000000..c3873c3fa6 --- /dev/null +++ b/docs/function-calling.md @@ -0,0 +1,394 @@ +# Function Calling + +[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in: +- `llama-server` when started w/ `--jinja` flag +- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556) + +## Universal support w/ Native & Generic handlers + +Function calling is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639): + +- Native tool call formats supported: + - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2 + - Functionary v3.1 / v3.2 + - Hermes 2/3, Qwen 2.5 + - Qwen 2.5 Coder (WIP: https://github.com/ggml-org/llama.cpp/pull/12034) + - Mistral Nemo + - Firefunction v2 + - Command R7B + - DeepSeek R1 (WIP / seems reluctant to call any tools?) + +- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs). + - Use `--chat-template-file` to override the template when appropriate (see examples below) + - Generic support may consume more tokens and be less efficient than a model's native format. + +
+Show some common templates and which format handler they use + +| Template | Format | +|----------|--------| +| Almawave-Velvet-14B.jinja | Hermes 2 Pro | +| AtlaAI-Selene-1-Mini-Llama-3.1-8B.jinja | Llama 3.x | +| CohereForAI-aya-expanse-8b.jinja | Generic | +| CohereForAI-c4ai-command-r-plus-default.jinja | Generic | +| CohereForAI-c4ai-command-r-plus-rag.jinja | Generic | +| CohereForAI-c4ai-command-r-plus-tool_use.jinja | Generic | +| CohereForAI-c4ai-command-r7b-12-2024-default.jinja | Command R7B (extract reasoning) | +| CohereForAI-c4ai-command-r7b-12-2024-rag.jinja | Command R7B (extract reasoning) | +| CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja | Command R7B (extract reasoning) | +| CohereForAI-c4ai-command-r7b-12-2024.jinja | Generic | +| DavieLion-Llama-3.2-1B-SPIN-iter3.jinja | Generic | +| Delta-Vector-Rei-12B.jinja | Mistral Nemo | +| EpistemeAI-Mistral-Nemo-Instruct-12B-Philosophy-Math.jinja | Mistral Nemo | +| FlofloB-83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.jinja | Hermes 2 Pro | +| FlofloB-test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.jinja | Generic | +| HelpingAI-HAI-SER.jinja | Generic | +| HuggingFaceTB-SmolLM2-1.7B-Instruct.jinja | Generic | +| HuggingFaceTB-SmolLM2-135M-Instruct.jinja | Generic | +| HuggingFaceTB-SmolLM2-360M-Instruct.jinja | Generic | +| INSAIT-Institute-BgGPT-Gemma-2-27B-IT-v1.0.jinja | Generic | +| Ihor-Text2Graph-R1-Qwen2.5-0.5b.jinja | Hermes 2 Pro | +| Infinigence-Megrez-3B-Instruct.jinja | Generic | +| Josephgflowers-TinyLlama_v1.1_math_code-world-test-1.jinja | Generic | +| LGAI-EXAONE-EXAONE-3.5-2.4B-Instruct.jinja | Generic | +| LGAI-EXAONE-EXAONE-3.5-7.8B-Instruct.jinja | Generic | +| LatitudeGames-Wayfarer-12B.jinja | Generic | +| Magpie-Align-Llama-3-8B-Magpie-Align-v0.1.jinja | Generic | +| Magpie-Align-Llama-3.1-8B-Magpie-Align-v0.1.jinja | Generic | +| MaziyarPanahi-calme-3.2-instruct-78b.jinja | Generic | +| MiniMaxAI-MiniMax-Text-01.jinja | Generic | +| MiniMaxAI-MiniMax-VL-01.jinja | Generic | +| NaniDAO-deepseek-r1-qwen-2.5-32B-ablated.jinja | DeepSeek R1 (extract reasoning) | +| NexaAIDev-Octopus-v2.jinja | Generic | +| NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | Generic | +| NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | Hermes 2 Pro | +| NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | Generic | +| NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | Hermes 2 Pro | +| NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | Generic | +| NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | Hermes 2 Pro | +| NovaSky-AI-Sky-T1-32B-Flash.jinja | Hermes 2 Pro | +| NovaSky-AI-Sky-T1-32B-Preview.jinja | Hermes 2 Pro | +| OnlyCheeini-greesychat-turbo.jinja | Generic | +| Orenguteng-Llama-3.1-8B-Lexi-Uncensored-V2.jinja | Llama 3.x | +| OrionStarAI-Orion-14B-Chat.jinja | Generic | +| PowerInfer-SmallThinker-3B-Preview.jinja | Generic | +| PrimeIntellect-INTELLECT-1-Instruct.jinja | Generic | +| Qwen-QVQ-72B-Preview.jinja | Generic | +| Qwen-QwQ-32B-Preview.jinja | Hermes 2 Pro | +| Qwen-Qwen1.5-7B-Chat.jinja | Generic | +| Qwen-Qwen2-7B-Instruct.jinja | Generic | +| Qwen-Qwen2-VL-72B-Instruct.jinja | Generic | +| Qwen-Qwen2-VL-7B-Instruct.jinja | Generic | +| Qwen-Qwen2.5-0.5B.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-1.5B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-14B-Instruct-1M.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-14B.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-32B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-32B.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-3B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-72B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-7B-Instruct-1M.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-7B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-7B.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-Coder-32B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-Coder-7B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-Math-1.5B.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-Math-7B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-VL-3B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-VL-72B-Instruct.jinja | Hermes 2 Pro | +| Qwen-Qwen2.5-VL-7B-Instruct.jinja | Hermes 2 Pro | +| RWKV-Red-Team-ARWKV-7B-Preview-0.1.jinja | Hermes 2 Pro | +| SakanaAI-TinySwallow-1.5B-Instruct.jinja | Hermes 2 Pro | +| SakanaAI-TinySwallow-1.5B.jinja | Hermes 2 Pro | +| Sao10K-70B-L3.3-Cirrus-x1.jinja | Llama 3.x | +| SentientAGI-Dobby-Mini-Leashed-Llama-3.1-8B.jinja | Llama 3.x | +| SentientAGI-Dobby-Mini-Unhinged-Llama-3.1-8B.jinja | Llama 3.x | +| Steelskull-L3.3-Damascus-R1.jinja | Llama 3.x | +| Steelskull-L3.3-MS-Nevoria-70b.jinja | Llama 3.x | +| Steelskull-L3.3-Nevoria-R1-70b.jinja | Llama 3.x | +| THUDM-glm-4-9b-chat.jinja | Generic | +| THUDM-glm-edge-1.5b-chat.jinja | Generic | +| Tarek07-Progenitor-V1.1-LLaMa-70B.jinja | Llama 3.x | +| TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | Generic | +| TinyLlama-TinyLlama-1.1B-Chat-v1.0.jinja | Generic | +| UCLA-AGI-Mistral7B-PairRM-SPPO-Iter3.jinja | Generic | +| ValiantLabs-Llama3.1-8B-Enigma.jinja | Llama 3.x | +| abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | Generic | +| ai21labs-AI21-Jamba-1.5-Large.jinja | Generic | +| allenai-Llama-3.1-Tulu-3-405B-SFT.jinja | Generic | +| allenai-Llama-3.1-Tulu-3-405B.jinja | Generic | +| allenai-Llama-3.1-Tulu-3-8B.jinja | Generic | +| arcee-ai-Virtuoso-Lite.jinja | Hermes 2 Pro | +| arcee-ai-Virtuoso-Medium-v2.jinja | Hermes 2 Pro | +| arcee-ai-Virtuoso-Small-v2.jinja | Hermes 2 Pro | +| avemio-GRAG-NEMO-12B-ORPO-HESSIAN-AI.jinja | Generic | +| bespokelabs-Bespoke-Stratos-7B.jinja | Hermes 2 Pro | +| bfuzzy1-acheron-m1a-llama.jinja | Generic | +| bofenghuang-vigogne-2-70b-chat.jinja | Generic | +| bytedance-research-UI-TARS-72B-DPO.jinja | Generic | +| bytedance-research-UI-TARS-7B-DPO.jinja | Generic | +| bytedance-research-UI-TARS-7B-SFT.jinja | Generic | +| carsenk-phi3.5_mini_exp_825_uncensored.jinja | Generic | +| cyberagent-DeepSeek-R1-Distill-Qwen-14B-Japanese.jinja | DeepSeek R1 (extract reasoning) | +| cyberagent-DeepSeek-R1-Distill-Qwen-32B-Japanese.jinja | DeepSeek R1 (extract reasoning) | +| databricks-dbrx-instruct.jinja | Generic | +| deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | Generic | +| deepseek-ai-DeepSeek-Coder-V2-Lite-Base.jinja | Generic | +| deepseek-ai-DeepSeek-Coder-V2-Lite-Instruct.jinja | Generic | +| deepseek-ai-DeepSeek-R1-Distill-Llama-70B.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-DeepSeek-R1-Distill-Qwen-14B.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-DeepSeek-R1-Zero.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-DeepSeek-V2-Lite.jinja | Generic | +| deepseek-ai-DeepSeek-V2.5.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-DeepSeek-V3.jinja | DeepSeek R1 (extract reasoning) | +| deepseek-ai-deepseek-coder-33b-instruct.jinja | Generic | +| deepseek-ai-deepseek-coder-6.7b-instruct.jinja | Generic | +| deepseek-ai-deepseek-coder-7b-instruct-v1.5.jinja | Generic | +| deepseek-ai-deepseek-llm-67b-chat.jinja | Generic | +| deepseek-ai-deepseek-llm-7b-chat.jinja | Generic | +| dicta-il-dictalm2.0-instruct.jinja | Generic | +| ehristoforu-Falcon3-8B-Franken-Basestruct.jinja | Hermes 2 Pro | +| fireworks-ai-llama-3-firefunction-v2.jinja | FireFunction v2 | +| godlikehhd-alpaca_data_sampled_ifd_new_5200.jinja | Hermes 2 Pro | +| godlikehhd-alpaca_data_score_max_0.7_2600.jinja | Hermes 2 Pro | +| google-gemma-2-27b-it.jinja | Generic | +| google-gemma-2-2b-it.jinja | Generic | +| google-gemma-2-2b-jpn-it.jinja | Generic | +| google-gemma-7b-it.jinja | Generic | +| huihui-ai-DeepSeek-R1-Distill-Llama-70B-abliterated.jinja | DeepSeek R1 (extract reasoning) | +| huihui-ai-DeepSeek-R1-Distill-Llama-8B-abliterated.jinja | DeepSeek R1 (extract reasoning) | +| huihui-ai-DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) | +| huihui-ai-DeepSeek-R1-Distill-Qwen-32B-abliterated.jinja | DeepSeek R1 (extract reasoning) | +| huihui-ai-DeepSeek-R1-Distill-Qwen-7B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) | +| huihui-ai-Qwen2.5-14B-Instruct-1M-abliterated.jinja | Hermes 2 Pro | +| ibm-granite-granite-3.1-8b-instruct.jinja | Generic | +| indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | Generic | +| inflatebot-MN-12B-Mag-Mell-R1.jinja | Generic | +| jinaai-ReaderLM-v2.jinja | Generic | +| kms7530-chemeng_qwen-math-7b_24_1_100_1_nonmath.jinja | Hermes 2 Pro | +| knifeayumu-Cydonia-v1.3-Magnum-v4-22B.jinja | Mistral Nemo | +| langgptai-qwen1.5-7b-chat-sa-v0.1.jinja | Generic | +| lightblue-DeepSeek-R1-Distill-Qwen-7B-Japanese.jinja | DeepSeek R1 (extract reasoning) | +| mattshumer-Reflection-Llama-3.1-70B.jinja | Generic | +| meetkai-functionary-medium-v3.1.jinja | Functionary v3.1 Llama 3.1 | +| meetkai-functionary-medium-v3.2.jinja | Functionary v3.2 | +| meta-llama-Llama-2-7b-chat-hf.jinja | Generic | +| meta-llama-Llama-3.1-8B-Instruct.jinja | Llama 3.x | +| meta-llama-Llama-3.2-11B-Vision-Instruct.jinja | Llama 3.x | +| meta-llama-Llama-3.2-1B-Instruct.jinja | Llama 3.x | +| meta-llama-Llama-3.2-3B-Instruct.jinja | Llama 3.x | +| meta-llama-Llama-3.3-70B-Instruct.jinja | Llama 3.x | +| meta-llama-Meta-Llama-3-8B-Instruct.jinja | Generic | +| meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | Llama 3.x | +| microsoft-Phi-3-medium-4k-instruct.jinja | Generic | +| microsoft-Phi-3-mini-4k-instruct.jinja | Generic | +| microsoft-Phi-3-small-8k-instruct.jinja | Generic | +| microsoft-Phi-3.5-mini-instruct.jinja | Generic | +| microsoft-Phi-3.5-vision-instruct.jinja | Generic | +| microsoft-phi-4.jinja | Generic | +| migtissera-Tess-3-Mistral-Nemo-12B.jinja | Generic | +| ministral-Ministral-3b-instruct.jinja | Generic | +| mistralai-Codestral-22B-v0.1.jinja | Generic | +| mistralai-Mistral-7B-Instruct-v0.1.jinja | Generic | +| mistralai-Mistral-7B-Instruct-v0.2.jinja | Generic | +| mistralai-Mistral-7B-Instruct-v0.3.jinja | Mistral Nemo | +| mistralai-Mistral-Large-Instruct-2407.jinja | Mistral Nemo | +| mistralai-Mistral-Large-Instruct-2411.jinja | Generic | +| mistralai-Mistral-Nemo-Instruct-2407.jinja | Mistral Nemo | +| mistralai-Mistral-Small-24B-Instruct-2501.jinja | Generic | +| mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | Generic | +| mkurman-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro | +| mlabonne-AlphaMonarch-7B.jinja | Generic | +| mlx-community-Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.jinja | Hermes 2 Pro | +| mlx-community-Qwen2.5-VL-7B-Instruct-8bit.jinja | Hermes 2 Pro | +| mobiuslabsgmbh-DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1.jinja | DeepSeek R1 (extract reasoning) | +| netcat420-MFANNv0.20.jinja | Generic | +| netcat420-MFANNv0.24.jinja | Generic | +| netease-youdao-Confucius-o1-14B.jinja | Hermes 2 Pro | +| nvidia-AceMath-7B-RM.jinja | Hermes 2 Pro | +| nvidia-Eagle2-1B.jinja | Hermes 2 Pro | +| nvidia-Eagle2-9B.jinja | Hermes 2 Pro | +| nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | Llama 3.x | +| onnx-community-DeepSeek-R1-Distill-Qwen-1.5B-ONNX.jinja | DeepSeek R1 (extract reasoning) | +| open-thoughts-OpenThinker-7B.jinja | Hermes 2 Pro | +| openchat-openchat-3.5-0106.jinja | Generic | +| pankajmathur-orca_mini_v6_8b.jinja | Generic | +| princeton-nlp-Mistral-7B-Base-SFT-RDPO.jinja | Generic | +| princeton-nlp-Mistral-7B-Instruct-DPO.jinja | Generic | +| princeton-nlp-Mistral-7B-Instruct-RDPO.jinja | Generic | +| prithivMLmods-Bellatrix-Tiny-1.5B-R1.jinja | Hermes 2 Pro | +| prithivMLmods-Bellatrix-Tiny-1B-R1.jinja | Llama 3.x | +| prithivMLmods-Bellatrix-Tiny-1B-v3.jinja | Generic | +| prithivMLmods-Bellatrix-Tiny-3B-R1.jinja | Llama 3.x | +| prithivMLmods-Blaze-14B-xElite.jinja | Generic | +| prithivMLmods-Calcium-Opus-14B-Elite2-R1.jinja | Hermes 2 Pro | +| prithivMLmods-Calme-Ties-78B.jinja | Generic | +| prithivMLmods-Calme-Ties2-78B.jinja | Generic | +| prithivMLmods-Calme-Ties3-78B.jinja | Generic | +| prithivMLmods-ChemQwen2-vL.jinja | Generic | +| prithivMLmods-GWQ2b.jinja | Generic | +| prithivMLmods-LatexMind-2B-Codec.jinja | Generic | +| prithivMLmods-Llama-3.2-6B-AlgoCode.jinja | Llama 3.x | +| prithivMLmods-Megatron-Opus-14B-Exp.jinja | Hermes 2 Pro | +| prithivMLmods-Megatron-Opus-14B-Stock.jinja | Hermes 2 Pro | +| prithivMLmods-Megatron-Opus-7B-Exp.jinja | Hermes 2 Pro | +| prithivMLmods-Omni-Reasoner-Merged.jinja | Hermes 2 Pro | +| prithivMLmods-Omni-Reasoner4-Merged.jinja | Hermes 2 Pro | +| prithivMLmods-Primal-Opus-14B-Optimus-v1.jinja | Hermes 2 Pro | +| prithivMLmods-QwQ-Math-IO-500M.jinja | Hermes 2 Pro | +| prithivMLmods-Qwen-7B-Distill-Reasoner.jinja | DeepSeek R1 (extract reasoning) | +| prithivMLmods-Qwen2.5-1.5B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro | +| prithivMLmods-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro | +| prithivMLmods-Qwen2.5-32B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro | +| prithivMLmods-Qwen2.5-7B-DeepSeek-R1-1M.jinja | Hermes 2 Pro | +| prithivMLmods-Triangulum-v2-10B.jinja | Hermes 2 Pro | +| qingy2024-Falcon3-2x10B-MoE-Instruct.jinja | Hermes 2 Pro | +| rubenroy-Zurich-14B-GCv2-5m.jinja | Hermes 2 Pro | +| rubenroy-Zurich-7B-GCv2-5m.jinja | Hermes 2 Pro | +| silma-ai-SILMA-Kashif-2B-Instruct-v1.0.jinja | Generic | +| simplescaling-s1-32B.jinja | Hermes 2 Pro | +| sometimesanotion-Lamarck-14B-v0.7.jinja | Hermes 2 Pro | +| sonthenguyen-zephyr-sft-bnb-4bit-DPO-mtbr-180steps.jinja | Generic | +| sthenno-tempesthenno-icy-0130.jinja | Generic | +| sumink-qwft.jinja | Hermes 2 Pro | +| teknium-OpenHermes-2.5-Mistral-7B.jinja | Generic | +| thirdeyeai-elevate360m.jinja | Generic | +| tiiuae-Falcon3-10B-Instruct.jinja | Hermes 2 Pro | +| unsloth-DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit.jinja | DeepSeek R1 (extract reasoning) | +| unsloth-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) | +| unsloth-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) | +| unsloth-Mistral-Small-24B-Instruct-2501-unsloth-bnb-4bit.jinja | Generic | +| upstage-solar-pro-preview-instruct.jinja | Generic | +| whyhow-ai-PatientSeek.jinja | Generic | +| xwen-team-Xwen-72B-Chat.jinja | Hermes 2 Pro | +| xwen-team-Xwen-7B-Chat.jinja | Hermes 2 Pro | + +This table can be generated with: + +```bash +./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null +``` + +
+ +# Usage - need tool-aware Jinja template + +First, start a server with any model, but make sure it has a tools-enabled template: you can verify this by inspecting the `chat_template` or `chat_template_tool_use` properties in `http://localhost:8080/props`). + +Here are some models known to work (w/ chat template override when needed): + +```shell +# Native support: + +llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M +llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L +llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M + +# Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it) + +llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \ + --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja + +llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \ + --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja + +# Native support requires the right template for these GGUFs: + +llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M + --chat-template-file models/templates/meetkai-functionary-medium-v3.2.jinja + +llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \ + --chat-template-file models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja + +llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \ + --chat-template-file models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja + +llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \ + --chat-template-file models/templates/fireworks-ai-llama-3-firefunction-v2.jinja + +llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \ + --chat-template-file models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja + +# Generic format support +llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0 +llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0 +llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K +``` + +To get the official template from original HuggingFace repos, you can use [scripts/get_chat_template.py](../scripts/get_chat_template.py) (see examples invocations in [models/templates/README.md](../models/templates/README.md)) + +> [!TIP] +> If there is no official `tool_use` Jinja template, you may want to set `--chat-template chatml` to use a default that works with many models (YMMV!), or write your own (e.g. we provide a custom [llama-cpp-deepseek-r1.jinja](../models/templates/llama-cpp-deepseek-r1.jinja) for DeepSeek R1 distills) + +Test in CLI (or with any library / software that can use OpenAI-compatible API backends): + +```bash +curl http://localhost:8080/v1/chat/completions -d '{ +"model": "gpt-3.5-turbo", +"tools": [ + { + "type":"function", + "function":{ + "name":"python", + "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", + "parameters":{ + "type":"object", + "properties":{ + "code":{ + "type":"string", + "description":"The code to run in the ipython interpreter." + } + }, + "required":["code"] + } + } + } +], +"messages": [ + { + "role": "user", + "content": "Print a hello world message with python." + } +] +}' +``` + +
+Show output + +```json +{ +"choices": [ + { + "finish_reason": "tool", + "index": 0, + "message": { + "content": null, + "tool_calls": [ + { + "name": "python", + "arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}" + } + ], + "role": "assistant" + } + } +], +"created": 1727287211, +"model": "gpt-3.5-turbo", +"object": "chat.completion", +"usage": { + "completion_tokens": 16, + "prompt_tokens": 44, + "total_tokens": 60 +}, +"id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8" +} +``` + +
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 0659ab6f11..430e8be512 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -132,7 +132,7 @@ int main(int argc, char ** argv) { const auto t_pp_start = ggml_time_us(); - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); @@ -141,7 +141,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } } diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 55c31166ca..514989e340 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 { } for i in 1 ..< n_parallel { - llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) + llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) } if n_parallel > 1 { diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 413b71d34c..2a90715501 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } static bool get_hidden_layers(llama_context * ctx, std::vector & tokens) { - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; @@ -394,6 +394,8 @@ static int prepare_entries(common_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { common_params params; + params.out_file = "control_vector.gguf"; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) { return 1; } @@ -498,7 +500,7 @@ int main(int argc, char ** argv) { } // write output vectors to gguf - export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint); + export_gguf(ctx_train.v_final, params.out_file, model_hint); llama_backend_free(); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 38d22c90f8..6f08904159 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -4,6 +4,7 @@ #include "llama.h" #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -37,7 +38,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu const struct llama_model * model = llama_get_model(ctx); // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 91238e4beb..e7d0fbfffe 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -413,20 +413,22 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { common_params params; + params.out_file = "ggml-lora-merged-f16.gguf"; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } g_verbose = (params.verbosity > 1); try { - lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads); + lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads); ctx.run_merge(); } catch (const std::exception & err) { fprintf(stderr, "%s\n", err.what()); exit(EXIT_FAILURE); } - printf("done, output file is %s\n", params.lora_outfile.c_str()); + printf("done, output file is %s\n", params.out_file.c_str()); return 0; } diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 72eb462574..f7db7861c1 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -45,7 +45,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, true); llama_set_causal_attn(ctx, false); @@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_token eos_token = llama_vocab_eos(vocab); - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); llama_set_embeddings(ctx, false); llama_set_causal_attn(ctx, true); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 4edc0bfacf..31b675e8f9 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -206,9 +206,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * void IMatrixCollector::save_imatrix(int ncall) const { auto fname = m_params.out_file; - if (fname.empty()) { - fname = "imatrix.dat"; - } if (ncall > 0) { fname += ".at_"; @@ -498,7 +495,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -583,6 +580,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { int main(int argc, char ** argv) { common_params params; + params.out_file = "imatrix.dat" ; + params.n_ctx = 512; params.logits_all = true; params.escape = false; diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 489a208b66..4e2f7b7270 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -332,8 +332,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); n_past -= n_discard; diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index fc9f0097f5..55f94c0b0a 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -195,7 +195,7 @@ class BuiltinRule: self.deps = deps or [] # Constraining spaces to prevent model "running away". -SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}' +SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}' PRIMITIVE_RULES = { 'boolean' : BuiltinRule('("true" | "false") space', []), diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index f518d02d38..cbcbfcee86 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) { test t(inst, lmodel, ctx); - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); // cool off before the test if (params.delay) { @@ -1618,7 +1618,7 @@ int main(int argc, char ** argv) { } for (int i = 0; i < params.reps; i++) { - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); uint64_t t_start = get_time_ns(); diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2a73983a98..9654cd53cf 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( } batch->logits[batch->n_tokens - 1] = true; - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp_start = ggml_time_us(); if (llama_decode(context, *batch) != 0) { @@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( LOGi("Benchmark text generation (tg)"); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_tg_start = ggml_time_us(); for (i = 0; i < tg; i++) { @@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model( const auto t_tg_end = ggml_time_us(); - llama_kv_cache_clear(context); + llama_kv_self_clear(context); const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0; const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0; @@ -361,7 +361,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init( const auto tokens_list = common_tokenize(context, text, true, parse_special); auto n_ctx = llama_n_ctx(context); - auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + auto n_kv_req = tokens_list.size() + n_len; LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req); @@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( extern "C" JNIEXPORT void JNICALL Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { - llama_kv_cache_clear(reinterpret_cast(context)); + llama_kv_self_clear(reinterpret_cast(context)); } diff --git a/examples/llama.swiftui/README.md b/examples/llama.swiftui/README.md index f717886d66..bd7ce37747 100644 --- a/examples/llama.swiftui/README.md +++ b/examples/llama.swiftui/README.md @@ -5,6 +5,21 @@ point for more advanced projects. For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508 + +### Building +First llama.cpp need to be built and a XCFramework needs to be created. This can be done by running +the following script from the llama.cpp project root: +```console +$ ./build-xcframework.sh +``` +Open `llama.swiftui.xcodeproj` project in Xcode and you should be able to build and run the app on +a simulator or a real device. + +To use the framework with a different project, the XCFramework can be added to the project by +adding `build-apple/llama.xcframework` by dragging and dropping it into the project navigator, or +by manually selecting the framework in the "Frameworks, Libraries, and Embedded Content" section +of the project settings. + ![image](https://github.com/ggml-org/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299) Video demonstration: diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index ee7141a663..f6e31abc93 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -210,7 +210,7 @@ actor LlamaContext { } batch.logits[Int(batch.n_tokens) - 1] = 1 // true - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -223,7 +223,7 @@ actor LlamaContext { // bench text generation - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000; @@ -242,7 +242,7 @@ actor LlamaContext { let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000; - llama_kv_cache_clear(context) + llama_kv_self_clear(context) let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0 let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0 @@ -292,7 +292,7 @@ actor LlamaContext { func clear() { tokens_list.removeAll() temporary_invalid_cchars.removeAll() - llama_kv_cache_clear(context) + llama_kv_self_clear(context) } private func tokenize(text: String, add_bos: Bool) -> [llama_token] { diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj index ff3d108b2a..6f08fe220a 100644 --- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj @@ -7,7 +7,6 @@ objects = { /* Begin PBXBuildFile section */ - 1809696D2D05A39F00400EE8 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = 1809696C2D05A39F00400EE8 /* llama */; }; 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; 79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; }; 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; }; @@ -18,9 +17,25 @@ 8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; }; 8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; }; 8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; }; + DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; }; + DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = DD84C9FC2D747FED007778EC /* llama.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; }; /* End PBXBuildFile section */ +/* Begin PBXCopyFilesBuildPhase section */ + DD84C9FF2D747FED007778EC /* Embed Frameworks */ = { + isa = PBXCopyFilesBuildPhase; + buildActionMask = 2147483647; + dstPath = ""; + dstSubfolderSpec = 10; + files = ( + DD84C9FE2D747FED007778EC /* llama.xcframework in Embed Frameworks */, + ); + name = "Embed Frameworks"; + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXCopyFilesBuildPhase section */ + /* Begin PBXFileReference section */ 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = ""; }; @@ -33,6 +48,7 @@ 8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = ""; }; 8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = ""; }; 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = ""; }; + DD84C9FC2D747FED007778EC /* llama.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = llama.xcframework; path = "../../build-apple/llama.xcframework"; sourceTree = ""; }; DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = ""; }; F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = ""; }; /* End PBXFileReference section */ @@ -42,9 +58,9 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 1809696D2D05A39F00400EE8 /* llama in Frameworks */, 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */, 8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */, + DD84C9FD2D747FED007778EC /* llama.xcframework in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -86,6 +102,7 @@ 8A39BE082AC7601000BFEB40 /* Frameworks */ = { isa = PBXGroup; children = ( + DD84C9FC2D747FED007778EC /* llama.xcframework */, 549479CA2AC9E16000E0F78B /* Metal.framework */, 8A39BE092AC7601000BFEB40 /* Accelerate.framework */, ); @@ -144,6 +161,7 @@ 8A1C836F2AC328BD0096AF73 /* Sources */, 8A1C83702AC328BD0096AF73 /* Frameworks */, 8A1C83712AC328BD0096AF73 /* Resources */, + DD84C9FF2D747FED007778EC /* Embed Frameworks */, ); buildRules = ( ); @@ -151,7 +169,6 @@ ); name = llama.swiftui; packageProductDependencies = ( - 1809696C2D05A39F00400EE8 /* llama */, ); productName = llama.swiftui; productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */; @@ -427,13 +444,6 @@ defaultConfigurationName = Release; }; /* End XCConfigurationList section */ - -/* Begin XCSwiftPackageProductDependency section */ - 1809696C2D05A39F00400EE8 /* llama */ = { - isa = XCSwiftPackageProductDependency; - productName = llama; - }; -/* End XCSwiftPackageProductDependency section */ }; rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */; } diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index 30c2dc4310..1c3cd9d2ef 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -124,15 +124,26 @@ struct ContentView: View { } } }.sheet(isPresented: $showingHelp) { // Sheet for help modal - VStack(alignment: .leading) { + NavigationView { VStack(alignment: .leading) { - Text("1. Make sure the model is in GGUF Format") - .padding() - Text("2. Copy the download link of the quantized model") - .padding() + VStack(alignment: .leading) { + Text("1. Make sure the model is in GGUF Format") + .padding() + Text("2. Copy the download link of the quantized model") + .padding() + } + Spacer() } - Spacer() - } + .navigationTitle("Help") + .navigationBarTitleDisplayMode(.inline) + .toolbar { + ToolbarItem(placement: .navigationBarTrailing) { + Button("Done") { + showingHelp = false + } + } + } + } } } } diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 319effd199..f275ce1ccd 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -51,6 +51,13 @@ install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) +set(TARGET llama-gemma3-cli) +add_executable(${TARGET} gemma3-cli.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + set(TARGET llama-llava-clip-quantize-cli) add_executable(${TARGET} clip-quantize-cli.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-clip-quantize-cli) diff --git a/examples/llava/README-gemma3.md b/examples/llava/README-gemma3.md new file mode 100644 index 0000000000..20bf73fb5c --- /dev/null +++ b/examples/llava/README-gemma3.md @@ -0,0 +1,30 @@ +# Gemma 3 vision + +> [!IMPORTANT] +> +> This is very experimental, only used for demo purpose. + +## How to get mmproj.gguf? + +```bash +cd gemma-3-4b-it +python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py . + +# output file is mmproj.gguf +``` + +## How to run it? + +What you need: +- The text model GGUF, can be converted using `convert_hf_to_gguf.py` +- The mmproj file from step above +- An image file + +```bash +# build +cmake -B build +cmake --build build --target llama-gemma3-cli + +# run it +./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg +``` diff --git a/examples/llava/README-granitevision.md b/examples/llava/README-granitevision.md new file mode 100644 index 0000000000..f08a21cc17 --- /dev/null +++ b/examples/llava/README-granitevision.md @@ -0,0 +1,190 @@ +# Granite Vision + +Download the model and point your `GRANITE_MODEL` environment variable to the path. + +```bash +$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b +$ export GRANITE_MODEL=./granite-vision-3.2-2b +``` + + +### 1. Running llava surgery v2. +First, we need to run the llava surgery script as shown below: + +`python llava_surgery_v2.py -C -m $GRANITE_MODEL` + +You should see two new files (`llava.clip` and `llava.projector`) written into your model's directory, as shown below. + +```bash +$ ls $GRANITE_MODEL | grep -i llava +llava.clip +llava.projector +``` + +We should see that the projector and visual encoder get split out into the llava files. Quick check to make sure they aren't empty: +```python +import os +import torch + +MODEL_PATH = os.getenv("GRANITE_MODEL") +if not MODEL_PATH: + raise ValueError("env var GRANITE_MODEL is unset!") + +encoder_tensors = torch.load(os.path.join(MODEL_PATH, "llava.clip")) +projector_tensors = torch.load(os.path.join(MODEL_PATH, "llava.projector")) + +assert len(encoder_tensors) > 0 +assert len(projector_tensors) > 0 +``` + +If you actually inspect the `.keys()` of the loaded tensors, you should see a lot of `vision_model` tensors in the `encoder_tensors`, and 5 tensors (`'multi_modal_projector.linear_1.bias'`, `'multi_modal_projector.linear_1.weight'`, `'multi_modal_projector.linear_2.bias'`, `'multi_modal_projector.linear_2.weight'`, `'image_newline'`) in the multimodal `projector_tensors`. + + +### 2. Creating the Visual Component GGUF +Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below. + +```bash +$ ENCODER_PATH=$PWD/visual_encoder +$ mkdir $ENCODER_PATH + +$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin +$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/ +``` + +Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`. + +```json +{ + "_name_or_path": "siglip-model", + "architectures": [ + "SiglipVisionModel" + ], + "image_grid_pinpoints": [ + [384,384], + [384,768], + [384,1152], + [384,1536], + [384,1920], + [384,2304], + [384,2688], + [384,3072], + [384,3456], + [384,3840], + [768,384], + [768,768], + [768,1152], + [768,1536], + [768,1920], + [1152,384], + [1152,768], + [1152,1152], + [1536,384], + [1536,768], + [1920,384], + [1920,768], + [2304,384], + [2688,384], + [3072,384], + [3456,384], + [3840,384] + ], + "mm_patch_merge_type": "spatial_unpad", + "hidden_size": 1152, + "image_size": 384, + "intermediate_size": 4304, + "model_type": "siglip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + "layer_norm_eps": 1e-6, + "hidden_act": "gelu_pytorch_tanh", + "projection_dim": 0, + "vision_feature_layer": [-24, -20, -12, -1] +} +``` + +At this point you should have something like this: +```bash +$ ls $ENCODER_PATH +config.json llava.projector pytorch_model.bin +``` + +Now convert the components to GGUF; Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the SigLIP visual encoder - in the transformers model, you can find these numbers in the `preprocessor_config.json`. +```bash +$ python convert_image_encoder_to_gguf.py \ + -m $ENCODER_PATH \ + --llava-projector $ENCODER_PATH/llava.projector \ + --output-dir $ENCODER_PATH \ + --clip-model-is-vision \ + --clip-model-is-siglip \ + --image-mean 0.5 0.5 0.5 \ + --image-std 0.5 0.5 0.5 +``` + +This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as the `$VISUAL_GGUF_PATH.` + + +### 3. Creating the LLM GGUF. +The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path. + +First, set the `LLM_EXPORT_PATH` to the path to export the `transformers` LLM to. +```bash +$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm +``` + +```python +import os +import transformers + +MODEL_PATH = os.getenv("GRANITE_MODEL") +if not MODEL_PATH: + raise ValueError("env var GRANITE_MODEL is unset!") + +LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH") +if not LLM_EXPORT_PATH: + raise ValueError("env var LLM_EXPORT_PATH is unset!") + +tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH) + +# NOTE: granite vision support was added to transformers very recently (4.49); +# if you get size mismatches, your version is too old. +# If you are running with an older version, set `ignore_mismatched_sizes=True` +# as shown below; it won't be loaded correctly, but the LLM part of the model that +# we are exporting will be loaded correctly. +model = transformers.AutoModelForImageTextToText.from_pretrained(MODEL_PATH, ignore_mismatched_sizes=True) + +tokenizer.save_pretrained(LLM_EXPORT_PATH) +model.language_model.save_pretrained(LLM_EXPORT_PATH) +``` + +Now you can convert the exported LLM to GGUF with the normal converter in the root of the llama cpp project. +```bash +$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm.gguf +... +$ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH +``` + + +### 4. Quantization +If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example: +```bash +$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M +$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf +``` + +Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32. + + +### 5. Running the Model in Llama cpp +Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner. + +```bash +$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \ + --mmproj $VISUAL_GGUF_PATH \ + --image ./media/llama0-banner.png \ + -c 16384 \ + -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\\nWhat does the text in this image say?\n<|assistant|>\n" \ + --temp 0 +``` + +Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"` diff --git a/examples/llava/README-minicpmo2.6.md b/examples/llava/README-minicpmo2.6.md index 8f591506db..48c4232383 100644 --- a/examples/llava/README-minicpmo2.6.md +++ b/examples/llava/README-minicpmo2.6.md @@ -5,13 +5,25 @@ Currently, this readme only supports minicpm-omni's image capabilities, and we w Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder. + +### Build llama.cpp +Readme modification time: 20250206 + +If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) + Clone llama.cpp: ```bash -git clone git@github.com:OpenBMB/llama.cpp.git +git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -git checkout minicpm-omni ``` +Build llama.cpp using `CMake`: +```bash +cmake -B build +cmake --build build --config Release +``` + + ### Usage of MiniCPM-o 2.6 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us) @@ -22,25 +34,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM- python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model # quantize int4 version -./llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M +./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M ``` -Build llama.cpp using `CMake`: -https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md - -```bash -cmake -B build -cmake --build build --config Release -``` Inference on Linux or Mac -``` +```bash # run f16 version -./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" # run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i +./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ``` diff --git a/examples/llava/README-minicpmv2.5.md b/examples/llava/README-minicpmv2.5.md index b0e72a0fa7..6bfe7abd16 100644 --- a/examples/llava/README-minicpmv2.5.md +++ b/examples/llava/README-minicpmv2.5.md @@ -4,13 +4,26 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder. + +### Build llama.cpp +Readme modification time: 20250206 + +If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) + Clone llama.cpp: ```bash git clone https://github.com/ggml-org/llama.cpp cd llama.cpp ``` -### Usage +Build llama.cpp using `CMake`: +```bash +cmake -B build +cmake --build build --config Release +``` + + +### Usage of MiniCPM-Llama3-V 2.5 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us) @@ -20,80 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM- python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model # quantize int4 version -./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M +./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M ``` -Build for Linux or Mac - -```bash -make -make llama-minicpmv-cli -``` Inference on Linux or Mac -``` +```bash # run f16 version -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" # run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i -``` - -### Android - -#### Build on Android device using Termux -We found that build on Android device would bring better runtime performance, so we recommend to build on device. - -[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required). - -Install tools in Termux: -``` -apt update && apt upgrade -y -apt install git make cmake -``` - -It's recommended to move your model inside the `~/` directory for best performance: -``` -cd storage/downloads -mv model.gguf ~/ -``` - -#### Building the Project using Android NDK -Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. - -Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: - -```bash -mkdir build-android -cd build-android -export NDK=/your_ndk_path -cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. -make -``` - -Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). - -Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: - -(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) -``` -$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ -$cd /data/data/com.termux/files/home/bin -$chmod +x ./* -``` - -Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` - -``` -$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ -$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ -``` - -Now, you can start chatting: -``` -$cd /data/data/com.termux/files/home/bin -$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ``` diff --git a/examples/llava/README-minicpmv2.6.md b/examples/llava/README-minicpmv2.6.md index c4be5e5dd6..2df39cdbac 100644 --- a/examples/llava/README-minicpmv2.6.md +++ b/examples/llava/README-minicpmv2.6.md @@ -4,13 +4,25 @@ Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder. + +### Build llama.cpp +Readme modification time: 20250206 + +If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) + Clone llama.cpp: ```bash -git clone git@github.com:OpenBMB/llama.cpp.git +git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -git checkout minicpmv-main ``` +Build llama.cpp using `CMake`: +```bash +cmake -B build +cmake --build build --config Release +``` + + ### Usage of MiniCPM-V 2.6 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us) @@ -21,87 +33,15 @@ python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM- python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model # quantize int4 version -./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M +./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M ``` -Build for Linux or Mac - -```bash -make -make llama-minicpmv-cli -``` Inference on Linux or Mac -``` +```bash # run f16 version -./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" # run quantized int4 version -./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" - -# or run in interactive mode -./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i -``` - -### Video -Install FFmpeg -``` -brew install ffmpeg -brew install pkg-config -``` - -### Android - -#### Build on Android device using Termux -We found that build on Android device would bring better runtime performance, so we recommend to build on device. - -[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required). - -Install tools in Termux: -``` -apt update && apt upgrade -y -apt install git make cmake -``` - -It's recommended to move your model inside the `~/` directory for best performance: -``` -cd storage/downloads -mv model.gguf ~/ -``` - -#### Building the Project using Android NDK -Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. - -Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: - -```bash -mkdir build-android -cd build-android -export NDK=/your_ndk_path -cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. -make -``` - -Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). - -Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: - -(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) -``` -$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/ -$cd /data/data/com.termux/files/home/bin -$chmod +x ./* -``` - -Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/` - -``` -$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/ -$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/ -``` - -Now, you can start chatting: -``` -$cd /data/data/com.termux/files/home/bin -$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" ``` diff --git a/examples/llava/README.md b/examples/llava/README.md index 0124513617..0e3c320320 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -101,8 +101,27 @@ python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknow ``` **note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) + **note** llava-1.6 greatly benefits from batched prompt processing (defaults work) +**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way handle the LLM model conversion is to load the model in transformers, and export only the LLM from the llava next model. + +```python +import os +import transformers + +model_path = ... +llm_export_path = ... + +tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) +model = transformers.AutoModelForImageTextToText.from_pretrained(model_path) + +tokenizer.save_pretrained(llm_export_path) +model.language_model.save_pretrained(llm_export_path) +``` + +Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures. + ## llava-cli templating and llava-1.6 prompting llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."` diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index bf70e351c9..a1f050e39a 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -4,31 +4,12 @@ // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" #include "ggml.h" +#include "ggml-cpp.h" #include "ggml-cpu.h" #include "ggml-alloc.h" #include "ggml-backend.h" #include "gguf.h" -//#ifdef GGML_USE_CUDA -//#include "ggml-cuda.h" -//#endif -// -//#ifdef GGML_USE_SYCL -//#include "ggml-sycl.h" -//#endif -// -//#ifdef GGML_USE_METAL -//#include "ggml-metal.h" -//#endif -// -//#ifdef GGML_USE_CANN -//#include "ggml-cann.h" -//#endif -// -//#ifdef GGML_USE_VULKAN -//#include "ggml-vulkan.h" -//#endif - #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -40,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -120,6 +102,7 @@ static std::string format(const char * fmt, ...) { #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" #define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_FEATURE_LAYER "clip.vision.feature_layer" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -153,6 +136,8 @@ static std::string format(const char * fmt, ...) { #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s" #define TN_IMAGE_NEWLINE "model.image_newline" +#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 +#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" #define TN_MINICPMV_QUERY "resampler.query" @@ -179,6 +164,7 @@ enum projector_type { PROJECTOR_TYPE_RESAMPLER, PROJECTOR_TYPE_GLM_EDGE, PROJECTOR_TYPE_MERGER, + PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_UNKNOWN, }; @@ -189,6 +175,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_RESAMPLER, "resampler"}, { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, + { PROJECTOR_TYPE_GEMMA3, "gemma3"}, }; @@ -315,7 +302,7 @@ static projector_type clip_projector_type_from_string(const std::string & name) return kv.first; } } - return PROJECTOR_TYPE_UNKNOWN; + throw std::runtime_error(format("Unknown projector type: %s", name.c_str())); } #ifdef CLIP_DEBUG_FUNCTIONS @@ -444,8 +431,9 @@ struct clip_hparams { char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default) - int32_t image_grid_pinpoints[32]; + std::vector image_grid_pinpoints; int32_t image_crop_resolution; + std::unordered_set vision_feature_layer; }; struct clip_layer { @@ -571,6 +559,10 @@ struct clip_vision_model { struct ggml_tensor * mm_model_ln_kv_b; struct ggml_tensor * mm_model_ln_post_w; struct ggml_tensor * mm_model_ln_post_b; + + // gemma3 + struct ggml_tensor * mm_input_proj_w; + struct ggml_tensor * mm_soft_emb_norm_w; }; struct clip_ctx { @@ -585,6 +577,7 @@ struct clip_ctx { struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; + int32_t max_feature_layer; // unused in newer models like gemma3 float image_mean[3]; float image_std[3]; bool use_gelu = false; @@ -596,21 +589,209 @@ struct clip_ctx { bool has_post_norm = false; bool has_patch_bias = false; - struct gguf_context * ctx_gguf; - struct ggml_context * ctx_data; + struct gguf_context * ctx_gguf = nullptr; + struct ggml_context * ctx_data = nullptr; std::vector buf_compute_meta; - // memory buffers to evaluate the model - ggml_backend_buffer_t params_buffer = NULL; + std::vector backend_ptrs; + std::vector backend_buft; - ggml_backend_t backend = NULL; - ggml_gallocr_t compute_alloc = NULL; + ggml_backend_t backend = nullptr; + ggml_backend_t backend_cpu = nullptr; + ggml_backend_buffer_t buf = nullptr; - struct clip_image_size * load_image_size; + ggml_backend_sched_ptr sched; + + struct clip_image_size * load_image_size = nullptr; + + clip_ctx(clip_context_params & ctx_params) { + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + backend = ctx_params.use_gpu + ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr) + : nullptr; + + if (backend) { + LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend)); + backend_ptrs.push_back(backend); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); + } else { + backend = backend_cpu; + LOG_INF("%s: CLIP using CPU backend\n", __func__); + } + + backend_ptrs.push_back(backend_cpu); + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); + + sched.reset( + ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false) + ); + } + + ~clip_ctx() { + ggml_free(ctx_data); + gguf_free(ctx_gguf); + ggml_backend_buffer_free(buf); + ggml_backend_free(backend); + if (backend_cpu != backend) { + ggml_backend_free(backend_cpu); + } + } }; -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { +static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) { + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size = hparams.image_size; + int image_size_width = image_size; + int image_size_height = image_size; + + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + GGML_ASSERT(imgs->size == 1); // batch_size == 1 + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + // input raw + struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + inp = ggml_add(ctx0, inp, model.patch_bias); + + // position embeddings + struct ggml_tensor * embeddings = ggml_add(ctx0, inp, model.position_embeddings); + + // loop over layers + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + + // layernorm1 + { + cur = ggml_norm(ctx0, cur, eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w), model.layers[il].ln_1_b); + } + + // self-attention + { + + struct ggml_tensor * Q = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + + struct ggml_tensor * K = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); + + K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + + struct ggml_tensor * V = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); + + V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_scale_inplace(ctx0, KQ, 1.0f / sqrtf((float)d_head)); + KQ = ggml_soft_max_inplace(ctx0, KQ); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); + } + + // attention output + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // layernorm2 + { + cur = ggml_norm(ctx0, cur, eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + + // siglip uses gelu + cur = ggml_gelu(ctx0, cur); + + cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // post-layernorm + if (ctx->has_post_norm) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "post_ln"); + + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); + } + + if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + const int batch_size = 1; + const int mm_tokens_per_image = 256; // default value for gemma3 + const int tokens_per_side = sqrt(mm_tokens_per_image); + const int patches_per_image = sqrt(num_patches); + const int kernel_size = patches_per_image / tokens_per_side; + + embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); + embeddings = ggml_reshape_4d(ctx0, embeddings, patches_per_image, patches_per_image, hidden_size, batch_size); + + // doing a pool2d to reduce the number of output tokens to 256 + embeddings = ggml_pool_2d(ctx0, embeddings, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0] * embeddings->ne[0], hidden_size, batch_size); + embeddings = ggml_cont(ctx0, ggml_transpose(ctx0, embeddings)); + + // apply norm before projection + embeddings = ggml_rms_norm(ctx0, embeddings, eps); + embeddings = ggml_mul(ctx0, embeddings, model.mm_soft_emb_norm_w); + + // apply projection + embeddings = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + embeddings); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + ggml_free(ctx0); + + return gf; +} + +static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { LOG_ERR("This gguf file seems to have no vision encoder\n"); return nullptr; @@ -651,7 +832,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; - int n_layer = hparams.n_layer; const float eps = hparams.eps; int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; @@ -752,13 +932,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b); } + std::vector embedding_stack; + const auto & vision_feature_layer = hparams.vision_feature_layer; + // loop over layers - if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) { - n_layer += 1; - } - for (int il = 0; il < n_layer - 1; il++) { + for (int il = 0; il < ctx->max_feature_layer; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + // If this is an embedding feature layer, save the output. + // NOTE: 0 index here refers to the input to the encoder. + if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + embedding_stack.push_back(embeddings); + } + //const size_t nb_q_w = model.layers[il].q_w->nb[0]; // layernorm1 @@ -846,7 +1032,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 cur = ggml_add(ctx0, embeddings, cur); embeddings = cur; - } // post-layernorm @@ -857,6 +1042,19 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); } + // final layer is a vision feature layer + if (vision_feature_layer.find(ctx->max_feature_layer) != vision_feature_layer.end()) { + embedding_stack.push_back(embeddings); + } + + // If feature layers are explicitly set, stack them (if we have multiple) + if (!embedding_stack.empty()) { + embeddings = embedding_stack[0]; + for (size_t i = 1; i < embedding_stack.size(); i++) { + embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); + } + } + // llava projector if (ctx->has_llava_projector) { embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); @@ -1139,7 +1337,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } else { GGML_ABORT("fatel error"); } - } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + } + else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); @@ -1161,8 +1360,25 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 return gf; } +static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { + if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + return clip_image_build_graph_siglip(ctx, imgs); + } else { + // TODO: we should have one build_* function per model + return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + } +} + // read and create ggml_context containing the tensors and their data struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { + return clip_init(fname, clip_context_params{ + /* use_gpu */ true, + /* verbosity */ verbosity, + }); +} + +struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) { + int verbosity = ctx_params.verbosity; struct ggml_context * meta = NULL; struct gguf_init_params params = { @@ -1256,7 +1472,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } - clip_ctx * new_clip = new clip_ctx{}; + clip_ctx * new_clip = new clip_ctx(ctx_params); // update projector type { @@ -1275,36 +1491,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } -//#ifdef GGML_USE_CUDA -// new_clip->backend = ggml_backend_cuda_init(0); -// LOG_INF("%s: CLIP using CUDA backend\n", __func__); -//#endif -// -//#ifdef GGML_USE_METAL -// new_clip->backend = ggml_backend_metal_init(); -// LOG_INF("%s: CLIP using Metal backend\n", __func__); -//#endif -// -//#ifdef GGML_USE_CANN -// new_clip->backend = ggml_backend_cann_init(0); -// LOG_INF("%s: CLIP using CANN backend\n", __func__); -//#endif -// -//#ifdef GGML_USE_VULKAN -// new_clip->backend = ggml_backend_vk_init(0); -// LOG_INF("%s: CLIP using Vulkan backend\n", __func__); -//#endif -// -//#ifdef GGML_USE_SYCL -// new_clip->backend = ggml_backend_sycl_init(0); -// LOG_INF("%s: CLIP using SYCL backend\n", __func__); -//#endif - - if (!new_clip->backend) { - new_clip->backend = ggml_backend_cpu_init(); - LOG_INF("%s: CLIP using CPU backend\n", __func__); - } - // model size and capabilities { int idx = get_key_idx(ctx, KEY_HAS_TEXT_ENC); @@ -1342,8 +1528,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { GGML_ASSERT(new_clip->has_vision_encoder); GGML_ASSERT(!new_clip->has_text_encoder); - idx = get_key_idx(ctx, KEY_USE_GELU); - new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + try { + idx = get_key_idx(ctx, KEY_USE_GELU); + new_clip->use_gelu = gguf_get_val_bool(ctx, idx); + } catch (std::runtime_error & /*e*/) { + new_clip->use_gelu = false; + } try { idx = get_key_idx(ctx, KEY_USE_SILU); @@ -1357,6 +1547,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); + LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version); LOG_INF("%s: glm_projector: %d\n", __func__, new_clip->has_glm_projector); LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); @@ -1399,7 +1590,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } // alloc memory and offload data - new_clip->params_buffer = ggml_backend_alloc_ctx_tensors(new_clip->ctx_data, new_clip->backend); + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(new_clip->backend); + new_clip->buf = ggml_backend_alloc_ctx_tensors_from_buft(new_clip->ctx_data, buft); + ggml_backend_buffer_set_usage(new_clip->buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); for (int i = 0; i < n_tensors; ++i) { const char * name = gguf_get_tensor_name(ctx, i); struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name); @@ -1412,7 +1605,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { return nullptr; } int num_bytes = ggml_nbytes(cur); - if (ggml_backend_buffer_is_host(new_clip->params_buffer)) { + if (ggml_backend_buft_is_host(buft)) { // for the CPU and Metal backend, we can read directly into the tensor fin.read(reinterpret_cast(cur->data), num_bytes); } else { @@ -1443,14 +1636,26 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS); int n = gguf_get_arr_n(ctx, idx); const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx); - for (int i = 0; i < 32 && i < n && pinpoints[i] != 0; ++i) { - hparams.image_grid_pinpoints[i] = pinpoints[i]; + for (int i = 0; i < n; ++i) { + hparams.image_grid_pinpoints.push_back(pinpoints[i]); } - if (n < 32) - hparams.image_grid_pinpoints[n] = 0; - } catch (std::runtime_error & /*e*/) { - hparams.image_grid_pinpoints[0]=0; - } + } catch (std::runtime_error & /*e*/) { } + + // Load the vision feature layer indices if they are explicitly provided; + // if multiple vision feature layers are present, the values will be concatenated + // to form the final visual features. + // NOTE: gguf conversions should standardize the values of the vision feature layer to + // be non-negative, since we use -1 to mark values as unset here. + try { + int idx = get_key_idx(ctx, KEY_FEATURE_LAYER); + int n = gguf_get_arr_n(ctx, idx); + + const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx); + + for (int i = 0; i < n; ++i) { + hparams.vision_feature_layer.insert(vision_feature_layer[i]); + } + } catch (std::runtime_error & /*e*/) { } try { int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE); @@ -1476,6 +1681,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->image_std[i] = std_data[i]; } + // Calculate the deepest feature layer based on hparams and projector type + new_clip->max_feature_layer = get_deepest_feature_layer(new_clip); + if (verbosity >= 2) { LOG_INF("\n%s: vision model hparams\n", __func__); LOG_INF("image_size %d\n", hparams.image_size); @@ -1489,8 +1697,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); LOG_INF("v_image_grid_pinpoints: "); - for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { - LOG_INF("%d ", hparams.image_grid_pinpoints[i]); + for (const auto & pp : hparams.image_grid_pinpoints) { + LOG_INF("%d ", pp); + } + LOG_INF("\n"); + LOG_INF("v_vision_feature_layer: "); + for (const auto & feature_layer: hparams.vision_feature_layer) { + LOG_INF("%d ", feature_layer); } LOG_INF("\n"); LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); @@ -1528,11 +1741,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } try { - vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + vision_model.patch_embeddings_0 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); + } catch(const std::exception& /*e*/) { + vision_model.patch_embeddings_0 = nullptr; + } + + try { vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); } catch(const std::exception& /*e*/) { - LOG_ERR("%s: failed to load vision model tensors\n", __func__); + vision_model.position_embeddings = nullptr; } + try { vision_model.patch_embeddings_1 = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD_1); } catch(const std::exception& /*e*/) { @@ -1643,6 +1862,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight")); vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias")); } + else if (new_clip->proj_type == PROJECTOR_TYPE_GEMMA3) { + vision_model.mm_input_proj_w = get_tensor(new_clip->ctx_data, TN_MM_INP_PROJ); + vision_model.mm_soft_emb_norm_w = get_tensor(new_clip->ctx_data, TN_MM_SOFT_EMB_N); + } else { std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -1678,14 +1901,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // measure mem requirement and allocate { new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); - new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend)); clip_image_f32_batch batch; batch.size = 1; batch.data = nullptr; ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); - ggml_gallocr_reserve(new_clip->compute_alloc, gf); - size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); - LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + ggml_backend_sched_reserve(new_clip->sched.get(), gf); + for (size_t i = 0; i < new_clip->backend_ptrs.size(); ++i) { + ggml_backend_t backend = new_clip->backend_ptrs[i]; + ggml_backend_buffer_type_t buft = new_clip->backend_buft[i]; + size_t size = ggml_backend_sched_get_buffer_size(new_clip->sched.get(), backend); + if (size > 1) { + LOG_INF("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } + } } return new_clip; @@ -1729,11 +1959,11 @@ void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { } } -static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) { +void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) { img->nx = nx; img->ny = ny; img->buf.resize(3 * nx * ny); - memcpy(img->buf.data(), data, img->buf.size()); + memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { @@ -1743,7 +1973,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); return false; } - build_clip_img_from_data(data, nx, ny, img); + clip_build_img_from_pixels(data, nx, ny, img); stbi_image_free(data); return true; } @@ -1755,7 +1985,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length LOG_ERR("%s: failed to decode image bytes\n", __func__); return false; } - build_clip_img_from_data(data, nx, ny, img); + clip_build_img_from_pixels(data, nx, ny, img); stbi_image_free(data); return true; } @@ -2177,7 +2407,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli return true; } - if (ctx->has_glm_projector) { + if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { res_imgs->size = 1; res_imgs->data = new clip_image_f32[res_imgs->size]; clip_image_u8 resized_image; @@ -2235,10 +2465,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } } } else { - if (params.image_grid_pinpoints[0] != 0) { + if (!params.image_grid_pinpoints.empty()) { // "spatial_unpad" with "anyres" processing for llava-1.6 std::vector> possible_resolutions; - for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) { + for (size_t i = 0; i < params.image_grid_pinpoints.size(); i+=2) { possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); } std::pair best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); @@ -2366,12 +2596,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { } void clip_free(clip_ctx * ctx) { - ggml_free(ctx->ctx_data); - gguf_free(ctx->ctx_gguf); - - ggml_backend_buffer_free(ctx->params_buffer); - ggml_backend_free(ctx->backend); - ggml_gallocr_free(ctx->compute_alloc); delete ctx; } @@ -2404,7 +2628,14 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) { } const int32_t * clip_image_grid(const struct clip_ctx * ctx) { - return ctx->vision_model.hparams.image_grid_pinpoints; + if (ctx->vision_model.hparams.image_grid_pinpoints.size()) { + return &ctx->vision_model.hparams.image_grid_pinpoints.front(); + } + return nullptr; +} + +size_t get_clip_image_grid_size(const struct clip_ctx * ctx) { + return ctx->vision_model.hparams.image_grid_pinpoints.size(); } int clip_n_patches(const struct clip_ctx * ctx) { @@ -2560,8 +2791,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } // build the inference graph + ggml_backend_sched_reset(ctx->sched.get()); ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true); - ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); + ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs const auto & model = ctx->vision_model; @@ -2700,6 +2932,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); free(positions_data); } + else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + // do nothing + } else { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); @@ -2726,11 +2961,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } - if (ggml_backend_is_cpu(ctx->backend)) { - ggml_backend_cpu_set_n_threads(ctx->backend, n_threads); - } + ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); - ggml_backend_graph_compute(ctx->backend, gf); + auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); + if (status != GGML_STATUS_SUCCESS) { + LOG_ERR("%s: ggml_backend_sched_graph_compute failed with error %d\n", __func__, status); + return false; + } // the last node is the embedding tensor struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); @@ -2910,6 +3147,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { return ctx->vision_model.mm_1_b->ne[0]; } + if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + return ctx->vision_model.mm_input_proj_w->ne[0]; + } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -2929,6 +3169,28 @@ bool clip_is_qwen2vl(const struct clip_ctx * ctx) { return ctx->has_qwen2vl_merger; } +// Determine the number of encoder layers to iterate over +int get_deepest_feature_layer(const struct clip_ctx * ctx) { + // Get the index of the second to last layer; this is the + // default for models that have a llava projector + const auto & hparams = ctx->vision_model.hparams; + int n_layer = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + // Handle other projectors; incrementing here indicates that we + // should use the last encoder layer for the vision features. + if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) { + n_layer += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer; +} bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { clip_image_f32 clip_img; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 841b4f6f90..47059ca1b9 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -39,8 +39,15 @@ struct clip_image_f32_batch { size_t size; }; -CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); -CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); +struct clip_context_params { + bool use_gpu; + int verbosity; +}; + +// deprecated, use clip_init +CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); + +CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params); CLIP_API void clip_free(struct clip_ctx * ctx); @@ -55,6 +62,7 @@ CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); +CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); @@ -73,6 +81,12 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img); CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); +/** + * Build image from pixels decoded by other libraries instead of stb_image.h for better performance. + * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes + */ +CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img); + CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img); /** interpret bytes as an image file with length bytes_length, and use the result to populate img */ @@ -89,11 +103,13 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); +CLIP_API bool clip_is_glm(const struct clip_ctx * ctx); CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); +CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx); + CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); -CLIP_API bool clip_is_glm(const struct clip_ctx * ctx); #ifdef __cplusplus } diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert_image_encoder_to_gguf.py index 4fa1d6ceae..2949faec42 100644 --- a/examples/llava/convert_image_encoder_to_gguf.py +++ b/examples/llava/convert_image_encoder_to_gguf.py @@ -6,7 +6,7 @@ import re import torch import numpy as np from gguf import * -from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel +from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel TEXT = "clip.text" VISION = "clip.vision" @@ -37,6 +37,18 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b def get_tensor_name(name: str) -> str: + # Standardize the transformers llava next keys for + # image newline / mm projector with the classes in haotian-liu LLaVA + if name == "image_newline": + return "model.image_newline" + if name.startswith("multi_modal_projector"): + name = name.replace("multi_modal_projector", "mm") + if "linear_1" in name: + name = name.replace("linear_1", "0") + if "linear_2" in name: + name = name.replace("linear_2", "2") + return name + if "projection" in name: return name if "mm_projector" in name: @@ -77,14 +89,21 @@ def bytes_to_unicode(): ap = argparse.ArgumentParser() ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True) ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16") +ap.add_argument('--bigendian', action="store_true", default=False, help="Model is executed on big-endian machine") ap.add_argument("--text-only", action="store_true", required=False, help="Save a text-only model. It can't be used to encode images") ap.add_argument("--vision-only", action="store_true", required=False, help="Save a vision-only model. It can't be used to encode texts") ap.add_argument("--clip-model-is-vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") -ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, + +# Selectable visual encoders that are compatible with this script +encoder_group = ap.add_mutually_exclusive_group() +encoder_group.add_argument("--clip-model-is-openclip", action="store_true", required=False, help="The clip model is from openclip (for ViT-SO400M type))") +encoder_group.add_argument("--clip-model-is-siglip", action="store_true", required=False, + help="the visual encoder is Siglip.") + ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp") ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) @@ -109,7 +128,12 @@ if args.use_f32: # output in the same directory as the model if output_dir is None dir_model = args.model_dir -if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: +if ( + args.clip_model_is_vision or + not os.path.exists(dir_model + "/vocab.json") or + args.clip_model_is_openclip or + args.clip_model_is_siglip +): vocab = None tokens = None else: @@ -137,7 +161,10 @@ ftype = 1 if args.use_f32: ftype = 0 -if args.clip_model_is_vision or args.clip_model_is_openclip: +if args.clip_model_is_siglip: + model = SiglipVisionModel.from_pretrained(dir_model) + processor = None +elif args.clip_model_is_vision or args.clip_model_is_openclip: model = CLIPVisionModel.from_pretrained(dir_model) processor = None else: @@ -165,7 +192,7 @@ output_dir = args.output_dir if args.output_dir is not None else dir_model os.makedirs(output_dir, exist_ok=True) output_prefix = os.path.basename(output_dir).replace("ggml_", "") fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf") -fout = GGUFWriter(path=fname_out, arch="clip") +fout = GGUFWriter(path=fname_out, arch="clip", endianess=GGUFEndian.LITTLE if not args.bigendian else GGUFEndian.BIG) fout.add_bool("clip.has_text_encoder", has_text_encoder) fout.add_bool("clip.has_vision_encoder", has_vision_encoder) @@ -187,26 +214,71 @@ else: if has_text_encoder: assert t_hparams is not None assert tokens is not None + if args.clip_model_is_siglip: + text_projection_dim = 0 + else: + text_projection_dim = t_hparams.get("projection_dim", config["projection_dim"]) # text_model hparams fout.add_uint32(k(KEY_CONTEXT_LENGTH, TEXT), t_hparams["max_position_embeddings"]) fout.add_uint32(k(KEY_EMBEDDING_LENGTH, TEXT), t_hparams["hidden_size"]) fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, TEXT), t_hparams["intermediate_size"]) - fout.add_uint32("clip.text.projection_dim", t_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32("clip.text.projection_dim", text_projection_dim) fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, TEXT), t_hparams["num_attention_heads"]) fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, TEXT), t_hparams["layer_norm_eps"]) fout.add_uint32(k(KEY_BLOCK_COUNT, TEXT), t_hparams["num_hidden_layers"]) fout.add_token_list(tokens) + + +def get_non_negative_vision_feature_layers(v_hparams): + """ + Determine the vision feature layer(s) for the llava model, which are indices into the + hidden states of the visual encoder. Note that the hidden states array generally takes the + form: + + [, , ... ] + + so feature indices should be offset as n+1 to get the output of encoder block n. + We convert all vision feature layers to non-negative so that -1 can be used in + the model as an unset value. If no vision feature layer is found, we leave it unset. + """ + num_hidden_layers = v_hparams["num_hidden_layers"] + to_non_negative = lambda layer_idx: layer_idx if layer_idx >= 0 else num_hidden_layers + layer_idx + 1 + feature_layers_key = None + # Key used for llava models in transformers + if "vision_feature_layer" in config: + feature_layers_key = "vision_feature_layer" + # Key used for llava models in the original format + elif "mm_vision_select_layer" in config: + feature_layers_key = "mm_vision_select_layer" + if feature_layers_key is not None: + feature_layers = config[feature_layers_key] + if isinstance(feature_layers, int): + feature_layers = [feature_layers] + return [to_non_negative(feature_layer) for feature_layer in feature_layers] + +# Determine if we have explicitly specified vision feature layers in our config +feature_layers = get_non_negative_vision_feature_layers(v_hparams) + if has_vision_encoder: - # vision_model hparams + # Siglip does not have a visual projector; set projection dim to 0 + if args.clip_model_is_siglip: + visual_projection_dim = 0 + else: + visual_projection_dim = v_hparams.get("projection_dim", config["projection_dim"]) + + # set vision_model hparams fout.add_uint32("clip.vision.image_size", v_hparams["image_size"]) fout.add_uint32("clip.vision.patch_size", v_hparams["patch_size"]) fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), v_hparams["hidden_size"]) fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), v_hparams["intermediate_size"]) - fout.add_uint32("clip.vision.projection_dim", v_hparams.get("projection_dim", config["projection_dim"])) + fout.add_uint32("clip.vision.projection_dim", visual_projection_dim) fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), v_hparams["num_attention_heads"]) fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"]) - block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] + if feature_layers: + block_count = max(feature_layers) + else: + block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"] fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count) # /** # "image_grid_pinpoints": [ @@ -258,7 +330,8 @@ if has_vision_encoder: fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"]) if "mm_projector_type" in v_hparams: fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"]) - + if feature_layers: + fout.add_array("clip.vision.feature_layer", feature_layers) if processor is not None: image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue] @@ -274,7 +347,13 @@ fout.add_bool("clip.use_gelu", use_gelu) if has_llava_projector: - model.vision_model.encoder.layers.pop(-1) + # By default, we drop the last layer for llava projector + # models unless we have explicitly set vision feature layers + if feature_layers is None: + model.vision_model.encoder.layers.pop(-1) + else: + model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)] + projector = torch.load(args.llava_projector) for name, data in projector.items(): name = get_tensor_name(name) diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp new file mode 100644 index 0000000000..c36bb2eda0 --- /dev/null +++ b/examples/llava/gemma3-cli.cpp @@ -0,0 +1,341 @@ +#include "arg.h" +#include "log.h" +#include "common.h" +#include "sampling.h" +#include "clip.h" +#include "stb_image.h" +#include "llama.h" +#include "ggml.h" +#include "console.h" + +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +static bool g_is_generating = false; + +/** + * Please note that this is NOT a production-ready stuff. + * It is a playground for trying Gemma 3 vision capabilities. + * For contributors: please keep this code simple and easy to understand. + */ + +static void show_additional_info(int /*argc*/, char ** argv) { + LOG( + "Experimental CLI for using Gemma 3 vision model\n\n" + "Usage: %s [options] -m --mmproj --image -p \n\n" + " -m and --mmproj are required\n" + " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n", + argv[0] + ); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +static void sigint_handler(int signo) { + if (signo == SIGINT) { + if (g_is_generating) { + g_is_generating = false; + } else { + console::cleanup(); + LOG("\nInterrupted by user\n"); + _exit(130); + } + } +} +#endif + +struct gemma3_context { + struct clip_ctx * ctx_clip = NULL; + common_init_result llama_init; + + llama_model * model; + llama_context * lctx; + const llama_vocab * vocab; + llama_batch batch; + + int n_threads = 1; + llama_pos n_past = 0; + + gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) { + model = llama_init.model.get(); + lctx = llama_init.context.get(); + vocab = llama_model_get_vocab(model); + n_threads = params.cpuparams.n_threads; + batch = llama_batch_init(params.n_batch, 0, 1); + init_clip_model(params); + } + + void init_clip_model(common_params & params) { + const char * clip_path = params.mmproj.c_str(); + ctx_clip = clip_model_load(clip_path, params.verbosity > 1); + } + + ~gemma3_context() { + clip_free(ctx_clip); + } +}; + +struct decode_embd_batch { + std::vector pos; + std::vector n_seq_id; + std::vector seq_id_0; + std::vector seq_ids; + std::vector logits; + llama_batch batch; + decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { + pos .resize(n_tokens); + n_seq_id.resize(n_tokens); + seq_ids .resize(n_tokens + 1); + logits .resize(n_tokens); + seq_id_0.resize(1); + seq_id_0[0] = seq_id; + seq_ids [n_tokens] = nullptr; + batch = { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ embd, + /*pos =*/ pos.data(), + /*n_seq_id =*/ n_seq_id.data(), + /*seq_id =*/ seq_ids.data(), + /*logits =*/ logits.data(), + }; + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = pos_0 + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } +}; + +static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) { + llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true); + common_batch_clear(ctx.batch); + for (llama_token & t : tokens) { + common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false); + } + if (logits_last) { + ctx.batch.logits[ctx.batch.n_tokens - 1] = true; + } + // LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str()); + if (llama_decode(ctx.lctx, ctx.batch)) { + LOG_ERR("Failed to decode text\n"); + return 1; + } + return 0; +} + +static int eval_image(gemma3_context & ctx, std::string & fname) { + std::vector image_embd_v; + int n_embd = llama_model_n_embd(ctx.model); + int n_tokens = 256; + image_embd_v.resize(n_tokens * n_embd); + + bool ok; + struct clip_image_u8 * img_u8 = clip_image_u8_init(); + ok = clip_image_load_from_file(fname.c_str(), img_u8); + if (!ok) { + LOG_ERR("Unable to load image %s\n", fname.c_str()); + clip_image_u8_free(img_u8); + return 2; // non-fatal error + } + + clip_image_f32_batch batch_f32; + ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32); + if (!ok) { + LOG_ERR("Unable to preprocess image\n"); + clip_image_f32_batch_free(&batch_f32); + clip_image_u8_free(img_u8); + return 1; + } + + int64_t t0 = ggml_time_ms(); + LOG("Encoding image %s\n", fname.c_str()); + ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data()); + if (!ok) { + LOG_ERR("Unable to encode image\n"); + clip_image_f32_batch_free(&batch_f32); + clip_image_u8_free(img_u8); + return 1; + } + LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); + + clip_image_f32_batch_free(&batch_f32); + clip_image_u8_free(img_u8); + + // decode image embeddings + int64_t t1 = ggml_time_ms(); + eval_text(ctx, ""); + llama_set_causal_attn(ctx.lctx, false); + decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0); + if (llama_decode(ctx.lctx, batch_img.batch)) { + LOG_ERR("failed to decode image\n"); + return 1; + } + ctx.n_past += n_tokens; + llama_set_causal_attn(ctx.lctx, true); + eval_text(ctx, ""); + LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1); + return 0; +} + +static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) { + for (int i = 0; i < n_predict; i++) { + if (i > n_predict || !g_is_generating) { + printf("\n"); + break; + } + + llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1); + common_sampler_accept(smpl, token_id, true); + + if (llama_vocab_is_eog(ctx.vocab, token_id)) { + printf("\n"); + break; // end of generation + } + + printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str()); + fflush(stdout); + + // eval the token + common_batch_clear(ctx.batch); + common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true); + if (llama_decode(ctx.lctx, ctx.batch)) { + LOG_ERR("failed to decode token\n"); + return 1; + } + } + return 0; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + common_params params; + params.sampling.temp = 0.2; // lower temp by default for better quality + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { + return 1; + } + + common_init(); + + if (params.mmproj.empty()) { + show_additional_info(argc, argv); + return 1; + } + + gemma3_context ctx(params); + printf("%s: %s\n", __func__, params.model.c_str()); + + bool is_single_turn = !params.prompt.empty() && !params.image.empty(); + + struct common_sampler * smpl = common_sampler_init(ctx.model, params.sampling); + int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict; + + // ctrl+C handling + { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + } + + if (eval_text(ctx, "")) { + return 1; + } + + if (is_single_turn) { + g_is_generating = true; + if (eval_text(ctx, "user\n")) { + return 1; + } + for (auto & fname : params.image) { + if (eval_image(ctx, fname)) { + return 1; + } + } + if (eval_text(ctx, params.prompt + "model\n", true)) { + return 1; + } + if (generate_response(ctx, smpl, n_predict)) { + return 1; + } + + } else { + LOG("\n Running in chat mode, available commands:"); + LOG("\n /image load an image"); + LOG("\n /clear clear the chat history"); + LOG("\n /quit or /exit exit the program"); + LOG("\n"); + + if (eval_text(ctx, "user\n")) { + return 1; + } + + while (true) { + g_is_generating = false; + LOG("\n> "); + console::set_display(console::user_input); + std::string line; + console::readline(line, false); + console::set_display(console::reset); + line = string_strip(line); + if (line.empty()) { + continue; + } + if (line == "/quit" || line == "/exit") { + break; + } + if (line == "/clear") { + ctx.n_past = 0; + llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS + LOG("Chat history cleared\n\n"); + continue; + } + g_is_generating = true; + if (line.find("/image") == 0) { + std::string image = line.substr(7); + int res = eval_image(ctx, image); + if (res == 2) { + continue; // image not found + } + if (res) { + return 1; + } + continue; + } + if (eval_text(ctx, line + "model\n", true)) { + return 1; + } + if (generate_response(ctx, smpl, n_predict)) { + return 1; + } + if (eval_text(ctx, "user\n")) { + return 1; + } + } + } + + return 0; +} diff --git a/examples/llava/gemma3_convert_encoder_to_gguf.py b/examples/llava/gemma3_convert_encoder_to_gguf.py new file mode 100644 index 0000000000..241b526b9e --- /dev/null +++ b/examples/llava/gemma3_convert_encoder_to_gguf.py @@ -0,0 +1,307 @@ +import gguf +import argparse +import logging +import sys +import torch +import json +import os +import numpy as np +from typing import cast, ContextManager, Any, Iterator +from pathlib import Path +from torch import Tensor + +logger = logging.getLogger("gemma3-mmproj") + + +# (copied from convert_hf_to_gguf.py) +# tree of lazy tensors +class LazyTorchTensor(gguf.LazyBase): + _tensor_type = torch.Tensor + # to keep the type-checker happy + dtype: torch.dtype + shape: torch.Size + + # only used when converting a torch.Tensor to a np.ndarray + _dtype_map: dict[torch.dtype, type] = { + torch.float16: np.float16, + torch.float32: np.float32, + } + + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 + _dtype_str_map: dict[str, torch.dtype] = { + "F64": torch.float64, + "F32": torch.float32, + "BF16": torch.bfloat16, + "F16": torch.float16, + # "U64": torch.uint64, + "I64": torch.int64, + # "U32": torch.uint32, + "I32": torch.int32, + # "U16": torch.uint16, + "I16": torch.int16, + "U8": torch.uint8, + "I8": torch.int8, + "BOOL": torch.bool, + "F8_E4M3": torch.float8_e4m3fn, + "F8_E5M2": torch.float8_e5m2, + } + + def numpy(self) -> gguf.LazyNumpyTensor: + dtype = self._dtype_map[self.dtype] + return gguf.LazyNumpyTensor( + meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), + args=(self,), + func=(lambda s: s.numpy()) + ) + + @classmethod + def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: + return torch.empty(size=shape, dtype=dtype, device="meta") + + @classmethod + def from_safetensors_slice(cls, st_slice: Any) -> Tensor: + dtype = cls._dtype_str_map[st_slice.get_dtype()] + shape: tuple[int, ...] = tuple(st_slice.get_shape()) + lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) + return cast(torch.Tensor, lazy) + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.numpy: + return args[0].numpy() + + return cls._wrap_fn(func)(*args, **kwargs) + + +class Gemma3VisionTower: + hparams: dict + gguf_writer: gguf.GGUFWriter + fname_out: Path + ftype: gguf.LlamaFileType + + @staticmethod + def load_hparams(dir_model: Path): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + return json.load(f) + + @staticmethod + def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: + part_names: list[str] = [] + for filename in os.listdir(dir_model): + if filename.startswith(prefix) and filename.endswith(suffix): + part_names.append(filename) + part_names.sort() + return part_names + + def __init__(self, + dir_model: Path, + fname_out: Path, + ftype: gguf.LlamaFileType, + is_big_endian: bool,): + hparams = Gemma3VisionTower.load_hparams(dir_model) + self.hparams = hparams + self.fname_out = fname_out + self.ftype = ftype + endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess) + + text_config = hparams["text_config"] + vision_config = hparams["vision_config"] + + assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration" + assert text_config is not None + assert vision_config is not None + + self.gguf_writer.add_string ("clip.projector_type", "gemma3") + self.gguf_writer.add_bool ("clip.has_text_encoder", False) + self.gguf_writer.add_bool ("clip.has_vision_encoder", True) + self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy + self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"]) + self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"]) + self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"]) + self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"]) + self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"]) + self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"]) + self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"]) + self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6)) + # default values taken from HF tranformers code + self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5]) + self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5]) + self.gguf_writer.add_bool ("clip.use_gelu", True) + + # load tensors + for name, data_torch in self.get_tensors(dir_model): + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + self.add_tensor(name, data_torch) + + def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]: + part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors") + tensor_names_from_parts: set[str] = set() + for part_name in part_names: + logger.info(f"gguf: loading model part '{part_name}'") + from safetensors import safe_open + ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu")) + with ctx as model_part: + tensor_names_from_parts.update(model_part.keys()) + + for name in model_part.keys(): + data = model_part.get_slice(name) + data = LazyTorchTensor.from_safetensors_slice(data) + yield name, data + + def add_tensor(self, name: str, data_torch: Tensor): + is_1d = len(data_torch.shape) == 1 + is_embd = ".embeddings." in name + old_dtype = data_torch.dtype + can_quantize = not is_1d and not is_embd + data_qtype = gguf.GGMLQuantizationType.F32 + + # this is to support old checkpoint + # TODO: remove this when we have the final model + name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.") + name = name.replace("multimodal_projector.", "multi_modal_projector.") + + # filter only vision tensors + if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."): + return + # prefix + name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.") + name = name.replace("vision_tower.vision_model.", "v.") + # projector and input embd + name = name.replace(".embeddings.patch_embedding.", ".patch_embd.") + name = name.replace(".embeddings.position_embedding.", ".position_embd.") + name = name.replace( + "multi_modal_projector.mm_input_projection_weight", + "mm.input_projection.weight" + ) + name = name.replace( + "multi_modal_projector.mm_soft_emb_norm.weight", + "mm.soft_emb_norm.weight" + ) + name = name.replace("post_layernorm.", "post_ln.") + # each block + name = name.replace(".self_attn.k_proj.", ".attn_k.") + name = name.replace(".self_attn.v_proj.", ".attn_v.") + name = name.replace(".self_attn.q_proj.", ".attn_q.") + name = name.replace(".self_attn.out_proj.", ".attn_out.") + name = name.replace(".layer_norm1.", ".ln1.") + name = name.replace(".layer_norm2.", ".ln2.") + name = name.replace(".mlp.fc1.", ".ffn_down.") + name = name.replace(".mlp.fc2.", ".ffn_up.") + + if can_quantize: + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + else: + raise ValueError(f"Unsupported file type: {self.ftype}") + + # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + data = data_torch.numpy() + + try: + data = gguf.quants.quantize(data, data_qtype) + except Exception as e: + logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" + logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + + self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) + + def write(self): + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert Gemma 3 vision tower safetensors to GGUF format",) + parser.add_argument( + "--outfile", type=Path, default="mmproj.gguf", + help="path to write to", + ) + parser.add_argument( + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", + help="output format", + ) + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file", + nargs="?", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) + + args = parser.parse_args() + if args.model is None: + parser.error("the following arguments are required: model") + return args + + +def main() -> None: + args = parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + dir_model = args.model + + if not dir_model.is_dir(): + logger.error(f'Error: {args.model} is not a directory') + sys.exit(1) + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + } + + logger.info(f"Loading model: {dir_model.name}") + + with torch.inference_mode(): + gemma3_vision_tower = Gemma3VisionTower( + dir_model=dir_model, + fname_out=args.outfile, + ftype=ftype_map[args.outtype], + is_big_endian=args.bigendian, + ) + gemma3_vision_tower.write() + + +if __name__ == '__main__': + main() + diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 3007140459..518aad3f1f 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -353,9 +353,10 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); const int32_t * image_grid = clip_image_grid(ctx_clip); + const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip); std::vector> grid_pinpoints; - for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) { + for (size_t i = 0; i < num_gridpoints; i += 2) { grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); } @@ -405,7 +406,8 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * } bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - int num_max_patches = 6; + // Granite vision uses up to 10 patches + base patch + int num_max_patches = 11; if (clip_is_minicpmv(ctx_clip)) { num_max_patches = 10; } diff --git a/examples/llava/llava_surgery_v2.py b/examples/llava/llava_surgery_v2.py index 2d5b32fe6b..b07c3e323c 100644 --- a/examples/llava/llava_surgery_v2.py +++ b/examples/llava/llava_surgery_v2.py @@ -33,6 +33,33 @@ def save_model(model, file_path, file_type): else: torch.save(model, file_path) +# Helpers to match weight names from specific components or +# determine if a saved shard contains that component +def is_vision_tower(weight_name): + return ( + weight_name.startswith("model.vision_tower") or + weight_name.startswith("vit.") or + weight_name.startswith("vision_tower") + ) + +def is_newline(weight_name): + return ( + weight_name.startswith("model.image_newline") or + weight_name.startswith("image_newline") + ) + +def is_mm_projector(weight_name): + return ( + weight_name.startswith("model.mm_projector") or + weight_name.startswith("vision_proj.") or + weight_name.startswith("multi_modal_projector") + ) + +def newline_criteria(checkpoint): + return any(is_newline(k) for k in checkpoint.keys()) + +def proj_criteria(checkpoint): + return any(is_mm_projector(k) for k in checkpoint.keys()) # Adapted function to clean vision tower from checkpoint def clean_vision_tower_from_checkpoint(checkpoint_path): @@ -40,7 +67,7 @@ def clean_vision_tower_from_checkpoint(checkpoint_path): # file_type = 'pytorch' model_path = os.path.dirname(checkpoint_path) print(f"Searching for vision tower tensors in {checkpoint_path}") - clip_tensors = [k for k, v in checkpoint.items() if (k.startswith("model.vision_tower") or k.startswith("vit."))] + clip_tensors = [k for k, v in checkpoint.items() if is_vision_tower(k)] if len(clip_tensors) > 0: print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}") @@ -84,12 +111,6 @@ def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector): return newline_checkpoint_path, projector_checkpoint_path -def newline_criteria(checkpoint): - return any(k.startswith("model.image_newline") for k in checkpoint.keys()) - -def proj_criteria(checkpoint): - return any(k.startswith("model.mm_projector") or k.startswith("vision_proj.") for k in checkpoint.keys()) - # Command-line interface setup ap = argparse.ArgumentParser() @@ -123,14 +144,14 @@ first_checkpoint = None if newline_checkpoint_path is not None: print(f"Taking newline from {newline_checkpoint_path}") first_checkpoint, file_type = load_model(newline_checkpoint_path) - first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")] + first_mm_tensors = [k for k, v in first_checkpoint.items() if is_newline(k)] # Load the checkpoint mm_tensors = [] last_checkpoint = None if projector_checkpoint_path is not None: last_checkpoint, file_type = load_model(projector_checkpoint_path) - mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("model.mm_projector") or k.startswith("vision_proj.")] + mm_tensors = [k for k, v in last_checkpoint.items() if is_mm_projector(k)] if len(mm_tensors) == 0: if last_checkpoint is not None: @@ -155,5 +176,5 @@ if len(projector) > 0: save_model(projector, f"{args.model}/llava.projector", 'pytorch') print("Done!") -print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.") +print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.") print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.") diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 53d902d616..12f536cf5c 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -86,7 +86,11 @@ static struct clip_ctx * clip_init_context(common_params * params) { if (prompt.empty()) { prompt = "describe the image in detail."; } - auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + struct clip_context_params clip_params = { + /* use_gpu */ params->n_gpu_layers != 0, + /* verbosity */ params->verbosity, + }; + auto * ctx_clip = clip_init(clip_path, clip_params); return ctx_clip; } @@ -148,19 +152,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); if (num_image_embeds > 1) { - size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { - for (size_t j = 0; j < num_image_embeds_col; ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == num_image_embeds_col - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + if (has_minicpmv_projector == 2) { + size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { + for (size_t j = 0; j < num_image_embeds_col; ++j) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (j == num_image_embeds_col - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } + } + } + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + } + else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) { + size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); + for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { + for (size_t j = 0; j < num_image_embeds_col; ++j) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (j == num_image_embeds_col - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } } } } - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); } LOG_INF("%s: image token past: %d\n", __func__, n_past); } diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py index 9b196757f0..cfe0961f98 100644 --- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py +++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py @@ -597,7 +597,6 @@ elif args.minicpmv_projector is not None: fname_middle = "mmproj-" has_text_encoder = False has_minicpmv_projector = True - minicpmv_version = 4 elif args.vision_only: fname_middle = "vision-" has_text_encoder = False diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 2f0898e628..7df20aee17 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -7,6 +7,7 @@ #include #include #include +#include struct ngram_data { bool active = false; @@ -95,7 +96,7 @@ int main(int argc, char ** argv) { llama_decode(ctx, llama_batch_get_one(&inp.back(), 1)); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } const auto t_enc_end = ggml_time_us(); @@ -437,17 +438,17 @@ int main(int argc, char ** argv) { // KV cache management // if no verification token matched, we simply remove all cells from this batch -> no fragmentation - llama_kv_cache_seq_rm(ctx, -1, n_past, -1); + llama_kv_self_seq_rm(ctx, -1, n_past, -1); if (seq_id_best != 0) { // if a verification token matched, we keep the best sequence and remove the rest // this leads to some KV cache fragmentation - llama_kv_cache_seq_keep(ctx, seq_id_best); - llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1); - llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1); + llama_kv_self_seq_keep(ctx, seq_id_best); + llama_kv_self_seq_cp (ctx, seq_id_best, 0, -1, -1); + llama_kv_self_seq_rm (ctx, seq_id_best, -1, -1); for (int s = 1; s < W + G + 1; ++s) { - llama_kv_cache_seq_cp(ctx, 0, s, -1, -1); + llama_kv_self_seq_cp(ctx, 0, s, -1, -1); } } } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index dbd0444ec8..4ae93b2a5e 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -192,7 +192,7 @@ int main(int argc, char ** argv){ // KV cache management // clean the cache of draft tokens that weren't accepted - llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + llama_kv_self_seq_rm(ctx, 0, n_past, -1); common_batch_clear(batch_tgt); common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); diff --git a/examples/main/README.md b/examples/main/README.md index f7c2497294..e4b3590b5d 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -27,12 +27,24 @@ Once downloaded, place your model in the models folder in llama.cpp. ##### Input prompt (One-and-done) ```bash -./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" ``` ##### Conversation mode (Allow for continuous interaction with the model) ```bash -./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```bash +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```bash +./llama-cli -m models/gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" ``` ##### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): @@ -44,12 +56,24 @@ Once downloaded, place your model in the models folder in llama.cpp. ##### Input prompt (One-and-done) ```powershell -./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --prompt "Once upon a time" +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -no-cnv --prompt "Once upon a time" ``` ##### Conversation mode (Allow for continuous interaction with the model) ```powershell -./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf -cnv --chat-template gemma +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --chat-template gemma +``` + +##### Conversation mode using built-in jinja chat template + +```powershell +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja +``` + +##### One-and-done query using jinja with custom system prompt and a starting prompt + +```powershell +./llama-cli.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --jinja --single-turn -sys "You are a helpful assistant" -p "Hello" ``` #### Infinite text from a starting prompt (you can use `Ctrl-C` to stop it): @@ -77,6 +101,8 @@ The `llama-cli` program provides several ways to interact with the LLaMA models - `--prompt PROMPT`: Provide a prompt directly as a command-line option. - `--file FNAME`: Provide a file containing a prompt or multiple prompts. +- `--system-prompt PROMPT`: Provide a system prompt (will otherwise use the default one in the chat template (if provided)). +- `--system-prompt-file FNAME`: Provide a file containing a system prompt. - `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) ## Interaction @@ -89,7 +115,10 @@ In interactive mode, users can participate in text generation by injecting their - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. -- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: false) +- `-cnv, --conversation`: Run the program in conversation mode (does not print special tokens and suffix/prefix, use default or provided chat template) (default: true if chat template found) +- `-no-cnv`: Disable conversation mode (default: false) +- `-st, --single-turn`: Only process a single conversation turn (user input) and then exit. +- `--jinja`: Enable jinja chat template parser, will use the model's built-in template or a user-provided one (default: false) - `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text. By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. @@ -125,6 +154,8 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t Example usage: `--chat-template gemma` +`--chat-template-file FNAME`: Load a custom jinja chat template from an external file, useful if the model contains outdated or incompatible template, some examples can be found in models/templates. Up-to-date chat templates can be downloaded from Hugging Face using scripts/get_chat_template.py + ## Context Management During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index cf8659b037..fd7410a646 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -31,8 +31,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant"; - static llama_context ** g_ctx; static llama_model ** g_model; static common_sampler ** g_smpl; @@ -47,8 +45,8 @@ static void print_usage(int argc, char ** argv) { (void) argc; LOG("\nexample usage:\n"); - LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); - LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128 -no-cnv\n", argv[0]); + LOG("\n chat (conversation): %s -m your_model.gguf -sys \"You are a helpful assistant\"\n", argv[0]); LOG("\n"); } @@ -219,6 +217,10 @@ int main(int argc, char ** argv) { // print chat template example in conversation mode if (params.conversation_mode) { if (params.enable_chat_template) { + if (!params.prompt.empty() && params.system_prompt.empty()) { + LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n"); + } + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str()); } else { LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); @@ -263,6 +265,7 @@ int main(int argc, char ** argv) { std::vector embd_inp; + bool waiting_for_first_input = false; auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) { common_chat_msg new_msg; new_msg.role = role; @@ -273,13 +276,34 @@ int main(int argc, char ** argv) { return formatted; }; + std::string prompt; { - auto prompt = (params.conversation_mode && params.enable_chat_template) - // format the system prompt in conversation mode (fallback to default if empty) - ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt) + if (params.conversation_mode && params.enable_chat_template) { + if (!params.system_prompt.empty()) { + // format the system prompt (will use template default if empty) + chat_add_and_format("system", params.system_prompt); + } + + if (!params.prompt.empty()) { + // format and append the user prompt + chat_add_and_format("user", params.prompt); + } else { + waiting_for_first_input = true; + } + + if (!params.system_prompt.empty() || !params.prompt.empty()) { + common_chat_templates_inputs inputs; + inputs.messages = chat_msgs; + inputs.add_generation_prompt = !params.prompt.empty(); + + prompt = common_chat_templates_apply(chat_templates.get(), inputs).prompt; + } + } else { // otherwise use the prompt as is - : params.prompt; - if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { + prompt = params.prompt; + } + + if (params.interactive_first || !prompt.empty() || session_tokens.empty()) { LOG_DBG("tokenize the prompt\n"); embd_inp = common_tokenize(ctx, prompt, true, true); } else { @@ -292,7 +316,7 @@ int main(int argc, char ** argv) { } // Should not run without any tokens - if (embd_inp.empty()) { + if (!waiting_for_first_input && embd_inp.empty()) { if (add_bos) { embd_inp.push_back(llama_vocab_bos(vocab)); LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); @@ -330,7 +354,7 @@ int main(int argc, char ** argv) { } // remove any "future" tokens that we might have inherited from the previous session - llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); + llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1); } LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", @@ -352,7 +376,12 @@ int main(int argc, char ** argv) { } if (params.conversation_mode) { - params.interactive_first = true; + if (params.single_turn && !params.prompt.empty()) { + params.interactive = false; + params.interactive_first = false; + } else { + params.interactive_first = true; + } } // enable interactive mode if interactive start is specified @@ -476,8 +505,8 @@ int main(int argc, char ** argv) { LOG_INF( " - Press Ctrl+C to interject at any time.\n"); #endif LOG_INF( "%s", control_message); - if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) { - LOG_INF( " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n"); + if (params.conversation_mode && params.enable_chat_template && params.system_prompt.empty()) { + LOG_INF( " - Not using system message. To change it, set a different value via -sys PROMPT\n"); } LOG_INF("\n"); @@ -573,8 +602,8 @@ int main(int argc, char ** argv) { LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + llama_kv_self_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); n_past -= n_discard; @@ -597,9 +626,9 @@ int main(int argc, char ** argv) { LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); - llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); - llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); - llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); + llama_kv_self_seq_add(ctx, 0, ga_i, n_past, ib*bd); + llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); + llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd, dd); n_past -= bd; @@ -773,7 +802,7 @@ int main(int argc, char ** argv) { } // deal with end of generation tokens in interactive mode - if (llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { + if (!waiting_for_first_input && llama_vocab_is_eog(vocab, common_sampler_last(smpl))) { LOG_DBG("found an EOG token\n"); if (params.interactive) { @@ -793,12 +822,17 @@ int main(int argc, char ** argv) { } // if current token is not EOG, we add it to current assistant message - if (params.conversation_mode) { + if (params.conversation_mode && !waiting_for_first_input) { const auto id = common_sampler_last(smpl); assistant_ss << common_token_to_piece(ctx, id, false); + + if (!prompt.empty()) { + prompt.clear(); + is_interacting = false; + } } - if (n_past > 0 && is_interacting) { + if ((n_past > 0 || waiting_for_first_input) && is_interacting) { LOG_DBG("waiting for user input\n"); if (params.conversation_mode) { @@ -888,11 +922,17 @@ int main(int argc, char ** argv) { input_echo = false; // do not echo this again } - if (n_past > 0) { + if (n_past > 0 || waiting_for_first_input) { if (is_interacting) { common_sampler_reset(smpl); } is_interacting = false; + + if (waiting_for_first_input && params.single_turn) { + params.interactive = false; + params.interactive_first = false; + } + waiting_for_first_input = false; } } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7ef43d5e12..588632f043 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -12,6 +12,7 @@ #include #include #include +#include // trim whitespace from the beginning and end of a string static std::string trim(const std::string & str) { @@ -201,7 +202,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences for (int32_t i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("\n"); @@ -233,9 +234,9 @@ int main(int argc, char ** argv) { if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache for (int i = 1; i <= n_clients; ++i) { - llama_kv_cache_seq_rm(ctx, i, -1, -1); + llama_kv_self_seq_rm(ctx, i, -1, -1); // but keep the system prompt - llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); + llama_kv_self_seq_cp(ctx, 0, i, -1, -1); } LOG_INF("%s: clearing the KV cache\n", __func__); @@ -371,8 +372,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); - llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); + llama_kv_self_seq_rm(ctx, client.id + 1, -1, -1); + llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 5953928d47..ea3a6c1fca 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -7,6 +7,7 @@ #include #include #include +#include static void print_usage(int, char ** argv) { LOG("\nexample usage:\n"); @@ -132,11 +133,11 @@ int main(int argc, char ** argv) { const int ib = i/n_batch - 1; const int bd = n_batch_grp*(n_grp - 1); - llama_kv_cache_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); - llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); - llama_kv_cache_update (ctx); + llama_kv_self_seq_add (ctx, 0, n_past - n_batch, n_past, ib*bd); + llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } common_batch_clear(batch); @@ -166,12 +167,12 @@ int main(int argc, char ** argv) { LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; common_batch_clear(batch); @@ -197,12 +198,12 @@ int main(int argc, char ** argv) { if (n_discard > 0) { LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); - llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); - //llama_kv_cache_defrag (ctx); - llama_kv_cache_update (ctx); + llama_kv_self_seq_rm (ctx, 0, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); + //llama_kv_self_defrag (ctx); + llama_kv_self_update (ctx); - n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1; + n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1; } } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 5d07421e82..8c413f7d66 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); @@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); for (int j = 0; j < num_batches; ++j) { const int batch_start = start + j * n_batch; @@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) { return; } - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1203,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params) return; } - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1575,7 +1575,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par return; } - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { @@ -1765,7 +1765,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { } // clear the KV cache - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); llama_batch batch = llama_batch_init(n_batch, 0, 1); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index bd2f734670..dd07ab9b37 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,6 +1,6 @@ #include "ggml.h" #include "llama.h" -#include "llama-context.h" +#include "llama-model.h" #include "common.h" #include @@ -328,7 +328,7 @@ int main(int argc, char ** argv) { } } - const auto & tensors = llama_internal_get_tensor_map(ctx); + const auto & tensors = llama_internal_get_tensor_map(model); // check layer tensors int included_layers = 0; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 8d47b17b6b..a4468b1698 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -8,6 +8,7 @@ #include #include #include +#include struct quant_option { std::string name; diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 2439022a22..0efe20d4b3 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { // clear previous kv_cache values (irrelevant for embeddings) - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); diff --git a/examples/run/linenoise.cpp/linenoise.cpp b/examples/run/linenoise.cpp/linenoise.cpp index a68f12a1a0..9cb9399003 100644 --- a/examples/run/linenoise.cpp/linenoise.cpp +++ b/examples/run/linenoise.cpp/linenoise.cpp @@ -75,7 +75,7 @@ * * DSR (Device Status Report) * Sequence: ESC [ 6 n - * Effect: reports the current cusor position as ESC [ n ; m R + * Effect: reports the current cursor position as ESC [ n ; m R * where n is the row and m is the column * * When multi line mode is enabled, we also use an additional escape @@ -107,6 +107,7 @@ # include # include +# include # include # include # include @@ -205,9 +206,9 @@ class File { int fd = -1; }; -__attribute__((format(printf, 1, 2))) -/* Debugging function. */ #if 0 +/* Debugging function. */ +__attribute__((format(printf, 1, 2))) static void lndebug(const char *fmt, ...) { static File file; if (file.file == nullptr) { @@ -222,11 +223,469 @@ static void lndebug(const char *fmt, ...) { fflush(file.file); } } -#else -static void lndebug(const char *, ...) { -} #endif +/* ========================== Encoding functions ============================= */ + +/* Get length of previous UTF8 codepoint */ +static size_t prevUtf8CodePointLen(const char * buf, int pos) { + int end = pos--; + while (pos >= 0 && ((unsigned char) buf[pos] & 0xC0) == 0x80) { + pos--; + } + return end - pos; +} + +/* Convert UTF8 to Unicode code point */ +static size_t utf8BytesToCodePoint(const char * buf, size_t len, int * cp) { + if (len) { + unsigned char byte = buf[0]; + if ((byte & 0x80) == 0) { + *cp = byte; + return 1; + } else if ((byte & 0xE0) == 0xC0) { + if (len >= 2) { + *cp = (((unsigned long) (buf[0] & 0x1F)) << 6) | ((unsigned long) (buf[1] & 0x3F)); + return 2; + } + } else if ((byte & 0xF0) == 0xE0) { + if (len >= 3) { + *cp = (((unsigned long) (buf[0] & 0x0F)) << 12) | (((unsigned long) (buf[1] & 0x3F)) << 6) | + ((unsigned long) (buf[2] & 0x3F)); + return 3; + } + } else if ((byte & 0xF8) == 0xF0) { + if (len >= 4) { + *cp = (((unsigned long) (buf[0] & 0x07)) << 18) | (((unsigned long) (buf[1] & 0x3F)) << 12) | + (((unsigned long) (buf[2] & 0x3F)) << 6) | ((unsigned long) (buf[3] & 0x3F)); + return 4; + } + } + } + return 0; +} + +/* Check if the code is a wide character */ +static const unsigned long wideCharTable[][2] = { + /* BEGIN: WIDE CHAR TABLE */ + { 0x1100, 0x115F }, + { 0x231A, 0x231B }, + { 0x2329, 0x232A }, + { 0x23E9, 0x23EC }, + { 0x23F0, 0x23F0 }, + { 0x23F3, 0x23F3 }, + { 0x25FD, 0x25FE }, + { 0x2614, 0x2615 }, + { 0x2630, 0x2637 }, + { 0x2648, 0x2653 }, + { 0x267F, 0x267F }, + { 0x268A, 0x268F }, + { 0x2693, 0x2693 }, + { 0x26A1, 0x26A1 }, + { 0x26AA, 0x26AB }, + { 0x26BD, 0x26BE }, + { 0x26C4, 0x26C5 }, + { 0x26CE, 0x26CE }, + { 0x26D4, 0x26D4 }, + { 0x26EA, 0x26EA }, + { 0x26F2, 0x26F3 }, + { 0x26F5, 0x26F5 }, + { 0x26FA, 0x26FA }, + { 0x26FD, 0x26FD }, + { 0x2705, 0x2705 }, + { 0x270A, 0x270B }, + { 0x2728, 0x2728 }, + { 0x274C, 0x274C }, + { 0x274E, 0x274E }, + { 0x2753, 0x2755 }, + { 0x2757, 0x2757 }, + { 0x2795, 0x2797 }, + { 0x27B0, 0x27B0 }, + { 0x27BF, 0x27BF }, + { 0x2B1B, 0x2B1C }, + { 0x2B50, 0x2B50 }, + { 0x2B55, 0x2B55 }, + { 0x2E80, 0x2E99 }, + { 0x2E9B, 0x2EF3 }, + { 0x2F00, 0x2FD5 }, + { 0x2FF0, 0x303E }, + { 0x3041, 0x3096 }, + { 0x3099, 0x30FF }, + { 0x3105, 0x312F }, + { 0x3131, 0x318E }, + { 0x3190, 0x31E5 }, + { 0x31EF, 0x321E }, + { 0x3220, 0x3247 }, + { 0x3250, 0xA48C }, + { 0xA490, 0xA4C6 }, + { 0xA960, 0xA97C }, + { 0xAC00, 0xD7A3 }, + { 0xF900, 0xFAFF }, + { 0xFE10, 0xFE19 }, + { 0xFE30, 0xFE52 }, + { 0xFE54, 0xFE66 }, + { 0xFE68, 0xFE6B }, + { 0xFF01, 0xFF60 }, + { 0xFFE0, 0xFFE6 }, + { 0x16FE0, 0x16FE4 }, + { 0x16FF0, 0x16FF1 }, + { 0x17000, 0x187F7 }, + { 0x18800, 0x18CD5 }, + { 0x18CFF, 0x18D08 }, + { 0x1AFF0, 0x1AFF3 }, + { 0x1AFF5, 0x1AFFB }, + { 0x1AFFD, 0x1AFFE }, + { 0x1B000, 0x1B122 }, + { 0x1B132, 0x1B132 }, + { 0x1B150, 0x1B152 }, + { 0x1B155, 0x1B155 }, + { 0x1B164, 0x1B167 }, + { 0x1B170, 0x1B2FB }, + { 0x1D300, 0x1D356 }, + { 0x1D360, 0x1D376 }, + { 0x1F004, 0x1F004 }, + { 0x1F0CF, 0x1F0CF }, + { 0x1F18E, 0x1F18E }, + { 0x1F191, 0x1F19A }, + { 0x1F200, 0x1F202 }, + { 0x1F210, 0x1F23B }, + { 0x1F240, 0x1F248 }, + { 0x1F250, 0x1F251 }, + { 0x1F260, 0x1F265 }, + { 0x1F300, 0x1F320 }, + { 0x1F32D, 0x1F335 }, + { 0x1F337, 0x1F37C }, + { 0x1F37E, 0x1F393 }, + { 0x1F3A0, 0x1F3CA }, + { 0x1F3CF, 0x1F3D3 }, + { 0x1F3E0, 0x1F3F0 }, + { 0x1F3F4, 0x1F3F4 }, + { 0x1F3F8, 0x1F43E }, + { 0x1F440, 0x1F440 }, + { 0x1F442, 0x1F4FC }, + { 0x1F4FF, 0x1F53D }, + { 0x1F54B, 0x1F54E }, + { 0x1F550, 0x1F567 }, + { 0x1F57A, 0x1F57A }, + { 0x1F595, 0x1F596 }, + { 0x1F5A4, 0x1F5A4 }, + { 0x1F5FB, 0x1F64F }, + { 0x1F680, 0x1F6C5 }, + { 0x1F6CC, 0x1F6CC }, + { 0x1F6D0, 0x1F6D2 }, + { 0x1F6D5, 0x1F6D7 }, + { 0x1F6DC, 0x1F6DF }, + { 0x1F6EB, 0x1F6EC }, + { 0x1F6F4, 0x1F6FC }, + { 0x1F7E0, 0x1F7EB }, + { 0x1F7F0, 0x1F7F0 }, + { 0x1F90C, 0x1F93A }, + { 0x1F93C, 0x1F945 }, + { 0x1F947, 0x1F9FF }, + { 0x1FA70, 0x1FA7C }, + { 0x1FA80, 0x1FA89 }, + { 0x1FA8F, 0x1FAC6 }, + { 0x1FACE, 0x1FADC }, + { 0x1FADF, 0x1FAE9 }, + { 0x1FAF0, 0x1FAF8 }, + { 0x20000, 0x2FFFD }, + { 0x30000, 0x3FFFD } + /* END: WIDE CHAR TABLE */ +}; + +static const size_t wideCharTableSize = sizeof(wideCharTable) / sizeof(wideCharTable[0]); + +static bool isWideChar(unsigned long cp) { + for (size_t i = 0; i < wideCharTableSize; i++) { + auto first_code = wideCharTable[i][0]; + auto last_code = wideCharTable[i][1]; + if (first_code > cp) { + return false; + } + if (first_code <= cp && cp <= last_code) { + return true; + } + } + return false; +} + +/* Check if the code is a combining character */ +static const unsigned long combiningCharTable[] = { + /* BEGIN: COMBINING CHAR TABLE */ + 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, 0x0308, 0x0309, 0x030A, 0x030B, 0x030C, + 0x030D, 0x030E, 0x030F, 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, 0x0318, 0x0319, + 0x031A, 0x031B, 0x031C, 0x031D, 0x031E, 0x031F, 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, + 0x0327, 0x0328, 0x0329, 0x032A, 0x032B, 0x032C, 0x032D, 0x032E, 0x032F, 0x0330, 0x0331, 0x0332, 0x0333, + 0x0334, 0x0335, 0x0336, 0x0337, 0x0338, 0x0339, 0x033A, 0x033B, 0x033C, 0x033D, 0x033E, 0x033F, 0x0340, + 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347, 0x0348, 0x0349, 0x034A, 0x034B, 0x034C, 0x034D, + 0x034E, 0x034F, 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, 0x0358, 0x0359, 0x035A, + 0x035B, 0x035C, 0x035D, 0x035E, 0x035F, 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, + 0x0368, 0x0369, 0x036A, 0x036B, 0x036C, 0x036D, 0x036E, 0x036F, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, + 0x0591, 0x0592, 0x0593, 0x0594, 0x0595, 0x0596, 0x0597, 0x0598, 0x0599, 0x059A, 0x059B, 0x059C, 0x059D, + 0x059E, 0x059F, 0x05A0, 0x05A1, 0x05A2, 0x05A3, 0x05A4, 0x05A5, 0x05A6, 0x05A7, 0x05A8, 0x05A9, 0x05AA, + 0x05AB, 0x05AC, 0x05AD, 0x05AE, 0x05AF, 0x05B0, 0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, + 0x05B8, 0x05B9, 0x05BA, 0x05BB, 0x05BC, 0x05BD, 0x05BF, 0x05C1, 0x05C2, 0x05C4, 0x05C5, 0x05C7, 0x0610, + 0x0611, 0x0612, 0x0613, 0x0614, 0x0615, 0x0616, 0x0617, 0x0618, 0x0619, 0x061A, 0x064B, 0x064C, 0x064D, + 0x064E, 0x064F, 0x0650, 0x0651, 0x0652, 0x0653, 0x0654, 0x0655, 0x0656, 0x0657, 0x0658, 0x0659, 0x065A, + 0x065B, 0x065C, 0x065D, 0x065E, 0x065F, 0x0670, 0x06D6, 0x06D7, 0x06D8, 0x06D9, 0x06DA, 0x06DB, 0x06DC, + 0x06DF, 0x06E0, 0x06E1, 0x06E2, 0x06E3, 0x06E4, 0x06E7, 0x06E8, 0x06EA, 0x06EB, 0x06EC, 0x06ED, 0x0711, + 0x0730, 0x0731, 0x0732, 0x0733, 0x0734, 0x0735, 0x0736, 0x0737, 0x0738, 0x0739, 0x073A, 0x073B, 0x073C, + 0x073D, 0x073E, 0x073F, 0x0740, 0x0741, 0x0742, 0x0743, 0x0744, 0x0745, 0x0746, 0x0747, 0x0748, 0x0749, + 0x074A, 0x07A6, 0x07A7, 0x07A8, 0x07A9, 0x07AA, 0x07AB, 0x07AC, 0x07AD, 0x07AE, 0x07AF, 0x07B0, 0x07EB, + 0x07EC, 0x07ED, 0x07EE, 0x07EF, 0x07F0, 0x07F1, 0x07F2, 0x07F3, 0x07FD, 0x0816, 0x0817, 0x0818, 0x0819, + 0x081B, 0x081C, 0x081D, 0x081E, 0x081F, 0x0820, 0x0821, 0x0822, 0x0823, 0x0825, 0x0826, 0x0827, 0x0829, + 0x082A, 0x082B, 0x082C, 0x082D, 0x0859, 0x085A, 0x085B, 0x0897, 0x0898, 0x0899, 0x089A, 0x089B, 0x089C, + 0x089D, 0x089E, 0x089F, 0x08CA, 0x08CB, 0x08CC, 0x08CD, 0x08CE, 0x08CF, 0x08D0, 0x08D1, 0x08D2, 0x08D3, + 0x08D4, 0x08D5, 0x08D6, 0x08D7, 0x08D8, 0x08D9, 0x08DA, 0x08DB, 0x08DC, 0x08DD, 0x08DE, 0x08DF, 0x08E0, + 0x08E1, 0x08E3, 0x08E4, 0x08E5, 0x08E6, 0x08E7, 0x08E8, 0x08E9, 0x08EA, 0x08EB, 0x08EC, 0x08ED, 0x08EE, + 0x08EF, 0x08F0, 0x08F1, 0x08F2, 0x08F3, 0x08F4, 0x08F5, 0x08F6, 0x08F7, 0x08F8, 0x08F9, 0x08FA, 0x08FB, + 0x08FC, 0x08FD, 0x08FE, 0x08FF, 0x0900, 0x0901, 0x0902, 0x093A, 0x093C, 0x0941, 0x0942, 0x0943, 0x0944, + 0x0945, 0x0946, 0x0947, 0x0948, 0x094D, 0x0951, 0x0952, 0x0953, 0x0954, 0x0955, 0x0956, 0x0957, 0x0962, + 0x0963, 0x0981, 0x09BC, 0x09C1, 0x09C2, 0x09C3, 0x09C4, 0x09CD, 0x09E2, 0x09E3, 0x09FE, 0x0A01, 0x0A02, + 0x0A3C, 0x0A41, 0x0A42, 0x0A47, 0x0A48, 0x0A4B, 0x0A4C, 0x0A4D, 0x0A51, 0x0A70, 0x0A71, 0x0A75, 0x0A81, + 0x0A82, 0x0ABC, 0x0AC1, 0x0AC2, 0x0AC3, 0x0AC4, 0x0AC5, 0x0AC7, 0x0AC8, 0x0ACD, 0x0AE2, 0x0AE3, 0x0AFA, + 0x0AFB, 0x0AFC, 0x0AFD, 0x0AFE, 0x0AFF, 0x0B01, 0x0B3C, 0x0B3F, 0x0B41, 0x0B42, 0x0B43, 0x0B44, 0x0B4D, + 0x0B55, 0x0B56, 0x0B62, 0x0B63, 0x0B82, 0x0BC0, 0x0BCD, 0x0C00, 0x0C04, 0x0C3C, 0x0C3E, 0x0C3F, 0x0C40, + 0x0C46, 0x0C47, 0x0C48, 0x0C4A, 0x0C4B, 0x0C4C, 0x0C4D, 0x0C55, 0x0C56, 0x0C62, 0x0C63, 0x0C81, 0x0CBC, + 0x0CBF, 0x0CC6, 0x0CCC, 0x0CCD, 0x0CE2, 0x0CE3, 0x0D00, 0x0D01, 0x0D3B, 0x0D3C, 0x0D41, 0x0D42, 0x0D43, + 0x0D44, 0x0D4D, 0x0D62, 0x0D63, 0x0D81, 0x0DCA, 0x0DD2, 0x0DD3, 0x0DD4, 0x0DD6, 0x0E31, 0x0E34, 0x0E35, + 0x0E36, 0x0E37, 0x0E38, 0x0E39, 0x0E3A, 0x0E47, 0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, + 0x0EB1, 0x0EB4, 0x0EB5, 0x0EB6, 0x0EB7, 0x0EB8, 0x0EB9, 0x0EBA, 0x0EBB, 0x0EBC, 0x0EC8, 0x0EC9, 0x0ECA, + 0x0ECB, 0x0ECC, 0x0ECD, 0x0ECE, 0x0F18, 0x0F19, 0x0F35, 0x0F37, 0x0F39, 0x0F71, 0x0F72, 0x0F73, 0x0F74, + 0x0F75, 0x0F76, 0x0F77, 0x0F78, 0x0F79, 0x0F7A, 0x0F7B, 0x0F7C, 0x0F7D, 0x0F7E, 0x0F80, 0x0F81, 0x0F82, + 0x0F83, 0x0F84, 0x0F86, 0x0F87, 0x0F8D, 0x0F8E, 0x0F8F, 0x0F90, 0x0F91, 0x0F92, 0x0F93, 0x0F94, 0x0F95, + 0x0F96, 0x0F97, 0x0F99, 0x0F9A, 0x0F9B, 0x0F9C, 0x0F9D, 0x0F9E, 0x0F9F, 0x0FA0, 0x0FA1, 0x0FA2, 0x0FA3, + 0x0FA4, 0x0FA5, 0x0FA6, 0x0FA7, 0x0FA8, 0x0FA9, 0x0FAA, 0x0FAB, 0x0FAC, 0x0FAD, 0x0FAE, 0x0FAF, 0x0FB0, + 0x0FB1, 0x0FB2, 0x0FB3, 0x0FB4, 0x0FB5, 0x0FB6, 0x0FB7, 0x0FB8, 0x0FB9, 0x0FBA, 0x0FBB, 0x0FBC, 0x0FC6, + 0x102D, 0x102E, 0x102F, 0x1030, 0x1032, 0x1033, 0x1034, 0x1035, 0x1036, 0x1037, 0x1039, 0x103A, 0x103D, + 0x103E, 0x1058, 0x1059, 0x105E, 0x105F, 0x1060, 0x1071, 0x1072, 0x1073, 0x1074, 0x1082, 0x1085, 0x1086, + 0x108D, 0x109D, 0x135D, 0x135E, 0x135F, 0x1712, 0x1713, 0x1714, 0x1732, 0x1733, 0x1752, 0x1753, 0x1772, + 0x1773, 0x17B4, 0x17B5, 0x17B7, 0x17B8, 0x17B9, 0x17BA, 0x17BB, 0x17BC, 0x17BD, 0x17C6, 0x17C9, 0x17CA, + 0x17CB, 0x17CC, 0x17CD, 0x17CE, 0x17CF, 0x17D0, 0x17D1, 0x17D2, 0x17D3, 0x17DD, 0x180B, 0x180C, 0x180D, + 0x180F, 0x1885, 0x1886, 0x18A9, 0x1920, 0x1921, 0x1922, 0x1927, 0x1928, 0x1932, 0x1939, 0x193A, 0x193B, + 0x1A17, 0x1A18, 0x1A1B, 0x1A56, 0x1A58, 0x1A59, 0x1A5A, 0x1A5B, 0x1A5C, 0x1A5D, 0x1A5E, 0x1A60, 0x1A62, + 0x1A65, 0x1A66, 0x1A67, 0x1A68, 0x1A69, 0x1A6A, 0x1A6B, 0x1A6C, 0x1A73, 0x1A74, 0x1A75, 0x1A76, 0x1A77, + 0x1A78, 0x1A79, 0x1A7A, 0x1A7B, 0x1A7C, 0x1A7F, 0x1AB0, 0x1AB1, 0x1AB2, 0x1AB3, 0x1AB4, 0x1AB5, 0x1AB6, + 0x1AB7, 0x1AB8, 0x1AB9, 0x1ABA, 0x1ABB, 0x1ABC, 0x1ABD, 0x1ABF, 0x1AC0, 0x1AC1, 0x1AC2, 0x1AC3, 0x1AC4, + 0x1AC5, 0x1AC6, 0x1AC7, 0x1AC8, 0x1AC9, 0x1ACA, 0x1ACB, 0x1ACC, 0x1ACD, 0x1ACE, 0x1B00, 0x1B01, 0x1B02, + 0x1B03, 0x1B34, 0x1B36, 0x1B37, 0x1B38, 0x1B39, 0x1B3A, 0x1B3C, 0x1B42, 0x1B6B, 0x1B6C, 0x1B6D, 0x1B6E, + 0x1B6F, 0x1B70, 0x1B71, 0x1B72, 0x1B73, 0x1B80, 0x1B81, 0x1BA2, 0x1BA3, 0x1BA4, 0x1BA5, 0x1BA8, 0x1BA9, + 0x1BAB, 0x1BAC, 0x1BAD, 0x1BE6, 0x1BE8, 0x1BE9, 0x1BED, 0x1BEF, 0x1BF0, 0x1BF1, 0x1C2C, 0x1C2D, 0x1C2E, + 0x1C2F, 0x1C30, 0x1C31, 0x1C32, 0x1C33, 0x1C36, 0x1C37, 0x1CD0, 0x1CD1, 0x1CD2, 0x1CD4, 0x1CD5, 0x1CD6, + 0x1CD7, 0x1CD8, 0x1CD9, 0x1CDA, 0x1CDB, 0x1CDC, 0x1CDD, 0x1CDE, 0x1CDF, 0x1CE0, 0x1CE2, 0x1CE3, 0x1CE4, + 0x1CE5, 0x1CE6, 0x1CE7, 0x1CE8, 0x1CED, 0x1CF4, 0x1CF8, 0x1CF9, 0x1DC0, 0x1DC1, 0x1DC2, 0x1DC3, 0x1DC4, + 0x1DC5, 0x1DC6, 0x1DC7, 0x1DC8, 0x1DC9, 0x1DCA, 0x1DCB, 0x1DCC, 0x1DCD, 0x1DCE, 0x1DCF, 0x1DD0, 0x1DD1, + 0x1DD2, 0x1DD3, 0x1DD4, 0x1DD5, 0x1DD6, 0x1DD7, 0x1DD8, 0x1DD9, 0x1DDA, 0x1DDB, 0x1DDC, 0x1DDD, 0x1DDE, + 0x1DDF, 0x1DE0, 0x1DE1, 0x1DE2, 0x1DE3, 0x1DE4, 0x1DE5, 0x1DE6, 0x1DE7, 0x1DE8, 0x1DE9, 0x1DEA, 0x1DEB, + 0x1DEC, 0x1DED, 0x1DEE, 0x1DEF, 0x1DF0, 0x1DF1, 0x1DF2, 0x1DF3, 0x1DF4, 0x1DF5, 0x1DF6, 0x1DF7, 0x1DF8, + 0x1DF9, 0x1DFA, 0x1DFB, 0x1DFC, 0x1DFD, 0x1DFE, 0x1DFF, 0x20D0, 0x20D1, 0x20D2, 0x20D3, 0x20D4, 0x20D5, + 0x20D6, 0x20D7, 0x20D8, 0x20D9, 0x20DA, 0x20DB, 0x20DC, 0x20E1, 0x20E5, 0x20E6, 0x20E7, 0x20E8, 0x20E9, + 0x20EA, 0x20EB, 0x20EC, 0x20ED, 0x20EE, 0x20EF, 0x20F0, 0x2CEF, 0x2CF0, 0x2CF1, 0x2D7F, 0x2DE0, 0x2DE1, + 0x2DE2, 0x2DE3, 0x2DE4, 0x2DE5, 0x2DE6, 0x2DE7, 0x2DE8, 0x2DE9, 0x2DEA, 0x2DEB, 0x2DEC, 0x2DED, 0x2DEE, + 0x2DEF, 0x2DF0, 0x2DF1, 0x2DF2, 0x2DF3, 0x2DF4, 0x2DF5, 0x2DF6, 0x2DF7, 0x2DF8, 0x2DF9, 0x2DFA, 0x2DFB, + 0x2DFC, 0x2DFD, 0x2DFE, 0x2DFF, 0x302A, 0x302B, 0x302C, 0x302D, 0x3099, 0x309A, 0xA66F, 0xA674, 0xA675, + 0xA676, 0xA677, 0xA678, 0xA679, 0xA67A, 0xA67B, 0xA67C, 0xA67D, 0xA69E, 0xA69F, 0xA6F0, 0xA6F1, 0xA802, + 0xA806, 0xA80B, 0xA825, 0xA826, 0xA82C, 0xA8C4, 0xA8C5, 0xA8E0, 0xA8E1, 0xA8E2, 0xA8E3, 0xA8E4, 0xA8E5, + 0xA8E6, 0xA8E7, 0xA8E8, 0xA8E9, 0xA8EA, 0xA8EB, 0xA8EC, 0xA8ED, 0xA8EE, 0xA8EF, 0xA8F0, 0xA8F1, 0xA8FF, + 0xA926, 0xA927, 0xA928, 0xA929, 0xA92A, 0xA92B, 0xA92C, 0xA92D, 0xA947, 0xA948, 0xA949, 0xA94A, 0xA94B, + 0xA94C, 0xA94D, 0xA94E, 0xA94F, 0xA950, 0xA951, 0xA980, 0xA981, 0xA982, 0xA9B3, 0xA9B6, 0xA9B7, 0xA9B8, + 0xA9B9, 0xA9BC, 0xA9BD, 0xA9E5, 0xAA29, 0xAA2A, 0xAA2B, 0xAA2C, 0xAA2D, 0xAA2E, 0xAA31, 0xAA32, 0xAA35, + 0xAA36, 0xAA43, 0xAA4C, 0xAA7C, 0xAAB0, 0xAAB2, 0xAAB3, 0xAAB4, 0xAAB7, 0xAAB8, 0xAABE, 0xAABF, 0xAAC1, + 0xAAEC, 0xAAED, 0xAAF6, 0xABE5, 0xABE8, 0xABED, 0xFB1E, 0xFE00, 0xFE01, 0xFE02, 0xFE03, 0xFE04, 0xFE05, + 0xFE06, 0xFE07, 0xFE08, 0xFE09, 0xFE0A, 0xFE0B, 0xFE0C, 0xFE0D, 0xFE0E, 0xFE0F, 0xFE20, 0xFE21, 0xFE22, + 0xFE23, 0xFE24, 0xFE25, 0xFE26, 0xFE27, 0xFE28, 0xFE29, 0xFE2A, 0xFE2B, 0xFE2C, 0xFE2D, 0xFE2E, 0xFE2F, + 0x101FD, 0x102E0, 0x10376, 0x10377, 0x10378, 0x10379, 0x1037A, 0x10A01, 0x10A02, 0x10A03, 0x10A05, 0x10A06, 0x10A0C, + 0x10A0D, 0x10A0E, 0x10A0F, 0x10A38, 0x10A39, 0x10A3A, 0x10A3F, 0x10AE5, 0x10AE6, 0x10D24, 0x10D25, 0x10D26, 0x10D27, + 0x10D69, 0x10D6A, 0x10D6B, 0x10D6C, 0x10D6D, 0x10EAB, 0x10EAC, 0x10EFC, 0x10EFD, 0x10EFE, 0x10EFF, 0x10F46, 0x10F47, + 0x10F48, 0x10F49, 0x10F4A, 0x10F4B, 0x10F4C, 0x10F4D, 0x10F4E, 0x10F4F, 0x10F50, 0x10F82, 0x10F83, 0x10F84, 0x10F85, + 0x11001, 0x11038, 0x11039, 0x1103A, 0x1103B, 0x1103C, 0x1103D, 0x1103E, 0x1103F, 0x11040, 0x11041, 0x11042, 0x11043, + 0x11044, 0x11045, 0x11046, 0x11070, 0x11073, 0x11074, 0x1107F, 0x11080, 0x11081, 0x110B3, 0x110B4, 0x110B5, 0x110B6, + 0x110B9, 0x110BA, 0x110C2, 0x11100, 0x11101, 0x11102, 0x11127, 0x11128, 0x11129, 0x1112A, 0x1112B, 0x1112D, 0x1112E, + 0x1112F, 0x11130, 0x11131, 0x11132, 0x11133, 0x11134, 0x11173, 0x11180, 0x11181, 0x111B6, 0x111B7, 0x111B8, 0x111B9, + 0x111BA, 0x111BB, 0x111BC, 0x111BD, 0x111BE, 0x111C9, 0x111CA, 0x111CB, 0x111CC, 0x111CF, 0x1122F, 0x11230, 0x11231, + 0x11234, 0x11236, 0x11237, 0x1123E, 0x11241, 0x112DF, 0x112E3, 0x112E4, 0x112E5, 0x112E6, 0x112E7, 0x112E8, 0x112E9, + 0x112EA, 0x11300, 0x11301, 0x1133B, 0x1133C, 0x11340, 0x11366, 0x11367, 0x11368, 0x11369, 0x1136A, 0x1136B, 0x1136C, + 0x11370, 0x11371, 0x11372, 0x11373, 0x11374, 0x113BB, 0x113BC, 0x113BD, 0x113BE, 0x113BF, 0x113C0, 0x113CE, 0x113D0, + 0x113D2, 0x113E1, 0x113E2, 0x11438, 0x11439, 0x1143A, 0x1143B, 0x1143C, 0x1143D, 0x1143E, 0x1143F, 0x11442, 0x11443, + 0x11444, 0x11446, 0x1145E, 0x114B3, 0x114B4, 0x114B5, 0x114B6, 0x114B7, 0x114B8, 0x114BA, 0x114BF, 0x114C0, 0x114C2, + 0x114C3, 0x115B2, 0x115B3, 0x115B4, 0x115B5, 0x115BC, 0x115BD, 0x115BF, 0x115C0, 0x115DC, 0x115DD, 0x11633, 0x11634, + 0x11635, 0x11636, 0x11637, 0x11638, 0x11639, 0x1163A, 0x1163D, 0x1163F, 0x11640, 0x116AB, 0x116AD, 0x116B0, 0x116B1, + 0x116B2, 0x116B3, 0x116B4, 0x116B5, 0x116B7, 0x1171D, 0x1171F, 0x11722, 0x11723, 0x11724, 0x11725, 0x11727, 0x11728, + 0x11729, 0x1172A, 0x1172B, 0x1182F, 0x11830, 0x11831, 0x11832, 0x11833, 0x11834, 0x11835, 0x11836, 0x11837, 0x11839, + 0x1183A, 0x1193B, 0x1193C, 0x1193E, 0x11943, 0x119D4, 0x119D5, 0x119D6, 0x119D7, 0x119DA, 0x119DB, 0x119E0, 0x11A01, + 0x11A02, 0x11A03, 0x11A04, 0x11A05, 0x11A06, 0x11A07, 0x11A08, 0x11A09, 0x11A0A, 0x11A33, 0x11A34, 0x11A35, 0x11A36, + 0x11A37, 0x11A38, 0x11A3B, 0x11A3C, 0x11A3D, 0x11A3E, 0x11A47, 0x11A51, 0x11A52, 0x11A53, 0x11A54, 0x11A55, 0x11A56, + 0x11A59, 0x11A5A, 0x11A5B, 0x11A8A, 0x11A8B, 0x11A8C, 0x11A8D, 0x11A8E, 0x11A8F, 0x11A90, 0x11A91, 0x11A92, 0x11A93, + 0x11A94, 0x11A95, 0x11A96, 0x11A98, 0x11A99, 0x11C30, 0x11C31, 0x11C32, 0x11C33, 0x11C34, 0x11C35, 0x11C36, 0x11C38, + 0x11C39, 0x11C3A, 0x11C3B, 0x11C3C, 0x11C3D, 0x11C3F, 0x11C92, 0x11C93, 0x11C94, 0x11C95, 0x11C96, 0x11C97, 0x11C98, + 0x11C99, 0x11C9A, 0x11C9B, 0x11C9C, 0x11C9D, 0x11C9E, 0x11C9F, 0x11CA0, 0x11CA1, 0x11CA2, 0x11CA3, 0x11CA4, 0x11CA5, + 0x11CA6, 0x11CA7, 0x11CAA, 0x11CAB, 0x11CAC, 0x11CAD, 0x11CAE, 0x11CAF, 0x11CB0, 0x11CB2, 0x11CB3, 0x11CB5, 0x11CB6, + 0x11D31, 0x11D32, 0x11D33, 0x11D34, 0x11D35, 0x11D36, 0x11D3A, 0x11D3C, 0x11D3D, 0x11D3F, 0x11D40, 0x11D41, 0x11D42, + 0x11D43, 0x11D44, 0x11D45, 0x11D47, 0x11D90, 0x11D91, 0x11D95, 0x11D97, 0x11EF3, 0x11EF4, 0x11F00, 0x11F01, 0x11F36, + 0x11F37, 0x11F38, 0x11F39, 0x11F3A, 0x11F40, 0x11F42, 0x11F5A, 0x13440, 0x13447, 0x13448, 0x13449, 0x1344A, 0x1344B, + 0x1344C, 0x1344D, 0x1344E, 0x1344F, 0x13450, 0x13451, 0x13452, 0x13453, 0x13454, 0x13455, 0x1611E, 0x1611F, 0x16120, + 0x16121, 0x16122, 0x16123, 0x16124, 0x16125, 0x16126, 0x16127, 0x16128, 0x16129, 0x1612D, 0x1612E, 0x1612F, 0x16AF0, + 0x16AF1, 0x16AF2, 0x16AF3, 0x16AF4, 0x16B30, 0x16B31, 0x16B32, 0x16B33, 0x16B34, 0x16B35, 0x16B36, 0x16F4F, 0x16F8F, + 0x16F90, 0x16F91, 0x16F92, 0x16FE4, 0x1BC9D, 0x1BC9E, 0x1CF00, 0x1CF01, 0x1CF02, 0x1CF03, 0x1CF04, 0x1CF05, 0x1CF06, + 0x1CF07, 0x1CF08, 0x1CF09, 0x1CF0A, 0x1CF0B, 0x1CF0C, 0x1CF0D, 0x1CF0E, 0x1CF0F, 0x1CF10, 0x1CF11, 0x1CF12, 0x1CF13, + 0x1CF14, 0x1CF15, 0x1CF16, 0x1CF17, 0x1CF18, 0x1CF19, 0x1CF1A, 0x1CF1B, 0x1CF1C, 0x1CF1D, 0x1CF1E, 0x1CF1F, 0x1CF20, + 0x1CF21, 0x1CF22, 0x1CF23, 0x1CF24, 0x1CF25, 0x1CF26, 0x1CF27, 0x1CF28, 0x1CF29, 0x1CF2A, 0x1CF2B, 0x1CF2C, 0x1CF2D, + 0x1CF30, 0x1CF31, 0x1CF32, 0x1CF33, 0x1CF34, 0x1CF35, 0x1CF36, 0x1CF37, 0x1CF38, 0x1CF39, 0x1CF3A, 0x1CF3B, 0x1CF3C, + 0x1CF3D, 0x1CF3E, 0x1CF3F, 0x1CF40, 0x1CF41, 0x1CF42, 0x1CF43, 0x1CF44, 0x1CF45, 0x1CF46, 0x1D167, 0x1D168, 0x1D169, + 0x1D17B, 0x1D17C, 0x1D17D, 0x1D17E, 0x1D17F, 0x1D180, 0x1D181, 0x1D182, 0x1D185, 0x1D186, 0x1D187, 0x1D188, 0x1D189, + 0x1D18A, 0x1D18B, 0x1D1AA, 0x1D1AB, 0x1D1AC, 0x1D1AD, 0x1D242, 0x1D243, 0x1D244, 0x1DA00, 0x1DA01, 0x1DA02, 0x1DA03, + 0x1DA04, 0x1DA05, 0x1DA06, 0x1DA07, 0x1DA08, 0x1DA09, 0x1DA0A, 0x1DA0B, 0x1DA0C, 0x1DA0D, 0x1DA0E, 0x1DA0F, 0x1DA10, + 0x1DA11, 0x1DA12, 0x1DA13, 0x1DA14, 0x1DA15, 0x1DA16, 0x1DA17, 0x1DA18, 0x1DA19, 0x1DA1A, 0x1DA1B, 0x1DA1C, 0x1DA1D, + 0x1DA1E, 0x1DA1F, 0x1DA20, 0x1DA21, 0x1DA22, 0x1DA23, 0x1DA24, 0x1DA25, 0x1DA26, 0x1DA27, 0x1DA28, 0x1DA29, 0x1DA2A, + 0x1DA2B, 0x1DA2C, 0x1DA2D, 0x1DA2E, 0x1DA2F, 0x1DA30, 0x1DA31, 0x1DA32, 0x1DA33, 0x1DA34, 0x1DA35, 0x1DA36, 0x1DA3B, + 0x1DA3C, 0x1DA3D, 0x1DA3E, 0x1DA3F, 0x1DA40, 0x1DA41, 0x1DA42, 0x1DA43, 0x1DA44, 0x1DA45, 0x1DA46, 0x1DA47, 0x1DA48, + 0x1DA49, 0x1DA4A, 0x1DA4B, 0x1DA4C, 0x1DA4D, 0x1DA4E, 0x1DA4F, 0x1DA50, 0x1DA51, 0x1DA52, 0x1DA53, 0x1DA54, 0x1DA55, + 0x1DA56, 0x1DA57, 0x1DA58, 0x1DA59, 0x1DA5A, 0x1DA5B, 0x1DA5C, 0x1DA5D, 0x1DA5E, 0x1DA5F, 0x1DA60, 0x1DA61, 0x1DA62, + 0x1DA63, 0x1DA64, 0x1DA65, 0x1DA66, 0x1DA67, 0x1DA68, 0x1DA69, 0x1DA6A, 0x1DA6B, 0x1DA6C, 0x1DA75, 0x1DA84, 0x1DA9B, + 0x1DA9C, 0x1DA9D, 0x1DA9E, 0x1DA9F, 0x1DAA1, 0x1DAA2, 0x1DAA3, 0x1DAA4, 0x1DAA5, 0x1DAA6, 0x1DAA7, 0x1DAA8, 0x1DAA9, + 0x1DAAA, 0x1DAAB, 0x1DAAC, 0x1DAAD, 0x1DAAE, 0x1DAAF, 0x1E000, 0x1E001, 0x1E002, 0x1E003, 0x1E004, 0x1E005, 0x1E006, + 0x1E008, 0x1E009, 0x1E00A, 0x1E00B, 0x1E00C, 0x1E00D, 0x1E00E, 0x1E00F, 0x1E010, 0x1E011, 0x1E012, 0x1E013, 0x1E014, + 0x1E015, 0x1E016, 0x1E017, 0x1E018, 0x1E01B, 0x1E01C, 0x1E01D, 0x1E01E, 0x1E01F, 0x1E020, 0x1E021, 0x1E023, 0x1E024, + 0x1E026, 0x1E027, 0x1E028, 0x1E029, 0x1E02A, 0x1E08F, 0x1E130, 0x1E131, 0x1E132, 0x1E133, 0x1E134, 0x1E135, 0x1E136, + 0x1E2AE, 0x1E2EC, 0x1E2ED, 0x1E2EE, 0x1E2EF, 0x1E4EC, 0x1E4ED, 0x1E4EE, 0x1E4EF, 0x1E5EE, 0x1E5EF, 0x1E8D0, 0x1E8D1, + 0x1E8D2, 0x1E8D3, 0x1E8D4, 0x1E8D5, 0x1E8D6, 0x1E944, 0x1E945, 0x1E946, 0x1E947, 0x1E948, 0x1E949, 0x1E94A, 0xE0100, + 0xE0101, 0xE0102, 0xE0103, 0xE0104, 0xE0105, 0xE0106, 0xE0107, 0xE0108, 0xE0109, 0xE010A, 0xE010B, 0xE010C, 0xE010D, + 0xE010E, 0xE010F, 0xE0110, 0xE0111, 0xE0112, 0xE0113, 0xE0114, 0xE0115, 0xE0116, 0xE0117, 0xE0118, 0xE0119, 0xE011A, + 0xE011B, 0xE011C, 0xE011D, 0xE011E, 0xE011F, 0xE0120, 0xE0121, 0xE0122, 0xE0123, 0xE0124, 0xE0125, 0xE0126, 0xE0127, + 0xE0128, 0xE0129, 0xE012A, 0xE012B, 0xE012C, 0xE012D, 0xE012E, 0xE012F, 0xE0130, 0xE0131, 0xE0132, 0xE0133, 0xE0134, + 0xE0135, 0xE0136, 0xE0137, 0xE0138, 0xE0139, 0xE013A, 0xE013B, 0xE013C, 0xE013D, 0xE013E, 0xE013F, 0xE0140, 0xE0141, + 0xE0142, 0xE0143, 0xE0144, 0xE0145, 0xE0146, 0xE0147, 0xE0148, 0xE0149, 0xE014A, 0xE014B, 0xE014C, 0xE014D, 0xE014E, + 0xE014F, 0xE0150, 0xE0151, 0xE0152, 0xE0153, 0xE0154, 0xE0155, 0xE0156, 0xE0157, 0xE0158, 0xE0159, 0xE015A, 0xE015B, + 0xE015C, 0xE015D, 0xE015E, 0xE015F, 0xE0160, 0xE0161, 0xE0162, 0xE0163, 0xE0164, 0xE0165, 0xE0166, 0xE0167, 0xE0168, + 0xE0169, 0xE016A, 0xE016B, 0xE016C, 0xE016D, 0xE016E, 0xE016F, 0xE0170, 0xE0171, 0xE0172, 0xE0173, 0xE0174, 0xE0175, + 0xE0176, 0xE0177, 0xE0178, 0xE0179, 0xE017A, 0xE017B, 0xE017C, 0xE017D, 0xE017E, 0xE017F, 0xE0180, 0xE0181, 0xE0182, + 0xE0183, 0xE0184, 0xE0185, 0xE0186, 0xE0187, 0xE0188, 0xE0189, 0xE018A, 0xE018B, 0xE018C, 0xE018D, 0xE018E, 0xE018F, + 0xE0190, 0xE0191, 0xE0192, 0xE0193, 0xE0194, 0xE0195, 0xE0196, 0xE0197, 0xE0198, 0xE0199, 0xE019A, 0xE019B, 0xE019C, + 0xE019D, 0xE019E, 0xE019F, 0xE01A0, 0xE01A1, 0xE01A2, 0xE01A3, 0xE01A4, 0xE01A5, 0xE01A6, 0xE01A7, 0xE01A8, 0xE01A9, + 0xE01AA, 0xE01AB, 0xE01AC, 0xE01AD, 0xE01AE, 0xE01AF, 0xE01B0, 0xE01B1, 0xE01B2, 0xE01B3, 0xE01B4, 0xE01B5, 0xE01B6, + 0xE01B7, 0xE01B8, 0xE01B9, 0xE01BA, 0xE01BB, 0xE01BC, 0xE01BD, 0xE01BE, 0xE01BF, 0xE01C0, 0xE01C1, 0xE01C2, 0xE01C3, + 0xE01C4, 0xE01C5, 0xE01C6, 0xE01C7, 0xE01C8, 0xE01C9, 0xE01CA, 0xE01CB, 0xE01CC, 0xE01CD, 0xE01CE, 0xE01CF, 0xE01D0, + 0xE01D1, 0xE01D2, 0xE01D3, 0xE01D4, 0xE01D5, 0xE01D6, 0xE01D7, 0xE01D8, 0xE01D9, 0xE01DA, 0xE01DB, 0xE01DC, 0xE01DD, + 0xE01DE, 0xE01DF, 0xE01E0, 0xE01E1, 0xE01E2, 0xE01E3, 0xE01E4, 0xE01E5, 0xE01E6, 0xE01E7, 0xE01E8, 0xE01E9, 0xE01EA, + 0xE01EB, 0xE01EC, 0xE01ED, 0xE01EE, 0xE01EF + /* END: COMBINING CHAR TABLE */ +}; + +static const unsigned long combiningCharTableSize = sizeof(combiningCharTable) / sizeof(combiningCharTable[0]); + +static bool isCombiningChar(unsigned long cp) { + for (size_t i = 0; i < combiningCharTableSize; i++) { + auto code = combiningCharTable[i]; + if (code > cp) { + return false; + } + if (code == cp) { + return true; + } + } + return false; +} + +/* Get length of previous grapheme */ +static size_t defaultPrevCharLen(const char * buf, size_t /*buf_len*/, size_t pos, size_t * col_len) { + size_t end = pos; + while (pos > 0) { + size_t len = prevUtf8CodePointLen(buf, pos); + pos -= len; + int cp; + utf8BytesToCodePoint(buf + pos, len, &cp); + if (!isCombiningChar(cp)) { + if (col_len != NULL) { + *col_len = isWideChar(cp) ? 2 : 1; + } + return end - pos; + } + } + /* NOTREACHED */ + return 0; +} + +/* Get length of next grapheme */ +static size_t defaultNextCharLen(const char * buf, size_t buf_len, size_t pos, size_t * col_len) { + size_t beg = pos; + int cp; + size_t len = utf8BytesToCodePoint(buf + pos, buf_len - pos, &cp); + if (isCombiningChar(cp)) { + /* NOTREACHED */ + return 0; + } + if (col_len != NULL) { + *col_len = isWideChar(cp) ? 2 : 1; + } + pos += len; + while (pos < buf_len) { + int cp; + len = utf8BytesToCodePoint(buf + pos, buf_len - pos, &cp); + if (!isCombiningChar(cp)) { + return pos - beg; + } + pos += len; + } + return pos - beg; +} + +/* Read a Unicode from file. */ +static size_t defaultReadCode(int fd, char * buf, size_t buf_len, int * cp) { + if (buf_len < 1) { + return -1; + } + size_t nread = read(fd, &buf[0], 1); + if (nread <= 0) { + return nread; + } + + unsigned char byte = buf[0]; + if ((byte & 0x80) == 0) { + ; + } else if ((byte & 0xE0) == 0xC0) { + if (buf_len < 2) { + return -1; + } + nread = read(fd, &buf[1], 1); + if (nread <= 0) { + return nread; + } + } else if ((byte & 0xF0) == 0xE0) { + if (buf_len < 3) { + return -1; + } + nread = read(fd, &buf[1], 2); + if (nread <= 0) { + return nread; + } + } else if ((byte & 0xF8) == 0xF0) { + if (buf_len < 3) { + return -1; + } + nread = read(fd, &buf[1], 3); + if (nread <= 0) { + return nread; + } + } else { + return -1; + } + + return utf8BytesToCodePoint(buf, buf_len, cp); +} + +/* Set default encoding functions */ +static linenoisePrevCharLen * prevCharLen = defaultPrevCharLen; +static linenoiseNextCharLen * nextCharLen = defaultNextCharLen; +static linenoiseReadCode * readCode = defaultReadCode; + +/* Set used defined encoding functions */ +void linenoiseSetEncodingFunctions(linenoisePrevCharLen * prevCharLenFunc, linenoiseNextCharLen * nextCharLenFunc, + linenoiseReadCode * readCodeFunc) { + prevCharLen = prevCharLenFunc; + nextCharLen = nextCharLenFunc; + readCode = readCodeFunc; +} + /* ======================= Low level terminal handling ====================== */ /* Enable "mask mode". When it is enabled, instead of the input that @@ -408,6 +867,73 @@ static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseComple } } +enum ESC_TYPE { ESC_NULL = 0, ESC_DELETE, ESC_UP, ESC_DOWN, ESC_RIGHT, ESC_LEFT, ESC_HOME, ESC_END }; + +static ESC_TYPE readEscapeSequence(struct linenoiseState * l) { + /* Check if the file input has additional data. */ + struct pollfd pfd; + pfd.fd = l->ifd; + pfd.events = POLLIN; + + auto ret = poll(&pfd, 1, 1); // 1 millisecond timeout + if (ret <= 0) { // -1: error, 0: timeout + return ESC_NULL; + } + + /* Read the next two bytes representing the escape sequence. + * Use two calls to handle slow terminals returning the two + * chars at different times. */ + char seq[3]; + if (read(l->ifd, seq, 1) == -1) { + return ESC_NULL; + } + if (read(l->ifd, seq + 1, 1) == -1) { + return ESC_NULL; + } + + /* ESC [ sequences. */ + if (seq[0] == '[') { + if (seq[1] >= '0' && seq[1] <= '9') { + /* Extended escape, read additional byte. */ + if (read(l->ifd, seq + 2, 1) == -1) { + return ESC_NULL; + } + if (seq[2] == '~') { + switch (seq[1]) { + case '3': + return ESC_DELETE; + } + } + } else { + switch (seq[1]) { + case 'A': + return ESC_UP; + case 'B': + return ESC_DOWN; + case 'C': + return ESC_RIGHT; + case 'D': + return ESC_LEFT; + case 'H': + return ESC_HOME; + case 'F': + return ESC_END; + } + } + } + + /* ESC O sequences. */ + else if (seq[0] == 'O') { + switch (seq[1]) { + case 'H': + return ESC_HOME; + case 'F': + return ESC_END; + } + } + return ESC_NULL; +} + /* This is an helper function for linenoiseEdit*() and is called when the * user types the key in order to complete the string currently in the * input. @@ -418,11 +944,11 @@ static void refreshLineWithCompletion(struct linenoiseState *ls, linenoiseComple * If the function returns non-zero, the caller should handle the * returned value as a byte read from the standard input, and process * it as usually: this basically means that the function may return a byte - * read from the termianl but not processed. Otherwise, if zero is returned, + * read from the terminal but not processed. Otherwise, if zero is returned, * the input was consumed by the completeLine() function to navigate the * possible completions, and the caller should read for the next characters * from stdin. */ -static int completeLine(struct linenoiseState *ls, int keypressed) { +static int completeLine(struct linenoiseState * ls, int keypressed, ESC_TYPE esc_type) { linenoiseCompletions lc; int nwritten; char c = keypressed; @@ -432,31 +958,31 @@ static int completeLine(struct linenoiseState *ls, int keypressed) { linenoiseBeep(); ls->in_completion = 0; } else { - switch(c) { - case 9: /* tab */ - if (ls->in_completion == 0) { - ls->in_completion = 1; - ls->completion_idx = 0; - } else { - ls->completion_idx = (ls->completion_idx + 1) % (lc.len + 1); - if (ls->completion_idx == lc.len) linenoiseBeep(); + if (c == TAB) { + if (ls->in_completion == 0) { + ls->in_completion = 1; + ls->completion_idx = 0; + } else { + ls->completion_idx = (ls->completion_idx + 1) % (lc.len + 1); + if (ls->completion_idx == lc.len) { + linenoiseBeep(); } - c = 0; - break; - case 27: /* escape */ - /* Re-show original buffer */ - if (ls->completion_idx < lc.len) refreshLine(ls); - ls->in_completion = 0; - c = 0; - break; - default: - /* Update buffer and return */ - if (ls->completion_idx < lc.len) { - nwritten = snprintf(ls->buf, ls->buflen, "%s", lc.cvec[ls->completion_idx]); - ls->len = ls->pos = nwritten; - } - ls->in_completion = 0; - break; + } + c = 0; + } else if (c == ESC && esc_type == ESC_NULL) { + /* Re-show original buffer */ + if (ls->completion_idx < lc.len) { + refreshLine(ls); + } + ls->in_completion = 0; + c = 0; + } else { + /* Update buffer and return */ + if (ls->completion_idx < lc.len) { + nwritten = snprintf(ls->buf, ls->buflen, "%s", lc.cvec[ls->completion_idx]); + ls->len = ls->pos = nwritten; + } + ls->in_completion = 0; } /* Show completion or original buffer */ @@ -508,16 +1034,30 @@ void linenoiseAddCompletion(linenoiseCompletions *lc, const char *str) { lc->cvec[lc->len++] = copy.release(); } +/* Get column length from begining of buffer to current byte position */ +static size_t columnPos(const char * buf, size_t buf_len, size_t pos) { + size_t ret = 0; + size_t off = 0; + while (off < pos) { + size_t col_len; + size_t len = nextCharLen(buf, buf_len, off, &col_len); + off += len; + ret += col_len; + } + return ret; +} + /* Helper of refreshSingleLine() and refreshMultiLine() to show hints * to the right of the prompt. */ -static void refreshShowHints(std::string & ab, struct linenoiseState * l, int plen) { +static void refreshShowHints(std::string & ab, struct linenoiseState * l, int pcollen) { char seq[64]; - if (hintsCallback && plen+l->len < l->cols) { + size_t collen = pcollen + columnPos(l->buf, l->len, l->len); + if (hintsCallback && collen < l->cols) { int color = -1, bold = 0; const char *hint = hintsCallback(l->buf,&color,&bold); if (hint) { int hintlen = strlen(hint); - int hintmaxlen = l->cols-(plen+l->len); + int hintmaxlen = l->cols - collen; if (hintlen > hintmaxlen) hintlen = hintmaxlen; if (bold == 1 && color == -1) color = 37; if (color != -1 || bold != 0) @@ -535,6 +1075,50 @@ static void refreshShowHints(std::string & ab, struct linenoiseState * l, int pl } } +/* Check if text is an ANSI escape sequence */ +static int isAnsiEscape(const char * buf, size_t buf_len, size_t * len) { + if (buf_len > 2 && !memcmp("\033[", buf, 2)) { + size_t off = 2; + while (off < buf_len) { + switch (buf[off++]) { + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + case 'G': + case 'H': + case 'J': + case 'K': + case 'S': + case 'T': + case 'f': + case 'm': + *len = off; + return 1; + } + } + } + return 0; +} + +/* Get column length of prompt text */ +static size_t promptTextColumnLen(const char * prompt, size_t plen) { + char buf[LINENOISE_MAX_LINE]; + size_t buf_len = 0; + size_t off = 0; + while (off < plen) { + size_t len; + if (isAnsiEscape(prompt + off, plen - off, &len)) { + off += len; + continue; + } + buf[buf_len++] = prompt[off++]; + } + return columnPos(buf, buf_len, buf_len); +} + /* Single line low level line refresh. * * Rewrite the currently edited line accordingly to the buffer content, @@ -544,19 +1128,21 @@ static void refreshShowHints(std::string & ab, struct linenoiseState * l, int pl * prompt, just write it, or both. */ static void refreshSingleLine(struct linenoiseState *l, int flags) { char seq[64]; - size_t plen = strlen(l->prompt); + size_t pcollen = promptTextColumnLen(l->prompt, strlen(l->prompt)); int fd = l->ofd; char *buf = l->buf; size_t len = l->len; size_t pos = l->pos; std::string ab; - while((plen+pos) >= l->cols) { - buf++; - len--; - pos--; + + while ((pcollen + columnPos(buf, len, pos)) >= l->cols) { + int chlen = nextCharLen(buf, len, 0, NULL); + buf += chlen; + len -= chlen; + pos -= chlen; } - while (plen+len > l->cols) { - len--; + while (pcollen + columnPos(buf, len, len) > l->cols) { + len -= prevCharLen(buf, len, len, NULL); } /* Cursor to left edge */ @@ -574,7 +1160,7 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) { ab.append(buf, len); } /* Show hits if any. */ - refreshShowHints(ab, l, plen); + refreshShowHints(ab, l, pcollen); } /* Erase to right */ @@ -582,13 +1168,43 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) { ab.append(seq); if (flags & REFRESH_WRITE) { /* Move cursor to original position. */ - snprintf(seq,sizeof(seq),"\r\x1b[%dC", (int)(pos+plen)); + snprintf(seq, sizeof(seq), "\r\x1b[%dC", (int) (columnPos(buf, len, pos) + pcollen)); ab.append(seq); } (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */ } +/* Get column length from begining of buffer to current byte position for multiline mode*/ +static size_t columnPosForMultiLine(const char * buf, size_t buf_len, size_t pos, size_t cols, size_t ini_pos) { + size_t ret = 0; + size_t colwid = ini_pos; + + size_t off = 0; + while (off < buf_len) { + size_t col_len; + size_t len = nextCharLen(buf, buf_len, off, &col_len); + + int dif = (int) (colwid + col_len) - (int) cols; + if (dif > 0) { + ret += dif; + colwid = col_len; + } else if (dif == 0) { + colwid = 0; + } else { + colwid += col_len; + } + + if (off >= pos) { + break; + } + off += len; + ret += col_len; + } + + return ret; +} + /* Multi line low level line refresh. * * Rewrite the currently edited line accordingly to the buffer content, @@ -598,11 +1214,13 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) { * prompt, just write it, or both. */ static void refreshMultiLine(struct linenoiseState *l, int flags) { char seq[64]; - int plen = strlen(l->prompt); - int rows = (plen+l->len+l->cols-1)/l->cols; /* rows used by current buf. */ - int rpos = (plen+l->oldpos+l->cols)/l->cols; /* cursor relative row. */ + size_t pcollen = promptTextColumnLen(l->prompt, strlen(l->prompt)); + int colpos = columnPosForMultiLine(l->buf, l->len, l->len, l->cols, pcollen); + int colpos2; /* cursor column position. */ + int rows = (pcollen + colpos + l->cols - 1) / l->cols; /* rows used by current buf. */ + int rpos = (pcollen + l->oldcolpos + l->cols) / l->cols; /* cursor relative row. */ int rpos2; /* rpos after refresh. */ - int col; /* colum position, zero-based. */ + int col; /* column position, zero-based. */ int old_rows = l->oldrows; int fd = l->ofd, j; std::string ab; @@ -611,15 +1229,13 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) { /* First step: clear all the lines used before. To do so start by * going to the last row. */ if (flags & REFRESH_CLEAN) { - if (old_rows-rpos > 0) { - lndebug("go down %d", old_rows-rpos); + if (old_rows - rpos > 0) { snprintf(seq,64,"\x1b[%dB", old_rows-rpos); ab.append(seq); } /* Now for every row clear it, go up. */ - for (j = 0; j < old_rows-1; j++) { - lndebug("clear+up"); + for (j = 0; j < old_rows - 1; j++) { snprintf(seq,64,"\r\x1b[0K\x1b[1A"); ab.append(seq); } @@ -627,11 +1243,13 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) { if (flags & REFRESH_ALL) { /* Clean the top line. */ - lndebug("clear"); snprintf(seq,64,"\r\x1b[0K"); ab.append(seq); } + /* Get column length to cursor position */ + colpos2 = columnPosForMultiLine(l->buf, l->len, l->pos, l->cols, pcollen); + if (flags & REFRESH_WRITE) { /* Write the prompt and the current buffer content */ ab.append(l->prompt); @@ -644,15 +1262,11 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) { } /* Show hits if any. */ - refreshShowHints(ab, l, plen); + refreshShowHints(ab, l, pcollen); /* If we are at the very end of the screen with our prompt, we need to * emit a newline and move the prompt to the first column. */ - if (l->pos && - l->pos == l->len && - (l->pos+plen) % l->cols == 0) - { - lndebug(""); + if (l->pos && l->pos == l->len && (colpos2 + pcollen) % l->cols == 0) { ab.append("\n"); snprintf(seq,64,"\r"); ab.append(seq); @@ -661,19 +1275,16 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) { } /* Move cursor to right position. */ - rpos2 = (plen+l->pos+l->cols)/l->cols; /* Current cursor relative row */ - lndebug("rpos2 %d", rpos2); + rpos2 = (pcollen + colpos2 + l->cols) / l->cols; /* Current cursor relative row */ - /* Go up till we reach the expected positon. */ - if (rows-rpos2 > 0) { - lndebug("go-up %d", rows-rpos2); + /* Go up till we reach the expected position. */ + if (rows - rpos2 > 0) { snprintf(seq,64,"\x1b[%dA", rows-rpos2); ab.append(seq); } /* Set column. */ - col = (plen+(int)l->pos) % (int)l->cols; - lndebug("set col %d", 1+col); + col = (pcollen + colpos2) % l->cols; if (col) snprintf(seq,64,"\r\x1b[%dC", col); else @@ -681,8 +1292,8 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) { ab.append(seq); } - lndebug("\n"); - l->oldpos = l->pos; + l->oldcolpos = colpos2; + (void) !write(fd, ab.c_str(), ab.size()); /* Can't recover from write error. */ } @@ -720,26 +1331,36 @@ void linenoiseShow(struct linenoiseState *l) { /* Insert the character 'c' at cursor current position. * * On error writing to the terminal -1 is returned, otherwise 0. */ -static int linenoiseEditInsert(struct linenoiseState * l, char c) { - if (l->len < l->buflen) { +static int linenoiseEditInsert(struct linenoiseState * l, const char * cbuf, int clen) { + if (l->len + clen <= l->buflen) { if (l->len == l->pos) { - l->buf[l->pos] = c; - l->pos++; - l->len++; + memcpy(&l->buf[l->pos], cbuf, clen); + l->pos += clen; + l->len += clen; + ; l->buf[l->len] = '\0'; - if ((!mlmode && l->plen+l->len < l->cols && !hintsCallback)) { + if ((!mlmode && promptTextColumnLen(l->prompt, l->plen) + columnPos(l->buf, l->len, l->len) < l->cols && + !hintsCallback)) { /* Avoid a full update of the line in the * trivial case. */ - char d = (maskmode==1) ? '*' : c; - if (write(l->ofd,&d,1) == -1) return -1; + if (maskmode == 1) { + static const char d = '*'; + if (write(l->ofd, &d, 1) == -1) { + return -1; + } + } else { + if (write(l->ofd, cbuf, clen) == -1) { + return -1; + } + } } else { refreshLine(l); } } else { - memmove(l->buf+l->pos+1,l->buf+l->pos,l->len-l->pos); - l->buf[l->pos] = c; - l->len++; - l->pos++; + memmove(l->buf + l->pos + clen, l->buf + l->pos, l->len - l->pos); + memcpy(&l->buf[l->pos], cbuf, clen); + l->pos += clen; + l->len += clen; l->buf[l->len] = '\0'; refreshLine(l); } @@ -750,7 +1371,7 @@ static int linenoiseEditInsert(struct linenoiseState * l, char c) { /* Move cursor on the left. */ static void linenoiseEditMoveLeft(struct linenoiseState * l) { if (l->pos > 0) { - l->pos--; + l->pos -= prevCharLen(l->buf, l->len, l->pos, NULL); refreshLine(l); } } @@ -758,7 +1379,7 @@ static void linenoiseEditMoveLeft(struct linenoiseState * l) { /* Move cursor on the right. */ static void linenoiseEditMoveRight(struct linenoiseState * l) { if (l->pos != l->len) { - l->pos++; + l->pos += nextCharLen(l->buf, l->len, l->pos, NULL); refreshLine(l); } } @@ -810,8 +1431,9 @@ static void linenoiseEditHistoryNext(struct linenoiseState * l, int dir) { * position. Basically this is what happens with the "Delete" keyboard key. */ static void linenoiseEditDelete(struct linenoiseState * l) { if (l->len > 0 && l->pos < l->len) { - memmove(l->buf+l->pos,l->buf+l->pos+1,l->len-l->pos-1); - l->len--; + int chlen = nextCharLen(l->buf, l->len, l->pos, NULL); + memmove(l->buf + l->pos, l->buf + l->pos + chlen, l->len - l->pos - chlen); + l->len -= chlen; l->buf[l->len] = '\0'; refreshLine(l); } @@ -820,15 +1442,16 @@ static void linenoiseEditDelete(struct linenoiseState * l) { /* Backspace implementation. */ static void linenoiseEditBackspace(struct linenoiseState * l) { if (l->pos > 0 && l->len > 0) { - memmove(l->buf+l->pos-1,l->buf+l->pos,l->len-l->pos); - l->pos--; - l->len--; + int chlen = prevCharLen(l->buf, l->len, l->pos, NULL); + memmove(l->buf + l->pos - chlen, l->buf + l->pos, l->len - l->pos); + l->pos -= chlen; + l->len -= chlen; l->buf[l->len] = '\0'; refreshLine(l); } } -/* Delete the previosu word, maintaining the cursor at the start of the +/* Delete the previous word, maintaining the cursor at the start of the * current word. */ static void linenoiseEditDeletePrevWord(struct linenoiseState * l) { size_t old_pos = l->pos; @@ -855,7 +1478,7 @@ static void linenoiseEditDeletePrevWord(struct linenoiseState * l) { * each time there is some data arriving in the standard input. * * The user can also call linenoiseEditHide() and linenoiseEditShow() if it - * is required to show some input arriving asyncronously, without mixing + * is required to show some input arriving asynchronously, without mixing * it with the currently edited line. * * When linenoiseEditFeed() returns non-NULL, the user finished with the @@ -878,7 +1501,7 @@ int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, ch l->buflen = buflen; l->prompt = prompt; l->plen = strlen(prompt); - l->oldpos = l->pos = 0; + l->oldcolpos = l->pos = 0; l->len = 0; /* Enter raw mode. */ @@ -890,7 +1513,7 @@ int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, ch /* Buffer starts empty. */ l->buf[0] = '\0'; - l->buflen--; /* Make sure there is always space for the nulterm */ + l->buflen--; /* Make sure there is always space for the nullterm */ /* If stdin is not a tty, stop here with the initialization. We * will actually just read a line from standard input in blocking @@ -907,6 +1530,159 @@ int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, ch const char* linenoiseEditMore = "If you see this, you are misusing the API: when linenoiseEditFeed() is called, if it returns linenoiseEditMore the user is yet editing the line. See the README file for more information."; +static const char * handleEnterKey(struct linenoiseState * l) { + --history_len; + free(history[history_len]); + if (mlmode) { + linenoiseEditMoveEnd(l); + } + if (hintsCallback) { + /* Force a refresh without hints to leave the previous + * line as the user typed it after a newline. */ + linenoiseHintsCallback * hc = hintsCallback; + hintsCallback = NULL; + refreshLine(l); + hintsCallback = hc; + } + + return strdup(l->buf); +} + +static const char * handleCtrlCKey() { + errno = EAGAIN; + return NULL; +} + +static const char * handleCtrlDKey(struct linenoiseState * l) { + if (l->len > 0) { + linenoiseEditDelete(l); + return linenoiseEditMore; + } + + --history_len; + free(history[history_len]); + errno = ENOENT; + return NULL; +} + +static void handleCtrlTKey(struct linenoiseState * l) { + if (l->pos > 0 && l->pos < l->len) { + auto prev_chlen = prevCharLen(l->buf, l->len, l->pos, NULL); + auto curr_chlen = nextCharLen(l->buf, l->len, l->pos, NULL); + + std::string prev_char(prev_chlen, 0); + memcpy(prev_char.data(), l->buf + l->pos - prev_chlen, prev_chlen); + memmove(l->buf + l->pos - prev_chlen, l->buf + l->pos, curr_chlen); + memmove(l->buf + l->pos - prev_chlen + curr_chlen, prev_char.data(), prev_chlen); + + l->pos = l->pos - prev_chlen + curr_chlen; + if (l->pos + prev_chlen != l->len) { + l->pos += prev_chlen; + } + + refreshLine(l); + } +} + +static void handleEscapeSequence(struct linenoiseState * l, int esc_type) { + switch (esc_type) { + case ESC_NULL: + break; + case ESC_DELETE: + linenoiseEditDelete(l); + break; + case ESC_UP: + linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV); + break; + case ESC_DOWN: + linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT); + break; + case ESC_RIGHT: + linenoiseEditMoveRight(l); + break; + case ESC_LEFT: + linenoiseEditMoveLeft(l); + break; + case ESC_HOME: + linenoiseEditMoveHome(l); + break; + case ESC_END: + linenoiseEditMoveEnd(l); + break; + } +} + +static void handleCtrlUKey(struct linenoiseState * l) { + l->buf[0] = '\0'; + l->pos = l->len = 0; + refreshLine(l); +} + +static void handleCtrlKKey(struct linenoiseState * l) { + l->buf[l->pos] = '\0'; + l->len = l->pos; + refreshLine(l); +} + +static const char * processInputCharacter(struct linenoiseState * l, int c, char * cbuf, int nread, int esc_type) { + switch (c) { + case ENTER: + return handleEnterKey(l); + case CTRL_C: + return handleCtrlCKey(); + case BACKSPACE: + case CTRL_H: + linenoiseEditBackspace(l); + break; + case CTRL_D: /* ctrl-d, remove char at right of cursor, or if the + line is empty, act as end-of-file. */ + return handleCtrlDKey(l); + case CTRL_T: + handleCtrlTKey(l); + break; + case CTRL_B: + linenoiseEditMoveLeft(l); + break; + case CTRL_F: + linenoiseEditMoveRight(l); + break; + case CTRL_P: + linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV); + break; + case CTRL_N: + linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT); + break; + case ESC: + handleEscapeSequence(l, esc_type); + break; + default: + if (linenoiseEditInsert(l, cbuf, nread)) { + return NULL; + } + break; + case CTRL_U: /* Ctrl+u, delete the whole line. */ + handleCtrlUKey(l); + break; + case CTRL_K: /* Ctrl+k, delete from current to end of line. */ + handleCtrlKKey(l); + break; + case CTRL_A: /* Ctrl+a, go to the start of the line */ + linenoiseEditMoveHome(l); + break; + case CTRL_E: /* ctrl+e, go to the end of the line */ + linenoiseEditMoveEnd(l); + break; + case CTRL_L: /* ctrl+l, clear screen */ + linenoiseClearScreen(); + refreshLine(l); + break; + case CTRL_W: /* ctrl+w, delete previous word */ + linenoiseEditDeletePrevWord(l); + break; + } + return linenoiseEditMore; +} + /* This function is part of the multiplexed API of linenoise, see the top * comment on linenoiseEditStart() for more information. Call this function * each time there is some data to read from the standard input file @@ -925,163 +1701,33 @@ const char* linenoiseEditMore = "If you see this, you are misusing the API: when * * Some other errno: I/O error. */ -const char *linenoiseEditFeed(struct linenoiseState *l) { - /* Not a TTY, pass control to line reading without character - * count limits. */ +const char * linenoiseEditFeed(struct linenoiseState * l) { + /* Not a TTY, pass control to line reading without character count + * limits. */ if (!isatty(l->ifd)) return linenoiseNoTTY(); - char c; + int c; int nread; - char seq[3]; + char cbuf[32]; - nread = read(l->ifd,&c,1); + nread = readCode(l->ifd, cbuf, sizeof(cbuf), &c); if (nread <= 0) return NULL; + auto esc_type = ESC_NULL; + if (c == ESC) { + esc_type = readEscapeSequence(l); + } + /* Only autocomplete when the callback is set. It returns < 0 when * there was an error reading from fd. Otherwise it will return the * character that should be handled next. */ if ((l->in_completion || c == 9) && completionCallback != NULL) { - c = completeLine(l,c); + c = completeLine(l, c, esc_type); /* Read next character when 0 */ if (c == 0) return linenoiseEditMore; } - switch(c) { - case ENTER: /* enter */ - history_len--; - free(history[history_len]); - if (mlmode) linenoiseEditMoveEnd(l); - if (hintsCallback) { - /* Force a refresh without hints to leave the previous - * line as the user typed it after a newline. */ - linenoiseHintsCallback *hc = hintsCallback; - hintsCallback = NULL; - refreshLine(l); - hintsCallback = hc; - } - return strdup(l->buf); - case CTRL_C: /* ctrl-c */ - errno = EAGAIN; - return NULL; - case BACKSPACE: /* backspace */ - case 8: /* ctrl-h */ - linenoiseEditBackspace(l); - break; - case CTRL_D: /* ctrl-d, remove char at right of cursor, or if the - line is empty, act as end-of-file. */ - if (l->len > 0) { - linenoiseEditDelete(l); - } else { - history_len--; - free(history[history_len]); - errno = ENOENT; - return NULL; - } - break; - case CTRL_T: /* ctrl-t, swaps current character with previous. */ - if (l->pos > 0 && l->pos < l->len) { - int aux = l->buf[l->pos-1]; - l->buf[l->pos-1] = l->buf[l->pos]; - l->buf[l->pos] = aux; - if (l->pos != l->len-1) l->pos++; - refreshLine(l); - } - break; - case CTRL_B: /* ctrl-b */ - linenoiseEditMoveLeft(l); - break; - case CTRL_F: /* ctrl-f */ - linenoiseEditMoveRight(l); - break; - case CTRL_P: /* ctrl-p */ - linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV); - break; - case CTRL_N: /* ctrl-n */ - linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT); - break; - case ESC: /* escape sequence */ - /* Read the next two bytes representing the escape sequence. - * Use two calls to handle slow terminals returning the two - * chars at different times. */ - if (read(l->ifd,seq,1) == -1) break; - if (read(l->ifd,seq+1,1) == -1) break; - - /* ESC [ sequences. */ - if (seq[0] == '[') { - if (seq[1] >= '0' && seq[1] <= '9') { - /* Extended escape, read additional byte. */ - if (read(l->ifd,seq+2,1) == -1) break; - if (seq[2] == '~') { - switch(seq[1]) { - case '3': /* Delete key. */ - linenoiseEditDelete(l); - break; - } - } - } else { - switch(seq[1]) { - case 'A': /* Up */ - linenoiseEditHistoryNext(l, LINENOISE_HISTORY_PREV); - break; - case 'B': /* Down */ - linenoiseEditHistoryNext(l, LINENOISE_HISTORY_NEXT); - break; - case 'C': /* Right */ - linenoiseEditMoveRight(l); - break; - case 'D': /* Left */ - linenoiseEditMoveLeft(l); - break; - case 'H': /* Home */ - linenoiseEditMoveHome(l); - break; - case 'F': /* End*/ - linenoiseEditMoveEnd(l); - break; - } - } - } - - /* ESC O sequences. */ - else if (seq[0] == 'O') { - switch(seq[1]) { - case 'H': /* Home */ - linenoiseEditMoveHome(l); - break; - case 'F': /* End*/ - linenoiseEditMoveEnd(l); - break; - } - } - break; - default: - if (linenoiseEditInsert(l,c)) return NULL; - break; - case CTRL_U: /* Ctrl+u, delete the whole line. */ - l->buf[0] = '\0'; - l->pos = l->len = 0; - refreshLine(l); - break; - case CTRL_K: /* Ctrl+k, delete from current to end of line. */ - l->buf[l->pos] = '\0'; - l->len = l->pos; - refreshLine(l); - break; - case CTRL_A: /* Ctrl+a, go to the start of the line */ - linenoiseEditMoveHome(l); - break; - case CTRL_E: /* ctrl+e, go to the end of the line */ - linenoiseEditMoveEnd(l); - break; - case CTRL_L: /* ctrl+l, clear screen */ - linenoiseClearScreen(); - refreshLine(l); - break; - case CTRL_W: /* ctrl+w, delete previous word */ - linenoiseEditDeletePrevWord(l); - break; - } - return linenoiseEditMore; + return processInputCharacter(l, c, cbuf, nread, esc_type); } /* This is part of the multiplexed linenoise API. See linenoiseEditStart() @@ -1095,7 +1741,7 @@ void linenoiseEditStop(struct linenoiseState *l) { } /* This just implements a blocking loop for the multiplexed API. - * In many applications that are not event-drivern, we can just call + * In many applications that are not event-driven, we can just call * the blocking linenoise API, wait for the user to complete the editing * and return the buffer. */ static const char *linenoiseBlockingEdit(int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt) @@ -1135,8 +1781,7 @@ void linenoisePrintKeyCodes(void) { quit[sizeof(quit)-1] = c; /* Insert current char on the right. */ if (memcmp(quit,"quit",sizeof(quit)) == 0) break; - printf("'%c' %02x (%d) (type quit to exit)\n", - isprint(c) ? c : '?', (int)c, (int)c); + printf("'%c' %02x (%d) (type quit to exit)\n", isprint((int) c) ? c : '?', (int) c, (int) c); printf("\r"); /* Go left edge manually, we are in raw mode. */ fflush(stdout); } diff --git a/examples/run/linenoise.cpp/linenoise.h b/examples/run/linenoise.cpp/linenoise.h index a14ec6c740..9823ca36d0 100644 --- a/examples/run/linenoise.cpp/linenoise.h +++ b/examples/run/linenoise.cpp/linenoise.h @@ -47,27 +47,27 @@ extern "C" { #include /* For size_t. */ #include -extern const char *linenoiseEditMore; +extern const char * linenoiseEditMore; /* The linenoiseState structure represents the state during line editing. * We pass this state to functions implementing specific editing * functionalities. */ struct linenoiseState { - int in_completion; /* The user pressed TAB and we are now in completion + int in_completion; /* The user pressed TAB and we are now in completion * mode, so input is handled by completeLine(). */ - size_t completion_idx; /* Index of next completion to propose. */ - int ifd; /* Terminal stdin file descriptor. */ - int ofd; /* Terminal stdout file descriptor. */ - char *buf; /* Edited line buffer. */ - size_t buflen; /* Edited line buffer size. */ - const char *prompt; /* Prompt to display. */ - size_t plen; /* Prompt length. */ - size_t pos; /* Current cursor position. */ - size_t oldpos; /* Previous refresh cursor position. */ - size_t len; /* Current edited line length. */ - size_t cols; /* Number of columns in terminal. */ - size_t oldrows; /* Rows used by last refrehsed line (multiline mode) */ - int history_index; /* The history index we are currently editing. */ + size_t completion_idx; /* Index of next completion to propose. */ + int ifd; /* Terminal stdin file descriptor. */ + int ofd; /* Terminal stdout file descriptor. */ + char * buf; /* Edited line buffer. */ + size_t buflen; /* Edited line buffer size. */ + const char * prompt; /* Prompt to display. */ + size_t plen; /* Prompt length. */ + size_t pos; /* Current cursor position. */ + size_t oldcolpos; /* Previous refresh cursor column position. */ + size_t len; /* Current edited line length. */ + size_t cols; /* Number of columns in terminal. */ + size_t oldrows; /* Rows used by last refreshed line (multiline mode) */ + int history_index; /* The history index we are currently editing. */ }; struct linenoiseCompletions { @@ -89,19 +89,20 @@ struct linenoiseCompletions { }; /* Non blocking API. */ -int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt); -const char *linenoiseEditFeed(struct linenoiseState *l); -void linenoiseEditStop(struct linenoiseState *l); -void linenoiseHide(struct linenoiseState *l); -void linenoiseShow(struct linenoiseState *l); +int linenoiseEditStart(struct linenoiseState * l, int stdin_fd, int stdout_fd, char * buf, size_t buflen, + const char * prompt); +const char * linenoiseEditFeed(struct linenoiseState * l); +void linenoiseEditStop(struct linenoiseState * l); +void linenoiseHide(struct linenoiseState * l); +void linenoiseShow(struct linenoiseState * l); /* Blocking API. */ -const char *linenoise(const char *prompt); -void linenoiseFree(void *ptr); +const char * linenoise(const char * prompt); +void linenoiseFree(void * ptr); /* Completion API. */ typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *); -typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold); +typedef const char *(linenoiseHintsCallback) (const char *, int * color, int * bold); typedef void(linenoiseFreeHintsCallback)(const char *); void linenoiseSetCompletionCallback(linenoiseCompletionCallback *); void linenoiseSetHintsCallback(linenoiseHintsCallback *); @@ -109,10 +110,10 @@ void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *); void linenoiseAddCompletion(linenoiseCompletions *, const char *); /* History API. */ -int linenoiseHistoryAdd(const char *line); +int linenoiseHistoryAdd(const char * line); int linenoiseHistorySetMaxLen(int len); -int linenoiseHistorySave(const char *filename); -int linenoiseHistoryLoad(const char *filename); +int linenoiseHistorySave(const char * filename); +int linenoiseHistoryLoad(const char * filename); /* Other utilities. */ void linenoiseClearScreen(void); @@ -121,6 +122,14 @@ void linenoisePrintKeyCodes(void); void linenoiseMaskModeEnable(void); void linenoiseMaskModeDisable(void); +/* Encoding functions. */ +typedef size_t(linenoisePrevCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len); +typedef size_t(linenoiseNextCharLen)(const char * buf, size_t buf_len, size_t pos, size_t * col_len); +typedef size_t(linenoiseReadCode)(int fd, char * buf, size_t buf_len, int * c); + +void linenoiseSetEncodingFunctions(linenoisePrevCharLen * prevCharLenFunc, linenoiseNextCharLen * nextCharLenFunc, + linenoiseReadCode * readCodeFunc); + #ifdef __cplusplus } #endif diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 4da1e50251..462a6d1519 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -79,6 +79,7 @@ class Opt { ctx_params = llama_context_default_params(); model_params = llama_model_default_params(); context_size_default = ctx_params.n_batch; + n_threads_default = ctx_params.n_threads; ngl_default = model_params.n_gpu_layers; common_params_sampling sampling; temperature_default = sampling.temp; @@ -104,6 +105,7 @@ class Opt { ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default; ctx_params.n_ctx = ctx_params.n_batch; + ctx_params.n_threads = ctx_params.n_threads_batch = n_threads >= 0 ? n_threads : n_threads_default; model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default; temperature = temperature >= 0 ? temperature : temperature_default; @@ -116,12 +118,12 @@ class Opt { std::string chat_template_file; std::string user; bool use_jinja = false; - int context_size = -1, ngl = -1; + int context_size = -1, ngl = -1, n_threads = -1; float temperature = -1; bool verbose = false; private: - int context_size_default = -1, ngl_default = -1; + int context_size_default = -1, ngl_default = -1, n_threads_default = -1; float temperature_default = -1; bool help = false; @@ -159,53 +161,94 @@ class Opt { return 0; } + int parse_options_with_value(int argc, const char ** argv, int & i, bool & options_parsing) { + if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { + if (handle_option_with_value(argc, argv, i, context_size) == 1) { + return 1; + } + } else if (options_parsing && + (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) { + if (handle_option_with_value(argc, argv, i, ngl) == 1) { + return 1; + } + } else if (options_parsing && (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--threads") == 0)) { + if (handle_option_with_value(argc, argv, i, n_threads) == 1) { + return 1; + } + } else if (options_parsing && strcmp(argv[i], "--temp") == 0) { + if (handle_option_with_value(argc, argv, i, temperature) == 1) { + return 1; + } + } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0) { + if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) { + return 1; + } + use_jinja = true; + } else { + return 2; + } + + return 0; + } + + int parse_options(const char ** argv, int & i, bool & options_parsing) { + if (options_parsing && (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { + verbose = true; + } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) { + use_jinja = true; + } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) { + help = true; + return 0; + } else if (options_parsing && strcmp(argv[i], "--") == 0) { + options_parsing = false; + } else { + return 2; + } + + return 0; + } + + int parse_positional_args(const char ** argv, int & i, int & positional_args_i) { + if (positional_args_i == 0) { + if (!argv[i][0] || argv[i][0] == '-') { + return 1; + } + + ++positional_args_i; + model_ = argv[i]; + } else if (positional_args_i == 1) { + ++positional_args_i; + user = argv[i]; + } else { + user += " " + std::string(argv[i]); + } + + return 0; + } + int parse(int argc, const char ** argv) { bool options_parsing = true; for (int i = 1, positional_args_i = 0; i < argc; ++i) { - if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { - if (handle_option_with_value(argc, argv, i, context_size) == 1) { - return 1; - } - } else if (options_parsing && - (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "-ngl") == 0 || strcmp(argv[i], "--ngl") == 0)) { - if (handle_option_with_value(argc, argv, i, ngl) == 1) { - return 1; - } - } else if (options_parsing && strcmp(argv[i], "--temp") == 0) { - if (handle_option_with_value(argc, argv, i, temperature) == 1) { - return 1; - } - } else if (options_parsing && - (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { - verbose = true; - } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) { - use_jinja = true; - } else if (options_parsing && strcmp(argv[i], "--chat-template-file") == 0){ - if (handle_option_with_value(argc, argv, i, chat_template_file) == 1) { - return 1; - } - use_jinja = true; - } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) { - help = true; - return 0; - } else if (options_parsing && strcmp(argv[i], "--") == 0) { - options_parsing = false; - } else if (positional_args_i == 0) { - if (!argv[i][0] || argv[i][0] == '-') { - return 1; - } + int ret = parse_options_with_value(argc, argv, i, options_parsing); + if (ret == 0) { + continue; + } else if (ret == 1) { + return ret; + } - ++positional_args_i; - model_ = argv[i]; - } else if (positional_args_i == 1) { - ++positional_args_i; - user = argv[i]; - } else { - user += " " + std::string(argv[i]); + ret = parse_options(argv, i, options_parsing); + if (ret == 0) { + continue; + } else if (ret == 1) { + return ret; + } + + if (parse_positional_args(argv, i, positional_args_i)) { + return 1; } } - if (model_.empty()){ + if (model_.empty()) { return 1; } @@ -232,6 +275,8 @@ class Opt { " Number of GPU layers (default: %d)\n" " --temp \n" " Temperature (default: %.1f)\n" + " -t, --threads \n" + " Number of threads to use during generation (default: %d)\n" " -v, --verbose, --log-verbose\n" " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n" " -h, --help\n" @@ -260,7 +305,7 @@ class Opt { " llama-run file://some-file3.gguf\n" " llama-run --ngl 999 some-file4.gguf\n" " llama-run --ngl 999 some-file5.gguf Hello World\n", - context_size_default, ngl_default, temperature_default); + context_size_default, ngl_default, temperature_default, n_threads_default); } }; @@ -323,25 +368,17 @@ class File { return 0; } - std::string read_all(const std::string & filename){ - open(filename, "r"); - lock(); - if (!file) { - printe("Error opening file '%s': %s", filename.c_str(), strerror(errno)); - return ""; - } - + std::string to_string() { fseek(file, 0, SEEK_END); - size_t size = ftell(file); + const size_t size = ftell(file); fseek(file, 0, SEEK_SET); - std::string out; out.resize(size); - size_t read_size = fread(&out[0], 1, size, file); + const size_t read_size = fread(&out[0], 1, size, file); if (read_size != size) { - printe("Error reading file '%s': %s", filename.c_str(), strerror(errno)); - return ""; + printe("Error reading file: %s", strerror(errno)); } + return out; } @@ -899,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0; + const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -915,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); + const int n_ctx_used = llama_kv_self_used_cells(ctx.get()); if (n_ctx_used + batch.n_tokens > n_ctx) { printf(LOG_COL_DEFAULT "\n"); printe("context size exceeded\n"); @@ -985,7 +1022,8 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str } static int read_user_input(std::string & user_input) { - static const char * prompt_prefix = "> "; + static const char * prompt_prefix_env = std::getenv("LLAMA_PROMPT_PREFIX"); + static const char * prompt_prefix = prompt_prefix_env ? prompt_prefix_env : "> "; #ifdef WIN32 printf("\r" LOG_CLR_TO_EOL LOG_COL_DEFAULT "%s", prompt_prefix); @@ -1098,59 +1136,66 @@ static int get_user_input(std::string & user_input, const std::string & user) { // Reads a chat template file to be used static std::string read_chat_template_file(const std::string & chat_template_file) { - if(chat_template_file.empty()){ - return ""; - } - File file; - std::string chat_template = ""; - chat_template = file.read_all(chat_template_file); - if(chat_template.empty()){ + if (!file.open(chat_template_file, "r")) { printe("Error opening chat template file '%s': %s", chat_template_file.c_str(), strerror(errno)); return ""; } - return chat_template; + + return file.to_string(); +} + +static int process_user_message(const Opt & opt, const std::string & user_input, LlamaData & llama_data, + const common_chat_templates_ptr & chat_templates, int & prev_len, + const bool stdout_a_terminal) { + add_message("user", opt.user.empty() ? user_input : opt.user, llama_data); + int new_len; + if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, true, new_len, opt.use_jinja) < 0) { + return 1; + } + + std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); + std::string response; + if (generate_response(llama_data, prompt, response, stdout_a_terminal)) { + return 1; + } + + if (!opt.user.empty()) { + return 2; + } + + add_message("assistant", response, llama_data); + if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, false, prev_len, opt.use_jinja) < 0) { + return 1; + } + + return 0; } // Main chat loop function -static int chat_loop(LlamaData & llama_data, const std::string & user, const std::string & chat_template_file, bool use_jinja) { +static int chat_loop(LlamaData & llama_data, const Opt & opt) { int prev_len = 0; llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); - - std::string chat_template = ""; - if(!chat_template_file.empty()){ - chat_template = read_chat_template_file(chat_template_file); + std::string chat_template; + if (!opt.chat_template_file.empty()) { + chat_template = read_chat_template_file(opt.chat_template_file); } - auto chat_templates = common_chat_templates_init(llama_data.model.get(), chat_template.empty() ? nullptr : chat_template); + common_chat_templates_ptr chat_templates = common_chat_templates_init(llama_data.model.get(), chat_template); static const bool stdout_a_terminal = is_stdout_a_terminal(); while (true) { // Get user input std::string user_input; - if (get_user_input(user_input, user) == 1) { + if (get_user_input(user_input, opt.user) == 1) { return 0; } - add_message("user", user.empty() ? user_input : user, llama_data); - int new_len; - if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, true, new_len, use_jinja) < 0) { + const int ret = process_user_message(opt, user_input, llama_data, chat_templates, prev_len, stdout_a_terminal); + if (ret == 1) { return 1; - } - - std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); - std::string response; - if (generate_response(llama_data, prompt, response, stdout_a_terminal)) { - return 1; - } - - if (!user.empty()) { + } else if (ret == 2) { break; } - - add_message("assistant", response, llama_data); - if (apply_chat_template_with_error_handling(chat_templates.get(), llama_data, false, prev_len, use_jinja) < 0) { - return 1; - } } return 0; @@ -1208,7 +1253,7 @@ int main(int argc, const char ** argv) { return 1; } - if (chat_loop(llama_data, opt.user, opt.chat_template_file, opt.use_jinja)) { + if (chat_loop(llama_data, opt)) { return 1; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index cf7cbd8159..760ebbbf08 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -15,7 +15,7 @@ int main(int argc, char ** argv) { return 1; } - print_build_info(); + common_init(); if (params.n_predict < 0) { params.n_predict = 16; @@ -196,7 +196,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); // erase whole kv - llama_kv_cache_clear(ctx3); + llama_kv_self_clear(ctx3); fprintf(stderr, "%s : kv cache cleared\n", __func__); // restore kv into seq 1 diff --git a/examples/server/README.md b/examples/server/README.md index a2ae614d7c..a2a0903261 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -13,6 +13,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. * Multimodal (wip) * Monitoring endpoints * Schema-constrained JSON response format + * [Function calling](../../docs/function-calling.md) / tool use for ~any model The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggml-org/llama.cpp/issues/4216). @@ -1120,381 +1121,9 @@ curl http://localhost:8080/v1/chat/completions \ *Tool call support* -[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639): +[OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) is supported with the `--jinja` flag (and may require a `--chat-template-file` override to get the right tool-use compatible Jinja template; worst case, `--chat-template chatml` may also work). -- Requires `--jinja` flag -- Native tool call formats supported: - - Llama 3.1 / 3.3 (including builtin tools support - tool names for `wolfram_alpha`, `web_search` / `brave_search`, `code_interpreter`), Llama 3.2 - - Functionary v3.1 / v3.2 - - Hermes 2/3, Qwen 2.5 - - Mistral Nemo - - Firefunction v2 - - Command R7B - - DeepSeek R1 (WIP / seems reluctant to call any tools?) - -
- Show some common templates and which format handler they use - - | Template | Format | - |----------|--------| - | Almawave-Velvet-14B.jinja | Hermes 2 Pro | - | AtlaAI-Selene-1-Mini-Llama-3.1-8B.jinja | Llama 3.x | - | CohereForAI-aya-expanse-8b.jinja | Generic | - | CohereForAI-c4ai-command-r-plus-default.jinja | Generic | - | CohereForAI-c4ai-command-r-plus-rag.jinja | Generic | - | CohereForAI-c4ai-command-r-plus-tool_use.jinja | Generic | - | CohereForAI-c4ai-command-r7b-12-2024-default.jinja | Command R7B (extract reasoning) | - | CohereForAI-c4ai-command-r7b-12-2024-rag.jinja | Command R7B (extract reasoning) | - | CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja | Command R7B (extract reasoning) | - | CohereForAI-c4ai-command-r7b-12-2024.jinja | Generic | - | DavieLion-Llama-3.2-1B-SPIN-iter3.jinja | Generic | - | Delta-Vector-Rei-12B.jinja | Mistral Nemo | - | EpistemeAI-Mistral-Nemo-Instruct-12B-Philosophy-Math.jinja | Mistral Nemo | - | FlofloB-83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit.jinja | Hermes 2 Pro | - | FlofloB-test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit.jinja | Generic | - | HelpingAI-HAI-SER.jinja | Generic | - | HuggingFaceTB-SmolLM2-1.7B-Instruct.jinja | Generic | - | HuggingFaceTB-SmolLM2-135M-Instruct.jinja | Generic | - | HuggingFaceTB-SmolLM2-360M-Instruct.jinja | Generic | - | INSAIT-Institute-BgGPT-Gemma-2-27B-IT-v1.0.jinja | Generic | - | Ihor-Text2Graph-R1-Qwen2.5-0.5b.jinja | Hermes 2 Pro | - | Infinigence-Megrez-3B-Instruct.jinja | Generic | - | Josephgflowers-TinyLlama_v1.1_math_code-world-test-1.jinja | Generic | - | LGAI-EXAONE-EXAONE-3.5-2.4B-Instruct.jinja | Generic | - | LGAI-EXAONE-EXAONE-3.5-7.8B-Instruct.jinja | Generic | - | LatitudeGames-Wayfarer-12B.jinja | Generic | - | Magpie-Align-Llama-3-8B-Magpie-Align-v0.1.jinja | Generic | - | Magpie-Align-Llama-3.1-8B-Magpie-Align-v0.1.jinja | Generic | - | MaziyarPanahi-calme-3.2-instruct-78b.jinja | Generic | - | MiniMaxAI-MiniMax-Text-01.jinja | Generic | - | MiniMaxAI-MiniMax-VL-01.jinja | Generic | - | NaniDAO-deepseek-r1-qwen-2.5-32B-ablated.jinja | DeepSeek R1 (extract reasoning) | - | NexaAIDev-Octopus-v2.jinja | Generic | - | NousResearch-Hermes-2-Pro-Llama-3-8B-default.jinja | Generic | - | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja | Hermes 2 Pro | - | NousResearch-Hermes-2-Pro-Mistral-7B-default.jinja | Generic | - | NousResearch-Hermes-2-Pro-Mistral-7B-tool_use.jinja | Hermes 2 Pro | - | NousResearch-Hermes-3-Llama-3.1-70B-default.jinja | Generic | - | NousResearch-Hermes-3-Llama-3.1-70B-tool_use.jinja | Hermes 2 Pro | - | NovaSky-AI-Sky-T1-32B-Flash.jinja | Hermes 2 Pro | - | NovaSky-AI-Sky-T1-32B-Preview.jinja | Hermes 2 Pro | - | OnlyCheeini-greesychat-turbo.jinja | Generic | - | Orenguteng-Llama-3.1-8B-Lexi-Uncensored-V2.jinja | Llama 3.x | - | OrionStarAI-Orion-14B-Chat.jinja | Generic | - | PowerInfer-SmallThinker-3B-Preview.jinja | Generic | - | PrimeIntellect-INTELLECT-1-Instruct.jinja | Generic | - | Qwen-QVQ-72B-Preview.jinja | Generic | - | Qwen-QwQ-32B-Preview.jinja | Hermes 2 Pro | - | Qwen-Qwen1.5-7B-Chat.jinja | Generic | - | Qwen-Qwen2-7B-Instruct.jinja | Generic | - | Qwen-Qwen2-VL-72B-Instruct.jinja | Generic | - | Qwen-Qwen2-VL-7B-Instruct.jinja | Generic | - | Qwen-Qwen2.5-0.5B.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-1.5B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-14B-Instruct-1M.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-14B.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-32B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-32B.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-3B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-72B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-7B-Instruct-1M.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-7B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-7B.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-Coder-32B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-Coder-7B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-Math-1.5B.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-Math-7B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-VL-3B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-VL-72B-Instruct.jinja | Hermes 2 Pro | - | Qwen-Qwen2.5-VL-7B-Instruct.jinja | Hermes 2 Pro | - | RWKV-Red-Team-ARWKV-7B-Preview-0.1.jinja | Hermes 2 Pro | - | SakanaAI-TinySwallow-1.5B-Instruct.jinja | Hermes 2 Pro | - | SakanaAI-TinySwallow-1.5B.jinja | Hermes 2 Pro | - | Sao10K-70B-L3.3-Cirrus-x1.jinja | Llama 3.x | - | SentientAGI-Dobby-Mini-Leashed-Llama-3.1-8B.jinja | Llama 3.x | - | SentientAGI-Dobby-Mini-Unhinged-Llama-3.1-8B.jinja | Llama 3.x | - | Steelskull-L3.3-Damascus-R1.jinja | Llama 3.x | - | Steelskull-L3.3-MS-Nevoria-70b.jinja | Llama 3.x | - | Steelskull-L3.3-Nevoria-R1-70b.jinja | Llama 3.x | - | THUDM-glm-4-9b-chat.jinja | Generic | - | THUDM-glm-edge-1.5b-chat.jinja | Generic | - | Tarek07-Progenitor-V1.1-LLaMa-70B.jinja | Llama 3.x | - | TheBloke-FusionNet_34Bx2_MoE-AWQ.jinja | Generic | - | TinyLlama-TinyLlama-1.1B-Chat-v1.0.jinja | Generic | - | UCLA-AGI-Mistral7B-PairRM-SPPO-Iter3.jinja | Generic | - | ValiantLabs-Llama3.1-8B-Enigma.jinja | Llama 3.x | - | abacusai-Fewshot-Metamath-OrcaVicuna-Mistral.jinja | Generic | - | ai21labs-AI21-Jamba-1.5-Large.jinja | Generic | - | allenai-Llama-3.1-Tulu-3-405B-SFT.jinja | Generic | - | allenai-Llama-3.1-Tulu-3-405B.jinja | Generic | - | allenai-Llama-3.1-Tulu-3-8B.jinja | Generic | - | arcee-ai-Virtuoso-Lite.jinja | Hermes 2 Pro | - | arcee-ai-Virtuoso-Medium-v2.jinja | Hermes 2 Pro | - | arcee-ai-Virtuoso-Small-v2.jinja | Hermes 2 Pro | - | avemio-GRAG-NEMO-12B-ORPO-HESSIAN-AI.jinja | Generic | - | bespokelabs-Bespoke-Stratos-7B.jinja | Hermes 2 Pro | - | bfuzzy1-acheron-m1a-llama.jinja | Generic | - | bofenghuang-vigogne-2-70b-chat.jinja | Generic | - | bytedance-research-UI-TARS-72B-DPO.jinja | Generic | - | bytedance-research-UI-TARS-7B-DPO.jinja | Generic | - | bytedance-research-UI-TARS-7B-SFT.jinja | Generic | - | carsenk-phi3.5_mini_exp_825_uncensored.jinja | Generic | - | cyberagent-DeepSeek-R1-Distill-Qwen-14B-Japanese.jinja | DeepSeek R1 (extract reasoning) | - | cyberagent-DeepSeek-R1-Distill-Qwen-32B-Japanese.jinja | DeepSeek R1 (extract reasoning) | - | databricks-dbrx-instruct.jinja | Generic | - | deepseek-ai-DeepSeek-Coder-V2-Instruct.jinja | Generic | - | deepseek-ai-DeepSeek-Coder-V2-Lite-Base.jinja | Generic | - | deepseek-ai-DeepSeek-Coder-V2-Lite-Instruct.jinja | Generic | - | deepseek-ai-DeepSeek-R1-Distill-Llama-70B.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-DeepSeek-R1-Distill-Qwen-14B.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-DeepSeek-R1-Distill-Qwen-7B.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-DeepSeek-R1-Zero.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-DeepSeek-V2-Lite.jinja | Generic | - | deepseek-ai-DeepSeek-V2.5.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-DeepSeek-V3.jinja | DeepSeek R1 (extract reasoning) | - | deepseek-ai-deepseek-coder-33b-instruct.jinja | Generic | - | deepseek-ai-deepseek-coder-6.7b-instruct.jinja | Generic | - | deepseek-ai-deepseek-coder-7b-instruct-v1.5.jinja | Generic | - | deepseek-ai-deepseek-llm-67b-chat.jinja | Generic | - | deepseek-ai-deepseek-llm-7b-chat.jinja | Generic | - | dicta-il-dictalm2.0-instruct.jinja | Generic | - | ehristoforu-Falcon3-8B-Franken-Basestruct.jinja | Hermes 2 Pro | - | fireworks-ai-llama-3-firefunction-v2.jinja | FireFunction v2 | - | godlikehhd-alpaca_data_sampled_ifd_new_5200.jinja | Hermes 2 Pro | - | godlikehhd-alpaca_data_score_max_0.7_2600.jinja | Hermes 2 Pro | - | google-gemma-2-27b-it.jinja | Generic | - | google-gemma-2-2b-it.jinja | Generic | - | google-gemma-2-2b-jpn-it.jinja | Generic | - | google-gemma-7b-it.jinja | Generic | - | huihui-ai-DeepSeek-R1-Distill-Llama-70B-abliterated.jinja | DeepSeek R1 (extract reasoning) | - | huihui-ai-DeepSeek-R1-Distill-Llama-8B-abliterated.jinja | DeepSeek R1 (extract reasoning) | - | huihui-ai-DeepSeek-R1-Distill-Qwen-14B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) | - | huihui-ai-DeepSeek-R1-Distill-Qwen-32B-abliterated.jinja | DeepSeek R1 (extract reasoning) | - | huihui-ai-DeepSeek-R1-Distill-Qwen-7B-abliterated-v2.jinja | DeepSeek R1 (extract reasoning) | - | huihui-ai-Qwen2.5-14B-Instruct-1M-abliterated.jinja | Hermes 2 Pro | - | ibm-granite-granite-3.1-8b-instruct.jinja | Generic | - | indischepartij-MiniCPM-3B-OpenHermes-2.5-v2.jinja | Generic | - | inflatebot-MN-12B-Mag-Mell-R1.jinja | Generic | - | jinaai-ReaderLM-v2.jinja | Generic | - | kms7530-chemeng_qwen-math-7b_24_1_100_1_nonmath.jinja | Hermes 2 Pro | - | knifeayumu-Cydonia-v1.3-Magnum-v4-22B.jinja | Mistral Nemo | - | langgptai-qwen1.5-7b-chat-sa-v0.1.jinja | Generic | - | lightblue-DeepSeek-R1-Distill-Qwen-7B-Japanese.jinja | DeepSeek R1 (extract reasoning) | - | mattshumer-Reflection-Llama-3.1-70B.jinja | Generic | - | meetkai-functionary-medium-v3.1.jinja | Functionary v3.1 Llama 3.1 | - | meetkai-functionary-medium-v3.2.jinja | Functionary v3.2 | - | meta-llama-Llama-2-7b-chat-hf.jinja | Generic | - | meta-llama-Llama-3.1-8B-Instruct.jinja | Llama 3.x | - | meta-llama-Llama-3.2-11B-Vision-Instruct.jinja | Llama 3.x | - | meta-llama-Llama-3.2-1B-Instruct.jinja | Llama 3.x | - | meta-llama-Llama-3.2-3B-Instruct.jinja | Llama 3.x | - | meta-llama-Llama-3.3-70B-Instruct.jinja | Llama 3.x | - | meta-llama-Meta-Llama-3-8B-Instruct.jinja | Generic | - | meta-llama-Meta-Llama-3.1-8B-Instruct.jinja | Llama 3.x | - | microsoft-Phi-3-medium-4k-instruct.jinja | Generic | - | microsoft-Phi-3-mini-4k-instruct.jinja | Generic | - | microsoft-Phi-3-small-8k-instruct.jinja | Generic | - | microsoft-Phi-3.5-mini-instruct.jinja | Generic | - | microsoft-Phi-3.5-vision-instruct.jinja | Generic | - | microsoft-phi-4.jinja | Generic | - | migtissera-Tess-3-Mistral-Nemo-12B.jinja | Generic | - | ministral-Ministral-3b-instruct.jinja | Generic | - | mistralai-Codestral-22B-v0.1.jinja | Generic | - | mistralai-Mistral-7B-Instruct-v0.1.jinja | Generic | - | mistralai-Mistral-7B-Instruct-v0.2.jinja | Generic | - | mistralai-Mistral-7B-Instruct-v0.3.jinja | Mistral Nemo | - | mistralai-Mistral-Large-Instruct-2407.jinja | Mistral Nemo | - | mistralai-Mistral-Large-Instruct-2411.jinja | Generic | - | mistralai-Mistral-Nemo-Instruct-2407.jinja | Mistral Nemo | - | mistralai-Mistral-Small-24B-Instruct-2501.jinja | Generic | - | mistralai-Mixtral-8x7B-Instruct-v0.1.jinja | Generic | - | mkurman-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro | - | mlabonne-AlphaMonarch-7B.jinja | Generic | - | mlx-community-Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32.jinja | Hermes 2 Pro | - | mlx-community-Qwen2.5-VL-7B-Instruct-8bit.jinja | Hermes 2 Pro | - | mobiuslabsgmbh-DeepSeek-R1-ReDistill-Qwen-1.5B-v1.1.jinja | DeepSeek R1 (extract reasoning) | - | netcat420-MFANNv0.20.jinja | Generic | - | netcat420-MFANNv0.24.jinja | Generic | - | netease-youdao-Confucius-o1-14B.jinja | Hermes 2 Pro | - | nvidia-AceMath-7B-RM.jinja | Hermes 2 Pro | - | nvidia-Eagle2-1B.jinja | Hermes 2 Pro | - | nvidia-Eagle2-9B.jinja | Hermes 2 Pro | - | nvidia-Llama-3.1-Nemotron-70B-Instruct-HF.jinja | Llama 3.x | - | onnx-community-DeepSeek-R1-Distill-Qwen-1.5B-ONNX.jinja | DeepSeek R1 (extract reasoning) | - | open-thoughts-OpenThinker-7B.jinja | Hermes 2 Pro | - | openchat-openchat-3.5-0106.jinja | Generic | - | pankajmathur-orca_mini_v6_8b.jinja | Generic | - | princeton-nlp-Mistral-7B-Base-SFT-RDPO.jinja | Generic | - | princeton-nlp-Mistral-7B-Instruct-DPO.jinja | Generic | - | princeton-nlp-Mistral-7B-Instruct-RDPO.jinja | Generic | - | prithivMLmods-Bellatrix-Tiny-1.5B-R1.jinja | Hermes 2 Pro | - | prithivMLmods-Bellatrix-Tiny-1B-R1.jinja | Llama 3.x | - | prithivMLmods-Bellatrix-Tiny-1B-v3.jinja | Generic | - | prithivMLmods-Bellatrix-Tiny-3B-R1.jinja | Llama 3.x | - | prithivMLmods-Blaze-14B-xElite.jinja | Generic | - | prithivMLmods-Calcium-Opus-14B-Elite2-R1.jinja | Hermes 2 Pro | - | prithivMLmods-Calme-Ties-78B.jinja | Generic | - | prithivMLmods-Calme-Ties2-78B.jinja | Generic | - | prithivMLmods-Calme-Ties3-78B.jinja | Generic | - | prithivMLmods-ChemQwen2-vL.jinja | Generic | - | prithivMLmods-GWQ2b.jinja | Generic | - | prithivMLmods-LatexMind-2B-Codec.jinja | Generic | - | prithivMLmods-Llama-3.2-6B-AlgoCode.jinja | Llama 3.x | - | prithivMLmods-Megatron-Opus-14B-Exp.jinja | Hermes 2 Pro | - | prithivMLmods-Megatron-Opus-14B-Stock.jinja | Hermes 2 Pro | - | prithivMLmods-Megatron-Opus-7B-Exp.jinja | Hermes 2 Pro | - | prithivMLmods-Omni-Reasoner-Merged.jinja | Hermes 2 Pro | - | prithivMLmods-Omni-Reasoner4-Merged.jinja | Hermes 2 Pro | - | prithivMLmods-Primal-Opus-14B-Optimus-v1.jinja | Hermes 2 Pro | - | prithivMLmods-QwQ-Math-IO-500M.jinja | Hermes 2 Pro | - | prithivMLmods-Qwen-7B-Distill-Reasoner.jinja | DeepSeek R1 (extract reasoning) | - | prithivMLmods-Qwen2.5-1.5B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro | - | prithivMLmods-Qwen2.5-14B-DeepSeek-R1-1M.jinja | Hermes 2 Pro | - | prithivMLmods-Qwen2.5-32B-DeepSeek-R1-Instruct.jinja | Hermes 2 Pro | - | prithivMLmods-Qwen2.5-7B-DeepSeek-R1-1M.jinja | Hermes 2 Pro | - | prithivMLmods-Triangulum-v2-10B.jinja | Hermes 2 Pro | - | qingy2024-Falcon3-2x10B-MoE-Instruct.jinja | Hermes 2 Pro | - | rubenroy-Zurich-14B-GCv2-5m.jinja | Hermes 2 Pro | - | rubenroy-Zurich-7B-GCv2-5m.jinja | Hermes 2 Pro | - | silma-ai-SILMA-Kashif-2B-Instruct-v1.0.jinja | Generic | - | simplescaling-s1-32B.jinja | Hermes 2 Pro | - | sometimesanotion-Lamarck-14B-v0.7.jinja | Hermes 2 Pro | - | sonthenguyen-zephyr-sft-bnb-4bit-DPO-mtbr-180steps.jinja | Generic | - | sthenno-tempesthenno-icy-0130.jinja | Generic | - | sumink-qwft.jinja | Hermes 2 Pro | - | teknium-OpenHermes-2.5-Mistral-7B.jinja | Generic | - | thirdeyeai-elevate360m.jinja | Generic | - | tiiuae-Falcon3-10B-Instruct.jinja | Hermes 2 Pro | - | unsloth-DeepSeek-R1-Distill-Llama-8B-unsloth-bnb-4bit.jinja | DeepSeek R1 (extract reasoning) | - | unsloth-DeepSeek-R1-Distill-Llama-8B.jinja | DeepSeek R1 (extract reasoning) | - | unsloth-DeepSeek-R1.jinja | DeepSeek R1 (extract reasoning) | - | unsloth-Mistral-Small-24B-Instruct-2501-unsloth-bnb-4bit.jinja | Generic | - | upstage-solar-pro-preview-instruct.jinja | Generic | - | whyhow-ai-PatientSeek.jinja | Generic | - | xwen-team-Xwen-72B-Chat.jinja | Hermes 2 Pro | - | xwen-team-Xwen-7B-Chat.jinja | Hermes 2 Pro | - - This table can be generated with: - - ```bash - ./build/bin/test-chat ../minja/build/tests/*.jinja 2>/dev/null - ``` - -
- -- Generic tool call is supported when the template isn't recognized by native format handlers (you'll see `Chat format: Generic` in the logs). - - Use `--chat-template-file` to override the template when appropriate (see examples below) - - Generic support may consume more tokens and be less efficient than a model's native format. - -- Run with: - - ```shell - # Native support: - - llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M - llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L - llama-server --jinja -fa -hf bartowski/functionary-small-v3.2-GGUF:Q4_K_M - llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M - - # Native support for DeepSeek R1 works best w/ our own template (official template buggy) - - llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \ - --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja - - llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \ - --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja - - # Native support requires the right template for these GGUFs: - - llama-server --jinja -fa -hf bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M \ - --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use ) - - llama-server --jinja -fa -hf bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M \ - --chat-template-file <( python scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use ) - - llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \ - --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use ) - - llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \ - --chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use ) - - # Generic format support - llama-server --jinja -fa -hf bartowski/phi-4-GGUF:Q4_0 - llama-server --jinja -fa -hf bartowski/gemma-2-2b-it-GGUF:Q8_0 - llama-server --jinja -fa -hf bartowski/c4ai-command-r-v01-GGUF:Q2_K - ``` - -- Test in CLI: - - ```bash - curl http://localhost:8080/v1/chat/completions -d '{ - "model": "gpt-3.5-turbo", - "tools": [ - { - "type":"function", - "function":{ - "name":"python", - "description":"Runs code in an ipython interpreter and returns the result of the execution after 60 seconds.", - "parameters":{ - "type":"object", - "properties":{ - "code":{ - "type":"string", - "description":"The code to run in the ipython interpreter." - } - }, - "required":["code"] - } - } - } - ], - "messages": [ - { - "role": "user", - "content": "Print a hello world message with python." - } - ] - }' - ``` - -
- Show output - - ```json - { - "choices": [ - { - "finish_reason": "tool", - "index": 0, - "message": { - "content": null, - "tool_calls": [ - { - "name": "python", - "arguments": "{\"code\":\" \\nprint(\\\"Hello, World!\\\")\"}" - } - ], - "role": "assistant" - } - } - ], - "created": 1727287211, - "model": "gpt-3.5-turbo", - "object": "chat.completion", - "usage": { - "completion_tokens": 16, - "prompt_tokens": 44, - "total_tokens": 60 - }, - "id": "chatcmpl-Htbgh9feMmGM0LEH2hmQvwsCxq3c6Ni8" - } - ``` - -
+**See our [Function calling](../../docs/function-calling.md) docs** for more details, supported native tool call styles (generic tool call style is used as fallback) / examples of use. ### POST `/v1/embeddings`: OpenAI-compatible embeddings API diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz index e6a22a4e37..d0e6da8e4a 100644 Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ diff --git a/examples/server/public_legacy/json-schema-to-grammar.mjs b/examples/server/public_legacy/json-schema-to-grammar.mjs index e67bb15c17..f767ce7b72 100644 --- a/examples/server/public_legacy/json-schema-to-grammar.mjs +++ b/examples/server/public_legacy/json-schema-to-grammar.mjs @@ -1,5 +1,5 @@ // WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first. -const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'; +const SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'; function _buildRepetition(itemRule, minItems, maxItems, opts={}) { if (minItems === 0 && maxItems === 1) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2306dc26fe..c2f1afeca4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -131,9 +131,9 @@ struct slot_params { lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); } - std::vector grammar_trigger_words; - for (const auto & trigger : sampling.grammar_trigger_words) { - grammar_trigger_words.push_back(trigger.word); + auto grammar_triggers = json::array(); + for (const auto & trigger : sampling.grammar_triggers) { + grammar_triggers.push_back(trigger.to_json()); } return json { @@ -170,8 +170,8 @@ struct slot_params { {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, {"grammar", sampling.grammar}, - {"grammar_trigger_words", grammar_trigger_words}, - {"grammar_trigger_tokens", sampling.grammar_trigger_tokens}, + {"grammar_lazy", sampling.grammar_lazy}, + {"grammar_triggers", grammar_triggers}, {"preserved_tokens", sampling.preserved_tokens}, {"chat_format", common_chat_format_name(oaicompat_chat_format)}, {"samplers", samplers}, @@ -356,24 +356,6 @@ struct server_task { } { - const auto grammar_triggers = data.find("grammar_triggers"); - if (grammar_triggers != data.end()) { - for (const auto & t : *grammar_triggers) { - common_grammar_trigger trigger; - trigger.word = t.at("word"); - trigger.at_start = t.at("at_start"); - - auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str()); - params.sampling.grammar_trigger_tokens.push_back(ids[0]); - params.sampling.preserved_tokens.insert(ids[0]); - continue; - } - SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str()); - params.sampling.grammar_trigger_words.push_back(trigger); - } - } const auto preserved_tokens = data.find("preserved_tokens"); if (preserved_tokens != data.end()) { for (const auto & t : *preserved_tokens) { @@ -383,12 +365,39 @@ struct server_task { params.sampling.preserved_tokens.insert(ids[0]); } else { // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens. - SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get().c_str()); + SRV_DBG("Not preserved because more than 1 token: %s\n", t.get().c_str()); } } } - if (params.sampling.grammar_lazy) { - GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0); + const auto grammar_triggers = data.find("grammar_triggers"); + if (grammar_triggers != data.end()) { + for (const auto & t : *grammar_triggers) { + auto ct = common_grammar_trigger::from_json(t); + if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { + const auto & word = ct.value; + auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true); + if (ids.size() == 1) { + auto token = ids[0]; + if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) { + throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word); + } + SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str()); + common_grammar_trigger trigger; + trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; + trigger.value = word; + trigger.token = token; + params.sampling.grammar_triggers.push_back(std::move(trigger)); + } else { + SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); + params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); + } + } else { + params.sampling.grammar_triggers.push_back(ct); + } + } + } + if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) { + throw std::runtime_error("Error: no triggers set for lazy grammar!"); } } @@ -742,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result { {"name", tc.name}, {"arguments", tc.arguments}, }}, - {"id", tc.id}, + // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). + // We only generate a random id for the ones that don't generate one by themselves + // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) + {"id", tc.id.empty() ? gen_tool_call_id() : tc.id}, }); } message["tool_calls"] = tool_calls; @@ -1304,7 +1316,7 @@ struct server_slot { return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK; } - bool can_batch_with(server_slot & other_slot) { + bool can_batch_with(server_slot & other_slot) const { return is_non_causal() == other_slot.is_non_causal() && are_lora_equal(lora, other_slot.lora); } @@ -1860,6 +1872,10 @@ struct server_context { params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; + // force F16 KV cache for the draft model for extra performance + params_dft.cache_type_k = GGML_TYPE_F16; + params_dft.cache_type_v = GGML_TYPE_F16; + llama_init_dft = common_init_from_params(params_dft); model_dft = llama_init_dft.model.get(); @@ -1880,10 +1896,6 @@ struct server_context { cparams_dft = common_context_params_to_llama(params_dft); cparams_dft.n_batch = n_ctx_dft; - // force F16 KV cache for the draft model for extra performance - cparams_dft.type_k = GGML_TYPE_F16; - cparams_dft.type_v = GGML_TYPE_F16; - // the context is not needed - we will create one for each slot llama_init_dft.context.reset(); } @@ -1892,6 +1904,7 @@ struct server_context { try { common_chat_format_example(chat_templates.get(), params.use_jinja); } catch (const std::exception & e) { + SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what()); SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); chat_templates = common_chat_templates_init(model, "chatml"); } @@ -2027,6 +2040,18 @@ struct server_context { return ret; } + bool can_be_detokenized(const struct llama_context * ctx, const std::vector & tokens) { + const llama_model * model = llama_get_model(ctx); + const llama_vocab * vocab = llama_model_get_vocab(model); + const int32_t n_vocab = llama_vocab_n_tokens(vocab); + for (const auto & token : tokens) { + if (token < 0 || token >= n_vocab) { + return false; + } + } + return true; + } + bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot.reset(); slot.id_task = task.id; @@ -2041,11 +2066,16 @@ struct server_context { slot.lora = task.params.lora; } + bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens); + if (!can_detokenize) { + send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST); + return false; + } SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str()); if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? - SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict); + SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict); slot.params.n_predict = slot.n_predict; } @@ -2083,7 +2113,7 @@ struct server_context { SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache - llama_kv_cache_clear(ctx); + llama_kv_self_clear(ctx); clean_kv_cache = false; } @@ -2148,14 +2178,6 @@ struct server_context { } if (slot.has_new_line) { - // if we have already seen a new line, we stop after a certain time limit - if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { - slot.stop = STOP_TYPE_LIMIT; - slot.has_next_token = false; - - SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); - } - // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent if (slot.params.n_indent > 0) { // check the current indentation @@ -2194,6 +2216,14 @@ struct server_context { // check if there is a new line in the generated text if (result.text_to_send.find('\n') != std::string::npos) { slot.has_new_line = true; + + // if we have seen a new line, we stop after a certain time limit, but only upon another new line + if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { + slot.stop = STOP_TYPE_LIMIT; + slot.has_next_token = false; + + SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); + } } // if context shift is disabled, we stop when it reaches the context limit @@ -2625,8 +2655,8 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); - res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); + res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; @@ -2742,7 +2772,7 @@ struct server_context { // Erase token cache const size_t n_erased = slot->cache_tokens.size(); - llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); + llama_kv_self_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); auto res = std::make_unique(); @@ -2810,8 +2840,8 @@ struct server_context { SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); + llama_kv_self_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); + llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -3002,8 +3032,8 @@ struct server_context { const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c; - llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c); - llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift); + llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c); + llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift); for (size_t i = 0; i < n_match; i++) { slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i]; @@ -3041,9 +3071,9 @@ struct server_context { } // keep only the common part - if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) { + if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) { // could not partially delete (likely using a non-Transformer model) - llama_kv_cache_seq_rm(ctx, slot.id, -1, -1); + llama_kv_self_seq_rm(ctx, slot.id, -1, -1); // there is no common part left slot.n_past = 0; @@ -3283,7 +3313,7 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index af1dcb5b96..491cb3a5df 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -144,6 +144,7 @@ def test_apply_chat_template(): @pytest.mark.parametrize("response_format,n_predicted,re_content", [ ({"type": "json_object", "schema": {"const": "42"}}, 6, "\"42\""), ({"type": "json_object", "schema": {"items": [{"type": "integer"}]}}, 10, "[ -3000 ]"), + ({"type": "json_schema", "json_schema": {"schema": {"const": "foooooo"}}}, 10, "\"foooooo\""), ({"type": "json_object"}, 10, "(\\{|John)+"), ({"type": "sound"}, 0, None), # invalid response format (expected to fail) diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py old mode 100644 new mode 100755 index a91a2f3333..569c2a1f8e --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -1,4 +1,12 @@ +#!/usr/bin/env python import pytest + +# ensure grandparent path is in sys.path +from pathlib import Path +import sys +path = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(path)) + from utils import * server: ServerProcess @@ -66,15 +74,8 @@ WEATHER_TOOL = { } -def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None): - global server - n_predict = 512 - # server = ServerPreset.stories15m_moe() - server.jinja = True - server.n_predict = n_predict - server.chat_template_file = f'../../../models/templates/{template_name}.jinja' - server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ +def do_test_completion_with_required_tool_tiny(server: ServerProcess, tool: dict, argument_key: str | None, n_predict, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ {"role": "system", "content": "You are a coding assistant."}, @@ -83,16 +84,15 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a "tool_choice": "required", "tools": [tool], "parallel_tool_calls": False, - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, + **kwargs, }) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] - assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' + assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' + assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}' expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] assert expected_function_name == tool_call["function"]["name"] actual_arguments = tool_call["function"]["arguments"] @@ -108,7 +108,14 @@ def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, a ("meta-llama-Llama-3.3-70B-Instruct", PYTHON_TOOL, "code"), ]) def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, argument_key: str | None): - do_test_completion_with_required_tool_tiny(template_name, tool, argument_key) + global server + n_predict = 512 + # server = ServerPreset.stories15m_moe() + server.jinja = True + server.n_predict = n_predict + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict, temperature=0.0, top_k=1, top_p=1.0) @pytest.mark.slow @@ -130,10 +137,17 @@ def test_completion_with_required_tool_tiny_fast(template_name: str, tool: dict, ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", TEST_TOOL, "success"), ("deepseek-ai-DeepSeek-R1-Distill-Llama-8B", PYTHON_TOOL, "code"), ("fireworks-ai-llama-3-firefunction-v2", TEST_TOOL, "success"), - ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"), + # ("fireworks-ai-llama-3-firefunction-v2", PYTHON_TOOL, "code"), ]) def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, argument_key: str | None): - do_test_completion_with_required_tool_tiny(template_name, tool, argument_key) + global server + n_predict = 512 + # server = ServerPreset.stories15m_moe() + server.jinja = True + server.n_predict = n_predict + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + do_test_completion_with_required_tool_tiny(server, tool, argument_key, n_predict) @pytest.mark.slow @@ -142,25 +156,33 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), - # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. (TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), + + (TEST_TOOL, "success", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - # (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), - # (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), @@ -176,10 +198,10 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), - # (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), - # TODO: fix these - # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), + + (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server @@ -197,7 +219,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str elif isinstance(template_override, str): server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ {"role": "system", "content": "You are a coding assistant."}, @@ -215,7 +237,7 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] - assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' + # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' expected_function_name = "python" if tool["type"] == "code_interpreter" else tool["function"]["name"] assert expected_function_name == tool_call["function"]["name"] actual_arguments = tool_call["function"]["arguments"] @@ -225,13 +247,8 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str assert argument_key in actual_arguments, f"tool arguments: {json.dumps(actual_arguments)}, expected: {argument_key}" -def do_test_completion_without_tool_call(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None): - global server - server.jinja = True - server.n_predict = n_predict - server.chat_template_file = f'../../../models/templates/{template_name}.jinja' - server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ +def do_test_completion_without_tool_call(server: ServerProcess, n_predict: int, tools: list[dict], tool_choice: str | None, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ {"role": "system", "content": "You are a coding assistant."}, @@ -239,9 +256,7 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too ], "tools": tools if tools else None, "tool_choice": tool_choice, - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, + **kwargs, }, timeout=TIMEOUT_HTTP_REQUEST) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] @@ -254,7 +269,12 @@ def do_test_completion_without_tool_call(template_name: str, n_predict: int, too ("meta-llama-Llama-3.3-70B-Instruct", 128, [PYTHON_TOOL], 'none'), ]) def test_completion_without_tool_call_fast(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None): - do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice) + global server + server.jinja = True + server.n_predict = n_predict + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + do_test_completion_without_tool_call(server, n_predict, tools, tool_choice) @pytest.mark.slow @@ -270,7 +290,12 @@ def test_completion_without_tool_call_fast(template_name: str, n_predict: int, t ("meta-llama-Llama-3.2-3B-Instruct", 256, [PYTHON_TOOL], 'none'), ]) def test_completion_without_tool_call_slow(template_name: str, n_predict: int, tools: list[dict], tool_choice: str | None): - do_test_completion_without_tool_call(template_name, n_predict, tools, tool_choice) + global server + server.jinja = True + server.n_predict = n_predict + server.chat_template_file = f'../../../models/templates/{template_name}.jinja' + server.start(timeout_seconds=TIMEOUT_SERVER_START) + do_test_completion_without_tool_call(server, n_predict, tools, tool_choice) @pytest.mark.slow @@ -281,6 +306,12 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", None), + ("bartowski/Qwen2.5-1.5B-Instruct-GGUF:Q4_K_M", "chatml"), + + ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), + ("bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), @@ -324,48 +355,53 @@ def test_weather(hf_repo: str, template_override: str | Tuple[str, str | None] | elif isinstance(template_override, str): server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": n_predict, + do_test_weather(server, max_tokens=n_predict) + + +def do_test_weather(server: ServerProcess, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "messages": [ {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things."}, {"role": "user", "content": "What is the weather in Istanbul?"}, ], "tools": [WEATHER_TOOL], + **kwargs, }, timeout=TIMEOUT_HTTP_REQUEST) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] - assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' - assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"] + # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' + assert tool_call["function"]["name"] == WEATHER_TOOL["function"]["name"], f'Expected weather tool call, got {tool_call["function"]["name"]}' + assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}' actual_arguments = json.loads(tool_call["function"]["arguments"]) assert 'location' in actual_arguments, f"location not found in {json.dumps(actual_arguments)}" location = actual_arguments["location"] assert isinstance(location, str), f"Expected location to be a string, got {type(location)}: {json.dumps(location)}" - assert re.match('^Istanbul(, (TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}' + assert re.match('^Istanbul(( |, ?)(TR|Turkey|Türkiye))?$', location), f'Expected Istanbul for location, got {location}' @pytest.mark.slow @pytest.mark.parametrize("result_override,n_predict,hf_repo,template_override", [ (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), + (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", None), + (None, 128, "bartowski/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (None, 128, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), (None, 128, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), - (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (None, 128, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), (None, 128, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), # TODO: fix these (wrong results, either didn't respect decimal instruction or got wrong value) - ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), + # (None, 128, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + # ("[\\s\\S]*?\\*\\*\\s*0.5($|\\*\\*)", 8192, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server - # n_predict = 512 server.n_slots = 1 server.jinja = True server.n_ctx = 8192 * 2 @@ -379,10 +415,14 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, elif isinstance(template_override, str): server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ + do_test_calc_result(server, result_override, n_predict) + + +def do_test_calc_result(server: ServerProcess, result_override: str | None, n_predict: int, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ - {"role": "system", "content": "You are a chatbot that uses tools/functions. Dont overthink things, and provide very concise answers. Do not explain your reasoning to the user. Provide any numerical values back to the user with at most two decimals."}, + {"role": "system", "content": "You are a tools-calling assistant. You express numerical values with at most two decimals."}, {"role": "user", "content": "What's the y coordinate of a point on the unit sphere at angle 30 degrees?"}, { "role": "assistant", @@ -423,7 +463,8 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, } } } - ] + ], + **kwargs, }, timeout=TIMEOUT_HTTP_REQUEST) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] @@ -434,19 +475,19 @@ def test_calc_result(result_override: str | None, n_predict: int, hf_repo: str, if result_override is not None: assert re.match(result_override, content), f'Expected {result_override}, got {content}' else: - assert re.match('^[\\s\\S]*?The (y[ -])?coordinate [\\s\\S]*?is (approximately )?0\\.56\\b|^0\\.56$', content), \ + assert re.match('^[\\s\\S]*?((That\'s|\\bis) (approximately )?)?\\b0\\.(5\\b|56\\b|556)', content), \ f'Expected something like "The y coordinate is 0.56.", got {content}' @pytest.mark.slow @pytest.mark.parametrize("n_predict,reasoning_format,expect_content,expect_reasoning_content,hf_repo,template_override", [ - (128, 'deepseek', "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (128, None, "^The sum of 102 and 7 is 109.*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (128, 'deepseek', "^The sum of 102 and 7 is 109[\\s\\S]*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (128, None, "^The sum of 102 and 7 is 109[\\s\\S]*", None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (1024, 'deepseek', "To find the sum of.*", "I need to calculate the sum of 102 and 7.*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - (1024, 'none', "^I need[\\s\\S]*?\n?To find.*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (1024, 'deepseek', "To find the sum of[\\s\\S]*", "I need to calculate the sum of 102 and 7[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (1024, 'none', "^(\\s*)?I need[\\s\\S]*?\\s*To find[\\s\\S]*", None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - (1024, 'deepseek', "To find the sum of.*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), + (1024, 'deepseek', "To find the sum of[\\s\\S]*", "First, I [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)), ]) def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] | None, expect_content: str | None, expect_reasoning_content: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server @@ -464,7 +505,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] elif isinstance(template_override, str): server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ + res = server.make_request("POST", "/v1/chat/completions", data={ "max_tokens": n_predict, "messages": [ {"role": "user", "content": "What's the sum of 102 and 7?"}, @@ -476,7 +517,7 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] content = choice["message"].get("content") if expect_content is None: - assert content is None, f'Expected no content in {choice["message"]}' + assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' else: assert re.match(expect_content, content), f'Expected {expect_content}, got {content}' @@ -488,46 +529,46 @@ def test_thoughts(n_predict: int, reasoning_format: Literal['deepseek', 'none'] @pytest.mark.slow -@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [ - (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"), +@pytest.mark.parametrize("hf_repo,template_override", [ + ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), - (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), - ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + # ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), - (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", None), - ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), - (None, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), + ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), + ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), - # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. - (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", "chatml"), ]) -def test_hello_world(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): +def test_hello_world(hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server + n_predict = 512 # High because of DeepSeek R1 server.n_slots = 1 server.jinja = True server.n_ctx = 8192 - server.n_predict = 512 # High because of DeepSeek R1 + server.n_predict = n_predict server.model_hf_repo = hf_repo server.model_hf_file = None if isinstance(template_override, tuple): @@ -537,31 +578,29 @@ def test_hello_world(expected_arguments_override: str | None, hf_repo: str, temp elif isinstance(template_override, str): server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) - res = server.make_request("POST", "/chat/completions", data={ - "max_tokens": 256, + + do_test_hello_world(server, max_tokens=n_predict) + + +def do_test_hello_world(server: ServerProcess, **kwargs): + res = server.make_request("POST", "/v1/chat/completions", data={ "messages": [ - {"role": "system", "content": "You are a coding assistant."}, + {"role": "system", "content": "You are a tool-calling agent."}, {"role": "user", "content": "say hello world with python"}, ], "tools": [PYTHON_TOOL], - # Note: without these greedy params, Functionary v3.2 writes `def hello_world():\n print("Hello, World!")\nhello_world()` which is correct but a pain to test. - "temperature": 0.0, - "top_k": 1, - "top_p": 1.0, + **kwargs, }, timeout=TIMEOUT_HTTP_REQUEST) assert res.status_code == 200, f"Expected status code 200, got {res.status_code}" choice = res.body["choices"][0] tool_calls = choice["message"].get("tool_calls") assert tool_calls and len(tool_calls) == 1, f'Expected 1 tool call in {choice["message"]}' tool_call = tool_calls[0] - assert choice["message"].get("content") is None, f'Expected no content in {choice["message"]}' + # assert choice["message"].get("content") in (None, ""), f'Expected no content in {choice["message"]}' assert tool_call["function"]["name"] == PYTHON_TOOL["function"]["name"] - actual_arguments = tool_call["function"]["arguments"] - if expected_arguments_override is not None: - assert actual_arguments == expected_arguments_override - else: - actual_arguments = json.loads(actual_arguments) - assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}" - code = actual_arguments["code"] - assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}" - assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}' + assert len(tool_call.get("id", "")) > 0, f'Expected non empty tool call id in {tool_call}' + actual_arguments = json.loads(tool_call["function"]["arguments"]) + assert 'code' in actual_arguments, f"code not found in {json.dumps(actual_arguments)}" + code = actual_arguments["code"] + assert isinstance(code, str), f"Expected code to be a string, got {type(code)}: {json.dumps(code)}" + assert re.match(r'''print\(("[Hh]ello,? [Ww]orld!?"|'[Hh]ello,? [Ww]orld!?')\)''', code), f'Expected hello world, got {code}' diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index a82504235f..30aa866095 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -26,7 +26,10 @@ from re import RegexFlag import wget -DEFAULT_HTTP_TIMEOUT = 12 if "LLAMA_SANITIZE" not in os.environ else 30 +DEFAULT_HTTP_TIMEOUT = 12 + +if "LLAMA_SANITIZE" in os.environ or "GITHUB_ACTION" in os.environ: + DEFAULT_HTTP_TIMEOUT = 30 class ServerResponse: @@ -64,6 +67,9 @@ class ServerProcess: id_slot: int | None = None cache_prompt: bool | None = None n_slots: int | None = None + ctk: str | None = None + ctv: str | None = None + fa: bool | None = None server_continuous_batching: bool | None = False server_embeddings: bool | None = False server_reranking: bool | None = False @@ -81,6 +87,7 @@ class ServerProcess: reasoning_format: Literal['deepseek', 'none'] | None = None chat_template: str | None = None chat_template_file: str | None = None + server_path: str | None = None # session variables process: subprocess.Popen | None = None @@ -94,7 +101,9 @@ class ServerProcess: self.server_port = int(os.environ["PORT"]) def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None: - if "LLAMA_SERVER_BIN_PATH" in os.environ: + if self.server_path is not None: + server_path = self.server_path + elif "LLAMA_SERVER_BIN_PATH" in os.environ: server_path = os.environ["LLAMA_SERVER_BIN_PATH"] elif os.name == "nt": server_path = "../../../build/bin/Release/llama-server.exe" @@ -148,6 +157,12 @@ class ServerProcess: server_args.extend(["--ctx-size", self.n_ctx]) if self.n_slots: server_args.extend(["--parallel", self.n_slots]) + if self.ctk: + server_args.extend(["-ctk", self.ctk]) + if self.ctv: + server_args.extend(["-ctv", self.ctv]) + if self.fa is not None: + server_args.append("-fa") if self.n_predict: server_args.extend(["--n-predict", self.n_predict]) if self.slot_save_path: @@ -181,7 +196,7 @@ class ServerProcess: server_args.extend(["--chat-template-file", self.chat_template_file]) args = [str(arg) for arg in [server_path, *server_args]] - print(f"bench: starting server with: {' '.join(args)}") + print(f"tests: starting server with: {' '.join(args)}") flags = 0 if "nt" == os.name: @@ -212,6 +227,10 @@ class ServerProcess: return # server is ready except Exception as e: pass + # Check if process died + if self.process.poll() is not None: + raise RuntimeError(f"Server process died with return code {self.process.returncode}") + print(f"Waiting for server to start...") time.sleep(0.5) raise TimeoutError(f"Server did not start within {timeout_seconds} seconds") @@ -283,7 +302,7 @@ class ServerPreset: server.model_hf_repo = "ggml-org/models" server.model_hf_file = "tinyllamas/stories260K.gguf" server.model_alias = "tinyllama-2" - server.n_ctx = 256 + server.n_ctx = 512 server.n_batch = 32 server.n_slots = 2 server.n_predict = 64 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 6f8ab2b93a..58cdd6af92 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -7,6 +7,8 @@ // increase max payload length to allow use of larger context size #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576 +// disable Nagle's algorithm +#define CPPHTTPLIB_TCP_NODELAY true #include "httplib.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: @@ -433,6 +435,10 @@ static std::string gen_chatcmplid() { return "chatcmpl-" + random_string(); } +static std::string gen_tool_call_id() { + return random_string(); +} + // // other common utils // @@ -519,8 +525,13 @@ static json oaicompat_completion_params_parse(const json & body) { throw std::runtime_error("Only one completion choice is allowed"); } + // Handle "echo" field + if (json_value(body, "echo", false)) { + throw std::runtime_error("Only no echo is supported"); + } + // Params supported by OAI but unsupported by llama.cpp - static const std::vector unsupported_params { "best_of", "echo", "suffix" }; + static const std::vector unsupported_params { "best_of", "suffix" }; for (const auto & param : unsupported_params) { if (body.contains(param)) { throw std::runtime_error("Unsupported param: " + param); @@ -583,8 +594,8 @@ static json oaicompat_completion_params_parse( if (response_type == "json_object") { json_schema = json_value(response_format, "schema", json::object()); } else if (response_type == "json_schema") { - json json_schema = json_value(response_format, "json_schema", json::object()); - json_schema = json_value(json_schema, "schema", json::object()); + auto schema_wrapper = json_value(response_format, "json_schema", json::object()); + json_schema = json_value(schema_wrapper, "schema", json::object()); } else if (!response_type.empty() && response_type != "text") { throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type); } @@ -596,10 +607,11 @@ static json oaicompat_completion_params_parse( inputs.tool_choice = common_chat_tool_choice_parse_oaicompat(json_value(body, "tool_choice", std::string("auto"))); inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump(); inputs.grammar = grammar; - inputs.add_generation_prompt = true; + inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); inputs.use_jinja = use_jinja; inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false); inputs.extract_reasoning = reasoning_format != COMMON_REASONING_FORMAT_NONE; + inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { throw std::runtime_error("Cannot use custom grammar constraints with tools."); } @@ -609,14 +621,13 @@ static json oaicompat_completion_params_parse( llama_params["chat_format"] = static_cast(chat_params.format); llama_params["prompt"] = chat_params.prompt; - llama_params["grammar"] = chat_params.grammar; + if (!chat_params.grammar.empty()) { + llama_params["grammar"] = chat_params.grammar; + } llama_params["grammar_lazy"] = chat_params.grammar_lazy; auto grammar_triggers = json::array(); for (const auto & trigger : chat_params.grammar_triggers) { - grammar_triggers.push_back({ - {"word", trigger.word}, - {"at_start", trigger.at_start}, - }); + grammar_triggers.push_back(trigger.to_json()); } llama_params["grammar_triggers"] = grammar_triggers; llama_params["preserved_tokens"] = chat_params.preserved_tokens; diff --git a/examples/server/webui/src/components/ChatScreen.tsx b/examples/server/webui/src/components/ChatScreen.tsx index d7a246cf6a..d12b06e125 100644 --- a/examples/server/webui/src/components/ChatScreen.tsx +++ b/examples/server/webui/src/components/ChatScreen.tsx @@ -2,7 +2,7 @@ import { useEffect, useMemo, useRef, useState } from 'react'; import { CallbackGeneratedChunk, useAppContext } from '../utils/app.context'; import ChatMessage from './ChatMessage'; import { CanvasType, Message, PendingMessage } from '../utils/types'; -import { classNames, throttle } from '../utils/misc'; +import { classNames, cleanCurrentUrl, throttle } from '../utils/misc'; import CanvasPyInterpreter from './CanvasPyInterpreter'; import StorageUtils from '../utils/storage'; import { useVSCodeContext } from '../utils/llama-vscode'; @@ -18,6 +18,24 @@ export interface MessageDisplay { isPending?: boolean; } +/** + * If the current URL contains "?m=...", prefill the message input with the value. + * If the current URL contains "?q=...", prefill and SEND the message. + */ +const prefilledMsg = { + content() { + const url = new URL(window.location.href); + return url.searchParams.get('m') ?? url.searchParams.get('q') ?? ''; + }, + shouldSend() { + const url = new URL(window.location.href); + return url.searchParams.has('q'); + }, + clear() { + cleanCurrentUrl(['m', 'q']); + }, +}; + function getListMessageDisplay( msgs: Readonly, leafNodeId: Message['id'] @@ -81,13 +99,9 @@ export default function ChatScreen() { canvasData, replaceMessageAndGenerate, } = useAppContext(); - const [inputMsg, setInputMsg] = useState(''); - const inputRef = useRef(null); + const textarea = useOptimizedTextarea(prefilledMsg.content()); - const { extraContext, clearExtraContext } = useVSCodeContext( - inputRef, - setInputMsg - ); + const { extraContext, clearExtraContext } = useVSCodeContext(textarea); // TODO: improve this when we have "upload file" feature const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined; @@ -117,9 +131,10 @@ export default function ChatScreen() { }; const sendNewMessage = async () => { - if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return; - const lastInpMsg = inputMsg; - setInputMsg(''); + const lastInpMsg = textarea.value(); + if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? '')) + return; + textarea.setValue(''); scrollToBottom(false); setCurrNodeId(-1); // get the last message node @@ -128,13 +143,13 @@ export default function ChatScreen() { !(await sendMessage( currConvId, lastMsgNodeId, - inputMsg, + lastInpMsg, currExtra, onChunk )) ) { // restore the input message if failed - setInputMsg(lastInpMsg); + textarea.setValue(lastInpMsg); } // OK clearExtraContext(); @@ -172,6 +187,19 @@ export default function ChatScreen() { const hasCanvas = !!canvasData; + useEffect(() => { + if (prefilledMsg.shouldSend()) { + // send the prefilled message if needed + sendNewMessage(); + } else { + // otherwise, focus on the input + textarea.focus(); + } + prefilledMsg.clear(); + // no need to keep track of sendNewMessage + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [textarea.ref]); + // due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg) const pendingMsgDisplay: MessageDisplay[] = pendingMsg && messages.at(-1)?.msg.id !== pendingMsg.id @@ -224,9 +252,7 @@ export default function ChatScreen() {