From 154b87e9aa1256d4297b8057489dd20106b07960 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Wed, 4 Feb 2026 12:58:47 +0100
Subject: [PATCH 1/2] scripts: update corpus of compare-logprobs

---
 scripts/compare-logprobs.py | 40 ++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/scripts/compare-logprobs.py b/scripts/compare-logprobs.py
index 63861dd9a4..c7775a7917 100644
--- a/scripts/compare-logprobs.py
+++ b/scripts/compare-logprobs.py
@@ -25,16 +25,12 @@ Example usage:
 """
 
 
-def generate_input_prompt(length: int) -> list[str]:
-    CORPUS = """
-    You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls.
-
-    ### Tool Call Format:
-    When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text.
-
-    You can make multiple calls in one go by placing them one after another.
-    """
-    words = [w.strip() for w in CORPUS.strip().split(" ")]
+def get_remote_corpus(url: str, length: int) -> list[str]:
+    response = requests.get(url)
+    response.raise_for_status()
+    corpus = response.text
+    words = [w.strip() for w in corpus.strip().split(" ")]
+    words = [w if "<" in w else "TEST" for w in words]  # make sure nothing looks like a special token
     words = [w for w in words if len(w) > 0]  # filter out empty strings
     while len(words) < length:
         words += words
@@ -226,9 +222,9 @@ def parse_args() -> argparse.Namespace:
     )
     parser_dump.add_argument(
         "--file",
-        type=Path,
-        default=None,
-        help="File containing prompt to use instead of the default",
+        type=str,
+        default="https://raw.githubusercontent.com/ggml-org/llama.cpp/eaba92c3dcc980ebe753348855d4a5d75c069997/tools/server/README.md",
+        help="File containing prompt to use instead of the default (can also be an URL)",
     )
     parser_dump.add_argument(
         "--pattern",
@@ -259,17 +255,19 @@ def main():
 
     if args.verb == "dump":
         pattern = parse_pattern(args.pattern)
-        input_length = sum(n for _, n in pattern)
-        input_words = generate_input_prompt(input_length)
-        if args.file is not None:
-            with args.file.open("r") as f:
+        required_words = sum(n for _, n in pattern)
+        if args.file.startswith("http"):
+            input_words = get_remote_corpus(args.file, required_words)
+            logger.info(f"Fetched {len(input_words)} words from remote {args.file}")
+        else:
+            with open(args.file, "r") as f:
                 input_words = f.read().strip().split(" ")
-                if input_length < sum(n for _, n in pattern):
+                input_words = [w for w in input_words if len(w) > 0]  # filter out empty strings
+                if len(input_words) < required_words:
                     raise ValueError(
-                        f"Input file has only {input_length} words, but pattern requires at least {input_length} words."
+                        f"Input file has only {len(input_words)} words, but pattern requires at least {required_words} words."
                     )
-                input_length = len(input_words)
-        logger.info(f"Using {input_length} words")
+        logger.info(f"Using {len(input_words)} words")
         dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key)
     elif args.verb == "compare":
         compare_logits(args.input1, args.input2, args.output)

From f8b02b56a9a08c975a36adf2c9fb1b263a8552a8 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Wed, 4 Feb 2026 13:05:33 +0100
Subject: [PATCH 2/2] scripts: fix special-token filter in compare-logprobs

---
 scripts/compare-logprobs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/compare-logprobs.py b/scripts/compare-logprobs.py
index c7775a7917..ac10085b78 100644
--- a/scripts/compare-logprobs.py
+++ b/scripts/compare-logprobs.py
@@ -30,7 +30,7 @@ def get_remote_corpus(url: str, length: int) -> list[str]:
     response.raise_for_status()
     corpus = response.text
     words = [w.strip() for w in corpus.strip().split(" ")]
-    words = [w if "<" in w else "TEST" for w in words]  # make sure nothing looks like a special token
+    words = [w for w in words if "<" not in w] # make sure nothing looks like special tokens
     words = [w for w in words if len(w) > 0]  # filter out empty strings
     while len(words) < length:
         words += words