#!/usr/bin/env python3
"""
Test tool calling capability via chat completions endpoint.

Each test case contains:
  - tools: list of tool definitions (OpenAI-compatible)
  - messages: initial conversation messages
  - mock_tool_responses: dict mapping tool_name -> callable(arguments) -> str (JSON)
  - validate: callable(tool_calls_history, final_content) -> (passed: bool, reason: str)
"""

import argparse
import json
import sys
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple

try:
    import requests
except ImportError:
    # Keep the mocks and validators importable without the HTTP dependency;
    # main() and chat_completion() report the missing package explicitly.
    requests = None

ToolCall = Dict[str, Any]
ValidationResult = Tuple[bool, str]

# ---------------------------------------------------------------------------
# Color / formatting helpers
# ---------------------------------------------------------------------------

RESET = "\x1b[0m"
BOLD = "\x1b[1m"
DIM = "\x1b[2m"
# Foreground colors
CYAN = "\x1b[36m"
YELLOW = "\x1b[33m"
GREEN = "\x1b[32m"
RED = "\x1b[31m"
BLUE = "\x1b[34m"
WHITE = "\x1b[97m"

# Interior width (in characters) of the box drawn by print_header.
_BOX_WIDTH = 60


def _print(text="", end="\n"):
    """Write *text* followed by *end* to stdout and flush immediately."""
    sys.stdout.write(text + end)
    sys.stdout.flush()


def print_header(title):
    """Print *title* inside a three-row box whose rows all have the same width."""
    bar = "─" * _BOX_WIDTH
    # The title row holds one leading space, the title and the padding; together
    # they must span exactly _BOX_WIDTH columns so the right border lines up.
    padding = " " * max(0, _BOX_WIDTH - 1 - len(title))
    _print(f"\n{BOLD}{CYAN}┌{bar}┐{RESET}")
    _print(f"{BOLD}{CYAN}│ {WHITE}{title}{CYAN}{padding}│{RESET}")
    _print(f"{BOLD}{CYAN}└{bar}┘{RESET}")


def print_tool_call(name, args):
    """Print one tool invocation as ``name(<JSON arguments>)``."""
    args_str = json.dumps(args)
    _print(
        f"\n {BOLD}{YELLOW}⚙ tool call{RESET} {CYAN}{name}{RESET}{DIM}({args_str}){RESET}"
    )


def print_tool_result(result):
    """Print a tool result string, truncated to its first 160 characters."""
    preview = result[:160] + ("…" if len(result) > 160 else "")
    _print(f" {DIM}{BLUE}↳ result{RESET} {DIM}{preview}{RESET}")


def print_model_output(text):
    """Echo model text verbatim (no newline) so streamed chunks appear inline."""
    sys.stdout.write(text)
    sys.stdout.flush()


def _print_response_header():
    """Print the separator that introduces echoed model content."""
    _print(f"\n{DIM}{'·' * 60}{RESET}")
    _print(f"{DIM} model response:{RESET}\n")


def print_pass(reason):
    """Print a green PASS line followed by *reason*."""
    _print(f"\n{BOLD}{GREEN}✔ PASS{RESET} {reason}")


def print_fail(reason):
    """Print a red FAIL line followed by *reason*."""
    _print(f"\n{BOLD}{RED}✘ FAIL{RESET} {reason}")


def print_info(msg):
    """Print a dimmed informational line."""
    _print(f"{DIM}{msg}{RESET}")


# ---------------------------------------------------------------------------
# Tool-call helpers
# ---------------------------------------------------------------------------


def _parse_tool_arguments(raw) -> Dict[str, Any]:
    """Return tool-call arguments as a dict.

    Accepts either a JSON string or an already-decoded dict.

    Raises:
        ValueError: if *raw* is not decodable JSON (``json.JSONDecodeError`` is a
            ``ValueError``) or does not describe a JSON object.
    """
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, (str, bytes, bytearray)):
        raise ValueError(
            f"tool arguments must be a JSON object, got {type(raw).__name__}"
        )
    args = json.loads(raw)
    if not isinstance(args, dict):
        raise ValueError(
            f"tool arguments must be a JSON object, got {type(args).__name__}"
        )
    return args


def _merge_tool_call_delta(tool_calls: List[ToolCall], delta: Dict[str, Any]) -> None:
    """Fold one streamed tool-call fragment into *tool_calls* (in place).

    Fragments are matched to their call by ``index``; missing or null fields
    contribute nothing instead of raising.
    """
    idx = delta.get("index") or 0
    while len(tool_calls) <= idx:
        tool_calls.append(
            {
                "id": "",
                "type": "function",
                "function": {"name": "", "arguments": ""},
            }
        )
    call = tool_calls[idx]
    call["id"] += delta.get("id") or ""
    fn_delta = delta.get("function") or {}
    call["function"]["name"] += fn_delta.get("name") or ""
    call["function"]["arguments"] += fn_delta.get("arguments") or ""


def _read_stream(lines: Iterable[bytes]) -> Tuple[str, str, List[ToolCall]]:
    """Consume server-sent-event lines and assemble the streamed message.

    Content chunks are echoed to stdout as they arrive, preceded once by the
    response separator. Lines that are not ``data:`` events, and events whose
    payload is not a JSON object, are skipped. Reading stops at ``[DONE]``.

    Returns:
        ``(content, reasoning_content, tool_calls)``.
    """
    content_parts: List[str] = []
    reasoning_parts: List[str] = []
    tool_calls: List[ToolCall] = []

    for line in lines:
        if not line:
            continue
        decoded = line.decode("utf-8")
        # The space after the colon is optional in the SSE format.
        if not decoded.startswith("data:"):
            continue
        data_str = decoded[5:].strip()
        if data_str == "[DONE]":
            break
        try:
            data = json.loads(data_str)
        except json.JSONDecodeError:
            continue
        if not isinstance(data, dict):
            continue
        choices = data.get("choices") or []
        if not choices:
            continue
        delta = choices[0].get("delta") or {}

        if delta.get("reasoning_content"):
            reasoning_parts.append(delta["reasoning_content"])
        if delta.get("content"):
            if not content_parts:
                _print_response_header()
            content_parts.append(delta["content"])
            print_model_output(delta["content"])
        for tc_delta in delta.get("tool_calls") or []:
            _merge_tool_call_delta(tool_calls, tc_delta)

    return "".join(content_parts), "".join(reasoning_parts), tool_calls


# ---------------------------------------------------------------------------
# HTTP helpers
# ---------------------------------------------------------------------------


def chat_completion(url, messages, tools=None, stream=False):
    """POST one chat-completion request and return the assistant message.

    Args:
        url: Full URL of the chat completions endpoint.
        messages: Conversation so far (OpenAI-compatible message dicts).
        tools: Optional tool definitions; when given, ``tool_choice`` is "auto".
        stream: Whether to request and parse a streamed (SSE) response.

    Returns:
        A dict with ``content`` and ``tool_calls`` (plus ``reasoning_content``
        when the model produced any), or ``None`` if the request or the response
        parsing failed; the failure is reported through ``print_fail``.

    Raises:
        RuntimeError: if the ``requests`` package is not installed.
    """
    if requests is None:
        raise RuntimeError("the 'requests' package is required to contact the server")

    payload = {
        "messages": messages,
        "stream": stream,
        "max_tokens": 4096,
    }
    if tools:
        payload["tools"] = tools
        payload["tool_choice"] = "auto"

    try:
        response = requests.post(url, json=payload, stream=stream)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        body = e.response.content if (e.response is not None) else b""
        print_fail(f"Request error: {e} | body: {body}")
        return None

    full_content = ""
    reasoning_content = ""
    tool_calls: List[ToolCall] = []

    try:
        if stream:
            full_content, reasoning_content, tool_calls = _read_stream(
                response.iter_lines()
            )
        else:
            data = response.json()
            choices = data.get("choices") or []
            if choices:
                msg = choices[0].get("message") or {}
                full_content = msg.get("content") or ""
                reasoning_content = msg.get("reasoning_content") or ""
                tool_calls = msg.get("tool_calls") or []
            if full_content:
                _print_response_header()
                print_model_output(full_content)
    except (requests.exceptions.RequestException, ValueError) as e:
        # Connection dropped mid-stream, or the body was not valid JSON/UTF-8:
        # fail this request instead of aborting the whole run.
        print_fail(f"Failed to read response: {e}")
        return None
    finally:
        # Streaming stops reading at [DONE]; release the connection explicitly.
        response.close()

    result = {"content": full_content, "tool_calls": tool_calls}
    if reasoning_content:
        result["reasoning_content"] = reasoning_content
    return result


def run_agentic_loop(url, messages, tools, mock_tool_responses, stream, max_turns=6):
    """
    Drive the multi-turn tool-call loop:
      1. Send messages to model.
      2. If the model returns tool calls, execute mocks and append results.
      3. Repeat until no more tool calls or max_turns reached.

    Tool arguments that cannot be decoded into a JSON object are passed to the
    mock as an empty dict. Unknown tool names yield a JSON error result.

    Returns (all_tool_calls, final_content). final_content is None when a
    request failed or max_turns was exhausted while the model kept calling tools.
    """
    msgs = list(messages)
    all_tool_calls: List[ToolCall] = []

    for _ in range(max_turns):
        result = chat_completion(url, msgs, tools=tools, stream=stream)
        if result is None:
            return all_tool_calls, None

        tcs = result.get("tool_calls") or []
        content = result.get("content") or ""

        if not tcs:
            # Final answer; chat_completion has already echoed it.
            return all_tool_calls, content

        # Record tool calls for validation
        all_tool_calls.extend(tcs)

        # Append assistant message with tool calls
        assistant_msg: Dict[str, Any] = {
            "role": "assistant",
            "content": content,
            "tool_calls": tcs,
        }
        reasoning = result.get("reasoning_content")
        if reasoning:
            assistant_msg["reasoning_content"] = reasoning
        msgs.append(assistant_msg)

        # Execute each tool call via mock and append tool result messages
        for tc in tcs:
            tool_name = tc["function"]["name"]
            try:
                args = _parse_tool_arguments(tc["function"]["arguments"])
            except ValueError:
                args = {}

            print_tool_call(tool_name, args)

            mock_fn = mock_tool_responses.get(tool_name)
            if mock_fn:
                tool_result = mock_fn(args)
            else:
                tool_result = json.dumps({"error": f"Unknown tool: {tool_name}"})

            print_tool_result(tool_result)

            msgs.append(
                {
                    "role": "tool",
                    "tool_call_id": tc.get("id", ""),
                    "content": tool_result,
                }
            )

    return all_tool_calls, None


# ---------------------------------------------------------------------------
# Test case runner
# ---------------------------------------------------------------------------


def run_test(url, test_case, stream):
    """Run one test case in the given mode, print the verdict and return it."""
    name = test_case["name"]
    mode = "stream" if stream else "non-stream"
    print_header(f"{name} [{mode}]")

    all_tool_calls, final_content = run_agentic_loop(
        url,
        messages=test_case["messages"],
        tools=test_case["tools"],
        mock_tool_responses=test_case["mock_tool_responses"],
        stream=stream,
    )

    if final_content is None and not all_tool_calls:
        print_fail("No response from server.")
        return False

    passed, reason = test_case["validate"](all_tool_calls, final_content)
    if passed:
        print_pass(reason)
    else:
        print_fail(reason)
    return passed


# ---------------------------------------------------------------------------
# Test case definitions
# ---------------------------------------------------------------------------

# ---- Test 1: E-commerce multi-step search (Azzoo = anonymized marketplace) ----

_AZZOO_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "azzoo_search_products",
            "description": (
                "Search for products on Azzoo marketplace by keyword. "
                "Returns a list of matching products with IDs, titles, ratings and prices."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "Search keyword or phrase",
                    },
                    "page": {
                        "type": "string",
                        "description": "Page number (1-based)",
                        "default": "1",
                    },
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "azzoo_get_product",
            "description": "Retrieve detailed information about a specific Azzoo product including specs and price.",
            "parameters": {
                "type": "object",
                "properties": {
                    "product_id": {
                        "type": "string",
                        "description": "Azzoo product identifier (e.g. AZB12345)",
                    },
                },
                "required": ["product_id"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "azzoo_get_reviews",
            "description": "Fetch customer reviews for an Azzoo product.",
            "parameters": {
                "type": "object",
                "properties": {
                    "product_id": {
                        "type": "string",
                        "description": "Azzoo product identifier",
                    },
                    "page": {
                        "type": "string",
                        "description": "Review page number",
                        "default": "1",
                    },
                },
                "required": ["product_id"],
            },
        },
    },
]

_AZZOO_SEARCH_RESULT = {
    "results": [
        {
            "product_id": "AZB00001",
            "title": "SteelBrew Pro Kettle 1.7L",
            "rating": 4.6,
            "price": 34.99,
        },
        {
            "product_id": "AZB00002",
            "title": "HeatKeep Gooseneck Kettle",
            "rating": 4.3,
            "price": 27.50,
        },
        {
            "product_id": "AZB00003",
            "title": "QuickBoil Stainless Kettle",
            "rating": 4.1,
            "price": 21.00,
        },
    ]
}

_AZZOO_PRODUCT_RESULT = {
    "product_id": "AZB00001",
    "title": "SteelBrew Pro Kettle 1.7L",
    "price": 34.99,
    "rating": 4.6,
    "review_count": 2847,
    "specs": {
        "material": "18/8 stainless steel",
        "capacity": "1.7 L",
        "auto_shutoff": True,
        "keep_warm": "30 min",
        "warranty": "2 years",
    },
}

_AZZOO_REVIEWS_RESULT = {
    "product_id": "AZB00001",
    "average_rating": 4.6,
    "reviews": [
        {
            "rating": 5,
            "title": "Excellent build quality",
            "body": "Very sturdy, boils fast and stays warm longer than expected.",
        },
        {
            "rating": 5,
            "title": "Great for loose-leaf tea",
            "body": "The wide spout makes filling a teapot easy. No leaks after months of use.",
        },
        {
            "rating": 3,
            "title": "Minor lid issue",
            "body": "The lid doesn't always click shut properly, but overall happy with it.",
        },
        {
            "rating": 4,
            "title": "Good value",
            "body": "Heats quickly and the auto shutoff works reliably.",
        },
    ],
}


def _validate_azzoo(tcs, content) -> ValidationResult:
    """Require a search, a follow-up product/review lookup and a final summary."""
    names = [tc["function"]["name"] for tc in tcs]
    if not names:
        return False, "No tool calls made"
    if "azzoo_search_products" not in names:
        return False, f"Expected azzoo_search_products to be called, got: {names}"
    # After search the model should look up product details
    if "azzoo_get_product" not in names and "azzoo_get_reviews" not in names:
        return False, f"Expected follow-up product/review lookup, got: {names}"
    # Every product lookup must carry a non-empty product_id
    for tc in tcs:
        if tc["function"]["name"] != "azzoo_get_product":
            continue
        try:
            args = _parse_tool_arguments(tc["function"]["arguments"])
        except ValueError:
            return False, "azzoo_get_product arguments are not a valid JSON object"
        if not args.get("product_id", ""):
            return False, "azzoo_get_product called with empty product_id"
    if not content:
        return False, "No final summary produced"
    # Call order is not verified here, so the message does not claim it.
    return True, f"All expected tools called: {names}"


AZZOO_TEST_CASE = {
    "name": "Azzoo E-commerce: search -> product detail -> reviews",
    "messages": [
        {
            "role": "user",
            "content": (
                "I need a durable stainless steel tea kettle for my weekly tea gatherings. "
                "Please search Azzoo for 'stainless steel tea kettle', then get full details "
                "on the top-rated result, and finally fetch its customer reviews so I can "
                "check for recurring complaints. Give me a summary with pros and cons."
            ),
        }
    ],
    "tools": _AZZOO_TOOLS,
    "mock_tool_responses": {
        "azzoo_search_products": lambda _: json.dumps(_AZZOO_SEARCH_RESULT),
        "azzoo_get_product": lambda _: json.dumps(_AZZOO_PRODUCT_RESULT),
        "azzoo_get_reviews": lambda _: json.dumps(_AZZOO_REVIEWS_RESULT),
    },
    "validate": _validate_azzoo,
}


# ---- Test 2: Fitness BMI + exercise recommendations ----

_FITNESS_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "calculate_bmi",
            "description": "Calculate Body Mass Index (BMI) from weight and height.",
            "parameters": {
                "type": "object",
                "properties": {
                    "weight_kg": {
                        "type": "number",
                        "description": "Body weight in kilograms",
                    },
                    "height_m": {"type": "number", "description": "Height in meters"},
                },
                "required": ["weight_kg", "height_m"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_exercises",
            "description": (
                "Fetch a list of exercises filtered by muscle group, difficulty, category, "
                "and/or force type."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "muscle": {
                        "type": "string",
                        "description": "Target muscle group (e.g. chest, back, legs)",
                    },
                    "difficulty": {
                        "type": "string",
                        "description": "Difficulty level: beginner, intermediate, expert",
                    },
                    "category": {
                        "type": "string",
                        "description": "Exercise category (e.g. strength, cardio, stretching)",
                    },
                    "force": {
                        "type": "string",
                        "description": "Force type: push, pull, static",
                    },
                },
                "required": [],
            },
        },
    },
]

_BMI_RESULT = {"bmi": 24.5, "category": "Normal weight", "healthy_range": "18.5 – 24.9"}

_EXERCISES_RESULT = {
    "exercises": [
        {
            "name": "Push-Up",
            "muscle": "chest",
            "difficulty": "beginner",
            "equipment": "none",
            "instructions": "Keep body straight, lower chest to floor.",
        },
        {
            "name": "Incline Dumbbell Press",
            "muscle": "chest",
            "difficulty": "beginner",
            "equipment": "dumbbells, bench",
            "instructions": "Press dumbbells up from chest on incline bench.",
        },
        {
            "name": "Chest Fly (cables)",
            "muscle": "chest",
            "difficulty": "beginner",
            "equipment": "cable machine",
            "instructions": "Bring cables together in an arc motion.",
        },
    ]
}


def _validate_fitness(tcs, content) -> ValidationResult:
    """Require a calculate_bmi call with plausible numbers and a final plan."""
    names = [tc["function"]["name"] for tc in tcs]
    if not names:
        return False, "No tool calls made"
    if "calculate_bmi" not in names:
        return False, f"Expected calculate_bmi to be called, got: {names}"
    # Validate BMI args contain plausible values
    for tc in tcs:
        if tc["function"]["name"] != "calculate_bmi":
            continue
        try:
            args = _parse_tool_arguments(tc["function"]["arguments"])
            w = args.get("weight_kg")
            h = args.get("height_m")
            if w is None or h is None:
                return False, f"calculate_bmi missing weight_kg or height_m: {args}"
            if not (50 <= float(w) <= 200):
                return False, f"calculate_bmi weight out of plausible range: {w}"
            if not (1.0 <= float(h) <= 2.5):
                return False, f"calculate_bmi height out of plausible range: {h}"
        except (ValueError, TypeError) as e:
            # ValueError: undecodable JSON or non-numeric string;
            # TypeError: float() given a list/dict.
            return False, f"calculate_bmi argument error: {e}"
    if not content:
        return False, "No final plan produced"
    return True, f"Tools called: {names}"


FITNESS_TEST_CASE = {
    "name": "Fitness: BMI calculation + exercise suggestions",
    "messages": [
        {
            "role": "user",
            "content": (
                "I'm a 32-year-old male, 78 kg and 1.80 m tall. "
                "Please calculate my BMI and then suggest some beginner chest exercises I can do "
                "to build strength. Give me a short personalised plan."
            ),
        }
    ],
    "tools": _FITNESS_TOOLS,
    "mock_tool_responses": {
        "calculate_bmi": lambda _: json.dumps(_BMI_RESULT),
        "get_exercises": lambda _: json.dumps(_EXERCISES_RESULT),
    },
    "validate": _validate_fitness,
}


# ---- Test 3: Community class planning (anonymised cooking/topic discovery) ----

_COMMUNITY_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_trending_questions",
            "description": (
                "Fetch commonly asked questions on a topic from search engine 'People Also Ask' boxes."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "Topic to search for"},
                    "max_results": {
                        "type": "integer",
                        "description": "Maximum questions to return",
                        "default": 10,
                    },
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_mobile_apps",
            "description": "Search the mobile app store for apps matching a category or keyword.",
            "parameters": {
                "type": "object",
                "properties": {
                    "keyword": {
                        "type": "string",
                        "description": "Search keyword (e.g. 'Italian cooking')",
                    },
                    "platform": {
                        "type": "string",
                        "enum": ["ios", "android", "both"],
                        "default": "both",
                    },
                    "max_results": {
                        "type": "integer",
                        "description": "Number of results",
                        "default": 10,
                    },
                },
                "required": ["keyword"],
            },
        },
    },
]

_TRENDING_QUESTIONS_RESULT = {
    "query": "Italian cuisine",
    "questions": [
        "What are the most popular Italian dishes?",
        "What makes Italian food different from other cuisines?",
        "How do you make authentic Italian pasta from scratch?",
        "What are traditional Italian desserts?",
        "What herbs are commonly used in Italian cooking?",
        "Is Italian food healthy?",
        "What wine pairs best with Italian pasta?",
    ],
}

_APPS_RESULT = {
    "keyword": "Italian cooking",
    "results": [
        {
            "name": "PastaPro",
            "rating": 4.5,
            "installs": "500K+",
            "focus": "pasta recipes only",
        },
        {
            "name": "CookEasy",
            "rating": 4.2,
            "installs": "1M+",
            "focus": "general cooking, limited Italian content",
        },
        {
            "name": "ItalianKitchen",
            "rating": 3.8,
            "installs": "100K+",
            "focus": "regional Italian recipes, no video",
        },
    ],
}


def _validate_community(tcs, content) -> ValidationResult:
    """Require both discovery tools to be called and a final suggestion."""
    names = [tc["function"]["name"] for tc in tcs]
    if not names:
        return False, "No tool calls made"
    missing = [
        t for t in ("get_trending_questions", "search_mobile_apps") if t not in names
    ]
    if missing:
        return False, f"Missing expected tool calls: {missing}; got: {names}"
    if not content:
        return False, "No class suggestion produced"
    return True, f"Both discovery tools called: {names}"


COMMUNITY_CLASS_TEST_CASE = {
    "name": "Community class planning: trending topics + app gap analysis",
    "messages": [
        {
            "role": "user",
            "content": (
                "I want to start teaching Italian cooking classes at my community centre. "
                "First, find out what people commonly ask about Italian cuisine online. "
                "Then search for existing Italian cooking apps to see what they cover. "
                "Use both results to suggest three unique angles for my classes that fill gaps "
                "in what apps already offer."
            ),
        }
    ],
    "tools": _COMMUNITY_TOOLS,
    "mock_tool_responses": {
        "get_trending_questions": lambda _: json.dumps(_TRENDING_QUESTIONS_RESULT),
        "search_mobile_apps": lambda _: json.dumps(_APPS_RESULT),
    },
    "validate": _validate_community,
}


# ---- Test 4: Multi-hostname geolocation filter (anonymized gallery discovery) ----
# Inspired by: checking gallery website server locations to find truly remote venues.
# Anonymized: galleryone.de → halle-eins.de, gallerytwo.fr → galerie-deux.fr,
#             gallerythree.it → galleria-tre.it

_GEO_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "lookup_ip_geolocation",
            "description": (
                "Retrieve geolocation data for an IP address or hostname, including country, "
                "city, coordinates, and network info. Useful for verifying physical server "
                "locations or personalising regional content."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "host": {
                        "type": "string",
                        "description": "IP address or hostname to look up (e.g. '8.8.8.8' or 'example.com').",
                    },
                },
                "required": ["host"],
            },
        },
    },
]

# Mock: one urban (Berlin → discard), two rural (keep)
_GEO_RESPONSES = {
    "halle-eins.de": {
        "host": "halle-eins.de",
        "city": "Berlin",
        "country": "DE",
        "lat": 52.5200,
        "lon": 13.4050,
        "is_major_city": True,
    },
    "galerie-deux.fr": {
        "host": "galerie-deux.fr",
        "city": "Rocamadour",
        "country": "FR",
        "lat": 44.7994,
        "lon": 1.6178,
        "is_major_city": False,
    },
    "galleria-tre.it": {
        "host": "galleria-tre.it",
        "city": "Matera",
        "country": "IT",
        "lat": 40.6664,
        "lon": 16.6044,
        "is_major_city": False,
    },
}


def _normalize_host(host) -> str:
    """Return *host* trimmed and lower-cased; non-strings and None become ""."""
    return host.strip().lower() if isinstance(host, str) else ""


def _geo_mock(args):
    """Return the canned geolocation for args["host"], or a JSON error object.

    Lookup ignores case and surrounding whitespace, mirroring how
    _ev_stock_mock normalises its symbol.
    """
    host = args.get("host", "")
    record = _GEO_RESPONSES.get(_normalize_host(host))
    if record is None:
        record = {"error": f"unknown host: {host}"}
    return json.dumps(record)


def _validate_geo(tcs, content) -> ValidationResult:
    """Require a geolocation lookup for each of the three domains and a summary."""
    names = [tc["function"]["name"] for tc in tcs]
    if not names:
        return False, "No tool calls made"
    # Expect at least one geolocation call per domain (3 or more in total)
    geo_calls = [tc for tc in tcs if tc["function"]["name"] == "lookup_ip_geolocation"]
    if len(geo_calls) < 3:
        return (
            False,
            f"Expected geolocation called 3 times (once per domain), got {len(geo_calls)}",
        )
    queried_hosts = set()
    for tc in geo_calls:
        try:
            args = _parse_tool_arguments(tc["function"]["arguments"])
        except ValueError:
            return False, "lookup_ip_geolocation arguments are not a valid JSON object"
        host = _normalize_host(args.get("host", ""))
        if not host:
            return False, f"lookup_ip_geolocation called with empty host: {args}"
        queried_hosts.add(host)
    expected = {"halle-eins.de", "galerie-deux.fr", "galleria-tre.it"}
    if not expected.issubset(queried_hosts):
        return (
            False,
            f"Not all domains queried. Expected {expected}, got {queried_hosts}",
        )
    if not content:
        return False, "No final summary produced"
    return True, f"All 3 domains geolocated: {sorted(queried_hosts)}"


GEO_TEST_CASE = {
    "name": "Gallery geolocation: filter urban venues, keep remote ones",
    "messages": [
        {
            "role": "user",
            "content": (
                "I have abstract paintings to exhibit in remote European galleries. "
                "I received enquiries from three venues: halle-eins.de, galerie-deux.fr, "
                "and galleria-tre.it. Please look up the geolocation of each website's server. "
                "Discard any venue whose server is in a major city (e.g. Berlin, Paris, Rome). "
                "For the remaining venues, report their exact coordinates so I can check "
                "whether hiking trails are nearby — my work thrives where nature and art meet."
            ),
        }
    ],
    "tools": _GEO_TOOLS,
    "mock_tool_responses": {
        "lookup_ip_geolocation": _geo_mock,
    },
    "validate": _validate_geo,
}


# ---- Test 5: EV fleet expansion — stock → security → property → video ----
# Inspired by: multi-step business analysis combining finance, cybersecurity,
#              real estate and educational content.
# Anonymized: Tesla → Voltara (VLTR), Rivian → Rivex (RVXN),
#             Trenton → Halverton

_EV_TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "get_stock_quote",
            "description": "Retrieve the latest market quote for a financial instrument by ticker symbol.",
            "parameters": {
                "type": "object",
                "properties": {
                    "symbol": {
                        "type": "string",
                        "description": "Ticker symbol (e.g. 'VLTR', 'RVXN')",
                    },
                    "interval": {
                        "type": "string",
                        "description": "Time interval: 1min, 5min, 1h, 1day, 1week",
                        "default": "1day",
                    },
                },
                "required": ["symbol"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_security_advisories",
            "description": (
                "Fetch current cybersecurity advisories from the national security agency, "
                "covering known vulnerabilities and exploits for industrial and consumer systems."
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "keyword": {
                        "type": "string",
                        "description": "Filter advisories by keyword or product name",
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Maximum number of advisories to return",
                        "default": 5,
                    },
                },
                "required": [],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_commercial_properties",
            "description": "Search for commercial properties (offices, garages, warehouses) available for rent or sale in a given city.",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name to search in"},
                    "property_type": {
                        "type": "string",
                        "description": "Type of property: office, garage, warehouse, premises",
                    },
                    "operation": {
                        "type": "string",
                        "enum": ["rent", "sale"],
                        "default": "rent",
                    },
                    "max_price": {
                        "type": "integer",
                        "description": "Maximum monthly rent or sale price",
                    },
                },
                "required": ["city", "property_type"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_video_recommendations",
            "description": "Fetch a list of recommended videos related to a given topic or reference video.",
            "parameters": {
                "type": "object",
                "properties": {
                    "topic": {
                        "type": "string",
                        "description": "Topic or keyword to search for related videos",
                    },
                },
                "required": ["topic"],
            },
        },
    },
]

_STOCK_RESULT_VLTR = {
    "symbol": "VLTR",
    "company": "Voltara Inc.",
    "price": 218.45,
    "change_pct": "+2.3%",
    "market_cap": "694B",
    "currency": "USD",
}

_STOCK_RESULT_RVXN = {
    "symbol": "RVXN",
    "company": "Rivex Motors",
    "price": 12.80,
    "change_pct": "-1.1%",
    "market_cap": "11B",
    "currency": "USD",
}

_ADVISORIES_RESULT = {
    "count": 2,
    "advisories": [
        {
            "id": "ICSA-24-102-01",
            "title": "Voltara In-Vehicle Infotainment System Authentication Bypass",
            "severity": "Medium",
            "summary": "Improper authentication in the OTA update module may allow an adjacent attacker to install unsigned firmware.",
            "published": "2024-04-11",
        },
        {
            "id": "ICSA-24-085-03",
            "title": "Voltara Charging Management API Input Validation Flaw",
            "severity": "Low",
            "summary": "Insufficient input validation in the charging session API could expose internal error messages.",
            "published": "2024-03-26",
        },
    ],
}

_PROPERTIES_RESULT = {
    "city": "Halverton",
    "listings": [
        {
            "id": "HV-0041",
            "type": "garage",
            "area_sqm": 420,
            "monthly_rent": 2800,
            "ev_power_outlets": 12,
            "address": "14 Ironworks Lane, Halverton",
        },
        {
            "id": "HV-0089",
            "type": "warehouse",
            "area_sqm": 900,
            "monthly_rent": 4200,
            "ev_power_outlets": 30,
            "address": "7 Depot Road, Halverton",
        },
    ],
}

_VIDEOS_RESULT = {
    "topic": "fleet electrification",
    "recommendations": [
        {
            "title": "How to Build an EV Fleet from Scratch",
            "channel": "Fleet Future",
            "views": "182K",
        },
        {
            "title": "EV Charging Infrastructure for Commercial Fleets",
            "channel": "GreenDrive Pro",
            "views": "94K",
        },
        {
            "title": "Total Cost of Ownership: Electric vs Diesel Vans",
            "channel": "LogisticsTech",
            "views": "61K",
        },
    ],
}

_STOCK_RESULTS = {
    "VLTR": _STOCK_RESULT_VLTR,
    "RVXN": _STOCK_RESULT_RVXN,
}


def _ev_stock_mock(args):
    """Return the canned quote for args["symbol"] (case-insensitive) or a JSON error."""
    # A null or non-string symbol is reported as unknown rather than crashing.
    symbol = str(args.get("symbol") or "").upper()
    quote = _STOCK_RESULTS.get(symbol)
    if quote is None:
        return json.dumps({"error": f"Unknown symbol: {symbol}"})
    return json.dumps(quote)


def _validate_ev(tcs, content) -> ValidationResult:
    """Require get_stock_quote first, all four pipeline tools and a final summary."""
    names = [tc["function"]["name"] for tc in tcs]
    if not names:
        return False, "No tool calls made"
    # Stock quote must come first
    if names[0] != "get_stock_quote":
        return False, f"Expected get_stock_quote to be called first, got: {names[0]}"
    stock_calls = [tc for tc in tcs if tc["function"]["name"] == "get_stock_quote"]
    for tc in stock_calls:
        try:
            args = _parse_tool_arguments(tc["function"]["arguments"])
        except ValueError:
            return False, "get_stock_quote arguments are not a valid JSON object"
        if not args.get("symbol", ""):
            return False, f"get_stock_quote called with empty symbol: {args}"
    # All four pipeline tools expected
    required = [
        "get_stock_quote",
        "get_security_advisories",
        "search_commercial_properties",
        "get_video_recommendations",
    ]
    missing = [t for t in required if t not in names]
    if missing:
        return False, f"Missing pipeline steps: {missing}"
    if not content:
        return False, "No final summary produced"
    return True, f"Full 4-step pipeline executed: {names}"


EV_FLEET_TEST_CASE = {
    "name": "EV fleet expansion: stock → cybersecurity → property → videos",
    "messages": [
        {
            "role": "user",
            "content": (
                "I'm expanding my courier business into electric vehicles and need a multi-step analysis:\n"
                "1. Get the latest stock quote for Voltara (VLTR) and Rivex (RVXN). "
                "If either is above $50, continue with that company.\n"
                "2. Search for cybersecurity advisories related to that company's vehicle models "
                "to understand any tech risks.\n"
                "3. Find commercial garage or warehouse properties in Halverton suitable for "
                "EV charging infrastructure.\n"
                "4. Recommend videos on fleet electrification strategies.\n"
                "Please work through all four steps and give me a concise summary."
            ),
        }
    ],
    "tools": _EV_TOOLS,
    "mock_tool_responses": {
        "get_stock_quote": _ev_stock_mock,
        "get_security_advisories": lambda _: json.dumps(_ADVISORIES_RESULT),
        "search_commercial_properties": lambda _: json.dumps(_PROPERTIES_RESULT),
        "get_video_recommendations": lambda _: json.dumps(_VIDEOS_RESULT),
    },
    "validate": _validate_ev,
}


# ---------------------------------------------------------------------------
# All test cases
# ---------------------------------------------------------------------------

ALL_TEST_CASES = [
    AZZOO_TEST_CASE,
    FITNESS_TEST_CASE,
    COMMUNITY_CLASS_TEST_CASE,
    GEO_TEST_CASE,
    EV_FLEET_TEST_CASE,
]


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------


def main(argv: Optional[List[str]] = None):
    """Parse options, run the selected test cases and exit with the outcome.

    Args:
        argv: Command-line arguments without the program name; ``None`` (the
            default) makes argparse read ``sys.argv``.

    Exits with status 0 when every test passed and 1 otherwise; argparse exits
    with status 2 on invalid options, including --no-stream combined with
    --stream-only, which would otherwise select no test mode at all.
    """
    parser = argparse.ArgumentParser(
        description="Test llama-server tool-calling capability."
    )
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", default=8080, type=int)
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        "--no-stream", action="store_true", help="Disable streaming mode tests"
    )
    mode_group.add_argument(
        "--stream-only", action="store_true", help="Only run streaming mode tests"
    )
    parser.add_argument(
        "--test",
        help="Run only the test whose name contains this substring (case-insensitive)",
    )
    args = parser.parse_args(argv)

    url = f"http://{args.host}:{args.port}/v1/chat/completions"
    print_info(f"Testing server at {url}")

    modes = []
    if not args.stream_only:
        modes.append(False)
    if not args.no_stream:
        modes.append(True)

    cases = ALL_TEST_CASES
    if args.test:
        name_filter = args.test.lower()
        cases = [c for c in cases if name_filter in str(c["name"]).lower()]
        if not cases:
            print_fail(f"No test cases matched '{args.test}'")
            sys.exit(1)

    if requests is None:
        print_fail("The 'requests' package is required (pip install requests).")
        sys.exit(1)

    total = 0
    passed = 0
    for stream in modes:
        for case in cases:
            total += 1
            if run_test(url, case, stream=stream):
                passed += 1

    color = GREEN if passed == total else RED
    _print(f"\n{BOLD}{color}{'─' * 60}{RESET}")
    _print(f"{BOLD}{color} Results: {passed}/{total} passed{RESET}")
    _print(f"{BOLD}{color}{'─' * 60}{RESET}\n")
    sys.exit(0 if passed == total else 1)


if __name__ == "__main__":
    main()