116 lines
3.4 KiB
Python
116 lines
3.4 KiB
Python
import os
|
|
import tempfile
|
|
import pytest
|
|
from utils import *
|
|
|
|
# Module-level server handle shared by the tests in this file; the autouse
# `create_server` fixture rebinds it to a freshly configured preset before
# each test runs.
server = ServerPreset.tinyllama2()
|
|
|
|
class LogReader:
    """Incrementally read a log file that grows over time.

    Each call to :meth:`drain` returns only the text written since the
    previous call, so a test can check what one specific step logged.
    """

    def __init__(self, path):
        """Remember the log location and start reading from the beginning.

        Args:
            path: Filesystem path of the log file to follow.
        """
        self.path = path
        # Stream position reported by ``tell()`` after the most recent read.
        self.pos = 0

    def drain(self):
        """Return the text appended to the log since the previous call.

        The file is reopened on every call and read from the saved position
        to the current end of file; the saved position is then advanced.

        Returns:
            The newly appended text, or an empty string when nothing new
            has been written.
        """
        # Decode explicitly as UTF-8 and substitute undecodable bytes: the
        # platform default encoding is not guaranteed to be UTF-8, and a log
        # that is still being written may end in the middle of a multi-byte
        # character, which strict decoding would turn into a
        # UnicodeDecodeError instead of a readable test failure.
        with open(self.path, encoding="utf-8", errors="replace") as f:
            f.seek(self.pos)
            content = f.read()
            self.pos = f.tell()
        return content
|
|
|
|
@pytest.fixture(autouse=True)
def create_server():
    """Give every test a freshly configured server preset with its own log.

    Rebinds the module-level ``server`` to a new ``ServerPreset.tinyllama2()``
    instance, applies the settings these tests rely on and points
    ``server.log_path`` at a new temporary ``.log`` file. The server is not
    started here; each test calls ``server.start()`` itself.

    After the test finishes, the temporary log file is removed so repeated
    runs do not accumulate files in the temp directory.
    """
    global server
    server = ServerPreset.tinyllama2()
    server.n_slots = 2
    server.n_predict = 4
    server.temperature = 0.0
    server.server_slots = True
    server.cache_ram = 100
    server.kv_unified = True
    # NOTE(review): presumably debug logging is what emits the
    # __TEST_TAG_* markers the tests search for - confirm against the server.
    server.debug = True

    # mkstemp returns an open descriptor; only the path is needed, so close
    # the descriptor immediately.
    fd, log_path = tempfile.mkstemp(suffix='.log')
    os.close(fd)
    server.log_path = log_path

    yield

    # mkstemp leaves deletion to the caller. Removal is best-effort: if the
    # file is still held open (e.g. on Windows) or is already gone, the
    # cleanup failure must not mask the outcome of the test itself.
    try:
        os.remove(log_path)
    except OSError:
        pass
|
|
|
|
|
|
# Multi-sentence prompt shared by the tests in this file. Built from short
# fragments (each carrying its own trailing space) to keep lines readable.
LONG_PROMPT = "".join((
    "Once upon a time in a land far away, there lived a brave knight ",
    "who traveled across mountains and rivers to find the legendary ",
    "golden sword hidden deep within the enchanted forest of whispers. ",
    "He met many creatures along the way including dragons and fairies ",
    "and wizards who helped him on his noble quest to save the kingdom.",
))
|
|
|
|
|
|
def test_clear_and_restore():
    """An idle slot cleared on launch should restore from cache-ram.

    Steps:
      1. Start the server and check the clear-idle feature announces itself.
      2. Fill slot 0 with LONG_PROMPT; nothing may be cleared yet.
      3. Launch a request on slot 1; the log must show an idle slot cleared.
      4. Re-send LONG_PROMPT; the prompt cache must be used, so fewer prompt
         tokens are processed than in step 2.
      5. Send a follow-up extending LONG_PROMPT; no clearing may be logged.
    """
    global server
    clear_tag = "__TEST_TAG_CLEAR_IDLE_SLOT__"

    def complete(prompt, id_slot=None):
        # Issue one cached completion request and require HTTP 200.
        payload = {"prompt": prompt}
        if id_slot is not None:
            payload["id_slot"] = id_slot
        payload["cache_prompt"] = True
        response = server.make_request("POST", "/completion", data=payload)
        assert response.status_code == 200
        return response

    server.start()
    log = LogReader(server.log_path)

    # verify feature is enabled
    startup_output = log.drain()
    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in startup_output

    first = complete(LONG_PROMPT, id_slot=0)
    original_prompt_n = first.body["timings"]["prompt_n"]

    # Slot 0 is the only slot with KV — should NOT be cleared
    assert clear_tag not in log.drain()

    # Launching slot 1 clears idle slot 0
    complete("The quick brown fox", id_slot=1)
    assert clear_tag in log.drain()

    # Re-send same prompt — should restore from cache-ram
    restored = complete(LONG_PROMPT)
    assert "updating prompt cache" in log.drain()
    timings = restored.body["timings"]
    assert timings["cache_n"] > 0
    assert timings["prompt_n"] < original_prompt_n

    # Follow-up — slot 0 kept its KV, no clearing needed
    complete(LONG_PROMPT + " The knight finally reached the castle gates.")
    assert clear_tag not in log.drain()
|
|
|
|
|
|
def test_disabled_with_flag():
    """With ``no_clear_idle`` set, no idle-slot clearing may be logged.

    Starts the server with the flag enabled, checks the feature does not
    announce itself, then runs one request on slot 0 and one on slot 1 and
    checks that the log contains no idle-slot clearing marker.
    """
    global server
    server.no_clear_idle = True
    server.start()
    log = LogReader(server.log_path)

    # Feature should not be enabled
    startup_output = log.drain()
    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in startup_output

    # The second request targets a different slot — should NOT trigger clearing
    requests_to_send = (
        (LONG_PROMPT, 0),
        ("The quick brown fox", 1),
    )
    for prompt, slot_id in requests_to_send:
        payload = {
            "prompt": prompt,
            "id_slot": slot_id,
            "cache_prompt": True,
        }
        res = server.make_request("POST", "/completion", data=payload)
        assert res.status_code == 200

    logged = log.drain()
    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in logged
|