# llama.cpp/examples/server/pythonClient/LlamaCppApi.py

import requests
import json

class LlamaCppApi:
    """
    LlamaCppApi provides a Pythonic interface to interact with a server offering
    various Natural Language Processing (NLP) endpoints, including text generation,
    tokenization, detokenization, embedding, and server health checks.

    Failed requests are reported with ``print`` and signalled by a ``None``
    return value rather than by raising.

    :param base_url: The base URL of the NLP server API.
    :param api_key: An optional API key for authentication with the server.
    :param timeout: Optional value forwarded as the ``timeout`` argument of
        every HTTP request. ``None`` (the default) sets no timeout.
    """

    # Prefix that marks a payload line in a streamed response.
    _SSE_DATA_PREFIX = "data: "
    # End-of-stream marker sent by OpenAI-style streaming endpoints; it is not JSON.
    _SSE_DONE_SENTINEL = "[DONE]"

    def __init__(self, base_url: str, api_key: str = None, timeout: float = None):
        self.base_url = base_url
        self.timeout = timeout
        self.headers = {'Content-Type': 'application/json'}
        if api_key:
            self.headers['Authorization'] = f'Bearer {api_key}'

    def _build_url(self, endpoint: str) -> str:
        """
        Joins the base URL and an endpoint with exactly one slash between them.

        :param endpoint: The endpoint path, with or without a leading slash.
        :return: The full request URL.
        """
        return f"{self.base_url.rstrip('/')}/{endpoint.lstrip('/')}"

    @staticmethod
    def _iter_lines_and_close(response):
        """
        Yields the decoded lines of a streamed response and closes the response
        once iteration finishes or is abandoned part-way through.

        :param response: A response obtained with ``stream=True``.
        """
        try:
            yield from response.iter_lines(decode_unicode=True)
        finally:
            response.close()

    @classmethod
    def _sse_payload(cls, line: str):
        """
        Extracts the payload text from one line of a streamed response.

        :param line: A single decoded line of the stream.
        :return: The text after the ``data: `` prefix, or None when the line
            has no such prefix or only carries the ``[DONE]`` sentinel.
        """
        if not line.startswith(cls._SSE_DATA_PREFIX):
            return None
        payload = line[len(cls._SSE_DATA_PREFIX):]
        if payload.strip() == cls._SSE_DONE_SENTINEL:
            return None
        return payload

    def _send_request(self, method: str, endpoint: str, data: dict = None, params: dict = None, stream: bool = False):
        """
        Sends an HTTP request to the specified endpoint and handles the response,
        including streaming responses.

        :param method: The HTTP method to use ('get' or 'post').
        :param endpoint: The API endpoint to send the request to.
        :param data: The JSON payload for 'post' requests.
        :param params: The query parameters for 'get' requests.
        :param stream: Whether to stream the response.
        :return: For ``stream=True`` an iterator over the decoded response lines;
            otherwise the HTTP response object (call ``.json()`` on it to get
            the decoded body). None if the request failed or the server
            answered with an error status.
        """
        url = self._build_url(endpoint)
        response = None
        try:
            response = requests.request(
                method,
                url,
                headers=self.headers,
                json=data,
                params=params,
                stream=stream,
                timeout=self.timeout,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            # A streamed response that failed the status check still holds its
            # connection open; release it before reporting the failure.
            if response is not None:
                response.close()
            print(f"Request to {url} failed: {e}")
            return None

        if not stream:
            return response

        # Without an explicit charset the guessed encoding is either a legacy
        # single-byte one (garbling non-ASCII text) or missing altogether (in
        # which case lines come back as bytes). Streamed events are UTF-8.
        if "charset" not in response.headers.get("Content-Type", "").lower():
            response.encoding = "utf-8"
        return self._iter_lines_and_close(response)

    def post_completion(self, prompt: str, options: dict = None):
        """
        Requests text completion from the server.

        :param prompt: The input text to generate completions for.
        :param options: Additional options for controlling generation; merged
            into the request payload next to ``prompt``.
        :return: The server's HTTP response, or None on failure.
        """
        return self._send_request('post', 'completion', data={"prompt": prompt, **(options or {})})

    def post_tokenize(self, content: str, options: dict = None):
        """
        Requests tokenization of the provided content.

        :param content: The text content to tokenize.
        :param options: Additional options for the tokenization request.
        :return: The server's HTTP response, or None on failure.
        """
        return self._send_request('post', 'tokenize', data={"content": content, **(options or {})})

    def post_detokenize(self, tokens: list, options: dict = None):
        """
        Requests detokenization of the provided tokens.

        :param tokens: The list of tokens to detokenize.
        :param options: Additional options for the detokenization request.
        :return: The server's HTTP response, or None on failure.
        """
        return self._send_request('post', 'detokenize', data={"tokens": tokens, **(options or {})})

    def post_embedding(self, content: str, options: dict = None):
        """
        Requests embeddings for the provided content.

        :param content: The text content to generate embeddings for.
        :param options: Additional options for the embedding request.
        :return: The server's HTTP response, or None on failure.
        """
        return self._send_request('post', 'embedding', data={"content": content, **(options or {})})

    def get_health(self, options: dict = None):
        """
        Checks the health of the server.

        :param options: Additional options, sent as query parameters.
        :return: The server's HTTP response, or None on failure.
        """
        return self._send_request('get', 'health', params=options)

    def stream_response(self, endpoint: str, data: dict = None, chunk_callback = None):
        """
        Handles streaming responses for endpoints that support it, invoking the provided
        callback function for each received chunk of data.

        Only lines starting with ``data: `` are processed. The ``[DONE]``
        sentinel is skipped silently; any other payload that is not valid JSON
        is reported with ``print`` and skipped. The request payload is sent
        exactly as given.

        :param endpoint: The API endpoint to send the streaming request to.
        :param data: The request data for streaming endpoints.
        :param chunk_callback: The callback function invoked with each received chunk.
        :return: The (by then fully consumed) line iterator, or None if the
            request failed.
        """
        # An omitted payload is still sent as an empty JSON object.
        payload = {} if data is None else data
        response_stream = self._send_request('post', endpoint, data=payload, stream=True)
        if response_stream:
            for line in response_stream:
                chunk_text = self._sse_payload(line)
                if chunk_text is None:
                    continue
                try:
                    json_data = json.loads(chunk_text)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON from streaming response: {e}")
                    continue
                # Called outside the try so decode errors raised by the callback
                # itself are not mistaken for malformed stream data.
                if callable(chunk_callback):
                    chunk_callback(json_data)
        return response_stream

# Example usage of the LlamaCppApi client
if __name__ == "__main__":
    client = LlamaCppApi(base_url="https://localhost:8080", api_key="YourAPIKey")

    # Requesting text completion with specific options
    prompt = "The meaning of life is"
    options = {"temperature": 0.5, "max_tokens": 50}
    completion_response = client.post_completion(prompt, options=options)

    # post_completion hands back the HTTP response object (or None when the
    # request failed), so decode the body before printing it.
    if completion_response is None:
        print("Completion request failed.")
    else:
        try:
            print("Completion response:", completion_response.json())
        except ValueError:
            # The body was not valid JSON; show it verbatim instead.
            print("Completion response:", completion_response.text)