"""Synchronous OpenAI-compatible vision (VLM) client.
The OpenAI chat completions endpoint accepts vision input via multimodal
content blocks (``{"type": "image_url", "image_url": {"url": ...}}``);
this module wraps that pattern in a small, focused interface so memory
adapters and other callers can hand off an image (bytes or path) and get
back a textual description.
The client is deliberately *not* part of :class:`Provider` — VLM and chat
endpoints frequently live under different model namespaces (e.g. GLM
splits ``glm-5.x`` chat from ``glm-4.6v`` vision), and the credentials
may differ.
"""
from __future__ import annotations
import base64
import mimetypes
import os
from dataclasses import dataclass, replace
from pathlib import Path
from typing import TYPE_CHECKING, Any
from openai import (
APIConnectionError,
APITimeoutError,
InternalServerError,
OpenAI,
RateLimitError,
)
from rath.llm.credentials import resolve_credential
from rath.llm.retry import retry_with_backoff
if TYPE_CHECKING:
from rath.config.store import ConfigStore
__all__ = ["VLMProvider", "RathOpenAIVLMClient"]
# Exception types that ``RathOpenAIVLMClient.describe`` passes to
# ``retry_with_backoff`` as ``retryable=``: rate limiting, connection
# failures, timeouts, and internal server errors raised by the OpenAI SDK.
_VLM_RETRYABLE: tuple[type[BaseException], ...] = (
RateLimitError,
APIConnectionError,
APITimeoutError,
InternalServerError,
)
[docs]
@dataclass(frozen=True, kw_only=True, slots=True)
class VLMProvider:
"""Routing + credentials for an OpenAI-compatible vision endpoint."""
model: str
base_url: str | None = None
api_key: str | None = None
max_tokens: int | None = 512
temperature: float | None = None
retry_max_attempts: int | None = None
retry_base_seconds: float | None = None
def __str__(self) -> str:
return self.model
def __repr__(self) -> str:
return self.__str__()
[docs]
@classmethod
def from_config(
cls,
name: str | None = None,
*,
store: "ConfigStore | None" = None,
**overrides: Any,
) -> "VLMProvider":
"""Build a :class:`VLMProvider` from ``~/.openrath/config.json``.
Lookup order:
1. ``name`` if given.
2. ``llm.vlm_provider`` if set.
Unlike :class:`EmbeddingProvider`, there is **no fallback** to
``llm.default_provider``: a chat model is rarely a vision model,
and silently falling back would produce confusing 400 errors at
first use. Raises :class:`KeyError` instead.
"""
from rath.config.store import ConfigStore
s = store or ConfigStore.load()
if name is None:
target = getattr(s.config.llm, "vlm_provider", None)
if target is None:
raise KeyError(
"no VLM provider configured: set llm.vlm_provider in "
f"{s.path} or pass name= explicitly",
)
else:
target = name
entry = s.get_llm_provider(target)
if not entry.model:
raise KeyError(
f"VLM provider {target!r} has no model set; vision endpoints "
"have no safe default — configure it in the config file",
)
base = cls(
model=entry.model,
api_key=entry.api_key,
base_url=entry.base_url,
)
if not overrides:
return base
return replace(base, **overrides)
def _resolve_api_key(provider: VLMProvider) -> str:
    """Resolve the API key to use for *provider*.

    Passes the provider's own ``api_key`` and the ``OPENAI_API_KEY``
    environment variable, in that order, to ``resolve_credential`` and
    returns whatever it yields.
    """
    explicit_key = provider.api_key
    env_key = os.environ.get("OPENAI_API_KEY")
    return resolve_credential(explicit_key, env_key)
def _resolve_base_url(provider: VLMProvider) -> str:
    """Resolve the endpoint base URL to use for *provider*.

    Passes the provider's own ``base_url`` and the ``OPENAI_BASE_URL``
    environment variable, in that order, to ``resolve_credential`` and
    returns whatever it yields.
    """
    explicit_url = provider.base_url
    env_url = os.environ.get("OPENAI_BASE_URL")
    return resolve_credential(explicit_url, env_url)
def _data_url(image_bytes: bytes, mime: str) -> str:
    """Return *image_bytes* as a base64 ``data:`` URL tagged with *mime*."""
    encoded = str(base64.b64encode(image_bytes), "ascii")
    return "data:{};base64,{}".format(mime, encoded)
def _infer_mime(path: Path) -> str:
    """Guess a MIME type from the file name of *path*.

    Only ``path.name`` is inspected; the file is never opened. When
    :mod:`mimetypes` has no guess, ``"application/octet-stream"`` is
    returned.
    """
    mime_type = mimetypes.guess_type(path.name)[0]
    if mime_type:
        return mime_type
    return "application/octet-stream"
class RathOpenAIVLMClient:
    """Thin wrapper turning ``(image, prompt) -> caption`` into a chat call."""

    def __init__(self, provider: VLMProvider) -> None:
        """Create the underlying ``OpenAI`` client for *provider*.

        The API key comes from ``_resolve_api_key`` and the optional base
        URL from ``_resolve_base_url``; ``base_url`` is only handed to
        ``OpenAI`` when one was resolved.

        Raises:
            ValueError: If no API key could be resolved.
        """
        api_key = _resolve_api_key(provider)
        if not api_key:
            raise ValueError(
                "No API key for VLMProvider: set VLMProvider.api_key, export "
                "OPENAI_API_KEY, or configure llm.vlm_provider in "
                "~/.openrath/config.json.",
            )

        self._provider = provider

        client_options: dict[str, Any] = {"api_key": api_key}
        endpoint = _resolve_base_url(provider)
        if endpoint:
            client_options["base_url"] = endpoint
        self._client: OpenAI = OpenAI(**client_options)

    @property
    def provider(self) -> VLMProvider:
        """The :class:`VLMProvider` this client was constructed with."""
        return self._provider

    def describe(
        self,
        image_bytes: bytes,
        *,
        prompt: str,
        mime: str = "image/png",
    ) -> str:
        """Send a single image + text prompt; return the model's reply text.

        The image is embedded as a base64 ``data:`` URL in one user message
        next to the text prompt. ``max_tokens`` and ``temperature`` are only
        included in the request when the provider sets them. The call goes
        through ``retry_with_backoff`` with ``_VLM_RETRYABLE`` as the
        retryable exception set.

        Args:
            image_bytes: Raw bytes of the image.
            prompt: Text sent alongside the image.
            mime: MIME type written into the data URL.

        Returns:
            The first choice's message content as a string, or ``""`` when
            the response has no choices or the content is empty.
        """
        image_url = _data_url(image_bytes, mime)
        settings = self._provider

        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
        request: dict[str, Any] = {
            "model": settings.model,
            "messages": [user_message],
        }
        # Sampling options are sent only when explicitly configured.
        optional = {
            "max_tokens": settings.max_tokens,
            "temperature": settings.temperature,
        }
        request.update(
            {key: value for key, value in optional.items() if value is not None}
        )

        def _request_caption() -> str:
            response = self._client.chat.completions.create(**request)
            if not response.choices:
                return ""
            text = getattr(response.choices[0].message, "content", "")
            return str(text or "")

        return retry_with_backoff(
            _request_caption,
            retryable=_VLM_RETRYABLE,
            max_attempts=settings.retry_max_attempts,
            base_seconds=settings.retry_base_seconds,
        )

    def describe_path(self, path: Path, *, prompt: str) -> str:
        """Load an image from disk and call :meth:`describe`.

        The MIME type passed along is inferred from the file name via
        ``_infer_mime``.
        """
        image_path = Path(path)
        payload = image_path.read_bytes()
        return self.describe(
            payload,
            prompt=prompt,
            mime=_infer_mime(image_path),
        )