Skip to main content

Vision (Images)

Analyze images using multimodal models, enabling visual understanding, image description, and visual question answering.

Prerequisites

pip install apertis

Get your API Key from Apertis

Basic Image Analysis

from apertis import Apertis

def main():
    """Ask the model what is shown in a single image referenced by URL."""
    client = Apertis()

    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg"

    # One user turn with two content parts: the question, then the image.
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[user_message],
    )

    print(response.choices[0].message.content)


if __name__ == "__main__":
    main()

Local Image (Base64)

import base64
from pathlib import Path
from apertis import Apertis

def encode_image(image_path: str) -> str:
    """Read an image file and return its contents as a base64 string.

    Args:
        image_path: Filesystem path of the image to read.

    Returns:
        The file's bytes encoded with the standard base64 alphabet and
        decoded to ``str``, ready to be embedded in a ``data:`` URI.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Binary mode: the payload must be the raw bytes, not decoded text.
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.standard_b64encode(raw_bytes).decode("utf-8")

def main():
    """Send a local image to the model as a base64 ``data:`` URI and print the description."""
    client = Apertis()

    image_path = "path/to/your/image.jpg"
    base64_image = encode_image(image_path)

    # Map the file extension to the image subtype used in the data URI.
    # Unrecognised extensions fall back to "jpeg".
    subtype_by_extension = {
        ".jpg": "jpeg",
        ".jpeg": "jpeg",
        ".png": "png",
        ".gif": "gif",
        ".webp": "webp",
    }
    extension = Path(image_path).suffix.lower()
    media_type = subtype_by_extension.get(extension, "jpeg")

    data_uri = f"data:image/{media_type};base64,{base64_image}"

    # One user turn with two content parts: the instruction, then the inline image.
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image in detail."},
            {"type": "image_url", "image_url": {"url": data_uri}},
        ],
    }

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[user_message],
    )

    print(response.choices[0].message.content)


if __name__ == "__main__":
    main()

Multiple Images

from apertis import Apertis

def main():
    """Send two image URLs in one user message and print the model's comparison."""
    client = Apertis()

    image_urls = (
        "https://example.com/image1.jpg",
        "https://example.com/image2.jpg",
    )

    # The text prompt comes first, followed by one image part per URL, in order.
    prompt_part = {
        "type": "text",
        "text": "Compare these two images. What are the differences?",
    }
    image_parts = [
        {"type": "image_url", "image_url": {"url": url}}
        for url in image_urls
    ]

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": [prompt_part, *image_parts]}],
    )

    print(response.choices[0].message.content)


if __name__ == "__main__":
    main()

Image Quality Control

from apertis import Apertis

def main():
    """Request a text read-out of a document image using the "high" detail setting."""
    client = Apertis()

    # High detail for complex images
    document_image = {
        "url": "https://example.com/document.png",
        "detail": "high",  # Use high detail for text/documents
    }

    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Read all the text in this document."},
            {"type": "image_url", "image_url": document_image},
        ],
    }

    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[user_message],
    )

    print(response.choices[0].message.content)


if __name__ == "__main__":
    main()

Visual Question Answering

from apertis import Apertis

def main():
    """Ask several questions about one image, issuing one request per question."""
    client = Apertis()

    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png"

    questions = [
        "What objects are in this image?",
        "What colors do you see?",
        "Is there any text visible?",
    ]

    for question in questions:
        # Each request is independent: the same image is re-sent with a new question.
        content_parts = [
            {"type": "text", "text": question},
            {"type": "image_url", "image_url": {"url": image_url}},
        ]
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[{"role": "user", "content": content_parts}],
        )
        print(f"Q: {question}")
        print(f"A: {response.choices[0].message.content}\n")


if __name__ == "__main__":
    main()

Streaming with Images

from apertis import Apertis

def main():
    """Stream the model's description of an image and print it as it arrives.

    The request is made with ``stream=True``, so the call returns an iterable
    of chunks. Each chunk's text delta is printed immediately without a
    newline; a final newline is printed once the stream is exhausted.
    """
    client = Apertis()

    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg"

    stream = client.chat.completions.create(
        model="gpt-4.1",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in detail."},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        stream=True,
    )

    for chunk in stream:
        # NOTE(review): OpenAI-style streams can emit chunks whose `choices`
        # list is empty (e.g. a usage-only chunk); indexing [0] on those would
        # raise IndexError, so skip them. Confirm against the Apertis stream format.
        if not chunk.choices:
            continue
        delta_text = chunk.choices[0].delta.content
        # Deltas may be None or empty (e.g. role-only or final chunks).
        if delta_text:
            print(delta_text, end="", flush=True)

    # Terminate the line after the streamed text.
    print()


if __name__ == "__main__":
    main()

Supported Models

Vision capabilities are available on:

| Provider | Models |
| --- | --- |
| OpenAI | gpt-4.1, gpt-4.1-mini |
| Anthropic | claude-sonnet-4.5, claude-opus-4-5-20251101, claude-haiku-4-5-20250501 |
| Google | gemini-3-pro-preview, gemini-2.5-flash |

View all models →

API Reference

Image URL Object

| Field | Type | Description |
| --- | --- | --- |
| url | str | Image URL or base64 data URI |
| detail | str | Detail level: "auto", "low", or "high" |

Supported Formats

  • JPEG / JPG
  • PNG
  • GIF
  • WebP

Size Limits

  • Maximum image size varies by model
  • Images are automatically resized if needed
  • Use detail: "low" for faster processing of simple images