Analyze images and multi-frame sequences using OpenAI GPT series
This skill enables image analysis, scene understanding, text extraction, and multi-frame comparison using OpenAI's vision-capable GPT models (e.g., gpt-4o, gpt-5). It supports single and multiple images analysis and sequential frames for temporal analysis.
The following Python libraries are required:
from openai import OpenAI
import base64
import json
import os
from pathlib import Path
All analysis results should be returned as valid JSON conforming to this schema:
{
"success": true,
"model": "gpt-5",
"analysis": "Detailed description or analysis of the image content...",
"metadata": {
"image_count": 1,
"detail_level": "high",
"tokens_used": 850,
"processing_time_ms": 1234
},
"extracted_data": {
"objects": ["car", "person", "building"],
"text_found": "Sample text from image",
"colors": ["blue", "white", "gray"],
"scene_type": "urban street"
},
"warnings": []
}
success: Boolean indicating whether the API call succeededmodel: The GPT model used for analysis (e.g., "gpt-4o", "gpt-5")analysis: Complete textual analysis or description from the modelmetadata.image_count: Number of images analyzed in this requestmetadata.detail_level: Detail parameter used ("low", "high", or "auto")metadata.tokens_used: Approximate token count for the requestmetadata.processing_time_ms: Time taken to process the requestextracted_data: Structured information extracted from the image(s)warnings: Array of issues or limitations encounteredfrom openai import OpenAI
import base64
def analyze_image(image_path, prompt="What's in this image?"):
"""Analyze a single image using GPT-5 Vision."""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Read and encode image
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
response = client.chat.completions.create(
model="gpt-5",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
}
]
}
],
max_tokens=300
)
return response.choices[0].message.content
from openai import OpenAI
def analyze_image_url(image_url, prompt="Describe this image"):
"""Analyze an image from a URL."""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
response = client.chat.completions.create(
model="gpt-5",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {"url": image_url}
}
]
}
]
)
return response.choices[0].message.content
from openai import OpenAI
import base64
def analyze_multiple_images(image_paths, prompt="Compare these images"):
"""Analyze multiple images in a single request."""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Build content array with text and all images
content = [{"type": "text", "text": prompt}]
for image_path in image_paths:
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
})
response = client.chat.completions.create(
model="gpt-5",
messages=[{"role": "user", "content": content}],
max_tokens=500
)
return response.choices[0].message.content
from openai import OpenAI
import base64
import json
import time
def analyze_image_to_json(image_path, prompt="Analyze this image"):
"""Analyze image and return structured JSON output."""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
start_time = time.time()
warnings = []
try:
# Read and encode image
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
# Make API call
response = client.chat.completions.create(
model="gpt-5",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"
}
}
]
}
],
max_tokens=500
)
analysis = response.choices[0].message.content
tokens_used = response.usage.total_tokens
processing_time = int((time.time() - start_time) * 1000)
result = {
"success": True,
"model": "gpt-5",
"analysis": analysis,
"metadata": {
"image_count": 1,
"detail_level": "high",
"tokens_used": tokens_used,
"processing_time_ms": processing_time
},
"extracted_data": {},
"warnings": warnings
}
except Exception as e:
result = {
"success": False,
"model": "gpt-5",
"analysis": "",
"metadata": {
"image_count": 0,
"detail_level": "high",
"tokens_used": 0,
"processing_time_ms": 0
},
"extracted_data": {},
"warnings": [f"API call failed: {str(e)}"]
}
return result
# Usage
result = analyze_image_to_json("photo.jpg", "Describe what you see in detail")
print(json.dumps(result, indent=2))
from openai import OpenAI
import base64
from pathlib import Path
def process_video_frames(frames_directory, analysis_prompt):
"""Process sequential video frames for temporal analysis."""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
image_extensions = {'.jpg', '.jpeg', '.png', '.webp'}
frame_paths = sorted([
f for f in Path(frames_directory).iterdir()
if f.suffix.lower() in image_extensions
])
# Analyze frames in groups (e.g., 5 frames at a time)
batch_size = 5
results = []
for i in range(0, len(frame_paths), batch_size):
batch = frame_paths[i:i+batch_size]
# Build content with all frames in batch
content = [{"type": "text", "text": analysis_prompt}]
for frame_path in batch:
with open(frame_path, "rb") as f:
base64_image = base64.b64encode(f.read()).decode('utf-8')
content.append({
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "low" # Use low detail for video frames to save tokens
}
})
response = client.chat.completions.create(
model="gpt-5",
messages=[{"role": "user", "content": content}],
max_tokens=800
)
results.append({
"batch_index": i // batch_size,
"frame_range": f"{batch[0].name} to {batch[-1].name}",
"analysis": response.choices[0].message.content
})
return results
from openai import OpenAI
import base64
def extract_text_with_gpt(image_path):
"""Extract text from image using GPT Vision as OCR alternative."""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
with open(image_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
response = client.chat.completions.create(
model="gpt-5",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Extract all text from this image. Return only the text content, preserving the layout and structure."
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"
}
}
]
}
],
max_tokens=1000
)
return response.choices[0].message.content
# GPT-4o - Best for general vision tasks, fast and cost-effective
model = "gpt-4o"
# GPT-5-nano - Faster and cheaper for simple vision tasks
model = "gpt-5-nano"
# GPT-5 - More capable for complex reasoning
model = "gpt-5"
Control how much visual detail the model processes:
# Low detail - 512×512px resolution, fewer tokens, faster
"image_url": {
"url": image_url,
"detail": "low"
}
# High detail - Full resolution with tiling, more tokens, better accuracy
"image_url": {
"url": image_url,
"detail": "high"
}
# Auto - Model chooses appropriate detail level
"image_url": {
"url": image_url,
"detail": "auto"
}
When to use each detail level:
Image tokens count toward your request limits and costs:
# For gpt-5 with high detail:
# - Base: 85 tokens
# - Per tile: 170 tokens
# - Example: 1024×1024 image = 85 + (2×2 tiles × 170) = 765 tokens
def estimate_tokens_high_detail(width, height):
"""Estimate token cost for high-detail image (gpt-5)."""
# Scale to fit within 2048×2048
scale = min(2048 / width, 2048 / height, 1.0)
scaled_w = int(width * scale)
scaled_h = int(height * scale)
# Calculate tiles (512×512)
tiles_x = (scaled_w + 511) // 512
tiles_y = (scaled_h + 511) // 512
total_tiles = tiles_x * tiles_y
# Token calculation
base_tokens = 85
tile_tokens = total_tiles * 170
return base_tokens + tile_tokens
def compare_images(image1_path, image2_path):
"""Compare two images and identify differences."""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
images = []
for path in [image1_path, image2_path]:
with open(path, "rb") as f:
base64_image = base64.b64encode(f.read()).decode('utf-8')
images.append(base64_image)
response = client.chat.completions.create(
model="gpt-5",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "Compare these two images. List all differences you observe, including changes in objects, colors, positions, or any other visual elements."
},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{images[0]}"}},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{images[1]}"}}
]
}
]
)
return response.choices[0].message.content
def extract_structured_data(image_path, schema_description):
"""Extract structured information from image based on schema."""
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
with open(image_path, "rb") as f:
base64_image = base64.b64encode(f.read()).decode('utf-8')
prompt = f"""Analyze this image and extract information in JSON format following this schema:
{schema_description}
Return only valid JSON, no additional text."""
response = client.chat.completions.create(
model="gpt-5",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"
}
}
]
}
],
response_format={"type": "json_object"}
)
return json.loads(response.choices[0].message.content)
# Usage example
schema = """
{
"products": [{"name": string, "price": number, "quantity": number}],
"total": number,
"date": string
}
"""
data = extract_structured_data("receipt.jpg", schema)
Issue: API authentication failed
# Verify API key is set
import os
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY environment variable not set")
Issue: Image too large
from PIL import Image
def resize_if_needed(image_path, max_size=2048):
"""Resize image if dimensions exceed maximum."""
img = Image.open(image_path)
if max(img.size) > max_size:
img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
resized_path = image_path.replace('.', '_resized.')
img.save(resized_path, quality=95)
return resized_path
return image_path
Issue: Token limit exceeded
# Reduce max_tokens or use low detail mode
response = client.chat.completions.create(
model="gpt-5",
messages=[...],
max_tokens=300 # Reduce from default
)
Issue: Rate limit errors
import time
from openai import RateLimitError
def analyze_with_retry(image_path, max_retries=3):
"""Analyze image with exponential backoff on rate limits."""
for attempt in range(max_retries):
try:
return analyze_image(image_path)
except RateLimitError:
if attempt < max_retries - 1:
wait_time = 2 ** attempt # Exponential backoff
print(f"Rate limit hit, waiting {wait_time}s...")
time.sleep(wait_time)
else:
raise
Before returning results, verify:
json.loads() to validate)