This skill provides guidance for video analysis and processing tasks using computer vision techniques. It should be used when analyzing video frames, detecting motion or events, tracking objects, extracting temporal data (e.g., identifying specific frames like takeoff/landing moments), or performing frame-by-frame processing with OpenCV or similar libraries.
This skill provides structured approaches for video analysis tasks involving frame extraction, motion detection, event identification, and temporal analysis. It emphasizes visualization-first debugging, systematic parameter tuning, and robust validation strategies to avoid common pitfalls in video processing workflows.
Before writing any detection algorithms:
import cv2

# Probe basic video properties up front -- fps/frame_count drive all
# later temporal math, so fail fast if the file cannot be decoded.
cap = cv2.VideoCapture('video.mp4')
if not cap.isOpened():
    raise IOError("Could not open video.mp4 -- check the path and codec support")
try:
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
finally:
    cap.release()  # always free the decoder handle, even on error
print(f"FPS: {fps}, Frames: {frame_count}, Resolution: {width}x{height}")
Create visualization capabilities before implementing detection logic:
def save_debug_frame(frame, frame_num, detections, output_dir):
    """Save an annotated copy of a frame for visual verification.

    Args:
        frame: BGR image (numpy array); a copy is annotated, the
            caller's frame is never modified.
        frame_num: Frame index, embedded in the output filename.
        detections: Iterable of dicts with 'bbox' (x1, y1, x2, y2)
            and 'lowest_y' keys.
        output_dir: Directory for the PNG; created if missing, since
            cv2.imwrite fails silently on a nonexistent directory.
    """
    import os
    os.makedirs(output_dir, exist_ok=True)
    annotated = frame.copy()
    for det in detections:
        x1, y1, x2, y2 = det['bbox']
        # OpenCV drawing APIs expect point tuples, not list slices.
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(annotated, f"y={det['lowest_y']}",
                    (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    cv2.imwrite(f"{output_dir}/frame_{frame_num:04d}.png", annotated)
Avoid hardcoded magic numbers. Make all thresholds configurable:
from dataclasses import dataclass


@dataclass
class VideoAnalyzerConfig:
    """Tunable parameters for video analysis.

    Keeping every threshold here (instead of inline magic numbers)
    makes per-video tuning a config change, not a code change. As a
    dataclass, overrides can be passed at construction time, e.g.
    ``VideoAnalyzerConfig(binary_threshold=40)``.
    """

    blur_kernel_size: tuple = (21, 21)  # Gaussian kernel; document why this size
    binary_threshold: int = 25          # Threshold for foreground detection
    dilation_iterations: int = 2        # Morphological operations
    min_contour_area: int = 500         # Minimum detection size in pixels
    smoothing_window: int = 5           # Temporal smoothing for metrics
Common scenarios requiring explicit handling:
def handle_detection_gap(frame_data, gap_start, gap_end, key='lowest_y'):
    """Fill a run of missed detections by linear interpolation (Option 1).

    Frames inside the gap are filled in place between the last valid
    value before the gap and the first valid value after it, and each
    filled frame is flagged ``'interpolated': True`` so it can be
    surfaced for manual review (Option 3).

    Args:
        frame_data: List of per-frame dicts; frames outside the gap
            must carry *key*.
        gap_start: Index of the first frame missing a detection.
        gap_end: Index of the last frame missing a detection (inclusive).
        key: Metric to interpolate (default 'lowest_y').

    Returns:
        Dict mapping frame index -> interpolated value.

    Raises:
        ValueError: If the gap touches either end of the clip, so no
            anchor value exists on one side.
    """
    before, after = gap_start - 1, gap_end + 1
    if before < 0 or after >= len(frame_data):
        raise ValueError("Gap touches the clip boundary; cannot interpolate")
    lo = frame_data[before][key]
    hi = frame_data[after][key]
    span = after - before
    filled = {}
    for idx in range(gap_start, gap_end + 1):
        value = lo + (hi - lo) * (idx - before) / span
        frame_data[idx][key] = value
        frame_data[idx]['interpolated'] = True  # flag for manual review
        filled[idx] = value
    return filled
At each major algorithm step, output visual proof:
# Dump the intermediate image of every pipeline stage so each step
# can be visually verified in order.
debug_stages = [
    ("debug/01_background_diff.png", diff_frame),   # after background subtraction
    ("debug/02_thresholded.png", thresh_frame),     # after thresholding
    ("debug/03_morphed.png", morph_frame),          # after morphological operations
    ("debug/04_contours.png", contour_frame),       # after contour detection
]
for stage_path, stage_image in debug_stages:
    cv2.imwrite(stage_path, stage_image)
Add runtime validation for expected conditions:
def validate_detection(detection, frame_num, video_props, prev_x=None):
    """Verify a detection makes physical sense.

    Raises ``ValueError`` instead of using ``assert`` so the checks
    survive ``python -O`` (asserts are stripped under optimization).

    Args:
        detection: Dict with 'area', 'center_x', 'center_y'.
        frame_num: Index of the frame being validated.
        video_props: Dict with 'width', 'height', 'fps'.
        prev_x: center_x from the previous frame, if available; the
            movement check is skipped when it is None.

    Raises:
        ValueError: When any sanity check fails.
    """
    if detection['area'] <= 0:
        raise ValueError(f"Zero area detection at frame {frame_num}")
    if not 0 <= detection['center_x'] <= video_props['width']:
        raise ValueError(f"center_x out of frame bounds at frame {frame_num}")
    if not 0 <= detection['center_y'] <= video_props['height']:
        raise ValueError(f"center_y out of frame bounds at frame {frame_num}")
    # Domain-specific check: reject implausibly large jumps.
    # NOTE(review): fps * 50 scales with frame rate although the original
    # comment called it "pixels per frame" -- confirm the intended units.
    if frame_num > 0 and prev_x is not None:
        max_reasonable_movement = video_props['fps'] * 50
        if abs(detection['center_x'] - prev_x) >= max_reasonable_movement:
            raise ValueError(f"Implausible horizontal jump at frame {frame_num}")
Plot metrics over time to identify anomalies:
import matplotlib.pyplot as plt


def plot_metrics(frame_data, output_path):
    """Plot per-frame metrics over time so anomalies stand out.

    Args:
        frame_data: List of dicts with a 'frame' key and optional
            'lowest_y' / 'motion_magnitude' keys (missing values plot
            as gaps).
        output_path: Destination image path for the saved figure.
    """
    frames = [d['frame'] for d in frame_data]
    y_positions = [d.get('lowest_y') for d in frame_data]
    motion = [d.get('motion_magnitude') for d in frame_data]
    fig, axes = plt.subplots(2, 1, figsize=(12, 8))
    axes[0].plot(frames, y_positions, 'b-', label='Y Position')
    axes[0].set_ylabel('Y Position (pixels)')
    axes[0].legend()  # labels are invisible without an explicit legend()
    axes[1].plot(frames, motion, 'r-', label='Motion')
    axes[1].set_ylabel('Motion Magnitude')
    axes[1].set_xlabel('Frame')
    axes[1].legend()
    fig.savefig(output_path)
    plt.close(fig)  # prevent figure accumulation when called in a loop
Image coordinates have origin at top-left, with Y increasing downward:
# CORRECT: In image coordinates Y grows DOWNWARD from the top-left
# origin, so the physically highest point has the SMALLEST Y value.
peak_frame = min(detections, key=lambda d: d['lowest_y'])
# WRONG: Assuming higher Y = higher position (true in math plots,
# false in image coordinates).
# peak_frame = max(detections, key=lambda d: d['lowest_y'])
Convert numpy types before JSON/TOML serialization:
# WRONG: Will fail with "Object of type int64 is not JSON serializable"
# -- values read from numpy arrays are numpy scalars, not built-in ints.
result = {'frame': detection['frame'], 'y': detection['y']}
# CORRECT: Explicit conversion to built-in types before serialization.
result = {'frame': int(detection['frame']), 'y': int(detection['y'])}
Bounding box coordinates may not reflect actual body position: the box tracks the detected contour's extremes, which can include shadows, motion blur, or extended limbs rather than the body's center. Thresholds tuned on one video may fail on others: lighting, resolution, camera distance, and background texture all shift the optimal values, so re-validate parameters on each new source.
When writing analysis scripts via heredoc or Write tool:
Verify syntax before executing (e.g. `python -m py_compile script.py`), since heredoc quoting can silently corrupt the script.

Frame differencing — best for: static camera, moving subject against a stationary background.
def frame_difference(frame1, frame2, threshold=25):
    """Return a binary mask of pixels that changed between two frames.

    Both frames are converted to grayscale, their absolute difference
    is taken, and pixels exceeding *threshold* are set to 255.
    """
    gray_pair = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in (frame1, frame2)]
    delta = cv2.absdiff(gray_pair[0], gray_pair[1])
    mask = cv2.threshold(delta, threshold, 255, cv2.THRESH_BINARY)[1]
    return mask
Best for: Longer videos, gradual lighting changes
# MOG2 handles lighting changes better
bg_subtractor = cv2.createBackgroundSubtractorMOG2(
    history=500, varThreshold=16, detectShadows=True
)
# KNN for more stable backgrounds
# NOTE: this rebinds bg_subtractor over the MOG2 instance above --
# these are two alternatives; in real code pick ONE.
bg_subtractor = cv2.createBackgroundSubtractorKNN(
    history=500, dist2Threshold=400.0, detectShadows=True
)
Best for: Tracking motion direction and magnitude
def compute_optical_flow(prev_gray, curr_gray):
    """Dense Farneback optical flow between two grayscale frames.

    Returns:
        Tuple of per-pixel (magnitude, angle) arrays describing the
        motion field from prev_gray to curr_gray.
    """
    flow_field = cv2.calcOpticalFlowFarneback(
        prev_gray, curr_gray, None,
        pyr_scale=0.5, levels=3, winsize=15,
        iterations=3, poly_n=5, poly_sigma=1.2, flags=0,
    )
    dx = flow_field[..., 0]
    dy = flow_field[..., 1]
    return cv2.cartToPolar(dx, dy)
For detecting specific events (e.g., takeoff, landing, collisions):
def detect_event(frame_data, event_type='takeoff',
                 motion_threshold=None, y_threshold=None,
                 min_confidence=0.6):
    """Detect an event (e.g. takeoff, landing) using multiple signals.

    Args:
        frame_data: List of per-frame dicts with 'frame', 'motion',
            and 'y' keys.
        event_type: Label for the event being detected (informational).
        motion_threshold: Motion magnitude above which a frame counts
            as a motion spike. Required (was previously an undefined
            free variable).
        y_threshold: Y position below which the position check fires.
            Required.
        min_confidence: Fraction of confirming checks needed for a
            frame to become a candidate.

    Returns:
        Dict with 'frame', 'confidence', 'signals' for the best
        candidate, or None when no frame clears *min_confidence*
        (the original crashed with max() on an empty sequence).

    Raises:
        ValueError: If either threshold is not supplied.
    """
    if motion_threshold is None or y_threshold is None:
        raise ValueError("motion_threshold and y_threshold must be provided")
    candidates = []
    for i, data in enumerate(frame_data):
        signals = {
            'y_derivative': compute_y_velocity(frame_data, i),
            'motion_spike': data['motion'] > motion_threshold,
            'position_threshold': data['y'] < y_threshold,
            'acceleration': compute_acceleration(frame_data, i)
        }
        # Confidence is the fraction of confirming BOOLEAN checks; the
        # raw derivative/acceleration values stay in `signals` as
        # evidence only (the original summed numbers with booleans).
        checks = [
            signals['motion_spike'],
            signals['position_threshold'],
            signals['y_derivative'] != 0,  # movement present -- TODO confirm a real cutoff
            signals['acceleration'] != 0,
        ]
        confidence = sum(checks) / len(checks)
        if confidence > min_confidence:
            candidates.append({
                'frame': data['frame'],
                'confidence': confidence,
                'signals': signals
            })
    if not candidates:
        return None  # nothing met the bar -- let the caller decide
    # Return highest confidence candidate
    return max(candidates, key=lambda c: c['confidence'])
When producing analysis results:
# Report each event as a best-guess frame plus confidence and an
# uncertainty range, and list the assumptions the analysis relies on.
result = {
    'takeoff_frame': int(takeoff),    # int() guards against numpy int64 in JSON
    'takeoff_confidence': 0.85,
    'takeoff_range': [93, 97],        # plausible frame window, not a point estimate
    'landing_frame': int(landing),
    'landing_confidence': 0.92,
    'landing_range': [112, 116],
    'assumptions': [
        'First frame contains no subject',
        'Single subject in frame',
        'Camera is stationary'
    ],
    'debug_frames_exported': True     # annotated frames saved for verification
}