Patterns and techniques for adding governance, safety, and trust controls to AI agent systems. Use this skill when: - Building AI agents that call external tools (APIs, databases, file systems) - Implementing policy-based access controls for agent tool usage - Adding semantic intent classification to detect dangerous prompts - Creating trust scoring systems for multi-agent workflows - Building audit trails for agent actions and decisions - Enforcing rate limits, content filters, or tool restrictions on agents - Working with any agent framework (PydanticAI, CrewAI, OpenAI Agents, LangChain, AutoGen)
Patterns for adding safety, trust, and policy enforcement to AI agent systems.
Governance patterns ensure AI agents operate within defined boundaries — controlling which tools they can call, what content they can process, how much they can do, and maintaining accountability through audit trails.
User Request → Intent Classification → Policy Check → Tool Execution → Audit Log
↓ ↓ ↓
Threat Detection Allow/Deny Trust Update
Define what an agent is allowed to do as a composable, serializable policy object.
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
import re
class PolicyAction(Enum):
ALLOW = "allow"
DENY = "deny"
REVIEW = "review" # flag for human review
@dataclass
class GovernancePolicy:
"""Declarative policy controlling agent behavior."""
name: str
allowed_tools: list[str] = field(default_factory=list) # allowlist
blocked_tools: list[str] = field(default_factory=list) # blocklist
blocked_patterns: list[str] = field(default_factory=list) # content filters
max_calls_per_request: int = 100 # rate limit
require_human_approval: list[str] = field(default_factory=list) # tools needing approval
def check_tool(self, tool_name: str) -> PolicyAction:
"""Check if a tool is allowed by this policy."""
if tool_name in self.blocked_tools:
return PolicyAction.DENY
if tool_name in self.require_human_approval:
return PolicyAction.REVIEW
if self.allowed_tools and tool_name not in self.allowed_tools:
return PolicyAction.DENY
return PolicyAction.ALLOW
def check_content(self, content: str) -> Optional[str]:
"""Check content against blocked patterns. Returns matched pattern or None."""
for pattern in self.blocked_patterns:
if re.search(pattern, content, re.IGNORECASE):
return pattern
return None
Combine multiple policies (e.g., org-wide + team + agent-specific):
def compose_policies(*policies: GovernancePolicy) -> GovernancePolicy:
"""Merge policies with most-restrictive-wins semantics."""
combined = GovernancePolicy(name="composed")
for policy in policies:
combined.blocked_tools.extend(policy.blocked_tools)
combined.blocked_patterns.extend(policy.blocked_patterns)
combined.require_human_approval.extend(policy.require_human_approval)
combined.max_calls_per_request = min(
combined.max_calls_per_request,
policy.max_calls_per_request
)
if policy.allowed_tools:
if combined.allowed_tools:
combined.allowed_tools = [
t for t in combined.allowed_tools if t in policy.allowed_tools
]
else:
combined.allowed_tools = list(policy.allowed_tools)
return combined
# Usage: layer policies from broad to specific
org_policy = GovernancePolicy(
name="org-wide",
blocked_tools=["shell_exec", "delete_database"],
blocked_patterns=[r"(?i)(api[_-]?key|secret|password)\s*[:=]"],
max_calls_per_request=50
)
team_policy = GovernancePolicy(
name="data-team",
allowed_tools=["query_db", "read_file", "write_report"],
require_human_approval=["write_report"]
)
agent_policy = compose_policies(org_policy, team_policy)
Store policies as configuration, not code:
# governance-policy.yaml