Query and analyze logs using Grafana Loki for the Kagenti platform, search for errors, and investigate issues
This skill helps you query and analyze logs from the Kagenti platform using Loki via Grafana.
Access Grafana: https://grafana.localtest.me:9443 Navigate: Explore → Select Loki datasource
Log Dashboard: https://grafana.localtest.me:9443/d/loki-logs/loki-logs
Query Examples in Grafana Explore:
# All logs from observability namespace
{kubernetes_namespace_name="observability"}
# Logs from specific pod
{kubernetes_pod_name=~"prometheus.*"}
# Logs with errors
{kubernetes_namespace_name="observability"} |= "error"
# Logs from last 5 minutes with level=error
{kubernetes_namespace_name="observability"} | json | level="error"
# Count errors per namespace
sum by (kubernetes_namespace_name) (count_over_time({kubernetes_namespace_name=~".+"} |= "error" [5m]))
# Query Loki for recent errors in the observability namespace.
# The time range is computed portably in epoch seconds — BSD `date -v-5M`
# and GNU `date -d '5 minutes ago'` are incompatible, but plain arithmetic
# works everywhere — then zero-padded to nanoseconds as Loki's API expects.
now=$(date -u +%s)
kubectl exec -n observability deployment/grafana -- \
  curl -s -G 'http://loki.observability.svc:3100/loki/api/v1/query_range' \
  --data-urlencode 'query={kubernetes_namespace_name="observability"} |= "error"' \
  --data-urlencode 'limit=100' \
  --data-urlencode "start=$((now - 300))000000000" \
  --data-urlencode "end=${now}000000000" | python3 -m json.tool
# Get logs for a specific pod using kubectl
kubectl logs -n observability deployment/prometheus --tail=100
# Get logs from previous container (if crashed)
kubectl logs -n observability pod/prometheus-xxx --previous
# Follow logs in real-time
kubectl logs -n observability deployment/grafana -f --tail=20
# Get logs from specific container in pod
kubectl logs -n observability pod/alertmanager-xxx -c alertmanager --tail=50
# Get recent error logs from all namespaces of interest.
# NOTE: `kubectl logs` requires a pod name (or a label selector), so the
# pods in each namespace are enumerated explicitly — a bare
# `kubectl logs -n ns --all-containers=true` is rejected by kubectl.
for ns in observability keycloak oauth2-proxy istio-system kiali-system; do
  echo "=== Errors in $ns ==="
  kubectl get pods -n "$ns" -o name | while IFS= read -r pod; do
    kubectl logs -n "$ns" "$pod" --all-containers=true --tail=50 2>&1 \
      | grep -iE "error|fatal|exception" | head -5
  done
  echo
done
# Find pods in a bad state (Error / CrashLoop / ImagePull) and dump their
# recent logs. Prefer the previous (crashed) container's logs; fall back
# to the current container if there is no previous instance.
# `read -r` prevents backslash mangling; quoted expansions keep odd
# pod names intact; `_` swallows the remaining columns.
kubectl get pods -A | grep -E "Error|CrashLoop|ImagePull" | while read -r ns pod _; do
  echo "=== Logs for $pod in $ns ==="
  kubectl logs -n "$ns" "$pod" --tail=30 --previous 2>/dev/null \
    || kubectl logs -n "$ns" "$pod" --tail=30
  echo
done
# In Grafana Explore (Loki datasource)
sum by (kubernetes_namespace_name) (
rate({kubernetes_namespace_name=~".+"}[5m])
)
# Find connection errors
{kubernetes_namespace_name="observability"} |~ "connection (refused|timeout|reset)"
# Find authentication failures
{kubernetes_namespace_name=~"keycloak|oauth2-proxy"} |~ "auth.*fail|unauthorized|forbidden"
# Find OOM kills
{kubernetes_namespace_name=~".+"} |~ "OOM|out of memory|oom.*kill"
# Only errors
{kubernetes_namespace_name="observability"} | json | level="error"
# Errors and warnings
{kubernetes_namespace_name="observability"} | json | level=~"error|warn"
# Everything except debug
{kubernetes_namespace_name="observability"} | json | level!="debug"
# Per-component log checks. `grep -iE "a|b"` is used for alternation:
# the escaped `\|` form in basic regexes is a GNU extension and is not
# portable to BSD/macOS grep.

# Prometheus: recent logs, then scrape errors
kubectl logs -n observability deployment/prometheus --tail=100
kubectl logs -n observability deployment/prometheus | grep -iE "scrape|error"

# Grafana: recent logs, then datasource errors
kubectl logs -n observability deployment/grafana --tail=100
kubectl logs -n observability deployment/grafana | grep -iE "datasource|error"

# Keycloak: recent logs, then authentication errors
kubectl logs -n keycloak statefulset/keycloak --tail=100
kubectl logs -n keycloak statefulset/keycloak | grep -iE "auth|login|error"
# Check Istio sidecar logs for a specific pod.
# $POD is quoted so an empty jsonpath result fails loudly instead of
# silently dropping an argument from the kubectl command line.
POD=$(kubectl get pod -n observability -l app=alertmanager \
  -o jsonpath='{.items[0].metadata.name}')
kubectl logs -n observability "$POD" -c istio-proxy --tail=50
# Alertmanager container logs, then notification failures.
# grep -E replaces the GNU-only `\|` BRE alternation for portability.
kubectl logs -n observability deployment/alertmanager -c alertmanager --tail=100
kubectl logs -n observability deployment/alertmanager -c alertmanager \
  | grep -iE "notif|error|fail"
# Find pods restarting frequently.
# BUG FIX: with `-A` the NAMESPACE column is prepended, so RESTARTS is
# awk field $5, not $4 ($4 is STATUS there). `--no-headers` drops the
# header row, and `$5+0` coerces values like "6 (2m ago)" to a number.
kubectl get pods -A --no-headers | awk '$5+0 > 5'
# Check logs before crash
kubectl logs -n <namespace> <pod-name> --previous | tail -50
{kubernetes_namespace_name=~".+"} |~ "HTTP.*[45]\\d{2}"
{kubernetes_namespace_name=~".+"} |~ "timeout|timed out|deadline exceeded"
{kubernetes_namespace_name=~".+"} |~ "database.*error|connection.*refused|SQL.*error"
# Describe the pod for events and state
kubectl describe pod <pod-name> -n <namespace>
# Current container logs
kubectl logs <pod-name> -n <namespace>
# Init-container logs
kubectl logs <pod-name> -n <namespace> -c <init-container>
# LogQL: errors in a namespace over a range
{kubernetes_namespace_name="X"} |= "error" [5m]
# LogQL: error count per pod
sum by (kubernetes_pod_name) (count_over_time({...} |= "error" [5m]))
# LogQL: warnings in a namespace
{kubernetes_namespace_name="X"} |= "warn"
Loki Logs Dashboard: https://grafana.localtest.me:9443/d/loki-logs/loki-logs
Features:
Panels:
Use `--previous` to see logs from a crashed container.
Use `--tail=N` to limit output, then increase N if needed.

🤖 Generated with Claude Code