# payment-service.yaml
#
# Incident runbook for payment-service.
# Each entry under `procedures` pairs a `pattern` with a `severity` and an
# ordered list of response `steps`.
# NOTE(review): `pattern` is presumably matched against alert/incident text by
# the consuming tool -- confirm the matching rules (exact vs. substring/regex)
# before editing the pattern strings.
---
service: payment-service
# Quoted on purpose: an unquoted 2.0 would be loaded as a float, not a string.
runbook_version: "2.0"

procedures:
  # Stripe is rate limiting us (HTTP 429). Steps go from immediate throttling
  # on our side, through burst smoothing, to raising limits with Stripe.
  - pattern: "Stripe API 429"
    severity: P2
    steps:
      - "Immediately enable request queue with rate limiting: set STRIPE_MAX_RPS=40 in env and redeploy"
      - "Enable exponential backoff on retries: set STRIPE_RETRY_BACKOFF=exponential, STRIPE_RETRY_INITIAL_DELAY_MS=1000"
      - "Check Stripe dashboard for current rate limit quota and usage"
      - "If a promotional event is active, implement request queuing to smooth burst traffic"
      - "Add circuit breaker: open circuit if stripe_api_error_rate > 50% for 30s; queue requests locally"
      - "For sustained high volume: contact Stripe to raise rate limits; consider async Payment Intents API"
      - "Monitor stripe_api_error_rate and payment_success_rate until both return to baseline"

  # Triage procedure: classify the cause first, then branch per category.
  # Keep this entry after "Stripe API 429" -- one step refers to that
  # procedure as being "above".
  - pattern: "payment_success_rate low"
    severity: P2
    steps:
      - "Identify root cause category: check stripe_api_error_rate vs internal payment errors"
      - "For Stripe rate limit issues: follow 'Stripe API 429' procedure above"
      - "For authentication failures: verify API key is valid and not expired in Stripe dashboard"
      - "For network issues: check VPC NAT gateway health and egress bandwidth utilization"
      - "Communicate status to customer support team with estimated recovery time"

  # Highest severity in this runbook (P1): retries are amplifying load, so
  # they are switched off first and only restored with backoff and jitter.
  - pattern: "retry storm"
    severity: P1
    steps:
      - "Immediately disable retries in payment-service config to stop amplification"
      - "Wait for upstream rate limit window to reset (check retry_after header in logs)"
      - "Re-enable retries only after adding exponential backoff with jitter"
      - "Post-incident: add retry budget enforcement to prevent retry storm recurrence"