Skip to main content

Overview

Production workflows must handle errors gracefully. This example demonstrates retry logic, fallback strategies, partial failure handling, and error recovery patterns.

Basic Error Handling

name: resilient-workflow
description: Handle errors with retry and fallback

flow:
  # Try with automatic retry
  - member: fetch-data
    type: API
    config:
      url: "https://api.example.com/data"
      retries: 3
      timeout: 5000
    retry:
      maxAttempts: 3
      backoff: exponential
    continue_on_error: true  # Don't fail entire workflow

  # Fallback if fetch failed
  - member: use-cached-data
    condition: ${!fetch-data.success}
    type: Data
    config:
      storage: kv
      operation: get
      binding: CACHE

  # Proceed with either fresh or cached data
  - member: process-data
    input:
      # Quoted: the ternary contains ": ", which a plain YAML scalar
      # cannot hold
      data: "${fetch-data.success ? fetch-data.output.data : use-cached-data.output.value}"

output:
  result: ${process-data.output}
  # Quoted for the same reason as the ternary above
  dataSource: "${fetch-data.success ? 'api' : 'cache'}"
  error: ${fetch-data.error}

Multiple Fallback Levels

name: multi-level-fallback
description: Chain multiple fallback strategies

flow:
  # Primary: Try external API
  - member: fetch-from-api
    type: API
    config:
      url: "https://api.primary.com/data"
    continue_on_error: true

  # Fallback 1: Try secondary API
  - member: fetch-from-backup-api
    condition: ${!fetch-from-api.success}
    type: API
    config:
      url: "https://api.backup.com/data"
    continue_on_error: true

  # Fallback 2: Try cache
  - member: fetch-from-cache
    condition: ${!fetch-from-api.success && !fetch-from-backup-api.success}
    type: Data
    config:
      storage: kv
      operation: get
    continue_on_error: true

  # Fallback 3: Use default values
  - member: use-defaults
    condition: ${!fetch-from-api.success && !fetch-from-backup-api.success && !fetch-from-cache.success}
    type: Function

output:
  # The ternary chains contain ": ", which is not allowed in a plain
  # scalar. A folded block scalar (>-) joins the lines below with single
  # spaces, yielding the same one-line expression without the long line.
  # First successful source wins, in priority order.
  data: >-
    ${fetch-from-api.success ? fetch-from-api.output
    : fetch-from-backup-api.success ? fetch-from-backup-api.output
    : fetch-from-cache.success ? fetch-from-cache.output.value
    : use-defaults.output}
  source: >-
    ${fetch-from-api.success ? 'primary-api'
    : fetch-from-backup-api.success ? 'backup-api'
    : fetch-from-cache.success ? 'cache'
    : 'defaults'}
  errors:
    primaryError: ${fetch-from-api.error}
    backupError: ${fetch-from-backup-api.error}
    cacheError: ${fetch-from-cache.error}

Partial Failure Handling

name: gather-with-partial-failures
description: Continue even if some sources fail

flow:
  # Try to gather from multiple sources.
  # NOTE(review): `parallel` is written as a list item so that `flow`
  # remains a sequence; a bare `parallel:` key followed by `- member:`
  # items at the same indent is not parseable YAML. Confirm the exact
  # step shape against the ensemble schema.
  - parallel:
      - member: source-a
        continue_on_error: true

      - member: source-b
        continue_on_error: true

      - member: source-c
        continue_on_error: true

  # Process whatever data we got.
  # Ternaries are quoted because a plain scalar cannot contain ": ".
  - member: combine-available-data
    input:
      sourceA: "${source-a.success ? source-a.output : null}"
      sourceB: "${source-b.success ? source-b.output : null}"
      sourceC: "${source-c.success ? source-c.output : null}"
      successCount: ${[source-a.success, source-b.success, source-c.success].filter(Boolean).length}

  # Only fail if all sources failed
  - member: check-minimum-data
    input:
      successCount: ${combine-available-data.output.successCount}
      minimumRequired: 1

output:
  data: ${combine-available-data.output}
  sourcesSucceeded: ${[source-a.success, source-b.success, source-c.success].filter(Boolean).length}
  sourcesFailed: ${[source-a.success, source-b.success, source-c.success].filter(x => !x).length}
  # One entry per source; null when that source succeeded
  errors:
    - "${!source-a.success ? source-a.error : null}"
    - "${!source-b.success ? source-b.error : null}"
    - "${!source-c.success ? source-c.error : null}"

Retry with Backoff

name: retry-patterns
description: Different retry strategies

flow:
  # Recommended: exponential backoff
  - member: flaky-api-call
    retry:
      # Delays: 1s, 2s, 4s, 8s, 16s
      backoff: exponential
      maxAttempts: 5
      timeout: 30000

  # Linear backoff
  - member: moderate-retry
    retry:
      # Delays: 2s, 4s, 6s
      backoff: linear
      maxAttempts: 3

  # Fixed backoff
  - member: simple-retry
    retry:
      # Delays: 5s, 5s, 5s
      backoff: fixed
      maxAttempts: 3

Error Recovery with Compensation

name: transactional-workflow
description: Rollback on failure

state:
  schema:
    createdResources: array
    needsRollback: boolean

flow:
  # Step 1: Create user
  - member: create-user
    input:
      userData: ${input.userData}
    state:
      set: [createdResources]
    continue_on_error: true

  # Step 2: Send welcome email
  - member: send-welcome-email
    condition: ${create-user.success}
    input:
      userId: ${create-user.output.id}
      email: ${input.userData.email}
    continue_on_error: true

  # Step 3: Create subscription
  - member: create-subscription
    condition: ${create-user.success && send-welcome-email.success}
    input:
      userId: ${create-user.output.id}
      plan: ${input.plan}
    state:
      use: [createdResources]
      set: [createdResources]
    continue_on_error: true

  # Rollback if any step failed (only when a user was actually created)
  - member: rollback-user
    condition: ${create-user.success && (!send-welcome-email.success || !create-subscription.success)}
    input:
      userId: ${create-user.output.id}

  - member: cleanup-resources
    condition: ${!send-welcome-email.success || !create-subscription.success}
    state:
      use: [createdResources]

output:
  success: ${create-user.success && send-welcome-email.success && create-subscription.success}
  # Quoted: the ternary contains ": ", which a plain YAML scalar cannot hold
  userId: "${create-user.success ? create-user.output.id : null}"
  rolledBack: ${rollback-user.success}
  errors:
    user: ${create-user.error}
    email: ${send-welcome-email.error}
    subscription: ${create-subscription.error}

Circuit Breaker Pattern

name: circuit-breaker
description: Stop calling failing service

state:
  schema:
    failureCount: number
    circuitOpen: boolean
    lastFailureTime: number

flow:
  # Check circuit state
  - member: check-circuit
    type: Function
    state:
      use: [failureCount, circuitOpen, lastFailureTime]
      set: [circuitOpen]

  # Only call if circuit is closed
  - member: call-service
    condition: ${!state.circuitOpen}
    type: API
    config:
      url: "https://api.example.com/data"
    continue_on_error: true

  # Update circuit based on result
  - member: update-circuit
    type: Function
    input:
      success: ${call-service.success}
      previousFailures: ${state.failureCount}
    state:
      use: [failureCount]
      set: [failureCount, circuitOpen, lastFailureTime]

  # Use fallback whenever the service produced no result: either the
  # circuit is open (call skipped) or the call itself failed. Checking
  # only the circuit state would leave `result` empty for a failure that
  # has not yet opened the circuit.
  - member: use-fallback
    condition: "${state.circuitOpen || !call-service.success}"
    type: Function

output:
  # Quoted: the ternary contains ": ", which a plain YAML scalar cannot hold
  result: "${call-service.success ? call-service.output : use-fallback.output}"
  circuitOpen: ${state.circuitOpen}
  failureCount: ${state.failureCount}

Validation with Error Messages

name: validation-with-errors
description: Collect all validation errors

flow:
  # Validate all fields in parallel.
  # NOTE(review): `parallel` is written as a list item so that `flow`
  # remains a sequence; a bare `parallel:` key followed by `- member:`
  # items at the same indent is not parseable YAML. Confirm the exact
  # step shape against the ensemble schema.
  - parallel:
      - member: validate-email
        input:
          email: ${input.email}
        continue_on_error: true

      - member: validate-phone
        input:
          phone: ${input.phone}
        continue_on_error: true

      - member: validate-address
        input:
          address: ${input.address}
        continue_on_error: true

  # Collect all errors
  - member: collect-validation-errors
    type: Function
    input:
      emailValid: ${validate-email.success}
      emailError: ${validate-email.error}
      phoneValid: ${validate-phone.success}
      phoneError: ${validate-phone.error}
      addressValid: ${validate-address.success}
      addressError: ${validate-address.error}

output:
  valid: ${validate-email.success && validate-phone.success && validate-address.success}
  errors: ${collect-validation-errors.output.errors}
  errorCount: ${collect-validation-errors.output.errorCount}

Timeout Handling

name: timeout-handling
description: Handle slow operations

flow:
  # Set aggressive timeout
  - member: fast-api
    type: API
    config:
      url: "https://api.example.com/fast"
      timeout: 5000  # 5 seconds
    continue_on_error: true

  # Fallback to slower but more reliable source
  - member: slow-but-reliable
    condition: ${!fast-api.success}
    type: API
    config:
      url: "https://api.example.com/reliable"
      timeout: 30000  # 30 seconds

output:
  # Ternaries are quoted because a plain YAML scalar cannot contain ": "
  data: "${fast-api.success ? fast-api.output : slow-but-reliable.output}"
  responseTime: "${fast-api.success ? fast-api.executionTime : slow-but-reliable.executionTime}"
  timedOut: ${fast-api.error?.includes('timeout')}

Member-Level Error Handling

// members/resilient-function/index.ts
import { createFunctionMember } from '@ensemble-edge/conductor/sdk';

export default createFunctionMember({
  /**
   * Runs the primary operation and falls back to a secondary one on failure.
   *
   * Returns `{ success: true, result }` when the primary operation works,
   * the same shape plus `usedFallback` and `originalError` when only the
   * fallback works, and a structured `{ success: false, ... }` object
   * (rather than a thrown error) when both operations fail.
   */
  async handler({ input }) {
    // A thrown value is not guaranteed to be an Error, so reading `.message`
    // directly can yield undefined (and is rejected by TypeScript when catch
    // variables are typed `unknown`). Normalize to a string instead.
    const messageOf = (err: unknown): string =>
      err instanceof Error ? err.message : String(err);

    try {
      // Primary logic
      const result = await riskyOperation(input);

      return {
        success: true,
        result
      };
    } catch (error) {
      // Log error for monitoring
      console.error('Operation failed:', error);

      // Try fallback
      try {
        const fallbackResult = await fallbackOperation(input);

        return {
          success: true,
          result: fallbackResult,
          usedFallback: true,
          originalError: messageOf(error)
        };
      } catch (fallbackError) {
        // Return structured error instead of throwing
        return {
          success: false,
          error: messageOf(error),
          fallbackError: messageOf(fallbackError),
          defaultValue: getDefaultValue()
        };
      }
    }
  }
});

Testing Error Scenarios

import { describe, it, expect } from 'vitest';
import { TestConductor } from '@ensemble-edge/conductor/testing';

describe('resilient-workflow', () => {
  // The HTTP mock always rejects, so the workflow has to fall back to the
  // mocked cache entry.
  it('should use cache when API fails', async () => {
    const mocks = {
      http: {
        handler: () => Promise.reject(new Error('API unavailable'))
      },
      database: {
        responses: {
          'use-cached-data': { value: { cached: 'data' } }
        }
      }
    };
    const conductor = await TestConductor.create({ mocks });

    const run = await conductor.executeEnsemble('resilient-workflow', {});

    expect(run).toBeSuccessful();
    expect(run.output.dataSource).toBe('cache');
    expect(run.output.result).toBeDefined();
  });

  // The HTTP mock rejects on the first two calls and resolves on the third.
  it('should retry failed operations', async () => {
    let attempts = 0;

    const mocks = {
      http: {
        handler: async () => {
          attempts += 1;
          if (attempts >= 3) {
            return { data: 'success' };
          }
          throw new Error('Temporary failure');
        }
      }
    };
    const conductor = await TestConductor.create({ mocks });

    const run = await conductor.executeEnsemble('resilient-workflow', {});

    expect(run).toBeSuccessful();
    expect(attempts).toBe(3);
  });

  // Both the HTTP and database mocks reject, leaving only the defaults.
  it('should handle all sources failing', async () => {
    const mocks = {
      http: {
        handler: () => Promise.reject(new Error('All sources down'))
      },
      database: {
        handler: () => Promise.reject(new Error('Cache miss'))
      }
    };
    const conductor = await TestConductor.create({ mocks });

    const run = await conductor.executeEnsemble('multi-level-fallback', {});

    expect(run).toBeSuccessful();
    expect(run.output.source).toBe('defaults');
    expect(run.output.errors.primaryError).toBeDefined();
  });
});

Best Practices

1. Fail Fast for User Errors

# ✅ Good - bad input fails on the first attempt
- member: validate-input
  # No retry block: validation failures are not retried

# ❌ Bad - a retry cannot fix invalid input
- member: validate-input
  retry:
    # Waste of time
    maxAttempts: 3

2. Retry Transient Failures

# ✅ Good - network errors are retried
- member: api-call
  retry:
    backoff: exponential
    maxAttempts: 3

# ✅ Good - permanent failures are not retried
- member: api-call
  retry:
    # Only server errors
    retryOn:
      - 500
      - 502
      - 503
      - 504

3. Always Have a Fallback

# ✅ Good - degrades gracefully
- member: primary
  continue_on_error: true

- member: fallback
  condition: '${!primary.success}'

# ❌ Bad - nothing to recover with
- member: primary
  # A failure here fails the whole workflow

4. Log Errors for Monitoring

// Log errors but don't throw
console.error('Operation failed:', {
  error: error.message,
  input: input,
  timestamp: Date.now()
});

// Report to monitoring service.
// Analytics Engine accepts at most one index, and it must be a string
// (it acts as the sampling key), so index by member name rather than by
// a numeric timestamp. Each data point is timestamped automatically
// when it is written.
// NOTE(review): assumes ANALYTICS is a Workers Analytics Engine binding
// and memberName is a string - confirm.
env.ANALYTICS?.writeDataPoint({
  blobs: ['error', memberName],
  doubles: [1],
  indexes: [memberName]
});