Overview
Production workflows must handle errors gracefully. This example demonstrates retry logic, fallback strategies, partial failure handling, and error recovery patterns.

Basic Error Handling
Copy
name: resilient-workflow
description: Handle errors with retry and fallback

flow:
  # Try with automatic retry
  - member: fetch-data
    type: API
    config:
      url: "https://api.example.com/data"
      timeout: 5000
    # Retry policy lives at the member level. (The original example also set a
    # redundant `retries: 3` inside `config`, duplicating maxAttempts.)
    retry:
      maxAttempts: 3
      backoff: exponential
    continue_on_error: true  # Don't fail entire workflow

  # Fallback if fetch failed
  - member: use-cached-data
    condition: ${!fetch-data.success}
    type: Data
    config:
      storage: kv
      operation: get
      binding: CACHE

  # Proceed with either fresh or cached data
  - member: process-data
    input:
      # Quoted: plain YAML scalars may not contain ": " (the ternary's colon).
      data: "${fetch-data.success ? fetch-data.output.data : use-cached-data.output.value}"

output:
  result: ${process-data.output}
  dataSource: "${fetch-data.success ? 'api' : 'cache'}"
  error: ${fetch-data.error}
Multiple Fallback Levels
Copy
name: multi-level-fallback
description: Chain multiple fallback strategies

flow:
  # Primary: Try external API
  - member: fetch-from-api
    type: API
    config:
      url: "https://api.primary.com/data"
    continue_on_error: true

  # Fallback 1: Try secondary API
  - member: fetch-from-backup-api
    condition: ${!fetch-from-api.success}
    type: API
    config:
      url: "https://api.backup.com/data"
    continue_on_error: true

  # Fallback 2: Try cache
  - member: fetch-from-cache
    condition: ${!fetch-from-api.success && !fetch-from-backup-api.success}
    type: Data
    config:
      storage: kv
      operation: get
    continue_on_error: true

  # Fallback 3: Use default values
  - member: use-defaults
    condition: ${!fetch-from-api.success && !fetch-from-backup-api.success && !fetch-from-cache.success}
    type: Function

output:
  # Quoted: the nested ternaries contain ": ", which is illegal in a plain YAML scalar.
  data: "${fetch-from-api.success ? fetch-from-api.output : fetch-from-backup-api.success ? fetch-from-backup-api.output : fetch-from-cache.success ? fetch-from-cache.output.value : use-defaults.output}"
  source: "${fetch-from-api.success ? 'primary-api' : fetch-from-backup-api.success ? 'backup-api' : fetch-from-cache.success ? 'cache' : 'defaults'}"
  errors:
    primaryError: ${fetch-from-api.error}
    backupError: ${fetch-from-backup-api.error}
    cacheError: ${fetch-from-cache.error}
Partial Failure Handling
Copy
name: gather-with-partial-failures
description: Continue even if some sources fail

flow:
  # Try to gather from multiple sources concurrently.
  # NOTE(review): `parallel` reconstructed as a flow step containing a nested
  # member list — confirm against the conductor schema.
  - parallel:
      - member: source-a
        continue_on_error: true
      - member: source-b
        continue_on_error: true
      - member: source-c
        continue_on_error: true

  # Process whatever data we got
  - member: combine-available-data
    input:
      sourceA: "${source-a.success ? source-a.output : null}"
      sourceB: "${source-b.success ? source-b.output : null}"
      sourceC: "${source-c.success ? source-c.output : null}"
      successCount: ${[source-a.success, source-b.success, source-c.success].filter(Boolean).length}

  # Only fail if all sources failed
  - member: check-minimum-data
    input:
      successCount: ${combine-available-data.output.successCount}
      minimumRequired: 1

output:
  data: ${combine-available-data.output}
  sourcesSucceeded: ${[source-a.success, source-b.success, source-c.success].filter(Boolean).length}
  sourcesFailed: ${[source-a.success, source-b.success, source-c.success].filter(x => !x).length}
  errors:
    - "${!source-a.success ? source-a.error : null}"
    - "${!source-b.success ? source-b.error : null}"
    - "${!source-c.success ? source-c.error : null}"
Retry with Backoff
Copy
name: retry-patterns
description: Different retry strategies

flow:
  # Exponential backoff (recommended)
  - member: flaky-api-call
    retry:
      maxAttempts: 5
      backoff: exponential  # 1s, 2s, 4s, 8s, 16s
      # NOTE(review): timeout appeared directly after backoff in the source;
      # confirm whether it belongs under `retry` or at the member level.
      timeout: 30000

  # Linear backoff
  - member: moderate-retry
    retry:
      maxAttempts: 3
      backoff: linear  # 2s, 4s, 6s

  # Fixed backoff
  - member: simple-retry
    retry:
      maxAttempts: 3
      backoff: fixed  # 5s, 5s, 5s
Error Recovery with Compensation
Copy
name: transactional-workflow
description: Rollback on failure

state:
  schema:
    createdResources: array
    needsRollback: boolean

flow:
  # Step 1: Create user
  - member: create-user
    input:
      userData: ${input.userData}
    state:
      set: [createdResources]
    continue_on_error: true

  # Step 2: Send welcome email
  - member: send-welcome-email
    condition: ${create-user.success}
    input:
      userId: ${create-user.output.id}
      email: ${input.userData.email}
    continue_on_error: true

  # Step 3: Create subscription
  - member: create-subscription
    condition: ${create-user.success && send-welcome-email.success}
    input:
      userId: ${create-user.output.id}
      plan: ${input.plan}
    state:
      use: [createdResources]
      set: [createdResources]
    continue_on_error: true

  # Rollback if any step failed
  - member: rollback-user
    condition: ${create-user.success && (!send-welcome-email.success || !create-subscription.success)}
    input:
      userId: ${create-user.output.id}

  - member: cleanup-resources
    condition: ${!send-welcome-email.success || !create-subscription.success}
    state:
      use: [createdResources]

output:
  success: ${create-user.success && send-welcome-email.success && create-subscription.success}
  # Quoted: the ternary contains ": ", which is illegal in a plain YAML scalar.
  userId: "${create-user.success ? create-user.output.id : null}"
  rolledBack: ${rollback-user.success}
  errors:
    user: ${create-user.error}
    email: ${send-welcome-email.error}
    subscription: ${create-subscription.error}
Circuit Breaker Pattern
Copy
name: circuit-breaker
description: Stop calling failing service

state:
  schema:
    failureCount: number
    circuitOpen: boolean
    lastFailureTime: number

flow:
  # Check circuit state
  - member: check-circuit
    type: Function
    state:
      use: [failureCount, circuitOpen, lastFailureTime]
      set: [circuitOpen]

  # Only call if circuit is closed
  - member: call-service
    condition: ${!state.circuitOpen}
    type: API
    config:
      url: "https://api.example.com/data"
    continue_on_error: true

  # Update circuit based on result
  - member: update-circuit
    type: Function
    input:
      success: ${call-service.success}
      previousFailures: ${state.failureCount}
    state:
      use: [failureCount]
      set: [failureCount, circuitOpen, lastFailureTime]

  # Use fallback if circuit is open
  - member: use-fallback
    condition: ${state.circuitOpen}
    type: Function

output:
  # Quoted: the ternary contains ": ", which is illegal in a plain YAML scalar.
  result: "${call-service.success ? call-service.output : use-fallback.output}"
  circuitOpen: ${state.circuitOpen}
  failureCount: ${state.failureCount}
Validation with Error Messages
Copy
name: validation-with-errors
description: Collect all validation errors

flow:
  # Validate all fields in parallel
  # NOTE(review): `parallel` reconstructed as a flow step containing a nested
  # member list — confirm against the conductor schema.
  - parallel:
      - member: validate-email
        input:
          email: ${input.email}
        continue_on_error: true
      - member: validate-phone
        input:
          phone: ${input.phone}
        continue_on_error: true
      - member: validate-address
        input:
          address: ${input.address}
        continue_on_error: true

  # Collect all errors
  - member: collect-validation-errors
    type: Function
    input:
      emailValid: ${validate-email.success}
      emailError: ${validate-email.error}
      phoneValid: ${validate-phone.success}
      phoneError: ${validate-phone.error}
      addressValid: ${validate-address.success}
      addressError: ${validate-address.error}

output:
  valid: ${validate-email.success && validate-phone.success && validate-address.success}
  errors: ${collect-validation-errors.output.errors}
  errorCount: ${collect-validation-errors.output.errorCount}
Timeout Handling
Copy
name: timeout-handling
description: Handle slow operations

flow:
  # Set aggressive timeout
  - member: fast-api
    type: API
    config:
      url: "https://api.example.com/fast"
      timeout: 5000  # 5 seconds
    continue_on_error: true

  # Fallback to slower but more reliable source
  - member: slow-but-reliable
    condition: ${!fast-api.success}
    type: API
    config:
      url: "https://api.example.com/reliable"
      timeout: 30000  # 30 seconds

output:
  # Quoted: the ternaries contain ": ", which is illegal in a plain YAML scalar.
  data: "${fast-api.success ? fast-api.output : slow-but-reliable.output}"
  responseTime: "${fast-api.success ? fast-api.executionTime : slow-but-reliable.executionTime}"
  timedOut: ${fast-api.error?.includes('timeout')}
Member-Level Error Handling
Copy
// members/resilient-function/index.ts
import { createFunctionMember } from '@ensemble-edge/conductor/sdk';

export default createFunctionMember({
  async handler({ input }) {
    try {
      // First choice: run the primary operation.
      const result = await riskyOperation(input);
      return {
        success: true,
        result
      };
    } catch (error) {
      // Log error for monitoring
      console.error('Operation failed:', error);

      try {
        // Second choice: the fallback path, flagged so callers can tell.
        const fallbackResult = await fallbackOperation(input);
        return {
          success: true,
          result: fallbackResult,
          usedFallback: true,
          originalError: error.message
        };
      } catch (fallbackError) {
        // Both paths failed — return a structured error (with a safe default)
        // rather than throwing, so the workflow can keep going.
        return {
          success: false,
          error: error.message,
          fallbackError: fallbackError.message,
          defaultValue: getDefaultValue()
        };
      }
    }
  }
});
Testing Error Scenarios
Copy
import { describe, it, expect } from 'vitest';
import { TestConductor } from '@ensemble-edge/conductor/testing';

describe('resilient-workflow', () => {
  it('should use cache when API fails', async () => {
    // HTTP always fails; the KV mock serves the cached value instead.
    const conductor = await TestConductor.create({
      mocks: {
        http: {
          handler: async () => {
            throw new Error('API unavailable');
          }
        },
        database: {
          responses: {
            'use-cached-data': {
              value: { cached: 'data' }
            }
          }
        }
      }
    });

    const result = await conductor.executeEnsemble('resilient-workflow', {});

    expect(result).toBeSuccessful();
    expect(result.output.dataSource).toBe('cache');
    expect(result.output.result).toBeDefined();
  });

  it('should retry failed operations', async () => {
    // Fail twice, succeed on the third call — retry config should absorb this.
    let attempts = 0;
    const conductor = await TestConductor.create({
      mocks: {
        http: {
          handler: async () => {
            attempts++;
            if (attempts < 3) {
              throw new Error('Temporary failure');
            }
            return { data: 'success' };
          }
        }
      }
    });

    const result = await conductor.executeEnsemble('resilient-workflow', {});

    expect(result).toBeSuccessful();
    expect(attempts).toBe(3);
  });

  it('should handle all sources failing', async () => {
    // Every upstream (both APIs and the cache) fails; defaults must win.
    const conductor = await TestConductor.create({
      mocks: {
        http: {
          handler: async () => {
            throw new Error('All sources down');
          }
        },
        database: {
          handler: async () => {
            throw new Error('Cache miss');
          }
        }
      }
    });

    const result = await conductor.executeEnsemble('multi-level-fallback', {});

    expect(result).toBeSuccessful();
    expect(result.output.source).toBe('defaults');
    expect(result.output.errors.primaryError).toBeDefined();
  });
});
Best Practices
1. Fail Fast for User Errors
Copy
# ✅ Good - fail immediately for bad input
- member: validate-input
  # Don't retry validation failures

# ❌ Bad - retrying won't help
- member: validate-input
  retry:
    maxAttempts: 3  # Waste of time
2. Retry Transient Failures
Copy
# ✅ Good - retry network errors
- member: api-call
  retry:
    maxAttempts: 3
    backoff: exponential

# ✅ Good - don't retry permanent failures
- member: api-call
  retry:
    retryOn: [500, 502, 503, 504]  # Only server errors
3. Always Have a Fallback
Copy
# ✅ Good - graceful degradation
- member: primary
  continue_on_error: true
- member: fallback
  condition: ${!primary.success}

# ❌ Bad - no recovery path
- member: primary
  # Workflow fails if this fails
4. Log Errors for Monitoring
Copy
// Log errors but don't throw
// NOTE(review): `error`, `input`, `memberName`, and `env` come from the
// surrounding member handler scope — this snippet is a fragment.
console.error('Operation failed:', {
error: error.message,
input: input,
timestamp: Date.now()
});
// Report to monitoring service
// Presumably Workers Analytics Engine (`writeDataPoint`); the optional
// chaining skips reporting when the ANALYTICS binding is not configured.
env.ANALYTICS?.writeDataPoint({
blobs: ['error', memberName],
doubles: [1],
indexes: [Date.now()]
});

