Error Handling & Recovery

Comprehensive guide to error handling, debugging, and recovery strategies in Axon OS workflows.

Overview

Effective error handling is critical for building robust and reliable workflows in Axon OS. The platform provides comprehensive error handling mechanisms, debugging tools, and recovery strategies to ensure workflows can gracefully handle failures and maintain operational continuity.

Key Features

Multi-Level Error Handling: Node, workflow, and system-level error handling
Smart Recovery: Automatic retry mechanisms with exponential backoff
Error Classification: Categorizes errors by type and severity and attaches contextual information
Circuit Breakers: Prevent cascading failures
Debugging Tools: Comprehensive debugging and diagnostic capabilities
Monitoring Integration: Real-time error tracking and alerting

Error Types and Classification

Error Hierarchy

/**
 * String-valued categories used to classify workflow errors.
 *
 * Members are grouped by where the failure originates (system, workflow,
 * data, external service, user). `ErrorClassifier` maps raw errors onto
 * these values and derives severity and recovery strategy from them.
 */
enum ErrorType {
  // System errors
  SYSTEM_ERROR = 'system_error',
  RESOURCE_ERROR = 'resource_error',
  CONFIGURATION_ERROR = 'configuration_error',
  
  // Workflow errors
  VALIDATION_ERROR = 'validation_error',
  EXECUTION_ERROR = 'execution_error',
  TIMEOUT_ERROR = 'timeout_error',
  DEPENDENCY_ERROR = 'dependency_error',
  
  // Data errors
  DATA_ERROR = 'data_error',
  SCHEMA_ERROR = 'schema_error',
  TRANSFORMATION_ERROR = 'transformation_error',
  
  // External errors
  NETWORK_ERROR = 'network_error',
  API_ERROR = 'api_error',
  DATABASE_ERROR = 'database_error',
  
  // User errors
  USER_ERROR = 'user_error',
  PERMISSION_ERROR = 'permission_error',
  AUTHENTICATION_ERROR = 'authentication_error'
}

/**
 * Severity levels assigned to a classified error, from least to most severe.
 *
 * `ErrorClassifier.assessSeverity` derives the level purely from the
 * `ErrorType`; any type not explicitly listed there falls back to `LOW`.
 */
enum ErrorSeverity {
  LOW = 'low',           // Warning, workflow continues
  MEDIUM = 'medium',     // Recoverable error, retry possible
  HIGH = 'high',         // Node failure, workflow continues
  CRITICAL = 'critical'  // Workflow failure, immediate attention required
}

/**
 * Structured error record passed between the error-handling components.
 *
 * This is an interface, so it exists only at compile time: there is no
 * runtime constructor, and it does not extend the built-in `Error` (there is
 * no `stack` property; the trace lives in `details.stackTrace`).
 */
interface WorkflowError {
  id: string;
  type: ErrorType;
  severity: ErrorSeverity;
  code: string;
  message: string;
  details: ErrorDetails;
  context: ErrorContext;
  timestamp: Date;
  // Number of retries already attempted; NodeErrorHandler.handleRetry compares
  // it against RetryConfiguration.maxAttempts.
  retryCount: number;
  resolved: boolean;
  resolutionStrategy?: string;
}

/**
 * Diagnostic payload attached to a `WorkflowError`.
 */
interface ErrorDetails {
  nodeId?: string;
  portId?: string;
  stackTrace: string;
  inputData?: any;
  expectedOutput?: any;
  actualOutput?: any;
  systemInfo: SystemInfo;
  // Free-form bag; NodeErrorHandler.matchesCondition reads `metadata.statusCode`
  // when a retry condition lists status codes.
  metadata: Record<string, any>;
}

/**
 * Identifies where in a workflow execution an error occurred.
 *
 * `UserInfo` and `EnvironmentInfo` are declared outside this listing.
 */
interface ErrorContext {
  workflowId: string;
  executionId: string;
  nodeId: string;
  stepNumber: number;
  // Only present for nested executions — presumably sub-workflows; confirm against the engine.
  parentExecutionId?: string;
  userInfo: UserInfo;
  environmentInfo: EnvironmentInfo;
}

/**
 * Snapshot of the host process at the time an error was recorded.
 *
 * `memoryUsage` and `cpuUsage` use the Node.js typings for the values returned
 * by `process.memoryUsage()` and `process.cpuUsage()`.
 */
interface SystemInfo {
  // NOTE(review): presumably the Node.js runtime version rather than a workflow-node version — confirm.
  nodeVersion: string;
  platform: string;
  architecture: string;
  memoryUsage: NodeJS.MemoryUsage;
  cpuUsage: NodeJS.CpuUsage;
  uptime: number;
}

Error Categories

/**
 * Classifies raw errors into an `ErrorType`, an `ErrorSeverity` and a
 * suggested recovery strategy.
 *
 * NOTE(review): `classify` calls `isRetryable`, `requiresUserAction` and
 * `requiresSystemAction`, which are not defined in this listing — confirm they
 * exist in the real implementation.
 */
class ErrorClassifier {
  /**
   * Builds the full classification for an error.
   *
   * @param error - A native `Error` or an already-structured `WorkflowError`.
   * @returns The derived type, severity, recoverability flags and strategy id.
   */
  classify(error: Error | WorkflowError): ErrorClassification {
    const type = this.determineErrorType(error);
    const severity = this.assessSeverity(error, type);
    const recoverable = this.isRecoverable(error, type);
    const strategy = this.suggestRecoveryStrategy(error, type, severity);

    return {
      type,
      severity,
      recoverable,
      strategy,
      retryable: this.isRetryable(error, type),
      userActionRequired: this.requiresUserAction(error, type),
      systemActionRequired: this.requiresSystemAction(error, type)
    };
  }

  /**
   * Structural type guard for `WorkflowError`.
   *
   * `WorkflowError` is an interface and therefore has no runtime value, so
   * `instanceof` cannot be used. Both `type` and `severity` must hold known
   * enum values; checking `type` alone is not enough because some native
   * error classes also expose a string `type` property.
   */
  private isWorkflowError(error: Error | WorkflowError): error is WorkflowError {
    const candidate = error as Partial<WorkflowError>;
    const knownTypes = Object.values(ErrorType) as string[];
    const knownSeverities = Object.values(ErrorSeverity) as string[];

    return (
      typeof candidate.type === 'string' &&
      knownTypes.includes(candidate.type) &&
      typeof candidate.severity === 'string' &&
      knownSeverities.includes(candidate.severity)
    );
  }

  /**
   * Returns the type carried by a `WorkflowError`, or infers one for a native
   * `Error` from keywords in its message and stack trace.
   *
   * Checks run in a fixed order and the first match wins; anything that
   * matches no keyword is an `EXECUTION_ERROR`.
   */
  private determineErrorType(error: Error | WorkflowError): ErrorType {
    if (this.isWorkflowError(error)) {
      return error.type;
    }

    // Keyword heuristics are case-insensitive.
    const message = (error.message ?? '').toLowerCase();
    const stack = error.stack?.toLowerCase() ?? '';

    if (message.includes('timeout') || message.includes('timed out')) {
      return ErrorType.TIMEOUT_ERROR;
    }

    if (message.includes('network') || message.includes('fetch') || message.includes('connection')) {
      return ErrorType.NETWORK_ERROR;
    }

    if (message.includes('validation') || message.includes('invalid')) {
      return ErrorType.VALIDATION_ERROR;
    }

    if (message.includes('permission') || message.includes('unauthorized')) {
      return ErrorType.PERMISSION_ERROR;
    }

    if (message.includes('schema') || message.includes('type')) {
      return ErrorType.SCHEMA_ERROR;
    }

    // Database detection looks at the stack trace, not the message.
    if (stack.includes('database') || stack.includes('sql')) {
      return ErrorType.DATABASE_ERROR;
    }

    return ErrorType.EXECUTION_ERROR;
  }

  /**
   * Maps an error type to a severity level.
   *
   * Only the types listed below are elevated; every other type (including
   * execution, database and data errors) is reported as `LOW`.
   */
  private assessSeverity(error: Error | WorkflowError, type: ErrorType): ErrorSeverity {
    const criticalTypes = [
      ErrorType.SYSTEM_ERROR,
      ErrorType.CONFIGURATION_ERROR,
      ErrorType.AUTHENTICATION_ERROR
    ];
    if (criticalTypes.includes(type)) {
      return ErrorSeverity.CRITICAL;
    }

    const highSeverityTypes = [
      ErrorType.RESOURCE_ERROR,
      ErrorType.DEPENDENCY_ERROR
    ];
    if (highSeverityTypes.includes(type)) {
      return ErrorSeverity.HIGH;
    }

    const mediumSeverityTypes = [
      ErrorType.NETWORK_ERROR,
      ErrorType.API_ERROR,
      ErrorType.TIMEOUT_ERROR
    ];
    if (mediumSeverityTypes.includes(type)) {
      return ErrorSeverity.MEDIUM;
    }

    return ErrorSeverity.LOW;
  }

  /** Reports whether the error type is one of the types treated as recoverable. */
  private isRecoverable(error: Error | WorkflowError, type: ErrorType): boolean {
    const recoverableTypes = [
      ErrorType.NETWORK_ERROR,
      ErrorType.API_ERROR,
      ErrorType.TIMEOUT_ERROR,
      ErrorType.DATABASE_ERROR,
      ErrorType.RESOURCE_ERROR
    ];

    return recoverableTypes.includes(type);
  }

  /**
   * Picks a recovery strategy identifier for the error type.
   *
   * Types without a dedicated strategy fall back to `'manual_intervention'`
   * when the severity is critical and to `'retry'` otherwise.
   */
  private suggestRecoveryStrategy(
    error: Error | WorkflowError,
    type: ErrorType,
    severity: ErrorSeverity
  ): string {
    switch (type) {
      case ErrorType.NETWORK_ERROR:
      case ErrorType.API_ERROR:
        return 'retry_with_backoff';

      case ErrorType.TIMEOUT_ERROR:
        return 'increase_timeout_and_retry';

      case ErrorType.RESOURCE_ERROR:
        return 'wait_for_resources';

      case ErrorType.VALIDATION_ERROR:
        return 'fix_input_data';

      case ErrorType.CONFIGURATION_ERROR:
        return 'update_configuration';

      case ErrorType.PERMISSION_ERROR:
        return 'check_permissions';

      default:
        return severity === ErrorSeverity.CRITICAL ? 'manual_intervention' : 'retry';
    }
  }
}

Node-Level Error Handling

Error Handling Configuration

/**
 * Per-node error-handling configuration consumed by `NodeErrorHandler`.
 *
 * `strategy` selects which branch `NodeErrorHandler.handleError` takes;
 * `retryConfig` is read only by the retry branch and `fallbackBehavior` only
 * by the fallback branch. `CustomErrorHandler` and `ErrorPropagation` are
 * declared outside this listing.
 */
interface NodeErrorHandling {
  enabled: boolean;
  strategy: ErrorStrategy;
  retryConfig: RetryConfiguration;
  fallbackBehavior: FallbackBehavior;
  customHandlers: CustomErrorHandler[];
  propagation: ErrorPropagation;
}

/**
 * Strategies a node can apply when it encounters an error.
 *
 * `NodeErrorHandler.handleError` dispatches on this value; `FAIL_FAST` shares
 * the switch's default branch, so an unrecognized strategy also fails.
 */
enum ErrorStrategy {
  FAIL_FAST = 'fail_fast',           // Stop immediately on error
  RETRY = 'retry',                   // Retry with configuration
  FALLBACK = 'fallback',             // Use fallback value/node
  IGNORE = 'ignore',                 // Continue execution, log error
  CIRCUIT_BREAKER = 'circuit_breaker' // Use circuit breaker pattern
}

/**
 * Retry settings used by `NodeErrorHandler`.
 *
 * The base delay for attempt `n` is `initialDelay * backoffMultiplier ** n`,
 * limited by `maxDelay`. Delay units are presumably milliseconds (the example
 * config uses 1000 / 10000) — confirm against the `delay` helper.
 */
interface RetryConfiguration {
  // Retrying stops once WorkflowError.retryCount reaches this value.
  maxAttempts: number;
  initialDelay: number;
  maxDelay: number;
  backoffMultiplier: number;
  // When true, a random variation of up to 10% of the delay is applied.
  jitter: boolean;
  // An error is retried if it matches at least one of these conditions.
  retryConditions: RetryCondition[];
}

/**
 * One rule describing which errors are eligible for retry.
 *
 * `NodeErrorHandler.matchesCondition` treats the fields as alternatives: a
 * match on the error type, OR a listed status code, OR a message pattern, OR
 * the custom predicate is enough.
 */
interface RetryCondition {
  errorTypes: ErrorType[];
  // Compared against `error.details.metadata.statusCode`.
  statusCodes?: number[];
  // Tested against `error.message`.
  messagePatterns?: RegExp[];
  customPredicate?: (error: WorkflowError) => boolean;
}

/**
 * What to do instead of failing when the FALLBACK strategy is active.
 *
 * As handled by `NodeErrorHandler.handleFallback`:
 * - `'value'`   continues with `value` as the node result;
 * - `'node'`    redirects execution to `nodeId`;
 * - `'skip'`    skips ahead, passing `skipToNodeId` along;
 * - `'default'` continues with a default derived from the node definition.
 */
interface FallbackBehavior {
  type: 'value' | 'node' | 'skip' | 'default';
  value?: any;
  nodeId?: string;
  skipToNodeId?: string;
}

/**
 * Applies a node's configured error-handling strategy to a failure.
 *
 * NOTE(review): `shouldHandle`, `handleIgnore`, `handleCircuitBreaker`,
 * `createMaxRetriesError`, `delay` and `getDefaultValue` are referenced but
 * not defined in this listing — confirm they exist in the real class.
 */
class NodeErrorHandler {
  /**
   * Entry point: classifies the error and dispatches on the node's strategy.
   *
   * @param error - The structured error raised by the node.
   * @param nodeConfig - The node's error-handling configuration.
   * @param context - The current execution context.
   * @returns The action the engine should take (`propagate`, `fail`, `retry`, ...).
   */
  async handleError(
    error: WorkflowError,
    nodeConfig: NodeErrorHandling,
    context: ExecutionContext
  ): Promise<ErrorHandlingResult> {
    const classification = new ErrorClassifier().classify(error);

    // Errors this node is not meant to handle are passed up unchanged.
    if (!this.shouldHandle(error, nodeConfig, classification)) {
      return { action: 'propagate', error };
    }

    switch (nodeConfig.strategy) {
      case ErrorStrategy.RETRY:
        return await this.handleRetry(error, nodeConfig, context);

      case ErrorStrategy.FALLBACK:
        return await this.handleFallback(error, nodeConfig, context);

      case ErrorStrategy.IGNORE:
        return await this.handleIgnore(error, nodeConfig, context);

      case ErrorStrategy.CIRCUIT_BREAKER:
        return await this.handleCircuitBreaker(error, nodeConfig, context);

      case ErrorStrategy.FAIL_FAST:
      default:
        return { action: 'fail', error };
    }
  }

  /**
   * Decides whether to retry and, if so, waits out the backoff delay first.
   *
   * Fails when no retry condition matches or when `error.retryCount` has
   * reached `maxAttempts`; otherwise resolves with a `retry` action after the
   * computed delay has elapsed.
   */
  private async handleRetry(
    error: WorkflowError,
    config: NodeErrorHandling,
    context: ExecutionContext
  ): Promise<ErrorHandlingResult> {
    const retryConfig = config.retryConfig;

    if (!this.shouldRetry(error, retryConfig)) {
      return { action: 'fail', error };
    }

    if (error.retryCount >= retryConfig.maxAttempts) {
      return { action: 'fail', error: this.createMaxRetriesError(error) };
    }

    const delay = this.calculateRetryDelay(error.retryCount, retryConfig);
    await this.delay(delay);

    return {
      action: 'retry',
      error,
      retryCount: error.retryCount + 1,
      delay
    };
  }

  /**
   * Resolves the configured fallback into an engine action.
   *
   * A `'node'` fallback without a `nodeId` is treated as a failure instead of
   * redirecting execution to an undefined target.
   */
  private async handleFallback(
    error: WorkflowError,
    config: NodeErrorHandling,
    context: ExecutionContext
  ): Promise<ErrorHandlingResult> {
    const fallback = config.fallbackBehavior;

    switch (fallback.type) {
      case 'value':
        return {
          action: 'continue',
          result: fallback.value,
          error
        };

      case 'node':
        if (!fallback.nodeId) {
          return { action: 'fail', error };
        }
        return {
          action: 'redirect',
          targetNodeId: fallback.nodeId,
          error
        };

      case 'skip':
        return {
          action: 'skip',
          skipToNodeId: fallback.skipToNodeId,
          error
        };

      case 'default': {
        // Braces keep the declaration scoped to this case.
        const defaultValue = this.getDefaultValue(context.nodeDefinition);
        return {
          action: 'continue',
          result: defaultValue,
          error
        };
      }

      default:
        return { action: 'fail', error };
    }
  }

  /**
   * Computes the exponential-backoff delay for the given attempt.
   *
   * The base delay is `initialDelay * backoffMultiplier ** retryCount`, capped
   * at `maxDelay`. With jitter enabled, a random amount of up to 10% of that
   * delay is added; if adding it would exceed `maxDelay`, it is subtracted
   * instead, so the result never exceeds `maxDelay` while concurrent retries
   * at the cap still spread out.
   */
  private calculateRetryDelay(
    retryCount: number,
    config: RetryConfiguration
  ): number {
    const exponential = config.initialDelay * Math.pow(config.backoffMultiplier, retryCount);
    const capped = Math.min(exponential, config.maxDelay);

    if (!config.jitter) {
      return capped;
    }

    const jitter = Math.random() * capped * 0.1;
    return capped + jitter <= config.maxDelay ? capped + jitter : capped - jitter;
  }

  /** Returns true when the error matches at least one retry condition. */
  private shouldRetry(error: WorkflowError, config: RetryConfiguration): boolean {
    return config.retryConditions.some(condition => this.matchesCondition(error, condition));
  }

  /**
   * Checks a single retry condition. The criteria are alternatives: error
   * type, status code, message pattern, then custom predicate — the first one
   * that matches makes the condition match.
   */
  private matchesCondition(error: WorkflowError, condition: RetryCondition): boolean {
    if (condition.errorTypes.includes(error.type)) {
      return true;
    }

    // Status codes are only meaningful when the error metadata carries one.
    const statusCode = error.details.metadata.statusCode;
    if (condition.statusCodes && statusCode && condition.statusCodes.includes(statusCode)) {
      return true;
    }

    if (condition.messagePatterns) {
      for (const pattern of condition.messagePatterns) {
        // `test` advances `lastIndex` on global/sticky patterns; reset it so a
        // reused pattern gives the same answer on every call.
        pattern.lastIndex = 0;
        if (pattern.test(error.message)) {
          return true;
        }
      }
    }

    if (condition.customPredicate) {
      return condition.customPredicate(error);
    }

    return false;
  }
}

// Example: HTTP Request node with comprehensive error handling
/**
 * Example configuration for an HTTP request node: retry with exponential
 * backoff on transient network failures.
 */
const httpNodeErrorConfig: NodeErrorHandling = {
  enabled: true,
  strategy: ErrorStrategy.RETRY,
  retryConfig: {
    maxAttempts: 3,
    // Base delays before jitter: 1000, 2000, 4000 (initialDelay * 2 ** retryCount).
    initialDelay: 1000,
    maxDelay: 10000,
    backoffMultiplier: 2,
    jitter: true,
    retryConditions: [
      {
        // The three criteria below are alternatives: matching any one of them
        // is enough for the error to be retried.
        errorTypes: [ErrorType.NETWORK_ERROR, ErrorType.TIMEOUT_ERROR],
        statusCodes: [429, 502, 503, 504], // Rate limit, bad gateway, service unavailable, gateway timeout
        messagePatterns: [/connection/i, /timeout/i]
      }
    ]
  },
  // NOTE(review): NodeErrorHandler.handleError only consults fallbackBehavior
  // when strategy is FALLBACK; with RETRY selected above this value is not
  // used by the handler shown in this guide — confirm the intent.
  fallbackBehavior: {
    type: 'value',
    value: {
      status: 0,
      data: null,
      error: 'Request failed after retries'
    }
  },
  customHandlers: [],
  propagation: {
    stopOnError: false,
    notifyParent: true,
    logLevel: 'warn'
  }
};

Circuit Breaker Pattern

Circuit Breaker Implementation

/**
 * States of a `CircuitBreaker`.
 *
 * Transitions implemented by `CircuitBreaker`: CLOSED -> OPEN when failures
 * reach the threshold, OPEN -> HALF_OPEN once the recovery timeout elapses,
 * HALF_OPEN -> CLOSED after enough successes, HALF_OPEN -> OPEN on any failure.
 */
enum CircuitState {
  CLOSED = 'closed',     // Normal operation
  OPEN = 'open',         // Failing, rejecting requests
  HALF_OPEN = 'half_open' // Testing if service recovered
}

/**
 * Tuning parameters for a `CircuitBreaker`.
 *
 * `timeout` is passed to `setTimeout` and `recoveryTimeout` is added to
 * `Date.now()`, so both are in milliseconds.
 */
interface CircuitBreakerConfig {
  failureThreshold: number;    // Number of failures to open circuit
  recoveryTimeout: number;     // Time to wait before trying again
  successThreshold: number;    // Successes needed to close circuit
  timeout: number;            // Request timeout
  monitoringPeriod: number;   // Time window for failure counting
}

/**
 * Circuit breaker guarding the operations of a single node.
 *
 * While CLOSED, operations run normally and failures are counted. Once the
 * count reaches `failureThreshold` the circuit OPENs and calls are rejected
 * immediately until `recoveryTimeout` has elapsed. The next call then runs in
 * HALF_OPEN: `successThreshold` successes close the circuit again, any
 * failure re-opens it.
 */
class CircuitBreaker {
  private state: CircuitState = CircuitState.CLOSED;
  private failureCount: number = 0;
  private successCount: number = 0;
  private lastFailureTime: number = 0;
  private nextAttemptTime: number = 0;
  // Timestamps of failures inside the current monitoring window.
  private failureTimestamps: number[] = [];

  constructor(
    private nodeId: string,
    private config: CircuitBreakerConfig
  ) {}

  /**
   * Runs `operation` under the breaker.
   *
   * @param operation - The async work to protect.
   * @returns The operation's result.
   * @throws CircuitBreakerOpenError when the circuit is OPEN and the recovery
   *   timeout has not elapsed (this rejection is not counted as a failure).
   * @throws TimeoutError when the operation exceeds `config.timeout`.
   * @throws Whatever the operation itself throws, after recording the failure.
   */
  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === CircuitState.OPEN) {
      if (Date.now() < this.nextAttemptTime) {
        throw new CircuitBreakerOpenError(
          `Circuit breaker is OPEN for node ${this.nodeId}. Next attempt at ${new Date(this.nextAttemptTime)}`
        );
      }

      // Recovery timeout elapsed: let this call probe the service.
      this.state = CircuitState.HALF_OPEN;
      this.successCount = 0;
    }

    try {
      const result = await this.executeWithTimeout(operation);
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  /**
   * Races the operation against a timer of `config.timeout` milliseconds.
   *
   * The timer is cleared in `finally`, so it is released on success, on
   * rejection, and also when `operation()` throws synchronously before
   * returning a promise.
   */
  private async executeWithTimeout<T>(operation: () => Promise<T>): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;

    const timeout = new Promise<never>((_resolve, reject) => {
      timer = setTimeout(() => {
        reject(new TimeoutError(`Operation timed out after ${this.config.timeout}ms`));
      }, this.config.timeout);
    });

    try {
      return await Promise.race([operation(), timeout]);
    } finally {
      clearTimeout(timer);
    }
  }

  /** Clears the failure tally and, in HALF_OPEN, counts towards closing the circuit. */
  private onSuccess(): void {
    this.failureCount = 0;
    this.failureTimestamps = [];

    if (this.state === CircuitState.HALF_OPEN) {
      this.successCount++;

      if (this.successCount >= this.config.successThreshold) {
        this.state = CircuitState.CLOSED;
        this.successCount = 0;
      }
    }
  }

  /**
   * Records a failure and opens the circuit when warranted.
   *
   * When `monitoringPeriod` is a positive number, only failures within that
   * many milliseconds of now count towards `failureThreshold`; otherwise every
   * failure since the last success counts.
   */
  private onFailure(): void {
    const now = Date.now();
    this.lastFailureTime = now;

    const windowMs = this.config.monitoringPeriod;
    if (windowMs > 0) {
      const cutoff = now - windowMs;
      this.failureTimestamps = this.failureTimestamps.filter(time => time >= cutoff);
      this.failureTimestamps.push(now);
      this.failureCount = this.failureTimestamps.length;
    } else {
      this.failureCount++;
    }

    const failedWhileProbing = this.state === CircuitState.HALF_OPEN;
    if (failedWhileProbing || this.failureCount >= this.config.failureThreshold) {
      this.state = CircuitState.OPEN;
      this.nextAttemptTime = now + this.config.recoveryTimeout;
    }
  }

  /**
   * Returns a snapshot of the breaker's state.
   *
   * `lastFailureTime` and `nextAttemptTime` are the Unix epoch (`new Date(0)`)
   * until the first failure / first opening has happened.
   */
  getStatus(): CircuitBreakerStatus {
    return {
      nodeId: this.nodeId,
      state: this.state,
      failureCount: this.failureCount,
      successCount: this.successCount,
      lastFailureTime: new Date(this.lastFailureTime),
      nextAttemptTime: new Date(this.nextAttemptTime),
      isHealthy: this.state === CircuitState.CLOSED
    };
  }
}

/**
 * Registry holding one `CircuitBreaker` per node id.
 */
class CircuitBreakerManager {
  private breakers: Map<string, CircuitBreaker> = new Map();

  /**
   * Returns the breaker registered for `nodeId`, creating it with `config` on
   * first use. The `config` argument is ignored when a breaker already exists.
   */
  getOrCreateBreaker(nodeId: string, config: CircuitBreakerConfig): CircuitBreaker {
    const existing = this.breakers.get(nodeId);
    if (existing !== undefined) {
      return existing;
    }

    const created = new CircuitBreaker(nodeId, config);
    this.breakers.set(nodeId, created);
    return created;
  }

  /** Collects a status snapshot from every registered breaker. */
  getAllStatuses(): CircuitBreakerStatus[] {
    const statuses: CircuitBreakerStatus[] = [];
    for (const breaker of this.breakers.values()) {
      statuses.push(breaker.getStatus());
    }
    return statuses;
  }

  /**
   * Resets a breaker by replacing it with a fresh instance that reuses the
   * old instance's configuration. Unknown node ids are ignored.
   */
  resetBreaker(nodeId: string): void {
    const current = this.breakers.get(nodeId);
    if (!current) {
      return;
    }

    // Bracket access reads the private `config` field of the old breaker.
    const replacement = new CircuitBreaker(nodeId, current['config']);
    this.breakers.set(nodeId, replacement);
  }
}

Workflow-Level Error Handling

Global Error Handling

/**
 * Workflow-wide error-handling configuration consumed by `WorkflowErrorManager`.
 *
 * The manager reads `globalHandlers`, `deadLetterQueue.enabled` and
 * `rollbackStrategy.enabled`. The referenced strategy/config types are
 * declared outside this listing.
 */
interface WorkflowErrorHandling {
  globalHandlers: GlobalErrorHandler[];
  errorPropagation: ErrorPropagationStrategy;
  deadLetterQueue: DeadLetterQueueConfig;
  rollbackStrategy: RollbackStrategy;
  notificationSettings: NotificationSettings;
}

/**
 * A workflow-level handler that `WorkflowErrorManager` may invoke for an error.
 */
interface GlobalErrorHandler {
  id: string;
  name: string;
  // An empty list means the handler applies to every error type.
  errorTypes: ErrorType[];
  // Handlers are tried in descending priority order (highest first).
  priority: number;
  // Returning an action of 'continue_to_next_handler' defers to the next handler.
  handler: (error: WorkflowError, context: ExecutionContext) => Promise<ErrorHandlingResult>;
  // All conditions must hold for the handler to be considered.
  conditions: ErrorCondition[];
}

/**
 * Routes workflow-level errors through the configured global handlers and
 * performs last-resort cleanup when none of them resolves the error.
 */
class WorkflowErrorManager {
  constructor(
    private config: WorkflowErrorHandling,
    private circuitBreakerManager: CircuitBreakerManager
  ) {}

  /**
   * Offers the error to each applicable handler, highest priority first.
   *
   * The first handler whose result is anything other than
   * `'continue_to_next_handler'` decides the outcome. A handler that throws is
   * logged and skipped. If no handler decides, the error is treated as
   * unresolved.
   */
  async handleWorkflowError(
    error: WorkflowError,
    context: ExecutionContext
  ): Promise<WorkflowErrorHandlingResult> {
    const candidates = this.findApplicableHandlers(error);

    for (const candidate of candidates) {
      try {
        const outcome = await candidate.handler(error, context);
        const deferred = outcome.action === 'continue_to_next_handler';

        if (!deferred) {
          return this.createWorkflowResult(outcome, candidate);
        }
      } catch (handlerError) {
        console.error(`Error in global handler ${candidate.id}:`, handlerError);
      }
    }

    return await this.handleUnresolvedError(error, context);
  }

  /**
   * Selects the handlers that accept this error's type and whose conditions
   * all hold, sorted by descending priority.
   */
  private findApplicableHandlers(error: WorkflowError): GlobalErrorHandler[] {
    const applies = (handler: GlobalErrorHandler): boolean => {
      // A handler with no listed types accepts every type.
      const acceptsType =
        handler.errorTypes.length === 0 || handler.errorTypes.includes(error.type);

      return (
        acceptsType &&
        handler.conditions.every(condition => this.evaluateCondition(condition, error))
      );
    };

    const byPriorityDescending = (first: GlobalErrorHandler, second: GlobalErrorHandler): number =>
      second.priority - first.priority;

    return this.config.globalHandlers.filter(applies).sort(byPriorityDescending);
  }

  /**
   * Last resort for an error no handler resolved: dead-letter it and roll
   * back when those features are enabled, notify, then terminate the workflow.
   */
  private async handleUnresolvedError(
    error: WorkflowError,
    context: ExecutionContext
  ): Promise<WorkflowErrorHandlingResult> {
    const deadLetterEnabled = this.config.deadLetterQueue.enabled;
    if (deadLetterEnabled) {
      await this.sendToDeadLetterQueue(error, context);
    }

    const rollbackEnabled = this.config.rollbackStrategy.enabled;
    if (rollbackEnabled) {
      await this.executeRollback(context);
    }

    await this.sendErrorNotifications(error, context);

    const reason = 'Unresolved error after all handlers attempted';
    return { action: 'terminate_workflow', error, reason };
  }
}

// Example: Global error handlers
/**
 * Example global handlers: network retry, permission escalation and resource
 * allocation, listed in descending priority (100, 90, 80).
 *
 * NOTE(review): the second and third handlers call helpers through `this`
 * (`this.checkPermissionEscalation`, `this.allocateAdditionalResources`, ...).
 * Arrow functions take `this` from the enclosing scope, so inside this
 * top-level array literal `this` is not an object that owns those helpers.
 * As written these calls cannot resolve; the helpers need to be supplied via
 * closure or an explicit service object.
 */
const defaultGlobalHandlers: GlobalErrorHandler[] = [
  {
    id: 'network_retry_handler',
    name: 'Network Error Retry Handler',
    errorTypes: [ErrorType.NETWORK_ERROR, ErrorType.API_ERROR],
    priority: 100,
    handler: async (error: WorkflowError, context: ExecutionContext) => {
      // Implement smart retry logic for network errors
      const retryCount = error.retryCount || 0;
      
      if (retryCount < 3) {
        // Exponential backoff: 1000, 2000, 4000, capped at 10000.
        const delay = Math.min(1000 * Math.pow(2, retryCount), 10000);
        
        return {
          action: 'retry_node',
          delay,
          retryCount: retryCount + 1
        };
      }
      
      // Retry budget exhausted: defer to the next applicable handler.
      return { action: 'continue_to_next_handler' };
    },
    // NOTE(review): if this condition is evaluated as its name suggests, it
    // duplicates the `retryCount < 3` check inside the handler — confirm
    // against the `evaluateCondition` implementation.
    conditions: [
      {
        type: 'retry_count_less_than',
        value: 3
      }
    ]
  },
  
  {
    id: 'permission_escalation_handler',
    name: 'Permission Escalation Handler',
    errorTypes: [ErrorType.PERMISSION_ERROR],
    priority: 90,
    handler: async (error: WorkflowError, context: ExecutionContext) => {
      // Try to escalate permissions or suggest alternative
      const canEscalate = await this.checkPermissionEscalation(context.userInfo);
      
      if (canEscalate) {
        await this.requestPermissionEscalation(error, context);
        
        // The workflow stays paused until the escalation request is resolved.
        return {
          action: 'pause_workflow',
          reason: 'Waiting for permission escalation approval'
        };
      }
      
      return { action: 'continue_to_next_handler' };
    },
    conditions: []
  },
  
  {
    id: 'resource_allocation_handler',
    name: 'Resource Allocation Handler',
    errorTypes: [ErrorType.RESOURCE_ERROR],
    priority: 80,
    handler: async (error: WorkflowError, context: ExecutionContext) => {
      // Try to allocate more resources or queue for later
      const resourcesAvailable = await this.checkResourceAvailability();
      
      if (resourcesAvailable) {
        await this.allocateAdditionalResources(context);
        
        return {
          action: 'retry_node',
          delay: 5000
        };
      } else {
        // Queue workflow for later execution
        await this.queueForLaterExecution(context);
        
        // This handler always decides the outcome; it never defers to the next one.
        return {
          action: 'pause_workflow',
          reason: 'Queued for execution when resources become available'
        };
      }
    },
    conditions: []
  }
];

Debugging and Diagnostics

Debug Information Collection

/**
 * One recorded step of a workflow execution, as stored by `DebugCollector`.
 */
interface DebugInfo {
  executionId: string;
  nodeId: string;
  timestamp: Date;
  // DebugCollector.collectError records error entries with stepNumber -1.
  stepNumber: number;
  inputData: any;
  outputData: any;
  // NOTE(review): reported with an "ms" suffix in the debug timeline, so presumably milliseconds.
  executionTime: number;
  memoryUsage: NodeJS.MemoryUsage;
  // Only set for error entries; DebugCollector treats a truthy value as "this step failed".
  stackTrace?: string;
  variables: Record<string, any>;
  metadata: DebugMetadata;
}

/**
 * Descriptive metadata attached to each `DebugInfo` entry.
 *
 * For error entries `DebugCollector.collectError` fills this with fixed
 * values (`nodeType: 'error'`, `executionMode: 'debug'`) and tags the entry
 * with `'error'` plus the error's constructor name.
 */
interface DebugMetadata {
  nodeType: string;
  nodeVersion: string;
  executionMode: string;
  parentExecution?: string;
  childExecutions: string[];
  tags: string[];
}

/**
 * Collects per-step debug records for workflow executions and turns them into
 * traces and reports.
 *
 * Collection is off until `enableCollection` is called. The enabled flag is
 * shared: once on, records are accepted for every execution id.
 *
 * NOTE(review): `deepClone`, `collectVariables`, `generateMemoryProfile`,
 * `analyzePerformance`, `findErrorPatterns`, `calculateErrorFrequency`,
 * `identifyRootCauses`, `generateErrorSuggestions` and `calculateMemoryGrowth`
 * are referenced but not defined in this listing.
 */
class DebugCollector {
  private debugInfo: Map<string, DebugInfo[]> = new Map();
  private isCollectionEnabled: boolean = false;

  /** Turns collection on and makes sure a record list exists for the execution. */
  enableCollection(executionId: string): void {
    this.isCollectionEnabled = true;
    if (!this.debugInfo.has(executionId)) {
      this.debugInfo.set(executionId, []);
    }
  }

  /**
   * Records a successfully executed node step. Input and output data are
   * deep-cloned so later mutations do not alter the record. No-op while
   * collection is disabled.
   */
  collectNodeExecution(
    executionId: string,
    nodeId: string,
    stepNumber: number,
    inputData: any,
    outputData: any,
    executionTime: number,
    metadata: DebugMetadata
  ): void {
    if (!this.isCollectionEnabled) return;

    this.appendEntry(executionId, {
      executionId,
      nodeId,
      timestamp: new Date(),
      stepNumber,
      inputData: this.deepClone(inputData),
      outputData: this.deepClone(outputData),
      executionTime,
      memoryUsage: process.memoryUsage(),
      variables: this.collectVariables(executionId),
      metadata
    });
  }

  /**
   * Records a failure as a step with `stepNumber` -1 and a non-empty
   * `stackTrace`, which is how error entries are recognized downstream.
   * No-op while collection is disabled.
   *
   * @param context - Optional; only `context.inputData` is read.
   */
  collectError(
    executionId: string,
    nodeId: string,
    error: Error | WorkflowError,
    context: any
  ): void {
    if (!this.isCollectionEnabled) return;

    this.appendEntry(executionId, {
      executionId,
      nodeId,
      timestamp: new Date(),
      stepNumber: -1,
      // A missing context must not turn error reporting into a second error.
      inputData: context?.inputData,
      outputData: null,
      executionTime: 0,
      memoryUsage: process.memoryUsage(),
      stackTrace: this.resolveStackTrace(error),
      variables: this.collectVariables(executionId),
      metadata: {
        nodeType: 'error',
        nodeVersion: '1.0.0',
        executionMode: 'debug',
        childExecutions: [],
        tags: ['error', error.constructor.name]
      }
    });
  }

  /** Appends an entry to the execution's list, creating the list if needed. */
  private appendEntry(executionId: string, entry: DebugInfo): void {
    const entries = this.debugInfo.get(executionId) ?? [];
    entries.push(entry);
    this.debugInfo.set(executionId, entries);
  }

  /**
   * Returns a non-empty trace string for either error shape.
   *
   * A native `Error` exposes `stack`; a `WorkflowError` keeps its trace in
   * `details.stackTrace`. The result falls back to the message so the entry
   * is still counted as an error when no trace is available.
   */
  private resolveStackTrace(error: Error | WorkflowError): string {
    const fallback = 'Error (no stack trace available)';

    if (error instanceof Error) {
      return error.stack || `${error.name}: ${error.message}` || fallback;
    }

    return error.details?.stackTrace || error.message || fallback;
  }

  /**
   * Summarizes the records of one execution.
   *
   * Unknown executions yield an empty trace with `memoryPeak` 0. `steps` is a
   * sorted copy; the stored records keep their insertion order.
   */
  getDebugTrace(executionId: string): DebugTrace {
    const entries = this.debugInfo.get(executionId) ?? [];

    return {
      executionId,
      totalSteps: entries.length,
      totalExecutionTime: entries.reduce((sum, info) => sum + info.executionTime, 0),
      // reduce with a 0 seed avoids Math.max() returning -Infinity for no entries.
      memoryPeak: entries.reduce((peak, info) => Math.max(peak, info.memoryUsage.heapUsed), 0),
      errorCount: entries.filter(info => info.stackTrace).length,
      steps: [...entries].sort((a, b) => a.stepNumber - b.stepNumber)
    };
  }

  /** Builds the full debug report for an execution from its trace. */
  generateDebugReport(executionId: string): DebugReport {
    const trace = this.getDebugTrace(executionId);

    return {
      executionId,
      summary: {
        totalSteps: trace.totalSteps,
        totalTime: trace.totalExecutionTime,
        memoryPeak: trace.memoryPeak,
        errorCount: trace.errorCount,
        status: trace.errorCount > 0 ? 'failed' : 'success'
      },
      timeline: this.generateTimeline(trace),
      memoryProfile: this.generateMemoryProfile(trace),
      errorAnalysis: this.analyzeErrors(trace),
      performanceAnalysis: this.analyzePerformance(trace),
      recommendations: this.generateRecommendations(trace)
    };
  }

  /** Converts trace steps into timeline entries; steps with a stack trace are failures. */
  private generateTimeline(trace: DebugTrace): TimelineEntry[] {
    return trace.steps.map(step => ({
      timestamp: step.timestamp,
      nodeId: step.nodeId,
      action: step.stackTrace ? 'error' : 'execute',
      duration: step.executionTime,
      status: step.stackTrace ? 'failed' : 'success',
      details: step.stackTrace || `Executed in ${step.executionTime}ms`
    }));
  }

  /** Aggregates the error steps of a trace; uniqueness is judged by identical stack traces. */
  private analyzeErrors(trace: DebugTrace): ErrorAnalysis {
    const errorSteps = trace.steps.filter(step => step.stackTrace);

    const errorPatterns = this.findErrorPatterns(errorSteps);
    const errorFrequency = this.calculateErrorFrequency(errorSteps);
    const rootCauses = this.identifyRootCauses(errorSteps);

    return {
      totalErrors: errorSteps.length,
      uniqueErrors: new Set(errorSteps.map(step => step.stackTrace)).size,
      errorPatterns,
      errorFrequency,
      rootCauses,
      suggestions: this.generateErrorSuggestions(errorSteps)
    };
  }

  /**
   * Derives recommendations from a trace: slow steps (over 5000 ms), memory
   * growth above 100 MB, and the presence of any error steps.
   */
  private generateRecommendations(trace: DebugTrace): Recommendation[] {
    const recommendations: Recommendation[] = [];

    const slowSteps = trace.steps.filter(step => step.executionTime > 5000);
    if (slowSteps.length > 0) {
      recommendations.push({
        type: 'performance',
        priority: 'high',
        title: 'Optimize slow nodes',
        description: `${slowSteps.length} nodes took longer than 5 seconds to execute`,
        action: 'Consider adding caching, optimizing queries, or using parallel execution',
        affectedNodes: slowSteps.map(step => step.nodeId)
      });
    }

    const memoryGrowth = this.calculateMemoryGrowth(trace);
    if (memoryGrowth > 100 * 1024 * 1024) { // 100MB
      recommendations.push({
        type: 'memory',
        priority: 'medium',
        title: 'High memory usage detected',
        description: `Memory usage increased by ${(memoryGrowth / 1024 / 1024).toFixed(2)}MB during execution`,
        action: 'Consider processing data in smaller chunks or clearing unused variables',
        affectedNodes: []
      });
    }

    const errorSteps = trace.steps.filter(step => step.stackTrace);
    if (errorSteps.length > 0) {
      recommendations.push({
        type: 'reliability',
        priority: 'high',
        title: 'Improve error handling',
        description: `${errorSteps.length} errors occurred during execution`,
        action: 'Add better error handling, validation, and retry mechanisms',
        affectedNodes: errorSteps.map(step => step.nodeId)
      });
    }

    return recommendations;
  }
}

Interactive Debugging

/**
 * State of one interactive debugging session managed by `InteractiveDebugger`.
 */
interface DebugSession {
  id: string;
  executionId: string;
  // Breakpoints are only evaluated while 'active'; hitting one sets 'paused'.
  status: 'active' | 'paused' | 'stopped';
  breakpoints: Breakpoint[];
  watches: WatchExpression[];
  // Step number of the execution context at the most recent pause.
  currentStep: number;
  stackTrace: StackFrame[];
}

/**
 * A breakpoint attached to a workflow node.
 */
interface Breakpoint {
  id: string;
  nodeId: string;
  // Optional JavaScript expression; the debugger evaluates it against the
  // execution context and only pauses when it is truthy.
  condition?: string;
  enabled: boolean;
  // Incremented each time an enabled breakpoint's node is reached, before the
  // condition is evaluated.
  hitCount: number;
}

/**
 * An expression re-evaluated whenever the debugger pauses.
 */
interface WatchExpression {
  id: string;
  expression: string;
  // Latest evaluated value; null before the first evaluation or after a failed one.
  value: any;
  // Message of the last evaluation failure; cleared on a successful evaluation.
  error?: string;
}

/**
 * Manages interactive debug sessions: breakpoints, watch expressions and
 * pause/resume control.
 *
 * NOTE(review): `getSession`, `generateSessionId`, `generateBreakpointId`,
 * `generateWatchId`, `evaluateWatch`, `evaluateExpression` and
 * `captureStackTrace` are referenced but not defined in this listing.
 */
class InteractiveDebugger {
  private sessions: Map<string, DebugSession> = new Map();

  /** Creates and registers a new active session for the given execution. */
  startDebugSession(executionId: string): DebugSession {
    const session: DebugSession = {
      id: this.generateSessionId(),
      executionId,
      status: 'active',
      breakpoints: [],
      watches: [],
      currentStep: 0,
      stackTrace: []
    };

    this.sessions.set(session.id, session);
    return session;
  }

  /**
   * Adds an enabled breakpoint on a node.
   *
   * @param condition - Optional JavaScript expression; see `evaluateCondition`
   *   for the (unsandboxed) evaluation model.
   */
  addBreakpoint(
    sessionId: string,
    nodeId: string,
    condition?: string
  ): Breakpoint {
    const session = this.getSession(sessionId);
    const breakpoint: Breakpoint = {
      id: this.generateBreakpointId(),
      nodeId,
      condition,
      enabled: true,
      hitCount: 0
    };

    session.breakpoints.push(breakpoint);
    return breakpoint;
  }

  /** Registers a watch expression and triggers an initial evaluation. */
  addWatch(sessionId: string, expression: string): WatchExpression {
    const session = this.getSession(sessionId);
    const watch: WatchExpression = {
      id: this.generateWatchId(),
      expression,
      value: null
    };

    session.watches.push(watch);
    this.evaluateWatch(session, watch);
    return watch;
  }

  /**
   * Decides whether execution should pause before running `nodeId`.
   *
   * Returns false for unknown or non-active sessions. For the first enabled
   * breakpoint on the node whose condition passes (or that has none), the
   * session is paused, the current step and stack are captured and the
   * watches are refreshed. If evaluating the condition rejects, a warning is
   * logged and the breakpoint still pauses.
   */
  async shouldBreak(
    sessionId: string,
    nodeId: string,
    context: ExecutionContext
  ): Promise<boolean> {
    const session = this.sessions.get(sessionId);
    if (!session || session.status !== 'active') {
      return false;
    }

    for (const breakpoint of session.breakpoints) {
      if (breakpoint.nodeId !== nodeId || !breakpoint.enabled) {
        continue;
      }

      // Counted every time the node is reached, even if the condition then fails.
      breakpoint.hitCount++;

      if (breakpoint.condition) {
        try {
          const conditionHolds = await this.evaluateCondition(
            breakpoint.condition,
            context
          );
          if (!conditionHolds) continue;
        } catch (error) {
          console.warn(`Breakpoint condition error: ${this.describeError(error)}`);
        }
      }

      session.status = 'paused';
      session.currentStep = context.stepNumber;
      session.stackTrace = this.captureStackTrace(context);

      await this.updateWatches(session, context);

      return true;
    }

    return false;
  }

  /**
   * Resumes the session.
   * NOTE(review): nothing here arranges a pause at the next node; stepOver,
   * stepInto and continue currently behave identically.
   */
  stepOver(sessionId: string): void {
    const session = this.getSession(sessionId);
    session.status = 'active';
    // Will pause at next node
  }

  /** Resumes the session; see the note on `stepOver`. */
  stepInto(sessionId: string): void {
    const session = this.getSession(sessionId);
    session.status = 'active';
    // Will pause at next execution point (including subworkflows)
  }

  /** Resumes the session until the next breakpoint pauses it again. */
  continue(sessionId: string): void {
    const session = this.getSession(sessionId);
    session.status = 'active';
    // Will continue until next breakpoint
  }

  /** Marks the session stopped and removes it from the registry. */
  stop(sessionId: string): void {
    const session = this.getSession(sessionId);
    session.status = 'stopped';
    this.sessions.delete(sessionId);
  }

  /** Extracts a printable message from a caught value of unknown type. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Evaluates a breakpoint condition against the execution context.
   *
   * The expression can refer to `input`, `output`, `variables`, `nodeId` and
   * `stepNumber`. Evaluation failures are logged and treated as false.
   *
   * SECURITY(review): this is NOT a sandbox. `new Function` runs the
   * condition as ordinary JavaScript with access to every global of the host
   * process; the `with` block only adds the context fields to name lookup.
   * Conditions must come from trusted users only, or this must be replaced
   * with an isolated evaluator.
   */
  private async evaluateCondition(
    condition: string,
    context: ExecutionContext
  ): Promise<boolean> {
    const evalContext = {
      input: context.inputData,
      output: context.outputData,
      variables: context.variables,
      nodeId: context.nodeId,
      stepNumber: context.stepNumber
    };

    try {
      // Function-constructor bodies are non-strict, which is what permits `with`.
      const func = new Function(
        'context',
        `with(context) { return ${condition}; }`
      );
      return Boolean(func(evalContext));
    } catch (error) {
      console.error(`Condition evaluation error: ${this.describeError(error)}`);
      return false;
    }
  }

  /**
   * Re-evaluates every watch of the session. A failing watch gets its error
   * message recorded and its value reset to null; other watches are unaffected.
   */
  private async updateWatches(
    session: DebugSession,
    context: ExecutionContext
  ): Promise<void> {
    for (const watch of session.watches) {
      try {
        watch.value = await this.evaluateExpression(watch.expression, context);
        watch.error = undefined;
      } catch (error) {
        watch.error = this.describeError(error);
        watch.value = null;
      }
    }
  }
}

Error Monitoring and Alerting

Real-time Error Monitoring

/**
 * Configuration consumed by `ErrorMonitor`.
 *
 * NOTE(review): within this snippet only `alertThresholds` and `destinations`
 * are read directly by `ErrorMonitor`; `enabled`, `aggregationWindows` and
 * `suppressionRules` are presumably used by helpers not shown here — confirm.
 */
interface ErrorMonitoringConfig {
  enabled: boolean;
  /** Thresholds checked every time an error is recorded. */
  alertThresholds: AlertThreshold[];
  aggregationWindows: AggregationWindow[];
  /** Channels every created alert is sent to. */
  destinations: AlertDestination[];
  suppressionRules: SuppressionRule[];
}

/** A rule describing when recorded errors should raise an alert. */
interface AlertThreshold {
  id: string;
  /** Human-readable name; used as the prefix of the alert title. */
  name: string;
  // NOTE(review): 'error_latency' is declared here but has no evaluation branch
  // in ErrorMonitor.evaluateThreshold, so such thresholds never fire.
  metric: 'error_rate' | 'error_count' | 'error_latency' | 'circuit_breaker_trips';
  condition: 'greater_than' | 'less_than' | 'equals';
  value: number;
  window: string; // e.g., '5m', '1h', '1d'
  /** Copied onto alerts created from this threshold. */
  severity: 'low' | 'medium' | 'high' | 'critical';
}

/**
 * Collects error metrics, evaluates alert thresholds on every recorded error,
 * dispatches alerts to the configured destinations and builds a dashboard
 * summary.
 */
class ErrorMonitor {
  private metrics: Map<string, ErrorMetric[]> = new Map();
  private alerts: Alert[] = [];
  
  /**
   * @param config Monitoring configuration. Defaults to a configuration with no
   *               thresholds and no destinations, so a monitor created without
   *               arguments records metrics but never alerts.
   */
  constructor(
    private readonly config: ErrorMonitoringConfig = {
      enabled: true,
      alertThresholds: [],
      aggregationWindows: [],
      destinations: [],
      suppressionRules: []
    }
  ) {}
  
  /**
   * Stores a metric for the given error and checks all alert thresholds.
   *
   * @param error The workflow error that occurred.
   * @param context Execution context the error occurred in; supplies the
   *                workflow/execution ids and environment metadata.
   */
  recordError(error: WorkflowError, context: ExecutionContext): void {
    const metric: ErrorMetric = {
      timestamp: new Date(),
      workflowId: context.workflowId,
      executionId: context.executionId,
      // Errors that are not tied to a node are recorded with an empty node id
      nodeId: error.details.nodeId ?? '',
      errorType: error.type,
      severity: error.severity,
      resolved: false,
      metadata: {
        userAgent: context.userInfo.userAgent,
        environment: context.environmentInfo.environment,
        version: context.environmentInfo.version
      }
    };
    
    const key = this.getMetricKey(metric);
    const metrics = this.metrics.get(key) ?? [];
    metrics.push(metric);
    this.metrics.set(key, metrics);
    
    // Check alert thresholds
    this.checkAlertThresholds(metric);
  }
  
  /** Creates an alert for every configured threshold the metric trips. */
  private checkAlertThresholds(metric: ErrorMetric): void {
    for (const threshold of this.config.alertThresholds) {
      const shouldAlert = this.evaluateThreshold(threshold, metric);
      
      if (shouldAlert) {
        this.createAlert(threshold, metric);
      }
    }
  }
  
  /**
   * Decides whether a threshold is exceeded, looking at the metrics that fall
   * inside the threshold's time window.
   *
   * @returns `true` when the threshold fires; always `false` for metric kinds
   *          without an evaluation branch (currently `'error_latency'`).
   */
  private evaluateThreshold(
    threshold: AlertThreshold, 
    metric: ErrorMetric
  ): boolean {
    
    const windowMs = this.parseTimeWindow(threshold.window);
    const since = new Date(Date.now() - windowMs);
    
    const relevantMetrics = this.getMetricsInWindow(metric, since);
    
    switch (threshold.metric) {
      case 'error_count':
        return this.evaluateCount(relevantMetrics, threshold);
        
      case 'error_rate':
        return this.evaluateRate(relevantMetrics, threshold, windowMs);
        
      case 'circuit_breaker_trips':
        return this.evaluateCircuitBreakerTrips(relevantMetrics, threshold);
        
      default:
        // 'error_latency' lands here: it is not evaluated and never alerts
        return false;
    }
  }
  
  /**
   * Builds an alert for a tripped threshold, stores it and starts sending it,
   * unless a suppression rule applies.
   */
  private createAlert(threshold: AlertThreshold, metric: ErrorMetric): void {
    // Check suppression rules
    if (this.isSuppressed(threshold, metric)) {
      return;
    }
    
    const alert: Alert = {
      id: this.generateAlertId(),
      thresholdId: threshold.id,
      severity: threshold.severity,
      title: `${threshold.name} - ${threshold.metric} ${threshold.condition} ${threshold.value}`,
      description: this.generateAlertDescription(threshold, metric),
      timestamp: new Date(),
      workflowId: metric.workflowId,
      nodeId: metric.nodeId,
      status: 'active',
      metadata: {
        threshold,
        triggeringMetric: metric,
        affectedExecutions: this.getAffectedExecutions(metric)
      }
    };
    
    this.alerts.push(alert);
    // Deliberately not awaited: delivery must not block error recording.
    // sendAlert handles per-destination failures itself.
    void this.sendAlert(alert);
  }
  
  /**
   * Sends an alert to every configured destination, one after another.
   * A failure for one destination is logged and does not stop the others.
   */
  private async sendAlert(alert: Alert): Promise<void> {
    for (const destination of this.config.destinations) {
      try {
        await this.sendToDestination(alert, destination);
      } catch (error) {
        console.error(`Failed to send alert to ${destination.type}:`, error);
      }
    }
  }
  
  /**
   * Routes an alert to the sender matching the destination type.
   *
   * @throws Error when the destination type has no sender, so that a
   *         misconfigured destination is reported instead of silently dropped.
   */
  private async sendToDestination(
    alert: Alert, 
    destination: AlertDestination
  ): Promise<void> {
    
    // Captured before the switch so the value is still printable in `default`
    const destinationType = String(destination.type);
    
    switch (destination.type) {
      case 'email':
        await this.sendEmailAlert(alert, destination);
        break;
        
      case 'slack':
        await this.sendSlackAlert(alert, destination);
        break;
        
      case 'webhook':
        await this.sendWebhookAlert(alert, destination);
        break;
        
      case 'sms':
        await this.sendSmsAlert(alert, destination);
        break;
        
      default:
        throw new Error(`Unsupported alert destination type: ${destinationType}`);
    }
  }
  
  /**
   * Builds a dashboard snapshot. Summary figures cover the last hour; trends
   * and top errors cover the last 24 hours.
   */
  generateErrorDashboard(): ErrorDashboard {
    const now = new Date();
    const oneHourAgo = new Date(now.getTime() - 60 * 60 * 1000);
    const oneDayAgo = new Date(now.getTime() - 24 * 60 * 60 * 1000);
    
    return {
      summary: {
        totalErrors: this.countErrorsInWindow(oneHourAgo, now),
        errorRate: this.calculateErrorRate(oneHourAgo, now),
        activeAlerts: this.alerts.filter(a => a.status === 'active').length,
        circuitBreakersOpen: this.countOpenCircuitBreakers()
      },
      trends: {
        hourly: this.getHourlyErrorTrend(oneDayAgo, now),
        byType: this.getErrorsByType(oneDayAgo, now),
        byNode: this.getErrorsByNode(oneDayAgo, now),
        bySeverity: this.getErrorsBySeverity(oneDayAgo, now)
      },
      topErrors: this.getTopErrors(oneDayAgo, now),
      recentAlerts: this.getRecentAlerts(10),
      healthCheck: this.generateHealthCheck()
    };
  }
}

Overview

Key Features

Error Types and Classification

Error Hierarchy

Error Categories

Node-Level Error Handling

Error Handling Configuration

Circuit Breaker Pattern

Circuit Breaker Implementation

Workflow-Level Error Handling

Global Error Handling

Debugging and Diagnostics

Debug Information Collection

Interactive Debugging

Error Monitoring and Alerting

Real-time Error Monitoring

Need Help?