Skip to main content

Error Handling & Recovery

Comprehensive guide to error handling, debugging, and recovery strategies in Axon OS workflows.

Overview

Effective error handling is critical for building robust and reliable workflows in Axon OS. The platform provides comprehensive error handling mechanisms, debugging tools, and recovery strategies to ensure workflows can gracefully handle failures and maintain operational continuity.

Key Features

  • Multi-Level Error Handling: Node, workflow, and system-level error handling
  • Smart Recovery: Automatic retry mechanisms with exponential backoff
  • Error Classification: Categorization and contextual error information
  • Circuit Breakers: Prevent cascading failures
  • Debugging Tools: Comprehensive debugging and diagnostic capabilities
  • Monitoring Integration: Real-time error tracking and alerting

Error Types and Classification

Error Hierarchy

/**
 * Identifiers for the kinds of error a workflow can report, grouped by origin
 * (system, workflow, data, external, user). Every member is a string enum
 * value equal to its lower-case snake_case name.
 */
enum ErrorType {
// System errors
SYSTEM_ERROR = 'system_error',
RESOURCE_ERROR = 'resource_error',
CONFIGURATION_ERROR = 'configuration_error',

// Workflow errors
VALIDATION_ERROR = 'validation_error',
EXECUTION_ERROR = 'execution_error',
TIMEOUT_ERROR = 'timeout_error',
DEPENDENCY_ERROR = 'dependency_error',

// Data errors
DATA_ERROR = 'data_error',
SCHEMA_ERROR = 'schema_error',
TRANSFORMATION_ERROR = 'transformation_error',

// External errors
NETWORK_ERROR = 'network_error',
API_ERROR = 'api_error',
DATABASE_ERROR = 'database_error',

// User errors
USER_ERROR = 'user_error',
PERMISSION_ERROR = 'permission_error',
AUTHENTICATION_ERROR = 'authentication_error'
}

/**
 * Severity levels attached to an error, from least to most severe.
 * ErrorClassifier.assessSeverity derives one of these from an ErrorType.
 */
enum ErrorSeverity {
LOW = 'low', // Warning, workflow continues
MEDIUM = 'medium', // Recoverable error, retry possible
HIGH = 'high', // Node failure, workflow continues
CRITICAL = 'critical' // Workflow failure, immediate attention required
}

/**
 * Structured record of a single error raised during workflow execution.
 *
 * This is an interface, so it has no runtime representation: it cannot be
 * the right-hand side of `instanceof`.
 */
interface WorkflowError {
id: string;
type: ErrorType;
severity: ErrorSeverity;
code: string;
message: string;
details: ErrorDetails;
context: ErrorContext;
timestamp: Date;
// Attempts already made; NodeErrorHandler.handleRetry compares it with maxAttempts.
retryCount: number;
resolved: boolean;
resolutionStrategy?: string;
}

/**
 * Diagnostic payload carried by a WorkflowError: where it happened, the data
 * involved and a snapshot of the host.
 */
interface ErrorDetails {
nodeId?: string;
portId?: string;
stackTrace: string;
inputData?: any;
expectedOutput?: any;
actualOutput?: any;
systemInfo: SystemInfo;
// Free-form extras; NodeErrorHandler.matchesCondition reads `metadata.statusCode`.
metadata: Record<string, any>;
}

/**
 * Execution coordinates of an error: which workflow, execution, node and
 * step it belongs to, plus the user and environment it ran under.
 */
interface ErrorContext {
workflowId: string;
executionId: string;
nodeId: string;
stepNumber: number;
// Set when this execution was started by another one.
parentExecutionId?: string;
userInfo: UserInfo;
environmentInfo: EnvironmentInfo;
}

/**
 * Snapshot of the host process attached to ErrorDetails.
 * NOTE(review): `nodeVersion` presumably holds the Node.js runtime version and
 * `uptime` its unit is not shown in this file — confirm against the producer.
 */
interface SystemInfo {
nodeVersion: string;
platform: string;
architecture: string;
memoryUsage: NodeJS.MemoryUsage;
cpuUsage: NodeJS.CpuUsage;
uptime: number;
}

Error Categories

/**
 * Derives a classification (type, severity, recoverability and a suggested
 * recovery strategy) for any error raised while a workflow runs.
 *
 * NOTE(review): `isRetryable`, `requiresUserAction` and `requiresSystemAction`
 * are called by `classify` but are not defined in this excerpt.
 */
class ErrorClassifier {
  /**
   * Classifies an error.
   *
   * @param error - A native `Error` or an already structured `WorkflowError`.
   * @returns The type, severity, recoverability flags and recovery strategy.
   */
  classify(error: Error | WorkflowError): ErrorClassification {
    const type = this.determineErrorType(error);
    const severity = this.assessSeverity(error, type);
    const recoverable = this.isRecoverable(error, type);
    const strategy = this.suggestRecoveryStrategy(error, type, severity);

    return {
      type,
      severity,
      recoverable,
      strategy,
      retryable: this.isRetryable(error, type),
      userActionRequired: this.requiresUserAction(error, type),
      systemActionRequired: this.requiresSystemAction(error, type)
    };
  }

  /**
   * Structural check for a `WorkflowError`.
   *
   * `WorkflowError` is an interface and is erased at compile time, so
   * `instanceof` cannot be used; instead the value must carry a `type`
   * property holding one of the `ErrorType` values.
   */
  private isWorkflowError(error: Error | WorkflowError): error is WorkflowError {
    return (
      'type' in error &&
      (Object.values(ErrorType) as string[]).includes(String(error.type))
    );
  }

  /**
   * Returns the error's own type when it is a `WorkflowError`; otherwise
   * infers one from keywords in the message, then in the stack trace.
   * Rules are evaluated in order and the first match wins.
   */
  private determineErrorType(error: Error | WorkflowError): ErrorType {
    if (this.isWorkflowError(error)) {
      return error.type;
    }

    // Matching is case-insensitive; a missing message is treated as empty.
    const message = (error.message ?? '').toLowerCase();
    const stack = error.stack?.toLowerCase() || '';

    const messageRules: ReadonlyArray<[ErrorType, readonly string[]]> = [
      [ErrorType.TIMEOUT_ERROR, ['timeout', 'timed out']],
      [ErrorType.NETWORK_ERROR, ['network', 'fetch', 'connection']],
      [ErrorType.VALIDATION_ERROR, ['validation', 'invalid']],
      [ErrorType.PERMISSION_ERROR, ['permission', 'unauthorized']],
      // NOTE(review): 'type' is a very broad keyword and matches many unrelated messages.
      [ErrorType.SCHEMA_ERROR, ['schema', 'type']]
    ];

    for (const [candidateType, keywords] of messageRules) {
      if (keywords.some(keyword => message.includes(keyword))) {
        return candidateType;
      }
    }

    // Database errors are recognised from the stack trace, not the message.
    if (stack.includes('database') || stack.includes('sql')) {
      return ErrorType.DATABASE_ERROR;
    }

    return ErrorType.EXECUTION_ERROR;
  }

  /**
   * Maps an error type onto a severity. Types not listed in any of the
   * three groups below fall through to `LOW`.
   */
  private assessSeverity(error: Error | WorkflowError, type: ErrorType): ErrorSeverity {
    // Require immediate attention.
    const criticalTypes = [
      ErrorType.SYSTEM_ERROR,
      ErrorType.CONFIGURATION_ERROR,
      ErrorType.AUTHENTICATION_ERROR
    ];
    if (criticalTypes.includes(type)) {
      return ErrorSeverity.CRITICAL;
    }

    // Stop node execution.
    const highSeverityTypes = [ErrorType.RESOURCE_ERROR, ErrorType.DEPENDENCY_ERROR];
    if (highSeverityTypes.includes(type)) {
      return ErrorSeverity.HIGH;
    }

    // Often recoverable.
    const mediumSeverityTypes = [
      ErrorType.NETWORK_ERROR,
      ErrorType.API_ERROR,
      ErrorType.TIMEOUT_ERROR
    ];
    if (mediumSeverityTypes.includes(type)) {
      return ErrorSeverity.MEDIUM;
    }

    return ErrorSeverity.LOW;
  }

  /** Returns true for the error types this classifier treats as recoverable. */
  private isRecoverable(error: Error | WorkflowError, type: ErrorType): boolean {
    const recoverableTypes = [
      ErrorType.NETWORK_ERROR,
      ErrorType.API_ERROR,
      ErrorType.TIMEOUT_ERROR,
      ErrorType.DATABASE_ERROR,
      ErrorType.RESOURCE_ERROR
    ];

    return recoverableTypes.includes(type);
  }

  /**
   * Suggests a recovery strategy name for the error type. Types without a
   * dedicated strategy get `'manual_intervention'` when critical and
   * `'retry'` otherwise.
   */
  private suggestRecoveryStrategy(
    error: Error | WorkflowError,
    type: ErrorType,
    severity: ErrorSeverity
  ): string {
    switch (type) {
      case ErrorType.NETWORK_ERROR:
      case ErrorType.API_ERROR:
        return 'retry_with_backoff';

      case ErrorType.TIMEOUT_ERROR:
        return 'increase_timeout_and_retry';

      case ErrorType.RESOURCE_ERROR:
        return 'wait_for_resources';

      case ErrorType.VALIDATION_ERROR:
        return 'fix_input_data';

      case ErrorType.CONFIGURATION_ERROR:
        return 'update_configuration';

      case ErrorType.PERMISSION_ERROR:
        return 'check_permissions';

      default:
        return severity === ErrorSeverity.CRITICAL ? 'manual_intervention' : 'retry';
    }
  }
}

Node-Level Error Handling

Error Handling Configuration

/**
 * Per-node error handling configuration consumed by NodeErrorHandler.
 */
interface NodeErrorHandling {
enabled: boolean;
// Selects the branch taken by NodeErrorHandler.handleError.
strategy: ErrorStrategy;
// Read when strategy is RETRY.
retryConfig: RetryConfiguration;
// Read when strategy is FALLBACK.
fallbackBehavior: FallbackBehavior;
customHandlers: CustomErrorHandler[];
propagation: ErrorPropagation;
}

/**
 * Strategy a node applies when it fails; NodeErrorHandler.handleError
 * switches on this value.
 */
enum ErrorStrategy {
FAIL_FAST = 'fail_fast', // Stop immediately on error
RETRY = 'retry', // Retry with configuration
FALLBACK = 'fallback', // Use fallback value/node
IGNORE = 'ignore', // Continue execution, log error
CIRCUIT_BREAKER = 'circuit_breaker' // Use circuit breaker pattern
}

/**
 * Parameters of the exponential backoff retry policy.
 * NOTE(review): delay units are not stated here; the example configuration
 * uses values such as 1000 and 10000, presumably milliseconds — confirm.
 */
interface RetryConfiguration {
// A retry is refused once WorkflowError.retryCount reaches this value.
maxAttempts: number;
// Delay for the first retry; multiplied by backoffMultiplier ** retryCount.
initialDelay: number;
// Upper bound applied to the computed delay.
maxDelay: number;
backoffMultiplier: number;
// When true, a random amount of up to 10% of the delay is added.
jitter: boolean;
// An error is retried when it matches at least one condition.
retryConditions: RetryCondition[];
}

/**
 * One rule deciding whether an error may be retried. In
 * NodeErrorHandler.matchesCondition the criteria are alternatives: matching
 * any single one of them is enough.
 */
interface RetryCondition {
errorTypes: ErrorType[];
// Compared with `error.details.metadata.statusCode`.
statusCodes?: number[];
// Tested against `error.message`.
messagePatterns?: RegExp[];
// Consulted last, only when nothing else matched.
customPredicate?: (error: WorkflowError) => boolean;
}

/**
 * What NodeErrorHandler.handleFallback does in place of the failed node:
 *  - 'value':   continue with `value` as the result
 *  - 'node':    redirect execution to `nodeId`
 *  - 'skip':    skip ahead to `skipToNodeId`
 *  - 'default': continue with the default derived from the node definition
 */
interface FallbackBehavior {
type: 'value' | 'node' | 'skip' | 'default';
value?: any;
nodeId?: string;
skipToNodeId?: string;
}

/**
 * Applies a node's configured error handling strategy to a failure and
 * reports what the executor should do next (retry, continue, redirect,
 * skip, fail or propagate).
 *
 * NOTE(review): `shouldHandle`, `handleIgnore`, `handleCircuitBreaker`,
 * `createMaxRetriesError`, `delay` and `getDefaultValue` are used here but
 * are not defined in this excerpt.
 */
class NodeErrorHandler {
  /**
   * Entry point: classifies the error and dispatches on the node's strategy.
   *
   * @param error - The error raised by the node.
   * @param nodeConfig - The node's error handling configuration.
   * @param context - The current execution context.
   * @returns The action to take; `'propagate'` when this node does not handle
   *   the error, `'fail'` for FAIL_FAST and for unknown strategies.
   */
  async handleError(
    error: WorkflowError,
    nodeConfig: NodeErrorHandling,
    context: ExecutionContext
  ): Promise<ErrorHandlingResult> {
    const classification = new ErrorClassifier().classify(error);

    if (!this.shouldHandle(error, nodeConfig, classification)) {
      return { action: 'propagate', error };
    }

    switch (nodeConfig.strategy) {
      case ErrorStrategy.RETRY:
        return await this.handleRetry(error, nodeConfig, context);

      case ErrorStrategy.FALLBACK:
        return await this.handleFallback(error, nodeConfig, context);

      case ErrorStrategy.IGNORE:
        return await this.handleIgnore(error, nodeConfig, context);

      case ErrorStrategy.CIRCUIT_BREAKER:
        return await this.handleCircuitBreaker(error, nodeConfig, context);

      case ErrorStrategy.FAIL_FAST:
      default:
        return { action: 'fail', error };
    }
  }

  /**
   * Decides whether to retry. Fails when no retry condition matches or the
   * attempt budget is exhausted; otherwise waits for the backoff delay and
   * returns a `'retry'` action carrying the incremented retry count.
   */
  private async handleRetry(
    error: WorkflowError,
    config: NodeErrorHandling,
    context: ExecutionContext
  ): Promise<ErrorHandlingResult> {
    const retryConfig = config.retryConfig;

    if (!this.shouldRetry(error, retryConfig)) {
      return { action: 'fail', error };
    }

    if (error.retryCount >= retryConfig.maxAttempts) {
      return { action: 'fail', error: this.createMaxRetriesError(error) };
    }

    const delay = this.calculateRetryDelay(error.retryCount, retryConfig);

    // The wait happens here, before the retry action is handed back.
    await this.delay(delay);

    return {
      action: 'retry',
      error,
      retryCount: error.retryCount + 1,
      delay
    };
  }

  /**
   * Produces the substitute outcome described by `config.fallbackBehavior`.
   * A `'node'` fallback without a `nodeId` cannot be redirected anywhere and
   * is reported as a failure, as is an unknown fallback type.
   */
  private async handleFallback(
    error: WorkflowError,
    config: NodeErrorHandling,
    context: ExecutionContext
  ): Promise<ErrorHandlingResult> {
    const fallback = config.fallbackBehavior;

    switch (fallback.type) {
      case 'value':
        return { action: 'continue', result: fallback.value, error };

      case 'node':
        if (!fallback.nodeId) {
          // Misconfigured fallback: there is no target to redirect to.
          return { action: 'fail', error };
        }
        return { action: 'redirect', targetNodeId: fallback.nodeId, error };

      case 'skip':
        return { action: 'skip', skipToNodeId: fallback.skipToNodeId, error };

      case 'default': {
        const defaultValue = this.getDefaultValue(context.nodeDefinition);
        return { action: 'continue', result: defaultValue, error };
      }

      default:
        return { action: 'fail', error };
    }
  }

  /**
   * Computes the exponential backoff delay for the given attempt.
   *
   * The result never exceeds `config.maxDelay`. With jitter enabled the
   * delay is shifted by a random amount of up to 10%: upwards when that
   * still fits under `maxDelay`, downwards otherwise, so retries stay spread
   * out even once the cap is reached.
   *
   * @param retryCount - Number of attempts already made (0 for the first retry).
   * @param config - Backoff parameters.
   * @returns The delay to wait before the next attempt.
   */
  private calculateRetryDelay(
    retryCount: number,
    config: RetryConfiguration
  ): number {
    const exponential = config.initialDelay * Math.pow(config.backoffMultiplier, retryCount);
    const capped = Math.min(exponential, config.maxDelay);

    if (!config.jitter) {
      return capped;
    }

    // Jitter prevents a thundering herd of synchronized retries.
    const offset = Math.random() * capped * 0.1;
    return capped + offset <= config.maxDelay ? capped + offset : capped - offset;
  }

  /** Returns true when the error matches at least one retry condition. */
  private shouldRetry(error: WorkflowError, config: RetryConfiguration): boolean {
    return config.retryConditions.some(condition => this.matchesCondition(error, condition));
  }

  /**
   * Checks one retry condition. The criteria are alternatives evaluated in
   * order: error type, status code, message pattern, then custom predicate.
   */
  private matchesCondition(error: WorkflowError, condition: RetryCondition): boolean {
    if (condition.errorTypes.includes(error.type)) {
      return true;
    }

    // Status codes are only present for errors that recorded one (e.g. API errors).
    const statusCode = error.details.metadata.statusCode;
    if (condition.statusCodes && statusCode && condition.statusCodes.includes(statusCode)) {
      return true;
    }

    if (condition.messagePatterns?.some(pattern => pattern.test(error.message))) {
      return true;
    }

    return condition.customPredicate ? condition.customPredicate(error) : false;
  }
}

// Example: HTTP Request node with comprehensive error handling
//
// NOTE(review): the strategy is RETRY, and NodeErrorHandler.handleError only
// consults `fallbackBehavior` for the FALLBACK strategy, so the fallback value
// below is not applied by that handler when retries run out.
const httpNodeErrorConfig: NodeErrorHandling = {
enabled: true,
strategy: ErrorStrategy.RETRY,
retryConfig: {
maxAttempts: 3,
// Delay values are presumably milliseconds — confirm against the `delay` helper.
initialDelay: 1000,
maxDelay: 10000,
backoffMultiplier: 2,
jitter: true,
retryConditions: [
{
// Any one of the three criteria below is enough to allow a retry.
errorTypes: [ErrorType.NETWORK_ERROR, ErrorType.TIMEOUT_ERROR],
statusCodes: [429, 502, 503, 504], // Rate limit, bad gateway, service unavailable, gateway timeout
messagePatterns: [/connection/i, /timeout/i]
}
]
},
fallbackBehavior: {
type: 'value',
value: {
status: 0,
data: null,
error: 'Request failed after retries'
}
},
customHandlers: [],
propagation: {
stopOnError: false,
notifyParent: true,
logLevel: 'warn'
}
};

Circuit Breaker Pattern

Circuit Breaker Implementation

/**
 * States of a circuit breaker. CircuitBreaker starts CLOSED, moves to OPEN
 * after repeated failures and passes through HALF_OPEN while probing recovery.
 */
enum CircuitState {
CLOSED = 'closed', // Normal operation
OPEN = 'open', // Failing, rejecting requests
HALF_OPEN = 'half_open' // Testing if service recovered
}

/**
 * Tuning parameters for a CircuitBreaker. `recoveryTimeout` is added to
 * `Date.now()` and `timeout` is passed to `setTimeout`, so both are in
 * milliseconds.
 */
interface CircuitBreakerConfig {
failureThreshold: number; // Number of failures to open circuit
recoveryTimeout: number; // Time to wait before trying again
successThreshold: number; // Successes needed to close circuit
timeout: number; // Request timeout
monitoringPeriod: number; // Time window for failure counting
}

/**
 * Circuit breaker guarding the operations of a single node.
 *
 * While CLOSED, operations run normally and failures are counted. Once
 * `failureThreshold` failures accumulate, the circuit OPENs and calls are
 * rejected until `recoveryTimeout` has elapsed. The next call then runs in
 * HALF_OPEN: `successThreshold` successes close the circuit again, a single
 * failure re-opens it.
 */
class CircuitBreaker {
  private state: CircuitState = CircuitState.CLOSED;
  private failureCount: number = 0;
  private successCount: number = 0;
  private lastFailureTime: number = 0;
  private nextAttemptTime: number = 0;

  /**
   * @param nodeId - Identifier of the guarded node (used in messages and status).
   * @param config - Thresholds and timeouts, in milliseconds.
   */
  constructor(
    private nodeId: string,
    private config: CircuitBreakerConfig
  ) {}

  /**
   * Runs `operation` under the breaker.
   *
   * @param operation - The asynchronous work to protect.
   * @returns The operation's result.
   * @throws CircuitBreakerOpenError when the circuit is OPEN and the recovery
   *   timeout has not elapsed yet.
   * @throws TimeoutError when the operation exceeds `config.timeout`.
   * @throws Whatever the operation itself throws or rejects with.
   */
  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === CircuitState.OPEN) {
      if (Date.now() < this.nextAttemptTime) {
        throw new CircuitBreakerOpenError(
          `Circuit breaker is OPEN for node ${this.nodeId}. Next attempt at ${new Date(this.nextAttemptTime)}`
        );
      }

      // Recovery timeout elapsed: let this call through as a probe.
      this.state = CircuitState.HALF_OPEN;
      this.successCount = 0;
    }

    let result: T;
    try {
      result = await this.executeWithTimeout(operation);
    } catch (error) {
      this.onFailure();
      throw error;
    }

    this.onSuccess();
    return result;
  }

  /**
   * Races the operation against a timer of `config.timeout` milliseconds.
   * The timer is always cleared, including when `operation` throws
   * synchronously, so no stray timer keeps the process alive.
   */
  private async executeWithTimeout<T>(operation: () => Promise<T>): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;

    const timedOut = new Promise<never>((_resolve, reject) => {
      timer = setTimeout(() => {
        reject(new TimeoutError(`Operation timed out after ${this.config.timeout}ms`));
      }, this.config.timeout);
    });

    try {
      return await Promise.race([operation(), timedOut]);
    } finally {
      clearTimeout(timer);
    }
  }

  /** Resets the failure count and, in HALF_OPEN, counts towards closing the circuit. */
  private onSuccess(): void {
    this.failureCount = 0;

    if (this.state !== CircuitState.HALF_OPEN) {
      return;
    }

    this.successCount++;
    if (this.successCount >= this.config.successThreshold) {
      this.state = CircuitState.CLOSED;
      this.successCount = 0;
    }
  }

  /**
   * Records a failure and opens the circuit when warranted.
   *
   * Failures only accumulate within `config.monitoringPeriod`: when the
   * previous failure is older than that window, counting starts over. A
   * non-positive monitoring period disables the window.
   */
  private onFailure(): void {
    const now = Date.now();
    const window = this.config.monitoringPeriod;

    if (window > 0 && this.lastFailureTime > 0 && now - this.lastFailureTime > window) {
      this.failureCount = 0;
    }

    this.failureCount++;
    this.lastFailureTime = now;

    // A failed probe re-opens immediately; otherwise the threshold decides.
    const probeFailed = this.state === CircuitState.HALF_OPEN;
    if (probeFailed || this.failureCount >= this.config.failureThreshold) {
      this.state = CircuitState.OPEN;
      this.nextAttemptTime = now + this.config.recoveryTimeout;
    }
  }

  /**
   * Returns a snapshot of the breaker. `isHealthy` is true only while the
   * circuit is CLOSED. Timestamps that were never set are reported as the
   * Unix epoch.
   */
  getStatus(): CircuitBreakerStatus {
    return {
      nodeId: this.nodeId,
      state: this.state,
      failureCount: this.failureCount,
      successCount: this.successCount,
      lastFailureTime: new Date(this.lastFailureTime),
      nextAttemptTime: new Date(this.nextAttemptTime),
      isHealthy: this.state === CircuitState.CLOSED
    };
  }
}

/**
 * Registry holding one CircuitBreaker per node id.
 */
class CircuitBreakerManager {
  private breakers: Map<string, CircuitBreaker> = new Map();

  /**
   * Returns the breaker registered for `nodeId`, creating it with `config`
   * on first use. `config` is ignored when a breaker already exists.
   */
  getOrCreateBreaker(nodeId: string, config: CircuitBreakerConfig): CircuitBreaker {
    const alreadyRegistered = this.breakers.has(nodeId);
    if (alreadyRegistered === false) {
      const created = new CircuitBreaker(nodeId, config);
      this.breakers.set(nodeId, created);
    }
    return this.breakers.get(nodeId)!;
  }

  /** Returns the status of every registered breaker, in registration order. */
  getAllStatuses(): CircuitBreakerStatus[] {
    const statuses: CircuitBreakerStatus[] = [];
    for (const registered of this.breakers.values()) {
      statuses.push(registered.getStatus());
    }
    return statuses;
  }

  /**
   * Resets the breaker for `nodeId` by replacing it with a fresh instance
   * that reuses the previous configuration. Unknown ids are ignored.
   */
  resetBreaker(nodeId: string): void {
    const existing = this.breakers.get(nodeId);
    if (!existing) {
      return;
    }

    // The configuration is private to CircuitBreaker; bracket access reads it.
    const previousConfig = existing['config'];
    this.breakers.set(nodeId, new CircuitBreaker(nodeId, previousConfig));
  }
}

Workflow-Level Error Handling

Global Error Handling

/**
 * Workflow-wide error handling configuration consumed by WorkflowErrorManager.
 */
interface WorkflowErrorHandling {
// Tried in descending priority order for every workflow error.
globalHandlers: GlobalErrorHandler[];
errorPropagation: ErrorPropagationStrategy;
// When `enabled`, unresolved errors are sent to the dead letter queue.
deadLetterQueue: DeadLetterQueueConfig;
// When `enabled`, a rollback runs for unresolved errors.
rollbackStrategy: RollbackStrategy;
notificationSettings: NotificationSettings;
}

/**
 * A workflow-level error handler registered in WorkflowErrorHandling.
 */
interface GlobalErrorHandler {
id: string;
name: string;
// Error types this handler applies to; an empty list applies to every type.
errorTypes: ErrorType[];
// Handlers with a higher priority run first.
priority: number;
// Returning the action 'continue_to_next_handler' passes the error on.
handler: (error: WorkflowError, context: ExecutionContext) => Promise<ErrorHandlingResult>;
// All conditions must hold for the handler to be considered.
conditions: ErrorCondition[];
}

/**
 * Routes workflow-level errors through the configured global handlers and,
 * when none resolves the error, runs the last-resort steps (dead letter
 * queue, rollback, notifications) before terminating the workflow.
 *
 * NOTE(review): `createWorkflowResult`, `evaluateCondition`,
 * `sendToDeadLetterQueue`, `executeRollback` and `sendErrorNotifications`
 * are used here but are not defined in this excerpt.
 */
class WorkflowErrorManager {
  constructor(
    private config: WorkflowErrorHandling,
    private circuitBreakerManager: CircuitBreakerManager
  ) {}

  /**
   * Offers the error to each applicable handler in priority order.
   *
   * The first handler whose result is not `'continue_to_next_handler'`
   * decides the outcome. A handler that throws is logged and skipped.
   *
   * @param error - The error to resolve.
   * @param context - The execution the error belongs to.
   * @returns The deciding handler's result, or a `'terminate_workflow'`
   *   result when no handler resolved the error.
   */
  async handleWorkflowError(
    error: WorkflowError,
    context: ExecutionContext
  ): Promise<WorkflowErrorHandlingResult> {
    for (const handler of this.findApplicableHandlers(error)) {
      try {
        const result = await handler.handler(error, context);

        if (result.action !== 'continue_to_next_handler') {
          return this.createWorkflowResult(result, handler);
        }
      } catch (handlerError) {
        // A broken handler must not prevent the remaining ones from running.
        console.error(`Error in global handler ${handler.id}:`, handlerError);
      }
    }

    return await this.handleUnresolvedError(error, context);
  }

  /**
   * Selects the handlers whose error types (an empty list matches every
   * type) and conditions all match, sorted by descending priority.
   */
  private findApplicableHandlers(error: WorkflowError): GlobalErrorHandler[] {
    const matchesType = (handler: GlobalErrorHandler): boolean =>
      handler.errorTypes.length === 0 || handler.errorTypes.includes(error.type);

    const matchesConditions = (handler: GlobalErrorHandler): boolean =>
      handler.conditions.every(condition => this.evaluateCondition(condition, error));

    // `filter` returns a new array, so sorting does not reorder the configuration.
    return this.config.globalHandlers
      .filter(handler => matchesType(handler) && matchesConditions(handler))
      .sort((a, b) => b.priority - a.priority);
  }

  /**
   * Last resort for an error no handler resolved.
   *
   * Each step is attempted independently: a failure in one is logged and
   * does not skip the others or hide the termination result.
   */
  private async handleUnresolvedError(
    error: WorkflowError,
    context: ExecutionContext
  ): Promise<WorkflowErrorHandlingResult> {
    if (this.config.deadLetterQueue.enabled) {
      await this.runLastResortStep('dead letter queue', () =>
        this.sendToDeadLetterQueue(error, context)
      );
    }

    if (this.config.rollbackStrategy.enabled) {
      await this.runLastResortStep('rollback', () => this.executeRollback(context));
    }

    await this.runLastResortStep('notification', () =>
      this.sendErrorNotifications(error, context)
    );

    return {
      action: 'terminate_workflow',
      error,
      reason: 'Unresolved error after all handlers attempted'
    };
  }

  /** Runs one last-resort step, logging instead of propagating its failure. */
  private async runLastResortStep(label: string, step: () => unknown): Promise<void> {
    try {
      await step();
    } catch (stepError) {
      console.error(`Unresolved error ${label} step failed:`, stepError);
    }
  }
}

// Example: Global error handlers
//
// Three handlers, listed in descending priority: network/API retry (100),
// permission escalation (90) and resource allocation (80).
//
// NOTE(review): the permission and resource handlers are arrow functions that
// call helpers on `this`. Arrow functions take `this` from the enclosing
// scope, which is not visible here; at module top level it is undefined and
// these calls would throw — confirm where this array is meant to be declared.
const defaultGlobalHandlers: GlobalErrorHandler[] = [
{
id: 'network_retry_handler',
name: 'Network Error Retry Handler',
errorTypes: [ErrorType.NETWORK_ERROR, ErrorType.API_ERROR],
priority: 100,
handler: async (error: WorkflowError, context: ExecutionContext) => {
// Implement smart retry logic for network errors
const retryCount = error.retryCount || 0;

if (retryCount < 3) {
// Exponential backoff: 1000 * 2^retryCount, capped at 10000.
const delay = Math.min(1000 * Math.pow(2, retryCount), 10000);

return {
action: 'retry_node',
delay,
retryCount: retryCount + 1
};
}

return { action: 'continue_to_next_handler' };
},
// NOTE(review): this condition appears to repeat the `retryCount < 3` check
// made inside the handler — confirm how 'retry_count_less_than' is evaluated.
conditions: [
{
type: 'retry_count_less_than',
value: 3
}
]
},

{
id: 'permission_escalation_handler',
name: 'Permission Escalation Handler',
errorTypes: [ErrorType.PERMISSION_ERROR],
priority: 90,
handler: async (error: WorkflowError, context: ExecutionContext) => {
// Try to escalate permissions or suggest alternative
const canEscalate = await this.checkPermissionEscalation(context.userInfo);

if (canEscalate) {
await this.requestPermissionEscalation(error, context);

return {
action: 'pause_workflow',
reason: 'Waiting for permission escalation approval'
};
}

return { action: 'continue_to_next_handler' };
},
conditions: []
},

{
id: 'resource_allocation_handler',
name: 'Resource Allocation Handler',
errorTypes: [ErrorType.RESOURCE_ERROR],
priority: 80,
// This handler always resolves the error: it either retries or pauses.
handler: async (error: WorkflowError, context: ExecutionContext) => {
// Try to allocate more resources or queue for later
const resourcesAvailable = await this.checkResourceAvailability();

if (resourcesAvailable) {
await this.allocateAdditionalResources(context);

return {
action: 'retry_node',
delay: 5000
};
} else {
// Queue workflow for later execution
await this.queueForLaterExecution(context);

return {
action: 'pause_workflow',
reason: 'Queued for execution when resources become available'
};
}
},
conditions: []
}
];

Debugging and Diagnostics

Debug Information Collection

/**
 * One entry recorded by DebugCollector: either a node execution or an error.
 */
interface DebugInfo {
executionId: string;
nodeId: string;
timestamp: Date;
// Error entries recorded by DebugCollector.collectError use -1.
stepNumber: number;
inputData: any;
outputData: any;
executionTime: number;
memoryUsage: NodeJS.MemoryUsage;
// DebugCollector treats an entry with a truthy stackTrace as a failed step.
stackTrace?: string;
variables: Record<string, any>;
metadata: DebugMetadata;
}

/**
 * Descriptive metadata attached to every DebugInfo entry.
 */
interface DebugMetadata {
nodeType: string;
nodeVersion: string;
executionMode: string;
parentExecution?: string;
childExecutions: string[];
// Error entries are tagged with 'error' and the error's constructor name.
tags: string[];
}

/**
 * Collects per-execution debug entries (node executions and errors) and
 * turns them into traces and reports.
 *
 * NOTE(review): `deepClone`, `collectVariables`, `generateMemoryProfile`,
 * `analyzePerformance`, `findErrorPatterns`, `calculateErrorFrequency`,
 * `identifyRootCauses`, `generateErrorSuggestions` and
 * `calculateMemoryGrowth` are used here but are not defined in this excerpt.
 */
class DebugCollector {
  private debugInfo: Map<string, DebugInfo[]> = new Map();
  private isCollectionEnabled: boolean = false;

  /**
   * Turns collection on and makes sure `executionId` has an entry list.
   * NOTE(review): the enabled flag is shared, so once set, entries are
   * recorded for every execution id, not only for the one passed here.
   */
  enableCollection(executionId: string): void {
    this.isCollectionEnabled = true;
    if (!this.debugInfo.has(executionId)) {
      this.debugInfo.set(executionId, []);
    }
  }

  /**
   * Records one node execution. Input and output are deep-cloned so later
   * mutation by the workflow does not alter the recorded entry. Does nothing
   * while collection is disabled.
   */
  collectNodeExecution(
    executionId: string,
    nodeId: string,
    stepNumber: number,
    inputData: any,
    outputData: any,
    executionTime: number,
    metadata: DebugMetadata
  ): void {
    if (!this.isCollectionEnabled) return;

    const entry: DebugInfo = {
      executionId,
      nodeId,
      timestamp: new Date(),
      stepNumber,
      inputData: this.deepClone(inputData),
      outputData: this.deepClone(outputData),
      executionTime,
      memoryUsage: process.memoryUsage(),
      variables: this.collectVariables(executionId),
      metadata
    };

    this.record(executionId, entry);
  }

  /**
   * Records an error entry (step number -1). Does nothing while collection
   * is disabled.
   *
   * @param error - A native `Error` or a structured `WorkflowError`.
   * @param context - Optional caller context; only `inputData` is read.
   */
  collectError(
    executionId: string,
    nodeId: string,
    error: Error | WorkflowError,
    context: any
  ): void {
    if (!this.isCollectionEnabled) return;

    const entry: DebugInfo = {
      executionId,
      nodeId,
      timestamp: new Date(),
      stepNumber: -1,
      inputData: context?.inputData,
      outputData: null,
      executionTime: 0,
      memoryUsage: process.memoryUsage(),
      stackTrace: this.extractStackTrace(error),
      variables: this.collectVariables(executionId),
      metadata: {
        nodeType: 'error',
        nodeVersion: '1.0.0',
        executionMode: 'debug',
        childExecutions: [],
        tags: ['error', error.constructor.name]
      }
    };

    this.record(executionId, entry);
  }

  /**
   * Builds the trace for an execution: totals plus the entries ordered by
   * step number. The stored entries keep their insertion order. An unknown
   * or empty execution yields zero totals (including a memory peak of 0).
   */
  getDebugTrace(executionId: string): DebugTrace {
    const entries = this.debugInfo.get(executionId) ?? [];

    let totalExecutionTime = 0;
    let memoryPeak = 0;
    let errorCount = 0;
    for (const entry of entries) {
      totalExecutionTime += entry.executionTime;
      memoryPeak = Math.max(memoryPeak, entry.memoryUsage.heapUsed);
      if (entry.stackTrace) {
        errorCount++;
      }
    }

    return {
      executionId,
      totalSteps: entries.length,
      totalExecutionTime,
      memoryPeak,
      errorCount,
      // Sort a copy: sorting in place would reorder the collected entries.
      steps: [...entries].sort((a, b) => a.stepNumber - b.stepNumber)
    };
  }

  /** Builds the full report (summary, timeline, analyses, recommendations). */
  generateDebugReport(executionId: string): DebugReport {
    const trace = this.getDebugTrace(executionId);

    return {
      executionId,
      summary: {
        totalSteps: trace.totalSteps,
        totalTime: trace.totalExecutionTime,
        memoryPeak: trace.memoryPeak,
        errorCount: trace.errorCount,
        status: trace.errorCount > 0 ? 'failed' : 'success'
      },
      timeline: this.generateTimeline(trace),
      memoryProfile: this.generateMemoryProfile(trace),
      errorAnalysis: this.analyzeErrors(trace),
      performanceAnalysis: this.analyzePerformance(trace),
      recommendations: this.generateRecommendations(trace)
    };
  }

  /** Appends an entry to the list kept for `executionId`, creating it if needed. */
  private record(executionId: string, entry: DebugInfo): void {
    const entries = this.debugInfo.get(executionId) ?? [];
    entries.push(entry);
    this.debugInfo.set(executionId, entries);
  }

  /**
   * Returns a non-empty trace string for an error so that the entry is
   * always recognised as a failure: the native `stack`, else the structured
   * `details.stackTrace`, else the message.
   */
  private extractStackTrace(error: Error | WorkflowError): string {
    const source = error as { stack?: string; message?: string; details?: { stackTrace?: string } };
    return source.stack || source.details?.stackTrace || source.message || 'Unknown error';
  }

  /** Maps every step of the trace onto a timeline entry. */
  private generateTimeline(trace: DebugTrace): TimelineEntry[] {
    return trace.steps.map(step => {
      const failed = Boolean(step.stackTrace);
      return {
        timestamp: step.timestamp,
        nodeId: step.nodeId,
        action: failed ? 'error' : 'execute',
        duration: step.executionTime,
        status: failed ? 'failed' : 'success',
        details: step.stackTrace || `Executed in ${step.executionTime}ms`
      };
    });
  }

  /** Summarises the failed steps of the trace. */
  private analyzeErrors(trace: DebugTrace): ErrorAnalysis {
    const errorSteps = trace.steps.filter(step => step.stackTrace);

    const errorPatterns = this.findErrorPatterns(errorSteps);
    const errorFrequency = this.calculateErrorFrequency(errorSteps);
    const rootCauses = this.identifyRootCauses(errorSteps);

    return {
      totalErrors: errorSteps.length,
      // Two errors count as the same when their stack traces are identical.
      uniqueErrors: new Set(errorSteps.map(step => step.stackTrace)).size,
      errorPatterns,
      errorFrequency,
      rootCauses,
      suggestions: this.generateErrorSuggestions(errorSteps)
    };
  }

  /**
   * Derives recommendations from the trace: slow steps (over 5000 ms),
   * memory growth above 100 MB, and any recorded errors.
   */
  private generateRecommendations(trace: DebugTrace): Recommendation[] {
    const recommendations: Recommendation[] = [];

    const slowSteps = trace.steps.filter(step => step.executionTime > 5000);
    if (slowSteps.length > 0) {
      recommendations.push({
        type: 'performance',
        priority: 'high',
        title: 'Optimize slow nodes',
        description: `${slowSteps.length} nodes took longer than 5 seconds to execute`,
        action: 'Consider adding caching, optimizing queries, or using parallel execution',
        affectedNodes: slowSteps.map(step => step.nodeId)
      });
    }

    const memoryGrowth = this.calculateMemoryGrowth(trace);
    if (memoryGrowth > 100 * 1024 * 1024) { // 100MB
      recommendations.push({
        type: 'memory',
        priority: 'medium',
        title: 'High memory usage detected',
        description: `Memory usage increased by ${(memoryGrowth / 1024 / 1024).toFixed(2)}MB during execution`,
        action: 'Consider processing data in smaller chunks or clearing unused variables',
        affectedNodes: []
      });
    }

    const failedSteps = trace.steps.filter(step => step.stackTrace);
    if (failedSteps.length > 0) {
      recommendations.push({
        type: 'reliability',
        priority: 'high',
        title: 'Improve error handling',
        description: `${failedSteps.length} errors occurred during execution`,
        action: 'Add better error handling, validation, and retry mechanisms',
        affectedNodes: failedSteps.map(step => step.nodeId)
      });
    }

    return recommendations;
  }
}

Interactive Debugging

/**
 * State of one interactive debugging session managed by InteractiveDebugger.
 */
interface DebugSession {
id: string;
executionId: string;
// Breakpoints are only evaluated while the session is 'active'.
status: 'active' | 'paused' | 'stopped';
breakpoints: Breakpoint[];
watches: WatchExpression[];
// Step number of the execution context at the last pause.
currentStep: number;
stackTrace: StackFrame[];
}

/**
 * A breakpoint attached to a node within a debug session.
 */
interface Breakpoint {
id: string;
nodeId: string;
// Optional expression source; the breakpoint only pauses when it evaluates truthy.
condition?: string;
enabled: boolean;
// Incremented each time the enabled breakpoint's node is reached, before the condition is checked.
hitCount: number;
}

/**
 * An expression re-evaluated whenever its debug session pauses.
 */
interface WatchExpression {
id: string;
expression: string;
// Last evaluated value; null before the first evaluation and after a failed one.
value: any;
// Message of the last evaluation failure; cleared on success.
error?: string;
}

/**
 * Manages interactive debug sessions: breakpoints, watch expressions and
 * pause/resume state.
 *
 * NOTE(review): `generateSessionId`, `generateBreakpointId`,
 * `generateWatchId`, `getSession`, `evaluateWatch`, `captureStackTrace` and
 * `evaluateExpression` are used here but are not defined in this excerpt.
 */
class InteractiveDebugger {
  private sessions: Map<string, DebugSession> = new Map();

  /** Creates and registers a new active session for `executionId`. */
  startDebugSession(executionId: string): DebugSession {
    const session: DebugSession = {
      id: this.generateSessionId(),
      executionId,
      status: 'active',
      breakpoints: [],
      watches: [],
      currentStep: 0,
      stackTrace: []
    };

    this.sessions.set(session.id, session);
    return session;
  }

  /**
   * Adds an enabled breakpoint on `nodeId`.
   *
   * @param condition - Optional expression source; see `evaluateCondition`
   *   for how (and how unsafely) it is evaluated.
   */
  addBreakpoint(
    sessionId: string,
    nodeId: string,
    condition?: string
  ): Breakpoint {
    const session = this.getSession(sessionId);
    const breakpoint: Breakpoint = {
      id: this.generateBreakpointId(),
      nodeId,
      condition,
      enabled: true,
      hitCount: 0
    };

    session.breakpoints.push(breakpoint);
    return breakpoint;
  }

  /** Adds a watch expression to the session and triggers an initial evaluation. */
  addWatch(sessionId: string, expression: string): WatchExpression {
    const session = this.getSession(sessionId);
    const watch: WatchExpression = {
      id: this.generateWatchId(),
      expression,
      value: null
    };

    session.watches.push(watch);
    this.evaluateWatch(session, watch);
    return watch;
  }

  /**
   * Decides whether execution must pause before `nodeId`.
   *
   * Only active sessions are considered. For each enabled breakpoint on the
   * node the hit count is incremented, then the condition (if any) is
   * checked. On a hit the session is paused, the step and stack trace are
   * captured and the watches refreshed.
   *
   * @returns true when the session was paused, false otherwise.
   */
  async shouldBreak(
    sessionId: string,
    nodeId: string,
    context: ExecutionContext
  ): Promise<boolean> {
    const session = this.sessions.get(sessionId);
    if (!session || session.status !== 'active') {
      return false;
    }

    for (const breakpoint of session.breakpoints) {
      if (breakpoint.nodeId !== nodeId || !breakpoint.enabled) {
        continue;
      }

      breakpoint.hitCount++;

      if (breakpoint.condition) {
        try {
          const conditionHolds = await this.evaluateCondition(breakpoint.condition, context);
          if (!conditionHolds) continue;
        } catch (error) {
          // A condition that cannot be evaluated still pauses execution.
          console.warn(`Breakpoint condition error: ${this.describeError(error)}`);
        }
      }

      session.status = 'paused';
      session.currentStep = context.stepNumber;
      session.stackTrace = this.captureStackTrace(context);

      await this.updateWatches(session, context);

      return true;
    }

    return false;
  }

  /**
   * Resumes the session.
   * NOTE(review): pausing at the next node is not implemented in this class;
   * this only sets the status back to 'active'.
   */
  stepOver(sessionId: string): void {
    const session = this.getSession(sessionId);
    session.status = 'active';
  }

  /**
   * Resumes the session.
   * NOTE(review): pausing at the next execution point (including
   * subworkflows) is not implemented in this class; this only sets the
   * status back to 'active'.
   */
  stepInto(sessionId: string): void {
    const session = this.getSession(sessionId);
    session.status = 'active';
  }

  /** Resumes the session; it runs until a breakpoint pauses it again. */
  continue(sessionId: string): void {
    const session = this.getSession(sessionId);
    session.status = 'active';
  }

  /** Marks the session as stopped and removes it from the registry. */
  stop(sessionId: string): void {
    const session = this.getSession(sessionId);
    session.status = 'stopped';
    this.sessions.delete(sessionId);
  }

  /**
   * Evaluates a breakpoint condition against the execution context. The
   * names `input`, `output`, `variables`, `nodeId` and `stepNumber` are
   * available to the expression.
   *
   * SECURITY(review): this is NOT sandboxed. The condition string is
   * compiled with `new Function` and runs with full access to the process.
   * Conditions must only come from trusted users until a real sandbox is
   * in place.
   *
   * @returns The truthiness of the expression; false when it throws.
   */
  private async evaluateCondition(
    condition: string,
    context: ExecutionContext
  ): Promise<boolean> {
    const evalContext = {
      input: context.inputData,
      output: context.outputData,
      variables: context.variables,
      nodeId: context.nodeId,
      stepNumber: context.stepNumber
    };

    try {
      const func = new Function(
        'context',
        `with(context) { return ${condition}; }`
      );
      return Boolean(func(evalContext));
    } catch (error) {
      console.error(`Condition evaluation error: ${this.describeError(error)}`);
      return false;
    }
  }

  /**
   * Re-evaluates every watch of the session. A failing watch gets its
   * error message recorded and its value reset to null.
   */
  private async updateWatches(
    session: DebugSession,
    context: ExecutionContext
  ): Promise<void> {
    for (const watch of session.watches) {
      try {
        watch.value = await this.evaluateExpression(watch.expression, context);
        watch.error = undefined;
      } catch (error) {
        watch.error = this.describeError(error);
        watch.value = null;
      }
    }
  }

  /** Extracts a readable message from a caught value of unknown type. */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }
}

Error Monitoring and Alerting

Real-time Error Monitoring

/**
 * Configuration for ErrorMonitor.
 */
interface ErrorMonitoringConfig {
enabled: boolean;
// Every threshold is evaluated each time an error is recorded.
alertThresholds: AlertThreshold[];
aggregationWindows: AggregationWindow[];
// Each alert is sent to every destination listed here.
destinations: AlertDestination[];
suppressionRules: SuppressionRule[];
}

/**
 * A rule that raises an alert when a metric crosses a value within a time
 * window.
 */
interface AlertThreshold {
id: string;
name: string;
// ErrorMonitor.evaluateThreshold handles every metric except 'error_latency',
// which always evaluates to false there.
metric: 'error_rate' | 'error_count' | 'error_latency' | 'circuit_breaker_trips';
condition: 'greater_than' | 'less_than' | 'equals';
value: number;
window: string; // e.g., '5m', '1h', '1d'
severity: 'low' | 'medium' | 'high' | 'critical';
}

/**
 * Records error metrics, evaluates alert thresholds against them and sends
 * the resulting alerts to the configured destinations.
 *
 * NOTE(review): many helpers used here (`getMetricKey`, `parseTimeWindow`,
 * `getMetricsInWindow`, the `evaluate*`, `send*Alert` and dashboard helpers,
 * ...) are not defined in this excerpt. `config.enabled` is not consulted,
 * and recorded metrics are never pruned — confirm both are intended.
 */
class ErrorMonitor {
  private metrics: Map<string, ErrorMetric[]> = new Map();
  private alerts: Alert[] = [];

  /**
   * @param config - Monitoring configuration. Defaults to a configuration
   *   with no thresholds, destinations or suppression rules, so a monitor
   *   created without arguments records metrics but never alerts.
   */
  constructor(
    private readonly config: ErrorMonitoringConfig = {
      enabled: true,
      alertThresholds: [],
      aggregationWindows: [],
      destinations: [],
      suppressionRules: []
    }
  ) {}

  /**
   * Stores a metric for the error and evaluates every alert threshold.
   *
   * @param error - The error to record.
   * @param context - The execution the error occurred in.
   */
  recordError(error: WorkflowError, context: ExecutionContext): void {
    const metric: ErrorMetric = {
      timestamp: new Date(),
      workflowId: context.workflowId,
      executionId: context.executionId,
      nodeId: error.details.nodeId || '',
      errorType: error.type,
      severity: error.severity,
      resolved: false,
      metadata: {
        userAgent: context.userInfo.userAgent,
        environment: context.environmentInfo.environment,
        version: context.environmentInfo.version
      }
    };

    const key = this.getMetricKey(metric);
    const bucket = this.metrics.get(key) || [];
    bucket.push(metric);
    this.metrics.set(key, bucket);

    this.checkAlertThresholds(metric);
  }

  /** Creates an alert for every configured threshold the metric trips. */
  private checkAlertThresholds(metric: ErrorMetric): void {
    for (const threshold of this.config.alertThresholds) {
      if (this.evaluateThreshold(threshold, metric)) {
        this.createAlert(threshold, metric);
      }
    }
  }

  /**
   * Evaluates one threshold over the metrics recorded within its window.
   * The 'error_latency' metric has no evaluator and never alerts.
   */
  private evaluateThreshold(
    threshold: AlertThreshold,
    metric: ErrorMetric
  ): boolean {
    const windowMs = this.parseTimeWindow(threshold.window);
    const since = new Date(Date.now() - windowMs);

    const relevantMetrics = this.getMetricsInWindow(metric, since);

    switch (threshold.metric) {
      case 'error_count':
        return this.evaluateCount(relevantMetrics, threshold);

      case 'error_rate':
        return this.evaluateRate(relevantMetrics, threshold, windowMs);

      case 'circuit_breaker_trips':
        return this.evaluateCircuitBreakerTrips(relevantMetrics, threshold);

      default:
        return false;
    }
  }

  /**
   * Builds, stores and dispatches an alert unless a suppression rule
   * applies. Delivery runs in the background; `sendAlert` handles its own
   * failures, so the promise is intentionally not awaited.
   */
  private createAlert(threshold: AlertThreshold, metric: ErrorMetric): void {
    if (this.isSuppressed(threshold, metric)) {
      return;
    }

    const alert: Alert = {
      id: this.generateAlertId(),
      thresholdId: threshold.id,
      severity: threshold.severity,
      title: `${threshold.name} - ${threshold.metric} ${threshold.condition} ${threshold.value}`,
      description: this.generateAlertDescription(threshold, metric),
      timestamp: new Date(),
      workflowId: metric.workflowId,
      nodeId: metric.nodeId,
      status: 'active',
      metadata: {
        threshold,
        triggeringMetric: metric,
        affectedExecutions: this.getAffectedExecutions(metric)
      }
    };

    this.alerts.push(alert);
    void this.sendAlert(alert);
  }

  /**
   * Sends the alert to every destination in turn. A failing destination is
   * logged and does not stop delivery to the others.
   */
  private async sendAlert(alert: Alert): Promise<void> {
    for (const destination of this.config.destinations) {
      try {
        await this.sendToDestination(alert, destination);
      } catch (error) {
        console.error(`Failed to send alert to ${destination.type}:`, error);
      }
    }
  }

  /**
   * Dispatches the alert to the sender matching the destination type.
   *
   * @throws Error when the destination type is not supported, so that a
   *   misconfigured destination is reported instead of silently skipped.
   */
  private async sendToDestination(
    alert: Alert,
    destination: AlertDestination
  ): Promise<void> {
    switch (destination.type) {
      case 'email':
        await this.sendEmailAlert(alert, destination);
        break;

      case 'slack':
        await this.sendSlackAlert(alert, destination);
        break;

      case 'webhook':
        await this.sendWebhookAlert(alert, destination);
        break;

      case 'sms':
        await this.sendSmsAlert(alert, destination);
        break;

      default:
        throw new Error(`Unsupported alert destination type: ${String(destination.type)}`);
    }
  }

  /**
   * Builds the dashboard: a summary over the last hour, trends and top
   * errors over the last 24 hours, the 10 most recent alerts and a health
   * check.
   */
  generateErrorDashboard(): ErrorDashboard {
    const now = new Date();
    const oneHourAgo = new Date(now.getTime() - 60 * 60 * 1000);
    const oneDayAgo = new Date(now.getTime() - 24 * 60 * 60 * 1000);

    return {
      summary: {
        totalErrors: this.countErrorsInWindow(oneHourAgo, now),
        errorRate: this.calculateErrorRate(oneHourAgo, now),
        activeAlerts: this.alerts.filter(a => a.status === 'active').length,
        circuitBreakersOpen: this.countOpenCircuitBreakers()
      },
      trends: {
        hourly: this.getHourlyErrorTrend(oneDayAgo, now),
        byType: this.getErrorsByType(oneDayAgo, now),
        byNode: this.getErrorsByNode(oneDayAgo, now),
        bySeverity: this.getErrorsBySeverity(oneDayAgo, now)
      },
      topErrors: this.getTopErrors(oneDayAgo, now),
      recentAlerts: this.getRecentAlerts(10),
      healthCheck: this.generateHealthCheck()
    };
  }
}

Need Help?