Monitoring Guide
A comprehensive guide to monitoring and observability for Axon OS, covering metrics, logging, distributed tracing, alerting, dashboards, and performance monitoring.
Overview
Axon OS provides built-in monitoring to help ensure system health, performance, and reliability. The monitoring stack includes:
- Metrics Collection: Prometheus, StatsD, custom metrics
- Logging: Structured logging with multiple outputs
- Distributed Tracing: Jaeger, OpenTelemetry
- Alerting: PagerDuty, Slack, email notifications
- Dashboards: Grafana, custom web dashboards
Metrics and Monitoring
Core Metrics
System Metrics
interface SystemMetrics {
// CPU metrics
cpu_usage_percent: number;
cpu_load_1m: number;
cpu_load_5m: number;
cpu_load_15m: number;
// Memory metrics
memory_used_bytes: number;
memory_available_bytes: number;
memory_usage_percent: number;
// Disk metrics
disk_used_bytes: number;
disk_available_bytes: number;
disk_usage_percent: number;
disk_io_read_bytes_per_sec: number;
disk_io_write_bytes_per_sec: number;
// Network metrics
network_in_bytes_per_sec: number;
network_out_bytes_per_sec: number;
network_connections_active: number;
}
Application Metrics
interface ApplicationMetrics {
// HTTP metrics
http_requests_total: number;
http_request_duration_seconds: number;
http_requests_in_flight: number;
// Workflow metrics
workflow_executions_total: number;
workflow_execution_duration_seconds: number;
workflow_failures_total: number;
workflow_queue_size: number;
// Node metrics
node_executions_total: number;
node_execution_duration_seconds: number;
node_failures_total: number;
node_registry_size: number;
// Database metrics
db_connections_active: number;
db_connections_idle: number;
db_query_duration_seconds: number;
db_queries_total: number;
}
Prometheus Configuration
Metrics Exposition
# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "rules/*.yml"

scrape_configs:
  - job_name: 'axonos'
    static_configs:
      - targets: ['localhost:8080']
    metrics_path: '/metrics'
    scrape_interval: 10s

  - job_name: 'axonos-workflow-engine'
    static_configs:
      - targets: ['localhost:8081']
    metrics_path: '/metrics'

  - job_name: 'axonos-nodes'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app]
        action: keep
        regex: axonos-node
Custom Metrics Registration
import { register, Counter, Histogram, Gauge } from 'prom-client';
// HTTP request metrics
const httpRequestsTotal = new Counter({
name: 'axonos_http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'route', 'status_code']
});
const httpRequestDuration = new Histogram({
name: 'axonos_http_request_duration_seconds',
help: 'HTTP request duration in seconds',
labelNames: ['method', 'route'],
buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
});
// Workflow metrics
const workflowExecutions = new Counter({
name: 'axonos_workflow_executions_total',
help: 'Total number of workflow executions',
labelNames: ['workflow_name', 'status']
});
const activeWorkflows = new Gauge({
name: 'axonos_active_workflows',
help: 'Number of currently active workflows',
labelNames: ['status']
});
// Node metrics
const nodeExecutions = new Counter({
name: 'axonos_node_executions_total',
help: 'Total number of node executions',
labelNames: ['node_type', 'status']
});
const nodeExecutionDuration = new Histogram({
name: 'axonos_node_execution_duration_seconds',
help: 'Node execution duration in seconds',
labelNames: ['node_type'],
buckets: [0.01, 0.1, 0.5, 1, 5, 10, 30, 60, 300]
});
// Register metrics explicitly (prom-client auto-registers metrics with the default registry unless a registers option is passed, so these calls are optional but harmless)
register.registerMetric(httpRequestsTotal);
register.registerMetric(httpRequestDuration);
register.registerMetric(workflowExecutions);
register.registerMetric(activeWorkflows);
register.registerMetric(nodeExecutions);
register.registerMetric(nodeExecutionDuration);
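The counters and gauges above only become useful once the workflow engine updates them. A minimal sketch of an instrumentation wrapper, assuming a hypothetical runWorkflow function provided by the engine:

// Hypothetical wrapper: runWorkflow is an assumed engine entry point;
// the metric objects are the ones registered above.
async function instrumentedRunWorkflow(workflowName: string, input: unknown): Promise<unknown> {
  activeWorkflows.labels('running').inc();
  try {
    const result = await runWorkflow(workflowName, input);
    workflowExecutions.labels(workflowName, 'success').inc();
    return result;
  } catch (error) {
    workflowExecutions.labels(workflowName, 'failed').inc();
    throw error;
  } finally {
    activeWorkflows.labels('running').dec();
  }
}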
Metrics Collection Middleware
import { Request, Response, NextFunction } from 'express';
function metricsMiddleware(req: Request, res: Response, next: NextFunction) {
const start = Date.now();
res.on('finish', () => {
const duration = (Date.now() - start) / 1000;
const route = req.route?.path || req.path;
// Record metrics
httpRequestsTotal
.labels(req.method, route, res.statusCode.toString())
.inc();
httpRequestDuration
.labels(req.method, route)
.observe(duration);
});
next();
}
// Usage
app.use(metricsMiddleware);
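For Prometheus to scrape these metrics, the application must also expose them. A minimal sketch of a /metrics endpoint backed by the prom-client default registry (app is the Express application from the examples above; the path matches metrics_path in the scrape configuration):

import { register } from 'prom-client';

// Serve all registered metrics in the Prometheus text exposition format.
app.get('/metrics', async (req: Request, res: Response) => {
  res.set('Content-Type', register.contentType);
  res.end(await register.metrics());
});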
Grafana Dashboards
System Overview Dashboard
{
"dashboard": {
"title": "Axon OS System Overview",
"panels": [
{
"title": "CPU Usage",
"type": "stat",
"targets": [
{
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
"legendFormat": "CPU Usage %"
}
]
},
{
"title": "Memory Usage",
"type": "stat",
"targets": [
{
"expr": "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100",
"legendFormat": "Memory Usage %"
}
]
},
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(axonos_http_requests_total[5m])",
"legendFormat": "{{method}} {{route}}"
}
]
},
{
"title": "Response Times",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(axonos_http_request_duration_seconds_bucket[5m]))",
"legendFormat": "95th percentile"
},
{
"expr": "histogram_quantile(0.50, rate(axonos_http_request_duration_seconds_bucket[5m]))",
"legendFormat": "50th percentile"
}
]
}
]
}
}
Workflow Performance Dashboard
{
"dashboard": {
"title": "Axon OS Workflow Performance",
"panels": [
{
"title": "Workflow Execution Rate",
"type": "graph",
"targets": [
{
"expr": "rate(axonos_workflow_executions_total[5m])",
"legendFormat": "{{workflow_name}} ({{status}})"
}
]
},
{
"title": "Average Execution Time",
"type": "graph",
"targets": [
{
"expr": "rate(axonos_workflow_execution_duration_seconds_sum[5m]) / rate(axonos_workflow_executions_total[5m])",
"legendFormat": "{{workflow_name}}"
}
]
},
{
"title": "Active Workflows",
"type": "stat",
"targets": [
{
"expr": "sum(axonos_active_workflows)",
"legendFormat": "Active Workflows"
}
]
},
{
"title": "Workflow Queue Size",
"type": "graph",
"targets": [
{
"expr": "axonos_workflow_queue_size",
"legendFormat": "Queue Size"
}
]
}
]
}
}
Logging
Structured Logging Configuration
logging:
  level: "info"
  format: "json"
  timestamp_format: "2006-01-02T15:04:05.000Z"

  # Output destinations
  outputs:
    - type: "stdout"
      level: "info"

    - type: "file"
      level: "debug"
      file_path: "/var/log/axonos/app.log"
      rotation:
        max_size: "100MB"
        max_age: "7d"
        max_backups: 10
        compress: true

    - type: "elasticsearch"
      level: "warn"
      endpoint: "https://elasticsearch:9200"
      index: "axonos-logs"

  # Component-specific levels
  components:
    workflow_engine: "debug"
    node_registry: "info"
    api_server: "info"
    database: "warn"
    security: "info"
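On the application side, a structured logger can emit JSON records in the shape this configuration describes. A minimal sketch using winston; the library choice and file path are illustrative, not mandated by Axon OS:

import winston from 'winston';

// JSON logs with timestamps, written to stdout (info) and a file (debug),
// mirroring the stdout and file outputs configured above.
const logger = winston.createLogger({
  level: 'debug',
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.errors({ stack: true }),
    winston.format.json()
  ),
  defaultMeta: { service: 'axonos' },
  transports: [
    new winston.transports.Console({ level: 'info' }),
    new winston.transports.File({ filename: '/var/log/axonos/app.log', level: 'debug' })
  ]
});

logger.info('Workflow execution started', { component: 'workflow_engine', workflow_id: 'wf-123' });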
Log Aggregation with ELK Stack
Elasticsearch Index Template
{
"index_patterns": ["axonos-logs-*"],
"template": {
"settings": {
"number_of_shards": 3,
"number_of_replicas": 1,
"index.lifecycle.name": "axonos-logs-policy",
"index.lifecycle.rollover_alias": "axonos-logs"
},
"mappings": {
"properties": {
"@timestamp": { "type": "date" },
"level": { "type": "keyword" },
"component": { "type": "keyword" },
"message": { "type": "text" },
"user_id": { "type": "keyword" },
"workflow_id": { "type": "keyword" },
"execution_id": { "type": "keyword" },
"node_id": { "type": "keyword" },
"duration_ms": { "type": "long" },
"error": {
"properties": {
"message": { "type": "text" },
"stack": { "type": "text" },
"code": { "type": "keyword" }
}
}
}
}
}
}
Logstash Configuration
# logstash.conf
input {
beats {
port => 5044
}
http {
port => 8080
codec => json
}
}
filter {
if [fields][service] == "axonos" {
mutate {
add_field => { "service" => "axonos" }
}
# Parse JSON logs
if [message] =~ /^\{/ {
json {
source => "message"
}
}
# Add geo location for IP addresses
if [client_ip] {
geoip {
source => "client_ip"
target => "geoip"
}
}
# Parse workflow execution logs
if [component] == "workflow_engine" {
grok {
match => {
"message" => "Workflow %{WORD:workflow_name} execution %{WORD:execution_id} %{WORD:status} in %{NUMBER:duration_ms:int}ms"
}
}
}
}
}
output {
elasticsearch {
hosts => ["elasticsearch:9200"]
index => "axonos-logs-%{+YYYY.MM.dd}"
template_name => "axonos-logs"
template => "/etc/logstash/templates/axonos-logs.json"
template_overwrite => true
}
# Send critical errors to alerting
if [level] == "error" or [level] == "fatal" {
http {
url => "https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK"
http_method => "post"
format => "json"
mapping => {
"text" => "🚨 Axon OS Error: %{message}"
"channel" => "#alerts"
}
}
}
}
Log Analysis and Monitoring
Kibana Dashboards
{
"version": "7.15.0",
"objects": [
{
"id": "axonos-logs-overview",
"type": "dashboard",
"attributes": {
"title": "Axon OS Logs Overview",
"panelsJSON": "[{\"version\":\"7.15.0\",\"gridData\":{\"x\":0,\"y\":0,\"w\":24,\"h\":15,\"i\":\"1\"},\"panelIndex\":\"1\",\"embeddableConfig\":{},\"panelRefName\":\"panel_1\"}]"
}
}
]
}
Distributed Tracing
OpenTelemetry Configuration
import { NodeSDK } from '@opentelemetry/sdk-node';
import { JaegerExporter } from '@opentelemetry/exporter-jaeger';
import { getNodeAutoInstrumentations } from '@opentelemetry/auto-instrumentations-node';
const jaegerExporter = new JaegerExporter({
endpoint: 'http://jaeger:14268/api/traces',
});
const sdk = new NodeSDK({
traceExporter: jaegerExporter,
instrumentations: [getNodeAutoInstrumentations()],
serviceName: 'axonos',
serviceVersion: process.env.APP_VERSION,
});
sdk.start();
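Spans buffered by the exporter can be lost if the process exits abruptly, so the SDK should be shut down cleanly; a short sketch:

// Flush and export any pending spans before the process exits.
process.on('SIGTERM', () => {
  sdk.shutdown()
    .catch((error) => console.error('Error shutting down OpenTelemetry SDK', error))
    .finally(() => process.exit(0));
});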
Custom Tracing
import { trace, context, SpanStatusCode } from '@opentelemetry/api';
const tracer = trace.getTracer('axonos-workflow-engine');
async function executeWorkflow(workflowId: string, input: any): Promise<any> {
const span = tracer.startSpan('workflow.execute', {
attributes: {
'workflow.id': workflowId,
'workflow.input_size': JSON.stringify(input).length
}
});
try {
const result = await context.with(trace.setSpan(context.active(), span), async () => {
// Create child spans for each node execution
const nodes = await getWorkflowNodes(workflowId);
const results = [];
for (const node of nodes) {
const nodeSpan = tracer.startSpan('node.execute', {
attributes: {
'node.type': node.type,
'node.id': node.id
}
});
try {
const nodeResult = await executeNode(node, input);
results.push(nodeResult);
nodeSpan.setStatus({ code: SpanStatusCode.OK });
} catch (error) {
nodeSpan.recordException(error);
nodeSpan.setStatus({
code: SpanStatusCode.ERROR,
message: error.message
});
throw error;
} finally {
nodeSpan.end();
}
}
return results;
});
span.setAttributes({
'workflow.result_size': JSON.stringify(result).length,
'workflow.nodes_executed': result.length
});
span.setStatus({ code: SpanStatusCode.OK });
return result;
} catch (error) {
span.recordException(error);
span.setStatus({
code: SpanStatusCode.ERROR,
message: error.message
});
throw error;
} finally {
span.end();
}
}
Alerting
Alert Rules Configuration
# alerting_rules.yml
groups:
  - name: axonos_system
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          service: axonos
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value }}% for more than 5 minutes"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 3m
        labels:
          severity: warning
          service: axonos
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is {{ $value }}% for more than 3 minutes"

      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)) * 100 > 90
        for: 1m
        labels:
          severity: critical
          service: axonos
        annotations:
          summary: "Disk space is running low"
          description: "Disk usage is {{ $value }}% on {{ $labels.device }}"

  - name: axonos_application
    rules:
      - alert: HighErrorRate
        expr: rate(axonos_http_requests_total{status_code=~"5.."}[5m]) / rate(axonos_http_requests_total[5m]) > 0.05
        for: 2m
        labels:
          severity: critical
          service: axonos
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for endpoint {{ $labels.route }}"

      - alert: SlowResponseTimes
        expr: histogram_quantile(0.95, rate(axonos_http_request_duration_seconds_bucket[5m])) > 2
        for: 3m
        labels:
          severity: warning
          service: axonos
        annotations:
          summary: "Slow response times detected"
          description: "95th percentile response time is {{ $value }}s for {{ $labels.route }}"

      - alert: WorkflowFailureRate
        expr: rate(axonos_workflow_executions_total{status="failed"}[10m]) / rate(axonos_workflow_executions_total[10m]) > 0.1
        for: 5m
        labels:
          severity: warning
          service: axonos
        annotations:
          summary: "High workflow failure rate"
          description: "Workflow failure rate is {{ $value | humanizePercentage }}"
AlertManager Configuration
# alertmanager.yml
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@axonos.dev'

route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        severity: warning
      receiver: 'warning-alerts'

receivers:
  - name: 'default'
    webhook_configs:
      - url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'

  - name: 'critical-alerts'
    email_configs:
      - to: 'oncall@axonos.dev'
        subject: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
        body: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_KEY'
        description: '{{ .GroupLabels.alertname }}'

  - name: 'warning-alerts'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#alerts'
        title: '⚠️ Warning: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
Health Checks
Application Health Endpoints
import { Request, Response } from 'express';
import fs from 'node:fs';
// db and redis below are assumed to be already-initialized database and Redis clients.
interface HealthStatus {
status: 'healthy' | 'degraded' | 'unhealthy';
version: string;
timestamp: string;
uptime: number;
checks: HealthCheck[];
}
interface HealthCheck {
name: string;
status: 'pass' | 'fail' | 'warn';
duration_ms: number;
error?: string;
}
class HealthMonitor {
private checks: Map<string, () => Promise<HealthCheck>> = new Map();
constructor() {
this.registerCheck('database', this.checkDatabase.bind(this));
this.registerCheck('redis', this.checkRedis.bind(this));
this.registerCheck('node_registry', this.checkNodeRegistry.bind(this)); // checkNodeRegistry omitted for brevity
this.registerCheck('disk_space', this.checkDiskSpace.bind(this));
}
registerCheck(name: string, check: () => Promise<HealthCheck>) {
this.checks.set(name, check);
}
async getHealth(): Promise<HealthStatus> {
const checks: HealthCheck[] = [];
for (const [name, check] of this.checks) {
  // Time each check individually so duration_ms reflects only that check.
  const checkStart = Date.now();
  try {
    const result = await check();
    checks.push(result);
  } catch (error) {
    checks.push({
      name,
      status: 'fail',
      duration_ms: Date.now() - checkStart,
      error: error.message
    });
  }
}
const hasFailures = checks.some(c => c.status === 'fail');
const hasWarnings = checks.some(c => c.status === 'warn');
let status: 'healthy' | 'degraded' | 'unhealthy';
if (hasFailures) {
status = 'unhealthy';
} else if (hasWarnings) {
status = 'degraded';
} else {
status = 'healthy';
}
return {
status,
version: process.env.APP_VERSION || 'unknown',
timestamp: new Date().toISOString(),
uptime: process.uptime(),
checks
};
}
private async checkDatabase(): Promise<HealthCheck> {
const start = Date.now();
try {
await db.query('SELECT 1');
return {
name: 'database',
status: 'pass',
duration_ms: Date.now() - start
};
} catch (error) {
return {
name: 'database',
status: 'fail',
duration_ms: Date.now() - start,
error: error.message
};
}
}
private async checkRedis(): Promise<HealthCheck> {
const start = Date.now();
try {
await redis.ping();
return {
name: 'redis',
status: 'pass',
duration_ms: Date.now() - start
};
} catch (error) {
return {
name: 'redis',
status: 'fail',
duration_ms: Date.now() - start,
error: error.message
};
}
}
private async checkDiskSpace(): Promise<HealthCheck> {
const start = Date.now();
try {
const stats = await fs.promises.statfs('/var/lib/axonos'); // fs.promises.statfs requires Node.js 18.15+
const freeSpace = stats.bavail * stats.bsize;
const totalSpace = stats.blocks * stats.bsize;
const usagePercent = ((totalSpace - freeSpace) / totalSpace) * 100;
let status: 'pass' | 'warn' | 'fail';
if (usagePercent > 90) {
status = 'fail';
} else if (usagePercent > 80) {
status = 'warn';
} else {
status = 'pass';
}
return {
name: 'disk_space',
status,
duration_ms: Date.now() - start
};
} catch (error) {
return {
name: 'disk_space',
status: 'fail',
duration_ms: Date.now() - start,
error: error.message
};
}
}
}
// Health check endpoints
const healthMonitor = new HealthMonitor();
app.get('/health', async (req: Request, res: Response) => {
const health = await healthMonitor.getHealth();
const statusCode = health.status === 'healthy' ? 200 :
health.status === 'degraded' ? 200 : 503;
res.status(statusCode).json(health);
});
app.get('/health/live', (req: Request, res: Response) => {
res.status(200).json({ status: 'alive' });
});
app.get('/health/ready', async (req: Request, res: Response) => {
const health = await healthMonitor.getHealth();
const ready = health.status !== 'unhealthy';
res.status(ready ? 200 : 503).json({ ready });
});
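Additional checks can be plugged in through registerCheck without modifying the monitor. A sketch of a queue-depth check, assuming a hypothetical getWorkflowQueueSize helper exposed by the workflow engine:

// Hypothetical check: warn when the workflow queue backs up.
healthMonitor.registerCheck('workflow_queue', async () => {
  const start = Date.now();
  const queueSize = await getWorkflowQueueSize(); // assumed helper, not part of the core API
  return {
    name: 'workflow_queue',
    status: queueSize > 1000 ? 'warn' : 'pass',
    duration_ms: Date.now() - start
  };
});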
Performance Monitoring
Application Performance Monitoring (APM)
import { performance } from 'perf_hooks';
class PerformanceMonitor {
private metrics: Map<string, number[]> = new Map();
startTimer(operation: string): () => void {
const start = performance.now();
return () => {
const duration = performance.now() - start;
this.recordMetric(operation, duration);
};
}
recordMetric(operation: string, duration: number) {
if (!this.metrics.has(operation)) {
this.metrics.set(operation, []);
}
const values = this.metrics.get(operation)!;
values.push(duration);
// Keep only last 1000 measurements
if (values.length > 1000) {
values.shift();
}
// Send to metrics backend
nodeExecutionDuration
.labels(operation)
.observe(duration / 1000); // Convert to seconds
}
getStats(operation: string) {
const values = this.metrics.get(operation) || [];
if (values.length === 0) return null;
const sorted = [...values].sort((a, b) => a - b);
const sum = values.reduce((a, b) => a + b, 0);
return {
count: values.length,
min: sorted[0],
max: sorted[sorted.length - 1],
avg: sum / values.length,
p50: sorted[Math.floor(sorted.length * 0.5)],
p95: sorted[Math.floor(sorted.length * 0.95)],
p99: sorted[Math.floor(sorted.length * 0.99)]
};
}
}
const perfMonitor = new PerformanceMonitor();
// Usage in workflow execution
async function executeNode(node: Node, input: any): Promise<any> {
const endTimer = perfMonitor.startTimer(`node.${node.type}.execute`);
try {
const result = await node.execute(input);
return result;
} finally {
endTimer();
}
}
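The in-memory statistics can also be surfaced for ad-hoc inspection; a sketch of a debug endpoint (the route path is illustrative):

// Illustrative debug endpoint, e.g. GET /debug/perf/node.http.execute
app.get('/debug/perf/:operation', (req: Request, res: Response) => {
  const stats = perfMonitor.getStats(req.params.operation);
  if (!stats) {
    res.status(404).json({ error: 'No measurements recorded for this operation' });
    return;
  }
  res.json(stats);
});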
Database Performance Monitoring
import { Pool } from 'pg';
import { performance } from 'perf_hooks';
// dbConnectionsActive, dbConnectionsIdle, dbConnectionsWaiting, dbQueriesTotal and dbQueryDuration
// are assumed to be prom-client gauges, counters, and histograms registered as in the earlier examples.
class DatabaseMonitor {
private pool: Pool;
constructor(pool: Pool) {
this.pool = pool;
this.setupMonitoring();
}
private setupMonitoring() {
// Monitor connection pool
setInterval(() => {
const totalCount = this.pool.totalCount;
const idleCount = this.pool.idleCount;
const waitingCount = this.pool.waitingCount;
// Send metrics to Prometheus
dbConnectionsActive.set(totalCount - idleCount);
dbConnectionsIdle.set(idleCount);
dbConnectionsWaiting.set(waitingCount);
}, 5000);
}
async query(text: string, params?: any[]): Promise<any> {
const start = performance.now();
const client = await this.pool.connect();
try {
const result = await client.query(text, params);
const duration = performance.now() - start;
// Record query metrics
dbQueriesTotal.inc();
dbQueryDuration.observe(duration / 1000);
// Log slow queries
if (duration > 1000) { // > 1 second
console.warn('Slow query detected', {
duration_ms: duration,
query: text,
params: params
});
}
return result;
} finally {
client.release();
}
}
}
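Wiring the monitor in is a matter of routing application queries through it; a short usage sketch (connection settings and table names are illustrative):

const pool = new Pool({ connectionString: process.env.DATABASE_URL });
const monitoredDb = new DatabaseMonitor(pool);

// Every query issued through the wrapper is timed, counted, and checked for slowness.
async function listRunningWorkflows() {
  return monitoredDb.query('SELECT id, status FROM workflows WHERE status = $1', ['running']);
}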
Monitoring Best Practices
1. Metric Naming Conventions
- Use snake_case for metric names
- Include units in metric names (e.g., _seconds, _bytes)
- Use consistent prefixes (e.g., axonos_)
- Include relevant labels for filtering
2. Alert Configuration
- Set appropriate thresholds based on SLAs
- Use multiple severity levels
- Include actionable information in alerts
- Test alert channels regularly
3. Dashboard Design
- Group related metrics together
- Use consistent time ranges
- Include SLA targets as reference lines
- Make dashboards mobile-friendly
4. Performance Optimization
- Sample high-frequency metrics
- Use histogram buckets appropriate for your use case
- Avoid high-cardinality labels (see the sketch after this list)
- Regular cleanup of old metrics
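To illustrate the cardinality point, label HTTP metrics with the route template rather than the raw URL or a user ID, because every distinct label value creates a separate time series; a short sketch using the counter registered earlier:

// Good: bounded label values (method, route template, status code).
httpRequestsTotal.labels('GET', '/workflows/:id', '200').inc();

// Risky: unbounded label values such as raw URLs or user IDs create
// one time series per distinct value and can overwhelm Prometheus.
// httpRequestsTotal.labels('GET', '/workflows/8f14e45f?user=42', '200').inc();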