stock-bot/libs/queue/src/queue-metrics.ts

327 lines
No EOL
9.9 KiB
TypeScript

import { Queue, QueueEvents } from 'bullmq';
import { getLogger } from '@stock-bot/logger';
import type { Job } from 'bullmq';
// const logger = getLogger('queue-metrics');
export interface QueueMetrics {
// Job counts
waiting: number;
active: number;
completed: number;
failed: number;
delayed: number;
paused?: number;
// Performance metrics
processingTime: {
avg: number;
min: number;
max: number;
p95: number;
p99: number;
};
// Throughput
throughput: {
completedPerMinute: number;
failedPerMinute: number;
totalPerMinute: number;
};
// Job age
oldestWaitingJob: Date | null;
// Health
isHealthy: boolean;
healthIssues: string[];
}
export class QueueMetricsCollector {
private processingTimes: number[] = [];
private completedTimestamps: number[] = [];
private failedTimestamps: number[] = [];
private readonly maxSamples = 1000;
private readonly metricsInterval = 60000; // 1 minute
constructor(
private queue: Queue,
private queueEvents: QueueEvents
) {
this.setupEventListeners();
}
/**
* Setup event listeners for metrics collection
*/
private setupEventListeners(): void {
this.queueEvents.on('completed', () => {
// Record completion
this.completedTimestamps.push(Date.now());
this.cleanupOldTimestamps();
});
this.queueEvents.on('failed', () => {
// Record failure
this.failedTimestamps.push(Date.now());
this.cleanupOldTimestamps();
});
// Track processing times
this.queueEvents.on('active', async ({ jobId }) => {
const job = await this.getJob(jobId);
if (job) {
(job as any)._startTime = Date.now();
}
});
this.queueEvents.on('completed', async ({ jobId }) => {
const job = await this.getJob(jobId);
if (job && (job as any)._startTime) {
const processingTime = Date.now() - (job as any)._startTime;
this.recordProcessingTime(processingTime);
}
});
}
/**
* Get job by ID
*/
private async getJob(jobId: string): Promise<Job | undefined> {
try {
return await this.queue.getJob(jobId) || undefined;
} catch {
return undefined;
}
}
/**
* Record processing time
*/
private recordProcessingTime(time: number): void {
this.processingTimes.push(time);
// Keep only recent samples
if (this.processingTimes.length > this.maxSamples) {
this.processingTimes = this.processingTimes.slice(-this.maxSamples);
}
}
/**
* Clean up old timestamps
*/
private cleanupOldTimestamps(): void {
const cutoff = Date.now() - this.metricsInterval;
this.completedTimestamps = this.completedTimestamps.filter(ts => ts > cutoff);
this.failedTimestamps = this.failedTimestamps.filter(ts => ts > cutoff);
}
/**
* Collect current metrics
*/
async collect(): Promise<QueueMetrics> {
// Get job counts
const [waiting, active, completed, failed, delayed] = await Promise.all([
this.queue.getWaitingCount(),
this.queue.getActiveCount(),
this.queue.getCompletedCount(),
this.queue.getFailedCount(),
this.queue.getDelayedCount(),
]);
// BullMQ doesn't have getPausedCount, check if queue is paused
const paused = await this.queue.isPaused() ? waiting : 0;
// Calculate processing time metrics
const processingTime = this.calculateProcessingTimeMetrics();
// Calculate throughput
const throughput = this.calculateThroughput();
// Get oldest waiting job
const oldestWaitingJob = await this.getOldestWaitingJob();
// Check health
const { isHealthy, healthIssues } = this.checkHealth({
waiting,
active,
failed,
processingTime,
});
return {
waiting,
active,
completed,
failed,
delayed,
paused,
processingTime,
throughput,
oldestWaitingJob,
isHealthy,
healthIssues,
};
}
/**
* Calculate processing time metrics
*/
private calculateProcessingTimeMetrics(): QueueMetrics['processingTime'] {
if (this.processingTimes.length === 0) {
return { avg: 0, min: 0, max: 0, p95: 0, p99: 0 };
}
const sorted = [...this.processingTimes].sort((a, b) => a - b);
const sum = sorted.reduce((acc, val) => acc + val, 0);
return {
avg: sorted.length > 0 ? Math.round(sum / sorted.length) : 0,
min: sorted[0] || 0,
max: sorted[sorted.length - 1] || 0,
p95: sorted[Math.floor(sorted.length * 0.95)] || 0,
p99: sorted[Math.floor(sorted.length * 0.99)] || 0,
};
}
/**
* Calculate throughput metrics
*/
private calculateThroughput(): QueueMetrics['throughput'] {
const now = Date.now();
const oneMinuteAgo = now - 60000;
const completedPerMinute = this.completedTimestamps.filter(ts => ts > oneMinuteAgo).length;
const failedPerMinute = this.failedTimestamps.filter(ts => ts > oneMinuteAgo).length;
return {
completedPerMinute,
failedPerMinute,
totalPerMinute: completedPerMinute + failedPerMinute,
};
}
/**
* Get oldest waiting job
*/
private async getOldestWaitingJob(): Promise<Date | null> {
const waitingJobs = await this.queue.getWaiting(0, 1);
if (waitingJobs.length > 0) {
return new Date(waitingJobs[0].timestamp);
}
return null;
}
/**
* Check queue health
*/
private checkHealth(metrics: {
waiting: number;
active: number;
failed: number;
processingTime: QueueMetrics['processingTime'];
}): { isHealthy: boolean; healthIssues: string[] } {
const issues: string[] = [];
// Check for high failure rate
const failureRate = metrics.failed / (metrics.failed + this.completedTimestamps.length);
if (failureRate > 0.1) {
issues.push(`High failure rate: ${(failureRate * 100).toFixed(1)}%`);
}
// Check for queue backlog
if (metrics.waiting > 1000) {
issues.push(`Large queue backlog: ${metrics.waiting} jobs waiting`);
}
// Check for slow processing
if (metrics.processingTime.avg > 30000) { // 30 seconds
issues.push(`Slow average processing time: ${(metrics.processingTime.avg / 1000).toFixed(1)}s`);
}
// Check for stalled active jobs
if (metrics.active > 100) {
issues.push(`High number of active jobs: ${metrics.active}`);
}
return {
isHealthy: issues.length === 0,
healthIssues: issues,
};
}
/**
* Get formatted metrics report
*/
async getReport(): Promise<string> {
const metrics = await this.collect();
return `
Queue Metrics Report
===================
Status: ${metrics.isHealthy ? '✅ Healthy' : '⚠️ Issues Detected'}
Job Counts:
- Waiting: ${metrics.waiting}
- Active: ${metrics.active}
- Completed: ${metrics.completed}
- Failed: ${metrics.failed}
- Delayed: ${metrics.delayed}
- Paused: ${metrics.paused}
Performance:
- Avg Processing Time: ${(metrics.processingTime.avg / 1000).toFixed(2)}s
- Min/Max: ${(metrics.processingTime.min / 1000).toFixed(2)}s / ${(metrics.processingTime.max / 1000).toFixed(2)}s
- P95/P99: ${(metrics.processingTime.p95 / 1000).toFixed(2)}s / ${(metrics.processingTime.p99 / 1000).toFixed(2)}s
Throughput:
- Completed/min: ${metrics.throughput.completedPerMinute}
- Failed/min: ${metrics.throughput.failedPerMinute}
- Total/min: ${metrics.throughput.totalPerMinute}
${metrics.oldestWaitingJob ? `Oldest Waiting Job: ${metrics.oldestWaitingJob.toISOString()}` : 'No waiting jobs'}
${metrics.healthIssues.length > 0 ? `\nHealth Issues:\n${metrics.healthIssues.map(issue => `- ${issue}`).join('\n')}` : ''}
`.trim();
}
/**
* Export metrics in Prometheus format
*/
async getPrometheusMetrics(): Promise<string> {
const metrics = await this.collect();
const queueName = this.queue.name;
return `
# HELP queue_jobs_total Total number of jobs by status
# TYPE queue_jobs_total gauge
queue_jobs_total{queue="${queueName}",status="waiting"} ${metrics.waiting}
queue_jobs_total{queue="${queueName}",status="active"} ${metrics.active}
queue_jobs_total{queue="${queueName}",status="completed"} ${metrics.completed}
queue_jobs_total{queue="${queueName}",status="failed"} ${metrics.failed}
queue_jobs_total{queue="${queueName}",status="delayed"} ${metrics.delayed}
queue_jobs_total{queue="${queueName}",status="paused"} ${metrics.paused}
# HELP queue_processing_time_seconds Job processing time in seconds
# TYPE queue_processing_time_seconds summary
queue_processing_time_seconds{queue="${queueName}",quantile="0.5"} ${(metrics.processingTime.avg / 1000).toFixed(3)}
queue_processing_time_seconds{queue="${queueName}",quantile="0.95"} ${(metrics.processingTime.p95 / 1000).toFixed(3)}
queue_processing_time_seconds{queue="${queueName}",quantile="0.99"} ${(metrics.processingTime.p99 / 1000).toFixed(3)}
queue_processing_time_seconds_sum{queue="${queueName}"} ${(metrics.processingTime.avg * this.processingTimes.length / 1000).toFixed(3)}
queue_processing_time_seconds_count{queue="${queueName}"} ${this.processingTimes.length}
# HELP queue_throughput_per_minute Jobs processed per minute
# TYPE queue_throughput_per_minute gauge
queue_throughput_per_minute{queue="${queueName}",status="completed"} ${metrics.throughput.completedPerMinute}
queue_throughput_per_minute{queue="${queueName}",status="failed"} ${metrics.throughput.failedPerMinute}
queue_throughput_per_minute{queue="${queueName}",status="total"} ${metrics.throughput.totalPerMinute}
# HELP queue_health Queue health status
# TYPE queue_health gauge
queue_health{queue="${queueName}"} ${metrics.isHealthy ? 1 : 0}
`.trim();
}
}