314 lines
No EOL
9.6 KiB
TypeScript
314 lines
No EOL
9.6 KiB
TypeScript
import { Queue, QueueEvents } from 'bullmq';
|
|
// import { getLogger } from '@stock-bot/logger';
|
|
|
|
// const logger = getLogger('queue-metrics');
|
|
|
|
/**
 * Point-in-time snapshot of a queue's state as produced by
 * `QueueMetricsCollector.collect()`.
 */
export interface QueueMetrics {
  // ---- Job counts, one per job state ----

  /** Number of jobs waiting to be picked up. */
  waiting: number;
  /** Number of jobs currently being processed. */
  active: number;
  /** Number of completed jobs reported by the queue. */
  completed: number;
  /** Number of failed jobs reported by the queue. */
  failed: number;
  /** Number of jobs scheduled to run later. */
  delayed: number;
  /** Number of jobs held back because the queue is paused (optional). */
  paused?: number;

  // ---- Performance ----

  /** Processing-time statistics over the sampled jobs. */
  processingTime: {
    /** Mean processing time. */
    avg: number;
    /** Fastest sampled job. */
    min: number;
    /** Slowest sampled job. */
    max: number;
    /** 95th-percentile processing time. */
    p95: number;
    /** 99th-percentile processing time. */
    p99: number;
  };

  // ---- Throughput ----

  /** Jobs finished per minute, split by outcome. */
  throughput: {
    /** Jobs that completed successfully per minute. */
    completedPerMinute: number;
    /** Jobs that failed per minute. */
    failedPerMinute: number;
    /** Completed plus failed jobs per minute. */
    totalPerMinute: number;
  };

  // ---- Job age ----

  /** Creation time of the oldest waiting job, or `null` when nothing is waiting. */
  oldestWaitingJob: Date | null;

  // ---- Health ----

  /** `true` when no health issue was detected. */
  isHealthy: boolean;
  /** Human-readable description of each detected issue (empty when healthy). */
  healthIssues: string[];
}
|
|
|
|
/**
 * Collects runtime metrics for a single queue.
 *
 * Job counts are read from the queue on demand. Processing times and
 * throughput are derived in memory from the `active`, `completed` and
 * `failed` events emitted by the supplied `QueueEvents` instance, so they
 * only cover jobs observed since this collector was constructed.
 */
export class QueueMetricsCollector {
  /** Most recent processing-time samples in milliseconds (capped at `maxSamples`). */
  private processingTimes: number[] = [];
  /** Wall-clock timestamps (ms) of completions inside the sliding window. */
  private completedTimestamps: number[] = [];
  /** Wall-clock timestamps (ms) of failures inside the sliding window. */
  private failedTimestamps: number[] = [];
  /** Start time (ms) of each job currently seen as active, keyed by job id. */
  private jobStartTimes = new Map<string, number>();
  /** Maximum number of processing-time samples retained. */
  private readonly maxSamples = 1000;
  /** Length of the sliding throughput window in milliseconds. */
  private readonly metricsInterval = 60000; // 1 minute

  /**
   * @param queue Queue whose counts and waiting jobs are inspected.
   * @param queueEvents Event source used to observe job lifecycle events.
   */
  constructor(
    private readonly queue: Queue,
    private readonly queueEvents: QueueEvents
  ) {
    this.setupEventListeners();
  }

  /**
   * Setup event listeners for metrics collection
   */
  private setupEventListeners(): void {
    // Remember when each job started so its duration can be measured later.
    this.queueEvents.on('active', ({ jobId }) => {
      this.jobStartTimes.set(jobId, Date.now());
    });

    this.queueEvents.on('completed', ({ jobId }) => {
      const now = Date.now();

      // Record completion for throughput.
      this.completedTimestamps.push(now);
      this.cleanupOldTimestamps();

      // Record the processing time when the start of the job was observed.
      const startTime = this.jobStartTimes.get(jobId);
      if (startTime !== undefined) {
        this.recordProcessingTime(now - startTime);
        this.jobStartTimes.delete(jobId);
      }
    });

    this.queueEvents.on('failed', ({ jobId }) => {
      // Record failure for throughput.
      this.failedTimestamps.push(Date.now());
      this.cleanupOldTimestamps();

      // A failed job never reaches 'completed'; drop its start time here so
      // the map does not grow without bound.
      this.jobStartTimes.delete(jobId);
    });
  }

  /**
   * Record processing time
   *
   * @param time Duration of one job in milliseconds.
   */
  private recordProcessingTime(time: number): void {
    this.processingTimes.push(time);

    // Keep only recent samples
    if (this.processingTimes.length > this.maxSamples) {
      this.processingTimes = this.processingTimes.slice(-this.maxSamples);
    }
  }

  /**
   * Clean up old timestamps: drops completion/failure timestamps that have
   * fallen out of the sliding window.
   */
  private cleanupOldTimestamps(): void {
    const cutoff = Date.now() - this.metricsInterval;

    this.completedTimestamps = this.completedTimestamps.filter(ts => ts > cutoff);
    this.failedTimestamps = this.failedTimestamps.filter(ts => ts > cutoff);
  }

  /**
   * Collect current metrics
   *
   * @returns A snapshot combining the queue's job counts with the in-memory
   *          processing-time, throughput and health figures.
   */
  async collect(): Promise<QueueMetrics> {
    // All queue reads are independent, so issue them together.
    const [waiting, active, completed, failed, delayed, isPaused, oldestWaitingJob] =
      await Promise.all([
        this.queue.getWaitingCount(),
        this.queue.getActiveCount(),
        this.queue.getCompletedCount(),
        this.queue.getFailedCount(),
        this.queue.getDelayedCount(),
        this.queue.isPaused(),
        this.getOldestWaitingJob(),
      ]);

    // Timestamps are otherwise pruned only when an event arrives; prune here
    // too so an idle queue does not report a stale window.
    this.cleanupOldTimestamps();

    // There is no dedicated paused counter in use here: when the queue is
    // paused, every waiting job is reported as paused.
    const paused = isPaused ? waiting : 0;

    const processingTime = this.calculateProcessingTimeMetrics();
    const throughput = this.calculateThroughput();

    const { isHealthy, healthIssues } = this.checkHealth({
      waiting,
      active,
      failed,
      processingTime,
    });

    return {
      waiting,
      active,
      completed,
      failed,
      delayed,
      paused,
      processingTime,
      throughput,
      oldestWaitingJob,
      isHealthy,
      healthIssues,
    };
  }

  /**
   * Returns the retained processing-time samples sorted ascending (a copy;
   * the sample buffer itself is not reordered).
   */
  private sortedProcessingTimes(): number[] {
    return [...this.processingTimes].sort((a, b) => a - b);
  }

  /**
   * Picks the value at fraction `p` (0..1) of an ascending-sorted sample
   * list, or 0 when the list is empty.
   */
  private percentile(sorted: readonly number[], p: number): number {
    return sorted[Math.floor(sorted.length * p)] ?? 0;
  }

  /**
   * Calculate processing time metrics
   *
   * @returns avg/min/max/p95/p99 in milliseconds; all zeros when no sample
   *          has been recorded yet.
   */
  private calculateProcessingTimeMetrics(): QueueMetrics['processingTime'] {
    if (this.processingTimes.length === 0) {
      return { avg: 0, min: 0, max: 0, p95: 0, p99: 0 };
    }

    const sorted = this.sortedProcessingTimes();
    const sum = sorted.reduce((acc, val) => acc + val, 0);

    return {
      avg: Math.round(sum / sorted.length),
      min: sorted[0] ?? 0,
      max: sorted[sorted.length - 1] ?? 0,
      p95: this.percentile(sorted, 0.95),
      p99: this.percentile(sorted, 0.99),
    };
  }

  /**
   * Calculate throughput metrics
   *
   * @returns Number of completions/failures observed inside the sliding window.
   */
  private calculateThroughput(): QueueMetrics['throughput'] {
    const windowStart = Date.now() - this.metricsInterval;

    const completedPerMinute = this.completedTimestamps.filter(ts => ts > windowStart).length;
    const failedPerMinute = this.failedTimestamps.filter(ts => ts > windowStart).length;

    return {
      completedPerMinute,
      failedPerMinute,
      totalPerMinute: completedPerMinute + failedPerMinute,
    };
  }

  /**
   * Get oldest waiting job
   *
   * @returns Creation time of the first waiting job, or `null` when none is waiting.
   */
  private async getOldestWaitingJob(): Promise<Date | null> {
    // Only the first entry is used, so request a single job.
    // NOTE(review): assumes the range is inclusive and that the first entry
    // returned is the oldest one — confirm against the BullMQ getters.
    const waitingJobs = await this.queue.getWaiting(0, 0);
    const first = waitingJobs[0];

    return first ? new Date(first.timestamp) : null;
  }

  /**
   * Check queue health
   *
   * The failure rate is computed over the sliding window only (failures vs.
   * failures + completions observed by this collector), so both sides of the
   * ratio cover the same period. `metrics.failed` (the queue's total failed
   * count) is accepted for interface compatibility but is not part of the
   * ratio, because it is not comparable with a one-minute completion count.
   *
   * @param metrics Counts and processing-time statistics to evaluate.
   * @returns Whether the queue is healthy and the list of detected issues.
   */
  private checkHealth(metrics: {
    waiting: number;
    active: number;
    failed: number;
    processingTime: QueueMetrics['processingTime'];
  }): { isHealthy: boolean; healthIssues: string[] } {
    const issues: string[] = [];

    // Check for high failure rate (guarding against an empty window).
    const recentFailed = this.failedTimestamps.length;
    const recentFinished = recentFailed + this.completedTimestamps.length;
    const failureRate = recentFinished > 0 ? recentFailed / recentFinished : 0;
    if (failureRate > 0.1) {
      issues.push(`High failure rate: ${(failureRate * 100).toFixed(1)}%`);
    }

    // Check for queue backlog
    if (metrics.waiting > 1000) {
      issues.push(`Large queue backlog: ${metrics.waiting} jobs waiting`);
    }

    // Check for slow processing
    if (metrics.processingTime.avg > 30000) { // 30 seconds
      issues.push(`Slow average processing time: ${(metrics.processingTime.avg / 1000).toFixed(1)}s`);
    }

    // Check for an unusually high number of concurrently active jobs
    if (metrics.active > 100) {
      issues.push(`High number of active jobs: ${metrics.active}`);
    }

    return {
      isHealthy: issues.length === 0,
      healthIssues: issues,
    };
  }

  /**
   * Get formatted metrics report
   *
   * @returns A multi-line, human-readable summary of the current metrics.
   */
  async getReport(): Promise<string> {
    const metrics = await this.collect();

    return `
Queue Metrics Report
===================
Status: ${metrics.isHealthy ? '✅ Healthy' : '⚠️ Issues Detected'}

Job Counts:
- Waiting: ${metrics.waiting}
- Active: ${metrics.active}
- Completed: ${metrics.completed}
- Failed: ${metrics.failed}
- Delayed: ${metrics.delayed}
- Paused: ${metrics.paused ?? 0}

Performance:
- Avg Processing Time: ${(metrics.processingTime.avg / 1000).toFixed(2)}s
- Min/Max: ${(metrics.processingTime.min / 1000).toFixed(2)}s / ${(metrics.processingTime.max / 1000).toFixed(2)}s
- P95/P99: ${(metrics.processingTime.p95 / 1000).toFixed(2)}s / ${(metrics.processingTime.p99 / 1000).toFixed(2)}s

Throughput:
- Completed/min: ${metrics.throughput.completedPerMinute}
- Failed/min: ${metrics.throughput.failedPerMinute}
- Total/min: ${metrics.throughput.totalPerMinute}

${metrics.oldestWaitingJob ? `Oldest Waiting Job: ${metrics.oldestWaitingJob.toISOString()}` : 'No waiting jobs'}

${metrics.healthIssues.length > 0 ? `\nHealth Issues:\n${metrics.healthIssues.map(issue => `- ${issue}`).join('\n')}` : ''}
`.trim();
  }

  /**
   * Escapes a string for use as a Prometheus label value (backslash, double
   * quote and line feed must be escaped in the text exposition format).
   */
  private escapeLabelValue(value: string): string {
    return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n');
  }

  /**
   * Export metrics in Prometheus format
   *
   * The `quantile="0.5"` series carries the median of the retained samples
   * and `_sum` is the exact sum of those samples (both in seconds).
   *
   * @returns The metrics rendered in the Prometheus text exposition format.
   */
  async getPrometheusMetrics(): Promise<string> {
    const metrics = await this.collect();
    const queueName = this.escapeLabelValue(String(this.queue.name));

    // Derive median/sum/count from one snapshot of the samples so the three
    // values are consistent with each other.
    const samples = this.sortedProcessingTimes();
    const medianMs = this.percentile(samples, 0.5);
    const sumMs = samples.reduce((acc, val) => acc + val, 0);
    const sampleCount = samples.length;

    return `
# HELP queue_jobs_total Total number of jobs by status
# TYPE queue_jobs_total gauge
queue_jobs_total{queue="${queueName}",status="waiting"} ${metrics.waiting}
queue_jobs_total{queue="${queueName}",status="active"} ${metrics.active}
queue_jobs_total{queue="${queueName}",status="completed"} ${metrics.completed}
queue_jobs_total{queue="${queueName}",status="failed"} ${metrics.failed}
queue_jobs_total{queue="${queueName}",status="delayed"} ${metrics.delayed}
queue_jobs_total{queue="${queueName}",status="paused"} ${metrics.paused ?? 0}

# HELP queue_processing_time_seconds Job processing time in seconds
# TYPE queue_processing_time_seconds summary
queue_processing_time_seconds{queue="${queueName}",quantile="0.5"} ${(medianMs / 1000).toFixed(3)}
queue_processing_time_seconds{queue="${queueName}",quantile="0.95"} ${(metrics.processingTime.p95 / 1000).toFixed(3)}
queue_processing_time_seconds{queue="${queueName}",quantile="0.99"} ${(metrics.processingTime.p99 / 1000).toFixed(3)}
queue_processing_time_seconds_sum{queue="${queueName}"} ${(sumMs / 1000).toFixed(3)}
queue_processing_time_seconds_count{queue="${queueName}"} ${sampleCount}

# HELP queue_throughput_per_minute Jobs processed per minute
# TYPE queue_throughput_per_minute gauge
queue_throughput_per_minute{queue="${queueName}",status="completed"} ${metrics.throughput.completedPerMinute}
queue_throughput_per_minute{queue="${queueName}",status="failed"} ${metrics.throughput.failedPerMinute}
queue_throughput_per_minute{queue="${queueName}",status="total"} ${metrics.throughput.totalPerMinute}

# HELP queue_health Queue health status
# TYPE queue_health gauge
queue_health{queue="${queueName}"} ${metrics.isHealthy ? 1 : 0}
`.trim();
  }
}