reworked queue lib
This commit is contained in:
parent
629ba2b8d4
commit
c05a7413dc
34 changed files with 3887 additions and 861 deletions
327
libs/queue/src/queue-metrics.ts
Normal file
327
libs/queue/src/queue-metrics.ts
Normal file
|
|
@ -0,0 +1,327 @@
|
|||
import { Queue, QueueEvents } from 'bullmq';
|
||||
import { getLogger } from '@stock-bot/logger';
|
||||
import type { Job } from 'bullmq';
|
||||
|
||||
// Module-scoped logger created under the 'queue-metrics' name.
// NOTE(review): nothing in the visible part of this file references it — confirm it is still needed.
const logger = getLogger('queue-metrics');
|
||||
|
||||
/**
 * Point-in-time snapshot of a queue's state, as produced by
 * `QueueMetricsCollector.collect()`.
 */
export interface QueueMetrics {
  // Job counts, as reported by the queue at collection time.
  waiting: number;
  active: number;
  completed: number;
  failed: number;
  delayed: number;
  /** The waiting count while the queue is paused, otherwise 0. */
  paused?: number;

  /**
   * Processing-time statistics in milliseconds, computed over the samples the
   * collector currently holds. Every field is 0 when there are no samples.
   */
  processingTime: {
    avg: number;
    min: number;
    max: number;
    p95: number;
    p99: number;
  };

  /** Number of completion / failure events the collector observed in the last minute. */
  throughput: {
    completedPerMinute: number;
    failedPerMinute: number;
    /** Sum of `completedPerMinute` and `failedPerMinute`. */
    totalPerMinute: number;
  };

  /** Timestamp of the first waiting job returned by the queue, or `null` when nothing is waiting. */
  oldestWaitingJob: Date | null;

  /** `true` exactly when `healthIssues` is empty. */
  isHealthy: boolean;
  /** Human-readable description of each detected problem. */
  healthIssues: string[];
}
|
||||
|
||||
/**
 * Collects operational metrics for a single BullMQ queue.
 *
 * The collector subscribes to the queue's `active`, `completed` and `failed`
 * events on construction and keeps everything it observes in memory:
 * a one-minute window of completion/failure timestamps (throughput) and a
 * capped window of processing durations (latency statistics). Call
 * {@link QueueMetricsCollector.dispose} when the collector is no longer needed
 * so the listeners are detached.
 */
export class QueueMetricsCollector {
  /** Most recent processing durations in milliseconds (at most `maxSamples`). */
  private processingTimes: number[] = [];
  /** Local receive times (ms since epoch) of recent `completed` events. */
  private completedTimestamps: number[] = [];
  /** Local receive times (ms since epoch) of recent `failed` events. */
  private failedTimestamps: number[] = [];
  /** Local receive time of each job's latest `active` event, keyed by job id. */
  private readonly jobStartTimes = new Map<string, number>();
  /** Cumulative sum of every recorded duration (ms); never trimmed. */
  private processingTimeTotalMs = 0;
  /** Cumulative number of recorded durations; never trimmed. */
  private processingTimeCount = 0;

  private readonly maxSamples = 1000;
  /** Upper bound on `jobStartTimes` so jobs that never finish cannot leak memory. */
  private readonly maxTrackedJobs = 10000;
  private readonly metricsInterval = 60000; // 1 minute

  /** Remembers when a job started so its duration can be computed on completion. */
  private readonly onActive = ({ jobId }: { jobId: string }): void => {
    // Delete first so a retried job moves to the end of the insertion order.
    this.jobStartTimes.delete(jobId);
    this.jobStartTimes.set(jobId, Date.now());

    if (this.jobStartTimes.size > this.maxTrackedJobs) {
      const oldest = this.jobStartTimes.keys().next().value;
      if (oldest !== undefined) {
        this.jobStartTimes.delete(oldest);
      }
    }
  };

  /** Records a completion for throughput and, when possible, its processing time. */
  private readonly onCompleted = ({ jobId }: { jobId: string }): void => {
    const finishedAt = Date.now();
    this.completedTimestamps.push(finishedAt);
    this.cleanupOldTimestamps();

    const startedAt = this.jobStartTimes.get(jobId);
    if (startedAt !== undefined) {
      this.jobStartTimes.delete(jobId);
      this.recordProcessingTime(finishedAt - startedAt);
      return;
    }

    // The job became active before this collector was listening (or was
    // evicted from the map): fall back to the timestamps stored on the job.
    void this.recordProcessingTimeFromJob(jobId);
  };

  /** Records a failure for throughput and forgets the job's start time. */
  private readonly onFailed = ({ jobId }: { jobId: string }): void => {
    this.failedTimestamps.push(Date.now());
    this.cleanupOldTimestamps();
    this.jobStartTimes.delete(jobId);
  };

  /**
   * @param queue Queue whose counts and jobs are inspected.
   * @param queueEvents Event stream of the same queue; listeners are attached immediately.
   */
  constructor(
    private queue: Queue,
    private queueEvents: QueueEvents
  ) {
    this.setupEventListeners();
  }

  /**
   * Setup event listeners for metrics collection
   */
  private setupEventListeners(): void {
    this.queueEvents.on('active', this.onActive);
    this.queueEvents.on('completed', this.onCompleted);
    this.queueEvents.on('failed', this.onFailed);
  }

  /**
   * Detach the event listeners and drop per-job tracking state.
   * Already collected samples remain readable through `collect()`.
   */
  dispose(): void {
    this.queueEvents.off('active', this.onActive);
    this.queueEvents.off('completed', this.onCompleted);
    this.queueEvents.off('failed', this.onFailed);
    this.jobStartTimes.clear();
  }

  /**
   * Get job by ID. Best-effort: lookup errors and missing jobs both yield `undefined`,
   * because a lost sample must never break event handling.
   */
  private async getJob(jobId: string): Promise<Job | undefined> {
    try {
      return (await this.queue.getJob(jobId)) ?? undefined;
    } catch {
      return undefined;
    }
  }

  /**
   * Record a duration from the `processedOn` / `finishedOn` timestamps stored on the job.
   * Does nothing when the job is gone or either timestamp is missing.
   */
  private async recordProcessingTimeFromJob(jobId: string): Promise<void> {
    const job = await this.getJob(jobId);
    const startedAt = job?.processedOn;
    const finishedAt = job?.finishedOn;

    if (startedAt != null && finishedAt != null) {
      this.recordProcessingTime(finishedAt - startedAt);
    }
  }

  /**
   * Record processing time (milliseconds). Negative or non-finite values are ignored.
   */
  private recordProcessingTime(time: number): void {
    if (!Number.isFinite(time) || time < 0) {
      return;
    }

    this.processingTimes.push(time);
    this.processingTimeTotalMs += time;
    this.processingTimeCount += 1;

    // Keep only recent samples
    if (this.processingTimes.length > this.maxSamples) {
      this.processingTimes = this.processingTimes.slice(-this.maxSamples);
    }
  }

  /**
   * Drop completion/failure timestamps older than `metricsInterval`.
   */
  private cleanupOldTimestamps(): void {
    const cutoff = Date.now() - this.metricsInterval;

    this.completedTimestamps = this.completedTimestamps.filter(ts => ts > cutoff);
    this.failedTimestamps = this.failedTimestamps.filter(ts => ts > cutoff);
  }

  /**
   * Collect current metrics.
   *
   * @returns A snapshot combining live queue counts with the in-memory statistics.
   * @throws Whatever the underlying queue calls reject with.
   */
  async collect(): Promise<QueueMetrics> {
    // Events are the only other place timestamps are evicted, so an idle queue
    // would otherwise keep reporting stale entries.
    this.cleanupOldTimestamps();

    // All lookups are independent, so issue them together.
    const [waiting, active, completed, failed, delayed, isPaused, oldestWaitingJob] =
      await Promise.all([
        this.queue.getWaitingCount(),
        this.queue.getActiveCount(),
        this.queue.getCompletedCount(),
        this.queue.getFailedCount(),
        this.queue.getDelayedCount(),
        this.queue.isPaused(),
        this.getOldestWaitingJob(),
      ]);

    // BullMQ doesn't have getPausedCount, check if queue is paused
    const paused = isPaused ? waiting : 0;

    const processingTime = this.calculateProcessingTimeMetrics();
    const throughput = this.calculateThroughput();
    const { isHealthy, healthIssues } = this.checkHealth({
      waiting,
      active,
      processingTime,
      throughput,
    });

    return {
      waiting,
      active,
      completed,
      failed,
      delayed,
      paused,
      processingTime,
      throughput,
      oldestWaitingJob,
      isHealthy,
      healthIssues,
    };
  }

  /**
   * Nearest-rank percentile of an ascending-sorted array.
   *
   * @param sorted Values in ascending order.
   * @param p Fraction in (0, 1], e.g. 0.95.
   * @returns The value at rank `ceil(p * n)`, or 0 for an empty array.
   */
  private static percentile(sorted: readonly number[], p: number): number {
    if (sorted.length === 0) {
      return 0;
    }
    const rank = Math.ceil(p * sorted.length);
    const index = Math.min(sorted.length - 1, Math.max(0, rank - 1));
    return sorted[index] ?? 0;
  }

  /**
   * Calculate processing time metrics over the current sample window.
   * All fields are 0 when no sample has been recorded.
   */
  private calculateProcessingTimeMetrics(): QueueMetrics['processingTime'] {
    const sorted = [...this.processingTimes].sort((a, b) => a - b);
    const count = sorted.length;

    if (count === 0) {
      return { avg: 0, min: 0, max: 0, p95: 0, p99: 0 };
    }

    const sum = sorted.reduce((acc, val) => acc + val, 0);

    return {
      avg: Math.round(sum / count),
      min: sorted[0] ?? 0,
      max: sorted[count - 1] ?? 0,
      p95: QueueMetricsCollector.percentile(sorted, 0.95),
      p99: QueueMetricsCollector.percentile(sorted, 0.99),
    };
  }

  /**
   * Calculate throughput metrics: events observed within the last `metricsInterval`.
   */
  private calculateThroughput(): QueueMetrics['throughput'] {
    const cutoff = Date.now() - this.metricsInterval;

    const completedPerMinute = this.completedTimestamps.filter(ts => ts > cutoff).length;
    const failedPerMinute = this.failedTimestamps.filter(ts => ts > cutoff).length;

    return {
      completedPerMinute,
      failedPerMinute,
      totalPerMinute: completedPerMinute + failedPerMinute,
    };
  }

  /**
   * Get the timestamp of the oldest waiting job, or `null` when nothing is waiting.
   */
  private async getOldestWaitingJob(): Promise<Date | null> {
    // The range end is inclusive, so (0, 0) fetches exactly one job.
    // NOTE(review): relies on getWaiting() returning the oldest job first — confirm for the BullMQ version in use.
    const [oldest] = await this.queue.getWaiting(0, 0);

    return oldest ? new Date(oldest.timestamp) : null;
  }

  /**
   * Check queue health.
   *
   * The failure rate is computed from the same one-minute window for both
   * failures and completions; it is skipped when nothing finished in that window.
   */
  private checkHealth(metrics: {
    waiting: number;
    active: number;
    processingTime: QueueMetrics['processingTime'];
    throughput: QueueMetrics['throughput'];
  }): { isHealthy: boolean; healthIssues: string[] } {
    const issues: string[] = [];

    // Check for high failure rate
    const { failedPerMinute, totalPerMinute } = metrics.throughput;
    if (totalPerMinute > 0) {
      const failureRate = failedPerMinute / totalPerMinute;
      if (failureRate > 0.1) {
        issues.push(`High failure rate: ${(failureRate * 100).toFixed(1)}%`);
      }
    }

    // Check for queue backlog
    if (metrics.waiting > 1000) {
      issues.push(`Large queue backlog: ${metrics.waiting} jobs waiting`);
    }

    // Check for slow processing
    if (metrics.processingTime.avg > 30000) { // 30 seconds
      issues.push(`Slow average processing time: ${(metrics.processingTime.avg / 1000).toFixed(1)}s`);
    }

    // Check for stalled active jobs
    if (metrics.active > 100) {
      issues.push(`High number of active jobs: ${metrics.active}`);
    }

    return {
      isHealthy: issues.length === 0,
      healthIssues: issues,
    };
  }

  /**
   * Get formatted metrics report (plain text, one metric per line).
   */
  async getReport(): Promise<string> {
    const metrics = await this.collect();
    const secs = (ms: number): string => (ms / 1000).toFixed(2);
    const { processingTime, throughput } = metrics;

    const lines = [
      'Queue Metrics Report',
      '===================',
      `Status: ${metrics.isHealthy ? '✅ Healthy' : '⚠️ Issues Detected'}`,
      '',
      'Job Counts:',
      `- Waiting: ${metrics.waiting}`,
      `- Active: ${metrics.active}`,
      `- Completed: ${metrics.completed}`,
      `- Failed: ${metrics.failed}`,
      `- Delayed: ${metrics.delayed}`,
      `- Paused: ${metrics.paused ?? 0}`,
      '',
      'Performance:',
      `- Avg Processing Time: ${secs(processingTime.avg)}s`,
      `- Min/Max: ${secs(processingTime.min)}s / ${secs(processingTime.max)}s`,
      `- P95/P99: ${secs(processingTime.p95)}s / ${secs(processingTime.p99)}s`,
      '',
      'Throughput:',
      `- Completed/min: ${throughput.completedPerMinute}`,
      `- Failed/min: ${throughput.failedPerMinute}`,
      `- Total/min: ${throughput.totalPerMinute}`,
      '',
      metrics.oldestWaitingJob
        ? `Oldest Waiting Job: ${metrics.oldestWaitingJob.toISOString()}`
        : 'No waiting jobs',
    ];

    if (metrics.healthIssues.length > 0) {
      lines.push('', '', 'Health Issues:', ...metrics.healthIssues.map(issue => `- ${issue}`));
    }

    return lines.join('\n');
  }

  /** Escape a string for use inside a double-quoted Prometheus label value. */
  private static escapeLabelValue(value: string): string {
    return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, '\\n');
  }

  /**
   * Export metrics in Prometheus text exposition format.
   *
   * Quantiles are computed over the current sample window; `_sum` and `_count`
   * are cumulative since the collector was created.
   */
  async getPrometheusMetrics(): Promise<string> {
    const metrics = await this.collect();
    const q = QueueMetricsCollector.escapeLabelValue(this.queue.name);
    const secs = (ms: number): string => (ms / 1000).toFixed(3);

    const sorted = [...this.processingTimes].sort((a, b) => a - b);
    const median = QueueMetricsCollector.percentile(sorted, 0.5);

    return [
      '# HELP queue_jobs_total Total number of jobs by status',
      '# TYPE queue_jobs_total gauge',
      `queue_jobs_total{queue="${q}",status="waiting"} ${metrics.waiting}`,
      `queue_jobs_total{queue="${q}",status="active"} ${metrics.active}`,
      `queue_jobs_total{queue="${q}",status="completed"} ${metrics.completed}`,
      `queue_jobs_total{queue="${q}",status="failed"} ${metrics.failed}`,
      `queue_jobs_total{queue="${q}",status="delayed"} ${metrics.delayed}`,
      `queue_jobs_total{queue="${q}",status="paused"} ${metrics.paused ?? 0}`,
      '',
      '# HELP queue_processing_time_seconds Job processing time in seconds',
      '# TYPE queue_processing_time_seconds summary',
      `queue_processing_time_seconds{queue="${q}",quantile="0.5"} ${secs(median)}`,
      `queue_processing_time_seconds{queue="${q}",quantile="0.95"} ${secs(metrics.processingTime.p95)}`,
      `queue_processing_time_seconds{queue="${q}",quantile="0.99"} ${secs(metrics.processingTime.p99)}`,
      `queue_processing_time_seconds_sum{queue="${q}"} ${secs(this.processingTimeTotalMs)}`,
      `queue_processing_time_seconds_count{queue="${q}"} ${this.processingTimeCount}`,
      '',
      '# HELP queue_throughput_per_minute Jobs processed per minute',
      '# TYPE queue_throughput_per_minute gauge',
      `queue_throughput_per_minute{queue="${q}",status="completed"} ${metrics.throughput.completedPerMinute}`,
      `queue_throughput_per_minute{queue="${q}",status="failed"} ${metrics.throughput.failedPerMinute}`,
      `queue_throughput_per_minute{queue="${q}",status="total"} ${metrics.throughput.totalPerMinute}`,
      '',
      '# HELP queue_health Queue health status',
      '# TYPE queue_health gauge',
      `queue_health{queue="${q}"} ${metrics.isHealthy ? 1 : 0}`,
    ].join('\n');
  }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue