// File-viewer metadata (not part of the program): 258 lines · No EOL · 6.7 KiB · TypeScript
import { Queue, type Job } from 'bullmq';
|
|
import { getLogger } from '@stock-bot/logger';
|
|
import type { JobData } from './types';
|
|
import { getRedisConnection } from './utils';
|
|
|
|
// Module-level logger used by the DLQ handler below; 'dlq-handler' is the name passed to getLogger.
const logger = getLogger('dlq-handler');
|
|
|
|
/**
 * Tuning options for the dead-letter-queue handler.
 *
 * Every field is optional; the handler's constructor fills in a default for
 * any field that is `undefined` or `null`.
 */
export interface DLQConfig {
  /**
   * Attempt limit used when a failed job carries no `opts.attempts` of its
   * own, and the `attempts` value given to jobs re-queued from the DLQ.
   * Handler default: 3.
   */
  maxRetries?: number;

  /**
   * Value passed as the `delay` option when a DLQ entry is re-queued onto the
   * main queue, in milliseconds. Handler default: 60000 (1 minute).
   */
  retryDelay?: number;

  /**
   * Number of DLQ entries at (or above) which the handler logs an alert.
   * Handler default: 100.
   */
  alertThreshold?: number;

  /**
   * Age, in hours, after which `cleanup()` removes a DLQ entry.
   * Handler default: 168 (7 days).
   */
  cleanupAge?: number;
}
|
|
|
|
/**
 * Dead-letter-queue (DLQ) companion for a BullMQ queue.
 *
 * Jobs from the main queue that have used up their attempts are copied into a
 * second queue named `<mainQueue.name>-dlq`, where they can be inspected,
 * re-queued onto the main queue, counted, and eventually cleaned up.
 *
 * This class never attaches a worker to the DLQ, so entries it adds are
 * expected to stay in the `waiting` state. `completed` and `failed` are read
 * as well in case some other consumer processes the DLQ.
 */
export class DeadLetterQueueHandler {
  /** DLQ job states that stats, retry, inspection and cleanup all read from. */
  private static readonly DLQ_STATES = ['completed', 'failed', 'waiting'] as const;

  private dlq: Queue;
  private config: Required<DLQConfig>;

  /**
   * Failures observed per `<job.name>:<job.id>`; used for logging only.
   * NOTE(review): an entry is removed only when its job is moved to the DLQ,
   * so jobs that fail and later succeed leave an entry behind until
   * `shutdown()`. Confirm whether a success hook should clear it.
   */
  private failureCount = new Map<string, number>();

  /**
   * @param mainQueue  Queue whose failed jobs are handled; its name determines the DLQ name.
   * @param connection Connection settings, passed through `getRedisConnection` for the DLQ.
   * @param config     Optional tuning; missing fields fall back to the defaults below.
   */
  constructor(
    private mainQueue: Queue,
    private connection: any,
    config: DLQConfig = {}
  ) {
    this.config = {
      maxRetries: config.maxRetries ?? 3,
      retryDelay: config.retryDelay ?? 60000, // 1 minute
      alertThreshold: config.alertThreshold ?? 100,
      cleanupAge: config.cleanupAge ?? 168, // 7 days
    };

    // The DLQ shares the main queue's name with a "-dlq" suffix.
    const dlqName = `${mainQueue.name}-dlq`;
    this.dlq = new Queue(dlqName, { connection: getRedisConnection(connection) });
  }

  /**
   * Record a job failure and move the job to the DLQ once it is out of attempts.
   *
   * The attempt limit is the job's own `opts.attempts` when that is set (and
   * non-zero), otherwise `config.maxRetries`.
   * NOTE(review): a job without `opts.attempts` is therefore kept out of the
   * DLQ until `attemptsMade` reaches `maxRetries`; confirm the main queue
   * actually retries such jobs that many times.
   *
   * @param job   The job that just failed.
   * @param error The error it failed with.
   */
  async handleFailedJob(job: Job, error: Error): Promise<void> {
    const jobKey = `${job.name}:${job.id}`;
    const currentFailures = (this.failureCount.get(jobKey) ?? 0) + 1;
    this.failureCount.set(jobKey, currentFailures);

    logger.warn('Job failed', {
      jobId: job.id,
      jobName: job.name,
      attempt: job.attemptsMade,
      maxAttempts: job.opts.attempts,
      error: error.message,
      failureCount: currentFailures,
    });

    const attemptLimit = job.opts.attempts || this.config.maxRetries;
    if (job.attemptsMade >= attemptLimit) {
      await this.moveToDeadLetterQueue(job, error);
      this.failureCount.delete(jobKey);
    }
  }

  /**
   * Copy a failed job (plus the error that killed it) into the DLQ.
   *
   * Never throws: a failed insert is logged and swallowed so that the caller's
   * failure handling keeps going. The alert check runs only after a successful
   * insert and has its own error handling, so a stats failure is not reported
   * as a failed move.
   */
  private async moveToDeadLetterQueue(job: Job, error: Error): Promise<void> {
    try {
      const dlqData = {
        originalJob: {
          id: job.id,
          name: job.name,
          data: job.data,
          opts: job.opts,
          attemptsMade: job.attemptsMade,
          failedReason: job.failedReason,
          processedOn: job.processedOn,
          timestamp: job.timestamp,
        },
        error: {
          message: error.message,
          stack: error.stack,
          name: error.name,
        },
        movedToDLQAt: new Date().toISOString(),
      };

      await this.dlq.add('failed-job', dlqData, {
        removeOnComplete: false,
        removeOnFail: false,
      });

      logger.error('Job moved to DLQ', {
        jobId: job.id,
        jobName: job.name,
        error: error.message,
      });
    } catch (dlqError) {
      logger.error('Failed to move job to DLQ', {
        jobId: job.id,
        error: dlqError,
      });
      return;
    }

    try {
      await this.checkAlertThreshold();
    } catch (alertError) {
      logger.error('Failed to check DLQ alert threshold', {
        jobId: job.id,
        error: alertError,
      });
    }
  }

  /**
   * Load DLQ entries from every state in `DLQ_STATES`.
   *
   * BullMQ applies the start/end range to each state separately and treats
   * `end` as inclusive, so the request uses `limit - 1` and the merged result
   * is capped again afterwards. Entries that vanished between listing and
   * loading come back empty and are dropped.
   *
   * @param limit Maximum number of entries; omit for all. Zero, negative or NaN yields none.
   */
  private async fetchDlqJobs(limit?: number): Promise<Job[]> {
    let end = -1; // -1 means "through the last entry"
    if (limit !== undefined) {
      const max = Math.floor(limit);
      if (!(max > 0)) {
        return [];
      }
      if (Number.isFinite(max)) {
        end = max - 1;
      }
    }

    const jobs = await this.dlq.getJobs([...DeadLetterQueueHandler.DLQ_STATES], 0, end);
    const present = jobs.filter((job): job is Job => Boolean(job));
    return end === -1 ? present : present.slice(0, end + 1);
  }

  /**
   * Re-queue up to `limit` DLQ entries onto the main queue.
   *
   * Each entry is re-added with its original name, data and options, except
   * that `delay` and `attempts` are overridden from the config. The DLQ entry
   * is removed only after the re-add succeeded; a failure on one entry is
   * logged and does not stop the others.
   *
   * NOTE(review): the original options are spread as-is, so a custom `jobId`
   * is reused. If the main queue still holds a job with that id the re-add may
   * be deduplicated — confirm against the queue's removeOnFail policy.
   *
   * @param limit Maximum number of entries to retry (default 10).
   * @returns How many entries were re-queued and removed from the DLQ.
   */
  async retryDLQJobs(limit = 10): Promise<number> {
    const jobs = await this.fetchDlqJobs(limit);
    let retriedCount = 0;

    for (const dlqJob of jobs) {
      try {
        const { originalJob } = dlqJob.data;

        await this.mainQueue.add(originalJob.name, originalJob.data, {
          ...originalJob.opts,
          delay: this.config.retryDelay,
          attempts: this.config.maxRetries,
        });

        await dlqJob.remove();
        retriedCount++;

        logger.info('Job retried from DLQ', {
          originalJobId: originalJob.id,
          jobName: originalJob.name,
        });
      } catch (error) {
        logger.error('Failed to retry DLQ job', {
          dlqJobId: dlqJob.id,
          error,
        });
      }
    }

    return retriedCount;
  }

  /**
   * Summarise the DLQ contents.
   *
   * @returns `total` entry count, `recent` entries created in the last 24
   *          hours, a per-original-job-name tally (`'unknown'` when the name
   *          is missing), and the creation time of the oldest entry (or
   *          `null` when the DLQ is empty).
   */
  async getStats(): Promise<{
    total: number;
    recent: number;
    byJobName: Record<string, number>;
    oldestJob: Date | null;
  }> {
    const allJobs = await this.fetchDlqJobs();
    const oneDayAgo = Date.now() - 24 * 60 * 60 * 1000;

    const byJobName: Record<string, number> = {};
    let oldestTimestamp: number | null = null;
    let recent = 0;

    for (const job of allJobs) {
      const jobName = job.data.originalJob?.name || 'unknown';
      byJobName[jobName] = (byJobName[jobName] ?? 0) + 1;

      // Explicit null check so that a timestamp of 0 is not mistaken for "unset".
      if (oldestTimestamp === null || job.timestamp < oldestTimestamp) {
        oldestTimestamp = job.timestamp;
      }
      if (job.timestamp > oneDayAgo) {
        recent++;
      }
    }

    return {
      total: allJobs.length,
      recent,
      byJobName,
      oldestJob: oldestTimestamp === null ? null : new Date(oldestTimestamp),
    };
  }

  /**
   * Remove DLQ entries created more than `config.cleanupAge` hours ago.
   *
   * @returns Number of entries removed.
   */
  async cleanup(): Promise<number> {
    const ageInMs = this.config.cleanupAge * 60 * 60 * 1000;
    const cutoffTime = Date.now() - ageInMs;

    const jobs = await this.fetchDlqJobs();
    let removedCount = 0;

    for (const job of jobs) {
      if (job.timestamp < cutoffTime) {
        await job.remove();
        removedCount++;
      }
    }

    logger.info('DLQ cleanup completed', {
      removedCount,
      cleanupAge: `${this.config.cleanupAge} hours`,
    });

    return removedCount;
  }

  /**
   * Log an alert when the DLQ holds at least `config.alertThreshold` entries.
   */
  private async checkAlertThreshold(): Promise<void> {
    const stats = await this.getStats();

    if (stats.total >= this.config.alertThreshold) {
      logger.error('DLQ alert threshold exceeded', {
        threshold: this.config.alertThreshold,
        currentCount: stats.total,
        byJobName: stats.byJobName,
      });
      // In a real implementation, this would trigger alerts
    }
  }

  /**
   * Return a flattened view of up to `limit` DLQ entries for inspection.
   *
   * @param limit Maximum number of entries to return (default 10).
   */
  async inspectFailedJobs(limit = 10): Promise<Array<{
    id: string;
    name: string;
    data: any;
    error: any;
    failedAt: string;
    attempts: number;
  }>> {
    const jobs = await this.fetchDlqJobs(limit);

    return jobs.map(job => ({
      id: job.data.originalJob.id,
      name: job.data.originalJob.name,
      data: job.data.originalJob.data,
      error: job.data.error,
      failedAt: job.data.movedToDLQAt,
      attempts: job.data.originalJob.attemptsMade,
    }));
  }

  /**
   * Close the DLQ queue handle and forget all tracked failure counts.
   */
  async shutdown(): Promise<void> {
    await this.dlq.close();
    this.failureCount.clear();
  }
}