reworked queue lib
This commit is contained in:
parent
629ba2b8d4
commit
c05a7413dc
34 changed files with 3887 additions and 861 deletions
258
libs/queue/src/dlq-handler.ts
Normal file
258
libs/queue/src/dlq-handler.ts
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
import { Queue, type Job } from 'bullmq';
|
||||
import { getLogger } from '@stock-bot/logger';
|
||||
import type { JobData } from './types';
|
||||
import { getRedisConnection } from './utils';
|
||||
|
||||
// Logger obtained from @stock-bot/logger with the 'dlq-handler' label; shared by everything in this module.
const logger = getLogger('dlq-handler');
|
||||
|
||||
/**
 * Tuning options for `DeadLetterQueueHandler`.
 *
 * Every field is optional; the handler's constructor fills in the default
 * noted on each field when it is omitted.
 */
export interface DLQConfig {
  /**
   * Attempt count used when a job does not carry its own `opts.attempts`,
   * and the `attempts` value given to jobs re-queued from the DLQ.
   * Defaults to 3.
   */
  maxRetries?: number;
  /**
   * Delay, in milliseconds, applied to jobs re-queued from the DLQ.
   * Defaults to 60000 (1 minute).
   */
  retryDelay?: number;
  /**
   * Number of DLQ entries at which an alert is logged.
   * Defaults to 100.
   */
  alertThreshold?: number;
  /**
   * Age, in hours, after which DLQ entries are removed by `cleanup()`.
   * Defaults to 168 (7 days).
   */
  cleanupAge?: number;
}
|
||||
|
||||
/**
 * Dead letter queue (DLQ) handler for a BullMQ queue.
 *
 * Jobs that have used up their attempts on the main queue are copied into a
 * companion queue named `<main queue name>-dlq`, where they can be inspected,
 * re-queued, counted and eventually cleaned up.
 *
 * Nothing in this class attaches a worker to the DLQ, so entries added by
 * `moveToDeadLetterQueue` are expected to stay in the `waiting` state. All
 * readers therefore go through `fetchDLQJobs`, which looks at waiting,
 * completed and failed entries alike.
 */
export class DeadLetterQueueHandler {
  private dlq: Queue;
  private config: Required<DLQConfig>;
  /**
   * Failures observed per `${job.name}:${job.id}`. Only used to enrich log
   * output. An entry is dropped when the job is moved to the DLQ;
   * NOTE(review): entries for jobs that later succeed are never removed here.
   */
  private failureCount = new Map<string, number>();

  /**
   * @param mainQueue  Queue whose failed jobs are handled; retried DLQ jobs
   *                   are added back to it.
   * @param connection Connection settings, passed through
   *                   `getRedisConnection` to create the DLQ.
   * @param config     Optional overrides; see `DLQConfig` for the defaults.
   */
  constructor(
    private mainQueue: Queue,
    private connection: any,
    config: DLQConfig = {}
  ) {
    this.config = {
      maxRetries: config.maxRetries ?? 3,
      retryDelay: config.retryDelay ?? 60000, // 1 minute
      alertThreshold: config.alertThreshold ?? 100,
      cleanupAge: config.cleanupAge ?? 168, // 7 days
    };

    // Create DLQ with same name but -dlq suffix
    const dlqName = `${mainQueue.name}-dlq`;
    this.dlq = new Queue(dlqName, { connection: getRedisConnection(connection) });
  }

  /**
   * Record a job failure and, once the job has used up its attempts, move it
   * to the dead letter queue.
   *
   * @param job   The job that just failed.
   * @param error The error the job failed with.
   */
  async handleFailedJob(job: Job, error: Error): Promise<void> {
    const jobKey = `${job.name}:${job.id}`;
    const currentFailures = (this.failureCount.get(jobKey) ?? 0) + 1;
    this.failureCount.set(jobKey, currentFailures);

    logger.warn('Job failed', {
      jobId: job.id,
      jobName: job.name,
      attempt: job.attemptsMade,
      maxAttempts: job.opts.attempts,
      error: error.message,
      failureCount: currentFailures,
    });

    // `||` (not `??`) is deliberate: an `attempts` of 0 is treated the same as
    // "not set" and falls back to the configured maxRetries.
    const maxAttempts = job.opts.attempts || this.config.maxRetries;

    // Check if job should be moved to DLQ
    if (job.attemptsMade >= maxAttempts) {
      await this.moveToDeadLetterQueue(job, error);
      this.failureCount.delete(jobKey);
    }
  }

  /**
   * Copy a failed job, together with its error details, into the DLQ.
   *
   * Best effort: a failure to write to the DLQ is logged and swallowed so the
   * caller's failure handling is not interrupted. The alert-threshold check
   * runs in its own try/catch so that a failure while reading stats is not
   * misreported as a failure to move the job.
   */
  private async moveToDeadLetterQueue(job: Job, error: Error): Promise<void> {
    const dlqData = {
      originalJob: {
        id: job.id,
        name: job.name,
        data: job.data,
        opts: job.opts,
        attemptsMade: job.attemptsMade,
        failedReason: job.failedReason,
        processedOn: job.processedOn,
        timestamp: job.timestamp,
      },
      error: {
        message: error.message,
        stack: error.stack,
        name: error.name,
      },
      movedToDLQAt: new Date().toISOString(),
    };

    try {
      await this.dlq.add('failed-job', dlqData, {
        removeOnComplete: false,
        removeOnFail: false,
      });
    } catch (dlqError) {
      logger.error('Failed to move job to DLQ', {
        jobId: job.id,
        error: dlqError,
      });
      return;
    }

    logger.error('Job moved to DLQ', {
      jobId: job.id,
      jobName: job.name,
      error: error.message,
    });

    // Check if we need to alert
    try {
      await this.checkAlertThreshold();
    } catch (alertError) {
      logger.error('Failed to check DLQ alert threshold', {
        jobId: job.id,
        error: alertError,
      });
    }
  }

  /**
   * Read entries from the DLQ across the waiting, completed and failed states.
   *
   * The queue getters take an inclusive `end` index, so a limit of N is
   * requested as the range `0..N-1`, and the merged result is trimmed to N
   * because each state is ranged independently.
   *
   * @param limit Maximum number of entries to return; omit to return all.
   * @returns The DLQ jobs found, with holes left by concurrently removed jobs
   *          filtered out.
   */
  private async fetchDLQJobs(limit?: number): Promise<Job[]> {
    if (limit !== undefined && limit <= 0) {
      return [];
    }

    const end = limit === undefined ? -1 : limit - 1;
    const [waiting, completed, failed] = await Promise.all([
      this.dlq.getWaiting(0, end),
      this.dlq.getCompleted(0, end),
      this.dlq.getFailed(0, end),
    ]);

    const jobs = [...waiting, ...completed, ...failed].filter(
      (job): job is Job => job != null
    );

    return limit === undefined ? jobs : jobs.slice(0, limit);
  }

  /**
   * Re-queue jobs from the DLQ onto the main queue.
   *
   * Each job is re-added with its original options, overridden by the
   * configured `retryDelay` and `maxRetries`, and is then removed from the
   * DLQ. A failure on one entry is logged and does not stop the others.
   *
   * @param limit Maximum number of DLQ entries to retry (default 10).
   * @returns How many jobs were re-queued.
   */
  async retryDLQJobs(limit = 10): Promise<number> {
    const jobs = await this.fetchDLQJobs(limit);
    let retriedCount = 0;

    for (const dlqJob of jobs) {
      try {
        const { originalJob } = dlqJob.data;

        // Re-add to main queue with delay.
        // NOTE(review): the original opts are spread as-is; if they carry a
        // custom `jobId` that still exists on the main queue, the add may be
        // treated as a duplicate — confirm against the queue's id handling.
        await this.mainQueue.add(originalJob.name, originalJob.data, {
          ...originalJob.opts,
          delay: this.config.retryDelay,
          attempts: this.config.maxRetries,
        });

        // Remove from DLQ
        await dlqJob.remove();
        retriedCount++;

        logger.info('Job retried from DLQ', {
          originalJobId: originalJob.id,
          jobName: originalJob.name,
        });
      } catch (error) {
        logger.error('Failed to retry DLQ job', {
          dlqJobId: dlqJob.id,
          error,
        });
      }
    }

    return retriedCount;
  }

  /**
   * Summarise the current contents of the DLQ.
   *
   * @returns `total` entries, entries added in the last 24 hours (`recent`),
   *          a per-original-job-name count (`byJobName`, with `'unknown'` for
   *          entries lacking a name) and the creation time of the oldest entry
   *          (`oldestJob`, or `null` when the DLQ is empty).
   */
  async getStats(): Promise<{
    total: number;
    recent: number;
    byJobName: Record<string, number>;
    oldestJob: Date | null;
  }> {
    const allJobs = await this.fetchDLQJobs();
    const byJobName: Record<string, number> = {};
    let oldestTimestamp: number | null = null;

    for (const job of allJobs) {
      const jobName = job.data?.originalJob?.name || 'unknown';
      byJobName[jobName] = (byJobName[jobName] ?? 0) + 1;

      // Compare against null explicitly so a timestamp of 0 is not mistaken
      // for "nothing seen yet".
      if (oldestTimestamp === null || job.timestamp < oldestTimestamp) {
        oldestTimestamp = job.timestamp;
      }
    }

    // Count recent jobs (last 24 hours)
    const oneDayAgo = Date.now() - 24 * 60 * 60 * 1000;
    const recent = allJobs.filter(job => job.timestamp > oneDayAgo).length;

    return {
      total: allJobs.length,
      recent,
      byJobName,
      oldestJob: oldestTimestamp === null ? null : new Date(oldestTimestamp),
    };
  }

  /**
   * Remove DLQ entries older than the configured `cleanupAge` (in hours).
   *
   * @returns How many entries were removed.
   */
  async cleanup(): Promise<number> {
    const ageInMs = this.config.cleanupAge * 60 * 60 * 1000;
    const cutoffTime = Date.now() - ageInMs;

    const jobs = await this.fetchDLQJobs();
    let removedCount = 0;

    for (const job of jobs) {
      if (job.timestamp < cutoffTime) {
        await job.remove();
        removedCount++;
      }
    }

    logger.info('DLQ cleanup completed', {
      removedCount,
      cleanupAge: `${this.config.cleanupAge} hours`,
    });

    return removedCount;
  }

  /**
   * Log an error when the number of DLQ entries has reached the configured
   * alert threshold.
   */
  private async checkAlertThreshold(): Promise<void> {
    const stats = await this.getStats();

    if (stats.total >= this.config.alertThreshold) {
      logger.error('DLQ alert threshold exceeded', {
        threshold: this.config.alertThreshold,
        currentCount: stats.total,
        byJobName: stats.byJobName,
      });
      // In a real implementation, this would trigger alerts
    }
  }

  /**
   * Return a flattened view of DLQ entries for inspection.
   *
   * @param limit Maximum number of entries to return (default 10).
   * @returns For each entry: the original job's id, name, payload and attempt
   *          count, the recorded error, and when it was moved to the DLQ.
   */
  async inspectFailedJobs(limit = 10): Promise<Array<{
    id: string;
    name: string;
    data: any;
    error: any;
    failedAt: string;
    attempts: number;
  }>> {
    const jobs = await this.fetchDLQJobs(limit);

    return jobs.map(job => ({
      id: job.data.originalJob.id,
      name: job.data.originalJob.name,
      data: job.data.originalJob.data,
      error: job.data.error,
      failedAt: job.data.movedToDLQAt,
      attempts: job.data.originalJob.attemptsMade,
    }));
  }

  /**
   * Close the DLQ's queue handle and forget all tracked failure counts.
   */
  async shutdown(): Promise<void> {
    await this.dlq.close();
    this.failureCount.clear();
  }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue