adding data-services

This commit is contained in:
Bojan Kucera 2025-06-03 07:42:48 -04:00
parent e3bfd05b90
commit 405b818c86
139 changed files with 55943 additions and 416 deletions

View file

@@ -0,0 +1,293 @@
import { EventBus } from '@stock-bot/event-bus';
import { logger } from '@stock-bot/utils';
import { DataPipeline, PipelineStatus, PipelineJob, JobStatus } from '../types/DataPipeline';
import { DataIngestionService } from '../services/DataIngestionService';
import { DataTransformationService } from '../services/DataTransformationService';
import { DataValidationService } from '../services/DataValidationService';
import { DataQualityService } from '../services/DataQualityService';
import { PipelineScheduler } from './PipelineScheduler';
import { JobQueue } from './JobQueue';
/**
 * Coordinates the full lifecycle of data pipelines: creation, scheduling,
 * on-demand execution, and step-by-step job processing (ingestion ->
 * transformation -> validation -> quality checks). Publishes lifecycle
 * events on the shared event bus so other services can react.
 */
export class DataPipelineOrchestrator {
  private eventBus: EventBus;
  private scheduler: PipelineScheduler;
  private jobQueue: JobQueue;
  // In-memory registries only; loadPipelines() is the hook for persistent storage.
  // NOTE(review): completed/failed jobs are never evicted from runningJobs, so
  // this map grows without bound in a long-lived process — consider pruning.
  private pipelines: Map<string, DataPipeline> = new Map();
  private runningJobs: Map<string, PipelineJob> = new Map();

  constructor(
    private ingestionService: DataIngestionService,
    private transformationService: DataTransformationService,
    private validationService: DataValidationService,
    private qualityService: DataQualityService
  ) {
    this.eventBus = new EventBus();
    this.scheduler = new PipelineScheduler(this);
    this.jobQueue = new JobQueue(this);
  }

  /**
   * Connects the event bus, scheduler and job queue, subscribes to pipeline
   * and job event topics, then loads any persisted pipelines.
   */
  async initialize(): Promise<void> {
    logger.info('🔄 Initializing Data Pipeline Orchestrator...');
    await this.eventBus.initialize();
    await this.scheduler.initialize();
    await this.jobQueue.initialize();
    // Subscribe to pipeline events
    await this.eventBus.subscribe('data.pipeline.*', this.handlePipelineEvent.bind(this));
    await this.eventBus.subscribe('data.job.*', this.handleJobEvent.bind(this));
    // Load existing pipelines
    await this.loadPipelines();
    logger.info('✅ Data Pipeline Orchestrator initialized');
  }

  /**
   * Registers a new pipeline. The generated id and timestamps are assigned
   * here, and the status always starts as DRAFT (any status supplied by the
   * caller is intentionally overridden).
   *
   * @param pipeline - Pipeline definition without server-assigned fields.
   * @returns The stored pipeline, including its generated id.
   */
  async createPipeline(pipeline: Omit<DataPipeline, 'id' | 'createdAt' | 'updatedAt'>): Promise<DataPipeline> {
    const pipelineWithId: DataPipeline = {
      ...pipeline,
      id: this.generatePipelineId(),
      status: PipelineStatus.DRAFT,
      createdAt: new Date(),
      updatedAt: new Date(),
    };
    this.pipelines.set(pipelineWithId.id, pipelineWithId);
    await this.eventBus.publish('data.pipeline.created', {
      pipelineId: pipelineWithId.id,
      pipeline: pipelineWithId,
    });
    logger.info(`📋 Created pipeline: ${pipelineWithId.name} (${pipelineWithId.id})`);
    return pipelineWithId;
  }

  /**
   * Creates a job for an ACTIVE pipeline and enqueues it for asynchronous
   * execution by the job queue.
   *
   * @param pipelineId - Id of a previously created pipeline.
   * @param parameters - Optional parameters forwarded to every pipeline step.
   * @returns The queued job (status PENDING until the queue picks it up).
   * @throws Error if the pipeline does not exist or is not ACTIVE.
   */
  async runPipeline(pipelineId: string, parameters?: Record<string, any>): Promise<PipelineJob> {
    const pipeline = this.pipelines.get(pipelineId);
    if (!pipeline) {
      throw new Error(`Pipeline not found: ${pipelineId}`);
    }
    if (pipeline.status !== PipelineStatus.ACTIVE) {
      throw new Error(`Pipeline is not active: ${pipeline.status}`);
    }
    const job: PipelineJob = {
      id: this.generateJobId(),
      pipelineId,
      status: JobStatus.PENDING,
      parameters: parameters || {},
      createdAt: new Date(),
      startedAt: null,
      completedAt: null,
      error: null,
      metrics: {
        recordsProcessed: 0,
        recordsSuccessful: 0,
        recordsFailed: 0,
        processingTimeMs: 0,
      },
    };
    this.runningJobs.set(job.id, job);
    // Queue the job for execution
    await this.jobQueue.enqueueJob(job);
    await this.eventBus.publish('data.job.queued', {
      jobId: job.id,
      pipelineId,
      job,
    });
    logger.info(`🚀 Queued pipeline job: ${job.id} for pipeline: ${pipeline.name}`);
    return job;
  }

  /**
   * Runs all configured steps of a job's pipeline in order, updating the
   * job's status, timestamps and metrics, and publishing started /
   * completed / failed events. Called by the job queue worker.
   *
   * @throws Error if the pipeline is missing; rethrows any step failure so
   *         the queue can apply its retry policy.
   */
  async executePipelineJob(job: PipelineJob): Promise<void> {
    const pipeline = this.pipelines.get(job.pipelineId);
    if (!pipeline) {
      throw new Error(`Pipeline not found: ${job.pipelineId}`);
    }
    const startTime = Date.now();
    job.status = JobStatus.RUNNING;
    job.startedAt = new Date();
    await this.eventBus.publish('data.job.started', {
      jobId: job.id,
      pipelineId: job.pipelineId,
      job,
    });
    try {
      logger.info(`⚙️ Executing pipeline job: ${job.id}`);
      // Execute pipeline steps in their fixed order; each step is a no-op
      // when the pipeline does not configure it.
      await this.executeIngestionStep(pipeline, job);
      await this.executeTransformationStep(pipeline, job);
      await this.executeValidationStep(pipeline, job);
      await this.executeQualityChecks(pipeline, job);
      // Complete the job
      job.status = JobStatus.COMPLETED;
      job.completedAt = new Date();
      job.metrics.processingTimeMs = Date.now() - startTime;
      await this.eventBus.publish('data.job.completed', {
        jobId: job.id,
        pipelineId: job.pipelineId,
        job,
      });
      logger.info(`✅ Pipeline job completed: ${job.id} in ${job.metrics.processingTimeMs}ms`);
    } catch (error) {
      job.status = JobStatus.FAILED;
      job.completedAt = new Date();
      job.error = error instanceof Error ? error.message : 'Unknown error';
      job.metrics.processingTimeMs = Date.now() - startTime;
      await this.eventBus.publish('data.job.failed', {
        jobId: job.id,
        pipelineId: job.pipelineId,
        job,
        error: job.error,
      });
      logger.error(`❌ Pipeline job failed: ${job.id}`, error);
      // Rethrow so the queue's retry/backoff policy sees the failure.
      throw error;
    }
  }

  /** Folds a step's record counters into the job's running metrics. */
  private accumulateStepMetrics(
    job: PipelineJob,
    result: { recordsProcessed: number; recordsSuccessful: number; recordsFailed: number }
  ): void {
    job.metrics.recordsProcessed += result.recordsProcessed;
    job.metrics.recordsSuccessful += result.recordsSuccessful;
    job.metrics.recordsFailed += result.recordsFailed;
  }

  /** Runs the ingestion step, if configured, and accumulates its metrics. */
  private async executeIngestionStep(pipeline: DataPipeline, job: PipelineJob): Promise<void> {
    if (!pipeline.steps.ingestion) return;
    logger.info(`📥 Executing ingestion step for job: ${job.id}`);
    const result = await this.ingestionService.ingestData(
      pipeline.steps.ingestion,
      job.parameters
    );
    this.accumulateStepMetrics(job, result);
  }

  /** Runs the transformation step, if configured, and accumulates its metrics. */
  private async executeTransformationStep(pipeline: DataPipeline, job: PipelineJob): Promise<void> {
    if (!pipeline.steps.transformation) return;
    logger.info(`🔄 Executing transformation step for job: ${job.id}`);
    const result = await this.transformationService.transformData(
      pipeline.steps.transformation,
      job.parameters
    );
    this.accumulateStepMetrics(job, result);
  }

  /** Runs the validation step, if configured, and accumulates its metrics. */
  private async executeValidationStep(pipeline: DataPipeline, job: PipelineJob): Promise<void> {
    if (!pipeline.steps.validation) return;
    logger.info(`✅ Executing validation step for job: ${job.id}`);
    const result = await this.validationService.validateData(
      pipeline.steps.validation,
      job.parameters
    );
    this.accumulateStepMetrics(job, result);
  }

  /** Runs the quality checks, if configured. This step reports no record metrics. */
  private async executeQualityChecks(pipeline: DataPipeline, job: PipelineJob): Promise<void> {
    if (!pipeline.steps.qualityChecks) return;
    logger.info(`🔍 Executing quality checks for job: ${job.id}`);
    await this.qualityService.runQualityChecks(
      pipeline.steps.qualityChecks,
      job.parameters
    );
  }

  /**
   * Registers a cron schedule for a pipeline and records it on the pipeline
   * object (replacing any previous schedule).
   *
   * @param cronExpression - Standard cron expression evaluated by the scheduler.
   * @throws Error if the pipeline does not exist.
   */
  async schedulePipeline(pipelineId: string, cronExpression: string): Promise<void> {
    const pipeline = this.pipelines.get(pipelineId);
    if (!pipeline) {
      throw new Error(`Pipeline not found: ${pipelineId}`);
    }
    await this.scheduler.schedulePipeline(pipelineId, cronExpression);
    pipeline.schedule = {
      cronExpression,
      enabled: true,
      lastRun: null,
      nextRun: this.scheduler.getNextRunTime(cronExpression),
    };
    await this.eventBus.publish('data.pipeline.scheduled', {
      pipelineId,
      cronExpression,
    });
    logger.info(`📅 Scheduled pipeline: ${pipeline.name} with cron: ${cronExpression}`);
  }

  // Pipeline CRUD operations

  /** Returns the pipeline with the given id, or undefined if unknown. */
  getPipeline(pipelineId: string): DataPipeline | undefined {
    return this.pipelines.get(pipelineId);
  }

  /** Returns all registered pipelines. */
  listPipelines(): DataPipeline[] {
    return Array.from(this.pipelines.values());
  }

  /** Returns the job with the given id, or undefined if unknown. */
  getJob(jobId: string): PipelineJob | undefined {
    return this.runningJobs.get(jobId);
  }

  /** Returns all known jobs, optionally filtered to a single pipeline. */
  listJobs(pipelineId?: string): PipelineJob[] {
    const jobs = Array.from(this.runningJobs.values());
    return pipelineId ? jobs.filter(job => job.pipelineId === pipelineId) : jobs;
  }

  /** Event-bus callback for 'data.pipeline.*' topics; currently log-only. */
  private async handlePipelineEvent(event: any): Promise<void> {
    logger.debug('📨 Received pipeline event:', event);
    // Handle pipeline-level events
  }

  /** Event-bus callback for 'data.job.*' topics; currently log-only. */
  private async handleJobEvent(event: any): Promise<void> {
    logger.debug('📨 Received job event:', event);
    // Handle job-level events
  }

  /** Placeholder for restoring pipelines from persistent storage on startup. */
  private async loadPipelines(): Promise<void> {
    // In a real implementation, load pipelines from persistent storage
    logger.info('📂 Loading existing pipelines...');
  }

  /** Generates a timestamp+random pipeline id (not collision-proof; fine for logs/maps). */
  private generatePipelineId(): string {
    // slice() replaces the deprecated substr(); same 9-char random suffix.
    return `pipeline_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /** Generates a timestamp+random job id (not collision-proof; fine for logs/maps). */
  private generateJobId(): string {
    // slice() replaces the deprecated substr(); same 9-char random suffix.
    return `job_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }

  /** Stops the scheduler and queue, then disconnects the event bus. */
  async shutdown(): Promise<void> {
    logger.info('🔄 Shutting down Data Pipeline Orchestrator...');
    await this.scheduler.shutdown();
    await this.jobQueue.shutdown();
    await this.eventBus.disconnect();
    logger.info('✅ Data Pipeline Orchestrator shutdown complete');
  }
}

View file

@@ -0,0 +1,77 @@
import Queue from 'bull';
import { logger } from '@stock-bot/utils';
import { PipelineJob } from '../types/DataPipeline';
import { DataPipelineOrchestrator } from './DataPipelineOrchestrator';
/**
 * Redis-backed (Bull) work queue for pipeline jobs. Limits concurrency,
 * retries failures with exponential backoff, and delegates actual execution
 * back to the orchestrator.
 */
export class JobQueue {
  private queue: Queue.Queue;

  constructor(private orchestrator: DataPipelineOrchestrator) {
    this.queue = new Queue('data-pipeline-jobs', {
      redis: {
        host: process.env.REDIS_HOST || 'localhost',
        port: parseInt(process.env.REDIS_PORT || '6379'),
      },
    });
  }

  /** Registers the job processor and lifecycle logging handlers. */
  async initialize(): Promise<void> {
    logger.info('🔄 Initializing Job Queue...');
    // Process jobs with a maximum of 5 concurrent jobs
    this.queue.process('pipeline-job', 5, async (job) => {
      // job.data round-trips through Redis as JSON; Date fields arrive as
      // strings — assumes downstream code tolerates that. TODO confirm.
      const pipelineJob: PipelineJob = job.data;
      await this.orchestrator.executePipelineJob(pipelineJob);
    });
    // Handle job events
    this.queue.on('completed', (job) => {
      logger.info(`✅ Job completed: ${job.id}`);
    });
    this.queue.on('failed', (job, error) => {
      logger.error(`❌ Job failed: ${job.id}`, error);
    });
    this.queue.on('stalled', (job) => {
      logger.warn(`⚠️ Job stalled: ${job.id}`);
    });
    logger.info('✅ Job Queue initialized');
  }

  /**
   * Adds a job to the queue under the pipeline job's own id, with bounded
   * history retention and a 3-attempt exponential-backoff retry policy.
   */
  async enqueueJob(job: PipelineJob): Promise<void> {
    await this.queue.add('pipeline-job', job, {
      jobId: job.id,
      removeOnComplete: 100, // Keep last 100 completed jobs
      removeOnFail: 50, // Keep last 50 failed jobs
      attempts: 3, // Retry failed jobs up to 3 times
      backoff: {
        type: 'exponential',
        delay: 2000,
      },
    });
    logger.info(`📤 Enqueued job: ${job.id}`);
  }

  /**
   * Returns the number of waiting, active, completed and failed jobs.
   * Uses Bull's getJobCounts() so Redis reports counts directly instead of
   * materializing every job object just to read array lengths.
   */
  async getJobStats(): Promise<any> {
    const counts = await this.queue.getJobCounts();
    return {
      waiting: counts.waiting,
      active: counts.active,
      completed: counts.completed,
      failed: counts.failed,
    };
  }

  /** Closes the underlying Bull queue and its Redis connections. */
  async shutdown(): Promise<void> {
    logger.info('🔄 Shutting down Job Queue...');
    await this.queue.close();
    logger.info('✅ Job Queue shutdown complete');
  }
}

View file

@@ -0,0 +1,69 @@
import { CronJob } from 'cron';
import { logger } from '@stock-bot/utils';
import { DataPipelineOrchestrator } from './DataPipelineOrchestrator';
/**
 * Cron-based scheduler that triggers pipeline runs through the orchestrator.
 * Keeps one active CronJob per pipeline id; rescheduling a pipeline replaces
 * its previous cron job.
 */
export class PipelineScheduler {
  private scheduledJobs: Map<string, CronJob> = new Map();

  constructor(private orchestrator: DataPipelineOrchestrator) {}

  /** No setup work beyond logging; schedules are registered on demand. */
  async initialize(): Promise<void> {
    logger.info('🔄 Initializing Pipeline Scheduler...');
    logger.info('✅ Pipeline Scheduler initialized');
  }

  /**
   * Registers (or replaces) a cron schedule for the given pipeline. Each
   * tick triggers a pipeline run; failures are logged, never thrown, so the
   * cron timer keeps firing.
   */
  async schedulePipeline(pipelineId: string, cronExpression: string): Promise<void> {
    // Replace any schedule already registered for this pipeline.
    if (this.scheduledJobs.has(pipelineId)) {
      this.cancelSchedule(pipelineId);
    }

    const onTick = async () => {
      try {
        logger.info(`⏰ Scheduled execution triggered for pipeline: ${pipelineId}`);
        await this.orchestrator.runPipeline(pipelineId);
      } catch (error) {
        logger.error(`❌ Scheduled pipeline execution failed: ${pipelineId}`, error);
      }
    };

    // Args: expression, tick handler, no onComplete, start immediately, UTC timezone.
    const job = new CronJob(cronExpression, onTick, null, true, 'UTC');

    this.scheduledJobs.set(pipelineId, job);
    logger.info(`📅 Scheduled pipeline ${pipelineId} with cron: ${cronExpression}`);
  }

  /** Stops and forgets the schedule for a pipeline; no-op if none exists. */
  cancelSchedule(pipelineId: string): void {
    const existing = this.scheduledJobs.get(pipelineId);
    if (!existing) {
      return;
    }
    existing.stop();
    this.scheduledJobs.delete(pipelineId);
    logger.info(`🚫 Cancelled schedule for pipeline: ${pipelineId}`);
  }

  /**
   * Computes the next fire time of a cron expression via a throwaway
   * (never-started) CronJob.
   * NOTE(review): nextDate().toDate() matches the cron v2 (moment) API; cron
   * v3+ returns a luxon DateTime exposing toJSDate() instead — confirm the
   * installed version.
   */
  getNextRunTime(cronExpression: string): Date {
    const probe = new CronJob(cronExpression);
    return probe.nextDate().toDate();
  }

  /** Ids of all pipelines that currently have an active schedule. */
  getScheduledPipelines(): string[] {
    return [...this.scheduledJobs.keys()];
  }

  /** Stops every active cron job and clears the schedule registry. */
  async shutdown(): Promise<void> {
    logger.info('🔄 Shutting down Pipeline Scheduler...');
    for (const [id, cronJob] of this.scheduledJobs) {
      cronJob.stop();
      logger.info(`🚫 Stopped scheduled job for pipeline: ${id}`);
    }
    this.scheduledJobs.clear();
    logger.info('✅ Pipeline Scheduler shutdown complete');
  }
}