Subject: [PATCH] added crawler for scheduled jobs
a/apps/stock/data-ingestion/src/handlers/te/actions/index.ts +++ b/apps/stock/data-ingestion/src/handlers/te/actions/index.ts @@ -1,4 +1,5 @@ // Export all action functions here export * from './fetch-countries.action'; export * from './spider.action'; +export * from './crawl-scheduler.action'; diff --git a/apps/stock/data-ingestion/src/handlers/te/te.handler.ts b/apps/stock/data-ingestion/src/handlers/te/te.handler.ts index 8b12509..0a3a9a1 100644 --- a/apps/stock/data-ingestion/src/handlers/te/te.handler.ts +++ b/apps/stock/data-ingestion/src/handlers/te/te.handler.ts @@ -5,7 +5,7 @@ import { ScheduledOperation } from '@stock-bot/handlers'; import type { DataIngestionServices } from '../../types'; -import { fetchCountries, spiderUrl } from './actions'; +import { crawlScheduler, fetchCountries, spiderUrl } from './actions'; @Handler('te') export class TeHandler extends BaseHandler { @@ -115,4 +115,11 @@ export class TeHandler extends BaseHandler { immediately: true, }) spiderUrlSchedule = spiderUrl; + + @ScheduledOperation('te-crawl-scheduler', '*/5 * * * *', { + priority: 8, + description: 'Schedule spider jobs for stale/uncrawled URLs (every 5 min)', + immediately: true, + }) + crawlScheduler = crawlScheduler; } \ No newline at end of file