This commit is contained in:
Boki 2026-03-26 16:32:37 -04:00
parent deeb934526
commit eeed957fe1
5 changed files with 490 additions and 196 deletions

View file

@ -1,5 +1,6 @@
import { import {
BaseHandler, BaseHandler,
Disabled,
Handler, Handler,
Operation, Operation,
RateLimit, RateLimit,
@ -32,6 +33,7 @@ import { createEODOperationRegistry } from './shared';
* Operations can specify just a cost to use handler limits, or override with custom limits * Operations can specify just a cost to use handler limits, or override with custom limits
*/ */
@Handler('eod') @Handler('eod')
@Disabled()
@RateLimit({ @RateLimit({
limits: [ limits: [
{ points: 900, duration: 60 }, // 1000 points per minute { points: 900, duration: 60 }, // 1000 points per minute

View file

@ -1,5 +1,6 @@
import { import {
BaseHandler, BaseHandler,
Disabled,
Handler, Handler,
Operation, Operation,
ScheduledOperation, ScheduledOperation,
@ -7,6 +8,7 @@ import {
import { fetchExchanges, fetchExchangesAndSymbols, fetchSession, fetchSymbols } from './actions'; import { fetchExchanges, fetchExchangesAndSymbols, fetchSession, fetchSymbols } from './actions';
@Handler('ib') @Handler('ib')
@Disabled()
export class IbHandler extends BaseHandler { export class IbHandler extends BaseHandler {
constructor(services: any) { constructor(services: any) {
super(services); super(services);

View file

@ -103,6 +103,21 @@ export async function initializeAllHandlers(serviceContainer: IServiceContainer)
handlersWithSchedule: handlerRegistry.getAllHandlersWithSchedule().size, handlersWithSchedule: handlerRegistry.getAllHandlersWithSchedule().size,
}); });
// Initialize handlers that have onInit method
// We need to instantiate handlers and call their onInit
for (const HandlerClass of handlers) {
try {
// Create handler instance with service container
const handlerInstance = new HandlerClass(serviceContainer);
if (handlerInstance && typeof handlerInstance.onInit === 'function') {
const handlerName = (HandlerClass as any).__handlerName || HandlerClass.name;
logger.info(`Calling onInit for handler: ${handlerName}`);
await handlerInstance.onInit();
}
} catch (error) {
logger.error(`Failed to initialize handler ${HandlerClass.name}:`, error);
}
}
} }
} else { } else {
logger.error('Could not access DI container from service container'); logger.error('Could not access DI container from service container');

View file

@ -1,167 +1,355 @@
import { getRandomUserAgent } from '@stock-bot/utils'; import { getRandomUserAgent } from '@stock-bot/utils';
import * as cheerio from 'cheerio'; import * as cheerio from 'cheerio';
import { TE_CONFIG } from '../shared/config'; import { TE_CONFIG } from '../shared/config';
import type { TeHandler } from '../te.handler'; import type { TeHandler } from '../te.handler';
/** Milliseconds in one day, used to express crawl age in days. */
const MS_PER_DAY = 24 * 60 * 60 * 1000;

/** A URL crawled more recently than this many days ago is not crawled again. */
const RECRAWL_AFTER_DAYS = 30;

/**
 * Single-quoted scalar values looked for in the page source.
 * Each entry is [field name stored on the teUrls document, pattern whose first group is the value].
 */
const TE_SCALAR_PATTERNS: ReadonlyArray<readonly [string, RegExp]> = [
  ['teSymbol', /var TESymbol = '([^']+)'/],
  ['teLastUpdate', /TELastUpdate = '([^']+)'/],
  ['teChart', /TEChart = '([^']+)'/],
  ['teAlertsName', /var TEAlertsName\s*=\s*'([^']+)'/],
  ['teChartUrl', /TEChartUrl = '([^']+)'/],
  ['teCountry', /TECountry = '([^']+)'/],
  ['teCategory', /TECategory = '([^']+)'/],
  ['teType', /TEType = '([^']+)'/],
  ['teFrequency', /TEFrequency = '([^']+)'/],
  ['symbol', /symbol = '([^']+)'/],
  ['symbolType', /symbolType = '([^']+)'/],
  ['teChartsDatasource', /; var TEChartsDatasource = '([^']+)'/],
  ['teChartToken', /; var TEChartsToken = '([^']+)'/],
];

/** Paths added to the crawl frontier whenever the root page is crawled. */
const ROOT_SEED_URLS: readonly string[] = [
  '/united-states', '/china', '/eurozone', '/japan', '/united-kingdom',
  '/indicators', '/calendar', '/forecasts', '/countries',
  '/gdp', '/inflation', '/unemployment', '/interest-rate',
  '/stocks', '/bonds', '/commodity', '/currency',
];

/** Parse a JSON literal lifted from the page source; logs a warning and returns undefined if it is malformed. */
function parseEmbeddedJson(raw: string, label: string, logger: TeHandler['logger']): unknown {
  try {
    return JSON.parse(raw);
  } catch (e) {
    logger.warn(`Failed to parse ${label}:`, e);
    return undefined;
  }
}

/** Collect every TE value found in the page source into an object keyed by stored field name. */
function extractTeData(html: string, logger: TeHandler['logger']): Record<string, unknown> {
  const data: Record<string, unknown> = {};

  for (const [field, pattern] of TE_SCALAR_PATTERNS) {
    const value = html.match(pattern)?.[1];
    if (value) {
      data[field] = value;
    }
  }

  // Array values: the forecast capture is the list contents without brackets,
  // the charts-meta capture is the whole bracketed literal.
  const forecastBody = html.match(/TEForecast\s*=\s*\[([^\]]+)\]/)?.[1];
  if (forecastBody) {
    const teForecast = parseEmbeddedJson('[' + forecastBody + ']', 'TEForecast', logger);
    if (teForecast) {
      data.teForecast = teForecast;
    }
  }

  const chartsMetaLiteral = html.match(/TEChartsMeta = (\[[\s\S]*?\]);/)?.[1];
  if (chartsMetaLiteral) {
    const teChartsMeta = parseEmbeddedJson(chartsMetaLiteral, 'TEChartsMeta', logger);
    if (teChartsMeta) {
      data.teChartsMeta = teChartsMeta;
    }
  }

  return data;
}

/**
 * Crawl one site path: fetch it, extract TE values and links, record the result in the
 * `teUrls` collection, and schedule a `te-spider` job for every link not seen before.
 *
 * @param payload - `url` is the site-relative path to crawl; a missing payload or empty
 *   `url` means the root path `/`.
 * @returns The crawlable paths found on the page, or `null` when the path was crawled
 *   within the last 30 days or the page could not be fetched.
 */
export async function spiderUrl(this: TeHandler, payload: { url: string }): Promise<string[] | null> {
  const { logger, mongodb } = this;
  const urlPath = payload?.url || '/';
  const fullUrl = TE_CONFIG.MAIN_URL + urlPath;

  // 1. Check if already crawled recently (30 days)
  const record = await mongodb?.findOne('teUrls', { url: urlPath });
  if (record?.lastCrawled) {
    // Age is measured in days so it is directly comparable to the 30-day threshold.
    const daysSinceCrawl = (Date.now() - record.lastCrawled.getTime()) / MS_PER_DAY;
    if (daysSinceCrawl < RECRAWL_AFTER_DAYS) {
      logger.info(`Skipping ${urlPath} - crawled ${daysSinceCrawl.toFixed(1)} days ago`);
      return null;
    }
  }

  // 2. Fetch the page
  logger.info(`Crawling ${fullUrl}`);
  const html = await fetchPage.call(this, fullUrl);
  if (!html) {
    logger.error(`Failed to fetch ${fullUrl}`);
    // Mark as crawled anyway to avoid retrying immediately
    await mongodb?.updateOne(
      'teUrls',
      { url: urlPath },
      { $set: { lastCrawled: new Date(), foundLinks: 0, error: true } },
      { upsert: true }
    );
    return null;
  }

  // 3. Extract TE data from the HTML
  const teData = extractTeData(html, logger);
  if (Object.keys(teData).length > 0) {
    // Arrays are summarised by length to keep the log line short.
    logger.debug('Found TE data:', {
      ...teData,
      teForecast: Array.isArray(teData.teForecast) ? `[${teData.teForecast.length} values]` : undefined,
      teChartsMeta: Array.isArray(teData.teChartsMeta) ? `[${teData.teChartsMeta.length} items]` : undefined,
    });
  }

  // 4. Extract all links
  const $ = cheerio.load(html);
  const foundUrls = new Set<string>();

  // Add some seed URLs if this is the root page
  if (urlPath === '/') {
    ROOT_SEED_URLS.forEach(url => foundUrls.add(url));
  }

  $('a[href]').each((_, element) => {
    const href = $(element).attr('href');
    if (!href) {
      return;
    }

    // Convert to a site-relative path
    let path: string;
    if (href.startsWith('/')) {
      path = href;
    } else if (href.includes('tradingeconomics.com')) {
      try {
        path = new URL(href).pathname;
      } catch {
        return; // Invalid URL
      }
    } else {
      return; // Skip external links
    }

    // Drop query string and fragment
    path = path.split('?')[0].split('#')[0];

    if (shouldCrawl(path)) {
      foundUrls.add(path);
    }
  });

  const urls = Array.from(foundUrls);
  logger.info(`Found ${urls.length} URLs on ${urlPath}`);

  // 5. Update database: mark the current URL as crawled, with TE data if found
  const updateData: Record<string, unknown> = {
    lastCrawled: new Date(),
    foundLinks: urls.length,
    error: false,
    ...teData,
  };

  await mongodb?.updateOne(
    'teUrls',
    { url: urlPath },
    { $set: updateData },
    { upsert: true }
  );

  // Insert new URLs (without lastCrawled so they'll be picked up for crawling)
  const newUrls: string[] = [];
  for (const url of urls) {
    const result = await mongodb?.updateOne(
      'teUrls',
      { url },
      { $setOnInsert: { url, createdAt: new Date(), source: urlPath } },
      { upsert: true }
    );

    // Only schedule if it was actually inserted (not already existing)
    if ((result?.upsertedCount ?? 0) > 0) {
      newUrls.push(url);
    }
  }

  // 6. Schedule individual jobs for each new URL
  logger.info(`Scheduling ${newUrls.length} new URLs for crawling`);
  for (const url of newUrls) {
    // NOTE(review): replacing '/' with '-' maps e.g. '/a/b' and '/a-b' to the same jobId - confirm
    // whether the queue de-duplicates on jobId before relying on this.
    await this.scheduleOperation('te-spider', { url }, {
      jobId: `spider-${url.replace(/\//g, '-')}`,
      priority: 10,
      delay: Math.floor(Math.random() * 10000) // Spread requests over 10 seconds
    });
  }

  return urls;
}
/**
 * Fetch a page body through the handler's HTTP service, retrying on failure.
 *
 * Up to three attempts are made. Each attempt asks the handler's proxy service (when present)
 * for a proxy URL and sends a fresh random User-Agent. A thrown error and a non-OK response
 * both count as a failed attempt; before the next attempt the function waits
 * `1000 ms x attempt number`, so a rejecting server is not hit again immediately.
 *
 * @param url - Absolute URL to request.
 * @returns The response text, or `null` when all attempts failed.
 */
async function fetchPage(this: TeHandler, url: string): Promise<string | null> {
  const { http, proxy, logger } = this;
  const maxAttempts = 3;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      // Get a proxy URL for this request
      const proxyUrl = proxy?.getProxy();

      const response = await http.get(url, {
        proxy: proxyUrl,
        headers: {
          'User-Agent': getRandomUserAgent(),
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.5',
          'Accept-Encoding': 'gzip, deflate, br',
          'Connection': 'keep-alive',
          'Upgrade-Insecure-Requests': '1',
          'Cache-Control': 'max-age=0'
        }
      });

      if (response.ok) {
        return await response.text();
      }

      // Not OK: log the status and fall through to the shared backoff below
      logger.error(`HTTP ${response.status} for ${url}`);
    } catch (error) {
      logger.error(`Attempt ${attempt}/${maxAttempts} failed for ${url}:`, error);
    }

    // Linear backoff between attempts; nothing to wait for after the last one
    if (attempt < maxAttempts) {
      await new Promise(resolve => setTimeout(resolve, 1000 * attempt));
    }
  }

  return null;
}
/** Paths ending in one of these extensions are static assets, not pages. */
const STATIC_FILE_PATTERN =
  /\.(pdf|jpg|jpeg|png|gif|svg|css|js|ico|xml|rss|json|txt|csv|xlsx|xls|doc|docx|zip)$/i;

/** Maximum number of '/' characters a path may contain and still be crawled. */
const MAX_URL_DEPTH = 4;

/** Path fragments that mark non-content pages. */
const SKIP_PATTERNS: readonly RegExp[] = [
  /\/api\//,
  /\/login/,
  /\/register/,
  /\/logout/,
  /\/admin/,
  /\/search/,
  /\/print\//,
  /\/download\//,
  /\/embed\//,
  /\/widget\//,
  /\/stream\//,
  /\/rss\//,
  /#/,
];

/**
 * Decide whether a site-relative path is worth crawling.
 *
 * @param url - Path to test, e.g. `/united-states/gdp`.
 * @returns `false` for an empty path or `/`, for static-file extensions, for paths with more
 *   than four `/` characters, and for paths matching a skip pattern; `true` otherwise.
 */
function shouldCrawl(url: string): boolean {
  // Skip empty or root
  if (!url || url === '/') {
    return false;
  }

  // Skip static files
  if (STATIC_FILE_PATTERN.test(url)) {
    return false;
  }

  // Skip deep URLs (more than 4 levels); depth is the number of '/' in the path
  const depth = url.split('/').length - 1;
  if (depth > MAX_URL_DEPTH) {
    return false;
  }

  // Skip common non-content pages; accept everything else
  return !SKIP_PATTERNS.some(pattern => pattern.test(url));
}

View file

@ -1,31 +1,118 @@
import { import {
BaseHandler, BaseHandler,
Disabled, Disabled,
Handler, Handler,
ScheduledOperation ScheduledOperation
} from '@stock-bot/handlers'; } from '@stock-bot/handlers';
import type { DataIngestionServices } from '../../types'; import type { DataIngestionServices } from '../../types';
import { fetchCountries, spiderUrl } from './actions'; import { fetchCountries, spiderUrl } from './actions';
/**
 * Handler registered under the name 'te'.
 *
 * Exposes two scheduled operations backed by functions from './actions':
 * a weekly countries fetch (currently disabled) and the URL spider.
 */
@Handler('te')
export class TeHandler extends BaseHandler<DataIngestionServices> {
  constructor(services: any) {
    super(services);
  }

  /**
   * Initialize handler and create necessary indexes on the `teUrls` collection.
   *
   * Never throws: a failure to create one index is logged and the remaining indexes are
   * still attempted; any other error is logged and swallowed so the handler can continue.
   */
  async onInit(): Promise<void> {
    this.logger.info('Initializing TeHandler and creating indexes');

    if (!this.mongodb) {
      this.logger.warn('MongoDB not available, skipping index creation');
      return;
    }

    try {
      // Create indexes for teUrls collection
      const indexes = [
        // Compound index for finding URLs to crawl
        {
          indexSpec: { lastCrawled: 1, url: 1 },
          options: {
            name: 'crawl_status_idx',
            background: true
          }
        },
        // Unique index on URL to prevent duplicates
        {
          indexSpec: { url: 1 },
          options: {
            name: 'url_unique_idx',
            unique: true,
            background: true
          }
        },
        // Index for finding URLs by symbol
        {
          indexSpec: { teSymbol: 1 },
          options: {
            name: 'symbol_idx',
            sparse: true,
            background: true
          }
        },
        // Index for skip reason filtering
        {
          indexSpec: { skipReason: 1 },
          options: {
            name: 'skip_reason_idx',
            sparse: true,
            background: true
          }
        },
        // Compound index for efficient batch queries
        {
          indexSpec: { lastCrawled: 1, skipReason: 1 },
          options: {
            name: 'batch_query_idx',
            background: true
          }
        },
        // Index for finding URLs with chart data
        {
          indexSpec: { teChartUrl: 1 },
          options: {
            name: 'chart_url_idx',
            sparse: true,
            background: true
          }
        }
      ];

      for (const index of indexes) {
        try {
          await this.mongodb.createIndex('teUrls', index.indexSpec, index.options);
          this.logger.info(`Created/verified index: ${index.options.name}`);
        } catch (error) {
          // Surface the failure (e.g. conflicting definition, duplicate keys under a unique
          // index) instead of hiding it at debug level; keep going with the other indexes.
          this.logger.warn(`Failed to create index ${index.options.name}:`, error);
        }
      }

      // Check collection stats
      const count = await this.mongodb.countDocuments('teUrls', {});
      this.logger.info(`TeUrls collection has ${count} documents`);
    } catch (error) {
      this.logger.error('Error creating indexes for TeHandler:', error);
      // Don't throw - allow handler to continue even if indexes fail
    }
  }

  @ScheduledOperation('te-countries', '0 0 * * 0', {
    priority: 5,
    description: 'Fetch and update Trading Economics countries data',
    immediately: false,
  })
  @Disabled()
  fetchCountries = fetchCountries;

  @ScheduledOperation('te-spider', '* * * * *', {
    priority: 5,
    description: 'Spider Trading Economics URLs for data extraction (every minute)',
    immediately: true,
  })
  spiderUrlSchedule = spiderUrl;
}