work on spider for te

This commit is contained in:
Boki 2025-07-06 15:26:47 -04:00
parent 505565a09e
commit 95b1381480
17 changed files with 485 additions and 28 deletions

View file

@ -78,7 +78,7 @@
"db": 1
},
"workers": 5,
"concurrency": 5,
"concurrency": 2,
"enableScheduledJobs": true,
"defaultJobOptions": {
"attempts": 3,

View file

@ -0,0 +1 @@
../../../node_modules

View file

@ -27,7 +27,8 @@
"@stock-bot/browser": "*",
"@stock-bot/proxy": "*",
"hono": "^4.0.0",
"pako": "^2.1.0"
"pako": "^2.1.0",
"cheerio": "^1.0.0"
},
"devDependencies": {
"typescript": "^5.0.0"

View file

@ -33,6 +33,7 @@ import { crawlIntradayData, scheduleIntradayCrawls } from './actions/intraday-cr
import { createQMOperationRegistry } from './shared/operation-provider';
@Handler('qm')
@Disabled() // Disable by default, enable specific operations as needed
export class QMHandler extends BaseHandler<DataIngestionServices> {
public operationRegistry: OperationRegistry;

View file

@ -0,0 +1,227 @@
import { getRandomUserAgent } from '@stock-bot/utils';
import * as cheerio from 'cheerio';
import { TE_CONFIG } from '../shared/config';
import type { TeCountry } from '../shared/types';
import type { TeHandler } from '../te.handler';
export async function fetchCountries(this: TeHandler): Promise<TeCountry[] | null> {
const { logger, mongodb } = this;
try {
// 1. Fetch the HTML page
const reqInfo = {
proxy: this.proxy.getProxy(),
headers: {
'User-Agent': getRandomUserAgent(),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
},
}
const response = await fetch(TE_CONFIG.COUNTRIES_URL, reqInfo);
logger.debug('Response status:', {
status: response.status,
statusText: response.statusText,
url: response.url
});
if (!response.ok) {
throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
}
const html = await response.text();
logger.info('Fetched HTML length:', { length: html.length });
// 2. Parse HTML to extract country data
const $ = cheerio.load(html);
const countries: TeCountry[] = [];
// Look for country links - they typically have a pattern like /country-name
// Trading Economics groups countries by region in the page
$('.list-group-item, a[href^="/"]').each((_, element) => {
const $el = $(element);
// Try to extract country information
let name: string | undefined;
let url: string | undefined;
let region: string | undefined;
// Check if it's a direct link
if ($el.is('a')) {
const href = $el.attr('href');
const text = $el.text().trim();
console.log(href)
// Filter for country URLs (they don't contain special paths like /indicators, /calendar, etc.)
if (href && href.startsWith('/') && !href.includes('/') && text) {
name = text;
url = href;
}
} else {
// Check for links within table rows
const $link = $el.find('a[href^="/"]').first();
if ($link.length) {
const href = $link.attr('href');
const text = $link.text().trim();
if (href && text && !href.includes('/indicators') && !href.includes('/calendar')) {
name = text;
url = href;
// Try to get region from parent elements
const $regionHeader = $el.closest('.region-section, .country-group').find('h2, h3, .region-title').first();
if ($regionHeader.length) {
region = $regionHeader.text().trim();
}
}
}
}
// Add to countries array if we found valid data
if (name && url) {
// Extract country code from URL if possible (e.g., /united-states -> US)
const code = extractCountryCode(url, name);
countries.push({
name,
code,
url: `https://tradingeconomics.com${url}`,
region,
updated_at: new Date(),
});
}
});
// Remove duplicates based on name
const uniqueCountries = Array.from(
new Map(countries.map(c => [c.name, c])).values()
);
if (uniqueCountries.length === 0) {
throw new Error('No countries found in HTML');
}
logger.info('Extracted countries from HTML', {
count: uniqueCountries.length,
byRegion: groupCountriesByRegion(uniqueCountries),
});
// 3. Save to MongoDB
try {
console.log( uniqueCountries)
if (uniqueCountries.length > 0) {
const result = await mongodb?.batchUpsert('teCountries', uniqueCountries, ['code']);
logger.info('Countries saved to MongoDB', {
matched: result.matchedCount,
modified: result.modifiedCount,
upserted: result.upsertedCount,
});
}
} catch (dbError) {
logger.error('Failed to save countries to MongoDB', { error: dbError });
throw dbError;
}
return uniqueCountries;
} catch (error) {
logger.error('Failed to fetch Trading Economics countries', {
error: error instanceof Error ? error.message : String(error),
stack: error instanceof Error ? error.stack : undefined,
});
return null;
}
}
function extractCountryCode(url: string, name: string): string | undefined {
// Common country code mappings
const countryCodeMap: Record<string, string> = {
'united-states': 'US',
'united-kingdom': 'GB',
'euro-area': 'EU',
'china': 'CN',
'japan': 'JP',
'germany': 'DE',
'france': 'FR',
'italy': 'IT',
'spain': 'ES',
'canada': 'CA',
'australia': 'AU',
'south-korea': 'KR',
'india': 'IN',
'brazil': 'BR',
'russia': 'RU',
'mexico': 'MX',
'indonesia': 'ID',
'netherlands': 'NL',
'saudi-arabia': 'SA',
'turkey': 'TR',
'switzerland': 'CH',
'poland': 'PL',
'sweden': 'SE',
'belgium': 'BE',
'argentina': 'AR',
'ireland': 'IE',
'austria': 'AT',
'norway': 'NO',
'israel': 'IL',
'singapore': 'SG',
'denmark': 'DK',
'egypt': 'EG',
'philippines': 'PH',
'finland': 'FI',
'chile': 'CL',
'pakistan': 'PK',
'romania': 'RO',
'new-zealand': 'NZ',
'greece': 'GR',
'iraq': 'IQ',
'portugal': 'PT',
'czech-republic': 'CZ',
'vietnam': 'VN',
'peru': 'PE',
'colombia': 'CO',
'malaysia': 'MY',
'ukraine': 'UA',
'hungary': 'HU',
'kuwait': 'KW',
'morocco': 'MA',
'slovakia': 'SK',
'kenya': 'KE',
'puerto-rico': 'PR',
'ecuador': 'EC',
'ethiopia': 'ET',
'dominican-republic': 'DO',
'luxembourg': 'LU',
'oman': 'OM',
'guatemala': 'GT',
'bulgaria': 'BG',
'ghana': 'GH',
'tanzania': 'TZ',
'turkmenistan': 'TM',
'croatia': 'HR',
'costa-rica': 'CR',
'lebanon': 'LB',
'slovenia': 'SI',
'lithuania': 'LT',
'serbia': 'RS',
'panama': 'PA',
};
// Clean URL to get country slug
const slug = url.replace(/^\//, '').toLowerCase();
return countryCodeMap[slug];
}
function groupCountriesByRegion(countries: TeCountry[]): Record<string, number> {
const groups: Record<string, number> = {};
for (const country of countries) {
const region = country.region || 'Unknown';
groups[region] = (groups[region] || 0) + 1;
}
return groups;
}

View file

@ -1,2 +1,4 @@
// Export all action functions here
// export * from './example.action';
export * from './fetch-countries.action';
export * from './spider.action';

View file

@ -0,0 +1,140 @@
import { getRandomUserAgent } from '@stock-bot/utils';
import * as cheerio from 'cheerio';
import { TE_CONFIG } from '../shared/config';
import type { TeHandler } from '../te.handler';
export async function spiderUrl(this: TeHandler, payload: { url: string }): Promise<string[] | null> {
const { logger, mongodb } = this;
const reqUrl = payload && payload.url ? TE_CONFIG.MAIN_URL + payload.url : TE_CONFIG.MAIN_URL;
this.logger.info(`Spiderring URL: ${reqUrl}`, {reqUrl});
const mongoRecord = await mongodb?.findOne('teUrls', { url: payload?.url || '/' });
if(payload && payload.url && mongoRecord && mongoRecord.lastCrawled && mongoRecord.lastCrawled.getTime() > Date.now() - 7* 24 * 60 * 60 * 1000) {
this.logger.info(`Skipping URL ${reqUrl} as it was already crawled in the last 24 hours`);
return null; // Skip if already crawled in the last 24 hours
}
if (!payload) {
const oneDayAgo = new Date(Date.now() - 7 * 24 * 60 * 60 * 1000);
const records = await mongodb?.find('teUrls', {
$or: [
{ lastCrawled: { $lt: oneDayAgo } }, // Crawled more than 24 hours ago
{ lastCrawled: { $exists: false } } // Never crawled
]
});
this.logger.info(`Found ${records?.length || 0} records to process`);
for (const record of records || []) {
await this.scheduleOperation('te-spider', {
url: record.url,
}, {
jobId: `te-spider-${record.url}`,
priority: 5, // Lower priority than financial data
});
}
}
try {
// 1. Fetch the HTML page
const reqInfo = {
proxy: 'http://5.79.66.2:13010',//this.proxy.getProxy(),
headers: {
'User-Agent': getRandomUserAgent(),
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate, br',
},
}
const response = await fetch(reqUrl, reqInfo);
logger.debug('Response status:', {
status: response.status,
statusText: response.statusText,
url: response.url
});
if (!response.ok) {
throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
}
const html = await response.text();
let match = html.match(/TESymbol = '([^']+)'/);
const teSymbol = match ? match[1] : undefined;
match = html.match(/;TELastUpdate = '([^']+)'/);
const teLastUpdate = match ? match[1] : undefined;
match = html.match(/; var TEChartsDatasource = '([^']+)'/);
const teChartUrl = match ? match[1] : undefined;
match = html.match(/; var TEChartsToken = '([^']+)'/);
const teChartToken = match ? match[1] : undefined;
console.log(teSymbol, teLastUpdate, teChartUrl, teChartToken);
const $ = cheerio.load(html);
const urls: string[] = [];
$('.list-group-item, a[href^="/"]').each((_, element) => {
const $el = $(element);
let url: string | undefined;
if ($el.is('a')) {
const href = $el.attr('href');
if (href && href.startsWith('/') && !href.includes('.aspx')) {
url = href;
}
}
if (url && urls.indexOf(url) === -1) {
urls.push(url);
}
});
if (urls.length === 0) {
throw new Error('No urls found in HTML');
}
// 3. Save to MongoDB
try {
if (urls.length > 0) {
const urlMap: {url: string, lastCrawled?: Date, teSymbol? : string, teLastUpdate? : string, teChartUrl? : string, teChartToken? : string}[] = urls.map(url => ({url}));
if( payload && payload.url) {
urlMap.push({
url: payload.url,
lastCrawled: new Date(),
teSymbol,
teLastUpdate,
teChartUrl,
teChartToken,})
}else {
urlMap.push({url: '/', lastCrawled: new Date()})
}
const result = await mongodb?.batchUpsert('teUrls', urlMap, ['url']);
logger.info('TE URLs saved to MongoDB', {
matched: result.matchedCount,
modified: result.modifiedCount,
upserted: result.upsertedCount,
});
}
} catch (dbError) {
logger.error('Failed to save urls to MongoDB', { error: dbError });
throw dbError;
}
for (const url of urls) {
this.scheduleOperation('te-spider', {
url: url,
}, {
jobId: `te-spider-${url}`,
priority: 5, // Lower priority than financial data
})
}
return urls;
} catch (error) {
logger.error(`Failed to fetch Trading Economics URLs ${reqUrl}`, {
error: error instanceof Error ? error.message : String(error),
stack: error instanceof Error ? error.stack : undefined,
});
return null;
}
}

View file

@ -1,6 +1,9 @@
export const TE_CONFIG = {
// Add configuration constants here
API_URL: 'https://api.example.com',
MAIN_URL: 'https://tradingeconomics.com',
COUNTRIES_URL: 'https://tradingeconomics.com/countries',
REQUEST_TIMEOUT: 30000,
USER_AGENT: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
KEY: 'tradingeconomics-charts-core-api-key',
};

View file

@ -10,4 +10,13 @@ export interface TeResponse {
data: TeData[];
status: string;
timestamp: Date;
}
export interface TeCountry {
name: string;
code?: string;
region?: string;
url?: string;
updated_at: Date;
created_at?: Date;
}

View file

@ -1,29 +1,31 @@
import {
BaseHandler,
Disabled,
Handler,
Operation,
ScheduledOperation,
ScheduledOperation
} from '@stock-bot/handlers';
import type { DataIngestionServices } from '../../types';
import { fetchCountries, spiderUrl } from './actions';
@Handler('te')
export class TeHandler extends BaseHandler {
@Disabled()
export class TeHandler extends BaseHandler<DataIngestionServices> {
constructor(services: any) {
super(services);
}
@Operation('example-operation')
async exampleOperation(): Promise<unknown> {
this.logger.info('TE handler example operation executed');
return { success: true };
}
@ScheduledOperation('te-scheduled-job', '0 0 * * *', {
@ScheduledOperation('te-countries', '0 0 * * 0', {
priority: 5,
description: 'Daily TE handler scheduled job',
immediately: false,
description: 'Fetch and update Trading Economics countries data',
immediately: true,
})
async scheduledJob(): Promise<unknown> {
this.logger.info('TE handler scheduled job executed');
return this.exampleOperation();
}
@Disabled()
fetchCountries = fetchCountries;
@ScheduledOperation('te-spider', '0 0 * * 0', {
priority: 5,
description: 'Fetch and update Trading Economics countries data',
immediately: true,
})
spiderUrlSchedule = spiderUrl;
}

View file

@ -14,7 +14,6 @@
{ "path": "./data-pipeline" },
{ "path": "./web-api" },
{ "path": "./web-app" },
{ "path": "./engine"},
{ "path": "./orchestrator"}
]
}

View file

@ -7,10 +7,11 @@
"module": "ESNext",
"skipLibCheck": true,
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
// "allowImportingTsExtensions": true,
"resolveJsonModule": true,
"isolatedModules": true,
"noEmit": true,
// "noEmit": true,
"composite": true,
"jsx": "react-jsx",
"strict": true,
"noUnusedLocals": true,