work on spider for te
This commit is contained in:
parent
505565a09e
commit
95b1381480
17 changed files with 485 additions and 28 deletions
|
|
@ -78,7 +78,7 @@
|
|||
"db": 1
|
||||
},
|
||||
"workers": 5,
|
||||
"concurrency": 5,
|
||||
"concurrency": 2,
|
||||
"enableScheduledJobs": true,
|
||||
"defaultJobOptions": {
|
||||
"attempts": 3,
|
||||
|
|
|
|||
1
apps/stock/data-ingestion/node_modules
Symbolic link
1
apps/stock/data-ingestion/node_modules
Symbolic link
|
|
@ -0,0 +1 @@
|
|||
../../../node_modules
|
||||
|
|
@ -27,7 +27,8 @@
|
|||
"@stock-bot/browser": "*",
|
||||
"@stock-bot/proxy": "*",
|
||||
"hono": "^4.0.0",
|
||||
"pako": "^2.1.0"
|
||||
"pako": "^2.1.0",
|
||||
"cheerio": "^1.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"typescript": "^5.0.0"
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ import { crawlIntradayData, scheduleIntradayCrawls } from './actions/intraday-cr
|
|||
import { createQMOperationRegistry } from './shared/operation-provider';
|
||||
|
||||
@Handler('qm')
|
||||
@Disabled() // Disable by default, enable specific operations as needed
|
||||
export class QMHandler extends BaseHandler<DataIngestionServices> {
|
||||
public operationRegistry: OperationRegistry;
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,227 @@
|
|||
import { getRandomUserAgent } from '@stock-bot/utils';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { TE_CONFIG } from '../shared/config';
|
||||
import type { TeCountry } from '../shared/types';
|
||||
import type { TeHandler } from '../te.handler';
|
||||
|
||||
export async function fetchCountries(this: TeHandler): Promise<TeCountry[] | null> {
|
||||
const { logger, mongodb } = this;
|
||||
|
||||
try {
|
||||
// 1. Fetch the HTML page
|
||||
const reqInfo = {
|
||||
proxy: this.proxy.getProxy(),
|
||||
headers: {
|
||||
'User-Agent': getRandomUserAgent(),
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
},
|
||||
}
|
||||
const response = await fetch(TE_CONFIG.COUNTRIES_URL, reqInfo);
|
||||
|
||||
logger.debug('Response status:', {
|
||||
status: response.status,
|
||||
statusText: response.statusText,
|
||||
url: response.url
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
|
||||
}
|
||||
|
||||
const html = await response.text();
|
||||
|
||||
logger.info('Fetched HTML length:', { length: html.length });
|
||||
|
||||
|
||||
// 2. Parse HTML to extract country data
|
||||
const $ = cheerio.load(html);
|
||||
const countries: TeCountry[] = [];
|
||||
|
||||
// Look for country links - they typically have a pattern like /country-name
|
||||
// Trading Economics groups countries by region in the page
|
||||
$('.list-group-item, a[href^="/"]').each((_, element) => {
|
||||
const $el = $(element);
|
||||
|
||||
// Try to extract country information
|
||||
let name: string | undefined;
|
||||
let url: string | undefined;
|
||||
let region: string | undefined;
|
||||
|
||||
// Check if it's a direct link
|
||||
if ($el.is('a')) {
|
||||
const href = $el.attr('href');
|
||||
const text = $el.text().trim();
|
||||
console.log(href)
|
||||
// Filter for country URLs (they don't contain special paths like /indicators, /calendar, etc.)
|
||||
if (href && href.startsWith('/') && !href.includes('/') && text) {
|
||||
name = text;
|
||||
url = href;
|
||||
}
|
||||
} else {
|
||||
// Check for links within table rows
|
||||
const $link = $el.find('a[href^="/"]').first();
|
||||
if ($link.length) {
|
||||
const href = $link.attr('href');
|
||||
const text = $link.text().trim();
|
||||
|
||||
if (href && text && !href.includes('/indicators') && !href.includes('/calendar')) {
|
||||
name = text;
|
||||
url = href;
|
||||
|
||||
// Try to get region from parent elements
|
||||
const $regionHeader = $el.closest('.region-section, .country-group').find('h2, h3, .region-title').first();
|
||||
if ($regionHeader.length) {
|
||||
region = $regionHeader.text().trim();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add to countries array if we found valid data
|
||||
if (name && url) {
|
||||
// Extract country code from URL if possible (e.g., /united-states -> US)
|
||||
const code = extractCountryCode(url, name);
|
||||
|
||||
countries.push({
|
||||
name,
|
||||
code,
|
||||
url: `https://tradingeconomics.com${url}`,
|
||||
region,
|
||||
updated_at: new Date(),
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// Remove duplicates based on name
|
||||
const uniqueCountries = Array.from(
|
||||
new Map(countries.map(c => [c.name, c])).values()
|
||||
);
|
||||
|
||||
if (uniqueCountries.length === 0) {
|
||||
throw new Error('No countries found in HTML');
|
||||
}
|
||||
|
||||
logger.info('Extracted countries from HTML', {
|
||||
count: uniqueCountries.length,
|
||||
byRegion: groupCountriesByRegion(uniqueCountries),
|
||||
});
|
||||
|
||||
// 3. Save to MongoDB
|
||||
try {
|
||||
console.log( uniqueCountries)
|
||||
if (uniqueCountries.length > 0) {
|
||||
const result = await mongodb?.batchUpsert('teCountries', uniqueCountries, ['code']);
|
||||
logger.info('Countries saved to MongoDB', {
|
||||
matched: result.matchedCount,
|
||||
modified: result.modifiedCount,
|
||||
upserted: result.upsertedCount,
|
||||
});
|
||||
}
|
||||
} catch (dbError) {
|
||||
logger.error('Failed to save countries to MongoDB', { error: dbError });
|
||||
throw dbError;
|
||||
}
|
||||
|
||||
return uniqueCountries;
|
||||
} catch (error) {
|
||||
logger.error('Failed to fetch Trading Economics countries', {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
stack: error instanceof Error ? error.stack : undefined,
|
||||
});
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
function extractCountryCode(url: string, name: string): string | undefined {
|
||||
// Common country code mappings
|
||||
const countryCodeMap: Record<string, string> = {
|
||||
'united-states': 'US',
|
||||
'united-kingdom': 'GB',
|
||||
'euro-area': 'EU',
|
||||
'china': 'CN',
|
||||
'japan': 'JP',
|
||||
'germany': 'DE',
|
||||
'france': 'FR',
|
||||
'italy': 'IT',
|
||||
'spain': 'ES',
|
||||
'canada': 'CA',
|
||||
'australia': 'AU',
|
||||
'south-korea': 'KR',
|
||||
'india': 'IN',
|
||||
'brazil': 'BR',
|
||||
'russia': 'RU',
|
||||
'mexico': 'MX',
|
||||
'indonesia': 'ID',
|
||||
'netherlands': 'NL',
|
||||
'saudi-arabia': 'SA',
|
||||
'turkey': 'TR',
|
||||
'switzerland': 'CH',
|
||||
'poland': 'PL',
|
||||
'sweden': 'SE',
|
||||
'belgium': 'BE',
|
||||
'argentina': 'AR',
|
||||
'ireland': 'IE',
|
||||
'austria': 'AT',
|
||||
'norway': 'NO',
|
||||
'israel': 'IL',
|
||||
'singapore': 'SG',
|
||||
'denmark': 'DK',
|
||||
'egypt': 'EG',
|
||||
'philippines': 'PH',
|
||||
'finland': 'FI',
|
||||
'chile': 'CL',
|
||||
'pakistan': 'PK',
|
||||
'romania': 'RO',
|
||||
'new-zealand': 'NZ',
|
||||
'greece': 'GR',
|
||||
'iraq': 'IQ',
|
||||
'portugal': 'PT',
|
||||
'czech-republic': 'CZ',
|
||||
'vietnam': 'VN',
|
||||
'peru': 'PE',
|
||||
'colombia': 'CO',
|
||||
'malaysia': 'MY',
|
||||
'ukraine': 'UA',
|
||||
'hungary': 'HU',
|
||||
'kuwait': 'KW',
|
||||
'morocco': 'MA',
|
||||
'slovakia': 'SK',
|
||||
'kenya': 'KE',
|
||||
'puerto-rico': 'PR',
|
||||
'ecuador': 'EC',
|
||||
'ethiopia': 'ET',
|
||||
'dominican-republic': 'DO',
|
||||
'luxembourg': 'LU',
|
||||
'oman': 'OM',
|
||||
'guatemala': 'GT',
|
||||
'bulgaria': 'BG',
|
||||
'ghana': 'GH',
|
||||
'tanzania': 'TZ',
|
||||
'turkmenistan': 'TM',
|
||||
'croatia': 'HR',
|
||||
'costa-rica': 'CR',
|
||||
'lebanon': 'LB',
|
||||
'slovenia': 'SI',
|
||||
'lithuania': 'LT',
|
||||
'serbia': 'RS',
|
||||
'panama': 'PA',
|
||||
};
|
||||
|
||||
// Clean URL to get country slug
|
||||
const slug = url.replace(/^\//, '').toLowerCase();
|
||||
|
||||
return countryCodeMap[slug];
|
||||
}
|
||||
|
||||
function groupCountriesByRegion(countries: TeCountry[]): Record<string, number> {
|
||||
const groups: Record<string, number> = {};
|
||||
|
||||
for (const country of countries) {
|
||||
const region = country.region || 'Unknown';
|
||||
groups[region] = (groups[region] || 0) + 1;
|
||||
}
|
||||
|
||||
return groups;
|
||||
}
|
||||
|
|
@ -1,2 +1,4 @@
|
|||
// Export all action functions here
|
||||
// export * from './example.action';
|
||||
export * from './fetch-countries.action';
|
||||
export * from './spider.action';
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,140 @@
|
|||
import { getRandomUserAgent } from '@stock-bot/utils';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { TE_CONFIG } from '../shared/config';
|
||||
import type { TeHandler } from '../te.handler';
|
||||
|
||||
export async function spiderUrl(this: TeHandler, payload: { url: string }): Promise<string[] | null> {
|
||||
const { logger, mongodb } = this;
|
||||
const reqUrl = payload && payload.url ? TE_CONFIG.MAIN_URL + payload.url : TE_CONFIG.MAIN_URL;
|
||||
this.logger.info(`Spiderring URL: ${reqUrl}`, {reqUrl});
|
||||
|
||||
const mongoRecord = await mongodb?.findOne('teUrls', { url: payload?.url || '/' });
|
||||
if(payload && payload.url && mongoRecord && mongoRecord.lastCrawled && mongoRecord.lastCrawled.getTime() > Date.now() - 7* 24 * 60 * 60 * 1000) {
|
||||
this.logger.info(`Skipping URL ${reqUrl} as it was already crawled in the last 24 hours`);
|
||||
return null; // Skip if already crawled in the last 24 hours
|
||||
}
|
||||
|
||||
if (!payload) {
|
||||
const oneDayAgo = new Date(Date.now() - 7 * 24 * 60 * 60 * 1000);
|
||||
const records = await mongodb?.find('teUrls', {
|
||||
$or: [
|
||||
{ lastCrawled: { $lt: oneDayAgo } }, // Crawled more than 24 hours ago
|
||||
{ lastCrawled: { $exists: false } } // Never crawled
|
||||
]
|
||||
});
|
||||
this.logger.info(`Found ${records?.length || 0} records to process`);
|
||||
for (const record of records || []) {
|
||||
await this.scheduleOperation('te-spider', {
|
||||
url: record.url,
|
||||
}, {
|
||||
jobId: `te-spider-${record.url}`,
|
||||
priority: 5, // Lower priority than financial data
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
// 1. Fetch the HTML page
|
||||
const reqInfo = {
|
||||
proxy: 'http://5.79.66.2:13010',//this.proxy.getProxy(),
|
||||
headers: {
|
||||
'User-Agent': getRandomUserAgent(),
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
},
|
||||
}
|
||||
const response = await fetch(reqUrl, reqInfo);
|
||||
|
||||
logger.debug('Response status:', {
|
||||
status: response.status,
|
||||
statusText: response.statusText,
|
||||
url: response.url
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`Failed to fetch: ${response.status} ${response.statusText}`);
|
||||
}
|
||||
|
||||
const html = await response.text();
|
||||
|
||||
|
||||
let match = html.match(/TESymbol = '([^']+)'/);
|
||||
const teSymbol = match ? match[1] : undefined;
|
||||
match = html.match(/;TELastUpdate = '([^']+)'/);
|
||||
const teLastUpdate = match ? match[1] : undefined;
|
||||
match = html.match(/; var TEChartsDatasource = '([^']+)'/);
|
||||
const teChartUrl = match ? match[1] : undefined;
|
||||
match = html.match(/; var TEChartsToken = '([^']+)'/);
|
||||
const teChartToken = match ? match[1] : undefined;
|
||||
|
||||
console.log(teSymbol, teLastUpdate, teChartUrl, teChartToken);
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
const urls: string[] = [];
|
||||
|
||||
$('.list-group-item, a[href^="/"]').each((_, element) => {
|
||||
const $el = $(element);
|
||||
let url: string | undefined;
|
||||
if ($el.is('a')) {
|
||||
const href = $el.attr('href');
|
||||
if (href && href.startsWith('/') && !href.includes('.aspx')) {
|
||||
url = href;
|
||||
}
|
||||
}
|
||||
|
||||
if (url && urls.indexOf(url) === -1) {
|
||||
urls.push(url);
|
||||
}
|
||||
});
|
||||
|
||||
if (urls.length === 0) {
|
||||
throw new Error('No urls found in HTML');
|
||||
}
|
||||
|
||||
// 3. Save to MongoDB
|
||||
try {
|
||||
if (urls.length > 0) {
|
||||
const urlMap: {url: string, lastCrawled?: Date, teSymbol? : string, teLastUpdate? : string, teChartUrl? : string, teChartToken? : string}[] = urls.map(url => ({url}));
|
||||
if( payload && payload.url) {
|
||||
urlMap.push({
|
||||
url: payload.url,
|
||||
lastCrawled: new Date(),
|
||||
teSymbol,
|
||||
teLastUpdate,
|
||||
teChartUrl,
|
||||
teChartToken,})
|
||||
}else {
|
||||
urlMap.push({url: '/', lastCrawled: new Date()})
|
||||
}
|
||||
|
||||
const result = await mongodb?.batchUpsert('teUrls', urlMap, ['url']);
|
||||
logger.info('TE URLs saved to MongoDB', {
|
||||
matched: result.matchedCount,
|
||||
modified: result.modifiedCount,
|
||||
upserted: result.upsertedCount,
|
||||
});
|
||||
}
|
||||
} catch (dbError) {
|
||||
logger.error('Failed to save urls to MongoDB', { error: dbError });
|
||||
throw dbError;
|
||||
}
|
||||
|
||||
for (const url of urls) {
|
||||
this.scheduleOperation('te-spider', {
|
||||
url: url,
|
||||
}, {
|
||||
jobId: `te-spider-${url}`,
|
||||
priority: 5, // Lower priority than financial data
|
||||
})
|
||||
}
|
||||
|
||||
return urls;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to fetch Trading Economics URLs ${reqUrl}`, {
|
||||
error: error instanceof Error ? error.message : String(error),
|
||||
stack: error instanceof Error ? error.stack : undefined,
|
||||
});
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,9 @@
|
|||
// Shared configuration constants for the Trading Economics (TE) handler.
export const TE_CONFIG = {
  // Add configuration constants here
  API_URL: 'https://api.example.com', // NOTE(review): scaffold placeholder, not referenced by the visible actions — confirm before use
  MAIN_URL: 'https://tradingeconomics.com', // site root; the spider prepends this to relative paths
  COUNTRIES_URL: 'https://tradingeconomics.com/countries', // countries index page fetched by fetchCountries
  REQUEST_TIMEOUT: 30000, // ms — NOTE(review): not applied in the visible fetch calls; verify it is wired up
  USER_AGENT: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', // static UA; the visible actions use getRandomUserAgent() instead
  KEY: 'tradingeconomics-charts-core-api-key', // presumably the charts API key identifier — verify against the charts client
};
|
||||
|
|
@ -10,4 +10,13 @@ export interface TeResponse {
|
|||
data: TeData[];
|
||||
status: string;
|
||||
timestamp: Date;
|
||||
}
|
||||
|
||||
// A country entry scraped from the Trading Economics countries index page.
export interface TeCountry {
  name: string; // display name taken from the scraped link text
  code?: string; // ISO 3166-1 alpha-2 code, when the URL slug is in the known mapping
  region?: string; // region section header the country appeared under, when found
  url?: string; // absolute country page URL (https://tradingeconomics.com/<slug>)
  updated_at: Date; // set to the scrape time on each upsert
  created_at?: Date; // NOTE(review): never set by the visible scraper — confirm it is populated elsewhere
}
|
||||
|
|
@ -1,29 +1,31 @@
|
|||
import {
|
||||
BaseHandler,
|
||||
Disabled,
|
||||
Handler,
|
||||
Operation,
|
||||
ScheduledOperation,
|
||||
ScheduledOperation
|
||||
} from '@stock-bot/handlers';
|
||||
import type { DataIngestionServices } from '../../types';
|
||||
import { fetchCountries, spiderUrl } from './actions';
|
||||
|
||||
@Handler('te')
|
||||
export class TeHandler extends BaseHandler {
|
||||
@Disabled()
|
||||
export class TeHandler extends BaseHandler<DataIngestionServices> {
|
||||
constructor(services: any) {
|
||||
super(services);
|
||||
}
|
||||
|
||||
@Operation('example-operation')
|
||||
async exampleOperation(): Promise<unknown> {
|
||||
this.logger.info('TE handler example operation executed');
|
||||
return { success: true };
|
||||
}
|
||||
|
||||
@ScheduledOperation('te-scheduled-job', '0 0 * * *', {
|
||||
@ScheduledOperation('te-countries', '0 0 * * 0', {
|
||||
priority: 5,
|
||||
description: 'Daily TE handler scheduled job',
|
||||
immediately: false,
|
||||
description: 'Fetch and update Trading Economics countries data',
|
||||
immediately: true,
|
||||
})
|
||||
async scheduledJob(): Promise<unknown> {
|
||||
this.logger.info('TE handler scheduled job executed');
|
||||
return this.exampleOperation();
|
||||
}
|
||||
@Disabled()
|
||||
fetchCountries = fetchCountries;
|
||||
|
||||
@ScheduledOperation('te-spider', '0 0 * * 0', {
|
||||
priority: 5,
|
||||
description: 'Fetch and update Trading Economics countries data',
|
||||
immediately: true,
|
||||
})
|
||||
spiderUrlSchedule = spiderUrl;
|
||||
}
|
||||
|
|
@ -14,7 +14,6 @@
|
|||
{ "path": "./data-pipeline" },
|
||||
{ "path": "./web-api" },
|
||||
{ "path": "./web-app" },
|
||||
{ "path": "./engine"},
|
||||
{ "path": "./orchestrator"}
|
||||
]
|
||||
}
|
||||
|
|
@ -7,10 +7,11 @@
|
|||
"module": "ESNext",
|
||||
"skipLibCheck": true,
|
||||
"moduleResolution": "bundler",
|
||||
"allowImportingTsExtensions": true,
|
||||
// "allowImportingTsExtensions": true,
|
||||
"resolveJsonModule": true,
|
||||
"isolatedModules": true,
|
||||
"noEmit": true,
|
||||
// "noEmit": true,
|
||||
"composite": true,
|
||||
"jsx": "react-jsx",
|
||||
"strict": true,
|
||||
"noUnusedLocals": true,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue