refactor of data-service
This commit is contained in:
parent
6fb98c69f2
commit
09c97df1a8
49 changed files with 2394 additions and 112 deletions
0
libs/browser/src/browser-pool.ts
Normal file
0
libs/browser/src/browser-pool.ts
Normal file
361
libs/browser/src/browser.ts
Normal file
361
libs/browser/src/browser.ts
Normal file
|
|
@ -0,0 +1,361 @@
|
|||
import { BrowserContext, chromium, Page, Browser as PlaywrightBrowser } from 'playwright';
|
||||
import { getLogger } from '@stock-bot/logger';
|
||||
import type { BrowserOptions, NetworkEvent, NetworkEventHandler } from './types';
|
||||
|
||||
/** A Playwright page augmented with per-page network event subscription helpers. */
type NetworkAwarePage = Page & {
  onNetworkEvent: (handler: NetworkEventHandler) => void;
  offNetworkEvent: (handler: NetworkEventHandler) => void;
  clearNetworkListeners: () => void;
};

/**
 * Owns a single Chromium instance and the browser contexts created from it.
 *
 * Each page returned by `createPageWithProxy` lives in its own context, which is
 * tracked by id so it can be released with `closeContext` or all at once with `close`.
 */
class BrowserSingleton {
  private browser?: PlaywrightBrowser;
  private contexts: Map<string, BrowserContext> = new Map();
  private logger = getLogger('browser');
  private options: BrowserOptions;
  private initialized = false;
  /** In-flight launch shared by concurrent `initialize()` callers. */
  private initializing?: Promise<void>;

  constructor() {
    this.options = {
      headless: true,
      timeout: 30000,
      blockResources: false,
      enableNetworkLogging: false,
    };
  }

  /**
   * Launches Chromium once. Later calls are no-ops, and calls made while a launch
   * is still in progress wait for that launch instead of starting a second browser
   * (their `options` are ignored, as they are for any call after the first).
   *
   * @param options Overrides merged over the defaults set in the constructor.
   * @throws Re-throws whatever `chromium.launch` rejects with, after logging it.
   */
  async initialize(options: BrowserOptions = {}): Promise<void> {
    if (this.initialized) {
      return;
    }

    if (!this.initializing) {
      this.initializing = this.launch(options).finally(() => {
        // Clear the marker so a failed launch can be retried.
        this.initializing = undefined;
      });
    }

    return this.initializing;
  }

  /** Merges options and performs the actual Chromium launch. */
  private async launch(options: BrowserOptions): Promise<void> {
    // Merge options
    this.options = {
      ...this.options,
      ...options,
    };

    this.logger.info('Initializing browser...');

    try {
      this.browser = await chromium.launch({
        headless: this.options.headless,
        timeout: this.options.timeout,
        args: [
          // Security and sandbox
          // NOTE(review): '--no-sandbox' turns off the Chromium sandbox — confirm it is
          // still required in the deployment environment.
          '--no-sandbox',
          // '--disable-setuid-sandbox',
          // '--disable-dev-shm-usage',
          // '--disable-web-security',
          // '--disable-features=VizDisplayCompositor',
          // '--disable-blink-features=AutomationControlled',

          // // Performance optimizations
          // '--disable-gpu',
          // '--disable-gpu-sandbox',
          // '--disable-software-rasterizer',
          // '--disable-background-timer-throttling',
          // '--disable-renderer-backgrounding',
          // '--disable-backgrounding-occluded-windows',
          // '--disable-field-trial-config',
          // '--disable-back-forward-cache',
          // '--disable-hang-monitor',
          // '--disable-ipc-flooding-protection',

          // // Extensions and plugins
          // '--disable-extensions',
          // '--disable-plugins',
          // '--disable-component-extensions-with-background-pages',
          // '--disable-component-update',
          // '--disable-plugins-discovery',
          // '--disable-bundled-ppapi-flash',

          // // Features we don't need
          // '--disable-default-apps',
          // '--disable-sync',
          // '--disable-translate',
          // '--disable-client-side-phishing-detection',
          // '--disable-domain-reliability',
          // '--disable-features=TranslateUI',
          // '--disable-features=Translate',
          // '--disable-breakpad',
          // '--disable-preconnect',
          // '--disable-print-preview',
          // '--disable-password-generation',
          // '--disable-password-manager-reauthentication',
          // '--disable-save-password-bubble',
          // '--disable-single-click-autofill',
          // '--disable-autofill',
          // '--disable-autofill-keyboard-accessory-view',
          // '--disable-full-form-autofill-ios',

          // // Audio/Video/Media
          // '--mute-audio',
          // '--disable-audio-output',
          // '--autoplay-policy=user-gesture-required',
          // '--disable-background-media-playback',

          // // Networking
          // '--disable-background-networking',
          // '--disable-sync',
          // '--aggressive-cache-discard',
          // '--disable-default-apps',

          // // UI/UX optimizations
          // '--no-first-run',
          // '--disable-infobars',
          // '--disable-notifications',
          // '--disable-desktop-notifications',
          // '--disable-prompt-on-repost',
          // '--disable-logging',
          // '--disable-file-system',
          // '--hide-scrollbars',

          // // Memory optimizations
          // '--memory-pressure-off',
          // '--max_old_space_size=4096',
          // '--js-flags="--max-old-space-size=4096"',
          // '--media-cache-size=1',
          // '--disk-cache-size=1',

          // // Process management
          // '--use-mock-keychain',
          // '--password-store=basic',
          // '--enable-automation',
          // '--no-pings',
          // '--no-service-autorun',
          // '--metrics-recording-only',
          // '--safebrowsing-disable-auto-update',

          // // Disable unnecessary features for headless mode
          // '--disable-speech-api',
          // '--disable-gesture-typing',
          // '--disable-voice-input',
          // '--disable-wake-on-wifi',
          // '--disable-webgl',
          // '--disable-webgl2',
          // '--disable-3d-apis',
          // '--disable-accelerated-2d-canvas',
          // '--disable-accelerated-jpeg-decoding',
          // '--disable-accelerated-mjpeg-decode',
          // '--disable-accelerated-video-decode',
          // '--disable-canvas-aa',
          // '--disable-2d-canvas-clip-aa',
          // '--disable-gl-drawing-for-tests',
        ],
      });

      this.initialized = true;
      this.logger.info('Browser initialized successfully');
    } catch (error) {
      this.logger.error('Failed to initialize browser', { error });
      throw error;
    }
  }

  /**
   * Opens a new page in a fresh browser context, optionally routed through a proxy,
   * and navigates it to `url` when `url` is non-empty.
   *
   * If anything fails after the context has been created (route setup, page
   * creation, navigation), the context is closed and forgotten before the error is
   * re-thrown, so a failed call does not leak a context.
   *
   * @param url Address to load; navigation waits for `domcontentloaded`.
   * @param proxy Optional proxy as `scheme://[user[:password]@]host[:port]`, or the
   *   short `host:port` form.
   * @returns The page (with network-event helpers attached) and the id of its
   *   context, to be passed to `closeContext` when done.
   * @throws Error if `initialize()` has not completed, or if `proxy` has no host.
   */
  async createPageWithProxy(
    url: string,
    proxy?: string
  ): Promise<{
    page: NetworkAwarePage;
    contextId: string;
  }> {
    if (!this.browser) {
      throw new Error('Browser not initialized. Call Browser.initialize() first.');
    }

    const contextId = `ctx-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;

    const contextOptions: NonNullable<Parameters<PlaywrightBrowser['newContext']>[0]> = {
      ignoreHTTPSErrors: true,
      bypassCSP: true,
    };

    if (proxy) {
      contextOptions.proxy = BrowserSingleton.parseProxy(proxy);
    }

    const context = await this.browser.newContext(contextOptions);
    // Register immediately so the cleanup path below can find the context.
    this.contexts.set(contextId, context);

    try {
      // Block resources for performance
      if (this.options.blockResources) {
        // Returning the promise lets Playwright observe a failed abort instead of
        // leaving it as an unhandled rejection.
        await context.route('**/*.{png,jpg,jpeg,gif,svg,ico,woff,woff2,ttf,css}', route =>
          route.abort()
        );
      }

      const page = await context.newPage();

      // `??` keeps an explicit 0 (Playwright: "no timeout"), matching what
      // launch() and goto() already receive.
      const timeout = this.options.timeout ?? 30000;
      page.setDefaultTimeout(timeout);
      page.setDefaultNavigationTimeout(timeout);

      const enhancedPage = this.attachNetworkHelpers(page);

      if (url) {
        await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: this.options.timeout,
        });
      }

      return { page: enhancedPage, contextId };
    } catch (error) {
      try {
        await this.closeContext(contextId);
      } catch (cleanupError) {
        this.logger.error('Failed to close context after page setup error', {
          contextId,
          error: cleanupError,
        });
      }
      throw error;
    }
  }

  /**
   * Splits a proxy string into the shape Playwright expects.
   *
   * Credentials are taken from everything before the LAST `@`, and the password is
   * everything after the FIRST `:` inside them, so passwords containing `@` or `:`
   * survive. Credentials are passed through as written (no percent-decoding).
   */
  private static parseProxy(proxy: string): {
    server: string;
    username?: string;
    password?: string;
  } {
    const schemeEnd = proxy.indexOf('://');
    const scheme = schemeEnd >= 0 ? proxy.slice(0, schemeEnd) : undefined;
    const remainder = schemeEnd >= 0 ? proxy.slice(schemeEnd + 3) : proxy;

    const at = remainder.lastIndexOf('@');
    // Drop any trailing path ("host:8080/") so it does not end up in the port.
    const hostPort = (at >= 0 ? remainder.slice(at + 1) : remainder).split('/')[0];
    if (!hostPort) {
      // The proxy string is deliberately not echoed: it may contain credentials.
      throw new Error('Invalid proxy: missing host');
    }

    const server = scheme ? `${scheme}://${hostPort}` : hostPort;
    if (at < 0) {
      return { server };
    }

    const credentials = remainder.slice(0, at);
    const colon = credentials.indexOf(':');
    return {
      server,
      username: colon >= 0 ? credentials.slice(0, colon) : credentials,
      password: colon >= 0 ? credentials.slice(colon + 1) : '',
    };
  }

  /**
   * Adds `onNetworkEvent` / `offNetworkEvent` / `clearNetworkListeners` to a page.
   * Playwright listeners are attached lazily on the first subscription and only
   * once per page, so re-subscribing after removing handlers cannot duplicate events.
   */
  private attachNetworkHelpers(page: Page): NetworkAwarePage {
    const networkEventHandlers = new Set<NetworkEventHandler>();
    let monitoringAttached = false;

    const enhancedPage = page as NetworkAwarePage;

    enhancedPage.onNetworkEvent = (handler: NetworkEventHandler) => {
      networkEventHandlers.add(handler);

      if (!monitoringAttached) {
        monitoringAttached = true;
        this.setupNetworkMonitoring(page, networkEventHandlers);
      }
    };

    enhancedPage.offNetworkEvent = (handler: NetworkEventHandler) => {
      networkEventHandlers.delete(handler);
    };

    enhancedPage.clearNetworkListeners = () => {
      networkEventHandlers.clear();
    };

    return enhancedPage;
  }

  /**
   * Subscribes to the page's `request`, `response` and `requestfailed` events and
   * forwards each one, converted to a `NetworkEvent`, to every handler in `handlers`.
   */
  private setupNetworkMonitoring(page: Page, handlers: Set<NetworkEventHandler>): void {
    // Listen to requests
    page.on('request', request => {
      const event: NetworkEvent = {
        url: request.url(),
        method: request.method(),
        type: 'request',
        timestamp: Date.now(),
        headers: request.headers(),
      };

      const body = this.readPostData(request);
      if (body) {
        event.requestData = body;
      }

      this.emitNetworkEvent(event, handlers);
    });

    // Listen to responses
    page.on('response', async response => {
      const headers = response.headers();
      const event: NetworkEvent = {
        url: response.url(),
        method: response.request().method(),
        status: response.status(),
        type: 'response',
        timestamp: Date.now(),
        headers,
      };

      // Only textual bodies (JSON or text/*) are captured.
      const contentType = headers['content-type'] || '';
      if (contentType.includes('application/json') || contentType.includes('text/')) {
        try {
          event.responseData = await response.text();
        } catch {
          // Response might be too large or not accessible
        }
      }

      this.emitNetworkEvent(event, handlers);
    });

    // Listen to failed requests
    page.on('requestfailed', request => {
      const event: NetworkEvent = {
        url: request.url(),
        method: request.method(),
        type: 'failed',
        timestamp: Date.now(),
        headers: request.headers(),
      };

      const body = this.readPostData(request);
      if (body) {
        event.requestData = body;
      }

      this.emitNetworkEvent(event, handlers);
    });
  }

  /**
   * Returns the request body for POST/PUT/PATCH requests, or `undefined` when the
   * method carries no body, the body is empty, or reading it throws.
   */
  private readPostData(request: {
    method(): string;
    postData(): string | null;
  }): string | undefined {
    if (!['POST', 'PUT', 'PATCH'].includes(request.method())) {
      return undefined;
    }
    try {
      return request.postData() || undefined;
    } catch {
      // Some requests might not have accessible post data
      return undefined;
    }
  }

  /** Calls every handler with `event`; a throwing handler is logged and skipped. */
  private emitNetworkEvent(event: NetworkEvent, handlers: Set<NetworkEventHandler>): void {
    for (const handler of handlers) {
      try {
        handler(event);
      } catch (error) {
        this.logger.error('Network event handler error', { error });
      }
    }
  }

  /** Runs `fn` in the page via `page.evaluate` and resolves with its result. */
  async evaluate<T>(page: Page, fn: () => T): Promise<T> {
    return page.evaluate(fn);
  }

  /**
   * Closes the context registered under `contextId`; unknown ids are ignored.
   * The id is forgotten before closing so a failing `close()` cannot leave a stale
   * entry behind.
   */
  async closeContext(contextId: string): Promise<void> {
    const context = this.contexts.get(contextId);
    if (!context) {
      return;
    }

    this.contexts.delete(contextId);
    await context.close();
  }

  /**
   * Closes every tracked context and then the browser, and resets the instance so
   * `initialize()` can be called again. A context that fails to close is logged and
   * does not prevent the remaining contexts or the browser from being closed.
   */
  async close(): Promise<void> {
    // Close all contexts
    const openContexts = [...this.contexts.values()];
    this.contexts.clear();

    for (const context of openContexts) {
      try {
        await context.close();
      } catch (error) {
        this.logger.error('Failed to close browser context', { error });
      }
    }

    // Close browser
    try {
      if (this.browser) {
        await this.browser.close();
      }
    } finally {
      // Reset state even if close() throws, so the instance is not left half-open.
      this.browser = undefined;
      this.initialized = false;
    }

    this.logger.info('Browser closed');
  }

  /** Whether `initialize()` has completed and `close()` has not been called since. */
  get isInitialized(): boolean {
    return this.initialized;
  }
}
|
||||
|
||||
// The one shared instance used across the process.
const Browser = new BrowserSingleton();

// Export the singleton, plus the class itself (as BrowserClass) for typing if needed.
export { Browser, BrowserSingleton as BrowserClass };
|
||||
0
libs/browser/src/fast-browser.ts
Normal file
0
libs/browser/src/fast-browser.ts
Normal file
3
libs/browser/src/index.ts
Normal file
3
libs/browser/src/index.ts
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
export { Browser } from './browser';
|
||||
export { BrowserTabManager } from './tab-manager';
|
||||
export type { BrowserOptions, ScrapingResult } from './types';
|
||||
103
libs/browser/src/tab-manager.ts
Normal file
103
libs/browser/src/tab-manager.ts
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
import { Page } from 'playwright';
|
||||
import { getLogger } from '@stock-bot/logger';
|
||||
import { Browser } from './browser';
|
||||
import type { ScrapingResult } from './types';
|
||||
|
||||
/** Bookkeeping record for one open tab. */
interface TabInfo {
  /** Id of the browser context that owns `page`; handed to `Browser.closeContext`. */
  contextId: string;
  /** The Playwright page backing the tab. */
  page: Page;
}
|
||||
|
||||
/**
 * Tracks pages ("tabs") opened through the shared `Browser` and releases each
 * tab's page and browser context together.
 */
export class BrowserTabManager {
  private tabs: Map<string, TabInfo> = new Map();
  private logger = getLogger('browser-tab-manager');

  /**
   * Opens a tab without a proxy.
   *
   * @param url Address to load; `about:blank` is used when omitted or empty.
   * @returns The page and the id to pass to `closeTab`.
   */
  async createTab(url?: string): Promise<{ page: Page; tabId: string }> {
    const tabId = this.nextTabId();
    const { page, contextId } = await Browser.createPageWithProxy(url || 'about:blank');

    this.tabs.set(tabId, { page, contextId });
    this.logger.debug('Tab created', { tabId, url });

    return { page, tabId };
  }

  /**
   * Opens a tab whose traffic goes through `proxy`.
   *
   * @param url Address to load.
   * @param proxy Proxy string forwarded unchanged to `Browser.createPageWithProxy`.
   * @returns The page, the tab id, and the id of the underlying browser context.
   */
  async createTabWithProxy(
    url: string,
    proxy: string
  ): Promise<{ page: Page; tabId: string; contextId: string }> {
    const tabId = this.nextTabId();
    const { page, contextId } = await Browser.createPageWithProxy(url, proxy);

    this.tabs.set(tabId, { page, contextId });
    // Log the proxy with credentials masked so secrets do not end up in log output.
    this.logger.debug('Tab with proxy created', {
      tabId,
      url,
      proxy: BrowserTabManager.redactProxy(proxy),
    });

    return { page, tabId, contextId };
  }

  /**
   * Scrapes each URL through its paired proxy, running at most `concurrency` pages
   * at a time (batch by batch), and returns one result per input pair in input order.
   *
   * Failures never reject the returned promise: a pair whose tab creation or
   * extractor throws yields `{ success: false, data: null, error }`.
   *
   * @param urlProxyPairs URL/proxy pairs to visit.
   * @param extractor Called with the loaded page; its resolved value becomes `data`.
   * @param options `concurrency` is the batch size (default 3). Values that are not
   *   a finite number >= 1 fall back to 1, because a step of 0, a negative step or
   *   NaN would otherwise never advance the batch loop correctly.
   */
  async scrapeUrlsWithProxies<T>(
    urlProxyPairs: Array<{ url: string; proxy: string }>,
    extractor: (page: Page) => Promise<T>,
    options: { concurrency?: number } = {}
  ): Promise<ScrapingResult<T>[]> {
    const { concurrency = 3 } = options;
    const batchSize =
      Number.isFinite(concurrency) && concurrency >= 1 ? Math.floor(concurrency) : 1;
    const results: ScrapingResult<T>[] = [];

    for (let start = 0; start < urlProxyPairs.length; start += batchSize) {
      const batch = urlProxyPairs.slice(start, start + batchSize);
      const batchResults = await Promise.all(
        batch.map(({ url, proxy }) => this.scrapeOne(url, proxy, extractor))
      );
      results.push(...batchResults);
    }

    return results;
  }

  /**
   * Closes the tab's page and its browser context; unknown ids are ignored.
   * The context is closed even when closing the page throws, and the tab is
   * forgotten up front so a failure cannot leave a stale entry.
   */
  async closeTab(tabId: string): Promise<void> {
    const tab = this.tabs.get(tabId);
    if (!tab) {
      return;
    }

    this.tabs.delete(tabId);
    try {
      await tab.page.close();
    } finally {
      await Browser.closeContext(tab.contextId);
    }
    this.logger.debug('Tab closed', { tabId });
  }

  /** Number of tabs currently tracked. */
  getTabCount(): number {
    return this.tabs.size;
  }

  /** Ids of all tabs currently tracked. */
  getAllTabIds(): string[] {
    return Array.from(this.tabs.keys());
  }

  /** Scrapes a single URL and always resolves with a result; the tab is closed afterwards. */
  private async scrapeOne<T>(
    url: string,
    proxy: string,
    extractor: (page: Page) => Promise<T>
  ): Promise<ScrapingResult<T>> {
    let tabId: string | undefined;

    try {
      const tab = await this.createTabWithProxy(url, proxy);
      tabId = tab.tabId;

      const data = await extractor(tab.page);
      const ok: ScrapingResult<T> = { data, url, success: true };
      return ok;
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      const failed: ScrapingResult<T> = {
        data: null as T,
        url,
        success: false,
        error: errorMessage,
      };
      return failed;
    } finally {
      if (tabId) {
        try {
          await this.closeTab(tabId);
        } catch (closeError) {
          // A cleanup failure must not discard this result or reject the whole batch.
          this.logger.error('Failed to close tab', { tabId, error: closeError });
        }
      }
    }
  }

  /** Builds a tab id from the current time plus a random base-36 suffix. */
  private nextTabId(): string {
    return `tab-${Date.now()}-${Math.random().toString(36).slice(2, 11)}`;
  }

  /** Replaces the `user:password` part of a proxy string with `***`. */
  private static redactProxy(proxy: string): string {
    const at = proxy.lastIndexOf('@');
    if (at < 0) {
      return proxy;
    }
    const schemeEnd = proxy.indexOf('://');
    const prefix = schemeEnd >= 0 && schemeEnd < at ? proxy.slice(0, schemeEnd + 3) : '';
    return `${prefix}***@${proxy.slice(at + 1)}`;
  }
}
|
||||
30
libs/browser/src/types.ts
Normal file
30
libs/browser/src/types.ts
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
/**
 * Settings accepted by `Browser.initialize()`. Every field is optional; the browser
 * singleton supplies defaults for all of them except `proxy`.
 */
export interface BrowserOptions {
  /**
   * Proxy address.
   * NOTE(review): the browser singleton in this commit never reads this field
   * (proxies are passed per page to `createPageWithProxy`) — confirm it is still needed.
   */
  proxy?: string;
  /** Passed to `chromium.launch` as `headless`. Singleton default: `true`. */
  headless?: boolean;
  /**
   * Timeout in milliseconds used for the browser launch, page default timeouts and
   * navigation. Singleton default: `30000`.
   */
  timeout?: number;
  /**
   * When true, requests for images, fonts and CSS files are aborted in every new
   * context. Singleton default: `false`.
   */
  blockResources?: boolean;
  /**
   * Singleton default: `false`.
   * NOTE(review): not read anywhere in the browser singleton in this commit — confirm intent.
   */
  enableNetworkLogging?: boolean;
}
|
||||
|
||||
// Legacy alias: the previous name is kept so existing imports continue to compile.
export type FastBrowserOptions = BrowserOptions;
|
||||
|
||||
/**
 * Outcome of scraping one URL.
 *
 * @typeParam T Shape of the extracted payload.
 */
export interface ScrapingResult<T = unknown> {
  /**
   * Extracted payload. The tab manager's `scrapeUrlsWithProxies` stores `null`
   * (cast to `T`) here when `success` is false, so check `success` before use.
   */
  data: T;
  /** The URL this result belongs to. */
  url: string;
  /** True when the page was opened and the extractor completed without throwing. */
  success: boolean;
  /** Error message; set by `scrapeUrlsWithProxies` only on failure. */
  error?: string;
}
|
||||
|
||||
/** One observed network occurrence on a page, as delivered to a `NetworkEventHandler`. */
export interface NetworkEvent {
  /** URL of the request or response. */
  url: string;
  /** HTTP method of the request (for responses, the method of the originating request). */
  method: string;
  /** HTTP status code; the browser singleton sets it only on `'response'` events. */
  status?: number;
  /** Which stage of the exchange this event describes. */
  type: 'request' | 'response' | 'failed';
  /** `Date.now()` value captured when the event was observed. */
  timestamp: number;
  /** Request body; captured by the browser singleton for POST/PUT/PATCH requests that have one. */
  requestData?: string;
  /** Response body text; captured by the browser singleton for JSON and `text/*` responses. */
  responseData?: string;
  /** Request headers for `'request'`/`'failed'` events, response headers for `'response'` events. */
  headers?: Record<string, string>;
}
|
||||
|
||||
/** Callback invoked with each `NetworkEvent`; its return value is ignored. */
export type NetworkEventHandler = (event: NetworkEvent) => void;
|
||||
0
libs/browser/src/utils.ts
Normal file
0
libs/browser/src/utils.ts
Normal file
Loading…
Add table
Add a link
Reference in a new issue