import { getLogger } from '@stock-bot/logger';

/** A single record, keyed by column name. */
export interface DataFrameRow {
  [key: string]: any;
}

/** Construction options for {@link DataFrame}. */
export interface DataFrameOptions {
  /** Name of the index column; defaults to `'index'` when omitted or empty. */
  index?: string;
  /** Explicit column list; when omitted, columns are inferred from the rows. */
  columns?: string[];
  /** Column name -> dtype (`'number'`, `'string'`, `'boolean'` or `'date'`). */
  dtypes?: Record<string, string>;
}

/** Result of {@link DataFrame.groupBy}: stringified group key -> sub-frame. */
export interface GroupByResult {
  [key: string]: DataFrame;
}

/** Reduces the non-null values of one column to a single value. */
export interface AggregationFunction {
  (values: any[]): any;
}

/**
 * Minimal in-memory, row-oriented table.
 *
 * Every transforming method returns a new `DataFrame`; the receiver is never
 * modified. Rows handed to the constructor are copied, so normalisation
 * (null-filling, dtype conversion) never touches the caller's objects.
 */
export class DataFrame {
  private data: DataFrameRow[];
  private _columns: string[];
  private _index: string;
  private _dtypes: Record<string, string>;
  // Currently not referenced by any method of this class.
  private logger = getLogger('dataframe');

  /**
   * @param data    Source rows; each row is shallow-copied.
   * @param options Index name, explicit columns and dtype conversions.
   */
  constructor(data: DataFrameRow[] = [], options: DataFrameOptions = {}) {
    // Copy each row: validateAndCleanData() writes into rows, and those
    // writes must not leak into the caller's objects.
    this.data = data.map(row => ({ ...row }));
    this._index = options.index || 'index';
    this._columns = options.columns ? [...options.columns] : this.inferColumns();
    this._dtypes = { ...(options.dtypes ?? {}) };
    this.validateAndCleanData();
  }

  /** Collects the union of keys over all rows, sorted alphabetically. */
  private inferColumns(): string[] {
    const names = new Set<string>();
    for (const row of this.data) {
      for (const key of Object.keys(row)) {
        names.add(key);
      }
    }
    return Array.from(names).sort();
  }

  /** Fills missing columns with `null` and applies the dtype conversions. */
  private validateAndCleanData(): void {
    const conversions = Object.entries(this._dtypes);
    for (const row of this.data) {
      for (const col of this._columns) {
        if (!(col in row)) {
          row[col] = null;
        }
      }
      for (const [col, dtype] of conversions) {
        // `!= null` skips both null and undefined so that missing cells are
        // not turned into the string 'undefined' or NaN.
        if (col in row && row[col] != null) {
          row[col] = this.convertValue(row[col], dtype);
        }
      }
    }
  }

  /** Converts one cell to the requested dtype; unknown dtypes pass through. */
  private convertValue(value: any, dtype: string): any {
    switch (dtype) {
      case 'number':
        return typeof value === 'number' ? value : parseFloat(value);
      case 'string':
        return String(value);
      case 'boolean': {
        // Boolean('false') and Boolean('0') are true; treat the textual
        // falsy spellings (as produced by readCSV) as false.
        if (typeof value === 'string') {
          const text = value.trim().toLowerCase();
          if (text === 'false' || text === '0' || text === '') {
            return false;
          }
        }
        return Boolean(value);
      }
      case 'date':
        return value instanceof Date ? value : new Date(value);
      default:
        return value;
    }
  }

  /** Builds a frame over `rows` that keeps this frame's columns, index and dtypes. */
  private withRows(rows: DataFrameRow[]): DataFrame {
    return new DataFrame(rows, {
      columns: this._columns,
      index: this._index,
      dtypes: this._dtypes,
    });
  }

  /** Values of `column` whose runtime type is `number` (NaN included). */
  private numericValues(column: string): number[] {
    return this.getColumn(column).filter(
      (val): val is number => typeof val === 'number'
    );
  }

  // Basic properties

  /** Copy of the column names. */
  get columns(): string[] {
    return [...this._columns];
  }

  /** Name of the index column. */
  get index(): string {
    return this._index;
  }

  /** Number of rows. */
  get length(): number {
    return this.data.length;
  }

  /** `[rowCount, columnCount]`. */
  get shape(): [number, number] {
    return [this.data.length, this._columns.length];
  }

  /** `true` when the frame has no rows. */
  get empty(): boolean {
    return this.data.length === 0;
  }

  // Data access methods

  /** First `n` rows (default 5). */
  head(n = 5): DataFrame {
    return this.withRows(this.data.slice(0, n));
  }

  /** Last `n` rows (default 5); `tail(0)` is an empty frame. */
  tail(n = 5): DataFrame {
    // slice(-0) is slice(0) and would return every row.
    return this.withRows(n === 0 ? [] : this.data.slice(-n));
  }

  /** Rows in the half-open positional range `[start, end)`. */
  iloc(start: number, end?: number): DataFrame {
    const rows = end !== undefined ? this.data.slice(start, end) : this.data.slice(start);
    return this.withRows(rows);
  }

  /**
   * Cell at row position `index`, column `column`.
   * @throws Error when `index` is not an integer in `[0, length)`.
   */
  at(index: number, column: string): any {
    if (!Number.isInteger(index) || index < 0 || index >= this.data.length) {
      throw new Error(`Index ${index} out of bounds`);
    }
    return this.data[index][column];
  }

  // Column operations

  /** Frame restricted to the requested columns; unknown names are ignored. */
  select(columns: string[]): DataFrame {
    const validColumns = columns.filter(col => this._columns.includes(col));
    const newData = this.data.map(row => {
      const newRow: DataFrameRow = {};
      for (const col of validColumns) {
        newRow[col] = row[col];
      }
      return newRow;
    });
    return new DataFrame(newData, {
      columns: validColumns,
      index: this._index,
      dtypes: this.filterDtypes(validColumns),
    });
  }

  /** Frame without the listed columns. */
  drop(columns: string[]): DataFrame {
    return this.select(this._columns.filter(col => !columns.includes(col)));
  }

  /**
   * All values of one column, in row order.
   * @throws Error when the column does not exist.
   */
  getColumn(column: string): any[] {
    if (!this._columns.includes(column)) {
      throw new Error(`Column '${column}' not found`);
    }
    return this.data.map(row => row[column]);
  }

  /**
   * Frame with `column` replaced by (or extended with) `values`.
   * @throws Error when `values.length` differs from the row count.
   */
  setColumn(column: string, values: any[]): DataFrame {
    if (values.length !== this.data.length) {
      throw new Error('Values length must match DataFrame length');
    }
    const newData = this.data.map((row, i) => ({ ...row, [column]: values[i] }));
    const newColumns = this._columns.includes(column)
      ? this._columns
      : [...this._columns, column];
    return new DataFrame(newData, {
      columns: newColumns,
      index: this._index,
      dtypes: this._dtypes,
    });
  }

  // Filtering

  /** Rows for which `predicate` returns true. */
  filter(predicate: (row: DataFrameRow, index: number) => boolean): DataFrame {
    return this.withRows(this.data.filter(predicate));
  }

  /**
   * Rows whose `column` value satisfies `<cell> <operator> <value>`.
   * `'=='` and `'!='` use strict (in)equality.
   */
  where(
    column: string,
    operator: '>' | '<' | '>=' | '<=' | '==' | '!=',
    value: any
  ): DataFrame {
    return this.filter(row => {
      const cellValue = row[column];
      switch (operator) {
        case '>':
          return cellValue > value;
        case '<':
          return cellValue < value;
        case '>=':
          return cellValue >= value;
        case '<=':
          return cellValue <= value;
        case '==':
          return cellValue === value;
        case '!=':
          return cellValue !== value;
        default:
          return false;
      }
    });
  }

  // Sorting

  /** Frame sorted by `column`, ascending unless `ascending` is false. */
  sort(column: string, ascending = true): DataFrame {
    const direction = ascending ? 1 : -1;
    const sortedData = [...this.data].sort((a, b) => {
      const aVal = a[column];
      const bVal = b[column];
      // Report 0 whenever neither side is greater (equal Dates, NaN,
      // null vs 0, ...) so the comparator stays consistent.
      if (aVal > bVal) return direction;
      if (aVal < bVal) return -direction;
      return 0;
    });
    return this.withRows(sortedData);
  }

  // Aggregation

  /** Splits the frame into sub-frames keyed by `String(row[column])`. */
  groupBy(column: string): GroupByResult {
    // A Map avoids collisions with Object.prototype members such as
    // "constructor" or "toString" when they occur as group keys.
    const buckets = new Map<string, DataFrameRow[]>();
    for (const row of this.data) {
      const key = String(row[column]);
      const bucket = buckets.get(key);
      if (bucket) {
        bucket.push(row);
      } else {
        buckets.set(key, [row]);
      }
    }
    // fromEntries defines own properties, so even "__proto__" is kept.
    return Object.fromEntries(
      Array.from(buckets, ([key, rows]) => [key, this.withRows(rows)] as const)
    );
  }

  /**
   * Applies one aggregation function per column to that column's
   * non-null, non-undefined values.
   * @throws Error when a named column does not exist.
   */
  agg(aggregations: Record<string, AggregationFunction>): DataFrameRow {
    const result: DataFrameRow = {};
    for (const [column, func] of Object.entries(aggregations)) {
      if (!this._columns.includes(column)) {
        throw new Error(`Column '${column}' not found`);
      }
      const values = this.getColumn(column).filter(val => val !== null && val !== undefined);
      result[column] = func(values);
    }
    return result;
  }

  // Statistical methods (only values of type number are considered)

  /** Arithmetic mean; `NaN` when the column has no numeric values. */
  mean(column: string): number {
    const values = this.numericValues(column);
    return values.reduce((sum, val) => sum + val, 0) / values.length;
  }

  /** Sum of the numeric values; `0` when there are none. */
  sum(column: string): number {
    return this.numericValues(column).reduce((sum, val) => sum + val, 0);
  }

  /** Smallest numeric value; `Infinity` when there are none. */
  min(column: string): number {
    // Folding instead of Math.min(...values) avoids exceeding the engine's
    // argument limit on large columns.
    return this.numericValues(column).reduce((lo, val) => Math.min(lo, val), Infinity);
  }

  /** Largest numeric value; `-Infinity` when there are none. */
  max(column: string): number {
    return this.numericValues(column).reduce((hi, val) => Math.max(hi, val), -Infinity);
  }

  /** Population standard deviation (divides by N). */
  std(column: string): number {
    const values = this.numericValues(column);
    const mean = values.reduce((sum, val) => sum + val, 0) / values.length;
    const variance =
      values.reduce((sum, val) => sum + (val - mean) ** 2, 0) / values.length;
    return Math.sqrt(variance);
  }

  // Time series specific methods

  /**
   * Aggregates rows into OHLCV bars per hour (`'1H'`) or per day (`'1D'`).
   * Buckets are derived from the local-time fields of `new Date(row[timeColumn])`.
   * @throws Error for any other frequency.
   */
  resample(timeColumn: string, frequency: string): DataFrame {
    const sorted = this.sort(timeColumn);
    switch (frequency) {
      case '1H':
        return this.resampleByHour(sorted, timeColumn);
      case '1D':
        return this.resampleByDay(sorted, timeColumn);
      default:
        throw new Error(`Unsupported frequency: ${frequency}`);
    }
  }

  private resampleByHour(sorted: DataFrame, timeColumn: string): DataFrame {
    return this.resampleBuckets(
      sorted,
      timeColumn,
      date =>
        `${date.getFullYear()}-${date.getMonth()}-${date.getDate()}-${date.getHours()}`
    );
  }

  private resampleByDay(sorted: DataFrame, timeColumn: string): DataFrame {
    return this.resampleBuckets(
      sorted,
      timeColumn,
      date => `${date.getFullYear()}-${date.getMonth()}-${date.getDate()}`
    );
  }

  /** Groups the sorted rows by `bucketKey` and emits one OHLCV row per bucket. */
  private resampleBuckets(
    sorted: DataFrame,
    timeColumn: string,
    bucketKey: (date: Date) => string
  ): DataFrame {
    const buckets = new Map<string, DataFrameRow[]>();
    for (const row of sorted.data) {
      const key = bucketKey(new Date(row[timeColumn]));
      const bucket = buckets.get(key);
      if (bucket) {
        bucket.push(row);
      } else {
        buckets.set(key, [row]);
      }
    }

    const aggregatedData: DataFrameRow[] = [];
    for (const rows of buckets.values()) {
      const first = rows[0];
      const last = rows[rows.length - 1];
      if (!first || !last) continue; // buckets are never empty; guard for the type checker
      const bucket = this.withRows(rows);
      aggregatedData.push({
        [timeColumn]: first[timeColumn],
        open: DataFrame.firstPresent(first, ['open', 'close', 'price']),
        high: bucket.extreme(['high', 'close', 'price'], Math.max),
        low: bucket.extreme(['low', 'close', 'price'], Math.min),
        close: DataFrame.firstPresent(last, ['close', 'price']),
        // sum() throws on a missing column, so test for presence first.
        volume: bucket._columns.includes('volume') ? bucket.sum('volume') : 0,
      });
    }
    return new DataFrame(aggregatedData);
  }

  /** First value among `keys` that is neither null nor undefined, else null. */
  private static firstPresent(row: DataFrameRow, keys: readonly string[]): any {
    for (const key of keys) {
      const value = row[key];
      if (value !== null && value !== undefined) {
        return value;
      }
    }
    return null;
  }

  /**
   * Folds `pick` over the first candidate column that exists and holds at
   * least one numeric value; `null` when no candidate qualifies.
   */
  private extreme(
    candidates: readonly string[],
    pick: (a: number, b: number) => number
  ): number | null {
    for (const column of candidates) {
      if (!this._columns.includes(column)) continue;
      const values = this.numericValues(column);
      if (values.length > 0) {
        return values.reduce((acc, val) => pick(acc, val));
      }
    }
    return null;
  }

  // Utility methods

  /** Independent copy (rows are shallow-copied by the constructor). */
  copy(): DataFrame {
    return this.withRows(this.data);
  }

  /** Rows of `other` appended to this frame; columns and dtypes are merged. */
  concat(other: DataFrame): DataFrame {
    return new DataFrame([...this.data, ...other.data], {
      columns: Array.from(new Set([...this._columns, ...other._columns])),
      index: this._index,
      dtypes: { ...this._dtypes, ...other._dtypes },
    });
  }

  /** Shallow copies of all rows. */
  toArray(): DataFrameRow[] {
    return this.data.map(row => ({ ...row }));
  }

  /** JSON text of the row array. */
  toJSON(): string {
    return JSON.stringify(this.data);
  }

  /** dtypes restricted to the given columns. */
  private filterDtypes(columns: string[]): Record<string, string> {
    const filtered: Record<string, string> = {};
    for (const col of columns) {
      const dtype = this._dtypes[col];
      if (dtype) {
        filtered[col] = dtype;
      }
    }
    return filtered;
  }

  // Display method

  /** Tab-separated preview of at most the first 10 rows. */
  toString(): string {
    if (this.empty) {
      return 'Empty DataFrame';
    }

    const maxRows = 10;
    const header = this._columns.join('\t');

    let result = `DataFrame (${this.length} rows x ${this._columns.length} columns)\n`;
    result += header + '\n';
    result += '-'.repeat(header.length) + '\n';

    for (const row of this.data.slice(0, maxRows)) {
      result += this._columns.map(col => String(row[col] ?? 'null')).join('\t') + '\n';
    }

    if (this.length > maxRows) {
      result += `... (${this.length - maxRows} more rows)\n`;
    }

    return result;
  }
}

// Factory functions

/** Convenience wrapper around `new DataFrame(data, options)`. */
export function createDataFrame(data: DataFrameRow[], options?: DataFrameOptions): DataFrame {
  return new DataFrame(data, options);
}

/**
 * Parses comma-separated text whose first line holds the column names.
 *
 * Fields are split on bare commas and trimmed; quoted fields containing
 * commas are not handled. Empty or missing cells become `null`; all other
 * cells stay strings unless `options.dtypes` converts them.
 *
 * @returns An empty frame when `csvData` is empty or whitespace only.
 */
export function readCSV(csvData: string, options?: DataFrameOptions): DataFrame {
  const text = csvData.trim();
  // ''.split() yields [''], so emptiness has to be tested on the text itself.
  if (text === '') {
    return new DataFrame([], options);
  }

  const [headerLine = '', ...recordLines] = text.split(/\r?\n/);
  const headers = headerLine.split(',').map(h => h.trim());

  const data = recordLines.map(line => {
    const values = line.split(',').map(v => v.trim());
    const row: DataFrameRow = {};
    headers.forEach((header, j) => {
      const cell = values[j];
      row[header] = cell === undefined || cell === '' ? null : cell;
    });
    return row;
  });

  return new DataFrame(data, { columns: headers, ...options });
}