// Listing metadata (from the file viewer this source was copied from):
// 485 lines, 14 KiB, TypeScript, no trailing end-of-line.
import { getLogger } from '@stock-bot/logger';
|
|
|
|
/**
 * A single record of a DataFrame: a plain object keyed by column name.
 *
 * Cell values are intentionally untyped (`any`) because a frame may mix
 * numbers, strings, booleans, dates and `null` for missing values.
 */
export interface DataFrameRow {
  [key: string]: any;
}
|
|
|
|
/**
 * Optional settings accepted when constructing a DataFrame.
 */
export interface DataFrameOptions {
  /** Name of the index column. */
  index?: string;
  /** Explicit column list (and order). */
  columns?: string[];
  /** Per-column target type; listed columns are converted to that type. */
  dtypes?: Record<string, 'number' | 'string' | 'boolean' | 'date'>;
}
|
|
|
|
/**
 * Result of a group-by operation: maps each group key to the DataFrame
 * holding that group's rows.
 */
export interface GroupByResult {
  [key: string]: DataFrame;
}
|
|
|
|
/**
 * A reducer that collapses the values of one column into a single result.
 */
export interface AggregationFunction {
  (values: any[]): any;
}
|
|
|
|
/**
 * A small in-memory table of rows (plain objects) with named columns.
 *
 * Every transforming method returns a new DataFrame; the receiver is never
 * modified. Rows are shallow-copied on construction, so neither the caller's
 * row objects nor rows shared with another frame are mutated by the
 * null-filling and dtype conversion performed here.
 */
export class DataFrame {
  private data: DataFrameRow[];
  private _columns: string[];
  private _index: string;
  private _dtypes: Record<string, 'number' | 'string' | 'boolean' | 'date'>;
  private logger = getLogger('dataframe');

  /**
   * @param data    Source rows. Each row is shallow-copied.
   * @param options Optional index name, explicit column list and dtypes.
   *                When `columns` is omitted the columns are the sorted union
   *                of all row keys.
   */
  constructor(data: DataFrameRow[] = [], options: DataFrameOptions = {}) {
    // Copy every row: validateAndCleanData() writes into the rows, and those
    // writes must not leak into the caller's objects or into sibling frames.
    this.data = data.map(row => ({ ...row }));
    this._index = options.index || 'index';
    this._columns = options.columns ? [...options.columns] : this.inferColumns();
    this._dtypes = { ...(options.dtypes || {}) };

    this.validateAndCleanData();
  }

  /** Builds a frame that shares this frame's index name (and, by default, columns and dtypes). */
  private derive(
    rows: DataFrameRow[],
    columns: string[] = this._columns,
    dtypes: Record<string, 'number' | 'string' | 'boolean' | 'date'> = this._dtypes
  ): DataFrame {
    return new DataFrame(rows, { columns, index: this._index, dtypes });
  }

  /** True for values that count as "missing": null, undefined and NaN. */
  private static isMissing(value: any): boolean {
    return value == null || (typeof value === 'number' && Number.isNaN(value));
  }

  /** Returns the sorted union of the keys of all rows. */
  private inferColumns(): string[] {
    if (this.data.length === 0) return [];

    const columns = new Set<string>();
    for (const row of this.data) {
      Object.keys(row).forEach(key => columns.add(key));
    }

    return Array.from(columns).sort();
  }

  /**
   * Ensures every row has a key for each column (missing ones become null)
   * and applies the configured dtype conversions to present, non-missing cells.
   */
  private validateAndCleanData(): void {
    if (this.data.length === 0) return;

    // Own-property check: `in` would also match inherited keys such as
    // "constructor" or "toString" and skip/convert the wrong thing.
    const hasOwn = (row: DataFrameRow, key: string): boolean =>
      Object.prototype.hasOwnProperty.call(row, key);
    const dtypeEntries = Object.entries(this._dtypes);

    for (const row of this.data) {
      // Add missing columns with null values
      for (const col of this._columns) {
        if (!hasOwn(row, col)) {
          row[col] = null;
        }
      }

      // Apply data type conversions; null AND undefined are left alone so
      // that a missing value never turns into "undefined" or NaN.
      for (const [col, dtype] of dtypeEntries) {
        if (hasOwn(row, col) && row[col] != null) {
          row[col] = this.convertValue(row[col], dtype);
        }
      }
    }
  }

  /** Converts a single non-missing cell to the requested dtype. */
  private convertValue(value: any, dtype: string): any {
    switch (dtype) {
      case 'number':
        return typeof value === 'number' ? value : parseFloat(value);
      case 'string':
        return String(value);
      case 'boolean':
        // Boolean("false") is true; textual input (e.g. from CSV) needs parsing.
        if (typeof value === 'string') {
          const text = value.trim().toLowerCase();
          return !(text === '' || text === 'false' || text === '0');
        }
        return Boolean(value);
      case 'date':
        return value instanceof Date ? value : new Date(value);
      default:
        return value;
    }
  }

  // Basic properties

  /** Copy of the column names, in frame order. */
  get columns(): string[] {
    return [...this._columns];
  }

  /** Name of the index column. */
  get index(): string {
    return this._index;
  }

  /** Number of rows. */
  get length(): number {
    return this.data.length;
  }

  /** `[rowCount, columnCount]`. */
  get shape(): [number, number] {
    return [this.data.length, this._columns.length];
  }

  /** True when the frame has no rows. */
  get empty(): boolean {
    return this.data.length === 0;
  }

  // Data access methods

  /** Returns the first `n` rows (a negative `n` drops that many rows from the end). */
  head(n: number = 5): DataFrame {
    return this.derive(this.data.slice(0, n));
  }

  /** Returns the last `n` rows (a negative `n` drops that many rows from the start). */
  tail(n: number = 5): DataFrame {
    // slice(-0) is slice(0) and would return every row, so 0 is special-cased.
    return this.derive(n === 0 ? [] : this.data.slice(-n));
  }

  /** Returns rows in the half-open positional range `[start, end)`. */
  iloc(start: number, end?: number): DataFrame {
    const slice = end !== undefined ? this.data.slice(start, end) : this.data.slice(start);
    return this.derive(slice);
  }

  /**
   * Returns the cell at row position `index` and column `column`.
   * @throws Error when `index` is not an integer inside `[0, length)`.
   */
  at(index: number, column: string): any {
    if (!Number.isInteger(index) || index < 0 || index >= this.data.length) {
      throw new Error(`Index ${index} out of bounds`);
    }
    return this.data[index][column];
  }

  // Column operations

  /** Returns a frame with only the requested columns; unknown names are ignored. */
  select(columns: string[]): DataFrame {
    const validColumns = columns.filter(col => this._columns.includes(col));
    const newData = this.data.map(row => {
      const newRow: DataFrameRow = {};
      for (const col of validColumns) {
        newRow[col] = row[col];
      }
      return newRow;
    });

    return this.derive(newData, validColumns, this.filterDtypes(validColumns));
  }

  /** Returns a frame without the listed columns. */
  drop(columns: string[]): DataFrame {
    const dropped = new Set(columns);
    return this.select(this._columns.filter(col => !dropped.has(col)));
  }

  /**
   * Returns the values of one column, in row order.
   * @throws Error when the column does not exist.
   */
  getColumn(column: string): any[] {
    if (!this._columns.includes(column)) {
      throw new Error(`Column '${column}' not found`);
    }
    return this.data.map(row => row[column]);
  }

  /**
   * Returns a frame where `column` is replaced by (or appended as) `values`.
   * @throws Error when `values.length` differs from the row count.
   */
  setColumn(column: string, values: any[]): DataFrame {
    if (values.length !== this.data.length) {
      throw new Error('Values length must match DataFrame length');
    }

    const newData = this.data.map((row, index) => ({
      ...row,
      [column]: values[index]
    }));

    const newColumns = this._columns.includes(column)
      ? this._columns
      : [...this._columns, column];

    return this.derive(newData, newColumns);
  }

  // Filtering

  /** Returns the rows for which `predicate(row, position)` is true. */
  filter(predicate: (row: DataFrameRow, index: number) => boolean): DataFrame {
    return this.derive(this.data.filter((row, index) => predicate(row, index)));
  }

  /**
   * Returns the rows whose `column` cell satisfies `cell <operator> value`.
   * `==` and `!=` use strict (in)equality.
   */
  where(column: string, operator: '>' | '<' | '>=' | '<=' | '==' | '!=', value: any): DataFrame {
    return this.filter(row => {
      const cellValue = row[column];
      switch (operator) {
        case '>': return cellValue > value;
        case '<': return cellValue < value;
        case '>=': return cellValue >= value;
        case '<=': return cellValue <= value;
        case '==': return cellValue === value;
        case '!=': return cellValue !== value;
        default: return false;
      }
    });
  }

  // Sorting

  /**
   * Returns the rows ordered by `column`.
   *
   * Missing values (null, undefined, NaN) are placed last in both
   * directions, and rows that compare equal keep their relative order.
   */
  sort(column: string, ascending: boolean = true): DataFrame {
    const direction = ascending ? 1 : -1;
    const sortedData = [...this.data].sort((a, b) => {
      const aVal = a[column];
      const bVal = b[column];

      const aMissing = DataFrame.isMissing(aVal);
      const bMissing = DataFrame.isMissing(bVal);
      if (aMissing || bMissing) {
        if (aMissing && bMissing) return 0;
        return aMissing ? 1 : -1;
      }

      // Test both directions explicitly so that equal-but-not-identical
      // values (e.g. two Date objects for the same instant) compare as 0.
      if (aVal < bVal) return -direction;
      if (aVal > bVal) return direction;
      return 0;
    });

    return this.derive(sortedData);
  }

  // Aggregation

  /**
   * Splits the frame into one DataFrame per distinct `String(row[column])`,
   * keyed in first-appearance order.
   */
  groupBy(column: string): GroupByResult {
    // A Map avoids clashes with Object.prototype members ("constructor",
    // "toString", ...) that a plain-object dictionary would suffer from.
    const groups = new Map<string, DataFrameRow[]>();

    for (const row of this.data) {
      const key = String(row[column]);
      const bucket = groups.get(key);
      if (bucket) {
        bucket.push(row);
      } else {
        groups.set(key, [row]);
      }
    }

    const result: GroupByResult = {};
    groups.forEach((rows, key) => {
      // defineProperty always creates an own property, even for "__proto__".
      Object.defineProperty(result, key, {
        value: this.derive(rows),
        enumerable: true,
        writable: true,
        configurable: true
      });
    });

    return result;
  }

  /**
   * Applies one aggregation function per column to that column's
   * non-null, non-undefined values.
   * @throws Error when a requested column does not exist.
   */
  agg(aggregations: Record<string, AggregationFunction>): DataFrameRow {
    const result: DataFrameRow = {};

    for (const [column, func] of Object.entries(aggregations)) {
      if (!this._columns.includes(column)) {
        throw new Error(`Column '${column}' not found`);
      }

      const values = this.getColumn(column).filter(val => val !== null && val !== undefined);
      result[column] = func(values);
    }

    return result;
  }

  // Statistical methods

  /** Values of `column` whose runtime type is `number`; throws if the column is unknown. */
  private numericValues(column: string): number[] {
    return this.getColumn(column).filter(val => typeof val === 'number');
  }

  /** Arithmetic mean of the numeric cells; NaN when there are none. */
  mean(column: string): number {
    const values = this.numericValues(column);
    return values.reduce((sum, val) => sum + val, 0) / values.length;
  }

  /** Sum of the numeric cells; 0 when there are none. */
  sum(column: string): number {
    return this.numericValues(column).reduce((sum, val) => sum + val, 0);
  }

  /** Smallest numeric cell; Infinity when there are none. */
  min(column: string): number {
    // Folded instead of Math.min(...values): spreading a very large column
    // into call arguments overflows the stack.
    return this.numericValues(column).reduce((lowest, val) => Math.min(lowest, val), Infinity);
  }

  /** Largest numeric cell; -Infinity when there are none. */
  max(column: string): number {
    return this.numericValues(column).reduce((highest, val) => Math.max(highest, val), -Infinity);
  }

  /** Population standard deviation of the numeric cells; NaN when there are none. */
  std(column: string): number {
    const values = this.numericValues(column);
    const mean = values.reduce((sum, val) => sum + val, 0) / values.length;
    const variance = values.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / values.length;
    return Math.sqrt(variance);
  }

  // Time series specific methods

  /**
   * Buckets rows by time and reduces each bucket to one OHLCV row.
   *
   * Supported frequencies are `'1H'` and `'1D'`; buckets are formed from the
   * local-time calendar fields of `new Date(row[timeColumn])`.
   * @throws Error for any other frequency.
   */
  resample(timeColumn: string, frequency: string): DataFrame {
    const sorted = this.sort(timeColumn);

    switch (frequency) {
      case '1H':
        return this.resampleByHour(sorted, timeColumn);
      case '1D':
        return this.resampleByDay(sorted, timeColumn);
      default:
        throw new Error(`Unsupported frequency: ${frequency}`);
    }
  }

  /** Resamples into one row per local calendar hour. */
  private resampleByHour(sorted: DataFrame, timeColumn: string): DataFrame {
    return this.resampleBy(
      sorted,
      timeColumn,
      date => `${date.getFullYear()}-${date.getMonth()}-${date.getDate()}-${date.getHours()}`
    );
  }

  /** Resamples into one row per local calendar day. */
  private resampleByDay(sorted: DataFrame, timeColumn: string): DataFrame {
    return this.resampleBy(
      sorted,
      timeColumn,
      date => `${date.getFullYear()}-${date.getMonth()}-${date.getDate()}`
    );
  }

  /**
   * Shared resampling core: groups the (already time-sorted) rows by
   * `bucketKey` and emits one OHLCV row per bucket, in bucket order.
   *
   * - time:   the first row's time value
   * - open:   first row's `open`, else `close`, else `price`
   * - high:   max of `high`, else of `close`, else of `price`
   * - low:    min of `low`, else of `close`, else of `price`
   * - close:  last row's `close`, else `price`
   * - volume: sum of numeric `volume` cells (0 when absent)
   */
  private resampleBy(
    sorted: DataFrame,
    timeColumn: string,
    bucketKey: (date: Date) => string
  ): DataFrame {
    const buckets = new Map<string, DataFrameRow[]>();

    for (const row of sorted.data) {
      const key = bucketKey(new Date(row[timeColumn]));
      const bucket = buckets.get(key);
      if (bucket) {
        bucket.push(row);
      } else {
        buckets.set(key, [row]);
      }
    }

    const aggregatedData: DataFrameRow[] = [];
    buckets.forEach(rows => {
      const first = rows[0];
      const last = rows[rows.length - 1];

      aggregatedData.push({
        [timeColumn]: first[timeColumn],
        // `??` rather than `||`: a legitimate value of 0 must not fall through.
        open: first.open ?? first.close ?? first.price,
        high: DataFrame.extremeOf(rows, ['high', 'close', 'price'], Math.max),
        low: DataFrame.extremeOf(rows, ['low', 'close', 'price'], Math.min),
        close: last.close ?? last.price,
        volume: rows.reduce(
          (total, row) => (typeof row.volume === 'number' ? total + row.volume : total),
          0
        )
      });
    });

    return new DataFrame(aggregatedData);
  }

  /**
   * Returns the extreme (per `pick`) of the first candidate column that holds
   * at least one numeric cell in `rows`, or null when none does. Unlike
   * calling max()/min() per column, an absent column is simply skipped.
   */
  private static extremeOf(
    rows: DataFrameRow[],
    candidates: string[],
    pick: (a: number, b: number) => number
  ): number | null {
    for (const column of candidates) {
      const numbers: number[] = rows
        .map(row => row[column])
        .filter(val => typeof val === 'number');
      if (numbers.length > 0) {
        return numbers.reduce((best, val) => pick(best, val));
      }
    }
    return null;
  }

  // Utility methods

  /** Returns an independent copy (rows and dtypes are shallow-copied). */
  copy(): DataFrame {
    return this.derive(this.data.map(row => ({ ...row })), this._columns, { ...this._dtypes });
  }

  /**
   * Appends the rows of `other`. Columns are the union of both frames;
   * on a dtype conflict `other`'s dtype wins. The index name is this frame's.
   */
  concat(other: DataFrame): DataFrame {
    const combinedData = [...this.data, ...other.data];
    const combinedColumns = Array.from(new Set([...this._columns, ...other._columns]));

    return this.derive(combinedData, combinedColumns, { ...this._dtypes, ...other._dtypes });
  }

  /** Returns shallow copies of all rows. */
  toArray(): DataFrameRow[] {
    return this.data.map(row => ({ ...row }));
  }

  /**
   * Returns the rows serialized as a JSON string.
   *
   * Note: because this returns a string rather than a plain value,
   * `JSON.stringify(frame)` yields a doubly encoded string. The return type
   * is kept for compatibility with existing callers.
   */
  toJSON(): string {
    return JSON.stringify(this.data);
  }

  /** Restricts the dtype map to the given columns. */
  private filterDtypes(columns: string[]): Record<string, 'number' | 'string' | 'boolean' | 'date'> {
    const filtered: Record<string, 'number' | 'string' | 'boolean' | 'date'> = {};
    for (const col of columns) {
      if (this._dtypes[col]) {
        filtered[col] = this._dtypes[col];
      }
    }
    return filtered;
  }

  // Display method

  /** Renders a tab-separated preview of at most the first 10 rows. */
  toString(): string {
    if (this.empty) {
      return 'Empty DataFrame';
    }

    const maxRows = 10;
    const displayData = this.data.slice(0, maxRows);
    const header = this._columns.join('\t');

    let result = `DataFrame (${this.length} rows x ${this._columns.length} columns)\n`;
    result += header + '\n';
    result += '-'.repeat(header.length) + '\n';

    for (const row of displayData) {
      const values = this._columns.map(col => String(row[col] ?? 'null'));
      result += values.join('\t') + '\n';
    }

    if (this.length > maxRows) {
      result += `... (${this.length - maxRows} more rows)\n`;
    }

    return result;
  }
}
|
|
|
|
// Factory functions
|
|
/**
 * Convenience factory equivalent to `new DataFrame(data, options)`.
 *
 * @param data    Source rows.
 * @param options Optional index name, column list and dtypes.
 * @returns A new DataFrame built from `data`.
 */
export function createDataFrame(data: DataFrameRow[], options?: DataFrameOptions): DataFrame {
  return new DataFrame(data, options);
}
|
|
|
|
/**
 * Parses comma-separated text into a DataFrame.
 *
 * The first non-blank line is the header. Blank lines are skipped, both LF
 * and CRLF line endings are accepted, and a leading byte-order mark is
 * ignored. Unquoted cells are trimmed; double-quoted cells may contain
 * commas and `""` escapes (quoted cells spanning several lines are not
 * supported). Empty cells, and cells missing from short rows, become null.
 * All other cells are strings unless `options.dtypes` converts them.
 *
 * @param csvData Raw CSV text.
 * @param options DataFrame options; `columns` defaults to the CSV header.
 * @returns The parsed frame, or an empty frame when there is no header line.
 */
export function readCSV(csvData: string, options?: DataFrameOptions): DataFrame {
  const lines = csvData
    .replace(/^\uFEFF/, '')
    .split(/\r?\n/)
    .filter(line => line.trim() !== '');

  // ''.split('\n') yields [''], so emptiness has to be decided after
  // discarding blank lines rather than from the raw split length.
  if (lines.length === 0) {
    return new DataFrame([], options);
  }

  const headers = splitCsvLine(lines[0]);
  const data: DataFrameRow[] = [];

  for (let i = 1; i < lines.length; i++) {
    const values = splitCsvLine(lines[i]);
    const row: DataFrameRow = {};

    for (let j = 0; j < headers.length; j++) {
      // `||` is intentional: both '' and a missing cell map to null.
      row[headers[j]] = values[j] || null;
    }

    data.push(row);
  }

  return new DataFrame(data, {
    columns: headers,
    ...options
  });
}

/**
 * Splits one CSV line into cells, honouring double-quoted cells.
 * Unquoted cells are trimmed; quoted cells keep their inner text verbatim.
 */
function splitCsvLine(line: string): string[] {
  const cells: string[] = [];
  let cell = '';
  let insideQuotes = false;
  let cellWasQuoted = false;

  for (let i = 0; i < line.length; i++) {
    const ch = line[i];

    if (insideQuotes) {
      if (ch !== '"') {
        cell += ch;
      } else if (line[i + 1] === '"') {
        // A doubled quote inside a quoted cell is a literal quote.
        cell += '"';
        i++;
      } else {
        insideQuotes = false;
      }
    } else if (ch === ',') {
      cells.push(cellWasQuoted ? cell : cell.trim());
      cell = '';
      cellWasQuoted = false;
    } else if (cellWasQuoted) {
      // Ignore stray characters (typically padding) after a closing quote.
    } else if (ch === '"' && cell.trim() === '') {
      // A quote only opens a quoted cell when it is the first non-blank character.
      insideQuotes = true;
      cellWasQuoted = true;
      cell = '';
    } else {
      cell += ch;
    }
  }

  cells.push(cellWasQuoted ? cell : cell.trim());
  return cells;
}