added initial py analytics / rust core / ts orchestrator services

This commit is contained in:
Boki 2025-07-01 11:16:25 -04:00
parent 680b5fd2ae
commit c862ed496b
62 changed files with 13459 additions and 0 deletions

View file

@@ -0,0 +1,410 @@
import numpy as np
import pandas as pd
from scipy import stats
from typing import Dict, List, Tuple, Optional
import logging
from dataclasses import dataclass
from sklearn.model_selection import TimeSeriesSplit
import warnings
logger = logging.getLogger(__name__)
@dataclass
class ValidationResult:
"""Results from statistical validation tests"""
is_overfit: bool
confidence_level: float
psr: float # Probabilistic Sharpe Ratio
dsr: float # Deflated Sharpe Ratio
monte_carlo_percentile: float
out_of_sample_degradation: float
statistical_significance: bool
warnings: List[str]
recommendations: List[str]
class StatisticalValidator:
"""
Statistical validation for backtesting results
Detects overfitting and validates strategy robustness
"""
def __init__(self, min_trades: int = 30, confidence_level: float = 0.95):
self.min_trades = min_trades
self.confidence_level = confidence_level
def validate_backtest(
self,
returns: np.ndarray,
trades: pd.DataFrame,
parameters: Dict,
market_returns: Optional[np.ndarray] = None
) -> ValidationResult:
"""
Comprehensive validation of backtest results
"""
warnings_list = []
recommendations = []
# Check minimum requirements
if len(trades) < self.min_trades:
warnings_list.append(f"Insufficient trades ({len(trades)} < {self.min_trades})")
recommendations.append("Extend backtest period or reduce trading filters")
# Calculate key metrics
sharpe = self.calculate_sharpe_ratio(returns)
# PSR/DSR operate on the per-period Sharpe ratio, so undo the annualization
periodic_sharpe = sharpe / np.sqrt(252)
psr = self.calculate_probabilistic_sharpe_ratio(periodic_sharpe, len(returns))
dsr = self.calculate_deflated_sharpe_ratio(
periodic_sharpe, len(returns), len(parameters)
)
# Monte Carlo analysis
mc_percentile = self.monte_carlo_test(returns, trades)
# Out-of-sample testing
oos_degradation = self.out_of_sample_test(returns, trades)
# Statistical significance tests
is_significant = self.test_statistical_significance(returns, market_returns)
# Overfitting detection
is_overfit = self.detect_overfitting(
psr, dsr, mc_percentile, oos_degradation, len(parameters)
)
# Generate recommendations
if dsr < 0.95:
recommendations.append("Reduce strategy complexity or increase sample size")
if mc_percentile < 0.95:
recommendations.append("Strategy may be exploiting random patterns")
if oos_degradation > 0.5:
recommendations.append("Consider walk-forward optimization")
return ValidationResult(
is_overfit=is_overfit,
confidence_level=1 - is_overfit * 0.5, # Simple confidence measure
psr=psr,
dsr=dsr,
monte_carlo_percentile=mc_percentile,
out_of_sample_degradation=oos_degradation,
statistical_significance=is_significant,
warnings=warnings_list,
recommendations=recommendations
)
def calculate_sharpe_ratio(self, returns: np.ndarray) -> float:
"""Calculate annualized Sharpe ratio"""
if len(returns) == 0:
return 0.0
# Assume daily returns
mean_return = np.mean(returns)
std_return = np.std(returns, ddof=1)
if std_return == 0:
return 0.0
# Annualize
sharpe = mean_return / std_return * np.sqrt(252)
return sharpe
def calculate_probabilistic_sharpe_ratio(
self,
sharpe: float,
num_observations: int
) -> float:
"""
Calculate Probabilistic Sharpe Ratio (PSR)
Adjusts for sample size and non-normality
"""
if num_observations < 2:
return 0.0
# Adjust for sample size
psr = stats.norm.cdf(
sharpe * np.sqrt(num_observations - 1) /
np.sqrt(1 + 0.5 * sharpe**2)
)
return psr
def calculate_deflated_sharpe_ratio(
self,
sharpe: float,
num_observations: int,
num_parameters: int,
num_trials: int = 1
) -> float:
"""
Calculate Deflated Sharpe Ratio (DSR)
Accounts for multiple testing and parameter optimization
"""
if num_observations < num_parameters + 2:
return 0.0
# Expected maximum Sharpe under null hypothesis
expected_max_sharpe = np.sqrt(2 * np.log(num_trials)) / np.sqrt(num_observations)
# Standard error of Sharpe ratio
se_sharpe = np.sqrt(
(1 + 0.5 * sharpe**2) / (num_observations - 1)
)
# Deflated Sharpe Ratio
dsr = (sharpe - expected_max_sharpe) / se_sharpe
# Convert to probability
return stats.norm.cdf(dsr)
def monte_carlo_test(
self,
returns: np.ndarray,
trades: pd.DataFrame,
num_simulations: int = 1000
) -> float:
"""
Monte Carlo permutation test
Tests if strategy is better than random
"""
original_sharpe = self.calculate_sharpe_ratio(returns)
# Generate random strategies
random_sharpes = []
for _ in range(num_simulations):
# Randomly flip return signs to simulate a zero-edge strategy;
# merely shuffling the order would leave the Sharpe ratio unchanged
random_returns = returns * np.random.choice([-1, 1], size=len(returns))
random_sharpe = self.calculate_sharpe_ratio(random_returns)
random_sharpes.append(random_sharpe)
# Calculate percentile
percentile = np.sum(original_sharpe > np.array(random_sharpes)) / num_simulations
return percentile
def out_of_sample_test(
self,
returns: np.ndarray,
trades: pd.DataFrame,
test_size: float = 0.3
) -> float:
"""
Test performance degradation out-of-sample
"""
if len(returns) < 100: # Need sufficient data
return 0.0
# Split data
split_point = int(len(returns) * (1 - test_size))
in_sample_returns = returns[:split_point]
out_sample_returns = returns[split_point:]
# Calculate Sharpe ratios
is_sharpe = self.calculate_sharpe_ratio(in_sample_returns)
oos_sharpe = self.calculate_sharpe_ratio(out_sample_returns)
# Calculate degradation
if is_sharpe > 0:
degradation = max(0, 1 - oos_sharpe / is_sharpe)
else:
degradation = 1.0
return degradation
def test_statistical_significance(
self,
strategy_returns: np.ndarray,
market_returns: Optional[np.ndarray] = None
) -> bool:
"""
Test if returns are statistically significant
"""
# Test against zero returns
t_stat, p_value = stats.ttest_1samp(strategy_returns, 0)
if p_value < (1 - self.confidence_level):
return True
# If market returns provided, test for alpha
if market_returns is not None and len(market_returns) == len(strategy_returns):
excess_returns = strategy_returns - market_returns
t_stat, p_value = stats.ttest_1samp(excess_returns, 0)
return p_value < (1 - self.confidence_level)
return False
def detect_overfitting(
self,
psr: float,
dsr: float,
mc_percentile: float,
oos_degradation: float,
num_parameters: int
) -> bool:
"""
Detect potential overfitting based on multiple criteria
"""
overfitting_score = 0
# Check PSR
if psr < 0.95:
overfitting_score += 1
# Check DSR
if dsr < 0.95:
overfitting_score += 2 # More weight on DSR
# Check Monte Carlo
if mc_percentile < 0.95:
overfitting_score += 1
# Check out-of-sample degradation
if oos_degradation > 0.5:
overfitting_score += 2
# Check parameter count
if num_parameters > 10:
overfitting_score += 1
# Decision threshold
return overfitting_score >= 3
def walk_forward_analysis(
self,
data: pd.DataFrame,
strategy_func,
window_size: int,
step_size: int,
num_windows: int = 5
) -> Dict:
"""
Perform walk-forward analysis
"""
results = {
'in_sample_sharpes': [],
'out_sample_sharpes': [],
'parameters': [],
'stability_score': 0
}
tscv = TimeSeriesSplit(n_splits=num_windows)
for train_idx, test_idx in tscv.split(data):
train_data = data.iloc[train_idx]
test_data = data.iloc[test_idx]
# Optimize on training data
best_params = self.optimize_parameters(train_data, strategy_func)
results['parameters'].append(best_params)
# Test on out-of-sample data
is_returns = strategy_func(train_data, best_params)
oos_returns = strategy_func(test_data, best_params)
is_sharpe = self.calculate_sharpe_ratio(is_returns)
oos_sharpe = self.calculate_sharpe_ratio(oos_returns)
results['in_sample_sharpes'].append(is_sharpe)
results['out_sample_sharpes'].append(oos_sharpe)
# Calculate stability score
param_stability = self.calculate_parameter_stability(results['parameters'])
performance_stability = 1 - np.std(results['out_sample_sharpes']) / (np.mean(results['out_sample_sharpes']) + 1e-6)
results['stability_score'] = (param_stability + performance_stability) / 2
return results
def calculate_parameter_stability(self, parameters_list: List[Dict]) -> float:
"""
Calculate how stable parameters are across different periods
"""
if len(parameters_list) < 2:
return 1.0
# Convert to DataFrame for easier analysis
params_df = pd.DataFrame(parameters_list)
# Calculate coefficient of variation for each parameter
stabilities = []
for col in params_df.columns:
if params_df[col].dtype in [np.float64, np.int64]:
mean_val = params_df[col].mean()
std_val = params_df[col].std()
if mean_val != 0:
cv = std_val / abs(mean_val)
stability = 1 / (1 + cv) # Convert to 0-1 scale
stabilities.append(stability)
return np.mean(stabilities) if stabilities else 0.5
def optimize_parameters(self, data: pd.DataFrame, strategy_func) -> Dict:
"""
Placeholder for parameter optimization
In practice, this would use grid search, Bayesian optimization, etc.
"""
# Simple example - would be replaced with actual optimization
return {'param1': 20, 'param2': 2.0}
def bootstrap_confidence_intervals(
self,
returns: np.ndarray,
metric_func,
confidence_level: float = 0.95,
num_samples: int = 1000
) -> Tuple[float, float, float]:
"""
Calculate bootstrap confidence intervals for any metric
"""
bootstrap_metrics = []
for _ in range(num_samples):
# Resample with replacement
sample_returns = np.random.choice(returns, size=len(returns), replace=True)
metric = metric_func(sample_returns)
bootstrap_metrics.append(metric)
# Calculate percentiles
lower_percentile = (1 - confidence_level) / 2
upper_percentile = 1 - lower_percentile
lower_bound = np.percentile(bootstrap_metrics, lower_percentile * 100)
upper_bound = np.percentile(bootstrap_metrics, upper_percentile * 100)
point_estimate = metric_func(returns)
return lower_bound, point_estimate, upper_bound
def generate_report(self, validation_result: ValidationResult) -> str:
"""
Generate human-readable validation report
"""
report = f"""
Statistical Validation Report
============================
Overall Assessment: {'PASSED' if not validation_result.is_overfit else 'FAILED'}
Confidence Level: {validation_result.confidence_level:.1%}
Key Metrics:
-----------
Probabilistic Sharpe Ratio (PSR): {validation_result.psr:.3f}
Deflated Sharpe Ratio (DSR): {validation_result.dsr:.3f}
Monte Carlo Percentile: {validation_result.monte_carlo_percentile:.1%}
Out-of-Sample Degradation: {validation_result.out_of_sample_degradation:.1%}
Statistical Significance: {'Yes' if validation_result.statistical_significance else 'No'}
Warnings:
---------
"""
for warning in validation_result.warnings:
report += f"- {warning}\n"
report += """
Recommendations:
---------------
"""
for rec in validation_result.recommendations:
report += f"- {rec}\n"
return report
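A minimal usage sketch for the validator above. The import path is hypothetical (the commit does not show the package layout), and the returns and trades are synthetic stand-ins for real backtest output:

import numpy as np
import pandas as pd

# Hypothetical import path; adjust to wherever this module lives in the package
from statistical_validator import StatisticalValidator

# Synthetic daily returns and a dummy trade log
rng = np.random.default_rng(0)
returns = rng.normal(0.0004, 0.01, 500)
trades = pd.DataFrame({"pnl": rng.normal(5.0, 20.0, 120)})

validator = StatisticalValidator(min_trades=30, confidence_level=0.95)
result = validator.validate_backtest(
    returns=returns,
    trades=trades,
    parameters={"lookback": 20, "entry_z": 2.0},
)
print(validator.generate_report(result))

# Bootstrap confidence interval for the Sharpe ratio of the same series
low, point, high = validator.bootstrap_confidence_intervals(
    returns, validator.calculate_sharpe_ratio
)
print(f"Sharpe 95% CI: [{low:.2f}, {point:.2f}, {high:.2f}]")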

View file

@@ -0,0 +1,217 @@
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple
import logging
logger = logging.getLogger(__name__)
class PerformanceAnalyzer:
"""
Comprehensive performance analysis for trading strategies and portfolios
"""
def __init__(self, risk_free_rate: float = 0.02):
self.risk_free_rate = risk_free_rate
def calculate_metrics(
self,
portfolio_id: str,
start_date: datetime,
end_date: datetime
) -> Dict:
"""
Calculate comprehensive performance metrics
"""
# In real implementation, would fetch data from database
# For now, generate sample data
returns = self._generate_sample_returns(start_date, end_date)
metrics = {
'total_return': self._calculate_total_return(returns),
'annualized_return': self._calculate_annualized_return(returns),
'volatility': self._calculate_volatility(returns),
'sharpe_ratio': self._calculate_sharpe_ratio(returns),
'sortino_ratio': self._calculate_sortino_ratio(returns),
'max_drawdown': self._calculate_max_drawdown(returns),
'calmar_ratio': self._calculate_calmar_ratio(returns),
'win_rate': self._calculate_win_rate(returns),
'profit_factor': self._calculate_profit_factor(returns),
'avg_win': np.mean(returns[returns > 0]) if any(returns > 0) else 0,
'avg_loss': np.mean(returns[returns < 0]) if any(returns < 0) else 0,
'total_trades': len(returns),
'best_day': np.max(returns),
'worst_day': np.min(returns),
'skewness': self._calculate_skewness(returns),
'kurtosis': self._calculate_kurtosis(returns)
}
return metrics
def calculate_risk_metrics(
self,
portfolio_id: str,
window: int = 252,
confidence_levels: List[float] = [0.95, 0.99]
) -> Dict:
"""
Calculate risk metrics including VaR and CVaR
"""
# Generate sample returns
returns = self._generate_sample_returns(
datetime.now() - timedelta(days=window),
datetime.now()
)
risk_metrics = {
'volatility': self._calculate_volatility(returns),
'downside_deviation': self._calculate_downside_deviation(returns),
'beta': self._calculate_beta(returns), # Would need market returns
'tracking_error': 0.0, # Placeholder
}
# Calculate VaR and CVaR for each confidence level
for confidence in confidence_levels:
var = self._calculate_var(returns, confidence)
cvar = self._calculate_cvar(returns, confidence)
risk_metrics[f'var_{int(confidence*100)}'] = var
risk_metrics[f'cvar_{int(confidence*100)}'] = cvar
return risk_metrics
def analyze_backtest(self, backtest_id: str) -> Dict:
"""
Analyze backtest results
"""
# In real implementation, would fetch backtest data
# For now, return comprehensive mock analysis
return {
'metrics': {
'total_return': 0.156,
'sharpe_ratio': 1.45,
'max_drawdown': 0.087,
'win_rate': 0.58,
'profit_factor': 1.78
},
'statistics': {
'total_trades': 245,
'winning_trades': 142,
'losing_trades': 103,
'avg_holding_period': 3.5,
'max_consecutive_wins': 8,
'max_consecutive_losses': 5
},
'risk_analysis': {
'var_95': 0.024,
'cvar_95': 0.031,
'downside_deviation': 0.018,
'ulcer_index': 0.045
},
'trade_analysis': {
'best_trade': 0.087,
'worst_trade': -0.043,
'avg_win': 0.023,
'avg_loss': -0.015,
'largest_winner': 0.087,
'largest_loser': -0.043
}
}
# Helper methods
def _generate_sample_returns(self, start_date: datetime, end_date: datetime) -> np.ndarray:
"""Generate sample returns for testing"""
days = (end_date - start_date).days
# Generate returns with realistic properties
returns = np.random.normal(0.0005, 0.02, days)
# Add some autocorrelation
for i in range(1, len(returns)):
returns[i] = 0.1 * returns[i-1] + 0.9 * returns[i]
return returns
def _calculate_total_return(self, returns: np.ndarray) -> float:
"""Calculate total cumulative return"""
return np.prod(1 + returns) - 1
def _calculate_annualized_return(self, returns: np.ndarray) -> float:
"""Calculate annualized return"""
total_return = self._calculate_total_return(returns)
years = len(returns) / 252
return (1 + total_return) ** (1 / years) - 1
def _calculate_volatility(self, returns: np.ndarray) -> float:
"""Calculate annualized volatility"""
return np.std(returns) * np.sqrt(252)
def _calculate_sharpe_ratio(self, returns: np.ndarray) -> float:
"""Calculate Sharpe ratio"""
excess_returns = returns - self.risk_free_rate / 252
return np.mean(excess_returns) / np.std(excess_returns) * np.sqrt(252)
def _calculate_sortino_ratio(self, returns: np.ndarray) -> float:
"""Calculate Sortino ratio"""
excess_returns = returns - self.risk_free_rate / 252
downside_returns = excess_returns[excess_returns < 0]
downside_std = np.std(downside_returns) if len(downside_returns) > 0 else 1e-6
return np.mean(excess_returns) / downside_std * np.sqrt(252)
def _calculate_max_drawdown(self, returns: np.ndarray) -> float:
"""Calculate maximum drawdown"""
cumulative = (1 + returns).cumprod()
running_max = np.maximum.accumulate(cumulative)
drawdown = (cumulative - running_max) / running_max
return np.min(drawdown)
def _calculate_calmar_ratio(self, returns: np.ndarray) -> float:
"""Calculate Calmar ratio"""
annual_return = self._calculate_annualized_return(returns)
max_dd = abs(self._calculate_max_drawdown(returns))
return annual_return / max_dd if max_dd > 0 else 0
def _calculate_win_rate(self, returns: np.ndarray) -> float:
"""Calculate win rate"""
return np.sum(returns > 0) / len(returns) if len(returns) > 0 else 0
def _calculate_profit_factor(self, returns: np.ndarray) -> float:
"""Calculate profit factor"""
gains = returns[returns > 0]
losses = returns[returns < 0]
total_gains = np.sum(gains) if len(gains) > 0 else 0
total_losses = abs(np.sum(losses)) if len(losses) > 0 else 1e-6
return total_gains / total_losses
def _calculate_downside_deviation(self, returns: np.ndarray, mar: float = 0) -> float:
"""Calculate downside deviation"""
downside_returns = returns[returns < mar]
return np.std(downside_returns) * np.sqrt(252) if len(downside_returns) > 0 else 0
def _calculate_var(self, returns: np.ndarray, confidence: float) -> float:
"""Calculate Value at Risk"""
return np.percentile(returns, (1 - confidence) * 100)
def _calculate_cvar(self, returns: np.ndarray, confidence: float) -> float:
"""Calculate Conditional Value at Risk"""
var = self._calculate_var(returns, confidence)
return np.mean(returns[returns <= var])
def _calculate_beta(self, returns: np.ndarray, market_returns: Optional[np.ndarray] = None) -> float:
"""Calculate beta relative to market"""
if market_returns is None:
# Generate mock market returns
market_returns = np.random.normal(0.0003, 0.015, len(returns))
covariance = np.cov(returns, market_returns)[0, 1]
market_variance = np.var(market_returns)
return covariance / market_variance if market_variance > 0 else 1.0
def _calculate_skewness(self, returns: np.ndarray) -> float:
"""Calculate skewness of returns"""
mean = np.mean(returns)
std = np.std(returns)
return np.mean(((returns - mean) / std) ** 3) if std > 0 else 0
def _calculate_kurtosis(self, returns: np.ndarray) -> float:
"""Calculate kurtosis of returns"""
mean = np.mean(returns)
std = np.std(returns)
return np.mean(((returns - mean) / std) ** 4) - 3 if std > 0 else 0
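A short sketch of driving the analyzer above, assuming it is importable as `performance` (hypothetical path). The data source is the internally generated sample returns, so nothing external is needed:

import numpy as np
from datetime import datetime, timedelta

# Hypothetical import path for the class defined above
from performance import PerformanceAnalyzer

analyzer = PerformanceAnalyzer(risk_free_rate=0.02)

# Portfolio-level metrics over the last year (sample data is generated internally for now)
metrics = analyzer.calculate_metrics(
    portfolio_id="demo-portfolio",
    start_date=datetime.now() - timedelta(days=365),
    end_date=datetime.now(),
)
print(f"Sharpe: {metrics['sharpe_ratio']:.2f}, max drawdown: {metrics['max_drawdown']:.1%}")

# The helper methods also work on any daily return series
returns = np.random.normal(0.0005, 0.02, 252)
print("Sortino:", analyzer._calculate_sortino_ratio(returns))
print("VaR 95%:", analyzer._calculate_var(returns, 0.95))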

View file

@@ -0,0 +1,284 @@
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from typing import Dict, List, Tuple
from scipy import stats
from sklearn.mixture import GaussianMixture
import logging
logger = logging.getLogger(__name__)
class RegimeDetector:
"""
Market regime detection using various statistical and ML methods
"""
def __init__(self):
self.regimes = ['bull', 'bear', 'sideways', 'high_volatility', 'low_volatility']
def detect_current_regime(self, lookback_days: int = 60) -> Dict:
"""
Detect current market regime using multiple indicators
"""
# In real implementation, would fetch market data
# For now, generate sample data
market_data = self._generate_market_data(lookback_days)
# Calculate various regime indicators
trend_regime = self._detect_trend_regime(market_data)
volatility_regime = self._detect_volatility_regime(market_data)
momentum_regime = self._detect_momentum_regime(market_data)
# Combine indicators for final regime
regime, confidence = self._combine_regime_indicators(
trend_regime,
volatility_regime,
momentum_regime
)
return {
'regime': regime,
'confidence': confidence,
'indicators': {
'trend': trend_regime,
'volatility': volatility_regime,
'momentum': momentum_regime,
'market_breadth': self._calculate_market_breadth(market_data),
'fear_greed_index': self._calculate_fear_greed_index(market_data)
},
'sub_regimes': {
'trend_strength': self._calculate_trend_strength(market_data),
'volatility_percentile': self._calculate_volatility_percentile(market_data),
'correlation_regime': self._detect_correlation_regime(market_data)
}
}
def _generate_market_data(self, days: int) -> pd.DataFrame:
"""Generate sample market data for testing"""
dates = pd.date_range(end=datetime.now(), periods=days, freq='D')
# Generate correlated returns for multiple assets
n_assets = 10
returns = np.random.multivariate_normal(
mean=[0.0005] * n_assets,
cov=np.eye(n_assets) * 0.0004 + np.ones((n_assets, n_assets)) * 0.0001,
size=days
)
# Create price series
prices = pd.DataFrame(
(1 + returns).cumprod(axis=0) * 100,
index=dates,
columns=[f'Asset_{i}' for i in range(n_assets)]
)
# Add market index
prices['Market'] = prices.mean(axis=1)
# Add volatility index (like VIX); align on the date index so the column is not all-NaN
prices['Volatility'] = pd.Series(returns[:, 0], index=dates).rolling(20).std() * np.sqrt(252) * 100
return prices
def _detect_trend_regime(self, data: pd.DataFrame) -> Dict:
"""Detect trend regime using moving averages and linear regression"""
market = data['Market']
# Calculate moving averages
ma_short = market.rolling(20).mean()
ma_long = market.rolling(50).mean()
# Trend strength
current_price = market.iloc[-1]
trend_score = (current_price - ma_long.iloc[-1]) / ma_long.iloc[-1]
# Linear regression trend
x = np.arange(len(market))
slope, _, r_value, _, _ = stats.linregress(x, market.values)
# Determine regime
if trend_score > 0.05 and ma_short.iloc[-1] > ma_long.iloc[-1]:
regime = 'bull'
elif trend_score < -0.05 and ma_short.iloc[-1] < ma_long.iloc[-1]:
regime = 'bear'
else:
regime = 'sideways'
return {
'regime': regime,
'trend_score': trend_score,
'slope': slope,
'r_squared': r_value ** 2
}
def _detect_volatility_regime(self, data: pd.DataFrame) -> Dict:
"""Detect volatility regime using GARCH-like analysis"""
returns = data['Market'].pct_change().dropna()
# Calculate rolling volatility
vol_short = returns.rolling(10).std() * np.sqrt(252)
vol_long = returns.rolling(30).std() * np.sqrt(252)
current_vol = vol_short.iloc[-1]
vol_percentile = stats.percentileofscore(vol_long.dropna(), current_vol)
# Volatility regime
if vol_percentile > 75:
regime = 'high_volatility'
elif vol_percentile < 25:
regime = 'low_volatility'
else:
regime = 'normal_volatility'
# Volatility of volatility
vol_of_vol = vol_short.rolling(20).std().iloc[-1]
return {
'regime': regime,
'current_volatility': current_vol,
'volatility_percentile': vol_percentile,
'vol_of_vol': vol_of_vol
}
def _detect_momentum_regime(self, data: pd.DataFrame) -> Dict:
"""Detect momentum regime using RSI and rate of change"""
market = data['Market']
# Calculate RSI
rsi = self._calculate_rsi(market, period=14)
# Rate of change
roc_short = (market.iloc[-1] / market.iloc[-5] - 1) * 100
roc_long = (market.iloc[-1] / market.iloc[-20] - 1) * 100
# Momentum regime
if rsi > 70 and roc_short > 0:
regime = 'overbought'
elif rsi < 30 and roc_short < 0:
regime = 'oversold'
elif roc_short > 2 and roc_long > 5:
regime = 'strong_momentum'
elif roc_short < -2 and roc_long < -5:
regime = 'weak_momentum'
else:
regime = 'neutral_momentum'
return {
'regime': regime,
'rsi': rsi,
'roc_short': roc_short,
'roc_long': roc_long
}
def _detect_correlation_regime(self, data: pd.DataFrame) -> str:
"""Detect correlation regime among assets"""
# Correlation over the most recent 30-day window
asset_returns = data.iloc[:, :-2].pct_change().dropna()
corr_matrix = asset_returns.tail(30).corr()
# Average pairwise correlation, excluding the diagonal of ones
n_assets = len(asset_returns.columns)
current_avg_corr = (corr_matrix.values.sum() - n_assets) / (n_assets * (n_assets - 1))
if current_avg_corr > 0.7:
return 'high_correlation'
elif current_avg_corr < 0.3:
return 'low_correlation'
else:
return 'normal_correlation'
def _calculate_rsi(self, prices: pd.Series, period: int = 14) -> float:
"""Calculate RSI"""
delta = prices.diff()
gain = (delta.where(delta > 0, 0)).rolling(period).mean()
loss = (-delta.where(delta < 0, 0)).rolling(period).mean()
rs = gain / loss
rsi = 100 - (100 / (1 + rs))
return rsi.iloc[-1]
def _calculate_market_breadth(self, data: pd.DataFrame) -> float:
"""Calculate market breadth (advance/decline ratio)"""
# Calculate daily returns for all assets
returns = data.iloc[:, :-2].pct_change().iloc[-1]
advancing = (returns > 0).sum()
declining = (returns < 0).sum()
return advancing / (advancing + declining) if (advancing + declining) > 0 else 0.5
def _calculate_fear_greed_index(self, data: pd.DataFrame) -> float:
"""Simplified fear & greed index"""
# Combine multiple indicators
volatility = data['Volatility'].iloc[-1]
momentum = self._detect_momentum_regime(data)['roc_short']
breadth = self._calculate_market_breadth(data)
# Normalize and combine
vol_score = 1 - min(volatility / 40, 1) # Lower vol = higher greed
momentum_score = (momentum + 10) / 20 # Normalize to 0-1
fear_greed = (vol_score + momentum_score + breadth) / 3
return fear_greed * 100 # 0 = extreme fear, 100 = extreme greed
def _calculate_trend_strength(self, data: pd.DataFrame) -> float:
"""Calculate trend strength using ADX-like indicator"""
market = data['Market']
# Calculate directional movement
high = market.rolling(2).max()
low = market.rolling(2).min()
plus_dm = (high - high.shift(1)).where(lambda x: x > 0, 0)
minus_dm = (low.shift(1) - low).where(lambda x: x > 0, 0)
# Smooth and normalize
period = 14
plus_di = plus_dm.rolling(period).mean() / market.rolling(period).std()
minus_di = minus_dm.rolling(period).mean() / market.rolling(period).std()
# Calculate trend strength
dx = abs(plus_di - minus_di) / (plus_di + minus_di)
adx = dx.rolling(period).mean().iloc[-1]
return min(adx * 100, 100) if not np.isnan(adx) else 50
def _calculate_volatility_percentile(self, data: pd.DataFrame) -> float:
"""Calculate current volatility percentile"""
volatility_regime = self._detect_volatility_regime(data)
return volatility_regime['volatility_percentile']
def _combine_regime_indicators(
self,
trend: Dict,
volatility: Dict,
momentum: Dict
) -> Tuple[str, float]:
"""Combine multiple indicators to determine overall regime"""
# Simple weighted combination
regimes = []
weights = []
# Trend regime
if trend['regime'] in ['bull', 'bear']:
regimes.append(trend['regime'])
weights.append(abs(trend['trend_score']) * 10)
# Volatility regime
if volatility['regime'] == 'high_volatility':
regimes.append('high_volatility')
weights.append(volatility['volatility_percentile'] / 100)
# Choose dominant regime
if not regimes:
return 'sideways', 0.5
# Weight by confidence
dominant_idx = np.argmax(weights)
regime = regimes[dominant_idx]
confidence = min(weights[dominant_idx], 1.0)
return regime, confidence
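A minimal sketch of querying the detector above, assuming it is importable as `regime` (hypothetical path); the sample market data is generated internally:

# Hypothetical import path for the class defined above
from regime import RegimeDetector

detector = RegimeDetector()
snapshot = detector.detect_current_regime(lookback_days=60)

print(f"Regime: {snapshot['regime']} (confidence {snapshot['confidence']:.2f})")
print("Trend:", snapshot['indicators']['trend']['regime'])
print("Volatility:", snapshot['indicators']['volatility']['regime'])
print("Fear/Greed:", round(snapshot['indicators']['fear_greed_index'], 1))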

View file

@@ -0,0 +1,79 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
import logging
from typing import Dict, Any
from .endpoints import optimization, analytics, models
from ..analytics.performance import PerformanceAnalyzer
from ..analytics.regime import RegimeDetector
from ..optimization.portfolio_optimizer import PortfolioOptimizer
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Global instances
performance_analyzer = PerformanceAnalyzer()
regime_detector = RegimeDetector()
portfolio_optimizer = PortfolioOptimizer()
@asynccontextmanager
async def lifespan(app: FastAPI):
# Startup
logger.info("Starting Trading Analytics Service...")
# Initialize connections, load models, etc.
yield
# Shutdown
logger.info("Shutting down Trading Analytics Service...")
# Create FastAPI app
app = FastAPI(
title="Trading Analytics Service",
description="Complex analytics, optimization, and ML inference for trading",
version="0.1.0",
lifespan=lifespan
)
# Configure CORS
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Configure appropriately for production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routers
app.include_router(optimization.router, prefix="/optimize", tags=["optimization"])
app.include_router(analytics.router, prefix="/analytics", tags=["analytics"])
app.include_router(models.router, prefix="/models", tags=["models"])
@app.get("/")
async def root():
return {
"service": "Trading Analytics",
"status": "operational",
"version": "0.1.0"
}
@app.get("/health")
async def health_check():
return {
"status": "healthy",
"components": {
"performance_analyzer": "operational",
"regime_detector": "operational",
"portfolio_optimizer": "operational"
}
}
# Dependency injection
def get_performance_analyzer():
return performance_analyzer
def get_regime_detector():
return regime_detector
def get_portfolio_optimizer():
return portfolio_optimizer
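A sketch of smoke-testing the service in-process with FastAPI's TestClient. The package path `analytics_service.api.app` is an assumption inferred from the relative imports; adjust it to the real layout:

from fastapi.testclient import TestClient

# Hypothetical package path implied by the relative imports above
# (analytics_service/api/app.py, analytics_service/analytics/..., etc.)
from analytics_service.api.app import app

client = TestClient(app)

# Exercise the root and health endpoints defined above
assert client.get("/").json()["status"] == "operational"
print(client.get("/health").json())

# In production the same app object would be served by uvicorn, e.g.
#   uvicorn analytics_service.api.app:app --host 0.0.0.0 --port 8000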

View file

@@ -0,0 +1,163 @@
from fastapi import APIRouter, HTTPException, Query, Depends
from datetime import datetime, date
from typing import List, Optional
import pandas as pd
import numpy as np
from ...analytics.performance import PerformanceAnalyzer
from ...analytics.regime import RegimeDetector
from ..app import get_performance_analyzer, get_regime_detector
router = APIRouter()
@router.get("/performance/{portfolio_id}")
async def get_performance_metrics(
portfolio_id: str,
start_date: datetime = Query(..., description="Start date for analysis"),
end_date: datetime = Query(..., description="End date for analysis"),
analyzer: PerformanceAnalyzer = Depends(get_performance_analyzer)
):
"""
Calculate comprehensive performance metrics for a portfolio
"""
try:
# In real implementation, would fetch data from database
# For now, using mock data
metrics = analyzer.calculate_metrics(
portfolio_id=portfolio_id,
start_date=start_date,
end_date=end_date
)
return metrics
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to calculate performance metrics: {str(e)}")
@router.get("/risk/{portfolio_id}")
async def get_risk_metrics(
portfolio_id: str,
window: int = Query(252, description="Rolling window for risk calculations"),
analyzer: PerformanceAnalyzer = Depends(get_performance_analyzer)
):
"""
Calculate risk metrics including VaR and CVaR
"""
try:
risk_metrics = analyzer.calculate_risk_metrics(
portfolio_id=portfolio_id,
window=window
)
return risk_metrics
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to calculate risk metrics: {str(e)}")
@router.get("/regime")
async def detect_market_regime(
lookback_days: int = Query(60, description="Days to look back for regime detection"),
detector: RegimeDetector = Depends(get_regime_detector)
):
"""
Detect current market regime using various indicators
"""
try:
regime = detector.detect_current_regime(lookback_days=lookback_days)
return {
"regime": regime['regime'],
"confidence": regime['confidence'],
"indicators": regime['indicators'],
"timestamp": datetime.utcnow().isoformat()
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to detect market regime: {str(e)}")
@router.post("/correlation")
async def calculate_correlation_matrix(
symbols: List[str],
start_date: Optional[date] = None,
end_date: Optional[date] = None,
method: str = Query("pearson", pattern="^(pearson|spearman|kendall)$")
):
"""
Calculate correlation matrix for given symbols
"""
try:
# In real implementation, would fetch price data
# For now, return mock correlation matrix
n = len(symbols)
# Generate realistic correlation matrix
np.random.seed(42)
A = np.random.randn(n, n)
correlation_matrix = np.dot(A, A.T)
# Normalize to correlation
D = np.sqrt(np.diag(np.diag(correlation_matrix)))
correlation_matrix = np.linalg.inv(D) @ correlation_matrix @ np.linalg.inv(D)
return {
"symbols": symbols,
"matrix": correlation_matrix.tolist(),
"method": method
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to calculate correlation: {str(e)}")
@router.get("/backtest/{backtest_id}")
async def analyze_backtest_results(
backtest_id: str,
analyzer: PerformanceAnalyzer = Depends(get_performance_analyzer)
):
"""
Analyze results from a completed backtest
"""
try:
analysis = analyzer.analyze_backtest(backtest_id)
return {
"backtest_id": backtest_id,
"metrics": analysis['metrics'],
"statistics": analysis['statistics'],
"risk_analysis": analysis['risk_analysis'],
"trade_analysis": analysis['trade_analysis']
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to analyze backtest: {str(e)}")
@router.post("/attribution")
async def performance_attribution(
portfolio_id: str,
benchmark: str,
start_date: date,
end_date: date,
method: str = Query("brinson", pattern="^(brinson|factor|risk)$")
):
"""
Perform performance attribution analysis
"""
try:
# Placeholder for attribution analysis
return {
"portfolio_id": portfolio_id,
"benchmark": benchmark,
"period": {
"start": start_date.isoformat(),
"end": end_date.isoformat()
},
"method": method,
"attribution": {
"allocation_effect": 0.0023,
"selection_effect": 0.0045,
"interaction_effect": 0.0001,
"total_effect": 0.0069
}
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to perform attribution: {str(e)}")
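A hedged example of calling the regime and correlation routes with the in-process test client (same hypothetical `analytics_service.api.app` path as above). Note that `symbols` is the request body for the correlation route, since a bare `List[str]` parameter is read from the body by FastAPI:

from fastapi.testclient import TestClient

# Hypothetical package path for the app that mounts this router at /analytics
from analytics_service.api.app import app

client = TestClient(app)

# Current market regime (lookback window is a query parameter)
regime = client.get("/analytics/regime", params={"lookback_days": 90}).json()
print(regime["regime"], regime["confidence"])

# Mock correlation matrix for a basket of symbols
resp = client.post(
    "/analytics/correlation",
    params={"method": "spearman"},
    json=["AAPL", "MSFT", "GOOG"],
)
print(resp.json()["matrix"])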

View file

@@ -0,0 +1,182 @@
from fastapi import APIRouter, HTTPException, UploadFile, File
from pydantic import BaseModel
from typing import Dict, Any, List, Optional
import numpy as np
import onnxruntime as ort
import json
import logging
logger = logging.getLogger(__name__)
router = APIRouter()
# In-memory model storage (in production, use proper model registry)
loaded_models = {}
class PredictionRequest(BaseModel):
model_id: str
features: Dict[str, float]
class PredictionResponse(BaseModel):
model_id: str
prediction: float
probability: Optional[Dict[str, float]] = None
metadata: Optional[Dict[str, Any]] = None
class ModelInfo(BaseModel):
model_id: str
name: str
version: str
type: str
input_features: List[str]
output_shape: List[Any]  # per-output shapes; dynamic dimensions may be None or symbolic
metadata: Dict[str, Any]
@router.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
"""
Run inference on a loaded model
"""
try:
if request.model_id not in loaded_models:
raise HTTPException(status_code=404, detail=f"Model {request.model_id} not found")
model_info = loaded_models[request.model_id]
session = model_info['session']
# Prepare input
input_features = model_info['input_features']
input_array = np.array([[request.features.get(f, 0.0) for f in input_features]], dtype=np.float32)
# Run inference
input_name = session.get_inputs()[0].name
output = session.run(None, {input_name: input_array})
# Process output
raw_output = output[0][0]
probability = None
if model_info['type'] == 'classification' and np.ndim(raw_output) > 0 and len(raw_output) > 1:
# Multi-class output: report per-class probabilities and the argmax as the prediction
probability = {
f"class_{i}": float(p)
for i, p in enumerate(raw_output)
}
prediction = float(np.argmax(raw_output))
else:
prediction = float(raw_output)
return PredictionResponse(
model_id=request.model_id,
prediction=prediction,
probability=probability,
metadata={
"model_version": model_info['version'],
"timestamp": np.datetime64('now').tolist()
}
)
except HTTPException:
# Let 404s and other deliberate HTTP errors propagate unchanged
raise
except Exception as e:
logger.error(f"Prediction failed: {str(e)}")
raise HTTPException(status_code=500, detail=f"Prediction failed: {str(e)}")
@router.post("/load")
async def load_model(
model_id: str,
model_file: UploadFile = File(...),
metadata: str = None
):
"""
Load an ONNX model for inference
"""
try:
# Read model file
content = await model_file.read()
# Create ONNX session
session = ort.InferenceSession(content)
# Parse metadata
model_metadata = json.loads(metadata) if metadata else {}
# Extract model info
input_features = [inp.name for inp in session.get_inputs()]
output_shape = [out.shape for out in session.get_outputs()]
# Store model
loaded_models[model_id] = {
'session': session,
'input_features': model_metadata.get('feature_names', input_features),
'type': model_metadata.get('model_type', 'regression'),
'version': model_metadata.get('version', '1.0'),
'metadata': model_metadata
}
return {
"message": f"Model {model_id} loaded successfully",
"input_features": input_features,
"output_shape": output_shape
}
except Exception as e:
logger.error(f"Failed to load model: {str(e)}")
raise HTTPException(status_code=500, detail=f"Failed to load model: {str(e)}")
@router.get("/list", response_model=List[ModelInfo])
async def list_models():
"""
List all loaded models
"""
models = []
for model_id, info in loaded_models.items():
session = info['session']
models.append(ModelInfo(
model_id=model_id,
name=info['metadata'].get('name', model_id),
version=info['version'],
type=info['type'],
input_features=info['input_features'],
output_shape=[out.shape for out in session.get_outputs()],
metadata=info['metadata']
))
return models
@router.delete("/{model_id}")
async def unload_model(model_id: str):
"""
Unload a model from memory
"""
if model_id not in loaded_models:
raise HTTPException(status_code=404, detail=f"Model {model_id} not found")
del loaded_models[model_id]
return {"message": f"Model {model_id} unloaded successfully"}
@router.post("/batch_predict")
async def batch_predict(
model_id: str,
features: List[Dict[str, float]]
):
"""
Run batch predictions
"""
try:
if model_id not in loaded_models:
raise HTTPException(status_code=404, detail=f"Model {model_id} not found")
predictions = []
for feature_set in features:
request = PredictionRequest(model_id=model_id, features=feature_set)
result = await predict(request)
predictions.append(result.dict())
return {
"model_id": model_id,
"predictions": predictions,
"count": len(predictions)
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Batch prediction failed: {str(e)}")
raise HTTPException(status_code=500, detail=f"Batch prediction failed: {str(e)}")
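A sketch of exporting a toy scikit-learn model to ONNX and serving it through the router above. skl2onnx and the `analytics_service.api.app` path are assumptions, and the feature names in the metadata are illustrative:

import json
import numpy as np
from fastapi.testclient import TestClient
from sklearn.linear_model import LinearRegression
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Hypothetical package path for the app that mounts this router at /models
from analytics_service.api.app import app

# Train a toy regression model and export it to ONNX
X = np.random.randn(200, 3).astype(np.float32)
y = X @ np.array([0.5, -0.2, 0.1], dtype=np.float32)
onnx_model = convert_sklearn(
    LinearRegression().fit(X, y),
    initial_types=[("float_input", FloatTensorType([None, 3]))],
)

client = TestClient(app)

# Load the model, supplying feature names so /predict can order the inputs
client.post(
    "/models/load",
    params={
        "model_id": "toy-lr",
        "metadata": json.dumps({
            "feature_names": ["momentum", "volatility", "rsi"],
            "model_type": "regression",
            "version": "0.1",
        }),
    },
    files={"model_file": ("model.onnx", onnx_model.SerializeToString())},
)

# Single prediction against the loaded model
resp = client.post("/models/predict", json={
    "model_id": "toy-lr",
    "features": {"momentum": 0.02, "volatility": 0.15, "rsi": 55.0},
})
print(resp.json())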

View file

@@ -0,0 +1,120 @@
from fastapi import APIRouter, HTTPException, Depends
from pydantic import BaseModel, Field
from typing import List, Optional, Dict
import numpy as np
from ...optimization.portfolio_optimizer import PortfolioOptimizer
from ..app import get_portfolio_optimizer
router = APIRouter()
class OptimizationConstraints(BaseModel):
min_weight: Optional[float] = Field(0.0, ge=0.0, le=1.0)
max_weight: Optional[float] = Field(1.0, ge=0.0, le=1.0)
target_return: Optional[float] = None
max_risk: Optional[float] = None
class PortfolioOptimizationRequest(BaseModel):
symbols: List[str]
returns: List[List[float]]
constraints: Optional[OptimizationConstraints] = None
method: str = Field("mean_variance", pattern="^(mean_variance|min_variance|max_sharpe|risk_parity|black_litterman)$")
class PortfolioWeights(BaseModel):
symbols: List[str]
weights: List[float]
expected_return: float
expected_risk: float
sharpe_ratio: float
@router.post("/portfolio", response_model=PortfolioWeights)
async def optimize_portfolio(
request: PortfolioOptimizationRequest,
optimizer: PortfolioOptimizer = Depends(get_portfolio_optimizer)
):
"""
Optimize portfolio weights using various methods
"""
try:
# Convert returns to numpy array
returns_array = np.array(request.returns)
# Validate dimensions
if len(request.symbols) != returns_array.shape[1]:
raise HTTPException(
status_code=400,
detail="Number of symbols must match number of return columns"
)
# Run optimization
result = optimizer.optimize(
returns=returns_array,
method=request.method,
constraints=request.constraints.dict(exclude_none=True) if request.constraints else None  # drop unset fields so the optimizer only sees real constraints
)
return PortfolioWeights(
symbols=request.symbols,
weights=result['weights'].tolist(),
expected_return=float(result['expected_return']),
expected_risk=float(result['expected_risk']),
sharpe_ratio=float(result['sharpe_ratio'])
)
except HTTPException:
raise
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
raise HTTPException(status_code=500, detail=f"Optimization failed: {str(e)}")
@router.post("/efficient_frontier")
async def calculate_efficient_frontier(
request: PortfolioOptimizationRequest,
num_portfolios: int = 100,
optimizer: PortfolioOptimizer = Depends(get_portfolio_optimizer)
):
"""
Calculate the efficient frontier for a set of assets
"""
try:
returns_array = np.array(request.returns)
frontier = optimizer.calculate_efficient_frontier(
returns=returns_array,
num_portfolios=num_portfolios
)
return {
"symbols": request.symbols,
"frontier": frontier
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to calculate efficient frontier: {str(e)}")
@router.post("/rebalance")
async def suggest_rebalance(
current_weights: Dict[str, float],
target_weights: Dict[str, float],
constraints: Optional[Dict[str, float]] = None
):
"""
Suggest trades to rebalance portfolio from current to target weights
"""
try:
# Calculate differences
trades = {}
for symbol in target_weights:
current = current_weights.get(symbol, 0.0)
target = target_weights[symbol]
diff = target - current
if abs(diff) > 0.001: # Ignore tiny differences
trades[symbol] = diff
return {
"trades": trades,
"total_turnover": sum(abs(t) for t in trades.values())
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"Rebalance calculation failed: {str(e)}")
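A hedged sketch of a portfolio-optimization request against the router above (same hypothetical app path). The `returns` payload is a list of daily return rows, one column per symbol:

import numpy as np
from fastapi.testclient import TestClient

# Hypothetical package path for the app that mounts this router at /optimize
from analytics_service.api.app import app

client = TestClient(app)

# 250 days of synthetic daily returns for three assets (rows = days, columns = symbols)
rng = np.random.default_rng(7)
returns = rng.normal(0.0005, 0.01, size=(250, 3)).tolist()

resp = client.post("/optimize/portfolio", json={
    "symbols": ["AAA", "BBB", "CCC"],
    "returns": returns,
    "method": "mean_variance",
    "constraints": {"min_weight": 0.0, "max_weight": 0.6},
})
print(resp.json())  # weights, expected_return, expected_risk, sharpe_ratio

# Rebalance suggestion: both weight dicts are body parameters, embedded by name
resp = client.post("/optimize/rebalance", json={
    "current_weights": {"AAA": 0.5, "BBB": 0.3, "CCC": 0.2},
    "target_weights": {"AAA": 0.4, "BBB": 0.4, "CCC": 0.2},
})
print(resp.json()["total_turnover"])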

View file

@@ -0,0 +1,481 @@
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional, Union
import talib
from scipy import stats
from sklearn.preprocessing import StandardScaler, RobustScaler
import logging
logger = logging.getLogger(__name__)
class FeatureEngineer:
"""
Feature engineering for financial ML models
"""
def __init__(self, lookback_periods: List[int] = None):
self.lookback_periods = lookback_periods or [5, 10, 20, 50, 100, 200]
self.scaler = RobustScaler() # Robust to outliers
self.feature_names: List[str] = []
def create_features(
self,
data: pd.DataFrame,
include_technical: bool = True,
include_microstructure: bool = True,
include_fundamental: bool = False,
include_sentiment: bool = False
) -> pd.DataFrame:
"""
Create comprehensive feature set for ML models
"""
features = pd.DataFrame(index=data.index)
# Price-based features
logger.info("Creating price-based features...")
price_features = self._create_price_features(data)
features = pd.concat([features, price_features], axis=1)
# Technical indicators
if include_technical:
logger.info("Creating technical indicators...")
tech_features = self._create_technical_features(data)
features = pd.concat([features, tech_features], axis=1)
# Microstructure features
if include_microstructure:
logger.info("Creating microstructure features...")
micro_features = self._create_microstructure_features(data)
features = pd.concat([features, micro_features], axis=1)
# Fundamental features (if available)
if include_fundamental and 'earnings' in data.columns:
logger.info("Creating fundamental features...")
fund_features = self._create_fundamental_features(data)
features = pd.concat([features, fund_features], axis=1)
# Sentiment features (if available)
if include_sentiment and 'sentiment' in data.columns:
logger.info("Creating sentiment features...")
sent_features = self._create_sentiment_features(data)
features = pd.concat([features, sent_features], axis=1)
# Time-based features
logger.info("Creating time-based features...")
time_features = self._create_time_features(data)
features = pd.concat([features, time_features], axis=1)
# Cross-sectional features (if multiple symbols)
if 'symbol' in data.columns and data['symbol'].nunique() > 1:
logger.info("Creating cross-sectional features...")
cross_features = self._create_cross_sectional_features(data)
features = pd.concat([features, cross_features], axis=1)
# Store feature names
self.feature_names = features.columns.tolist()
# Handle missing values
features = self._handle_missing_values(features)
return features
def _create_price_features(self, data: pd.DataFrame) -> pd.DataFrame:
"""Create price-based features"""
features = pd.DataFrame(index=data.index)
# Returns at different horizons
for period in self.lookback_periods:
features[f'returns_{period}'] = data['close'].pct_change(period)
features[f'log_returns_{period}'] = np.log(data['close'] / data['close'].shift(period))
# Price ratios
features['high_low_ratio'] = data['high'] / data['low']
features['close_open_ratio'] = data['close'] / data['open']
# Price position in range
features['price_position'] = (data['close'] - data['low']) / (data['high'] - data['low']).replace(0, np.nan)
# Volume-weighted metrics
if 'volume' in data.columns:
features['vwap'] = (data['close'] * data['volume']).rolling(20).sum() / data['volume'].rolling(20).sum()
features['volume_ratio'] = data['volume'] / data['volume'].rolling(20).mean()
features['dollar_volume'] = data['close'] * data['volume']
# Volatility measures
for period in [5, 20, 50]:
features[f'volatility_{period}'] = data['close'].pct_change().rolling(period).std() * np.sqrt(252)
features[f'realized_var_{period}'] = (data['close'].pct_change() ** 2).rolling(period).sum()
# Price momentum
features['momentum_1m'] = data['close'] / data['close'].shift(20) - 1
features['momentum_3m'] = data['close'] / data['close'].shift(60) - 1
features['momentum_6m'] = data['close'] / data['close'].shift(120) - 1
# Relative strength
for short, long in [(10, 30), (20, 50), (50, 200)]:
features[f'rs_{short}_{long}'] = (
data['close'].rolling(short).mean() /
data['close'].rolling(long).mean()
)
return features
def _create_technical_features(self, data: pd.DataFrame) -> pd.DataFrame:
"""Create technical indicator features"""
features = pd.DataFrame(index=data.index)
# Moving averages
for period in self.lookback_periods:
sma = talib.SMA(data['close'].values, timeperiod=period)
ema = talib.EMA(data['close'].values, timeperiod=period)
features[f'sma_{period}'] = sma
features[f'ema_{period}'] = ema
features[f'price_to_sma_{period}'] = data['close'] / sma
# Bollinger Bands
for period in [20, 50]:
upper, middle, lower = talib.BBANDS(
data['close'].values,
timeperiod=period,
nbdevup=2,
nbdevdn=2
)
features[f'bb_upper_{period}'] = upper
features[f'bb_lower_{period}'] = lower
features[f'bb_width_{period}'] = (upper - lower) / middle
features[f'bb_position_{period}'] = (data['close'] - lower) / (upper - lower)
# RSI
for period in [14, 28]:
features[f'rsi_{period}'] = talib.RSI(data['close'].values, timeperiod=period)
# MACD
macd, signal, hist = talib.MACD(data['close'].values)
features['macd'] = macd
features['macd_signal'] = signal
features['macd_hist'] = hist
# Stochastic
slowk, slowd = talib.STOCH(
data['high'].values,
data['low'].values,
data['close'].values
)
features['stoch_k'] = slowk
features['stoch_d'] = slowd
# ADX (Average Directional Index)
features['adx'] = talib.ADX(
data['high'].values,
data['low'].values,
data['close'].values
)
# ATR (Average True Range)
for period in [14, 20]:
features[f'atr_{period}'] = talib.ATR(
data['high'].values,
data['low'].values,
data['close'].values,
timeperiod=period
)
# CCI (Commodity Channel Index)
features['cci'] = talib.CCI(
data['high'].values,
data['low'].values,
data['close'].values
)
# Williams %R
features['williams_r'] = talib.WILLR(
data['high'].values,
data['low'].values,
data['close'].values
)
# OBV (On Balance Volume)
if 'volume' in data.columns:
features['obv'] = talib.OBV(data['close'].values, data['volume'].values)
features['obv_ema'] = talib.EMA(features['obv'].values, timeperiod=20)
return features
def _create_microstructure_features(self, data: pd.DataFrame) -> pd.DataFrame:
"""Create market microstructure features"""
features = pd.DataFrame(index=data.index)
# Spread estimation (using high-low)
features['hl_spread'] = 2 * (data['high'] - data['low']) / (data['high'] + data['low'])
features['hl_spread_ma'] = features['hl_spread'].rolling(20).mean()
# Roll's implied spread
if len(data) > 2:
returns = data['close'].pct_change()
features['roll_spread'] = 2 * np.sqrt(-returns.rolling(20).cov(returns.shift(1)))
# Amihud illiquidity
if 'volume' in data.columns:
features['amihud'] = (returns.abs() / (data['volume'] * data['close'])).rolling(20).mean() * 1e6
features['log_amihud'] = np.log(features['amihud'].replace(0, np.nan) + 1e-10)
# Kyle's lambda (price impact)
if 'volume' in data.columns:
# Simplified version using rolling regression
for period in [20, 50]:
price_changes = data['close'].pct_change()
signed_volume = data['volume'] * np.sign(price_changes)
# Rolling correlation as proxy for Kyle's lambda
features[f'kyle_lambda_{period}'] = (
price_changes.rolling(period).corr(signed_volume) *
price_changes.rolling(period).std() /
signed_volume.rolling(period).std()
)
# Intraday patterns
if 'timestamp' in data.columns:
data['hour'] = pd.to_datetime(data['timestamp']).dt.hour
data['minute'] = pd.to_datetime(data['timestamp']).dt.minute
# Time since market open (assuming 9:30 AM open)
features['minutes_since_open'] = (data['hour'] - 9) * 60 + data['minute'] - 30
features['minutes_to_close'] = 390 - features['minutes_since_open'] # 6.5 hour day
# Normalized time of day
features['time_of_day_norm'] = features['minutes_since_open'] / 390
# Order flow imbalance proxy
features['high_low_imbalance'] = (data['high'] - data['close']) / (data['close'] - data['low'] + 1e-10)
features['close_position_in_range'] = (data['close'] - data['low']) / (data['high'] - data['low'] + 1e-10)
return features
def _create_fundamental_features(self, data: pd.DataFrame) -> pd.DataFrame:
"""Create fundamental analysis features"""
features = pd.DataFrame(index=data.index)
# Price to earnings
if 'earnings' in data.columns:
features['pe_ratio'] = data['close'] / data['earnings']
features['earnings_yield'] = data['earnings'] / data['close']
features['pe_relative'] = features['pe_ratio'] / features['pe_ratio'].rolling(252).mean()
# Price to book
if 'book_value' in data.columns:
features['pb_ratio'] = data['close'] / data['book_value']
features['pb_relative'] = features['pb_ratio'] / features['pb_ratio'].rolling(252).mean()
# Dividend yield
if 'dividends' in data.columns:
features['dividend_yield'] = data['dividends'].rolling(252).sum() / data['close']
features['dividend_growth'] = data['dividends'].pct_change(252)
# Sales/Revenue metrics
if 'revenue' in data.columns:
features['price_to_sales'] = data['close'] * data['shares_outstanding'] / data['revenue']
features['revenue_growth'] = data['revenue'].pct_change(4) # YoY for quarterly
# Profitability metrics
if 'net_income' in data.columns and 'total_assets' in data.columns:
features['roe'] = data['net_income'] / data['shareholders_equity']
features['roa'] = data['net_income'] / data['total_assets']
features['profit_margin'] = data['net_income'] / data['revenue']
return features
def _create_sentiment_features(self, data: pd.DataFrame) -> pd.DataFrame:
"""Create sentiment-based features"""
features = pd.DataFrame(index=data.index)
if 'sentiment' in data.columns:
# Raw sentiment
features['sentiment'] = data['sentiment']
features['sentiment_ma'] = data['sentiment'].rolling(20).mean()
features['sentiment_std'] = data['sentiment'].rolling(20).std()
# Sentiment momentum
features['sentiment_change'] = data['sentiment'].pct_change(5)
features['sentiment_momentum'] = data['sentiment'] - data['sentiment'].shift(20)
# Sentiment extremes
features['sentiment_zscore'] = (
(data['sentiment'] - features['sentiment_ma']) /
features['sentiment_std']
)
# Sentiment divergence from price
price_zscore = (data['close'] - data['close'].rolling(20).mean()) / data['close'].rolling(20).std()
features['sentiment_price_divergence'] = features['sentiment_zscore'] - price_zscore
# News volume features
if 'news_count' in data.columns:
features['news_volume'] = data['news_count']
features['news_volume_ma'] = data['news_count'].rolling(5).mean()
features['news_spike'] = data['news_count'] / features['news_volume_ma']
# Social media features
if 'twitter_mentions' in data.columns:
features['social_volume'] = data['twitter_mentions']
features['social_momentum'] = data['twitter_mentions'].pct_change(1)
features['social_vs_avg'] = data['twitter_mentions'] / data['twitter_mentions'].rolling(20).mean()
return features
def _create_time_features(self, data: pd.DataFrame) -> pd.DataFrame:
"""Create time-based features"""
features = pd.DataFrame(index=data.index)
if 'timestamp' in data.columns:
timestamps = pd.to_datetime(data['timestamp'])
# Day of week
features['day_of_week'] = timestamps.dt.dayofweek
features['is_monday'] = (features['day_of_week'] == 0).astype(int)
features['is_friday'] = (features['day_of_week'] == 4).astype(int)
# Month
features['month'] = timestamps.dt.month
features['is_quarter_end'] = timestamps.dt.month.isin([3, 6, 9, 12]).astype(int)
features['is_year_end'] = timestamps.dt.month.eq(12).astype(int)
# Trading day in month
features['trading_day_of_month'] = timestamps.dt.day
features['trading_day_of_year'] = timestamps.dt.dayofyear
# Seasonality features
features['sin_day_of_year'] = np.sin(2 * np.pi * features['trading_day_of_year'] / 365)
features['cos_day_of_year'] = np.cos(2 * np.pi * features['trading_day_of_year'] / 365)
# Options expiration week (third Friday)
features['is_opex_week'] = self._is_options_expiration_week(timestamps)
# Fed meeting weeks (approximate)
features['is_fed_week'] = self._is_fed_meeting_week(timestamps)
return features
def _create_cross_sectional_features(self, data: pd.DataFrame) -> pd.DataFrame:
"""Create features comparing across multiple symbols"""
features = pd.DataFrame(index=data.index)
# Calculate market averages
market_returns = data.groupby('timestamp')['close'].mean().pct_change()
market_volume = data.groupby('timestamp')['volume'].mean()
# Relative performance
data['returns'] = data.groupby('symbol')['close'].pct_change()
features['relative_returns'] = data['returns'] - market_returns[data['timestamp']].values
features['relative_volume'] = data['volume'] / market_volume[data['timestamp']].values
# Sector/market correlation
for period in [20, 50]:
rolling_corr = data.groupby('symbol')['returns'].rolling(period).corr(market_returns)
features[f'market_correlation_{period}'] = rolling_corr
# Cross-sectional momentum
features['cross_sectional_rank'] = data.groupby('timestamp')['returns'].rank(pct=True)
return features
def _handle_missing_values(self, features: pd.DataFrame) -> pd.DataFrame:
"""Handle missing and non-finite values in features"""
# Replace infinities first so they do not distort the medians below
features = features.replace([np.inf, -np.inf], np.nan)
# Forward fill for small gaps
features = features.ffill(limit=5)
# For remaining NaNs, use the median of the non-missing values
for col in features.columns:
if features[col].isna().any():
median_val = features[col].median()
features[col] = features[col].fillna(median_val)
# Anything still missing (e.g. an all-NaN column) falls back to zero
features = features.fillna(0)
return features
def _is_options_expiration_week(self, timestamps: pd.Series) -> pd.Series:
"""Identify options expiration weeks (third Friday of month)"""
# This is a simplified version
is_third_week = (timestamps.dt.day >= 15) & (timestamps.dt.day <= 21)
is_friday = timestamps.dt.dayofweek == 4
return (is_third_week & is_friday).astype(int)
def _is_fed_meeting_week(self, timestamps: pd.Series) -> pd.Series:
"""Identify approximate Fed meeting weeks"""
# Fed typically meets 8 times per year, roughly every 6 weeks
# This is a simplified approximation
week_of_year = timestamps.dt.isocalendar().week
return (week_of_year % 6 == 0).astype(int)
def transform_features(
self,
features: pd.DataFrame,
method: str = 'robust',
clip_outliers: bool = True,
clip_quantile: float = 0.01
) -> pd.DataFrame:
"""
Transform features for ML models
"""
transformed = features.copy()
# Clip outliers if requested
if clip_outliers:
lower = features.quantile(clip_quantile)
upper = features.quantile(1 - clip_quantile)
transformed = features.clip(lower=lower, upper=upper, axis=1)
# Scale features
if method == 'robust':
scaler = RobustScaler()
elif method == 'standard':
scaler = StandardScaler()
else:
raise ValueError(f"Unknown scaling method: {method}")
scaled_values = scaler.fit_transform(transformed)
transformed = pd.DataFrame(
scaled_values,
index=features.index,
columns=features.columns
)
self.scaler = scaler
return transformed
def get_feature_importance(
self,
features: pd.DataFrame,
target: pd.Series,
method: str = 'mutual_info'
) -> pd.DataFrame:
"""
Calculate feature importance scores
"""
importance_scores = {}
if method == 'mutual_info':
from sklearn.feature_selection import mutual_info_regression
scores = mutual_info_regression(features, target)
importance_scores['mutual_info'] = scores
elif method == 'correlation':
scores = features.corrwith(target).abs()
importance_scores['correlation'] = scores.values
elif method == 'random_forest':
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(features, target)
importance_scores['rf_importance'] = rf.feature_importances_
# Create DataFrame with results
importance_df = pd.DataFrame(
importance_scores,
index=features.columns
).sort_values(by=list(importance_scores.keys())[0], ascending=False)
return importance_df
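A brief end-to-end sketch of the feature pipeline, assuming the module is importable as `feature_engineering` (hypothetical) and that the TA-Lib extension is installed. Synthetic OHLCV bars stand in for real market data:

import numpy as np
import pandas as pd

# Hypothetical import path for the class defined above; requires the talib C extension
from feature_engineering import FeatureEngineer

# Synthetic daily OHLCV bars
n = 300
rng = np.random.default_rng(1)
close = 100 * np.cumprod(1 + rng.normal(0.0005, 0.01, n))
data = pd.DataFrame({
    "timestamp": pd.date_range("2024-01-01", periods=n, freq="D"),
    "open": close * (1 + rng.normal(0, 0.002, n)),
    "high": close * (1 + np.abs(rng.normal(0, 0.005, n))),
    "low": close * (1 - np.abs(rng.normal(0, 0.005, n))),
    "close": close,
    "volume": rng.integers(1_000_000, 5_000_000, n).astype(float),
})

engineer = FeatureEngineer(lookback_periods=[5, 20, 50])
features = engineer.create_features(data, include_technical=True, include_microstructure=True)
scaled = engineer.transform_features(features, method="robust")

# Rank features against a simple next-day-return target
target = data["close"].pct_change().shift(-1).fillna(0)
print(engineer.get_feature_importance(scaled, target, method="correlation").head(10))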

View file

@@ -0,0 +1,354 @@
import numpy as np
import pandas as pd
import cvxpy as cp
from typing import Dict, List, Optional, Tuple
import logging
logger = logging.getLogger(__name__)
class PortfolioOptimizer:
"""
Portfolio optimization using various methods
"""
def __init__(self, risk_free_rate: float = 0.02):
self.risk_free_rate = risk_free_rate
def optimize(
self,
returns: np.ndarray,
method: str = 'mean_variance',
constraints: Optional[Dict] = None
) -> Dict:
"""
Optimize portfolio weights using specified method
"""
if method == 'mean_variance':
return self._mean_variance_optimization(returns, constraints)
elif method == 'min_variance':
return self._minimum_variance_optimization(returns, constraints)
elif method == 'max_sharpe':
return self._maximum_sharpe_optimization(returns, constraints)
elif method == 'risk_parity':
return self._risk_parity_optimization(returns)
elif method == 'black_litterman':
return self._black_litterman_optimization(returns, constraints)
else:
raise ValueError(f"Unknown optimization method: {method}")
def _mean_variance_optimization(
self,
returns: np.ndarray,
constraints: Optional[Dict] = None
) -> Dict:
"""
Classical Markowitz mean-variance optimization
"""
n_assets = returns.shape[1]
# Calculate expected returns and covariance
expected_returns = np.mean(returns, axis=0)
cov_matrix = np.cov(returns.T)
# Add small value to diagonal for numerical stability
cov_matrix += np.eye(n_assets) * 1e-6
# Define optimization variables
weights = cp.Variable(n_assets)
# Define objective (maximize return - lambda * risk)
risk_aversion = 2.0 # Can be parameterized
portfolio_return = expected_returns @ weights
portfolio_risk = cp.quad_form(weights, cov_matrix)
objective = cp.Maximize(portfolio_return - risk_aversion * portfolio_risk)
# Define constraints
constraints_list = [
cp.sum(weights) == 1, # Weights sum to 1
weights >= 0, # No short selling (can be relaxed)
]
# Add custom constraints
if constraints:
if 'min_weight' in constraints:
constraints_list.append(weights >= constraints['min_weight'])
if 'max_weight' in constraints:
constraints_list.append(weights <= constraints['max_weight'])
if 'target_return' in constraints:
constraints_list.append(portfolio_return >= constraints['target_return'])
if 'max_risk' in constraints:
max_variance = constraints['max_risk'] ** 2
constraints_list.append(portfolio_risk <= max_variance)
# Solve optimization
problem = cp.Problem(objective, constraints_list)
problem.solve()
if problem.status != 'optimal':
logger.warning(f"Optimization status: {problem.status}")
# Return equal weights as fallback
weights_array = np.ones(n_assets) / n_assets
else:
weights_array = weights.value
# Calculate portfolio metrics
portfolio_return = expected_returns @ weights_array
portfolio_risk = np.sqrt(weights_array @ cov_matrix @ weights_array)
sharpe_ratio = (portfolio_return - self.risk_free_rate / 252) / portfolio_risk  # daily risk-free rate, consistent with the other methods
return {
'weights': weights_array,
'expected_return': portfolio_return * 252, # Annualized
'expected_risk': portfolio_risk * np.sqrt(252), # Annualized
'sharpe_ratio': sharpe_ratio * np.sqrt(252)
}
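# Illustrative usage sketch (not part of the original file): mean-variance optimization
# on simulated daily returns with simple weight bounds. The return matrix is synthetic.
import numpy as np

rng = np.random.default_rng(42)
daily_returns = rng.normal(loc=0.0005, scale=0.01, size=(750, 4))  # ~3 years, 4 assets

optimizer = PortfolioOptimizer(risk_free_rate=0.02)
result = optimizer.optimize(
    daily_returns,
    method="mean_variance",
    constraints={"min_weight": 0.05, "max_weight": 0.60},
)
print(result["weights"], result["sharpe_ratio"])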
def _minimum_variance_optimization(
self,
returns: np.ndarray,
constraints: Optional[Dict] = None
) -> Dict:
"""
Minimize portfolio variance
"""
n_assets = returns.shape[1]
cov_matrix = np.cov(returns.T)
cov_matrix += np.eye(n_assets) * 1e-6
# Define optimization
weights = cp.Variable(n_assets)
portfolio_risk = cp.quad_form(weights, cov_matrix)
objective = cp.Minimize(portfolio_risk)
constraints_list = [
cp.sum(weights) == 1,
weights >= 0,
]
# Solve
problem = cp.Problem(objective, constraints_list)
problem.solve()
weights_array = weights.value if problem.status == 'optimal' else np.ones(n_assets) / n_assets
# Calculate metrics
expected_returns = np.mean(returns, axis=0)
portfolio_return = expected_returns @ weights_array
portfolio_risk = np.sqrt(weights_array @ cov_matrix @ weights_array)
sharpe_ratio = (portfolio_return - self.risk_free_rate / 252) / portfolio_risk
return {
'weights': weights_array,
'expected_return': portfolio_return * 252,
'expected_risk': portfolio_risk * np.sqrt(252),
'sharpe_ratio': sharpe_ratio * np.sqrt(252)
}
def _maximum_sharpe_optimization(
self,
returns: np.ndarray,
constraints: Optional[Dict] = None
) -> Dict:
"""
Maximize Sharpe ratio
"""
# Maximizing the Sharpe ratio directly is not a convex problem, so we sweep
# target returns along the efficient frontier, solve a minimum-variance problem
# for each target, and keep the portfolio with the highest Sharpe ratio
n_assets = returns.shape[1]
expected_returns = np.mean(returns, axis=0)
cov_matrix = np.cov(returns.T)
# Generate efficient frontier
target_returns = np.linspace(
np.min(expected_returns),
np.max(expected_returns),
50
)
best_sharpe = -np.inf
best_weights = None
for target_ret in target_returns:
weights = cp.Variable(n_assets)
portfolio_risk = cp.quad_form(weights, cov_matrix)
objective = cp.Minimize(portfolio_risk)
constraints_list = [
cp.sum(weights) == 1,
weights >= 0,
expected_returns @ weights >= target_ret
]
problem = cp.Problem(objective, constraints_list)
problem.solve()
if problem.status == 'optimal':
w = weights.value
ret = expected_returns @ w
risk = np.sqrt(w @ cov_matrix @ w)
sharpe = (ret - self.risk_free_rate / 252) / risk
if sharpe > best_sharpe:
best_sharpe = sharpe
best_weights = w
if best_weights is None:
best_weights = np.ones(n_assets) / n_assets
# Calculate final metrics
portfolio_return = expected_returns @ best_weights
portfolio_risk = np.sqrt(best_weights @ cov_matrix @ best_weights)
return {
'weights': best_weights,
'expected_return': portfolio_return * 252,
'expected_risk': portfolio_risk * np.sqrt(252),
'sharpe_ratio': best_sharpe * np.sqrt(252)
}
def _risk_parity_optimization(self, returns: np.ndarray) -> Dict:
"""
Risk parity optimization - equal risk contribution
"""
n_assets = returns.shape[1]
cov_matrix = np.cov(returns.T)
# Initial guess - equal weights
weights = np.ones(n_assets) / n_assets
# Iterative multiplicative update: scale each weight toward an equal risk contribution
for _ in range(100):
# Calculate marginal risk contributions
portfolio_vol = np.sqrt(weights @ cov_matrix @ weights)
marginal_contrib = cov_matrix @ weights / portfolio_vol
contrib = weights * marginal_contrib
# Target equal contribution from each asset
target_contrib = portfolio_vol / n_assets
# Update weights, renormalize, and stop once the weights have converged
new_weights = weights * (target_contrib / contrib)
new_weights = new_weights / np.sum(new_weights)
if np.max(np.abs(new_weights - weights)) < 1e-8:
weights = new_weights
break
weights = new_weights
# Calculate metrics
expected_returns = np.mean(returns, axis=0)
portfolio_return = expected_returns @ weights
portfolio_risk = np.sqrt(weights @ cov_matrix @ weights)
sharpe_ratio = (portfolio_return - self.risk_free_rate / 252) / portfolio_risk
return {
'weights': weights,
'expected_return': portfolio_return * 252,
'expected_risk': portfolio_risk * np.sqrt(252),
'sharpe_ratio': sharpe_ratio * np.sqrt(252)
}
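# Illustrative check (not part of the original file): after risk-parity optimization,
# every asset should contribute roughly the same share of portfolio risk. The helper
# name and the synthetic data below are assumptions for the example only.
import numpy as np

def risk_contributions(weights: np.ndarray, returns: np.ndarray) -> np.ndarray:
    """Per-asset risk contribution: w_i * (Sigma w)_i / sigma_p."""
    cov = np.cov(returns.T)
    port_vol = np.sqrt(weights @ cov @ weights)
    return weights * (cov @ weights) / port_vol

rng = np.random.default_rng(3)
sample_returns = rng.normal(loc=0.0004, scale=0.012, size=(750, 4))
rp_result = PortfolioOptimizer().optimize(sample_returns, method="risk_parity")
contrib = risk_contributions(rp_result["weights"], sample_returns)
# contrib entries come out approximately equal (each close to contrib.mean())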
def _black_litterman_optimization(
self,
returns: np.ndarray,
constraints: Optional[Dict] = None,
views: Optional[Dict] = None
) -> Dict:
"""
Black-Litterman optimization
"""
# Simplified implementation
# In practice, would incorporate market views
n_assets = returns.shape[1]
# Market equilibrium weights (market cap weighted)
# For demo, use equal weights
market_weights = np.ones(n_assets) / n_assets
# Calculate implied returns
cov_matrix = np.cov(returns.T)
risk_aversion = 2.5
implied_returns = risk_aversion * cov_matrix @ market_weights
# Without views, this reduces to market weights
# With views, would blend implied returns with views
if views:
# View blending is not implemented in this simplified version; see the
# illustrative blend_views sketch after this method for the standard posterior
pass
# For now, return mean-variance with implied returns
expected_returns = implied_returns
# Run mean-variance with these returns
weights = cp.Variable(n_assets)
portfolio_return = expected_returns @ weights
portfolio_risk = cp.quad_form(weights, cov_matrix)
objective = cp.Maximize(portfolio_return - risk_aversion * portfolio_risk)
constraints_list = [
cp.sum(weights) == 1,
weights >= 0,
]
problem = cp.Problem(objective, constraints_list)
problem.solve()
weights_array = weights.value if problem.status == 'optimal' else market_weights
# Calculate metrics
portfolio_return = expected_returns @ weights_array
portfolio_risk = np.sqrt(weights_array @ cov_matrix @ weights_array)
sharpe_ratio = (portfolio_return - self.risk_free_rate / 252) / portfolio_risk
return {
'weights': weights_array,
'expected_return': portfolio_return * 252,
'expected_risk': portfolio_risk * np.sqrt(252),
'sharpe_ratio': sharpe_ratio * np.sqrt(252)
}
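# The view-blending step above is left as a stub. Below is a minimal sketch of the
# standard Black-Litterman posterior, assuming views arrive as a pick matrix P, view
# returns Q, and view uncertainty Omega; none of these names are part of the original
# interface.
import numpy as np

def blend_views(
    implied_returns: np.ndarray,
    cov_matrix: np.ndarray,
    P: np.ndarray,      # pick matrix, one row per view
    Q: np.ndarray,      # expected return asserted by each view
    Omega: np.ndarray,  # view uncertainty (covariance of view errors)
    tau: float = 0.05,
) -> np.ndarray:
    """Posterior expected returns: Pi + tau*Sigma*P' (P tau*Sigma P' + Omega)^-1 (Q - P Pi)."""
    tau_sigma = tau * cov_matrix
    middle = np.linalg.inv(P @ tau_sigma @ P.T + Omega)
    return implied_returns + tau_sigma @ P.T @ middle @ (Q - P @ implied_returns)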
def calculate_efficient_frontier(
self,
returns: np.ndarray,
num_portfolios: int = 100
) -> List[Dict]:
"""
Calculate the efficient frontier
"""
n_assets = returns.shape[1]
expected_returns = np.mean(returns, axis=0)
cov_matrix = np.cov(returns.T)
# Range of target returns
min_ret = np.min(expected_returns)
max_ret = np.max(expected_returns)
target_returns = np.linspace(min_ret, max_ret, num_portfolios)
frontier = []
for target_ret in target_returns:
weights = cp.Variable(n_assets)
portfolio_risk = cp.quad_form(weights, cov_matrix)
objective = cp.Minimize(portfolio_risk)
constraints_list = [
cp.sum(weights) == 1,
weights >= 0,
expected_returns @ weights >= target_ret
]
problem = cp.Problem(objective, constraints_list)
problem.solve()
if problem.status == 'optimal':
w = weights.value
risk = np.sqrt(w @ cov_matrix @ w)
frontier.append({
'return': target_ret * 252,
'risk': risk * np.sqrt(252),
'weights': w.tolist()
})
return frontier
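# Illustrative usage sketch (not part of the original file): build a frontier from
# synthetic daily returns and pick the point with the best excess-return-to-risk ratio
# (0.02 is the default annual risk-free rate, matching the annualized frontier values).
import numpy as np

rng = np.random.default_rng(7)
daily_returns = rng.normal(loc=0.0005, scale=0.01, size=(500, 5))

optimizer = PortfolioOptimizer()
frontier = optimizer.calculate_efficient_frontier(daily_returns, num_portfolios=50)
if frontier:
    best = max(frontier, key=lambda p: (p["return"] - 0.02) / p["risk"])
    print(f"best frontier point: return={best['return']:.2%}, risk={best['risk']:.2%}")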