
你有 50 个候选因子,怎么选出真正有用的?
历史表现好,未来还能好吗?
因子的价值在于预测,而非解释。过去辉煌不等于未来可期。
面对 50 个因子,普通 OLS 很容易过拟合。正则化通过惩罚系数的大小来抑制过拟合。
L2 惩罚让系数变小但不会归零。
fn ridge_regression(
y: &[f64],
x_matrix: &[Vec<f64>],
lambda: f64,
) -> Vec<f64> {
let n = y.len();
let p = x_matrix.len();
// X'X + λI
let mut xtx = vec![vec![0.0; p]; p];
for i in 0..p {
for j in 0..p {
xtx[i][j] = (0..n)
.map(|obs| x_matrix[i][obs] * x_matrix[j][obs])
.sum::<f64>();
if i == j {
xtx[i][j] += lambda;
}
}
}
// X'y
let mut xty = vec![0.0; p];
for i in 0..p {
xty[i] = (0..n)
.map(|obs| x_matrix[i][obs] * y[obs])
.sum::<f64>();
}
solve_linear_system(&xtx, &xty)
}L1 惩罚让部分系数归零——自动做因子选择。
需要迭代求解(坐标下降法):
/// Lasso (L1-penalised) regression fitted by cyclic coordinate descent.
///
/// Minimises `½‖ỹ − X̃β‖² + λ‖β‖₁`, where `ỹ` is `y` with its mean removed and
/// every column of `X̃` is the matching factor z-scored with its population
/// standard deviation.
///
/// * `y` – response values, one per observation.
/// * `x_matrix` – one `Vec` per factor, each holding `y.len()` observations.
/// * `lambda` – L1 penalty; larger values zero out more coefficients.
/// * `max_iter` – upper bound on full sweeps over the factors.
///
/// Returns one coefficient per factor, expressed on the standardised scale
/// (no intercept). A factor without variance carries no information and keeps
/// a coefficient of `0.0`. An empty `y` yields all zeros.
///
/// # Panics
/// Panics if `y` is non-empty and a factor's length differs from `y.len()`.
fn lasso_regression(
    y: &[f64],
    x_matrix: &[Vec<f64>],
    lambda: f64,
    max_iter: usize,
) -> Vec<f64> {
    // Sweeps stop early once no coefficient moves by more than this.
    const TOLERANCE: f64 = 1e-12;

    let n = y.len();
    let p = x_matrix.len();
    let mut beta = vec![0.0; p];
    if n == 0 {
        return beta;
    }
    assert!(
        x_matrix.iter().all(|col| col.len() == n),
        "every factor column must hold exactly y.len() observations"
    );
    let n_obs = n as f64;

    // Standardise every factor; a constant factor becomes an all-zero column
    // instead of dividing by a zero standard deviation.
    let x_std: Vec<Vec<f64>> = x_matrix
        .iter()
        .map(|col| {
            let mean = col.iter().sum::<f64>() / n_obs;
            let std = (col.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n_obs).sqrt();
            if std > 0.0 {
                col.iter().map(|x| (x - mean) / std).collect()
            } else {
                vec![0.0; n]
            }
        })
        .collect();
    let squared_norms: Vec<f64> = x_std
        .iter()
        .map(|col| col.iter().map(|x| x * x).sum::<f64>())
        .collect();

    // With beta = 0 the full residual is simply the centred response. It is
    // then updated in place, so a sweep costs O(n·p) rather than O(n·p²).
    let y_mean = y.iter().sum::<f64>() / n_obs;
    let mut residual: Vec<f64> = y.iter().map(|yi| yi - y_mean).collect();

    for _ in 0..max_iter {
        let mut largest_move = 0.0_f64;
        for j in 0..p {
            let norm_sq = squared_norms[j];
            if norm_sq == 0.0 {
                continue;
            }
            let column = &x_std[j];
            let previous = beta[j];
            // Correlation of factor j with the residual that excludes factor j itself.
            let rho = column
                .iter()
                .zip(&residual)
                .map(|(x, r)| x * r)
                .sum::<f64>()
                + norm_sq * previous;
            // Dividing by Σx² makes this the exact one-dimensional minimiser;
            // without it the step is n times too large and sweeps do not converge.
            let updated = soft_threshold(rho, lambda) / norm_sq;
            let step = updated - previous;
            if step != 0.0 {
                for (r, x) in residual.iter_mut().zip(column) {
                    *r -= x * step;
                }
                beta[j] = updated;
            }
            largest_move = largest_move.max(step.abs());
        }
        if largest_move < TOLERANCE {
            break;
        }
    }
    beta
}

/// Soft-thresholding operator: shrinks `rho` towards zero by `lambda` and
/// returns `0.0` when `rho` lies inside `[-lambda, lambda]`.
fn soft_threshold(rho: f64, lambda: f64) -> f64 {
    if rho > lambda {
        rho - lambda
    } else if rho < -lambda {
        rho + lambda
    } else {
        0.0
    }
}
// Combining Ridge and Lasso:
因子暴露随时间变化。滚动回归追踪动态 Beta。
use polars::prelude::*;
fn rolling_regression(
df: &DataFrame,
window_size: usize,
) -> Result<DataFrame> {
let returns = df.column("stock_return")?.f64()?.into_iter()
.flatten()
.collect::<Vec<f64>>();
let mkt = df.column("mkt_return")?.f64()?.into_iter()
.flatten()
.collect::<Vec<f64>>();
let mut betas = vec![f64::NAN; window_size - 1];
for i in window_size..=returns.len() {
let y_window = &returns[i - window_size..i];
let x_window = &mkt[i - window_size..i];
let result = ols_regression(y_window, x_window);
betas.push(result.beta);
}
let dates = df.column("date")?.clone();
df![
"date" => dates,
"rolling_beta" => betas,
]
}更近期数据权重更高:
/// Exponentially weighted beta of `returns` against `mkt`.
///
/// Keeps running estimates of `E[r·m]` and `E[m²]`, seeded with the first
/// observation and then updated as
/// `estimate = lambda * estimate + (1 - lambda) * new_value`.
/// After each observation the ratio of the two estimates is emitted.
///
/// * `lambda` – decay factor; values closer to 1 give past data more weight.
///
/// Returns one beta per paired observation. Observations beyond the shorter of
/// the two slices are ignored, and empty input yields an empty vector. An entry
/// is non-finite whenever the running `E[m²]` is zero (for example when the
/// first market return is exactly `0.0`).
fn ewma_beta(
    returns: &[f64],
    mkt: &[f64],
    lambda: f64,
) -> Vec<f64> {
    let mut observations = returns.iter().zip(mkt.iter());

    // Seed both moments from the first pair; with no data there is nothing to estimate.
    let (mut cov, mut var) = match observations.next() {
        Some((r, m)) => (r * m, m.powi(2)),
        None => return Vec::new(),
    };

    let mut betas = Vec::with_capacity(returns.len().min(mkt.len()));
    betas.push(cov / var);
    for (r, m) in observations {
        cov = lambda * cov + (1.0 - lambda) * r * m;
        var = lambda * var + (1.0 - lambda) * m.powi(2);
        betas.push(cov / var);
    }
    betas
}
// A good in-sample factor IC does not imply good future predictions; out-of-sample validation is a must.
/// Splits `df` by row order into a leading training part and a trailing test
/// part, without shuffling.
///
/// * `train_ratio` – fraction of rows assigned to the training part. Values
///   outside `[0, 1]` (and `NaN`) are clamped, so the worst case is one empty
///   part.
///
/// Returns `(train, test)`; together they cover every row of `df` exactly once.
fn train_test_split(
    df: &DataFrame,
    train_ratio: f64,
) -> Result<(DataFrame, DataFrame)> {
    let n = df.height();
    // Float-to-usize `as` casts saturate (negative or NaN becomes 0). The `min`
    // keeps a ratio above 1.0 from moving the split past the last row, which
    // would underflow `n - split_point`.
    let split_point = ((n as f64 * train_ratio) as usize).min(n);
    // `DataFrame::slice` takes a signed offset.
    let test_offset = i64::try_from(split_point).expect("row count fits in i64");
    let train = df.slice(0, split_point);
    let test = df.slice(test_offset, n - split_point);
    Ok((train, test))
}

/// Error metrics describing how well predictions match realised values.
#[derive(Debug, Clone, Copy, PartialEq)]
struct PredictionMetrics {
    /// Mean squared error.
    mse: f64,
    /// Square root of `mse`.
    rmse: f64,
    /// Mean absolute error.
    mae: f64,
    /// Out-of-sample R²: one minus squared error over total variation of the actuals.
    r_squared_oos: f64,
}
/// Scores `predicted` against `actual`.
///
/// Only positions present in both slices are scored. Every metric is `NaN`
/// when there is nothing to score, and `r_squared_oos` is `NaN` when the
/// actual values have no variation (the ratio is undefined).
fn evaluate_predictions(
    actual: &[f64],
    predicted: &[f64],
) -> PredictionMetrics {
    // Average over the number of pairs actually compared, not `actual.len()`.
    let paired = actual.len().min(predicted.len());
    let (actual, predicted) = (&actual[..paired], &predicted[..paired]);
    let count = paired as f64;

    let squared_error: f64 = actual
        .iter()
        .zip(predicted)
        .map(|(a, p)| (a - p).powi(2))
        .sum();
    let mse = squared_error / count;
    let rmse = mse.sqrt();
    let mae = actual
        .iter()
        .zip(predicted)
        .map(|(a, p)| (a - p).abs())
        .sum::<f64>()
        / count;

    // Out-of-sample R², measured against the mean of the evaluated actuals.
    let y_mean = actual.iter().sum::<f64>() / count;
    let ss_total: f64 = actual.iter().map(|a| (a - y_mean).powi(2)).sum();
    let r_squared_oos = if ss_total > 0.0 {
        1.0 - squared_error / ss_total
    } else {
        f64::NAN
    };

    PredictionMetrics { mse, rmse, mae, r_squared_oos }
}

/// Fits a multi-factor OLS model on the first 70% of `df`, predicts
/// `forward_return` on the remaining rows, and prints the out-of-sample error
/// metrics and the IC of the predictions.
///
/// * `factor_cols` – names of the `f64` factor columns used as regressors.
///
/// NOTE(review): the prediction treats `coefficients[0]` as the intercept and
/// `coefficients[k]` as the slope of `factor_cols[k - 1]` — confirm this
/// matches the ordering `multiple_ols_regression` returns.
///
/// NOTE(review): null cells are dropped per column, so columns with differing
/// null patterns end up row-misaligned; clean the frame before calling.
///
/// # Errors
/// Fails when `forward_return` or any factor column is missing or is not of
/// `f64` dtype.
fn factor_prediction_backtest(
    df: &DataFrame,
    factor_cols: &[&str],
) -> Result<()> {
    let (train, test) = train_test_split(df, 0.7)?;

    // `?` cannot be used inside a closure that returns a plain `Vec`, so the
    // extraction helpers return `Result` and the errors are collected.
    let read_f64 = |frame: &DataFrame, name: &str| -> Result<Vec<f64>> {
        Ok(frame.column(name)?.f64()?.into_iter().flatten().collect())
    };
    let read_factors = |frame: &DataFrame| -> Result<Vec<Vec<f64>>> {
        factor_cols
            .iter()
            .map(|&name| read_f64(frame, name))
            .collect()
    };

    // Fit on the training rows.
    let y_train = read_f64(&train, "forward_return")?;
    let x_train = read_factors(&train)?;
    let coefficients = multiple_ols_regression(&y_train, &x_train).coefficients;

    // Predict on the held-out rows.
    let y_test = read_f64(&test, "forward_return")?;
    let x_test = read_factors(&test)?;
    let predicted: Vec<f64> = (0..y_test.len())
        .map(|i| {
            coefficients[0]
                + (1..coefficients.len())
                    .map(|j| coefficients[j] * x_test[j - 1][i])
                    .sum::<f64>()
        })
        .collect();

    // Evaluate. Rust precision specs are `{:.6}`, without a C-style `f` suffix.
    let metrics = evaluate_predictions(&y_test, &predicted);
    println!("=== 样本外预测评估 ===");
    println!("MSE: {:.6}", metrics.mse);
    println!("RMSE: {:.6}", metrics.rmse);
    println!("MAE: {:.6}", metrics.mae);
    println!("R² OOS: {:.4}", metrics.r_squared_oos);

    // IC between realised and predicted returns.
    let ic = compute_ic(&y_test, &predicted);
    println!("IC: {:.4}", ic);
    Ok(())
}
/// Information coefficient: the Pearson correlation between `actual` and
/// `predicted`.
///
/// Only positions present in both slices are used, so a length mismatch cannot
/// skew the means. The result is clamped to `[-1, 1]` to absorb rounding
/// error. Returns `NaN` when the correlation is undefined, which happens for
/// empty input and when either series is constant.
fn compute_ic(actual: &[f64], predicted: &[f64]) -> f64 {
    let paired = actual.len().min(predicted.len());
    let (actual, predicted) = (&actual[..paired], &predicted[..paired]);
    let count = paired as f64;

    let mean_a = actual.iter().sum::<f64>() / count;
    let mean_p = predicted.iter().sum::<f64>() / count;

    let cov = actual
        .iter()
        .zip(predicted)
        .map(|(a, p)| (a - mean_a) * (p - mean_p))
        .sum::<f64>()
        / count;
    let std_a = (actual.iter().map(|a| (a - mean_a).powi(2)).sum::<f64>() / count).sqrt();
    let std_p = (predicted.iter().map(|p| (p - mean_p).powi(2)).sum::<f64>() / count).sqrt();

    // `clamp` leaves NaN untouched, so undefined correlations stay visible.
    (cov / (std_a * std_p)).clamp(-1.0, 1.0)
}
// Correlation between factor values and future returns:
/// Measures how well a factor lines up with subsequent returns.
///
/// * `ic` – Pearson correlation of the raw values, from `compute_ic`.
/// * `rank_ic` – Spearman correlation: `compute_ic` applied to the ranks of
///   both series, with tied values sharing their average rank.
///
/// Ranking uses only the positions present in both slices.
fn factor_ic_analysis(
    factor_values: &[f64],
    forward_returns: &[f64],
) -> ICReport {
    let ic = compute_ic(factor_values, forward_returns);

    // Rank IC (Spearman) is Pearson correlation on the rank-transformed data.
    let paired = factor_values.len().min(forward_returns.len());
    let factor_ranks = fractional_ranks(&factor_values[..paired]);
    let return_ranks = fractional_ranks(&forward_returns[..paired]);
    let rank_ic = compute_ic(&factor_ranks, &return_ranks);

    ICReport { ic, rank_ic }
}

/// Assigns 1-based ranks to `values`; equal values receive the mean of the
/// ranks they jointly occupy.
fn fractional_ranks(values: &[f64]) -> Vec<f64> {
    let mut order: Vec<usize> = (0..values.len()).collect();
    // `total_cmp` is a total order, so NaN inputs cannot cause a panic here.
    order.sort_by(|&a, &b| values[a].total_cmp(&values[b]));

    let mut ranks = vec![0.0; values.len()];
    let mut start = 0;
    while start < order.len() {
        let mut end = start + 1;
        while end < order.len() && values[order[end]] == values[order[start]] {
            end += 1;
        }
        // Sorted positions start..end hold ranks start+1 ..= end; share their mean.
        let shared_rank = (start + 1 + end) as f64 / 2.0;
        for &index in &order[start..end] {
            ranks[index] = shared_rank;
        }
        start = end;
    }
    ranks
}
/// Correlation summary produced by `factor_ic_analysis`.
#[derive(Debug, Clone, Copy, PartialEq)]
struct ICReport {
    /// Information coefficient computed on the raw values.
    ic: f64,
    /// Information coefficient reported for the rank-based (Spearman) variant.
    rank_ic: f64,
}
// Stability of the IC:
fn information_ratio(ics: &[f64]) -> f64 {
let mean_ic = ics.iter().sum::<f64>() / ics.len() as f64;
let std_ic = {
let var = ics.iter().map(|ic| (ic - mean_ic).powi(2)).sum::<f64>() / ics.len() as f64;
var.sqrt()
};
if std_ic > 0.0 {
mean_ic / std_ic
} else {
0.0
}
}判断标准:
| IR | 因子质量 |
|---|---|
| < 0.5 | 较差 |
| 0.5 - 1.0 | 一般 |
| > 1.0 | 优秀 |
因子的价值在于预测,而非解释。
下一站:时间序列分析——自相关、平稳性与 ARIMA。