Hyperparameters have to be set before a model is fit to the training data, and choosing them well yields a more robust, better-optimized model. Since the goal of any model is to minimize error, hyperparameter tuning (also called hyperparameter optimization) works directly toward that goal.
The experiments use a small Kaggle dataset for mobile price prediction (mobile-price-classification).
import pandas as pd
import numpy as np

def load_data(data_root: str):
    df = pd.read_csv(data_root)
    # Features: every column except the target; target: price_range (classes 0-3)
    X = df.drop('price_range', axis=1).values
    y = df.price_range.values
    return X, y

X, y = load_data('data/mobile-price-classification/train.csv')
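As a quick sanity check, we can look at the shapes and class counts; the expected values below assume the standard Kaggle train.csv (2000 rows, 20 feature columns, four balanced classes):

# Quick sanity check of the loaded arrays
print(X.shape, y.shape)   # expected: (2000, 20) (2000,)
print(np.bincount(y))     # per-class counts; the four classes are balanced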
Grid search simply enumerates all possible parameter combinations; how dense the grid is, you have to decide yourself.
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection

classifier = ensemble.RandomForestClassifier(n_jobs=-1)
X, y = load_data('data/mobile-price-classification/train.csv')

# 4 x 4 x 2 = 32 combinations, each evaluated with 5-fold CV -> 160 fits
param_grid = {
    "n_estimators": [100, 200, 300, 400],
    "max_depth": [1, 3, 5, 7],
    "criterion": ["gini", "entropy"],
}

model = model_selection.GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=10,
    n_jobs=1,
    cv=5
)
model.fit(X, y)
print(model.best_score_)
print(model.best_estimator_.get_params())
Output: the best 5-fold accuracy and the parameters of the refit best estimator.
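Beyond the best score, GridSearchCV keeps every trial in cv_results_; a minimal sketch of inspecting it as a DataFrame:

# Rank all 32 parameter combinations by mean CV accuracy
results = pd.DataFrame(model.cv_results_)
cols = ["params", "mean_test_score", "std_test_score", "rank_test_score"]
print(results[cols].sort_values("rank_test_score").head())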
Rather than enumerating every possible combination, random search samples combinations at random in its hunt for the optimum, so the parameter search space can be much larger than a grid. Compared with the code above, only param_grid and the arguments passed in change.
# Ranges instead of short lists: 14 x 19 x 2 = 532 possible combinations,
# of which n_iter=10 are sampled at random
param_grid = {
    "n_estimators": np.arange(100, 1500, 100),
    "max_depth": np.arange(1, 20),
    "criterion": ["gini", "entropy"],
}

model = model_selection.RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    verbose=10,
    n_jobs=1,
    cv=5
)
model.fit(X, y)
print(model.best_score_)
print(model.best_estimator_.get_params())
Output: again the best score and best parameters, reached after sampling only 10 of the 532 possible combinations.
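param_distributions also accepts scipy.stats distributions, so continuous parameters can be sampled instead of gridded; a sketch, assuming scipy is installed:

from scipy import stats

# Drop-in replacement for param_grid above: sample n_estimators from a
# discrete uniform and max_features from a continuous uniform distribution
param_distributions = {
    "n_estimators": stats.randint(100, 1500),
    "max_features": stats.uniform(0.01, 0.99),  # uniform on [0.01, 1.0]
    "criterion": ["gini", "entropy"],
}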
In practice, you can also bring in sklearn's preprocessing and pipeline modules to bundle the model's data preprocessing and several processing steps into a single estimator in the broader sense.
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import pipeline

def load_data(data_root: str):
    df = pd.read_csv(data_root)
    X = df.drop('price_range', axis=1).values
    y = df.price_range.values
    return X, y

X, y = load_data('data/mobile-price-classification/train.csv')

# Scale -> PCA -> random forest, chained into one estimator
scl = preprocessing.StandardScaler()
pca = decomposition.PCA()
rf = ensemble.RandomForestClassifier(n_jobs=-1)
classifier = pipeline.Pipeline([("scaling", scl), ("pca", pca), ("rf", rf)])

# Pipeline parameters are addressed as <step name>__<parameter name>
param_grid = {
    "pca__n_components": np.arange(5, 10),
    "rf__n_estimators": np.arange(100, 1500, 100),
    "rf__max_depth": np.arange(1, 20),
    "rf__criterion": ["gini", "entropy"],
}

model = model_selection.RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    verbose=10,
    n_jobs=1,
    cv=5
)
model.fit(X, y)
print(model.best_score_)
print(model.best_estimator_.get_params())
Output: the best score and the parameters of the entire pipeline, preprocessing steps included.
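The refit best_estimator_ is itself a fitted Pipeline, so individual steps can be inspected; a small sketch:

# Look inside the best pipeline: how many PCA components were kept,
# and how much variance they explain
best_pca = model.best_estimator_.named_steps["pca"]
print(best_pca.n_components_)
print(best_pca.explained_variance_ratio_.sum())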
Grid and random search treat every trial as independent. Bayesian optimization instead uses the outcomes of past trials to decide what to try next; scikit-optimize's gp_minimize does this with a Gaussian-process surrogate model. We write an objective that returns a loss (the negated mean CV accuracy, since gp_minimize minimizes) and hand it to the optimizer.

from functools import partial
from skopt import space
from skopt import gp_minimize

def optimize(params, param_names, X, y):
    # gp_minimize passes params as a plain list, in param_space order
    params = dict(zip(param_names, params))
    model = ensemble.RandomForestClassifier(**params)
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for train_idx, test_idx in kf.split(X=X, y=y):
        xtrain, ytrain = X[train_idx], y[train_idx]
        xtest, ytest = X[test_idx], y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
    # Negate: gp_minimize minimizes, we want to maximize accuracy
    return -1.0 * np.mean(accuracies)

param_names = ["max_depth", "n_estimators", "criterion", "max_features"]
param_space = [
    space.Integer(3, 15, name="max_depth"),
    space.Integer(100, 600, name="n_estimators"),
    space.Categorical(["gini", "entropy"], name="criterion"),
    space.Real(0.01, 1, prior="uniform", name="max_features"),
]

optimization_function = partial(
    optimize,
    param_names=param_names,
    X=X,
    y=y
)

result = gp_minimize(
    optimization_function,
    dimensions=param_space,
    n_calls=15,
    n_random_starts=10,
    verbose=10
)
print(dict(zip(param_names, result.x)))
Output: after the 10 random initial points, the Gaussian process guides the remaining 5 calls, and the best parameter dict is printed at the end.
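The returned OptimizeResult also exposes the best objective value and the full search trace; a sketch (plot_convergence needs matplotlib):

from skopt.plots import plot_convergence

print(-result.fun)        # best mean CV accuracy found
plot_convergence(result)  # objective value vs. number of calls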
Hyperopt takes a similar sequential approach, using Tree-structured Parzen Estimators (tpe.suggest) to propose trials. Its objective receives the sampled parameters directly as a dict, so no name/value zipping is needed.

from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope

def optimize(params, X, y):
    # hyperopt passes params as a dict sampled from param_space
    model = ensemble.RandomForestClassifier(**params)
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for train_idx, test_idx in kf.split(X=X, y=y):
        xtrain, ytrain = X[train_idx], y[train_idx]
        xtest, ytest = X[test_idx], y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
    # fmin minimizes, so return the negated accuracy as the loss
    return -1.0 * np.mean(accuracies)

param_space = {
    # quniform samples floats on a step grid; scope.int casts them to int
    "max_depth": scope.int(hp.quniform("max_depth", 3, 15, 1)),
    "n_estimators": scope.int(hp.quniform("n_estimators", 100, 600, 1)),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "max_features": hp.uniform("max_features", 0.01, 1)
}

optimization_function = partial(optimize, X=X, y=y)
trials = Trials()

result = fmin(
    fn=optimization_function,
    space=param_space,
    algo=tpe.suggest,
    max_evals=15,
    trials=trials,
)
print(result)
Output: the best parameters found. Note that fmin reports hp.choice parameters as an index into the list of options rather than the value itself.
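hyperopt.space_eval converts that index-form result back into actual parameter values; a short sketch:

from hyperopt import space_eval

# Map the choice indices in `result` back to real values,
# e.g. criterion: 1 -> "entropy"
print(space_eval(param_space, result))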
Optuna also defaults to TPE but uses a define-by-run API: the search space is declared inside the objective through the trial object rather than as a separate dict.

import optuna

def optimize(trial, X, y):
    # The search space is defined inline via trial.suggest_* calls
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    n_estimators = trial.suggest_int("n_estimators", 100, 1500)
    max_depth = trial.suggest_int("max_depth", 3, 15)
    max_features = trial.suggest_float("max_features", 0.01, 1.0)
    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        criterion=criterion
    )
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for train_idx, test_idx in kf.split(X=X, y=y):
        xtrain, ytrain = X[train_idx], y[train_idx]
        xtest, ytest = X[test_idx], y[test_idx]
        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)
    # direction='minimize' below, so return the negated accuracy
    return -1.0 * np.mean(accuracies)

optimization_function = partial(optimize, X=X, y=y)
study = optuna.create_study(direction='minimize')
study.optimize(optimization_function, n_trials=15)
Output: Optuna logs each trial's loss and parameters as the study runs.
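After the run, the study object holds the best trial; a minimal sketch:

# Best loss (the negated accuracy) and the parameters that produced it
print(study.best_value)
print(study.best_params)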
Personally, I recommend the HyperOpt and Optuna libraries: HyperOpt additionally supports SparkTrials for distributed tuning over big data on a cluster, while Optuna's design is more modern and convenient.
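Switching hyperopt to a cluster is mostly a matter of swapping the trials object; a sketch, assuming a running Spark environment (the parallelism value is illustrative):

from hyperopt import SparkTrials

# Trial evaluations are distributed over Spark executors instead of run serially
spark_trials = SparkTrials(parallelism=4)
result = fmin(
    fn=optimization_function,
    space=param_space,
    algo=tpe.suggest,
    max_evals=15,
    trials=spark_trials,
)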