# 模型建立与调参

• 从最简单的模型开始（线性回归 & 交叉验证 & 构建线下测试集）
• 评估算法模型的框架（这里会给出一个选择模型的框架，适合迁移）
• 模型的调参技术（贪心调参， GridSearchCV调参和贝叶斯调参）
• 绘制训练集曲线与验证集曲线（从曲线分析过拟合欠拟合的问题，以及如果发生了这些问题，我们应该怎么去尝试解决）
• 总结

### 1. 从简单的线性模型开始

$$Y=w_1x_1+w_2x_2+…+w_nx_n+b$$

# Load the previously preprocessed data (the feature-engineering step
# produced `data` and `train_data`, which this section assumes exist).

# Split the combined `data` back into train and test partitions
train = data[:train_data.shape[0]]
test = data[train_data.shape[0]:]    # held out for now, not used yet

# Keep only numeric features for the linear baseline
continue_fea = ['power', 'kilometer', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_10', 'v_11', 'v_12', 'v_14',
'v_std', 'fuelType_price_average', 'gearbox_std', 'bodyType_price_average', 'brand_price_average',
'used_time', 'estivalue_price_average', 'estivalueprice_std', 'estivalue_price_min']
train_x = train[continue_fea]
train_y = train_data['price']

from sklearn.linear_model import LinearRegression

# NOTE(review): `normalize=True` was deprecated in scikit-learn 1.0 and
# removed in 1.2 — on modern versions scale with StandardScaler instead.
model = LinearRegression(normalize=True)
model.fit(train_x, train_y)

"""查看训练的线性回归模型的截距(intercept)与权重(coef)"""
# Inspect the fitted intercept and the per-feature weights, sorted by weight
print('intercept: ' + str(model.intercept_))
sorted(dict(zip(continue_fea, model.coef_)).items(), key=lambda x: x[1], reverse=True)

## 结果：
intercept: -178881.74591832393
[('v_6', 482008.29891714785),
('v_std', 23713.66414841167),
('v_10', 7035.056136559963),
('v_14', 1418.4037751433352),
('used_time', 186.48306334062053),
('power', 12.19202369791551),
('estivalue_price_average', 0.4082359327905722),
('brand_price_average', 0.38196351334425965),
('gearbox_std', 0.1716754674248321),
('fuelType_price_average', 0.023785798378739224),
('estivalueprice_std', -0.016868767797045624),
('bodyType_price_average', -0.21364358471329278),
('kilometer', -155.11999534761347),
('estivalue_price_min', -574.6952072539285),
('v_11', -1164.0263997737668),
('v_12', -1953.0558048250668),
('v_4', -2198.03802357537),
('v_3', -3811.7514971187525),
('v_2', -5116.825271420712),
('v_5', -447495.6394686485)]

# Predict on the held-out test partition.
# Fix: the original referenced an undefined `x_test`; the test features
# prepared above are `test[continue_fea]`.
y_pred = model.predict(test[continue_fea])

# Visual sanity check: true vs. predicted price on 50 random training
# rows, plotted against the single feature v_6.
subsample_index = np.random.randint(low=0, high=len(train_y), size=50)

plt.scatter(train_x['v_6'][subsample_index], train_y[subsample_index], color='black')
plt.scatter(train_x['v_6'][subsample_index], model.predict(train_x.loc[subsample_index]), color='blue')
plt.xlabel('v_6')
plt.ylabel('price')
plt.legend(['True Price','Predicted Price'],loc='upper right')
print('The predicted price is obvious different from true price')
plt.show()

price的分布图如下：

# Log-transform the target: raw price is heavily right-skewed, while
# log1p(price) looks much closer to a normal distribution.
train_y_ln = np.log1p(train_y)
print('The transformed price seems like normal distribution')
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
# NOTE(review): sns.distplot is deprecated in recent seaborn — use
# sns.histplot(..., kde=True) on seaborn >= 0.11.
sns.distplot(train_y_ln)
plt.subplot(1,2,2)
# Same plot with the top 10% of values trimmed for a clearer view
sns.distplot(train_y_ln[train_y_ln < np.quantile(train_y_ln, 0.9)])

# Refit the linear model on the log-transformed target
model = model.fit(train_x, train_y_ln)

print('intercept:'+ str(model.intercept_))
sorted(dict(zip(continue_fea, model.coef_)).items(), key=lambda x:x[1], reverse=True)

#### 1.1 交叉验证

K折交叉验证是将原始数据分成K组，将每个子集数据分别做一次验证集，其余的K-1组子集数据作为训练集，这样会得到K个模型，用这K个模型最终的验证集分类准确率的平均数，作为此K折交叉验证下分类器的性能指标。以下图为例：

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer

def log_transfer(func):
    """Wrap metric *func* so it is evaluated on log-scale values.

    Returns a callable (y, yhat) -> func(log(y), log(yhat)).  Non-finite
    results of log(yhat) (e.g. log of zero or of a negative prediction)
    are replaced via np.nan_to_num so the metric stays defined.
    """
    def wrapper(y, yhat):
        return func(np.log(y), np.nan_to_num(np.log(yhat)))
    return wrapper

# Cross-validation of the linear baseline
scores = cross_val_score(model, X=train_x, y=train_y, verbose=1, cv=5, scoring=make_scorer(log_transfer(mean_absolute_error)))

# 5-fold CV on the raw (untransformed) target (Error 1.36)
print('AVG:', np.mean(scores))

# 5-fold CV on the log-transformed target
scores = cross_val_score(model, X=train_x, y=train_y_ln, verbose=1, cv = 5, scoring=make_scorer(mean_absolute_error))
print('AVG:', np.mean(scores))

# Show the five per-fold validation scores as a one-row table
scores = pd.DataFrame(scores.reshape(1,-1))
scores.columns = ['cv' + str(x) for x in range(1, 6)]
scores.index = ['MAE']
scores

cv1 cv2 cv3 cv4 cv5
0.194979 0.195399 0.19679 0.19257 0.197563

# Simple hold-out validation: the first 80% of the rows form the
# training split, the last 20% the validation split.
split_point = len(train_x) // 5 * 4

xtrain, xval = train_x[:split_point], train_x[split_point:]
ytrain, yval = train_y[:split_point], train_y[split_point:]

# Fit and score on the log-transformed target
ytrain_ln = np.log1p(ytrain)
yval_ln = np.log1p(yval)

model.fit(xtrain, ytrain_ln)
mean_absolute_error(yval_ln, model.predict(xval))

#### 1.2 构建一个线下测试集

# Load the data

train = data[:train_data.shape[0]]
test = data[train_data.shape[0]:]    # held out for now, not used yet

# First 100k rows: data used for model selection below
X = train[:100000]
Y= train_data['price'][:100000]
Y_ln = np.log1p(Y)

XTest = train[100000:]   # simulated offline test set, to check generalization
Ytrue = train_data['price'][100000:]

### 2. 评估模型的框架

# Cross-validation settings shared by the comparison loop below
num_folds = 10
seed = 7

# Candidate regressors, keyed by a short display name
models = {
    'LR': LinearRegression(),
    'Ridge': Ridge(),
    'LASSO': Lasso(),
    'DecisionTree': DecisionTreeRegressor(),
    'RandomForest': RandomForestRegressor(),
    'XGB': XGBRegressor(n_estimators=100, objective='reg:squarederror'),
    'LGB': LGBMRegressor(n_estimators=100),
    # 'SVR': SVR(),   # too slow to finish on this data set
}

# Evaluate every candidate with 10-fold CV and report mean/std MAE.
results = []
for key in models:
    # Fix: shuffle=True is required for random_state to take effect
    # (modern scikit-learn raises a ValueError otherwise).
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(models[key], X, Y_ln, cv=kfold, scoring=make_scorer(mean_absolute_error))
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

# Compare the CV score distributions with a box plot
fig1 = plt.figure(figsize=(15, 10))
fig1.suptitle('Algorithm Comparison')
ax = fig1.add_subplot(111)   # fix: `ax` was previously undefined
ax.boxplot(results)
ax.set_xticklabels(models.keys())
plt.show()

## 结果：
LR: 0.192890 (0.001501)
Ridge: 0.196279 (0.001616)
LASSO: 0.515573 (0.003923)
DecisionTree: 0.190959 (0.002524)
RandomForest: 0.142333 (0.001489)
XGB: 0.178492 (0.001441)
LGB: 0.147875 (0.001397)

# Fit LGB (one of the strongest fast models above) on the selection
# split and score it on the simulated offline test set; expm1 inverts
# the log1p target transform before computing MAE in price units.
model2 = LGBMRegressor(n_estimators=100)
model2.fit(X, Y_ln)
pred2 = model2.predict(XTest)
print("mae: ", mean_absolute_error(Ytrue, np.expm1(pred2)))

# 结果：
mae:  713.9408513079144

def bulid_modl_lgb(x_train, y_train):
    """Grid-search the LGB learning rate and return the fitted search.

    NOTE(review): the name keeps the original misspelling ("bulid_modl")
    so the existing caller below continues to work.

    :param x_train: training features
    :param y_train: training target (log-scale price here)
    :return: the fitted GridSearchCV object (use .predict / .best_params_)
    """
    estimator = LGBMRegressor(num_leaves=127, n_estimators=150)
    # Fix: the grid key was misspelled 'learning_rage', which makes
    # GridSearchCV raise "Invalid parameter" at fit time.
    param_grid = {'learning_rate': [0.01, 0.05, 0.1, 0.2]}
    gbm = GridSearchCV(estimator, param_grid)
    gbm.fit(x_train, y_train)
    return gbm

# Run the grid search, then evaluate the tuned model on the offline
# test set (expm1 undoes the log1p target transform).
model_lgb = bulid_modl_lgb(X, Y_ln)
val_lgb = model_lgb.predict(XTest)
MAE_lgb = mean_absolute_error(Ytrue, np.expm1(val_lgb))
print(MAE_lgb)

## 结果：
591.4221480289154

from sklearn.pipeline import Pipeline

# Same candidates as above, but each wrapped in a Pipeline that
# standardizes the features before fitting.
pipelines = {
    'ScalerLR': Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())]),
    'ScalerRidge': Pipeline([('Scaler', StandardScaler()), ('Ridge', Ridge())]),
    'ScalerLasso': Pipeline([('Scaler', StandardScaler()), ('Lasso', Lasso())]),
    'ScalerTree': Pipeline([('Scaler', StandardScaler()), ('Tree', DecisionTreeRegressor())]),
    'ScalerForest': Pipeline([('Scaler', StandardScaler()), ('Forest', RandomForestRegressor())]),
    'ScalerGBDT': Pipeline([('Scaler', StandardScaler()), ('GBDT', GradientBoostingRegressor())]),
    'ScalerXGB': Pipeline([('Scaler', StandardScaler()), ('XGB', XGBRegressor(n_estimators=100, objective='reg:squarederror'))]),
    'ScalerLGB': Pipeline([('Scaler', StandardScaler()), ('LGB', LGBMRegressor(n_estimators=100))]),
}

# Cross-validate every pipeline, exactly as for the raw models above.
results = []
for key in pipelines:
    # Fix: shuffle=True is required for random_state to take effect.
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(pipelines[key], X, Y_ln, cv=kfold, scoring=make_scorer(mean_absolute_error))
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

# Box plot of the pipeline results
fig2 = plt.figure(figsize=(15, 10))
fig2.suptitle('Algorithm Comparison')
ax = fig2.add_subplot(111)            # fix: `ax` was previously undefined
ax.boxplot(results)
ax.set_xticklabels(pipelines.keys())  # fix: was models.keys() — wrong labels

### 3. 模型调参

• 贪心调参
• 网格搜索调参
• 贝叶斯调参

# Candidate values for each LGB hyper-parameter, shared by the greedy
# search (3.1) and the grid search (3.2) below.
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [10, 55, 70, 100, 200]
max_depth = [ 10, 55, 70, 100, 200]
n_estimators = [200, 400, 800, 1000]
learning_rate =  [0.01, 0.05, 0.1, 0.2]

#### 3.1 贪心调参

# Greedy tuning: optimize one hyper-parameter at a time, freezing each
# parameter at its best value before moving on to the next one.  The
# repeated min(...)-lookup / CV-score boilerplate of the original is
# factored into two small helpers.

def _best_key(score_dict):
    """Key of *score_dict* with the lowest (best) mean CV MAE."""
    return min(score_dict.items(), key=lambda kv: kv[1])[0]

def _cv_mae(model):
    """Mean 5-fold CV MAE of *model* on the global X / Y_ln."""
    return np.mean(cross_val_score(model, X, Y_ln, verbose=0, cv=5,
                                   scoring=make_scorer(mean_absolute_error)))

# 1) objective
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    best_obj[obj] = _cv_mae(model)

# 2) num_leaves, with the best objective fixed
best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=_best_key(best_obj), num_leaves=leaves)
    best_leaves[leaves] = _cv_mae(model)

# 3) max_depth, with the two best parameters fixed
best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=_best_key(best_obj),
                          num_leaves=_best_key(best_leaves),
                          max_depth=depth)
    best_depth[depth] = _cv_mae(model)

# 4) n_estimators
best_nstimators = dict()
for nstimator in n_estimators:
    model = LGBMRegressor(objective=_best_key(best_obj),
                          num_leaves=_best_key(best_leaves),
                          max_depth=_best_key(best_depth),
                          n_estimators=nstimator)
    best_nstimators[nstimator] = _cv_mae(model)

# 5) learning_rate
best_lr = dict()
for lr in learning_rate:
    model = LGBMRegressor(objective=_best_key(best_obj),
                          num_leaves=_best_key(best_leaves),
                          max_depth=_best_key(best_depth),
                          n_estimators=_best_key(best_nstimators),
                          learning_rate=lr)
    best_lr[lr] = _cv_mae(model)

# Score after each greedy stage (0.143 is the untuned baseline)
sns.lineplot(x=['0_initial','1_turning_obj','2_turning_leaves',
                '3_turning_depth','4_turning_estimators', '5_turning_lr'],
             y=[0.143 ,min(best_obj.values()), min(best_leaves.values()), min(best_depth.values()),
                min(best_nstimators.values()), min(best_lr.values())])

print("best_obj:", min(best_obj.items(), key=lambda x: x[1]))
print("best_leaves:", min(best_leaves.items(), key=lambda x: x[1]) )
print('best_depth:', min(best_depth.items(), key=lambda x: x[1]))
print('best_nstimators: ', min(best_nstimators.items(), key=lambda x: x[1]))
print('best_lr:', min(best_lr.items(), key=lambda x: x[1]))

## 结果如下：
best_obj: ('regression_l1', 0.1457016215267976)
best_leaves: (100, 0.132929241004274)
best_depth: (20, 0.13275966837758682)
best_nstimators:  (1000, 0.11861541074643345)
best_lr: (0.05, 0.11728267187328578)

#### 3.2 GridSearchCV调参

GridSearchCV，它存在的意义就是自动调参，只要把参数输进去，就能给出最优化的结果和参数。但是这个方法适合于小数据集，一旦数据的量级上去了，很难得出结果。这个在这里面优势不大， 因为数据集很大，不太能跑出结果，但是我也整理一下，有时候还是很好用的

from sklearn.model_selection import GridSearchCV

# Too slow to run on this machine, kept for reference: exhaustive grid
# over every candidate list defined above.
parameters = {'objective':objective, 'num_leaves':num_leaves, 'max_depth':max_depth,
'n_estimators': n_estimators, 'learning_rate':learning_rate}

model = LGBMRegressor()
clf = GridSearchCV(model, parameters, cv=5)
clf = clf.fit(X, Y_ln)

# Best parameter combination found by the search
clf.best_params_

#### 3.3 贝叶斯调参

• 贝叶斯调参采用高斯过程，考虑之前的参数信息，不断地更新先验；网格搜索未考虑之前的参数信息
• 贝叶斯调参迭代次数少，速度快；网格搜索速度慢，参数多时易导致维度爆炸
• 贝叶斯调参针对非凸问题依然稳健；网格搜索针对非凸问题易得到局部最优

• 定义优化函数(rf_cv，在里面把优化的参数传入，然后建立模型，返回要优化的分数指标)
• 定义优化参数
• 开始优化（最大化分数还是最小化分数等）
• 得到优化结果
from  bayes_opt import BayesianOptimization

# Objective function for Bayesian optimization.  BayesianOptimization
# maximizes, so return 1 - MAE: a smaller MAE gives a larger objective.
def rf_cv(num_leaves, max_depth, subsample, min_child_samples):
    """5-fold CV objective for an LGB model with the given parameters."""
    reg = LGBMRegressor(
        objective='regression_l1',
        num_leaves=int(num_leaves),          # bayes_opt proposes floats
        max_depth=int(max_depth),
        subsample=subsample,
        min_child_samples=int(min_child_samples),
    )
    cv_mae = cross_val_score(reg, X, Y_ln, verbose=0, cv=5,
                             scoring=make_scorer(mean_absolute_error)).mean()
    return 1 - cv_mae

# Define the search bounds for each parameter of rf_cv
rf_bo = BayesianOptimization(
rf_cv,
{
'num_leaves':(2, 100),
'max_depth':(2, 100),
'subsample':(0.1, 1),
'min_child_samples':(2, 100)
}
)

# Run the optimization
num_iter = 25
init_points = 5
rf_bo.maximize(init_points=init_points,n_iter=num_iter)

# Show the best result found
# NOTE(review): on bayes_opt >= 1.0, `res` is a list and this raises —
# use `rf_bo.max` there instead.
rf_bo.res["max"]

# Probe near known-good parameter values
# NOTE(review): `explore` was removed in newer bayes_opt versions (use
# `probe`), and these keys match the RandomForest objective defined
# below, not the `rf_cv(num_leaves, ...)` above — confirm intent.
rf_bo.explore(
{'n_estimators': [10, 100, 200],
'min_samples_split': [2, 10, 20],
'max_features': [0.1, 0.5, 0.9],
'max_depth': [5, 10, 15]
})

# Objective function for the RandomForest Bayesian search (same
# convention as above: maximize 1 - MAE).
def rf_cv(n_estimators,  max_depth):
    """5-fold CV objective for a RandomForest with the given parameters."""
    reg = RandomForestRegressor(n_estimators=int(n_estimators),
                                max_depth=int(max_depth))
    cv_mae = cross_val_score(reg, X, Y_ln, verbose=0, cv=5,
                             scoring=make_scorer(mean_absolute_error)).mean()
    return 1 - cv_mae

# Search bounds for the RandomForest parameters
rf_bo = BayesianOptimization(
rf_cv,
{
'n_estimators':(100, 200),
'max_depth':(2, 100)
}
)

# Run the optimization with default settings
rf_bo.maximize()

### 4. 绘制训练集曲线与验证集曲线

• 学习曲线是不同训练集大小，模型在训练集和验证集上的得分变化曲线
• 学习曲线图的横坐标是x_train的数据量，纵坐标是对应的train_score，test_score。随着训练样本的逐渐增加，算法练出的模型的表现能力；

train_sizes, train_scores, test_score = learning_curve(estimator, X, y, groups=None, train_sizes=array([0.1, 0.33, 0.55, 0.78, 1. ]), cv='warn', scoring=None)

• estimator：估计器，用什么模型进行学习；
• cv：交叉验证生成器，确定交叉验证拆分策略；

learning_curve为什么运行时间那么长：模型要进行train_sizes * cv次运行

from sklearn.model_selection import learning_curve, validation_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_size=None):
    """Plot train / cross-validation MAE as the training-set size grows.

    :param estimator: the (configured, unfitted) model to evaluate
    :param title: figure title
    :param X: training features
    :param y: training target
    :param ylim: optional (low, high) limits for the y axis
    :param cv: CV splitter or fold count forwarded to learning_curve
    :param n_jobs: parallelism forwarded to learning_curve
    :param train_size: fractions of the data to train on; defaults to
        np.linspace(0.1, 1.0, 5).  (Built inside the function rather
        than in the signature to avoid a mutable default argument.)
    :return: the matplotlib.pyplot module, so callers can keep drawing
    """
    if train_size is None:
        train_size = np.linspace(.1, 1.0, 5)
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training example')
    plt.ylabel('score')
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_size,
        scoring=make_scorer(mean_absolute_error))
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()  # background grid
    # +/- one standard deviation band around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r',
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt

# With tuned parameters, check the offline MAE and draw the learning curve.
# Fix: LGBMRegressor's keyword is `num_leaves`, not `leaves` — the original
# setting was not applied as intended.
model = LGBMRegressor(n_estimators=1000, num_leaves=200, learning_rate=0.05, objective='regression_l1')
model.fit(X, Y_ln)
pred2 = model.predict(XTest)
print("mae: ", mean_absolute_error(Ytrue, np.expm1(pred2)))

# Learning curve on a 10k subsample (learning_curve fits
# len(train_sizes) * cv models, so the full set would be slow).
plot_learning_curve(model, 'LGB', X[:10000], Y_ln[:10000], ylim=(0.0, 1), cv=5, n_jobs=1)

#### 下面整理一下如何观察学习曲线

learning_curve里面有个scoring参数可以设置你想求的值，分类可以设置 accuracy ，回归问题可以设置 neg_mean_squared_error ，总体来说，这些值都是越大越好；但是注意如果模型的 scoring 设置的是 MAE 误差，那就是越低越好