# 如何让Bert在finetune小数据集时更“稳”一点

（updated on 2020.06.17）新增内容 ：这里的推导我又研究了一下，最后看到一个网站上的回答有些道理，这里贴出来供大家参考：Understanding a derivation of bias correction for the Adam optimizer

# Excerpt from BERT's AdamWeightDecayOptimizer.apply_gradients: per-parameter
# slot variables for Adam's first/second moment estimates, and the moment
# updates themselves.
# NOTE(review): the source lines were truncated by extraction; the `name=`
# arguments and the tail of `next_v` are reconstructed from the reference
# BERT implementation — confirm against the original article.
m = tf.get_variable(
    name=param_name + "/adam_m",  # `param_name` comes from the enclosing loop
    shape=param.shape.as_list(),
    dtype=tf.float32,
    trainable=False,  # optimizer state must not receive gradients itself
    initializer=tf.zeros_initializer())
v = tf.get_variable(
    name=param_name + "/adam_v",
    shape=param.shape.as_list(),
    dtype=tf.float32,
    trainable=False,
    initializer=tf.zeros_initializer())

# First moment: exponential moving average of the gradient.
next_m = (
    tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
# Second moment: exponential moving average of the squared gradient.
next_v = (
    tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                              tf.square(grad)))

# Plain (bias-uncorrected) Adam step direction — the article's point is that
# BERT's optimizer omits the bias correction performed below.
update = next_m / (tf.sqrt(next_v) + self.epsilon)

### 再次验证

class AdamWeightDecayOptimizer(optimizer.Optimizer):
    """A basic Adam optimizer that includes "correct" L2 weight decay.

    Unlike BERT's original implementation, this version subclasses
    ``tf.train.Optimizer`` so that slot/non-slot machinery (and Adam's
    bias correction) can be used.
    """

    # NOTE(review): the constructor signature was truncated in the source;
    # the trailing `name` parameter and the `super().__init__` call are
    # reconstructed — confirm against the original article.
    def __init__(self,
                 learning_rate,
                 weight_decay_rate=0.0,
                 beta_1=0.9,
                 beta_2=0.999,
                 epsilon=1e-6,
                 exclude_from_weight_decay=None,
                 name="AdamWeightDecayOptimizer"):
        """Constructs an AdamWeightDecayOptimizer.

        Args:
            learning_rate: base learning rate (scalar or schedule tensor).
            weight_decay_rate: decoupled L2 weight-decay coefficient.
            beta_1: exponential decay rate for the first-moment estimate.
            beta_2: exponential decay rate for the second-moment estimate.
            epsilon: small constant added to the denominator for stability.
            exclude_from_weight_decay: list of regex patterns; parameters
                whose names match are exempt from weight decay.
            name: optional name for the optimizer ops.
        """
        super(AdamWeightDecayOptimizer, self).__init__(False, name)
        self.learning_rate = learning_rate
        self.weight_decay_rate = weight_decay_rate
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.exclude_from_weight_decay = exclude_from_weight_decay
        # Tensor versions populated in _prepare(); named to match the
        # attributes that _prepare()/_finish() actually read (the source
        # inconsistently used `_beta1_t` here but `beta_1_t` elsewhere).
        self.learning_rate_t = None
        self.weight_decay_rate_t = None
        self.beta_1_t = None
        self.beta_2_t = None
        self.epsilon_t = None
def _get_beta_accumulators(self):
    """Returns the (beta1_power, beta2_power) non-slot accumulator variables.

    These hold beta^t and are used for Adam's bias correction; they are
    created in _create_slots() and advanced in _finish().
    """
    with ops.init_scope():
        if context.executing_eagerly():
            # Eager non-slot variables are not tied to a graph.
            graph = None
        else:
            graph = ops.get_default_graph()
        return (self._get_non_slot_variable("beta1_power", graph=graph),
                self._get_non_slot_variable("beta2_power", graph=graph))

def _prepare(self):
    """Converts the Python-number hyperparameters to tensors once per apply."""
    self.learning_rate_t = ops.convert_to_tensor(
        self.learning_rate, name='learning_rate')
    self.weight_decay_rate_t = ops.convert_to_tensor(
        self.weight_decay_rate, name='weight_decay_rate')
    self.beta_1_t = ops.convert_to_tensor(self.beta_1, name='beta_1')
    self.beta_2_t = ops.convert_to_tensor(self.beta_2, name='beta_2')
    self.epsilon_t = ops.convert_to_tensor(self.epsilon, name='epsilon')

def _create_slots(self, var_list):
    """Creates m/v moment slots per variable and the beta-power accumulators.

    The non-slot beta1_power/beta2_power variables start at beta_1/beta_2
    (i.e. beta^1) and are multiplied by beta each step in _finish().
    """
    # Colocate the shared accumulators with the lexicographically-first
    # variable so their placement is deterministic.
    first_var = min(var_list, key=lambda x: x.name)
    self._create_non_slot_variable(initial_value=self.beta_1,
                                   name="beta1_power",
                                   colocate_with=first_var)
    self._create_non_slot_variable(initial_value=self.beta_2,
                                   name="beta2_power",
                                   colocate_with=first_var)
    for v in var_list:
        self._zeros_slot(v, 'm', self._name)
        self._zeros_slot(v, 'v', self._name)

# NOTE(review): the `def` header and parts of next_m/next_v were lost in
# extraction; reconstructed from the surviving lines plus BERT's reference
# implementation — confirm against the original article.
def _apply_dense(self, grad, var):
    """Dense-gradient update: bias-corrected Adam step plus decoupled weight decay."""
    learning_rate_t = math_ops.cast(self.learning_rate_t, var.dtype.base_dtype)
    beta_1_t = math_ops.cast(self.beta_1_t, var.dtype.base_dtype)
    beta_2_t = math_ops.cast(self.beta_2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self.epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = math_ops.cast(
        self.weight_decay_rate_t, var.dtype.base_dtype)

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    # Adam bias correction folded into the learning rate:
    # lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t).
    learning_rate_t = (
        learning_rate_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    next_m = (
        tf.multiply(beta_1_t, m) + tf.multiply(1.0 - beta_1_t, grad))
    next_v = (
        tf.multiply(beta_2_t, v) + tf.multiply(1.0 - beta_2_t,
                                               tf.square(grad)))

    update = next_m / (tf.sqrt(next_v) + epsilon_t)

    # Decoupled weight decay is added to the update (not the gradient), so
    # it is not divided by sqrt(v) — the "correct" L2 of AdamW.
    if self._do_use_weight_decay(var.name):
        update += weight_decay_rate_t * var

    update_with_lr = learning_rate_t * update
    next_param = var - update_with_lr

    return control_flow_ops.group(*[var.assign(next_param),
                                    m.assign(next_m),
                                    v.assign(next_v)])

# NOTE(review): this block is byte-identical to the previous one in the
# source; given the Optimizer subclass it is presumably the resource-variable
# counterpart. Header reconstructed — confirm against the original article.
def _resource_apply_dense(self, grad, var):
    """Resource-variable dense update; mirrors _apply_dense."""
    learning_rate_t = math_ops.cast(self.learning_rate_t, var.dtype.base_dtype)
    beta_1_t = math_ops.cast(self.beta_1_t, var.dtype.base_dtype)
    beta_2_t = math_ops.cast(self.beta_2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self.epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = math_ops.cast(
        self.weight_decay_rate_t, var.dtype.base_dtype)

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    # Fold Adam's bias correction into the effective learning rate.
    learning_rate_t = (
        learning_rate_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    next_m = (
        tf.multiply(beta_1_t, m) + tf.multiply(1.0 - beta_1_t, grad))
    next_v = (
        tf.multiply(beta_2_t, v) + tf.multiply(1.0 - beta_2_t,
                                               tf.square(grad)))

    update = next_m / (tf.sqrt(next_v) + epsilon_t)

    if self._do_use_weight_decay(var.name):
        update += weight_decay_rate_t * var

    update_with_lr = learning_rate_t * update
    next_param = var - update_with_lr

    return control_flow_ops.group(*[var.assign(next_param),
                                    m.assign(next_m),
                                    v.assign(next_v)])

# NOTE(review): header and the scatter_add calls inside the two
# control-dependency scopes were lost in extraction; reconstructed from
# tf.train.AdamOptimizer._apply_sparse_shared — confirm against the article.
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Sparse update shared by _apply_sparse and _resource_apply_sparse.

    Args:
        grad: gradient values for the given indices.
        var: variable to update.
        indices: indices into `var` that `grad` corresponds to.
        scatter_add: callable (x, i, v) performing a scatter-add on x.
    """
    learning_rate_t = math_ops.cast(self.learning_rate_t, var.dtype.base_dtype)
    beta_1_t = math_ops.cast(self.beta_1_t, var.dtype.base_dtype)
    beta_2_t = math_ops.cast(self.beta_2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self.epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = math_ops.cast(
        self.weight_decay_rate_t, var.dtype.base_dtype)

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    # Fold Adam's bias correction into the effective learning rate.
    learning_rate_t = (
        learning_rate_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # m_t = beta1 * m + (1 - beta1) * g, applied as decay-then-scatter so
    # only the touched rows pay for the gradient term.
    m_t = state_ops.assign(m, m * beta_1_t,
                           use_locking=self._use_locking)
    m_scaled_g_values = grad * (1 - beta_1_t)
    with ops.control_dependencies([m_t]):
        m_t = scatter_add(m, indices, m_scaled_g_values)

    # v_t = beta2 * v + (1 - beta2) * g^2, same decay-then-scatter pattern.
    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
        v_t = scatter_add(v, indices, v_scaled_g_values)

    update = m_t / (math_ops.sqrt(v_t) + epsilon_t)

    if self._do_use_weight_decay(var.name):
        update += weight_decay_rate_t * var

    update_with_lr = learning_rate_t * update

    var_update = state_ops.assign_sub(var,
                                      update_with_lr,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])

# NOTE(review): header and leading arguments reconstructed from
# tf.train.AdamOptimizer._apply_sparse — confirm against the article.
def _apply_sparse(self, grad, var):
    """Sparse (IndexedSlices) update for ref variables."""
    return self._apply_sparse_shared(
        grad.values, var, grad.indices,
        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
            x, i, v, use_locking=self._use_locking))

# NOTE(review): header and the resource_scatter_add op were lost in
# extraction; reconstructed from tf.train.AdamOptimizer — confirm.
def _resource_scatter_add(self, x, i, v):
    """Scatter-adds v at indices i into resource variable x; returns x's value."""
    with ops.control_dependencies(
        [resource_variable_ops.resource_scatter_add(
            x.handle, i, v)]):
        return x.value()

# NOTE(review): only the first line of this method survived extraction;
# reconstructed from tf.train.AdamOptimizer._resource_apply_sparse — confirm.
def _resource_apply_sparse(self, grad, var, indices):
    """Sparse update for resource variables."""
    return self._apply_sparse_shared(
        grad, var, indices, self._resource_scatter_add)

def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for param_name."""
if not self.weight_decay_rate:
return False
if self.exclude_from_weight_decay:
for r in self.exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True
def _finish(self, update_ops, name_scope):
    """Advances the beta-power accumulators after all variable updates."""
    # Update the power accumulators only once every update op has run, so
    # all variables see the same beta^t within one step.
    with ops.control_dependencies(update_ops):
        beta1_power, beta2_power = self._get_beta_accumulators()
        with ops.colocate_with(beta1_power):
            update_beta1 = beta1_power.assign(
                beta1_power * self.beta_1_t, use_locking=self._use_locking)
            update_beta2 = beta2_power.assign(
                beta2_power * self.beta_2_t, use_locking=self._use_locking)
    return control_flow_ops.group(*update_ops + [update_beta1, update_beta2],
                                  name=name_scope)

# NOTE(review): stray excerpt, not a standalone definition — the article
# re-quotes these lines from the *_apply_* methods above to highlight the
# Adam bias correction it adds (lr * sqrt(1 - beta2^t) / (1 - beta1^t)).
beta1_power, beta2_power = self._get_beta_accumulators()
beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
learning_rate_t = math_ops.cast(self.learning_rate_t, var.dtype.base_dtype)
learning_rate_t = (learning_rate_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

## Weight Re-initializing

### 再次验证

def get_assignment_map_from_checkpoint(tvars, init_checkpoint,
                                       filtered_layer_names=None):
    """Computes the mapping from checkpoint variables to current variables.

    Variants of BERT's helper: any checkpoint variable whose name appears in
    `filtered_layer_names` is skipped, so those weights keep their fresh
    (re-)initialization instead of being restored.

    Args:
        tvars: list of trainable tf.Variables in the current graph.
        init_checkpoint: path to the pre-trained checkpoint.
        filtered_layer_names: optional list of variable names to exclude
            from restoration (the layers to re-initialize). Defaults to
            restoring everything, matching BERT's original behavior.

    Returns:
        (assignment_map, initialized_variable_names) where assignment_map
        maps checkpoint names to graph variables and the dict marks which
        variable names were initialized from the checkpoint.
    """
    if filtered_layer_names is None:
        filtered_layer_names = []

    initialized_variable_names = {}
    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        # Strip the ":0" output suffix from variable names.
        m = re.match("^(.*):\\d+$", name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        # Skip layers selected for re-initialization.
        if name not in filtered_layer_names:
            assignment_map[name] = name_to_variable[name]
            initialized_variable_names[name] = 1
            initialized_variable_names[name + ":0"] = 1

    return (assignment_map, initialized_variable_names)

## 更多对比实验

1、Pre-trained Weight Decay。传统的 weight decay 中，权重参数会减去一个正则项 $\frac{\lambda}{2}\lVert w\rVert^2$ 的梯度。而 pre-trained weight decay 则是在 finetune 时，将预训练时的权重 $w^{\mathrm{pre}}$ 引入到 weight decay 计算中，最终正则项为 $\frac{\lambda}{2}\lVert w-w^{\mathrm{pre}}\rVert^2$，即把参数往预训练权重（而不是原点）的方向做收缩。通过这种方式，能够使得模型的训练变得更稳定。

2、Mixout。在finetune时，每个训练iter都会设定给一个概率p，模型会根据这个p将模型参数随机替换成预训练的权重参数。这个方法主要是为了减缓灾难性遗忘，让模型不至于在finetune任务时忘记预训练时学习到的知识。

3、Layerwise Learning Rate Decay。这个方法我也经常会去尝试，即对于不同的层数，会使用不同的学习率。因为靠近底部的层学习到的是比较通用的知识，所以在finetune时并不需要它过多的去更新参数，相反靠近顶部的层由于偏向学习下游任务的相关知识，因此需要更多得被更新。

## 小结

https://zhuanlan.zhihu.com/p/148720604

#### 关于AINLP

AINLP 是一个有趣有AI的自然语言处理社区，专注于 AI、NLP、机器学习、深度学习、推荐算法等相关技术的分享，主题包括文本摘要、智能问答、聊天机器人、机器翻译、自动生成、知识图谱、预训练模型、推荐系统、计算广告、招聘信息、求职经验分享等，欢迎关注！加技术交流群请添加AINLPer(id：ainlper)，备注工作/研究方向+加群目的。