Binary Classification: Hands-on Text Classification with BERT (Complete Code Included)

import codecs
import gc
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf

import keras
import keras.backend as K
from keras.backend.tensorflow_backend import set_session
from keras.callbacks import *
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras_self_attention import SeqSelfAttention
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# Offline 0.9552568091358987, batch = 16, cross-entropy, lr = 1e-5 -> online 0.96668
# Offline 0.9603767202619631, batch = 16, f1_loss warm-started from the run above, BERT layers frozen, lr = 1e-7 -> online 0.97010


class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # map whitespace to the untrained [unused1] token
            else:
                R.append('[UNK]')      # every remaining character becomes [UNK]
        return R
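OurTokenizer tokenizes character by character instead of using BERT's default WordPiece splitting, so spaces survive as [unused1] and no character is silently dropped. A minimal sketch of its behavior with a toy vocabulary (the tokens and ids below are made up for illustration; the real script builds the dictionary from vocab.txt):

toy_dict = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[unused1]': 4, '好': 5, '评': 6}
toy_tokenizer = OurTokenizer(toy_dict)
print(toy_tokenizer.tokenize(u'好 评!'))
# -> ['[CLS]', '好', '[unused1]', '评', '[UNK]', '[SEP]']
#    the space becomes [unused1]; the out-of-vocabulary '!' becomes [UNK]
indices, segments = toy_tokenizer.encode(first=u'好 评!')
print(indices)   # [2, 5, 4, 6, 1, 3], the ids looked up in toy_dict
print(segments)  # [0, 0, 0, 0, 0, 0], all zeros for a single-sentence input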
def f1_loss(y_true, y_pred):
    # y_true: ground-truth label, 0 or 1; y_pred: predicted probability of the positive class.
    # Soft F1: the epsilon sits inside the denominator so a batch with no positives cannot divide by zero.
    loss = 2 * tf.reduce_sum(y_true * y_pred) / (tf.reduce_sum(y_true + y_pred) + K.epsilon())
    return -loss
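f1_loss returns the negative of a "soft" F1 score, a Dice-style overlap between the labels and the predicted probabilities, so minimizing the loss maximizes F1. A quick NumPy sanity check with made-up numbers:

y_true = np.array([1., 0., 1., 1.], dtype=np.float32)
y_pred = np.array([0.9, 0.1, 0.8, 0.3], dtype=np.float32)
soft_f1 = 2 * np.sum(y_true * y_pred) / (np.sum(y_true + y_pred) + 1e-7)
print(soft_f1)  # ~0.784; f1_loss returns the negative, so training pushes this toward 1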
def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])
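seq_padding right-pads every sequence in a batch to the length of the longest one, for example (toy ids):

batch = [np.array([2, 5, 3]), np.array([2, 6, 7, 8, 3])]
print(seq_padding(batch))
# [[2 5 3 0 0]
#  [2 6 7 8 3]]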
class data_generator:
    def __init__(self, data, batch_size=8, shuffle=True):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            if self.shuffle:
                np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                # e.g. indices, segments = tokenizer.encode(first='unaffable', second='钢', max_len=10)
                x1, x2 = tokenizer.encode(first=text)
                y = np.float32(d[1])
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y[:, 0]
                    X1, X2, Y = [], [], []
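The generator is consumed by fit_generator below, but one batch can be eyeballed directly. A sketch, assuming tokenizer, maxlen, and seq_padding are already set up as in the rest of this script; the samples and labels here are made up:

toy_data = np.array([(u'很好', 1), (u'太差', 0)])  # stored as strings, just like DATA_LIST below
gen = data_generator(toy_data, batch_size=2, shuffle=False)
[X1, X2], Y = next(iter(gen))
print(X1.shape, X2.shape, Y.shape)  # (2, seq_len) token ids, (2, seq_len) segment ids, (2,) labels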

def build_bert(nclass, selfloss, lr, is_train):
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

    for l in bert_model.layers:
        l.trainable = is_train

    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))

    x = bert_model([x1_in, x2_in])
    x = Lambda(lambda x: x[:, :])(x)  # no-op slice that keeps the full token sequence

    avg_pool_3 = GlobalAveragePooling1D()(x)
    max_pool_3 = GlobalMaxPooling1D()(x)
    # docs: https://www.cnpython.com/pypi/keras-self-attention
    # source: https://github.com/CyberZHG/keras-self-attention/blob/master/keras_self_attention/seq_self_attention.py
    attention_3 = SeqSelfAttention(attention_activation='softmax')(x)
    attention_3 = Lambda(lambda x: x[:, 0])(attention_3)  # attended feature at the [CLS] position

    x = keras.layers.concatenate([avg_pool_3, max_pool_3, attention_3], name='fc')
    p = Dense(nclass, activation='sigmoid')(x)

    model = Model([x1_in, x2_in], p)
    model.compile(loss=selfloss, optimizer=Adam(lr), metrics=['acc'])
    model.summary()
    return model
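build_bert pools the BERT token sequence three ways (average, max, and a self-attention feature) and concatenates them before the sigmoid output. A standalone sketch of just this head over symbolic 768-dimensional inputs (768 is the hidden size of the chinese_L-12_H-768_A-12 checkpoint) checks the wiring without loading any weights:

seq_in = Input(shape=(None, 768))
avg_pool = GlobalAveragePooling1D()(seq_in)
max_pool = GlobalMaxPooling1D()(seq_in)
att = SeqSelfAttention(attention_activation='softmax')(seq_in)
att = Lambda(lambda x: x[:, 0])(att)  # feature at the [CLS] position
feats = keras.layers.concatenate([avg_pool, max_pool, att])  # (None, 3 * 768)
head = Model(seq_in, Dense(1, activation='sigmoid')(feats))
print(head.output_shape)  # (None, 1): one probability per sample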
def run_cv(nfold, data, data_test):
    kf = KFold(n_splits=nfold, shuffle=True, random_state=2020).split(data)
    train_model_pred = np.zeros((len(data), 1))
    test_model_pred = np.zeros((len(data_test), 1))

    lr = 1e-7  # 1e-5 for the first (cross-entropy) run
    selfloss = f1_loss  # alternatives: 'categorical_crossentropy', 'binary_crossentropy', f1_loss
    is_train = False  # False freezes the BERT layers; True fine-tunes them

    for i, (train_fold, test_fold) in enumerate(kf):
        print('***************%d-th****************' % i)
        t = time.time()
        X_train, X_valid = data[train_fold, :], data[test_fold, :]

        model = build_bert(1, selfloss, lr, is_train)
        early_stopping = EarlyStopping(monitor='val_acc', patience=3)
        plateau = ReduceLROnPlateau(monitor='val_acc', verbose=1, mode='max', factor=0.5, patience=2)
        checkpoint = ModelCheckpoint('/home/codes/news_classify/comment_classify/expriments/' + str(i) + '_2.hdf5',
                                     monitor='val_acc', verbose=2, save_best_only=True, mode='max',
                                     save_weights_only=False)

        batch_size = 16
        train_D = data_generator(X_train, batch_size=batch_size, shuffle=True)
        valid_D = data_generator(X_valid, batch_size=batch_size, shuffle=False)
        test_D = data_generator(data_test, batch_size=batch_size, shuffle=False)

        # warm-start from this fold's checkpoint of the first (cross-entropy) run
        model.load_weights('/home/codes/news_classify/comment_classify/expriments/' + str(i) + '.hdf5')

        model.fit_generator(
            train_D.__iter__(),
            steps_per_epoch=len(train_D),
            epochs=8,
            validation_data=valid_D.__iter__(),
            validation_steps=len(valid_D),
            callbacks=[early_stopping, plateau, checkpoint],
        )

        # out-of-fold predictions for the train set; test predictions accumulate over folds
        train_model_pred[test_fold] = model.predict_generator(valid_D.__iter__(), steps=len(valid_D), verbose=1)
        test_model_pred += model.predict_generator(test_D.__iter__(), steps=len(test_D), verbose=1)

        del model
        gc.collect()
        K.clear_session()

        print('time:', time.time() - t)

    return train_model_pred, test_model_pred

if __name__ == '__main__':
    # TF1-style session setup: cap GPU memory at 80% and allocate on demand
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    t = time.time()
    maxlen = 20  # the longest text in the dataset has 19 characters

    config_path = '/home/codes/news_classify/chinese_L-12_H-768_A-12/bert_config.json'
    checkpoint_path = '/home/codes/news_classify/chinese_L-12_H-768_A-12/bert_model.ckpt'
    dict_path = '/home/codes/news_classify/chinese_L-12_H-768_A-12/vocab.txt'

    # build the vocabulary: one token per line, id = line number
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)

    tokenizer = OurTokenizer(token_dict)

    data_dir = '/home/codes/news_classify/comment_classify/'
    train_df = pd.read_csv(os.path.join(data_dir, 'union_train.csv'))
    test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    print(len(train_df), len(test_df))

    DATA_LIST = []
    for data_row in train_df.iloc[:].itertuples():
        DATA_LIST.append((data_row.content, data_row.label))
    DATA_LIST = np.array(DATA_LIST)

    DATA_LIST_TEST = []
    for data_row in test_df.iloc[:].itertuples():
        DATA_LIST_TEST.append((data_row.content, 0))  # dummy label for the test set
    DATA_LIST_TEST = np.array(DATA_LIST_TEST)

    n_cv = 5
    train_model_pred, test_model_pred = run_cv(n_cv, DATA_LIST, DATA_LIST_TEST)

    train_df['Prediction'] = train_model_pred[:, 0]       # out-of-fold predictions
    test_df['Prediction'] = test_model_pred[:, 0] / n_cv  # average over the folds

    train_df.to_csv(os.path.join(data_dir, 'train_union_submit2.csv'), index=False)

    test_df['ID'] = test_df.index
    test_df[['ID', 'Prediction']].to_csv(os.path.join(data_dir, 'submit2.csv'), index=False)

    # out-of-fold AUC on the training set
    auc = roc_auc_score(np.array(train_df['label']), np.array(train_df['Prediction']))
    print('auc', auc)

    print('time is ', time.time() - t)  # about 2853 s in the author's run