智源&计算所-互联网虚假新闻检测挑战赛（冠军）方案，开源分享

https://www.biendata.com/competition/falsenews/

https://www.biendata.com/models/category/3529/L_notebook/

BERT模型可采用roeberta_zh_L-24_H-1024_A-16，其优点为准确率高，缺点为显存占用率较高。

def remake(x, num):
    """Repeat each element of ``x``: ``x[i]`` appears ``num[i]`` times, in order.

    Used to expand per-word ids back to per-character length so the word-level
    sequence aligns with a character-level tokenization.
    """
    return [x[i] for i, count in enumerate(num) for _ in range(count)]

# Word-segment the text, map words to ids, then expand each id to the word's
# character length so the sequence lines up with character-level BERT tokens.
words = list(jieba.cut(text))
temp = [len(w) for w in words]
# Id 1 doubles as the OOV id.
# NOTE(review): assumes every entry of `vocabulary` is a key of `word2id` — confirm.
x3 = [word2id[w] if w in vocabulary else 1 for w in words]
x3 = remake(x3, temp)
# Pad or truncate to exactly `maxlen`, placing id 1 at the two
# special-token positions ([CLS]/[SEP] slots) and 0 as padding.
if len(x3) < maxlen - 2:
    padding = [0] * (maxlen - len(x3) - 2)
    x3 = [1] + x3 + [1] + padding
else:
    x3 = [1] + x3[:maxlen - 2] + [1]

class MaskedGlobalMaxPool1D(keras.layers.Layer):
    """Global max pooling over the time axis that honours an incoming Keras mask.

    Masked timesteps are pushed far below the real values before taking the
    max, so padding positions can never win the pooling.
    """

    def __init__(self, **kwargs):
        super(MaskedGlobalMaxPool1D, self).__init__(**kwargs)
        # Accept a mask from upstream layers (e.g. Embedding(mask_zero=True)).
        self.supports_masking = True

    def compute_mask(self, inputs, mask=None):
        # Pooling collapses the time axis, so no mask propagates downstream.
        return None

    def compute_output_shape(self, input_shape):
        # (batch, time, features) -> (batch, features)
        return input_shape[:-2] + (input_shape[-1],)

    def call(self, inputs, mask=None):
        if mask is None:
            return K.max(inputs, axis=-2)
        # Subtract a large constant (1e6) at masked positions so they lose the max.
        penalty = K.expand_dims((1.0 - K.cast(mask, K.floatx())) * 1e6, axis=-1)
        return K.max(inputs - penalty, axis=-2)

支持mask的平均池化

class MaskedGlobalAveragePooling1D(keras.layers.Layer):
    """Global average pooling over the time axis that honours a Keras mask.

    Masked (padding) timesteps are excluded from both the numerator and the
    denominator, so the mean is computed only over real tokens.
    """

    def __init__(self, **kwargs):
        super(MaskedGlobalAveragePooling1D, self).__init__(**kwargs)
        # Accept a mask from upstream layers (e.g. Embedding(mask_zero=True)).
        self.supports_masking = True

    def compute_mask(self, inputs, mask=None):
        # Pooling collapses the time axis, so no mask propagates downstream.
        return None

    def compute_output_shape(self, input_shape):
        # (batch, time, features) -> (batch, features)
        return input_shape[:-2] + (input_shape[-1],)

    def call(self, x, mask=None):
        if mask is not None:
            # mask: (batch, time) -> K.repeat -> (batch, features, time);
            # requires x.shape[-1] to be statically known.
            mask = K.repeat(mask, x.shape[-1])
            # -> (batch, time, features), lining up with x for elementwise use.
            mask = tf.transpose(mask, [0, 2, 1])
            mask = K.cast(mask, K.floatx())
            x = x * mask
            # Sum of kept timesteps divided by the count of kept timesteps.
            # NOTE(review): divides by zero if a sample is fully masked —
            # confirm upstream guarantees at least one unmasked timestep.
            return K.sum(x, axis=1) / K.sum(mask, axis=1)
        else:
            return K.mean(x, axis=1)

Bert Finetune

# --- Plain BERT fine-tuning classifier ---
# keras-bert input convention: x1 = token ids, x2 = segment ids.
x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
# Unfreeze every BERT layer so the whole encoder is fine-tuned.
for l in bert_model.layers:
    l.trainable = True

x = bert_model([x1_in, x2_in])
# Use the first position ([CLS] token) as the sequence representation.
x = Lambda(lambda x: x[:, 0])(x)
x = Dropout(0.1)(x)
# Single sigmoid unit: binary fake-news probability.
p = Dense(1, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    # Small learning rate, as is usual when fine-tuning all BERT weights.
    optimizer=Adam(1e-5),
    metrics=['accuracy']
)

BERT+TextCNN

# --- BERT + TextCNN classifier ---
# Inputs: x1 = BERT token ids, x2 = BERT segment ids,
# x3 = ids for an auxiliary word embedding aligned to the same sequence length.
x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))
x3_in = Input(shape=(None,))
x1, x2,x3 = x1_in, x2_in,x3_in
# 1.0 where the token id is non-zero (real token), 0.0 on padding.
# NOTE(review): x_mask (and the x1/x2 aliases) appear unused in this snippet —
# possibly consumed by later cells; confirm before removing.
x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x1)
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
# vocabulary size + 2: reserves id 0 for padding (mask_zero) and id 1 for OOV.
embedding1= Embedding(len(vocabulary) + 2, 200,weights=[embedding_index],mask_zero= True)
x3 = embedding1(x3)
embed_layer = bert_model([x1_in, x2_in])
# Concatenate BERT features with the word-embedding channel on the feature axis.
embed_layer  = Concatenate()([embed_layer,x3])
# TextCNN head: masked 1-D convolution, then max + average pooling combined.
x = MaskedConv1D(filters=256, kernel_size=3, padding='same', activation='relu')(embed_layer )
pool = MaskedGlobalMaxPool1D()(x)
ave = MaskedGlobalAveragePooling1D()(x)
x = Add()([pool,ave])
x = Dropout(0.1)(x)
x = Dense(32, activation = 'relu')(x)
p = Dense(1, activation='sigmoid')(x)
model = Model([x1_in, x2_in,x3_in], p)
model.compile(
    loss='binary_crossentropy',
    # Larger LR than pure fine-tuning (1e-5 above): the CNN head starts from scratch.
    optimizer=Adam(1e-3),
    metrics=['accuracy']
)

BERT + RNN + CNN

# --- BERT + BiLSTM + CNN classifier ---
# Same wiring as the BERT+TextCNN model, with two stacked bidirectional LSTMs
# inserted between the concatenated embeddings and the convolutional head.
x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))
x3_in = Input(shape=(None,))
x1, x2,x3 = x1_in, x2_in,x3_in
# 1.0 where the token id is non-zero (real token), 0.0 on padding.
# NOTE(review): x_mask (and the x1/x2 aliases) appear unused in this snippet —
# possibly consumed by later cells; confirm before removing.
x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x1)
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
# vocabulary size + 2: reserves id 0 for padding (mask_zero) and id 1 for OOV.
embedding1= Embedding(len(vocabulary) + 2, 200,weights=[embedding_index],mask_zero= True)
x3 = embedding1(x3)
embed_layer = bert_model([x1_in, x2_in])
# Concatenate BERT features with the word-embedding channel on the feature axis.
embed_layer  = Concatenate()([embed_layer,x3])
# Two stacked BiLSTMs; return_sequences=True keeps the time axis for the conv head.
embed_layer = Bidirectional(LSTM(units=128,return_sequences=True))(embed_layer)
embed_layer = Bidirectional(LSTM(units=128,return_sequences=True))(embed_layer)
x = MaskedConv1D(filters=256, kernel_size=3, padding='same', activation='relu')(embed_layer )
pool = MaskedGlobalMaxPool1D()(x)
ave = MaskedGlobalAveragePooling1D()(x)
x = Add()([pool,ave])
x = Dropout(0.1)(x)
x = Dense(32, activation = 'relu')(x)
p = Dense(1, activation='sigmoid')(x)
model = Model([x1_in, x2_in,x3_in], p)
model.compile(
    loss='binary_crossentropy',
    # Larger LR than pure fine-tuning: the recurrent/conv head starts from scratch.
    optimizer=Adam(1e-3),
    metrics=['accuracy']
)

10折交叉训练

# 10-fold cross-validation: train one fresh model per fold.
# Hoisted: the original rebuilt np.array(train_data_X) / np.array(train_data_Y)
# four times inside every fold; converting once is equivalent and avoids the
# repeated copies of the full dataset.
train_X_arr = np.array(train_data_X)
train_Y_arr = np.array(train_data_Y)
for train, test in kfold.split(train_data_X, train_data_Y):
    model = getModel()
    t1, t2 = train_X_arr[train], train_X_arr[test]
    t3, t4 = train_Y_arr[train], train_Y_arr[test]
    train_D = data_generator(t1.tolist(), t3.tolist())
    # dev_D is not passed to fit — presumably read as a module-level name by
    # the Evaluate callback; confirm before renaming/removing.
    dev_D = data_generator(t2.tolist(), t4.tolist())
    evaluator = Evaluate()
    model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=3,
        callbacks=[evaluator, lrate],
    )
    # Drop the model and reset the TF graph between folds so GPU memory
    # does not accumulate across the 10 runs.
    del model
    K.clear_session()

def extract(L):
    """Collect the ``.word`` attribute from every keyword object in ``L``."""
    words = []
    for item in L:
        words.append(item.word)
    return words

# Keyword frequency statistics over the training texts via TextRank.
tr4w = TextRank4Keyword()
result = []
for sentence in train:
    # BUG FIX: the original called analyze(text=text, ...) — the module-level
    # `text` — on every iteration, so the loop variable `sentence` was never
    # used and the same document's keywords were counted repeatedly.
    tr4w.analyze(text=sentence, lower=True, window=2)
    # Top-10 keywords per text (min word length 1).
    s = extract(tr4w.get_keywords(10, word_min_len=1))
    result += s  # extend in place instead of quadratic list re-concatenation
c = Counter(result)
print(c.most_common(100))

Datawhale 竞赛交流群已成立，可申请加入竞赛学习交流群，一起组队参赛（申请时一定要备注来意）。

▲长按加群
AI学习路线和优质资源，在后台回复“AI”获取