from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dropout, GRU, TimeDistributed, Bidirectional, Embedding, Dense
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras import initializers
class AttentionLayer(layers.Layer):
    """
    Hierarchical attention layer as described in "Hierarchical Attention Networks
    for Document Classification" (2016) by Yang et al.
    Source: https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf

    A standalone usage sketch follows this class definition.
    """

    def __init__(self, attention_dim=100, return_coefficients=True, **kwargs):
        self.supports_masking = True
        self.return_coefficients = return_coefficients
        self.init = initializers.get('glorot_uniform')
        self.attention_dim = attention_dim
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Expects inputs of shape (batch, timesteps, features).
        assert len(input_shape) == 3
        self.W = self.add_weight(shape=(input_shape[-1], self.attention_dim),
                                 initializer=self.init,
                                 trainable=True,
                                 name='W')
        self.b = self.add_weight(shape=(self.attention_dim,),
                                 initializer=self.init,
                                 trainable=True,
                                 name='b')
        self.u = self.add_weight(shape=(self.attention_dim, 1),
                                 initializer=self.init,
                                 trainable=True,
                                 name='u')
        super(AttentionLayer, self).build(input_shape)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'attention_dim': self.attention_dim,
        })
        return config

    def compute_mask(self, inputs, input_mask=None):
        # The mask is consumed here and not propagated to downstream layers.
        return None

    def call(self, h_it, mask=None):
        # u_it = tanh(W h_it + b): hidden representation of each timestep.
        u_it = K.bias_add(K.dot(h_it, self.W), self.b)
        u_it = K.tanh(u_it)

        # a_it = softmax(u_it . u): masked, normalised attention weight per timestep.
        a_it = K.dot(u_it, self.u)
        a_it = K.squeeze(a_it, -1)
        a_it = K.exp(a_it)
        if mask is not None:
            a_it *= K.cast(mask, K.floatx())
        a_it /= K.cast(K.sum(a_it, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a_it = K.expand_dims(a_it)

        # Attention-weighted sum of the timestep representations.
        weighted_input = h_it * a_it
        if self.return_coefficients:
            return [K.sum(weighted_input, axis=1), a_it]
        else:
            return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        if self.return_coefficients:
            # The coefficients carry one weight per timestep: (batch, timesteps, 1).
            return [(input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1], 1)]
        else:
            return input_shape[0], input_shape[-1]
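
# Minimal standalone sketch (illustration, not part of the original file): apply
# AttentionLayer on top of a toy bidirectional GRU encoder to obtain both the
# attended vector and the per-timestep coefficients. The sequence length (10)
# and feature sizes (64/32) are arbitrary assumptions chosen for the example.
def _attention_layer_demo():
    demo_input = Input(shape=(10, 64), name='demo_input')
    demo_gru = Bidirectional(GRU(32, return_sequences=True), name='demo_gru')(demo_input)
    demo_vec, demo_coeff = AttentionLayer(attention_dim=64, return_coefficients=True,
                                          name='demo_attention')(demo_gru)
    return Model(demo_input, [demo_vec, demo_coeff], name='AttentionDemo')
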
def HanModel(n_classes, len_word_index, embedding_matrix, MAX_SENTENCE_NUM=40, MAX_WORD_NUM=50, EMBED_SIZE=100):
    # Word encoder: embeds each word of a sentence and attends over the words.
    word_input = Input(shape=(MAX_WORD_NUM,), dtype='int32', name='word_input')
    word_sequences = Embedding(len_word_index + 1, EMBED_SIZE, weights=[embedding_matrix], input_length=MAX_WORD_NUM,
                               trainable=True, name='word_embedding')(word_input)
    emb_drop = Dropout(rate=0.2, name='word_dropout')(word_sequences)
    word_gru = Bidirectional(GRU(EMBED_SIZE // 2, return_sequences=True, bias_regularizer=regularizers.l2(0.01),
                                 kernel_regularizer=regularizers.l2(0.01), recurrent_regularizer=regularizers.l2(0.01)),
                             name='word_gru')(emb_drop)
    word_dense = Dense(EMBED_SIZE, activation='relu', name='word_dense')(word_gru)
    word_att, word_coeff = AttentionLayer(EMBED_SIZE, return_coefficients=True, name='word_attention')(word_dense)
    word_encoder = Model(inputs=word_input, outputs=word_att, name='WordEncoder')
    print(word_encoder.summary())

    # Sentence encoder: applies the word encoder to each sentence and attends over the sentences.
    sent_input = Input(shape=(MAX_SENTENCE_NUM, MAX_WORD_NUM), dtype='int32', name='sent_input')
    sent_encoder = TimeDistributed(word_encoder, name='sent_linking')(sent_input)
    sent_gru = Bidirectional(GRU(EMBED_SIZE // 2, return_sequences=True, bias_regularizer=regularizers.l2(0.01),
                                 kernel_regularizer=regularizers.l2(0.01), recurrent_regularizer=regularizers.l2(0.01)),
                             name='sent_gru')(sent_encoder)
    sent_dense = Dense(EMBED_SIZE, activation='relu', name='sent_dense')(sent_gru)
    sent_att, sent_coeff = AttentionLayer(EMBED_SIZE, return_coefficients=True, name='sent_attention')(sent_dense)
    sent_drop = Dropout(rate=0.5, name='sent_dropout')(sent_att)
    preds = Dense(n_classes, activation='softmax', name='output')(sent_drop)
    return Model(sent_input, preds, name='HanModel')
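
if __name__ == "__main__":
    # Usage sketch (illustration, not part of the original file): build and compile
    # the HAN with a random embedding matrix. The vocabulary size, class count and
    # document shape below are assumptions made for this example.
    import numpy as np

    vocab_size = 20000                     # assumed stand-in for len(word_index)
    embed_size = 100
    dummy_embeddings = np.random.normal(size=(vocab_size + 1, embed_size))
    model = HanModel(n_classes=5,
                     len_word_index=vocab_size,
                     embedding_matrix=dummy_embeddings,
                     MAX_SENTENCE_NUM=40,
                     MAX_WORD_NUM=50,
                     EMBED_SIZE=embed_size)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()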