WordEmbedProcess.py
import numpy as np
from sklearn.base import BaseEstimator


class WordEmbedProcess(BaseEstimator):
    def __init__(self,
                 embed_size,
                 max_features
                 ):
        self.max_features = max_features
        self.embed_size = embed_size
        super(WordEmbedProcess, self).__init__()

    def get_pickable(self):
        return {
            'embed_size': self.embed_size,
            'max_features': self.max_features
        }

    def load_pickable(self, pkl):
        self.embed_size = pkl['embed_size']
        self.max_features = pkl['max_features']

    @staticmethod
    def load_pretrainedwordembed(embedding_path):
        """
        Read GloVe word vectors (space-delimited strings) into a dictionary {word: vector}.
        :param embedding_path: path to the pretrained embedding file
        :return: dict mapping each word to its float32 embedding vector
        """
        pretrained_wordembed = {}
        with open(embedding_path, encoding='utf8') as f:
            for line in f:
                values = line.rstrip().rsplit(' ')
                word = values[0]  # the first value is the word
                coefs = np.asarray(values[1:], dtype='float32')  # the remaining values are the coefficients
                pretrained_wordembed[word] = coefs
        return pretrained_wordembed
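    # A toy illustration (added; not in the original file) of the input format
    # load_pretrainedwordembed expects: each GloVe line is "<word> <v1> <v2> ...",
    # so for a hypothetical 3-dimensional file containing the line
    #     the 0.418 0.24968 -0.41242
    # the returned dictionary would satisfy
    #     pretrained_wordembed['the'] == np.asarray([0.418, 0.24968, -0.41242], dtype='float32')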
    def fit_transform(self, pretrainedwordembed, word_index, **fit_params):
        """
        If a training word occurs in GloVe, use its pretrained embedding.
        If a training word does not occur in GloVe, the possibilities are:
        1. use the embedding of a designated unknown-word token,
        2. take the average of the word embeddings,
        3. use a zero vector (the choice made here).
        Only the first max_features words of the training vocabulary are kept,
        and pretrained GloVe vectors are looked up for those words only.
        The i-th row of embedding_matrix holds the embedding for word index i.
        Row 0 of embedding_matrix is always zero because word_index has a minimum index value of 1.
        """
        num_words = min(self.max_features, len(word_index) + 1)
        embedding_matrix = np.zeros((num_words, self.embed_size))
        for word, i in word_index.items():
            if i >= self.max_features:
                continue
            # words not found in the pretrained embeddings keep their all-zero row
            embedding_vector = pretrainedwordembed.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
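

# A sketch (added; not in the original file) of option 2 from the fit_transform
# docstring: instead of leaving out-of-vocabulary rows all-zero, initialize every
# row from the mean/std of the pretrained vectors, then overwrite the rows of
# words that do have a pretrained embedding. The function name, the random-normal
# initialization, and the seed parameter are illustrative assumptions.
def fit_transform_mean_init(pretrainedwordembed, word_index, max_features, embed_size, seed=42):
    all_embs = np.stack(list(pretrainedwordembed.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    num_words = min(max_features, len(word_index) + 1)
    rng = np.random.RandomState(seed)
    # every row starts as a random draw around the pretrained mean ...
    embedding_matrix = rng.normal(emb_mean, emb_std, (num_words, embed_size))
    embedding_matrix[0] = 0.0  # row 0 stays zero: word_index values start at 1
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = pretrainedwordembed.get(word)
        if embedding_vector is not None:
            # ... and is overwritten wherever a pretrained vector exists
            embedding_matrix[i] = embedding_vector
    return embedding_matrix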
"""
glove_wordembed = WordEmbedProcess.load_pretrainedwordembed(config['preWordEmbedPath']) # Note staticmethod is called with classname (ie before initializing class)
wordembedProcess = WordEmbedProcess(
embed_size = 50,
max_features = 200000
)
embedding_matrix = wordembedProcess.fit_transform(pretrainedwordembed=glove_wordembed, word_index=textPreprocess.word_index)
"""