-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvocabulary.py
More file actions
130 lines (112 loc) · 4.53 KB
/
vocabulary.py
File metadata and controls
130 lines (112 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from collections import Counter
from defaults import (BEGIN_WORD, BEGIN_WORD_CHAR, COPY,
COPY_CHAR, DELETE, DELETE_CHAR, END_WORD,
END_WORD_CHAR, STEP, STEP_CHAR, UNK, UNK_CHAR)
#############################################################
# VOCABULARY
#############################################################
class Vocab(object):
    """Two-way mapping between symbols and integer indices.

    Looking a symbol up with ``vocab[symbol]`` returns its index; a symbol
    that has not been seen before is assigned the next free index on the
    spot.  Every lookup also increments a per-index counter.
    """

    def __init__(self, w2i=None, encoding='utf8'):
        """Create a vocabulary.

        Args:
            w2i: optional symbol -> index mapping used to seed the
                vocabulary.  It is copied, so the caller's mapping is
                never modified.
            encoding: codec used to decode byte-string symbols before
                they are looked up or stored.  A falsy value disables
                decoding altogether.
        """
        self.w2i = dict() if w2i is None else dict(w2i)
        self.i2w = {i: w for w, i in self.w2i.items()}
        self.encoding = encoding
        # every pre-seeded index starts with a count of one
        self.freqs = Counter(self.i2w.keys())

    @staticmethod
    def _decode(word, encoding):
        """Return `word` as text, decoding it only if it is a byte string.

        Text symbols have no usable ``decode`` (on Python 3 calling it
        raises AttributeError), so they are passed through unchanged.
        """
        if encoding and isinstance(word, bytes):
            return word.decode(encoding)
        return word

    @classmethod
    def from_list(cls, words, encoding='utf8'):
        """Build a vocabulary from an iterable of symbols.

        Duplicates are dropped.  Indices are handed out while iterating
        over ``set(words)``, so which symbol gets which index is arbitrary.

        Args:
            words: iterable of hashable symbols.
            encoding: see ``__init__``.

        Returns:
            A new instance of `cls` holding the distinct symbols.
        """
        w2i = {}
        for word in set(words):
            # Index by the *decoded* symbol so that two inputs which decode
            # to the same text share one index and leave no gap.
            w2i.setdefault(cls._decode(word, encoding), len(w2i))
        return cls(w2i, encoding=encoding)

    def __getitem__(self, word):
        """Return the index of `word`, adding it to the vocabulary if new.

        Note that this mutates the vocabulary for unseen symbols and
        always increments the count of the returned index.
        """
        word = self._decode(word, self.encoding)
        if word in self.w2i:
            idx = self.w2i[word]
        else:
            # encode the word since it is not in the vocabulary yet
            idx = self.size()
            self.w2i[word] = idx
            self.i2w[idx] = word
        self.freqs[idx] += 1
        return idx

    def __contains__(self, word):
        """Return True if `word` is already indexed (never adds it)."""
        return self._decode(word, self.encoding) in self.w2i

    def keys(self):
        """Return the indexed symbols as a list."""
        return list(self.w2i.keys())

    def freq(self):
        """Return a plain-dict copy of the index -> count table."""
        return dict(self.freqs)

    def __repr__(self):
        return str(self.w2i)

    def __len__(self):
        return self.size()

    def size(self):
        """Return the number of indexed symbols."""
        return len(self.w2i)
class VocabBox(object):
    """Container holding the action, feature, POS, feature-type, character
    and word vocabularies side by side."""

    def __init__(self, acts, pos_emb, avm_feat_format, param_tying, encoding):
        """Set up all vocabularies.

        Args:
            acts: symbol -> index mapping of the special actions; it seeds
                the action vocabulary and is kept as ``w2i_acts``.
            pos_emb: if true, POS tags get a vocabulary of their own;
                otherwise ``pos`` is the very same object as ``feat``.
            avm_feat_format: if true, feature types get a fresh, empty
                vocabulary; otherwise ``feat_type`` is the same object
                as ``feat``.
            param_tying: if true, ``char`` is the same object as ``act``;
                otherwise characters get a vocabulary seeded with the
                begin-word, end-word and unknown symbols.
            encoding: passed through to every ``Vocab`` created here.
        """
        # actions, and how many of them are special
        self.w2i_acts = acts
        self.act = Vocab(acts, encoding=encoding)
        self.number_specials = len(self.w2i_acts)

        # features start out knowing only the unknown symbol
        unk_only = {UNK_CHAR: UNK}
        self.feat = Vocab(unk_only, encoding=encoding)

        # POS tags: own index space or shared with the features
        if not pos_emb:
            self.pos = self.feat
        else:
            self.pos = Vocab(unk_only, encoding=encoding)
            print('VOCAB will index POS separately.')

        # feature types: own index space or shared with the features
        if not avm_feat_format:
            self.feat_type = self.feat
        else:
            self.feat_type = Vocab(dict(), encoding=encoding)
            print('VOCAB will index all feature types.')

        # characters: own index space or shared with the actions
        if not param_tying:
            special_chars = {
                BEGIN_WORD_CHAR: BEGIN_WORD,
                END_WORD_CHAR: END_WORD,
                UNK_CHAR: UNK,
            }
            self.char = Vocab(special_chars, encoding=encoding)
        else:
            self.char = self.act
            print('VOCAB will use same indices for actions and chars.')

        # whole words
        self.word = Vocab(encoding=encoding)

        # training-set cut-offs stay unset until train_cutoff() is called
        for name in ('act', 'feat', 'pos', 'char', 'feat_type'):
            setattr(self, name + '_train', None)

    def __repr__(self):
        template = ('VocabBox (act, feat, pos, char, feat_type) with the following '
                    'special actions: {}')
        return template.format(self.w2i_acts)

    def train_cutoff(self):
        """Record the current size of each vocabulary.

        The stored sizes separate the indices of training-set elements
        from those of elements encoded later from unseen samples.
        """
        for name in ('act', 'feat', 'pos', 'char', 'feat_type'):
            setattr(self, name + '_train', len(getattr(self, name)))
class MinimalVocab(VocabBox):
    """VocabBox whose special actions are the unknown, begin-word,
    end-word and step symbols."""

    def __init__(self, pos_emb=True, avm_feat_format=False, param_tying=False, encoding=None):
        # special actions, listed in the order they are inserted
        special_actions = dict([
            (UNK_CHAR, UNK),
            (BEGIN_WORD_CHAR, BEGIN_WORD),
            (END_WORD_CHAR, END_WORD),
            (STEP_CHAR, STEP),
        ])
        super(MinimalVocab, self).__init__(
            special_actions, pos_emb, avm_feat_format, param_tying, encoding)
class EditVocab(VocabBox):
    """VocabBox whose special actions are the unknown, begin-word,
    end-word, delete and copy symbols."""

    def __init__(self, pos_emb=True, avm_feat_format=False, param_tying=False, encoding=None):
        # special actions, listed in the order they are inserted
        special_actions = dict([
            (UNK_CHAR, UNK),
            (BEGIN_WORD_CHAR, BEGIN_WORD),
            (END_WORD_CHAR, END_WORD),
            (DELETE_CHAR, DELETE),
            (COPY_CHAR, COPY),
        ])
        super(EditVocab, self).__init__(
            special_actions, pos_emb, avm_feat_format, param_tying, encoding)