ReinflectionTransduce3/vocabulary.py at master · davidgu13/ReinflectionTransduce3 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from collections import Counter

from defaults import (BEGIN_WORD, BEGIN_WORD_CHAR, COPY,
                      COPY_CHAR, DELETE, DELETE_CHAR, END_WORD,
                      END_WORD_CHAR, STEP, STEP_CHAR, UNK, UNK_CHAR)


#############################################################
# VOCABULARY
#############################################################

class Vocab(object):
    """Growable two-way mapping between words and integer indices.

    Attributes:
        w2i: dict mapping word -> index.
        i2w: dict mapping index -> word (inverse of ``w2i``).
        encoding: codec used to decode ``bytes`` input, or a falsy value
            to disable decoding altogether.
        freqs: ``Counter`` keyed by index. Every index present at
            construction starts at 1, and each ``vocab[word]`` lookup adds 1.
    """

    def __init__(self, w2i=None, encoding='utf8'):
        """Create a vocabulary, optionally pre-seeded from ``w2i``.

        Args:
            w2i: optional mapping word -> index. It is copied, so the
                caller's dict is never modified by later insertions.
            encoding: codec for decoding ``bytes`` words; falsy disables it.
        """
        self.w2i = dict() if w2i is None else dict(w2i)
        self.i2w = {i: w for w, i in self.w2i.items()}
        self.encoding = encoding
        # Count the pre-seeded indices once each (a keys view is iterated
        # element-wise by Counter, it is not treated as a mapping of counts).
        self.freqs = Counter(self.i2w.keys())

    @staticmethod
    def _decode(word, encoding):
        """Return ``word`` as text, decoding it only when it is ``bytes``.

        Text (``str``) input has no ``.decode`` on Python 3, so it is passed
        through untouched instead of raising ``AttributeError``.
        """
        if encoding and isinstance(word, bytes):
            return word.decode(encoding)
        return word

    @classmethod
    def from_list(cls, words, encoding='utf8'):
        """Build a vocabulary from an iterable of words.

        Words are decoded first and then de-duplicated; indices are assigned
        in order of first occurrence so the result is reproducible across
        runs (iterating a ``set`` of strings is subject to hash
        randomization).

        Args:
            words: iterable of ``str`` or ``bytes`` items.
            encoding: codec for decoding ``bytes`` words; falsy disables it.

        Returns:
            A new instance of ``cls`` containing every distinct word.
        """
        w2i = {}
        for word in words:
            word = cls._decode(word, encoding)
            if word not in w2i:
                w2i[word] = len(w2i)
        return cls(w2i, encoding=encoding)

    def __getitem__(self, word):
        """Return the index of ``word``, registering it if it is unseen.

        NOTE: this lookup has side effects -- an unknown word is added to
        the vocabulary, and the frequency of the returned index is
        incremented on every call.
        """
        word = self._decode(word, self.encoding)
        if word in self.w2i:
            idx = self.w2i[word]
        else:
            # next free index is the current vocabulary size
            idx = self.size()
            self.w2i[word] = idx
            self.i2w[idx] = word
        self.freqs[idx] += 1
        return idx

    def __contains__(self, word):
        """Return True if ``word`` is already indexed (never inserts)."""
        return self._decode(word, self.encoding) in self.w2i

    def keys(self):
        """Return a list of all indexed words."""
        return list(self.w2i.keys())

    def freq(self):
        """Return a plain-dict copy of the index -> count table."""
        return dict(self.freqs)

    def __repr__(self):
        return str(self.w2i)

    def __len__(self):
        return self.size()

    def size(self):
        """Return the number of indexed words."""
        return len(self.w2i)


class VocabBox(object):
    """Container holding the separate vocabularies used by a model.

    It keeps one ``Vocab`` each for actions, features, POS tags, feature
    types, characters and words. Depending on the constructor flags some of
    these attributes refer to the very same ``Vocab`` object.
    """

    def __init__(self, acts, pos_emb, avm_feat_format, param_tying, encoding):
        """Set up all vocabularies.

        Args:
            acts: mapping of special action symbols to their indices.
            pos_emb: if truthy, POS tags get a vocabulary of their own;
                otherwise they share the feature vocabulary.
            avm_feat_format: if truthy, feature types get a vocabulary of
                their own; otherwise they share the feature vocabulary.
            param_tying: if truthy, characters share the action vocabulary.
            encoding: forwarded to every ``Vocab`` that is created.
        """
        # special actions: raw mapping, its Vocab, and how many there are
        self.w2i_acts = acts
        self.act = Vocab(acts, encoding=encoding)
        self.number_specials = len(self.w2i_acts)

        # features start out knowing only the unknown symbol
        unk_only = {UNK_CHAR : UNK}
        self.feat = Vocab(unk_only, encoding=encoding)

        # POS tags: own index space, or shared with features
        if not pos_emb:
            self.pos = self.feat
        else:
            self.pos = Vocab(unk_only, encoding=encoding)
            print('VOCAB will index POS separately.')

        # feature types: own (initially empty) index space, or shared
        if not avm_feat_format:
            self.feat_type = self.feat
        else:
            self.feat_type = Vocab(dict(), encoding=encoding)
            print('VOCAB will index all feature types.')

        # characters: tied to the action indices, or own space with specials
        if not param_tying:
            special_chars = {BEGIN_WORD_CHAR : BEGIN_WORD,
                             END_WORD_CHAR : END_WORD,
                             UNK_CHAR : UNK}
            self.char = Vocab(special_chars, encoding=encoding)
        else:
            self.char = self.act
            print('VOCAB will use same indices for actions and chars.')

        # whole words
        self.word = Vocab(encoding=encoding)

        # training-set cut-offs stay unset until train_cutoff() is called
        self.act_train = self.feat_train = self.pos_train = None
        self.char_train = self.feat_type_train = None

    def __repr__(self):
        template = ('VocabBox (act, feat, pos, char, feat_type) with the following '
                    'special actions: {}')
        return template.format(self.w2i_acts)

    def train_cutoff(self):
        """Record the current size of each vocabulary.

        The stored sizes separate elements indexed so far (the training set)
        from elements that get indexed later from unseen samples.
        """
        for name in ('act', 'feat', 'pos', 'char', 'feat_type'):
            setattr(self, name + '_train', len(getattr(self, name)))

class MinimalVocab(VocabBox):
    """VocabBox whose special actions are UNK, begin-word, end-word and STEP."""

    def __init__(self, pos_emb=True, avm_feat_format=False, param_tying=False, encoding=None):
        # special action symbols and the indices reserved for them
        special_actions = {
            UNK_CHAR : UNK,
            BEGIN_WORD_CHAR : BEGIN_WORD,
            END_WORD_CHAR : END_WORD,
            STEP_CHAR : STEP,
        }
        super(MinimalVocab, self).__init__(
            special_actions, pos_emb, avm_feat_format, param_tying, encoding)

class EditVocab(VocabBox):
    """VocabBox whose special actions are UNK, begin-word, end-word, DELETE and COPY."""

    def __init__(self, pos_emb=True, avm_feat_format=False, param_tying=False, encoding=None):
        # special action symbols and the indices reserved for them
        special_actions = {
            UNK_CHAR : UNK,
            BEGIN_WORD_CHAR : BEGIN_WORD,
            END_WORD_CHAR : END_WORD,
            DELETE_CHAR : DELETE,
            COPY_CHAR : COPY,
        }
        super(EditVocab, self).__init__(
            special_actions, pos_emb, avm_feat_format, param_tying, encoding)