tiny-compiler/lex.py at main · HopeBaron/tiny-compiler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import enum
import sys


class Lexer:
    """Scanner that splits source text into ``Token`` objects one at a time.

    ``position`` is the index of the character under the cursor and
    ``current`` is that character.  Once the input is exhausted ``current``
    becomes the NUL sentinel, which ``getToken`` reports as an EOF token.
    """

    # Sentinel stored in ``current`` (and returned by ``peek``) past the end.
    _EOF = "\0"

    def __init__(self, source):
        """Store *source* and load its first character into ``current``."""
        self.source = source
        self.position = -1
        self.current = None
        self.size = len(source)
        self.nextChar()

    def nextChar(self):
        """Advance the cursor by one character and return the new ``current``.

        At the end of the input the cursor stays where it is and ``current``
        becomes the NUL sentinel.
        """
        following = self.position + 1
        if following >= self.size:
            self.current = self._EOF
        else:
            self.position = following
            self.current = self.source[following]
        return self.current

    def peek(self):
        """Return the character after ``current`` without moving the cursor.

        Returns the NUL sentinel when there is no such character.
        """
        following = self.position + 1
        if following >= self.size:
            return self._EOF
        return self.source[following]

    def abort(self, message):
        """Terminate the program via ``sys.exit`` with a lexer error message."""
        sys.exit("Lexer Aborted: " + message)

    def skipWhitespace(self):
        """Skip spaces, tabs and carriage returns (newlines are tokens)."""
        while self.current in (" ", "\t", "\r"):
            self.nextChar()

    def skipComment(self):
        """Skip a ``#`` comment up to, but not including, the line's newline."""
        if self.current == "#":
            # Also stop at end of input: a comment on the last line may have
            # no trailing newline, and ``current`` then stays at the sentinel.
            while self.current not in ("\n", self._EOF):
                self.nextChar()

    def getToken(self):
        """Scan and return the next token, leaving the cursor just past it.

        Returns:
            A ``Token`` for the next lexeme, or an EOF token once the input
            is exhausted.

        Raises:
            SystemExit: via ``abort`` on an unknown character, a malformed
                number, or an illegal or unterminated string.
        """
        self.skipWhitespace()
        self.skipComment()
        char = self.current
        pair = char + self.peek()

        two_char_ops = {
            ">=": TokenType.GTEQ,
            "<=": TokenType.LTEQ,
            "==": TokenType.EQEQ,
            "!=": TokenType.NOTEQ,
        }
        one_char_ops = {
            "=": TokenType.EQ,
            ">": TokenType.GT,
            "<": TokenType.LT,
            "+": TokenType.PLUS,
            "-": TokenType.MINUS,
            "*": TokenType.ASTERISK,
            "/": TokenType.SLASH,
            "\n": TokenType.NEWLINE,
        }

        token = None
        if char == self._EOF:
            token = Token(char, TokenType.EOF)
        elif pair in two_char_ops:
            # Two-character operators are tried before their one-character
            # prefixes, and the second character is consumed here so it is
            # not scanned again as a separate token.
            self.nextChar()
            token = Token(pair, two_char_ops[pair])
        elif char in one_char_ops:
            token = Token(char, one_char_ops[char])
        elif char == '"':
            token = Token(self._readString(), TokenType.STRING)
        elif char.isdigit():
            token = Token(self._readNumber(), TokenType.NUMBER)
        elif char.isalpha():
            word = self._readWord()
            keyword = Token.checkIfKeyword(word)
            kind = TokenType.IDENT if keyword is None else keyword
            token = Token(word, kind)
        else:
            self.abort("Unknown Token: " + char)

        # Every branch leaves the cursor on the token's last character.
        self.nextChar()
        return token

    def _readString(self):
        """Return the text between quotes; the cursor ends on the closing quote."""
        self.nextChar()  # Step past the opening quote.
        start = self.position
        while self.current != '"':
            if self.current == self._EOF:
                # Without this check the loop would never terminate, because
                # ``current`` stays at the sentinel once the input runs out.
                self.abort("Unterminated string.")
            # No escapes, newlines, tabs or % are allowed: the string is
            # later handed to C's printf.
            if self.current in ("\r", "\n", "\t", "\\", "%"):
                self.abort("Illegal character in string.")
            self.nextChar()
        return self.source[start : self.position]

    def _readNumber(self):
        """Return the digits (and optional fraction) starting at the cursor."""
        start = self.position
        while self.peek().isdigit():
            self.nextChar()
        if self.peek() == ".":
            self.nextChar()
            # A decimal point must be followed by at least one digit.
            if not self.peek().isdigit():
                self.abort("Illegal character in number.")
            while self.peek().isdigit():
                self.nextChar()
        return self.source[start : self.position + 1]

    def _readWord(self):
        """Return the run of alphanumeric characters starting at the cursor."""
        start = self.position
        while self.peek().isalnum():
            self.nextChar()
        return self.source[start : self.position + 1]


class Token:
    """A single lexeme: its source text paired with its token kind."""

    def __init__(self, tokenText, tokenKind):
        """Record the token's text and its kind."""
        self.text, self.kind = tokenText, tokenKind

    @staticmethod
    def checkIfKeyword(tokenText):
        """Return the keyword ``TokenType`` named *tokenText*, else ``None``.

        A member counts as a keyword when its value lies in the 1XX range
        (100 inclusive to 200 exclusive).
        """
        keyword_matches = (
            member
            for member in TokenType
            if member.name == tokenText and 100 <= member.value < 200
        )
        # First match in enum definition order, or None when nothing matches.
        return next(keyword_matches, None)


class TokenType(enum.Enum):
    """Kinds of token, grouped by numeric range.

    Values from 100 up to (but not including) 200 are keywords;
    ``Token.checkIfKeyword`` relies on that range, so new keywords must be
    given 1XX values and non-keywords must stay outside it.
    """

    # Structural and literal tokens.
    EOF = -1
    NEWLINE = 0
    NUMBER = 1
    IDENT = 2
    STRING = 3

    # Keywords (1XX). The member name is the keyword's exact source spelling.
    LABEL = 101
    GOTO = 102
    PRINT = 103
    INPUT = 104
    LET = 105
    IF = 106
    THEN = 107
    ENDIF = 108
    WHILE = 109
    REPEAT = 110
    ENDWHILE = 111

    # Operators (2XX).
    EQ = 201
    PLUS = 202
    MINUS = 203
    ASTERISK = 204
    SLASH = 205
    EQEQ = 206
    NOTEQ = 207
    LT = 208
    LTEQ = 209
    GT = 210
    GTEQ = 211