"""Lexical analysis for stilted."""
import base64
import re
from dataclasses import dataclass
from typing import Any, Callable, Iterable
from error import Tilted
from dtypes import Integer, Name, Object, Real, String
@dataclass
class Token:
    """A token that we want."""
    # Uncompiled regex matching the token's source text.
    rx: str
    # Called with the matched text to produce the yielded object.
    converter: Callable[[str], Any]
    # True means the match is converted and yielded; compare Skip, where
    # keep is False and matches are discarded.
    keep: bool = True
@dataclass
class Skip:
    """Characters that can be discarded."""
    # Uncompiled regex matching the text to discard (no converter needed,
    # since nothing is yielded for these matches).
    rx: str
    keep: bool = False
class Lexer:
    """
    A lexical analyzer.

    Construct with Token and Skip instances; their regexes are combined
    into one alternation, tried in the order given.
    """

    def __init__(self, *tokens) -> None:
        alternatives = []
        self.converters = {}
        for index, tok in enumerate(tokens):
            if not tok.keep:
                # A Skip: it participates in matching but is unnamed, so
                # it produces nothing in tokens().
                alternatives.append(f"({tok.rx})")
                continue
            assert isinstance(tok, Token)
            group_name = f"g{index}"
            alternatives.append(f"(?P<{group_name}>{tok.rx})")
            self.converters[group_name] = tok.converter
        # (?m): multi-line mode, so $ in a token regex matches at each
        # end-of-line.
        self.rx = "(?m)" + "|".join(alternatives)

    def tokens(self, text: str) -> Iterable[Object]:
        """
        Yield Stilted objects for the tokens in `text`.
        """
        for match in re.finditer(self.rx, text):
            name = match.lastgroup
            if name is None:
                # An unnamed (Skip) group matched: discard it.
                continue
            yield self.converters[name](match[0])
def convert_string(text: str) -> String:
"""
A converter for raw string text to the string value we want.
"""
assert text[0] == "("
assert text[-1] == ")"
def do_escape(match):
esc_text = match[0]
if esc_text[1] in "01234567":
return chr(int(esc_text[1:], 8))
else:
match esc_text:
case r"\n":
return "\n"
case r"\t":
return "\t"
case "\\\n":
return ""
case _:
return esc_text[1]
string = re.sub(r"(?s)\\[0-7]{1,3}|\\.", do_escape, text[1:-1])
return String.from_bytes(string.encode("iso8859-1"))
def convert_hex_string(text: str) -> String:
    """
    Convert a hex string to a string.
    """
    assert text[0] == "<"
    assert text[-1] == ">"
    # Whitespace between the angle brackets is insignificant.
    digits = re.sub(r"\s", "", text[1:-1])
    # An odd number of hex digits is padded with a trailing zero.
    if len(digits) % 2:
        digits = digits + "0"
    return String.from_bytes(base64.b16decode(digits, casefold=True))
def error(text: str):
    """
    A "converter" to raise syntaxerror for bad matches.

    Used as the converter on the catch-all one-character token, so any
    character that no other pattern matched raises Tilted("syntaxerror").
    The matched `text` itself is ignored.
    """
    raise Tilted("syntaxerror")
# A look-ahead to only match tokens if they are properly delimited.
DELIMITED = r"(?=[()<>\[\]{}/%\s]|\Z)"

# The token definitions, tried in order.  More specific numeric forms come
# before the general name pattern.
lexer = Lexer(
    # Reals: digits with a decimal point, and an optional exponent.
    Token(r"[-+]?\d*(\d\.|\.\d)\d*([eE][-+]?\d+)?" + DELIMITED, Real.from_string),
    # An integer literal with an exponent is also a real.
    Token(r"[-+]?\d+[eE][-+]?\d+" + DELIMITED, Real.from_string),
    Token(r"[-+]?\d+" + DELIMITED, Integer.from_string),
    # Radix numbers, like 16#ff.
    Token(r"\d+#[0-9a-zA-Z]+" + DELIMITED, Integer.from_string),
    # Brackets and braces are self-delimiting names (optionally /-prefixed),
    # so no DELIMITED look-ahead here.
    Token(r"/?[\[\]{}]", Name.from_string),
    Token(r"/?[^()<>\[\]{}/%\s]+" + DELIMITED, Name.from_string),
    # Parenthesized strings: octal escapes, backslashed characters
    # (including backslash-newline), or any other character, non-greedily
    # up to the closing paren.
    # Fixed: the octal alternative was r"\\\[0-7]{1,3}", whose extra
    # backslash made it match a literal "[" instead of an octal digit class.
    Token(r"\((?:\\[0-7]{1,3}|\\.|\\\n|.|\n)*?\)", convert_string),
    Token(r"<[0-9a-fA-F\s]+>", convert_hex_string),
    # Comments run to the end of the line.
    Skip(r"%.*$"),
    Skip(r"\s+"),
    # Anything else is a syntaxerror.
    Token(r".", error),
)