-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrunboundlessinference.py
More file actions
68 lines (47 loc) · 2.52 KB
/
runboundlessinference.py
File metadata and controls
68 lines (47 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from boundlessbpe import FasterRegexInference
import time
# Smoke-test script: load a saved boundless BPE model and print how one sample
# sentence is split by FasterRegexInference.encode_ordinary_chunks, together
# with the wall-clock time spent in that call.
#
# Example invocation (unbuffered, output mirrored to a log file):
#   python -u runboundlessinference.py 2>&1 | tee best_pair_pypyfastersuper_count_100k_good.txt

# NOTE(review): num_lines, vocab_size and tau are not read anywhere in this
# script. They are kept so the module-level names stay unchanged; confirm
# whether they can be dropped.
num_lines = 100000
vocab_size = 131072  # other values tried: 1000, 500, 40960, 256 + 50 (131072 ~ 128k)
tau = 0.9  # deletion threshold

# Forwarded unchanged as the `blowup` keyword of encode_ordinary_chunks.
blowup = False

# Model file to load. Alternatives used previously:
#   "./models/pypyfastersuperrollbackfixupdate1GB_1000000_131072_0.9"   (prefix)
#   "./models/pypyfastersuperhalfdirect_1000000_131072_0.9_131072"      (prefix)
#   "./models/boundless_1000000_1000000000_131072_0.9_1000_ultimate_1_40960.model"
inputname = "./models/boundless_1000000_1000000000_131072_0.9_1000_ultimate_0_40960.model"
print("inputname:", inputname)

# Make sure the model loads.
print("loading")
tokenizer = FasterRegexInference()
tokenizer.load(inputname)

# Debugging probes into the loaded model (uncomment as needed):
#   print("debug: Kent", b' Kent' in tokenizer.deletions)
#   print("debug: ĠKent", 'ĠKent' in tokenizer.deletions)
#   print("debug:", (b' of', b' the') in tokenizer.supermerges)
#   print("debug:", b' signific' in tokenizer.deletions)
#
# encode() handles special tokens correctly:
#   print(tokenizer.encode("<|endoftext|>hello world", allowed_special="all"))
#   print(tokenizer.encode("King of the hill"))
#
# Use encode_ordinary_chunks() to see the tokens rather than the int values.
# Other sentences used to exercise the deletion code; assign one to
# sample_text to try it:
#   "Government of the people, by the people, for the people, shall not perish from the earth"
#   "Government of the people, by the people, for the people."
#   "to be, is to feel boundless wonder."
#   "tail of the bobcat is short"
#   " Kentucky Kent Ken Kentuck"
#   "To be or not to be"
#   "significant other"
#   "King of the Hill"
#   "Tip of the hat"
#   "Hello, this is a test of the tokenizer!"
#   "between, ween"
sample_text = "tail of the bobcat, is short"

# Time only the encode call itself. The original captured start_time after
# the call and never read it, so nothing was actually measured.
# perf_counter() is the monotonic clock intended for measuring intervals.
start_time = time.perf_counter()
chunks = tokenizer.encode_ordinary_chunks(sample_text, blowup=blowup, verbose=True)
elapsed = time.perf_counter() - start_time

print(chunks)
print()
print("elapsed seconds:", elapsed)
print("done")