forked from cubrink/srt-parse
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy patharpabet.py
More file actions
46 lines (36 loc) · 1.27 KB
/
arpabet.py
File metadata and controls
46 lines (36 loc) · 1.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import argparse
from itertools import chain
import nltk
# On the first run, uncomment the line below to download the CMU Pronouncing Dictionary corpus
# nltk.download('cmudict')
def build_parser():
    '''
    Creates the command-line argument parser.

    Returns:
        argparse.ArgumentParser: parser with a required positional ``input``
        path and an optional ``--out-file`` path (stored as ``out_file``,
        default ``./out/out.txt``).
    '''
    parser = argparse.ArgumentParser(
        description='Adds an Arpabet translation to text files',
        prog='srt-parse',
    )
    parser.add_argument('input', type=str,
                        help='Location of file to be processed')
    # The value is handed straight to open(), so it names a file, not a
    # directory; the help text says so to avoid misleading users.
    parser.add_argument('--out-file', type=str,
                        help='Path of the file the processed output is written to',
                        default='./out/out.txt')
    return parser
# Script body: for every input line whose words are all in the CMU dictionary,
# write the original line followed by a copy whose text is replaced by its
# Arpabet transcription (one "{...}" group per word).
args = build_parser().parse_args()

# Maps each word to a list of pronunciations; each pronunciation is itself a
# list of phoneme strings.
arpabet = nltk.corpus.cmudict.dict()

# Read everything up front so the input is closed before the output is opened
# (the two paths may refer to the same file).
with open(args.input, "r") as f:
    lines = f.readlines()

with open(args.out_file, 'w') as out:
    for line in lines:
        # Drop the line terminator so it never leaks into the re-assembled
        # output (e.g. when the line contains no '|' separator at all).
        text = line.rstrip('\n')
        sub_line = text.split('|')  # first field: file name, last field: text
        words = sub_line[-1].split()

        # A line is skipped entirely if any of its words has no dictionary entry.
        if any(word not in arpabet for word in words):
            continue

        # Use only the first listed pronunciation; joining all alternatives
        # would concatenate several pronunciations into a single group.
        new_line = ["{" + " ".join(arpabet[word][0]) + "}" for word in words]

        # Both lines are newline-terminated so consecutive records never
        # run together in the output file.
        out.write(text + "\n")
        out.write(sub_line[0] + "|" + " ".join(new_line) + "\n")