# Open a TextGrid/Sound pair, add an empty annotation tier with boundaries at
# the requested interval, and show that interval in the editors.
#
# Script arguments (form fields; Praat exposes each one as a variable whose
# name starts with a lower-case letter, i.e. tg_path$, wav_path$, start, end):
#   Tg_path   path of the TextGrid file to read
#   Wav_path  path of the sound file to read
#   Start     start time of the interval to annotate
#   End       end time of the interval to annotate

# read in file info
form Open a tgwav
    sentence Tg_path /Volumes/data/corpora/Raleigh/ral368/ral3680d.TextGrid
    sentence Wav_path /Volumes/data/corpora/Raleigh/ral368/ral3680d.wav
    positive Start 1494.81
    positive End 1494.91
endform

# load files
tg = Read from file: tg_path$
wav = Read from file: wav_path$

# select objects
# Both objects are selected only so that their names can be queried; the
# names are used below to address the objects and their editor windows.
selectObject: wav
plusObject: tg
grid$ = selected$ ("TextGrid")
sound$ = selected$ ("Sound")

# add annotation tier and boundaries
select TextGrid 'grid$'
numberOfTiers = Get number of tiers

# The new tier ("sibann") is appended after the existing tiers from inside a
# TextGrid editor window, which is closed again straight away.
Edit
editor TextGrid 'grid$'
    Add interval tier... numberOfTiers+1 sibann
    Close
endeditor

# Back in the objects window (TextGrid still selected): mark the interval on
# the new tier.
Insert boundary... numberOfTiers+1 start
Insert boundary... numberOfTiers+1 end

plus Sound 'sound$'
# zoom in on focused part
# With the TextGrid and the Sound selected together, the editor is opened on
# both objects.
View & Edit
editor TextGrid 'grid$'
    Zoom: start, end
endeditor


# Open the sound on its own as well, zoomed to and selecting the interval.
select Sound 'sound$'
Edit
editor Sound 'sound$'
    Zoom: start, end
    Select: start, end
endeditor

# Unfinished, disabled output of the acoustic measures.
#writeInfoLine: "COG: ", cog
#appendInfoLine: "
# Read the whole sibilant table into one string.
csv_raw$ = readFile$ ("testsibilants.csv")

# Split .str$ at every occurrence of .sep$.
#
# Results are left in the procedure's own variables:
#   .length     number of parts found (at least 1)
#   .array$[i]  the i-th part, for i = 1 .. .length
# A string without any separator yields a single part equal to the string.
# Note that .str$ is consumed while splitting: after the call it holds only
# the last part.
procedure split (.sep$, .str$)
    .seplen = length(.sep$)
    .length = 0
    repeat
        .strlen = length(.str$)
        .sep = index(.str$, .sep$)
        if .sep > 0
            # Text before the separator is the next part ...
            .part$ = left$(.str$, .sep-1)
            # ... and everything after the separator is kept for the next
            # round (.strlen is simply an upper bound on what remains).
            .str$ = mid$(.str$, .sep+.seplen, .strlen)
        else
            # No separator left: the remainder is the final part.
            .part$ = .str$
        endif
        .length = .length+1
        .array$[.length] = .part$
    until .sep = 0
endproc

# NOTE(review): the path below is the corpus root directory used elsewhere in
# this repository, not a single file, and the trailing "; Mac" is probably
# meant as a remark -- this looks like an unfinished placeholder; confirm
# before relying on it.
Read from file: "/Volumes/data/corpora" ; Mac
def _sibilant_paths(corpus, filename, locations):
    """Return the ``(TextGrid, wav)`` paths for *filename* in *corpus*.

    *corpus* is the lower-cased corpus name and must be a key of
    *locations*, which maps corpus names to their root directories.
    """
    root = locations[corpus]
    if corpus == "sotc":
        # SOTC files sit two directories deep; the directories are named
        # after the first two and the first three dash-separated fields of
        # the file name.
        fields = filename.split("-")
        directory = os.path.join(root, "-".join(fields[0:2]),
                                 "-".join(fields[0:3]))
    else:
        # All other corpora (e.g. Raleigh) use one directory per speaker,
        # named after the first six characters of the file name.
        directory = os.path.join(root, filename[0:6])
    stem = os.path.join(directory, filename)
    return stem + ".TextGrid", stem + ".wav"


def input_taker(df, locations):
    """Step through the rows of *df*, opening each sibilant in Praat.

    For every row the TextGrid and wav paths are derived from the
    ``corpus`` and ``discourse`` columns, and the ``open_2.praat`` script
    that lives next to this file is sent to an already running Praat via
    ``sendpraat`` together with the ``begin``/``end`` times of the row.
    After each row the user is prompted; pressing enter moves on to the
    next row, typing anything else stops.  The loop also stops once all
    rows have been shown.

    Args:
        df: DataFrame with the columns ``discourse``, ``corpus``,
            ``begin`` and ``end``.
        locations: dict mapping lower-cased corpus names to the directory
            that holds the corpus.
    """
    print("Interactive script for sibilant checks:")
    enter = input("press enter to continue")
    print(enter)

    # Resolve the Praat script and the sendpraat binary relative to this
    # file, so the tool works on any machine and from any working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    path_to_open = os.path.join(script_dir, "open_2.praat")
    sendpraat = os.path.join(script_dir, "sendpraat")

    row_idx = 0
    # Compare by value (``==``): identity comparison against a string
    # literal is not guaranteed to hold.  The bound on row_idx prevents an
    # IndexError after the last row.
    while enter.strip() == "" and row_idx < len(df):
        row = df.iloc[row_idx]
        filename = row["discourse"]
        corpus = row["corpus"].lower()
        print(corpus)

        tg_path, wav_path = _sibilant_paths(corpus, filename, locations)
        zoom_start, zoom_end = row["begin"], row["end"]

        # NOTE(review): the arguments are separated by single spaces, so a
        # path containing a space would presumably be split by Praat --
        # confirm before using such paths.
        quote_str = "execute {} {} {} {} {}".format(
            path_to_open, tg_path, wav_path, zoom_start, zoom_end)
        cmd = [sendpraat, "praat", quote_str]
        print(quote_str)

        # communicate() drains both pipes concurrently; reading stdout to
        # the end before touching stderr can block when stderr fills up.
        with Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=PIPE) as p:
            out, errors = p.communicate()
        # latin-1 maps every byte to a character, so decoding cannot fail.
        text = out.decode("latin-1")
        err = errors.decode("latin-1")
        print(text, err)

        enter = input("press enter to continue")
        row_idx += 1

    if row_idx >= len(df):
        print("No more sibilants to check.")
a/sibilant_script/README.md b/sibilant_script/README.md new file mode 100644 index 0000000..4e7ea09 --- /dev/null +++ b/sibilant_script/README.md @@ -0,0 +1,31 @@ +# Instructions on how to use the interactive script + +## Prerequisites +Python 3.5 and Praat are required to run this script. The current version assumes that macOS (OS X) is running, but this can be changed. Using a virtualenv is recommended; an easy way to set up an environment is by installing and using [Miniconda](https://conda.io/miniconda.html). + +To use the script, the libraries in requirements.txt need to be installed. This can be done by running + `pip install -r requirements.txt` + +The testsibilants.csv file should be in the same directory as the script. + +## Current functionality +Right now, the script works for the following corpora: SB_West, SOTC, and Raleigh + +## How to: +1. Edit the location file + - the format should be `<corpus name>,<path>`, one corpus per line + - the path should be an absolute path + - examples can be found in the "location_example.txt" file +2. Open the Praat application + - Praat needs to be running already for the script to work +3. Run the script + - input `python superscript.py` into the command line +4. Step through the interactive script + - in the command line, a prompt will appear to press enter. Each time you press enter in the command line, a new row will be read from the testsibilants.csv file, which corresponds to a new sibilant. Three Praat windows should open. Note that opening the Praat windows may take a few seconds. + +## Adding new corpora +To add support for new corpora (in case you have them in TextGrid format), two changes need to be made: +1. add their locations to the `locations.txt` file +2. 
# Open a TextGrid/Sound pair, add an empty annotation tier with boundaries at
# the requested interval, show that interval in the editors and print the
# acoustic measures that were passed in.
#
# Script arguments (form fields; Praat exposes each one as a variable whose
# name starts with a lower-case letter, e.g. tg_path$, start, cog):
#   Tg_path   path of the TextGrid file to read
#   Wav_path  path of the sound file to read
#   Start     start time of the interval to annotate
#   End       end time of the interval to annotate
#   Cog, Peak, Slope, Spread
#             precomputed measures for the interval; only echoed to the
#             Info window

# read in file info
form Open a tgwav
    sentence Tg_path /Volumes/data/corpora/Raleigh/ral368/ral3680d.TextGrid
    sentence Wav_path /Volumes/data/corpora/Raleigh/ral368/ral3680d.wav
    positive Start 1494.81
    positive End 1494.91
    # "real" rather than "positive": a positive field refuses zero and
    # negative numbers, which rejected the 0.0 defaults themselves and any
    # measure that is not strictly positive (spectral slope in particular).
    real Cog 0.0
    real Peak 0.0
    real Slope 0.0
    real Spread 0.0
endform

# load files
tg = Read from file: tg_path$
wav = Read from file: wav_path$

# select objects
# Both objects are selected only so that their names can be queried; the
# names are used below to address the objects and their editor windows.
selectObject: wav
plusObject: tg
grid$ = selected$ ("TextGrid")
sound$ = selected$ ("Sound")

# add annotation tier and boundaries
select TextGrid 'grid$'
numberOfTiers = Get number of tiers

# The new tier ("sibann") is appended after the existing tiers from inside a
# TextGrid editor window, which is closed again straight away.
Edit
editor TextGrid 'grid$'
    Add interval tier... numberOfTiers+1 sibann
    Close
endeditor

# Back in the objects window (TextGrid still selected): mark the interval on
# the new tier.
Insert boundary... numberOfTiers+1 start
Insert boundary... numberOfTiers+1 end

plus Sound 'sound$'
# zoom in on focused part
# With the TextGrid and the Sound selected together, the editor is opened on
# both objects.
View & Edit
editor TextGrid 'grid$'
    Zoom: start, end
endeditor


# Open the sound on its own as well, zoomed to and selecting the interval.
select Sound 'sound$'
Edit
editor Sound 'sound$'
    Zoom: start, end
    Select: start, end
endeditor

# Show the measures for this token in the Info window.
writeInfoLine: "COG: ", cog
appendInfoLine: "Peak: ", peak
appendInfoLine: "Slope: ", slope
appendInfoLine: "Spread: ", spread
def get_sample(path):
    """Draw a 1% sample of the sibilant table, stratified by corpus.

    The CSV at *path* must have a ``corpus`` column.  For every corpus,
    ``round(0.01 * number_of_rows)`` rows are drawn at random without
    replacement, so no token appears twice in the sample.  Corpora are
    processed in sorted order: together with a seeded NumPy global random
    state this makes the sample reproducible from run to run (iterating
    over a ``set`` of names, as before, depends on string hashing and
    changes between interpreter runs).

    Args:
        path: path of the CSV file holding one sibilant token per row.

    Returns:
        A tuple ``(sample, corpora)`` where ``sample`` is a DataFrame with
        the sampled rows (original index labels kept) and ``corpora`` is
        the set of all corpus names found in the file.
    """
    sib_df = pd.read_csv(path)
    all_corpora = sib_df['corpus']
    perc = .01

    sampled_frames = []
    for _, sub_frame in sib_df.groupby('corpus', sort=True):
        num_samples = int(np.rint(perc * float(len(sub_frame))))
        # Positional indices into this corpus' rows; replace=False keeps
        # every sampled token unique.
        chosen_idxs = np.random.choice(len(sub_frame), size=num_samples,
                                       replace=False)
        sampled_frames.append(sub_frame.iloc[chosen_idxs])

    if sampled_frames:
        tot_df = pd.concat(sampled_frames)
    else:
        # No corpora at all: return an empty frame that still carries the
        # columns of the input.
        tot_df = sib_df.iloc[0:0]

    return tot_df, set(all_corpora)
def get_locations(corpora, location_file):
    """Read the directory of every corpus from a location file.

    Each non-blank line of *location_file* has the form
    ``<corpus name>,<path>``.  Only the first comma separates the two
    fields, so the path itself may contain commas.  Corpus names are
    matched case-insensitively against *corpora*.

    Args:
        corpora: iterable of the corpus names present in the sibilant
            dataset; used to validate the names in the file.
        location_file: path of the text file described above.

    Returns:
        dict mapping each lower-cased name in *corpora* to its path, or to
        ``None`` when the file does not mention that corpus.

    Raises:
        SystemExit: (exit status 1, after printing a message) when a line
            has no comma, names a corpus that is not in *corpora*, or
            points to a path that does not exist.
    """
    location_dict = {x.lower(): None for x in corpora}
    with open(location_file) as f1:
        for line in f1:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at the end.
                continue
            corpus, comma, location = line.partition(",")
            if not comma:
                print("Error: Line {!r} is not of the form "
                      "<corpus name>,<path>".format(line.strip()))
                sys.exit(1)
            corpus = corpus.strip()
            location = location.strip()
            # Assigning to a dict never raises KeyError, so the name has to
            # be checked explicitly for the error below to be reachable.
            if corpus.lower() not in location_dict:
                print("Error: Corpus {} is not in the sibilant dataset".format(corpus))
                sys.exit(1)
            if not os.path.exists(location):
                print("Error: Location {} does not exist".format(location))
                sys.exit(1)
            location_dict[corpus.lower()] = location
    return location_dict