-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprep_channel_map.py
More file actions
70 lines (64 loc) · 3.06 KB
/
prep_channel_map.py
File metadata and controls
70 lines (64 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import argparse
import os
import librosa
import soundfile as sf
import numpy as np
import json
from codec_bpe.core.utils import get_files
from codec_bpe.tools.audio_encoder import SUPPORTED_EXTENSIONS
from tqdm import tqdm
from realtime_codec_agent.utils.transcript_utils import load_transcript
def audio_stem_for_transcript(transcript_file, transcripts_path, audio_path):
    """Return the extension-less audio path that mirrors ``transcript_file``.

    The transcript's location relative to ``transcripts_path`` is re-rooted
    under ``audio_path`` and its file extension is dropped, so the caller can
    append each candidate audio extension in turn.

    Path components are manipulated with ``os.path`` rather than
    ``str.replace`` so that only the root prefix and the final suffix are
    rewritten (a root such as ``"."`` or a directory name containing
    ``".txt"`` must not be touched anywhere else in the path).
    """
    relative_path = os.path.relpath(transcript_file, transcripts_path)
    relative_stem, _ = os.path.splitext(relative_path)
    return os.path.join(audio_path, relative_stem)


def find_audio_file(transcript_file, transcripts_path, audio_path, extensions):
    """Return the first existing audio file matching a transcript, or ``None``.

    Args:
        transcript_file: Path of the transcript file.
        transcripts_path: Root directory the transcript was found under.
        audio_path: Root directory that mirrors ``transcripts_path`` for audio.
        extensions: Candidate audio extensions, tried in order. Each one is
            appended verbatim to the extension-less path, so it is expected
            to include its leading dot.

    Returns:
        The path of the first candidate that exists on disk, or ``None`` when
        no candidate exists.
    """
    stem = audio_stem_for_transcript(transcript_file, transcripts_path, audio_path)
    for ext in extensions:
        candidate = stem + ext
        if os.path.exists(candidate):
            return candidate
    return None


def build_channel_map(audio, sr, transcript_lines, speakers):
    """Map every speaker to the audio channel that is loudest while they talk.

    Args:
        audio: 2-D array indexed as ``[channel, sample]``.
        sr: Sample rate of ``audio``; used to turn segment times into sample
            indices and sample counts into seconds.
        transcript_lines: Sequence of records whose first three items are
            ``(start, end, speaker)``, with ``start``/``end`` in seconds.
            Any further items are ignored here.
        speakers: Speaker ids to produce an entry for. They are used as
            dictionary keys, so they must be hashable.

    Returns:
        A dict ``{speaker: {"channel": int | None, "duration_secs": float}}``.
        ``channel`` is the index with the highest mean absolute amplitude over
        the speaker's concatenated segments; it is ``None`` (with a duration
        of ``0.0``) when the speaker has no audio samples at all.
    """
    # Group the segment slices by speaker in a single pass instead of
    # re-filtering the whole transcript once per speaker.
    segments_by_speaker = {}
    for line in transcript_lines:
        start, end, speaker = line[0], line[1], line[2]
        segment = audio[:, int(start * sr):int(end * sr)]
        segments_by_speaker.setdefault(speaker, []).append(segment)

    channel_map = {}
    for speaker in speakers:
        segments = segments_by_speaker.get(speaker, [])
        num_samples = sum(segment.shape[-1] for segment in segments)
        if num_samples == 0:
            # Covers both "no segments" (np.concatenate would raise on an
            # empty list) and "only zero-length / out-of-range segments".
            channel_map[speaker] = {
                "channel": None,
                "duration_secs": 0.0,
            }
            continue

        speaker_audio = np.concatenate(segments, axis=-1)
        # Average absolute amplitude per channel over this speaker's segments.
        mean_abs_amplitudes = np.mean(np.abs(speaker_audio), axis=-1)
        channel_map[speaker] = {
            # Convert the NumPy integer to a plain int so it is JSON-serializable.
            "channel": int(np.argmax(mean_abs_amplitudes)),
            "duration_secs": speaker_audio.shape[-1] / sr,
        }
    return channel_map


def main():
    """Write a ``*_channel_map.json`` file next to every usable transcript.

    For each ``.txt`` transcript under ``--transcripts_path`` the matching
    audio file under ``--audio_path`` is located; transcripts without audio,
    with mono audio, or with an existing channel map (unless ``--overwrite``
    is given) are skipped with a message.
    """
    parser = argparse.ArgumentParser(
        description="Construct a mapping between transcript speaker ids and the audio channel that they are on"
    )
    parser.add_argument("--transcripts_path", type=str, required=True)
    parser.add_argument("--audio_path", type=str, required=True)
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing channel map files")
    args = parser.parse_args()

    transcript_files = get_files(args.transcripts_path, ".txt")
    for transcript_file in tqdm(transcript_files, desc="Transcript files"):
        audio_file = find_audio_file(
            transcript_file, args.transcripts_path, args.audio_path, SUPPORTED_EXTENSIONS
        )
        if audio_file is None:
            print(f"Skipping {transcript_file} because no audio file was found.")
            continue

        # A mono file carries no channel information to map speakers onto.
        if sf.info(audio_file).channels == 1:
            print(f"Skipping {transcript_file} because audio is mono.")
            continue

        # Swap only the final extension; a ".txt" elsewhere in the path stays.
        channel_map_file = os.path.splitext(transcript_file)[0] + "_channel_map.json"
        if not args.overwrite and os.path.exists(channel_map_file):
            print(f"Skipping {transcript_file} because channel map already exists.")
            continue

        # The cheap checks are done; only now decode and resample the audio.
        audio, sr = librosa.load(audio_file, sr=16000, mono=False)
        transcript_lines, speakers = load_transcript(transcript_file)
        channel_map = build_channel_map(audio, sr, transcript_lines, speakers)

        # Save the channel map to a JSON file alongside the transcript file.
        with open(channel_map_file, "w", encoding="utf-8") as f:
            json.dump(channel_map, f, indent=4)


if __name__ == "__main__":
    main()