wolfgitpr
diff --git a/binarize.py b/binarize.py
Lines changed: 76 additions & 128 deletions
diff --git a/configs/binarize_config.yaml b/configs/binarize_config.yaml
Lines changed: 4 additions & 4 deletions
diff --git a/configs/datasets_config.yaml b/configs/datasets_config.yaml
Lines changed: 1 addition & 4 deletions
diff --git a/configs/train_config.yaml b/configs/train_config.yaml
Lines changed: 12 additions & 11 deletions
@@ -9,11 +9,11 @@
 import yaml
 from tqdm import tqdm
 
+from tools.binarize_util import load_wav, get_curves
 from tools.config_utils import load_yaml
 from tools.dataset import IndexedDatasetBuilder
 from tools.encoder import UnitsEncoder
 from tools.get_melspec import MelSpecExtractor
-from tools.load_wav import load_wav
 from tools.multiprocess_utils import chunked_multiprocess_run
 
 unitsEncoder = None
@@ -56,10 +56,11 @@ def __init__(self, binary_config):
         self.max_length = binary_config['max_length']
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+        self.hop_size = self.melspec_config["hop_size"]
+        self.window_size = self.melspec_config["window_size"]
         self.sample_rate = self.melspec_config["sample_rate"]
-        self.frame_length = self.melspec_config["hop_length"] / self.sample_rate
+        self.frame_length = self.hop_size / self.sample_rate
 
-        self.hop_size = binary_config['melspec_config']["hop_length"]
         self.hubert_channel = binary_config['hubert_config']["channel"]
 
     def get_vocab(self):
@@ -85,8 +86,6 @@ def get_vocab(self):
                 dict_phonemes.append(ph)
 
         for dataset in self.datasets:
-            if dataset.get("label_type", "blank") == "blank":
-                continue
             language = dataset.get("language", "blank")
             raw_data_dir = dataset["raw_data_dir"]
 
@@ -195,103 +194,65 @@ def process(self):
         # binarize train set
         self.binarize("train", meta_data_train, self.binary_folder)
 
-    def make_ph_data(self, vocab, T, label_type_id, raw_ph_id_seq, raw_ph_dur):
-        if label_type_id == 0:
-            # ph_seq: [S]
-            ph_id_seq = np.array([]).astype("int32")
+    def make_ph_data(self, vocab, T, raw_ph_id_seq, raw_ph_dur):
+        # ph_seq: [S]
+        ph_id_seq = np.array(raw_ph_id_seq).astype("int32")
+        not_sp_idx = ph_id_seq != 0
+        ph_id_seq = ph_id_seq[not_sp_idx]
 
-            # ph_edge: [T]
-            ph_edge = np.zeros([T], dtype="float32")
-
-            # ph_frame: [T]
-            ph_frame = np.zeros(T, dtype="int32")
-
-            # ph_time: [T]
-            ph_time = np.zeros(T, dtype="float32")
-
-            # ph_mask: [vocab_size]
-            ph_mask = np.ones(vocab["vocab_size"], dtype="int32")
-        elif label_type_id == 1:
-            # ph_seq: [S]
-            ph_id_seq = np.array(raw_ph_id_seq).astype("int32")
-            ph_id_seq = ph_id_seq[ph_id_seq != 0]
-
-            if len(ph_id_seq) <= 0:
-                return None, None, None, None, None
-
-            # ph_edge: [T]
-            ph_edge = np.zeros([T], dtype="float32")
+        # ph_edge: [T]
+        ph_dur = np.array(raw_ph_dur).astype("float32")
+        ph_time = np.array(np.concatenate(([0], ph_dur))).cumsum()
+        ph_frame = ph_time / self.frame_length
+        ph_interval = np.stack((ph_frame[:-1], ph_frame[1:]))
+        ph_time = ph_time[:-1]
+        ph_time = ph_time[not_sp_idx]
 
-            # ph_frame: [T]
-            ph_frame = np.zeros(T, dtype="int32")
+        ph_interval = ph_interval[:, not_sp_idx]
+        ph_id_seq = ph_id_seq
+        ph_frame = np.unique(ph_interval.flatten())
+        if ph_frame[-1] >= T:
+            ph_frame = ph_frame[:-1]
 
-            # ph_time: [T]
-            ph_time = np.zeros(T, dtype="float32")
+        if len(ph_id_seq) <= 0:
+            return None, None, None, None, None
 
-            # ph_mask: [vocab_size]
-            ph_mask = np.zeros(vocab["vocab_size"], dtype="int32")
-            ph_mask[ph_id_seq] = 1
-            ph_mask[0] = 1
-        elif label_type_id >= 2:
-            # ph_seq: [S]
-            ph_id_seq = np.array(raw_ph_id_seq).astype("int32")
-            not_sp_idx = ph_id_seq != 0
-            ph_id_seq = ph_id_seq[not_sp_idx]
-
-            # ph_edge: [T]
-            ph_dur = np.array(raw_ph_dur).astype("float32")
-            ph_time = np.array(np.concatenate(([0], ph_dur))).cumsum()
-            ph_frame = ph_time / self.frame_length
-            ph_interval = np.stack((ph_frame[:-1], ph_frame[1:]))
-            ph_time = ph_time[:-1]
-            ph_time = ph_time[not_sp_idx]
-
-            ph_interval = ph_interval[:, not_sp_idx]
-            ph_id_seq = ph_id_seq
-            ph_frame = np.unique(ph_interval.flatten())
-            if ph_frame[-1] >= T:
+        ph_edge = np.zeros([T], dtype="float32")
+        if len(ph_id_seq) > 0:
+            if ph_frame[-1] + 0.5 > T:
                 ph_frame = ph_frame[:-1]
+            if ph_frame[0] - 0.5 < 0:
+                ph_frame = ph_frame[1:]
+            ph_time_int = np.round(ph_frame).astype("int32")
+            ph_time_fractional = ph_frame - ph_time_int
+
+            ph_edge[ph_time_int] = 0.5 + ph_time_fractional
+            ph_edge[ph_time_int - 1] = 0.5 - ph_time_fractional
+            ph_edge = ph_edge * 0.8 + 0.1
+
+        # ph_frame: [T]
+        ph_frame = np.zeros(T, dtype="int32")
+        if len(ph_id_seq) > 0:
+            for ph_id, st, ed in zip(
+                    ph_id_seq, ph_interval[0], ph_interval[1]
+            ):
+                if st < 0:
+                    st = 0
+                if ed > T:
+                    ed = T
+                ph_frame[int(np.round(st)): int(np.round(ed))] = ph_id
+
+        # ph_mask: [vocab_size]
+        ph_mask = np.zeros(vocab["vocab_size"], dtype="int32")
+        if len(ph_id_seq) > 0:
+            ph_mask[ph_id_seq] = 1
+        ph_mask[0] = 1
 
-            if len(ph_id_seq) <= 0:
-                return None, None, None, None, None
-
-            ph_edge = np.zeros([T], dtype="float32")
-            if len(ph_id_seq) > 0:
-                if ph_frame[-1] + 0.5 > T:
-                    ph_frame = ph_frame[:-1]
-                if ph_frame[0] - 0.5 < 0:
-                    ph_frame = ph_frame[1:]
-                ph_time_int = np.round(ph_frame).astype("int32")
-                ph_time_fractional = ph_frame - ph_time_int
-
-                ph_edge[ph_time_int] = 0.5 + ph_time_fractional
-                ph_edge[ph_time_int - 1] = 0.5 - ph_time_fractional
-                ph_edge = ph_edge * 0.8 + 0.1
-
-            # ph_frame: [T]
-            ph_frame = np.zeros(T, dtype="int32")
-            if len(ph_id_seq) > 0:
-                for ph_id, st, ed in zip(
-                        ph_id_seq, ph_interval[0], ph_interval[1]
-                ):
-                    if st < 0:
-                        st = 0
-                    if ed > T:
-                        ed = T
-                    ph_frame[int(np.round(st)): int(np.round(ed))] = ph_id
-
-            # ph_mask: [vocab_size]
-            ph_mask = np.zeros(vocab["vocab_size"], dtype="int32")
-            if len(ph_id_seq) > 0:
-                ph_mask[ph_id_seq] = 1
-            ph_mask[0] = 1
-        else:
-            return None, None, None, None, None
         return ph_id_seq, ph_edge, ph_frame, ph_mask, ph_time
 
     def make_non_speech_ph_data(self, T, ph_id_seq, ph_duration):
         if len(ph_id_seq) == 0:
-            return None, None
+            return np.zeros((len(self.vocab.keys()) + 1, T), dtype="int32"), []
 
         ph_id_seq = np.array(ph_id_seq, dtype="int32")
         ph_dur = np.array(ph_duration, dtype="float32")
@@ -376,22 +337,18 @@ def process_item(self, _item, export_mel=False):
                 print(f"Skipping {wav_path}, because it doesn't exist")
                 return None
 
-            waveform = load_wav(wav_path, self.device, self.sample_rate)  # (L,)
-            wav_length = len(waveform) / self.sample_rate  # seconds
+            waveform, wav_length, n_frames = load_wav(wav_path, self.sample_rate, self.hop_size,
+                                                      self.device)  # (L,) seconds
             if wav_length > self.max_length:
-                print(
-                    f"Item {wav_path} has a length of {wav_length}s, which is too long, skip it."
-                )
+                print(f"Item {wav_path} has a length of {wav_length}s, which is too long, skip it.")
                 return None
-            n_frames = waveform.size(-1) // self.hop_size + 1
 
-            label_type_id = {"blank": 0, "weak": 1, "full": 2, "evaluate": 3}[_item.label_type]
-            if label_type_id >= 2:
-                if len(_item.ph_dur) != len(_item.ph_id_seq): label_type_id = 1
-                if not _item.ph_id_seq: label_type_id = 0
+            curves = get_curves(waveform, n_frames, self.window_size, self.hop_size, device=self.device)  # [B, C, T]
 
+            if len(_item.ph_id_seq) == 0 or len(_item.ph_dur) != len(_item.ph_id_seq):
+                return None
             ph_id_seq, ph_edge, ph_frame, ph_mask, ph_time = self.make_ph_data(
-                self.vocab, n_frames, label_type_id, _item.ph_id_seq, _item.ph_dur
+                self.vocab, n_frames, _item.ph_id_seq, _item.ph_dur
             )
             if ph_id_seq is None:
                 print(f"Skipping {wav_path}, make ph data failed.")
@@ -425,6 +382,7 @@ def process_item(self, _item, export_mel=False):
             return {
                 'name': str(_item["name"]),
                 'input_feature': units.cpu().numpy().astype("float32"),
+                'curves': curves.cpu().numpy().astype("float32"),
                 'melspec': melspec.cpu().numpy().astype("float32") if export_mel else np.array([0]),
                 'ph_id_seq': ph_id_seq.astype("int32"),
                 'ph_edge': ph_edge.astype("float32"),
@@ -434,7 +392,6 @@ def process_item(self, _item, export_mel=False):
                 'ph_time_raw': np.concatenate(([0], _item.ph_dur)).cumsum()[:-1].astype("float32"),
                 'ph_seq_raw': _item.ph_seq,
                 'ph_seq': [ph for ph in _item.ph_seq if self.vocab["vocab"][ph] != 0],
-                "label_type": label_type_id,
                 "non_speech_target": non_speech_target.astype("int32"),
                 "non_speech_intervals": non_speech_intervals.astype("int32"),
                 "wav_length": wav_length
@@ -454,34 +411,25 @@ def get_meta_data(self):
             test_prefixes = dataset.get("test_prefixes", [])
 
             assert raw_data_dir.exists(), f"{raw_data_dir} does not exist."
-            assert label_type in ["full", "weak", "evaluate", "blank"], \
-                f"{label_type} not in ['full', 'weak', 'evaluate','blank]."
-            if label_type == "blank":
-                df = pd.DataFrame(
-                    columns=["name", "ph_seq", "ph_id_seq", "label_type", "wav_length", "validation"])
-                wavs_path = [i for i in raw_data_dir.rglob("*.wav")]
-                df["wav_path"] = wavs_path
-                df["name"] = df["wav_path"].apply(lambda wav_path: os.path.splitext(os.path.basename(wav_path)))
-                df["wav_length"] = 0
-                df["validation"] = False
-            else:
-                tuple_prefixes = tuple([x for x in test_prefixes if x] if test_prefixes is not None else [])
+            assert label_type in ["full", "evaluate"], \
+                f"{label_type} not in ['full','evaluate']."
+
+            tuple_prefixes = tuple([x for x in test_prefixes if x] if test_prefixes is not None else [])
 
-                csv_path = raw_data_dir / "transcriptions.csv"
-                wav_folder = raw_data_dir / "wavs"
-                assert csv_path.exists() and wav_folder.exists(), f"{csv_path.absolute()} or {wav_folder.absolute()} does not exist."
+            csv_path = raw_data_dir / "transcriptions.csv"
+            wav_folder = raw_data_dir / "wavs"
+            assert csv_path.exists() and wav_folder.exists(), f"{csv_path.absolute()} or {wav_folder.absolute()} does not exist."
 
-                df = pd.read_csv(csv_path, dtype=str)
-                assert "ph_seq" in df.columns, f"{csv_path.absolute()} does not contain 'ph_seq'."
-                if label_type == "full":
-                    assert "ph_dur" in df.columns, f"full label csv: {csv_path.absolute()} does not contain 'ph_dur'."
+            df = pd.read_csv(csv_path, dtype=str)
+            assert "ph_seq" in df.columns, f"{csv_path.absolute()} does not contain 'ph_seq'."
+            assert "ph_dur" in df.columns, f"full label csv: {csv_path.absolute()} does not contain 'ph_dur'."
 
-                if len(tuple_prefixes) > 0:
-                    df["validation"] = df["name"].apply(lambda name: name.startswith(tuple_prefixes))
-                else:
-                    df["validation"] = False
+            if len(tuple_prefixes) > 0:
+                df["validation"] = df["name"].apply(lambda name: name.startswith(tuple_prefixes))
+            else:
+                df["validation"] = False
 
-                df["wav_path"] = df["name"].apply(lambda name: str(wav_folder / (str(name) + ".wav")))
+            df["wav_path"] = df["name"].apply(lambda name: str(wav_folder / (str(name) + ".wav")))
 
             df["label_type"] = label_type
             df["ph_seq"] = df["ph_seq"].apply(
 
@@ -51,11 +51,11 @@ hubert_config:
 melspec_config:
   n_mels: 128
   sample_rate: 44100
-  win_length: 1024
-  hop_length: 512
+  window_size: 1024
+  hop_size: 512
   n_fft: 2048
-  fmin: 40
-  fmax: 16000
+  f_min: 40
+  f_max: 16000
   clamp: 0.00001
 
 # 不建议开启
 
@@ -17,7 +17,4 @@ datasets:
     label_type: evaluate
     language: yue
     test_prefixes:
-      - xxx
-  # blank 为无标注wav，不确定是否有效
-  - raw_data_dir: path/to/spk_1/raw
-    label_type: blank
+      - xxx
@@ -1,4 +1,4 @@
-model_name: 0727_hfa_cvnt
+model_name: 0918_hfa_power
 
 # settings
 float32_matmul_precision: high
@@ -23,20 +23,22 @@ model:
   hidden_dims: 192
   down_sampling_factor: 2
   down_sampling_times: 3
-  channels_scaleup_factor: 1.3
+  channels_scaleup_factor: 1.5
+  dropout: 0.1
+
+  curves_attention_dropout: 0.1
 
 cvnt_arg:
-  mask_ratio: 0.3
+  mask_ratio: 0.2
   encoder_conform_attention_drop: 0.05
-
-  num_layers: 3
-  encoder_conform_dim: 96
+  num_layers: 4
+  encoder_conform_dim: 128
   encoder_conform_ffn_latent_drop: 0.05
   encoder_conform_ffn_out_drop: 0.05
-  encoder_conform_kernel_size: 31
+  encoder_conform_kernel_size: 23
 
 optimizer_config:
-  lr: 0.0005
+  lr: 0.0003
   gamma: 0.9999
   total_steps: 20000
   muon_args:
@@ -46,13 +48,12 @@ optimizer_config:
 
 loss_config:
   losses:
-    weights: [ 8.0, 0.1, 0.01, 0.1, 2.0, 6.0 ]
-    enable_RampUpScheduler: [ False,False,False,False,True,False ]
+    weights: [ 8.0, 0.1, 1.0, 6.0, 10.0 ]
+    enable_RampUpScheduler: [ False,False,False,True,False ]
   function:
     num_bins: 10
     alpha: 0.999
     label_smoothing: 0.08
-    pseudo_label_ratio: 0.3
 
 # trainer
 accelerator: auto