import math
import multiprocessing
import os
import argparse
from random import shuffle
import random
import torch
from glob import glob
from tqdm import tqdm
from modules.mel_processing import spectrogram_torch
import json
import utils
import logging

# Silence chatty third-party loggers before the libraries below are imported.
logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

import diffusion.logger.utils as du
from diffusion.vocoder import Vocoder
import librosa
import numpy as np

# Module-level configuration. It is loaded at import time on purpose: worker
# processes started with the "spawn" method re-import this module and read
# these globals from process_one().
hps = utils.get_hparams_from_file("configs/config.json")
dconfig = du.load_config("configs/diffusion.yaml")
sampling_rate = hps.data.sampling_rate
hop_length = hps.data.hop_length
speech_encoder = hps["model"]["speech_encoder"]


def process_one(filename, hmodel, f0p, diff=False, mel_extractor=None):
    """Extract and cache the training features for a single audio file.

    Every feature is written next to the input file and is skipped when its
    output file already exists, so the function can be re-run safely:

    * ``<filename>.soft.pt``  - output of ``hmodel.encoder`` on 16 kHz audio
    * ``<filename>.f0.npy``   - ``(f0, uv)`` pair from the selected predictor
    * ``*.spec.pt``           - spectrogram (``.wav`` replaced in the path)
    * ``<filename>.vol.npy``  - volume, when ``diff`` or ``vol_embedding``
    * ``<filename>.mel.npy``, ``.aug_mel.npy``, ``.aug_vol.npy`` - when ``diff``

    Args:
        filename: Path of the audio file to process.
        hmodel: Speech encoder object exposing ``encoder(wav16k)``.
        f0p: Name of the F0 predictor passed to ``utils.get_f0_predictor``.
        diff: Whether to also produce the diffusion-model features.
        mel_extractor: Object exposing ``extract(audio, sr, keyshift=...)``.
            When ``None`` the mel features are skipped.

    Raises:
        ValueError: If the loaded sample rate differs from the configured one.
    """
    # librosa resamples to `sampling_rate` while loading.
    wav, sr = librosa.load(filename, sr=sampling_rate)
    audio_norm = torch.FloatTensor(wav).unsqueeze(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # --- content (speech encoder) features -------------------------------
    soft_path = filename + ".soft.pt"
    if not os.path.exists(soft_path):
        wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
        wav16k = torch.from_numpy(wav16k).to(device)
        c = hmodel.encoder(wav16k)
        torch.save(c.cpu(), soft_path)

    # --- pitch ------------------------------------------------------------
    f0_path = filename + ".f0.npy"
    if not os.path.exists(f0_path):
        f0_predictor = utils.get_f0_predictor(
            f0p,
            sampling_rate=sampling_rate,
            hop_length=hop_length,
            device=None,
            threshold=0.05,
        )
        f0, uv = f0_predictor.compute_f0_uv(wav)
        # Stored as an object array holding the (f0, uv) pair.
        np.save(f0_path, np.asanyarray((f0, uv), dtype=object))

    # --- spectrogram --------------------------------------------------------
    # NOTE: str.replace touches every ".wav" occurrence in the path; this is
    # kept as-is because other code presumably derives the same path this way.
    spec_path = filename.replace(".wav", ".spec.pt")
    if not os.path.exists(spec_path):
        # librosa.load already yields floating-point samples, so no
        # max_wav_value scaling is applied to audio_norm here.
        if sr != hps.data.sampling_rate:
            raise ValueError(
                "{} SR doesn't match target {} SR".format(
                    sr, hps.data.sampling_rate
                )
            )
        spec = spectrogram_torch(
            audio_norm,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        )
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_path)

    # --- volume -------------------------------------------------------------
    if diff or hps.model.vol_embedding:
        volume_path = filename + ".vol.npy"
        volume_extractor = utils.Volume_Extractor(hop_length)
        if not os.path.exists(volume_path):
            volume = volume_extractor.extract(audio_norm)
            np.save(volume_path, volume.to("cpu").numpy())

    # --- diffusion features ---------------------------------------------------
    if diff:
        mel_path = filename + ".mel.npy"
        if not os.path.exists(mel_path) and mel_extractor is not None:
            mel_t = mel_extractor.extract(audio_norm.to(device), sampling_rate)
            mel = mel_t.squeeze().to("cpu").numpy()
            np.save(mel_path, mel)

        aug_mel_path = filename + ".aug_mel.npy"
        aug_vol_path = filename + ".aug_vol.npy"
        # Without a mel extractor there is nothing to save for aug_mel; the
        # guard also avoids referencing an undefined augmented mel.
        need_aug_mel = (
            mel_extractor is not None and not os.path.exists(aug_mel_path)
        )
        need_aug_vol = not os.path.exists(aug_vol_path)

        # Only run the augmentation when at least one output is missing.
        if need_aug_mel or need_aug_vol:
            # Random gain, capped so the scaled peak amplitude stays <= 1.
            max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
            max_shift = min(1, np.log10(1 / max_amp))
            log10_vol_shift = random.uniform(-1, max_shift)
            keyshift = random.uniform(-5, 5)
            aug_audio = audio_norm * (10 ** log10_vol_shift)

            if need_aug_mel:
                # NOTE(review): unlike the plain mel above, the audio is not
                # moved to `device` here - kept as in the original; confirm.
                aug_mel_t = mel_extractor.extract(
                    aug_audio, sampling_rate, keyshift=keyshift
                )
                aug_mel = aug_mel_t.squeeze().to("cpu").numpy()
                # The key shift is stored together with the augmented mel.
                np.save(
                    aug_mel_path,
                    np.asanyarray((aug_mel, keyshift), dtype=object),
                )

            if need_aug_vol:
                aug_vol = volume_extractor.extract(aug_audio)
                np.save(aug_vol_path, aug_vol.to("cpu").numpy())


def process_batch(filenames, f0p, diff=False, mel_extractor=None):
    """Load the speech encoder once and run process_one on every file.

    Intended as the target of a worker process: each worker loads its own
    encoder instance and handles its own slice of the file list.

    Args:
        filenames: Iterable of audio file paths handled by this worker.
        f0p: Name of the F0 predictor, forwarded to process_one.
        diff: Whether to produce the diffusion-model features.
        mel_extractor: Mel extractor forwarded to process_one (may be None).
    """
    print("Loading speech encoder for content...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    hmodel = utils.get_speech_encoder(speech_encoder, device=device)
    print("Loaded speech encoder.")
    for filename in tqdm(filenames):
        process_one(filename, hmodel, f0p, diff, mel_extractor)


def _split_into_chunks(items, num_chunks):
    """Split items into at most num_chunks contiguous, near-equal slices.

    Returns an empty list for empty input; a non-positive num_chunks is
    treated as 1 so the chunk size is never zero or negative.
    """
    if not items:
        return []
    num_chunks = max(1, num_chunks)
    chunk_size = int(math.ceil(len(items) / num_chunks))
    return [
        items[start:start + chunk_size]
        for start in range(0, len(items), chunk_size)
    ]


def main():
    """Parse CLI arguments and fan the dataset out over worker processes."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--in_dir", type=str, default="dataset/44k", help="path to input dir"
    )
    parser.add_argument(
        '--use_diff', action='store_true', help='Whether to use the diffusion model'
    )
    parser.add_argument(
        '--f0_predictor', type=str, default="dio",
        help='Select F0 predictor, can select crepe,pm,dio,harvest, default dio(note: crepe is original F0 using mean filter)'
    )
    parser.add_argument(
        '--num_processes', type=int, default=1,
        help='You are advised to set the number of processes to the same as the number of CPU cores'
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parser.parse_args()
    f0p = args.f0_predictor
    print(speech_encoder)
    print(f0p)

    # Expected layout: <in_dir>/<speaker>/<file>.wav
    filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True)  # [:10]
    if not filenames:
        # Previously this fell through to range(0, 0, 0) and raised ValueError.
        print(f"No .wav files found under {args.in_dir}/*/, nothing to do.")
        return
    shuffle(filenames)

    if args.use_diff:
        print("use_diff")
        print("Loading Mel Extractor...")
        mel_extractor = Vocoder(
            dconfig.vocoder.type, dconfig.vocoder.ckpt, device=device
        )
        print("Loaded Mel Extractor.")
    else:
        mel_extractor = None

    multiprocessing.set_start_method("spawn", force=True)

    chunks = _split_into_chunks(filenames, args.num_processes)
    print([len(c) for c in chunks])

    processes = [
        multiprocessing.Process(
            target=process_batch,
            args=(chunk, f0p, args.use_diff, mel_extractor),
        )
        for chunk in chunks
    ]
    for p in processes:
        p.start()
    # Wait explicitly so failures can be reported instead of passing silently.
    for p in processes:
        p.join()
    failed = [p.name for p in processes if p.exitcode != 0]
    if failed:
        print(f"Warning: {len(failed)} worker process(es) exited with an error: {failed}")


if __name__ == "__main__":
    main()