Source code for data.movie.audio
from data.abstract import Dataset
from data.utils import calculate_volume_and_timbre
import os
import glob
import numpy as np
import librosa
from types import SimpleNamespace
import pandas as pd
[docs]
class Audio(Dataset):
    """Movie soundtrack dataset plus features derived from its separated tracks.

    The dataset itself is the raw waveform returned by ``_load``.  Two
    properties expose additional features read from sibling asset files:
    ``instruments`` (accompaniment track) and ``vocals`` (vocal track).
    All asset paths are resolved relative to the directory of this module.
    """

    # Raw audio is sampled at 48000 Hz
    # (presumably ``_unit_second`` is the duration of one sample -- confirm
    # against the ``Dataset`` base class).
    _unit_second = 1 / 48000
    _base_path = os.path.dirname(__file__)

    def __init__(self):
        """Delegate construction to the ``Dataset`` base class."""
        super().__init__()

    def _load(self):
        """Read ``audio.wav`` and return its samples.

        ``sr=None`` is passed to ``librosa.load`` so the file is not
        resampled; the sampling rate it reports back is discarded.
        """
        wav_path = os.path.join(self._base_path, "../bin/movie/audio/audio.wav")
        waveform, _native_rate = librosa.load(wav_path, sr=None)
        return waveform

    @property
    def instruments(self):
        """Volume and timbre computed from ``accompaniment.wav``.

        Returns a ``SimpleNamespace`` with attributes ``volume`` and
        ``timbre``.  The features are recomputed on every access.
        """
        track_path = os.path.join(
            self._base_path, "../bin/movie/audio/accompaniment.wav"
        )
        loudness, colour = calculate_volume_and_timbre(track_path)
        return SimpleNamespace(volume=loudness, timbre=colour)

    @property
    def vocals(self):
        """Volume, timbre and emotion-related series for the vocal track.

        Returns a ``SimpleNamespace`` with, in this order:

        * ``volume``, ``timbre`` -- computed from ``vocals.wav``;
        * ``arousal``, ``dominance``, ``valence`` -- columns 0, 1 and 2 of
          ``vocal_attributes.npy``;
        * ``angry``, ``sad``, ``happy``, ``surprise``, ``fear``, ``disgust``,
          ``contempt``, ``neutral`` -- the same-named (capitalised) columns
          of ``vocal_emos.parquet``.

        Every series except ``volume`` and ``timbre`` is wrapped with
        ``Dataset.from_array``.  Everything is reloaded on every access.
        """
        # Second argument handed to every ``Dataset.from_array`` call below
        # (presumably the spacing in seconds between values -- TODO confirm).
        step = 0.3

        track_path = os.path.join(self._base_path, "../bin/movie/audio/vocals.wav")
        loudness, colour = calculate_volume_and_timbre(track_path)
        features = {"volume": loudness, "timbre": colour}

        # According to 3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes
        attributes_path = os.path.join(
            self._base_path, "../bin/movie/audio/vocal_attributes.npy"
        )
        attribute_scores = np.load(attributes_path)
        for column, label in enumerate(("arousal", "dominance", "valence")):
            features[label] = Dataset.from_array(attribute_scores[:, column], step)

        # According to 3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes
        # {0: 'Angry',1: 'Sad',2: 'Happy',3: 'Surprise',4: 'Fear',5: 'Disgust',6: 'Contempt',7: 'Neutral'}
        emotions_path = os.path.join(
            self._base_path, "../bin/movie/audio/vocal_emos.parquet"
        )
        emotion_table = pd.read_parquet(emotions_path)
        # Make a separate series for each emotion, keyed by its lower-case name.
        for column_name in (
            "Angry",
            "Sad",
            "Happy",
            "Surprise",
            "Fear",
            "Disgust",
            "Contempt",
            "Neutral",
        ):
            features[column_name.lower()] = Dataset.from_array(
                emotion_table[column_name].to_numpy(), step
            )

        return SimpleNamespace(**features)