Skip to content

Commit fba3d54

Browse files
committed
TorchAudio Materials
1 parent 1a15c92 commit fba3d54

2 files changed

Lines changed: 148 additions & 0 deletions

File tree

torchaudio/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# Use TorchAudio to Prepare Audio Data for Deep Learning

This folder provides sample code for the Real Python tutorial [Use TorchAudio to Prepare Audio Data for Deep Learning](https://realpython.com/python-torchaudio/).

torchaudio/speech.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import copy

if hasattr(copy, "replace"):
    # copy.replace() ships with Python 3.13+.
    replace = copy.replace
else:
    # Older interpreters: emulate copy.replace() for NamedTuple
    # instances, which expose the equivalent _replace() method.
    def replace(obj, **kwargs):
        """Return a copy of *obj* with the given fields replaced."""
        return obj._replace(**kwargs)
7+
8+
9+
from pathlib import Path
10+
from typing import Callable, NamedTuple, Self
11+
12+
import sounddevice as sd
13+
import torchaudio
14+
from IPython.display import Audio
15+
from torch import Tensor, clamp, randn_like
16+
from torch.nn import functional as F
17+
from torch.utils.data import Dataset
18+
from torchaudio import functional as AF
19+
from torchaudio.datasets import SPEECHCOMMANDS
20+
from torchaudio.datasets.speechcommands import FOLDER_IN_ARCHIVE
21+
from tqdm import tqdm
22+
23+
24+
class SpeechSample(NamedTuple):
25+
waveform: Tensor
26+
sample_rate: int
27+
label: str
28+
speaker_id: str
29+
utterance_number: int
30+
31+
@property
32+
def num_channels(self) -> int:
33+
return self.waveform.size(0)
34+
35+
@property
36+
def num_samples(self) -> int:
37+
return self.waveform.size(1)
38+
39+
@property
40+
def num_seconds(self) -> float:
41+
return self.num_samples / self.sample_rate
42+
43+
def play(self) -> None:
44+
sd.play(
45+
self.waveform.numpy().reshape(-1, self.num_channels),
46+
self.sample_rate,
47+
blocking=True,
48+
)
49+
50+
def play_widget(self) -> Audio:
51+
return Audio(
52+
self.waveform.numpy(), rate=self.sample_rate, autoplay=True
53+
)
54+
55+
def save(self, path: str | Path) -> None:
56+
torchaudio.save(path, self.waveform, self.sample_rate)
57+
58+
def apply(self, transform: Callable[[Tensor], Tensor]) -> Self:
59+
return replace(self, waveform=transform(self.waveform))
60+
61+
def resample(self, sample_rate: int) -> Self:
62+
return replace(
63+
self,
64+
sample_rate=sample_rate,
65+
waveform=AF.resample(
66+
self.waveform,
67+
orig_freq=self.sample_rate,
68+
new_freq=sample_rate,
69+
),
70+
)
71+
72+
def pad_trim(self, seconds: int | float) -> Self:
73+
num_samples = int(self.sample_rate * seconds)
74+
if self.num_samples > num_samples:
75+
return replace(self, waveform=self.waveform[:, :num_samples])
76+
elif self.num_samples < num_samples:
77+
padding_amount = num_samples - self.num_samples
78+
return replace(
79+
self, waveform=F.pad(self.waveform, (0, padding_amount))
80+
)
81+
else:
82+
return self
83+
84+
def with_gaussian_noise(self, level=0.01) -> Self:
85+
noise = randn_like(self.waveform) * level
86+
return replace(self, waveform=clamp(self.waveform + noise, -1.0, 1.0))
87+
88+
89+
class AugmentedSpeechCommands(Dataset):
    """SPEECHCOMMANDS wrapper that yields augmented SpeechSample items.

    Each item is optionally padded/trimmed to a fixed duration, overlaid
    with Gaussian noise, and run through a caller-supplied transform.
    """

    def __init__(
        self,
        folder: str | Path | None = None,
        seconds: int | float | None = None,
        noise_level: float = 0.005,
        enable_noise: bool = True,
        transform: Callable[[Tensor], Tensor] | None = None,
    ) -> None:
        # Default to ./<FOLDER_IN_ARCHIVE> under the working directory.
        self.folder = (
            Path(folder).resolve() if folder else Path.cwd() / FOLDER_IN_ARCHIVE
        )
        # SPEECHCOMMANDS wants the parent directory plus the archive
        # folder name as separate arguments.
        self._raw_dataset = SPEECHCOMMANDS(
            self.folder.parent, folder_in_archive=self.folder.name
        )
        self._seconds = seconds
        self._noise = noise_level
        self._enable_noise = enable_noise
        self._transform = transform

    def __len__(self) -> int:
        """Number of clips in the underlying dataset."""
        return len(self._raw_dataset)

    def __getitem__(self, index: int) -> SpeechSample:
        """Load clip *index* and apply the configured augmentations."""
        # get_metadata() appears to yield (relative_path, sample_rate,
        # label, speaker_id, utterance_number); the stored sample rate
        # is discarded since torchaudio.load() reports it anyway.
        rel_path, _, *rest = self._raw_dataset.get_metadata(index)
        waveform, sample_rate = torchaudio.load(self.folder / rel_path)
        sample = SpeechSample(waveform, sample_rate, *rest)

        if self._seconds is not None:
            sample = sample.pad_trim(self._seconds)
        if self._enable_noise:
            sample = sample.with_gaussian_noise(self._noise)
        if self._transform:
            sample = sample.apply(self._transform)
        return sample
129+
130+
131+
def bulk_process(
    dataset: SPEECHCOMMANDS,
    output_dir: str | Path,
    sample_rate: int,
    seconds: int | float,
) -> None:
    """Resample, pad/trim, and save every clip in *dataset*.

    Output files mirror the dataset's relative layout under
    *output_dir* (created as needed); progress is shown via tqdm.
    """
    destination = Path(output_dir).resolve()
    for index, raw in tqdm(enumerate(dataset), total=len(dataset)):
        sample = SpeechSample(*raw)
        relative_path, *_ = dataset.get_metadata(index)
        target = destination / relative_path
        target.parent.mkdir(parents=True, exist_ok=True)
        # Skip the (lossy) resample when the clip already matches.
        if sample.sample_rate != sample_rate:
            sample = sample.resample(sample_rate)
        sample.pad_trim(seconds).save(target)

0 commit comments

Comments
 (0)