- Notifications
You must be signed in to change notification settings - Fork 236
/
Copy pathvectorization.py
106 lines (87 loc) · 3.36 KB
/
vectorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Copyright 2019 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Functions that convert audio to machine readable vectors
"""
import hashlib
import numpy as np
import os
from typing import *

from precise.params import pr, Vectorizer
from precise.util import load_audio, InvalidAudio
from sonopy import mfcc_spec, mel_spec

# Timing parameters (in seconds) used when generating "inhibit" samples
# from wake word audio (see vectorize_inhibit below)
inhibit_t = 0.4
inhibit_dist_t = 1.0
inhibit_hop_t = 0.1


def _mel_vectorizer(audio):
    """Mel spectrogram vectorizer: audio frames -> feature vectors."""
    return mel_spec(
        audio, pr.sample_rate, (pr.window_samples, pr.hop_samples),
        num_filt=pr.n_filt, fft_size=pr.n_fft
    )


def _mfcc_vectorizer(audio):
    """MFCC vectorizer (sonopy): audio frames -> feature vectors."""
    return mfcc_spec(
        audio, pr.sample_rate, (pr.window_samples, pr.hop_samples),
        num_filt=pr.n_filt, fft_size=pr.n_fft, num_coeffs=pr.n_mfcc
    )


def _speechpy_mfcc_vectorizer(audio):
    """MFCC vectorizer backed by the optional speechpy package.

    speechpy is imported lazily so it is only required when this
    vectorizer is actually selected in the parameters.
    """
    import speechpy
    return speechpy.feature.mfcc(
        audio, pr.sample_rate, pr.window_t, pr.hop_t, pr.n_mfcc, pr.n_filt, pr.n_fft
    )


# Maps each Vectorizer enum value to the function that converts
# audio frames -> vectors
vectorizers = {
    Vectorizer.mels: _mel_vectorizer,
    Vectorizer.mfccs: _mfcc_vectorizer,
    Vectorizer.speechpy_mfccs: _speechpy_mfcc_vectorizer,
}
def vectorize_raw(audio: np.ndarray) -> np.ndarray:
    """Turn audio samples into feature vectors, without clipping for length.

    Raises:
        InvalidAudio: if the audio buffer is empty
    """
    if not len(audio):
        raise InvalidAudio('Cannot vectorize empty audio!')
    vectorizer_fn = vectorizers[pr.vectorizer]
    return vectorizer_fn(audio)
def add_deltas(features: np.ndarray) -> np.ndarray:
    """Insert extra features that are the difference between adjacent timesteps.

    Args:
        features: 2D array of shape (timesteps, feature_size)

    Returns:
        Array of shape (timesteps, 2 * feature_size) where the appended half
        holds features[i] - features[i - 1], with zeros for the first timestep
    """
    deltas = np.zeros_like(features)
    # Vectorized replacement for the per-row Python loop: row 0 stays zero,
    # each later row is the difference from the previous timestep
    deltas[1:] = features[1:] - features[:-1]
    return np.concatenate([features, deltas], -1)
def vectorize(audio: np.ndarray) -> np.ndarray:
    """
    Converts audio to machine readable vectors using
    configuration specified in ListenerParams (params.py)

    Args:
        audio: Audio verified to be of `sample_rate`

    Returns:
        array<float>: Vector representation of audio
    """
    # Keep only the most recent max_samples worth of audio
    if len(audio) > pr.max_samples:
        audio = audio[-pr.max_samples:]
    features = vectorize_raw(audio)
    deficit = pr.n_features - len(features)
    if deficit > 0:
        # Too few timesteps: left-pad with zero rows up to n_features
        padding = np.zeros((deficit, features.shape[1]))
        features = np.concatenate([padding, features])
    elif deficit < 0:
        # Too many timesteps: keep only the most recent n_features rows
        features = features[-pr.n_features:]
    return features
def vectorize_delta(audio: np.ndarray) -> np.ndarray:
    """Vectorizer used when use_delta is True: features plus timestep deltas."""
    features = vectorize(audio)
    return add_deltas(features)
def vectorize_inhibit(audio: np.ndarray) -> np.ndarray:
    """
    Returns an array of inputs generated from the
    wake word audio that shouldn't cause an activation
    """
    rate = pr.sample_rate
    # Hoist the loop-invariant sample offsets (converted from seconds)
    start = int(rate * inhibit_t)
    stop = int(rate * inhibit_dist_t)
    step = int(rate * inhibit_hop_t)
    min_remaining = int(rate * (pr.buffer_t / 2.))

    clipped = []
    for offset in range(start, stop, step):
        # Stop once clipping would leave less than half a buffer of audio
        if len(audio) - offset < min_remaining:
            break
        clipped.append(vectorize(audio[:-offset]))
    if clipped:
        return np.array(clipped)
    return np.empty((0, pr.n_features, pr.feature_size))