"""VAD-based segmentation.
"""
from ._pocketsphinx import Endpointer
from collections import namedtuple
# Record yielded by `Segmenter.segment` for each detected speech region:
# `start_time` and `end_time` are copied from the endpointer's `speech_start`
# and `speech_end` attributes, and `pcm` is the region's audio as one
# contiguous bytes object.
SpeechSegment = namedtuple("SpeechSegment", ["start_time", "end_time", "pcm"])
class Segmenter(Endpointer):
    """VAD-based speech segmentation.

    This is a simple class that segments audio from an input stream,
    which is assumed to produce binary data as 16-bit signed integers
    when `read` is called on it.  It takes the same arguments as its
    parent `Endpointer` class.

    You could obviously use this on a raw audio file, but also on a
    `sounddevice.RawInputStream` or the output of `sox`.  You can even
    use it with the built-in `wave` module, for example::

        with wave.open("foo.wav", "r") as w:
            segmenter = Segmenter(sample_rate=w.getframerate())
            for seg in segmenter.segment(w.getfp()):
                with wave.open("%.2f-%.2f.wav"
                               % (seg.start_time, seg.end_time), "w") as wo:
                    wo.setframerate(w.getframerate())
                    wo.writeframesraw(seg.pcm)

    Args:
        window(float): Length in seconds of window for decision.
        ratio(float): Fraction of window that must be speech or
                      non-speech to make a transition.
        mode(int): Aggressiveness of voice activity detection (0-3)
        sample_rate(int): Sampling rate of input, default is 16000.
                          Rates other than 8000, 16000, 32000, 48000
                          are only approximately supported, see note
                          in `frame_length`.  Outlandish sampling
                          rates like 3924 and 115200 will raise a
                          `ValueError`.
        frame_length(float): Desired input frame length in seconds,
                             default is 0.03.  The *actual* frame
                             length may be different if an
                             approximately supported sampling rate is
                             requested.  You must *always* use the
                             `frame_bytes` and `frame_length`
                             attributes to determine the input size.

    Raises:
        ValueError: Invalid input parameter.  Also raised if the ratio
                    makes it impossible to do endpointing (i.e. it
                    is more than N-1 or less than 1 frame).
    """

    def __init__(self, *args, **kwargs):
        """Create a segmenter; all arguments are forwarded to `Endpointer`."""
        super(Segmenter, self).__init__(*args, **kwargs)
        # Audio chunks returned by the endpointer for the segment that is
        # currently being accumulated; emptied each time a segment is yielded.
        self.speech_frames = []

    def segment(self, stream):
        """Split a stream of data into speech segments.

        Reads `frame_bytes` bytes at a time from `stream` until a read
        returns no data.  Full-sized reads are passed to `process`; a
        shorter (final) read is passed to `end_stream`.

        Args:
            stream: File-like object returning binary data (assumed to
                    be single-channel, 16-bit integer PCM)

        Returns:
            Iterable[SpeechSegment]: Generator over `SpeechSegment` for
            each speech region detected by the `Endpointer`.
        """
        while True:
            frame = stream.read(self.frame_bytes)
            if len(frame) == 0:
                # NOTE(review): if the input ends exactly on a frame
                # boundary, `end_stream` is never called, so any audio still
                # held in `self.speech_frames` is not yielded -- confirm
                # whether a final flush is wanted here.
                break
            elif len(frame) < self.frame_bytes:
                # Short read: hand the partial frame to the end-of-stream
                # handler instead of the regular per-frame one.
                speech = self.end_stream(frame)
            else:
                speech = self.process(frame)
            if speech is not None:
                self.speech_frames.append(speech)
                # Audio was returned but the endpointer no longer reports
                # being in speech: the segment is complete, so emit it and
                # reset the buffer for the next one.
                if not self.in_speech:
                    yield SpeechSegment(
                        start_time=self.speech_start,
                        end_time=self.speech_end,
                        pcm=b"".join(self.speech_frames),
                    )
                    del self.speech_frames[:]