# Source code for pocketsphinx.segmenter

"""VAD-based segmentation.
"""

from ._pocketsphinx import Endpointer
from collections import namedtuple

# One detected speech region, as yielded by `Segmenter.segment`:
#   start_time / end_time -- copied from the segmenter's `speech_start` and
#                            `speech_end` attributes when the region closes
#   pcm                   -- the region's audio frames joined into one bytes
SpeechSegment = namedtuple("SpeechSegment", "start_time end_time pcm")


class Segmenter(Endpointer):
    """VAD-based speech segmentation.

    This is a simple class that segments audio from an input stream,
    which is assumed to produce binary data as 16-bit signed integers
    when `read` is called on it.  It takes the same arguments as its
    parent `Endpointer` class.

    You could obviously use this on a raw audio file, but also on a
    `sounddevice.RawInputStream` or the output of `sox`.  You can even
    use it with the built-in `wave` module, for example::

        with wave.open("foo.wav", "r") as w:
            segmenter = Segmenter(sample_rate=w.getframerate())
            for seg in segmenter.segment(w.getfp()):
                with wave.open("%.2f-%.2f.wav"
                               % (seg.start_time, seg.end_time), "w") as wo:
                    # The writer refuses to emit frames until channel
                    # count, sample width and frame rate are all set.
                    wo.setnchannels(1)
                    wo.setsampwidth(2)
                    wo.setframerate(w.getframerate())
                    wo.writeframesraw(seg.pcm)

    Args:
      window(float): Length in seconds of window for decision.
      ratio(float): Fraction of window that must be speech or
                    non-speech to make a transition.
      mode(int): Aggressiveness of voice activity detection (0-3)
      sample_rate(int): Sampling rate of input, default is 16000.
                        Rates other than 8000, 16000, 32000, 48000
                        are only approximately supported, see note
                        in `frame_length`.  Outlandish sampling
                        rates like 3924 and 115200 will raise a
                        `ValueError`.
      frame_length(float): Desired input frame length in seconds,
                           default is 0.03.  The *actual* frame
                           length may be different if an
                           approximately supported sampling rate is
                           requested.  You must *always* use the
                           `frame_bytes` and `frame_length`
                           attributes to determine the input size.

    Raises:
      ValueError: Invalid input parameter.  Also raised if the ratio
                  makes it impossible to do endpointing (i.e. it
                  is more than N-1 or less than 1 frame).
    """
    def __init__(self, *args, **kwargs):
        # All arguments are forwarded untouched to `Endpointer`.
        super(Segmenter, self).__init__(*args, **kwargs)
        # Frames of the speech region currently being accumulated; emptied
        # each time a finished segment is yielded by `segment`.
        self.speech_frames = []

    def segment(self, stream):
        """Split a stream of data into speech segments.

        Reads `frame_bytes` bytes at a time from `stream`.  Full frames
        are passed to `process`; a shorter, non-empty read is passed to
        `end_stream`.  Whatever those calls return (when not `None`) is
        buffered, and once `in_speech` is false the buffered frames are
        emitted as a single `SpeechSegment`.

        Args:
            stream: File-like object returning binary data (assumed to
                    be single-channel, 16-bit integer PCM)

        Returns:
           Iterable[SpeechSegment]: Generator over `SpeechSegment` for
           each speech region detected by the `Endpointer`.

        """
        while True:
            frame = stream.read(self.frame_bytes)
            if len(frame) == 0:
                # NOTE(review): when the input ends exactly on a frame
                # boundary we stop here without calling `end_stream`, so
                # anything still held in `self.speech_frames` is not
                # yielded.  Confirm against `Endpointer.end_stream`
                # whether a flush is needed/possible in this case.
                break
            if len(frame) < self.frame_bytes:
                # Short read: hand it over as the final frame of the stream.
                speech = self.end_stream(frame)
            else:
                speech = self.process(frame)
            if speech is None:
                # Nothing returned for this frame, so nothing to buffer.
                continue
            self.speech_frames.append(speech)
            if not self.in_speech:
                # The region has ended: emit everything buffered for it
                # and start the next region with an empty buffer.
                yield SpeechSegment(
                    start_time=self.speech_start,
                    end_time=self.speech_end,
                    pcm=b"".join(self.speech_frames),
                )
                del self.speech_frames[:]