"""Main module for the PocketSphinx speech recognizer.
"""
# Copyright (c) 1999-2016 Carnegie Mellon University. All rights
# reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# This work was supported in part by funding from the Defense Advanced
# Research Projects Agency and the National Science Foundation of the
# United States of America, and the CMU Sphinx Speech Consortium.
#
# THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
# ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
# NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import collections
import importlib.util
import os
import signal
from contextlib import contextmanager
from . import _pocketsphinx as pocketsphinx # noqa: F401
from ._pocketsphinx import LogMath # noqa: F401
from ._pocketsphinx import Config # noqa: F401
from ._pocketsphinx import Decoder # noqa: F401
from ._pocketsphinx import Jsgf # noqa: F401
from ._pocketsphinx import JsgfRule # noqa: F401
from ._pocketsphinx import NGramModel # noqa: F401
from ._pocketsphinx import FsgModel # noqa: F401
from ._pocketsphinx import Segment # noqa: F401
from ._pocketsphinx import Hypothesis # noqa: F401
from ._pocketsphinx import Lattice # noqa: F401
from ._pocketsphinx import Vad # noqa: F401
from ._pocketsphinx import Endpointer # noqa: F401
from ._pocketsphinx import Alignment # noqa: F401
from ._pocketsphinx import AlignmentEntry # noqa: F401
from ._pocketsphinx import set_loglevel # noqa: F401
from .segmenter import Segmenter # noqa: F401
Arg = collections.namedtuple("Arg", ["name", "default", "doc", "type", "required"])
Arg.__doc__ = "Description of a configuration parameter."
Arg.name.__doc__ = "Parameter name (without leading dash)."
Arg.default.__doc__ = "Default value of parameter."
Arg.doc.__doc__ = "Description of parameter."
Arg.type.__doc__ = "Type (as a Python type object) of parameter value."
Arg.required.__doc__ = "Is this parameter required?"
def get_model_path(subpath=None):
"""Return path to the model directory, or optionally, a specific file
or directory within it.
If the POCKETSPHINX_PATH environment variable is set, it will be
returned here, otherwise the default is determined by your
PocketSphinx installation, and may or may not be writable by you.
Args:
subpath: An optional path to add to the model directory.
Returns:
The requested path within the model directory.
"""
model_path = pocketsphinx._ps_default_modeldir()
if model_path is None:
# Use importlib to find things (so editable installs work)
model_path = importlib.util.find_spec(
"pocketsphinx.model"
).submodule_search_locations[0]
if subpath is not None:
return os.path.join(model_path, subpath)
else:
return model_path
class Pocketsphinx(Decoder):
"""Compatibility wrapper class.
This class is deprecated, as most of its functionality is now
available in the main `Decoder` class, but it is here in case you
had code that used the old external pocketsphinx-python module.
"""
def __init__(self, **kwargs):
if kwargs.get("dic") is not None and kwargs.get("dict") is None:
kwargs["dict"] = kwargs.pop("dic")
if kwargs.pop("verbose", False) is True:
kwargs["loglevel"] = "INFO"
self.start_frame = 0
super(Pocketsphinx, self).__init__(**kwargs)
def __str__(self):
return self.hypothesis()
@contextmanager
def start_utterance(self):
self.start_utt()
yield
self.end_utt()
@contextmanager
def end_utterance(self):
self.end_utt()
yield
self.start_utt()
def decode(self, audio_file, buffer_size=2048, no_search=False, full_utt=False):
buf = bytearray(buffer_size)
with open(audio_file, "rb") as f:
with self.start_utterance():
while f.readinto(buf):
self.process_raw(buf, no_search, full_utt)
return self
def segments(self, detailed=False):
if detailed:
lmath = self.get_logmath()
return [
(
s.word,
lmath.log(s.prob),
self.start_frame + s.start_frame,
self.start_frame + s.end_frame,
)
for s in self.seg()
]
else:
return [s.word for s in self.seg()]
def hypothesis(self):
hyp = self.hyp()
if hyp:
return hyp.hypstr
else:
return ""
def probability(self):
hyp = self.hyp()
if hyp:
return self.get_logmath().log(hyp.prob)
def score(self):
hyp = self.hyp()
if hyp:
return self.get_logmath().log(hyp.best_score)
def best(self, count=10):
lmath = self.get_logmath()
return [
(h.hypstr, lmath.log(h.score)) for h, i in zip(self.nbest(), range(count))
]
def confidence(self):
hyp = self.hyp()
if hyp:
return hyp.prob
[docs]
class AudioFile(Pocketsphinx):
"""Simple audio file segmentation and speech recognition.
It is recommended to use the `Segmenter` and `Decoder` classes
directly, but this is here in case you had code that used the old
external pocketsphinx-python module, or need something very
simple.
"""
def __init__(self, audio_file=None, **kwargs):
signal.signal(signal.SIGINT, self.stop)
self.audio_file = audio_file
self.segmenter = Segmenter()
# You would never actually set these!
kwargs.pop("no_search", False)
kwargs.pop("full_utt", False)
kwargs.pop("buffer_size", False)
self.keyphrase = kwargs.get("keyphrase")
super(AudioFile, self).__init__(**kwargs)
self.f = open(self.audio_file, "rb")
def __iter__(self):
with self.f:
for speech in self.segmenter.segment(self.f):
self.start_frame = int(speech.start_time * self.config["frate"] + 0.5)
self.start_utt()
self.process_raw(speech.pcm, full_utt=True)
if self.keyphrase and self.hyp():
self.end_utt()
yield self
else:
self.end_utt()
yield self
[docs]
def stop(self, *args, **kwargs):
raise StopIteration
[docs]
class LiveSpeech(Pocketsphinx):
"""Simple endpointing and live speech recognition.
This class is not very useful for an actual application. It is
recommended to use the `Endpointer` and `Decoder` classes
directly, but it is here in case you had code that used the old
external pocketsphinx-python module, or need something incredibly
simple.
"""
def __init__(self, **kwargs):
self.audio_device = kwargs.pop("audio_device", None)
self.sampling_rate = kwargs.pop("sampling_rate", 16000)
self.ep = Endpointer(sample_rate=self.sampling_rate)
self.buffer_size = self.ep.frame_bytes
# Setting these will not do anything good!
kwargs.pop("no_search", False)
kwargs.pop("full_utt", False)
kwargs.pop("buffer_size", False)
self.keyphrase = kwargs.get("keyphrase")
try:
import sounddevice
assert sounddevice
except Exception as e:
# In case PortAudio is not present, for instance
raise RuntimeError("LiveSpeech not supported: %s" % e)
self.ad = sounddevice.RawInputStream(
samplerate=self.sampling_rate,
# WE DO NOT CARE ABOUT LATENCY!
blocksize=self.buffer_size // 2,
dtype="int16",
channels=1,
device=self.audio_device,
)
super(LiveSpeech, self).__init__(**kwargs)
def __iter__(self):
with self.ad:
not_done = True
while not_done:
try:
self.buf, _ = self.ad.read(self.buffer_size // 2)
if len(self.buf) == self.buffer_size:
speech = self.ep.process(self.buf)
else:
speech = self.ep.end_stream(self.buf)
not_done = False
if speech is not None:
if not self.in_speech:
self.start_utt()
self.process_raw(speech)
if self.keyphrase and self.hyp():
with self.end_utterance():
yield self
elif not self.ep.in_speech:
self.end_utt()
if self.hyp():
yield self
except KeyboardInterrupt:
break
@property
def in_speech(self):
return self.get_in_speech()