"""
The SpeechToText widget controls the speech recognition service of the
browser.
It wraps the HTML5 SpeechRecognition API. See
https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognition
This functionality is **experimental** and only supported by Chrome
and a few other browsers. Checkout
https://caniuse.com/speech-recognition for a up to date list of
browsers supporting the SpeechRecognition Api. Or alternatively
https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognition#Browser_compatibility
On some browsers, like Chrome, using Speech Recognition on a web page
involves a server-based recognition engine. Your audio is sent to a
web service for recognition processing, so it won't work
offline. Whether this is secure and confidential enough for your use
case is up to you to evaluate.
"""
from __future__ import annotations
from typing import (
TYPE_CHECKING, ClassVar, Mapping, Type,
)
import param
from ..models.speech_to_text import SpeechToText as _BkSpeechToText
from .base import Widget
from .button import BUTTON_TYPES
if TYPE_CHECKING:
from bokeh.model import Model
BUTTON_TYPES = BUTTON_TYPES+['light', 'dark']
[docs]class Language(param.Parameterized):
country = param.String(doc="A country like 'United States'")
name = param.String(doc="""
The bcp 47 code uniquely identifying the language. For example
'en-US'.""")
family = param.String(doc="""
The overall language family. For example 'English'.""")
def __str__(self):
return f"{self.family} - {self.country} ({self.name})"
# Source: https://stackoverflow.com/questions/14257598/what-are-language-codes-in-chromes-implementation-of-the-html5-speech-recogniti
# See also https://cloud.google.com/speech-to-text/docs/languages
LANGUAGES = [
Language(country='South Africa', family='Afrikaans', name='af-ZA'),
Language(country='Algeria', family='Arabic', name='ar-DZ'),
Language(country='Bahrain', family='Arabic', name='ar-BH'),
Language(country='Egypt', family='Arabic', name='ar-EG'),
Language(country='Israel', family='Arabic', name='ar-IL'),
Language(country='Iraq', family='Arabic', name='ar-IQ'),
Language(country='Jordan', family='Arabic', name='ar-JO'),
Language(country='Kuwait', family='Arabic', name='ar-KW'),
Language(country='Lebanon', family='Arabic', name='ar-LB'),
Language(country='Morocco', family='Arabic', name='ar-MA'),
Language(country='Oman', family='Arabic', name='ar-OM'),
Language(country='Palestinian Territory', family='Arabic', name='ar-PS'),
Language(country='Qatar', family='Arabic', name='ar-QA'),
Language(country='Saudi Arabia', family='Arabic', name='ar-SA'),
Language(country='Tunisia', family='Arabic', name='ar-TN'),
Language(country='UAE', family='Arabic', name='ar-AE'),
Language(country='Spain', family='Basque', name='eu-ES'),
Language(country='Bulgaria', family='Bulgarian', name='bg-BG'),
Language(country='Spain', family='Catalan', name='ca-ES'),
Language(country='China (Simp.)', family='Chinese Mandarin', name='cmn-Hans-CN'),
Language(country='Hong Kong SAR (Trad.)', family='Chinese Mandarin', name='cmn-Hans-HK'),
Language(country='Taiwan (Trad.)', family='Chinese Mandarin', name='cmn-Hant-TW'),
Language(country='Hong Kong', family='Chinese Cantonese', name='yue-Hant-HK'),
Language(country='Croatia', family='Croatian', name='hr_HR'),
Language(country='Czech Republic', family='Czech', name='cs-CZ'),
Language(country='Denmark', family='Danish', name='da-DK'),
Language(country='Australia', family='English', name='en-AU'),
Language(country='Canada', family='English', name='en-CA'),
Language(country='India', family='English', name='en-IN'),
Language(country='Ireland', family='English', name='en-IE'),
Language(country='New Zealand', family='English', name='en-NZ'),
Language(country='Philippines', family='English', name='en-PH'),
Language(country='South Africa', family='English', name='en-ZA'),
Language(country='United Kingdom', family='English', name='en-GB'),
Language(country='United States', family='English', name='en-US'),
Language(country='Iran', family='Farsi', name='fa-IR'),
Language(country='France', family='French', name='fr-FR'),
Language(country='Philippines', family='Filipino', name='fil-PH'),
Language(country='Spain', family='Galician', name='gl-ES'),
Language(country='Germany', family='German', name='de-DE'),
Language(country='Greece', family='Greek', name='el-GR'),
Language(country='Finland', family='Finnish', name='fi-FI'),
Language(country='Israel', family='Hebrew', name='he-IL'),
Language(country='India', family='Hindi', name='hi-IN'),
Language(country='Hungary', family='Hungarian', name='hu-HU'),
Language(country='Indonesia', family='Indonesian', name='id-ID'),
Language(country='Iceland', family='Icelandic', name='is-IS'),
Language(country='Italy', family='Italian', name='it-IT'),
Language(country='Switzerland', family='Italian', name='it-CH'),
Language(country='Japan', family='Japanese', name='ja-JP'),
Language(country='Korea', family='Korean', name='ko-KR'),
Language(country='Lithuania', family='Lithuanian', name='lt-LT'),
Language(country='Malaysia', family='Malaysian', name='ms-MY'),
Language(country='Netherlands', family='Dutch', name='nl-NL'),
Language(country='Norway', family='Norwegian', name='nb-NO'),
Language(country='Poland', family='Polish', name='pl-PL'),
Language(country='Brazil', family='Portuguese', name='pt-BR'),
Language(country='Portugal', family='Portuguese', name='pt-PT'),
Language(country='Romania', family='Romanian', name='ro-RO'),
Language(country='Russia', family='Russian', name='ru-RU'),
Language(country='Serbia', family='Serbian', name='sr-RS'),
Language(country='Slovakia', family='Slovak', name='sk-SK'),
Language(country='Slovenia', family='Slovenian', name='sl-SI'),
Language(country='Argentina', family='Spanish', name='es-AR'),
Language(country='Bolivia', family='Spanish', name='es-BO'),
Language(country='Chile', family='Spanish', name='es-CL'),
Language(country='Colombia', family='Spanish', name='es-CO'),
Language(country='Costa Rica', family='Spanish', name='es-CR'),
Language(country='Dominican Republic', family='Spanish', name='es-DO'),
Language(country='Ecuador', family='Spanish', name='es-EC'),
Language(country='El Salvador', family='Spanish', name='es-SV'),
Language(country='Guatemala', family='Spanish', name='es-GT'),
Language(country='Honduras', family='Spanish', name='es-HN'),
Language(country='México', family='Spanish', name='es-MX'),
Language(country='Nicaragua', family='Spanish', name='es-NI'),
Language(country='Panamá', family='Spanish', name='es-PA'),
Language(country='Paraguay', family='Spanish', name='es-PY'),
Language(country='Perú', family='Spanish', name='es-PE'),
Language(country='Puerto Rico', family='Spanish', name='es-PR'),
Language(country='Spain', family='Spanish', name='es-ES'),
Language(country='Uruguay', family='Spanish', name='es-UY'),
Language(country='United States', family='Spanish', name='es-US'),
Language(country='Venezuela', family='Spanish', name='es-VE'),
Language(country='Sweden', family='Swedish', name='sv-SE'),
Language(country='Thailand', family='Thai', name='th-TH'),
Language(country='Turkey', family='Turkish', name='tr-TR'),
Language(country='Ukraine', family='Ukrainian', name='uk-UA'),
Language(country='Viet Nam', family='Vietnamese', name='vi-VN'),
Language(country='South Africa', family='Zulu', name='zu-ZA'),
]
LANGUAGES.sort(key=lambda x: x.name)
LANGUAGE_CODES = [lang.name for lang in LANGUAGES]
[docs]class Grammar(param.Parameterized):
"""A set of words or patterns of words that we want the speech recognition service to recognize
For example
grammar = Grammar(
src='#JSGF V1.0; grammar colors; public <color> = aqua | azure | beige;',
weight=0.7
)
Wraps the HTML SpeechGrammar API.
See https://developer.mozilla.org/en-US/docs/Web/API/SpeechGrammar
"""
src = param.String(doc="""
A set of words or patterns of words that we want the
recognition service to recognize. Defined using JSpeech
Grammar Format. See https://www.w3.org/TR/jsgf/.""")
uri = param.String(doc="""
An uri pointing to the definition. If src is available it will
be used. Otherwise uri. The uri will be loaded on the client
side only.""")
weight = param.Number(default=1, bounds=(0.0, 1.0), step=0.01, doc="""
The weight of the grammar. A number in the range 0–1. Default is 1.""")
[docs] def serialize(self):
"""Returns the grammar as dict"""
if self.src:
return {"src": self.src, "weight": self.weight}
if self.uri:
return {"uri": self.uri, "weight": self.weight}
raise ValueError("One of src or uri must be set")
[docs]class GrammarList(list):
"""A list of Grammar objects containing words or patterns of words that we want the
recognition service to recognize.
Example:
grammar = '#JSGF V1.0; grammar colors; public <color> = aqua | azure | beige | bisque ;'
grammar_list = GrammarList()
grammar_list.add_from_string(grammar, 1)
Wraps the HTML 5 SpeechGrammarList API
See https://developer.mozilla.org/en-US/docs/Web/API/SpeechGrammarList
"""
[docs] def add_from_string(self, src, weight=1.0):
"""
Takes a src and weight and adds it to the GrammarList as a new
Grammar object. The new Grammar object is returned.
"""
grammar = Grammar(src=src, weight=weight)
self.append(grammar)
return grammar
[docs] def add_from_uri(self, uri, weight=1.0):
"""
Takes a grammar present at a specific uri, and adds it to the
GrammarList as a new Grammar object. The new Grammar object is
returned.
"""
grammar = Grammar(uri=uri, weight=weight)
self.append(grammar)
return grammar
[docs] def serialize(self):
"""Returns a list of serialized grammars"""
return [grammar.serialize() for grammar in self]
[docs]class RecognitionAlternative(param.Parameterized):
"""The RecognitionAlternative represents a word or
sentence that has been recognised by the speech recognition service.
Wraps the HTML5 SpeechRecognitionAlternative API
See https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognitionAlternative
"""
confidence = param.Number(bounds=(0.0, 1.0), constant=True, doc="""
A numeric estimate between 0 and 1 of how confident the speech recognition
system is that the recognition is correct.""")
transcript = param.String(constant=True, doc="""
The transcript of the recognised word or sentence.""")
[docs]class RecognitionResult(param.Parameterized):
"""The Result represents a single recognition match, which may contain
multiple RecognitionAlternative objects.
Wraps the HTML5 SpeechRecognitionResult API.
See https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognitionResult
"""
alternatives = param.List(item_type=RecognitionAlternative, constant=True, doc="""
The list of the n-best alternatives""")
is_final = param.Boolean(constant=True, doc="""
A Boolean that states whether this result is final (True) or
not (False) — if so, then this is the final time this result
will be returned; if not, then this result is an interim
result, and may be updated later on.""")
[docs] @classmethod
def create_from_dict(cls, result):
"""
Deserializes a serialized RecognitionResult
"""
result = result.copy()
alternatives = result.get("alternatives", [])
_alternatives = []
for alternative in alternatives:
_alternatives.append(RecognitionAlternative(**alternative))
result["alternatives"] = _alternatives
return cls(**result)
[docs] @classmethod
def create_from_list(cls, results):
"""
Deserializes a list of serialized RecognitionResults.
"""
return [cls.create_from_dict(result) for result in results]
[docs]class SpeechToText(Widget):
"""
The SpeechToText widget controls the speech recognition service of
the browser.
It wraps the HTML5 SpeechRecognition API. See
https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognition
Reference: https://panel.holoviz.org/reference/widgets/SpeechToText.html
:Example:
>>> SpeechToText(button_type="light")
This functionality is **experimental** and only supported by
Chrome and a few other browsers. Checkout
https://caniuse.com/speech-recognition for a up to date list of
browsers supporting the SpeechRecognition Api. Or alternatively
https://developer.mozilla.org/en-US/docs/Web/API/SpeechRecognition#Browser_compatibility
On some browsers, like Chrome, using Speech Recognition on a web
page involves a server-based recognition engine. Your audio is
sent to a web service for recognition processing, so it won't work
offline. Whether this is secure and confidential enough for your
use case is up to you to evaluate.
"""
abort = param.Event(doc="""
Stops the speech recognition service from listening to
incoming audio, and doesn't attempt to return a
RecognitionResult.""")
start = param.Event(doc="""
Starts the speech recognition service listening to incoming
audio with intent to recognize grammars associated with the
current SpeechRecognition.""")
stop = param.Event(doc="""
Stops the speech recognition service from listening to
incoming audio, and attempts to return a RecognitionResult
using the audio captured so far.""")
lang = param.ObjectSelector(default="", objects=[""] + LANGUAGE_CODES,
allow_None=True, label="Language", doc="""
The language of the current SpeechRecognition in BCP 47
format. For example 'en-US'. If not specified, this defaults
to the HTML lang attribute value, or the user agent's language
setting if that isn't set either. """)
continuous = param.Boolean(default=False, doc="""
Controls whether continuous results are returned for each
recognition, or only a single result. Defaults to False""")
interim_results = param.Boolean(default=False, doc="""
Controls whether interim results should be returned (True) or
not (False.) Interim results are results that are not yet
final (e.g. the RecognitionResult.is_final property is
False).""")
max_alternatives = param.Integer(default=1, bounds=(1, 5), doc="""
Sets the maximum number of RecognitionAlternatives provided
per result. A number between 1 and 5. The default value is
1.""")
service_uri = param.String(doc="""
Specifies the location of the speech recognition service used
by the current SpeechRecognition to handle the actual
recognition. The default is the user agent's default speech
service.""")
grammars = param.ClassSelector(class_=GrammarList, doc="""
A GrammarList object that represents the grammars that will be
understood by the current SpeechRecognition service""")
button_hide = param.Boolean(default=False, label="Hide the Button", doc="""
If True no button is shown. If False a toggle Start/ Stop button is shown.""")
button_type = param.ObjectSelector(default="light", objects=BUTTON_TYPES, doc="""
The button styling.""")
button_not_started = param.String(label="Button Text when not started", doc="""
The text to show on the button when the SpeechRecognition
service is NOT started. If '' a *muted microphone* icon is
shown.""")
button_started = param.String(label="Button Text when started", doc="""
The text to show on the button when the SpeechRecognition
service is started. If '' a *muted microphone* icon is
shown.""")
started = param.Boolean(constant=True, doc="""
Returns True if the Speech Recognition Service is started and
False otherwise.""")
audio_started = param.Boolean(constant=True, doc="""
Returns True if the Audio is started and False otherwise.""")
sound_started = param.Boolean(constant=True, doc="""
Returns True if the Sound is started and False otherwise.""")
speech_started = param.Boolean(constant=True, doc="""
Returns True if the the User has started speaking and False otherwise.""")
results = param.List(constant=True, doc="""
The `results` as a list of Dictionaries.""")
value = param.String(constant=True, label="Last Result", doc="""
The transcipt of the highest confidence RecognitionAlternative
of the last RecognitionResult. Please note we strip the
transcript for leading spaces.""")
_grammars = param.List(constant=True, doc="""
List used to transfer the serialized grammars from server to
browser.""")
_rename: ClassVar[Mapping[str, str | None]] = {
'grammars': None, '_grammars': 'grammars', 'name': None, 'value': None,
}
_widget_type: ClassVar[Type[Model]] = _BkSpeechToText
def __init__(self, **params):
super().__init__(**params)
if self.grammars:
self._update_grammars()
def __repr__(self, depth=None):
# Custom repr needed to avoid infinite recursion because this Parameterized class has
# multiple actions
return f"SpeechToText(name='{self.name}')"
@param.depends('grammars', watch=True)
def _update_grammars(self):
with param.edit_constant(self):
if self.grammars:
self._grammars = self.grammars.serialize() # pylint: disable=no-member
else:
self._grammars = []
@param.depends('results', watch=True)
def _update_results(self):
# pylint: disable=unsubscriptable-object
with param.edit_constant(self):
if self.results and 'alternatives' in self.results[-1]:
self.value = (self.results[-1]['alternatives'][0]['transcript']).lstrip()
else:
self.value = ''
@property
def results_deserialized(self):
"""
Returns the results as a List of RecognitionResults
"""
return RecognitionResult.create_from_list(self.results)
@property
def results_as_html(self) -> str:
"""
Returns the `results` formatted as html
Convenience method for ease of use
"""
if not self.results:
return 'No results'
html = '<div class="pn-speech-recognition-result">'
total = len(self.results) - 1
for index, result in enumerate(reversed(self.results_deserialized)):
if len(self.results) > 1:
html += f'<h3>Result {total-index}</h3>'
html += f'<span>Is Final: {result.is_final}</span><br/>'
for index2, alternative in enumerate(result.alternatives):
if len(result.alternatives) > 1:
html += f'<h4>Alternative {index2}</h4>'
html += f"""
<span>Confidence: {alternative.confidence:.2f}</span>
</br>
<p>
<strong>{alternative.transcript}</strong>
</p>
"""
html += '</div>'
return html