1 /*
2  * Copyright 2008-2014 Arsen Chaloyan
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  *
16  * $Id: mrcp_recog_header.h 2136 2014-07-04 06:33:36Z achaloyan@gmail.com $
17  */
18 
19 #ifndef MRCP_RECOG_HEADER_H
20 #define MRCP_RECOG_HEADER_H
21 
22 /**
23  * @file mrcp_recog_header.h
24  * @brief MRCP Recognizer Header
25  */
26 
27 #include "mrcp_types.h"
28 #include "mrcp_header_accessor.h"
29 
30 APT_BEGIN_EXTERN_C
31 
/**
 * MRCP recognizer header field identifiers.
 *
 * Enumerators are numbered consecutively from 0 in declaration order: the
 * base set of header fields comes first, followed by the header fields
 * added for MRCP v2. Do not reorder or insert entries in the middle, since
 * that would change the numeric value of every following identifier.
 */
typedef enum {
	RECOGNIZER_HEADER_CONFIDENCE_THRESHOLD,
	RECOGNIZER_HEADER_SENSITIVITY_LEVEL,
	RECOGNIZER_HEADER_SPEED_VS_ACCURACY,
	RECOGNIZER_HEADER_N_BEST_LIST_LENGTH,
	RECOGNIZER_HEADER_NO_INPUT_TIMEOUT,
	RECOGNIZER_HEADER_RECOGNITION_TIMEOUT,
	RECOGNIZER_HEADER_WAVEFORM_URI,
	RECOGNIZER_HEADER_COMPLETION_CAUSE,
	RECOGNIZER_HEADER_RECOGNIZER_CONTEXT_BLOCK,
	RECOGNIZER_HEADER_START_INPUT_TIMERS,
	RECOGNIZER_HEADER_SPEECH_COMPLETE_TIMEOUT,
	RECOGNIZER_HEADER_SPEECH_INCOMPLETE_TIMEOUT,
	RECOGNIZER_HEADER_DTMF_INTERDIGIT_TIMEOUT,
	RECOGNIZER_HEADER_DTMF_TERM_TIMEOUT,
	RECOGNIZER_HEADER_DTMF_TERM_CHAR,
	RECOGNIZER_HEADER_FAILED_URI,
	RECOGNIZER_HEADER_FAILED_URI_CAUSE,
	RECOGNIZER_HEADER_SAVE_WAVEFORM,
	RECOGNIZER_HEADER_NEW_AUDIO_CHANNEL,
	RECOGNIZER_HEADER_SPEECH_LANGUAGE,

	/* Additional header fields for MRCP v2 */
	RECOGNIZER_HEADER_INPUT_TYPE,
	RECOGNIZER_HEADER_INPUT_WAVEFORM_URI,
	RECOGNIZER_HEADER_COMPLETION_REASON,
	RECOGNIZER_HEADER_MEDIA_TYPE,
	RECOGNIZER_HEADER_VER_BUFFER_UTTERANCE,
	RECOGNIZER_HEADER_RECOGNITION_MODE,
	RECOGNIZER_HEADER_CANCEL_IF_QUEUE,
	RECOGNIZER_HEADER_HOTWORD_MAX_DURATION,
	RECOGNIZER_HEADER_HOTWORD_MIN_DURATION,
	RECOGNIZER_HEADER_INTERPRET_TEXT,
	RECOGNIZER_HEADER_DTMF_BUFFER_TIME,
	RECOGNIZER_HEADER_CLEAR_DTMF_BUFFER,
	RECOGNIZER_HEADER_EARLY_NO_MATCH,
	RECOGNIZER_HEADER_NUM_MIN_CONSISTENT_PRONUNCIATIONS,
	RECOGNIZER_HEADER_CONSISTENCY_THRESHOLD,
	RECOGNIZER_HEADER_CLASH_THRESHOLD,
	RECOGNIZER_HEADER_PERSONAL_GRAMMAR_URI,
	RECOGNIZER_HEADER_ENROLL_UTTERANCE,
	RECOGNIZER_HEADER_PHRASE_ID,
	RECOGNIZER_HEADER_PHRASE_NL,
	RECOGNIZER_HEADER_WEIGHT,
	RECOGNIZER_HEADER_SAVE_BEST_WAVEFORM,
	RECOGNIZER_HEADER_NEW_PHRASE_ID,
	RECOGNIZER_HEADER_CONFUSABLE_PHRASES_URI,
	RECOGNIZER_HEADER_ABORT_PHRASE_ENROLLMENT,

	/** Number of identifiers declared above; not a header field itself */
	RECOGNIZER_HEADER_COUNT
} mrcp_recognizer_header_id;
84 
85 
/**
 * MRCP recognizer completion-cause codes.
 *
 * Every enumerator carries an explicit numeric value. The base set occupies
 * 0..10 and the causes added for MRCP v2 occupy 11..16.
 */
typedef enum {
	RECOGNIZER_COMPLETION_CAUSE_SUCCESS                 = 0,
	RECOGNIZER_COMPLETION_CAUSE_NO_MATCH                = 1,
	RECOGNIZER_COMPLETION_CAUSE_NO_INPUT_TIMEOUT        = 2,
	RECOGNIZER_COMPLETION_CAUSE_RECOGNITION_TIMEOUT     = 3,
	RECOGNIZER_COMPLETION_CAUSE_GRAM_LOAD_FAILURE       = 4,
	RECOGNIZER_COMPLETION_CAUSE_GRAM_COMP_FAILURE       = 5,
	RECOGNIZER_COMPLETION_CAUSE_ERROR                   = 6,
	RECOGNIZER_COMPLETION_CAUSE_SPEECH_TOO_EARLY        = 7,
	RECOGNIZER_COMPLETION_CAUSE_TOO_MUCH_SPEECH_TIMEOUT = 8,
	RECOGNIZER_COMPLETION_CAUSE_URI_FAILURE             = 9,
	RECOGNIZER_COMPLETION_CAUSE_LANGUAGE_UNSUPPORTED    = 10,

	/* Additional completion-cause codes for MRCP v2 */
	RECOGNIZER_COMPLETION_CAUSE_CANCELLED               = 11,
	RECOGNIZER_COMPLETION_CAUSE_SEMANTICS_FAILURE       = 12,
	RECOGNIZER_COMPLETION_CAUSE_PARTIAL_MATCH           = 13,
	RECOGNIZER_COMPLETION_CAUSE_PARTIAL_MATCH_MAXTIME   = 14,
	RECOGNIZER_COMPLETION_CAUSE_NO_MATCH_MAXTIME        = 15,
	RECOGNIZER_COMPLETION_CAUSE_GRAM_DEFINITION_FAILURE = 16,

	/** Number of defined completion causes */
	RECOGNIZER_COMPLETION_CAUSE_COUNT                   = 17,
	/** Alias of the count value, i.e. the first value past the defined causes */
	RECOGNIZER_COMPLETION_CAUSE_UNKNOWN                 = RECOGNIZER_COMPLETION_CAUSE_COUNT
} mrcp_recog_completion_cause_e;
111 
112 
113 
114 /** MRCP recognizer-header declaration */
115 typedef struct mrcp_recog_header_t mrcp_recog_header_t;
116 
117 /** MRCP recognizer-header */
118 struct mrcp_recog_header_t {
119 	/** Tells the recognizer resource what confidence level the client considers a
120     successful match */
121 	float                         confidence_threshold;
122 	/** To filter out background noise and not mistake it for speech */
123 	float                         sensitivity_level;
124 	/** Tunable towards Performance or Accuracy */
125 	float                         speed_vs_accuracy;
126 	/** The client, by setting this header, can ask the recognition resource
127 	to send it more  than 1 alternative */
128 	apr_size_t                    n_best_list_length;
129 	/** The client can use the no-input-timeout header to set this timeout */
130 	apr_size_t                    no_input_timeout;
131 	/** The client can use the recognition-timeout header to set this timeout */
132 	apr_size_t                    recognition_timeout;
133 	/** MUST be present in the RECOGNITION-COMPLETE event if the Save-Waveform
134 	header was set to true */
135 	apt_str_t                     waveform_uri;
136 	/** MUST be part of a RECOGNITION-COMPLETE, event coming from
137     the recognizer resource to the client */
138 	mrcp_recog_completion_cause_e completion_cause;
139 	/** MAY be sent as part of the SET-PARAMS or GET-PARAMS request */
140 	apt_str_t                     recognizer_context_block;
141 	/** MAY be sent as part of the RECOGNIZE request. A value of false tells
142 	the recognizer to start recognition, but not to start the no-input timer yet */
143 	apt_bool_t                    start_input_timers;
144 	/** Specifies the length of silence required following user
145     speech before the speech recognizer finalizes a result */
146 	apr_size_t                    speech_complete_timeout;
147 	/** Specifies the required length of silence following user
148     speech after which a recognizer finalizes a result */
149 	apr_size_t                    speech_incomplete_timeout;
150 	/** Specifies the inter-digit timeout value to use when
151     recognizing DTMF input */
152 	apr_size_t                    dtmf_interdigit_timeout;
153 	/** Specifies the terminating timeout to use when
154 	recognizing DTMF input*/
155 	apr_size_t                    dtmf_term_timeout;
156 	/** Specifies the terminating DTMF character for DTMF input
157     recognition */
158 	char                          dtmf_term_char;
159 	/** When a recognizer needs to fetch or access a URI and the access fails
160     the server SHOULD provide the failed URI in this header in the method response*/
161 	apt_str_t                     failed_uri;
162 	/** When a recognizer method needs a recognizer to fetch or access a URI
163     and the access fails the server MUST provide the URI specific or
164     protocol specific response code for the URI in the Failed-URI header */
165 	apt_str_t                     failed_uri_cause;
166 	/** Allows the client to request the recognizer resource to
167     save the audio input to the recognizer */
168 	apt_bool_t                    save_waveform;
169 	/** MAY be specified in a RECOGNIZE request and allows the
170     client to tell the server that, from this point on, further input
171     audio comes from a different audio source */
172 	apt_bool_t                    new_audio_channel;
173 	/** Specifies the language of recognition grammar data within
174     a session or request, if it is not specified within the data */
175 	apt_str_t                     speech_language;
176 
177 	/** Additional header fields for MRCP v2 */
178 	/** Specifies if the input that caused a barge-in was DTMF or speech */
179 	apt_str_t                     input_type;
180 	/** Optional header specifies a URI pointing to audio content to be
181     processed by the RECOGNIZE operation */
182 	apt_str_t                     input_waveform_uri;
183 	/** MAY be specified in a RECOGNITION-COMPLETE event coming from
184     the recognizer resource to the client */
185 	apt_str_t                     completion_reason;
186 	/** Tells the server resource the Media Type in which to store captured
187 	audio such as the one captured and returned by the Waveform-URI header */
188 	apt_str_t                     media_type;
189 	/** Lets the client request the server to buffer the
190     utterance associated with this recognition request into a buffer
191     available to a co-resident verification resource */
192 	apt_bool_t                    ver_buffer_utterance;
193 	/** Specifies what mode the RECOGNIZE method will operate in */
194 	apt_str_t                     recognition_mode;
195 	/** Specifies what will happen if the client attempts to
196     invoke another RECOGNIZE method when this RECOGNIZE request is
197     already in progress for the resource*/
198 	apt_bool_t                    cancel_if_queue;
199 	/** Specifies the maximum length of an utterance (in seconds) that will
200     be considered for Hotword recognition */
201 	apr_size_t                    hotword_max_duration;
202 	/** Specifies the minimum length of an utterance (in seconds) that will
203     be considered for Hotword recognition */
204 	apr_size_t                    hotword_min_duration;
205 	/** Provides a pointer to the text for which a natural language interpretation is desired */
206 	apt_str_t                     interpret_text;
207 	/** MAY be specified in a GET-PARAMS or SET-PARAMS method and
208     is used to specify the size in time, in milliseconds, of the
209     typeahead buffer for the recognizer */
210 	apr_size_t                    dtmf_buffer_time;
211 	/** MAY be specified in a RECOGNIZE method and is used to
212     tell the recognizer to clear the DTMF type-ahead buffer before
213     starting the recognize */
214 	apt_bool_t                    clear_dtmf_buffer;
215 	/** MAY be specified in a RECOGNIZE method and is used to
216     tell the recognizer that it MUST not wait for the end of speech
217     before processing the collected speech to match active grammars */
218 	apt_bool_t                    early_no_match;
219 	/** MAY be specified in a START-PHRASE-ENROLLMENT, "SET-PARAMS", or
220 	"GET-PARAMS" method and is used to specify the minimum number of
221 	consistent pronunciations that must be obtained to voice enroll a new phrase */
222 	apr_size_t                    num_min_consistent_pronunciations;
223 	/** MAY be sent as part of the START-PHRASE-ENROLLMENT,"SET-PARAMS", or
224 	"GET-PARAMS" method and is used during voice-enrollment to specify how similar
225 	to a previously enrolled pronunciation of the same phrase an utterance needs
226 	to be in order to be considered "consistent" */
227 	float                         consistency_threshold;
228 	/** MAY be sent as part of the START-PHRASE-ENROLLMENT, SET-PARAMS, or
229 	"GET-PARAMS" method and is used during voice-enrollment to specify
230 	how similar the pronunciations of two different phrases can be
231 	before they are considered to be clashing */
232 	float                         clash_threshold;
233 	/** Specifies the speaker-trained grammar to be used or
234 	referenced during enrollment operations */
235 	apt_str_t                     personal_grammar_uri;
236 	/** MAY be specified in the RECOGNIZE method. If this header
237 	is set to "true" and an Enrollment is active, the RECOGNIZE command
238 	MUST add the collected utterance to the personal grammar that is
239 	being enrolled */
240 	apt_bool_t                    enroll_utterance;
241 	/** Identifies a phrase in an existing personal grammar for which
242 	enrollment is desired.  It is also returned to the client in the
243 	RECOGNIZE complete event */
244 	apt_str_t                     phrase_id;
245 	/** Specifies the interpreted text to be returned when the
246 	phrase is recognized */
247 	apt_str_t                     phrase_nl;
248 	/** Represents the occurrence likelihood of a phrase in an enrolled grammar */
249 	float                         weight;
250 	/** Allows the client to request the recognizer resource to
251 	save the audio stream for the best repetition of the phrase that was
252 	used during the enrollment session */
253 	apt_bool_t                    save_best_waveform;
254 	/** Replaces the id used to identify the phrase in a personal grammar */
255 	apt_str_t                     new_phrase_id;
256 	/** Specifies a grammar that defines invalid phrases for enrollment */
257 	apt_str_t                     confusable_phrases_uri;
258 	/** Can optionally be specified in the END-PHRASE-ENROLLMENT
259 	method to abort the phrase enrollment, rather than committing the
260 	phrase to the personal grammar */
261 	apt_bool_t                    abort_phrase_enrollment;
262 };
263 
264 
265 /** Get recognizer header vtable */
266 const mrcp_header_vtable_t* mrcp_recog_header_vtable_get(mrcp_version_e version);
267 
268 /** Get recognizer completion cause string */
269 MRCP_DECLARE(const apt_str_t*) mrcp_recog_completion_cause_get(mrcp_recog_completion_cause_e completion_cause, mrcp_version_e version);
270 
271 APT_END_EXTERN_C
272 
273 #endif /* MRCP_RECOG_HEADER_H */
274