1"""
2This module gathers processing (i.e. tokenization) classes.
3
4Class summary
5=============
6
7.. autosummary::
8
9        StreamTokenizer
10"""
11
12from auditok.util import DataValidator
13
14__all__ = ["StreamTokenizer"]
15
16
class StreamTokenizer:
    """
    Class for stream tokenizers. It implements a 4-state automaton scheme
    to extract sub-sequences of interest on the fly.

    :Parameters:

        `validator` :
            instance of `DataValidator` that implements the `is_valid` method.

        `min_length` : *(int)*
            Minimum number of frames of a valid token. This includes all \
            tolerated non-valid frames within the token.

        `max_length` : *(int)*
            Maximum number of frames of a valid token. This includes all \
            tolerated non-valid frames within the token.

        `max_continuous_silence` : *(int)*
            Maximum number of consecutive non-valid frames within a token.
            Note that, within a valid token, there may be many tolerated \
            *silent* regions, each containing up to `max_continuous_silence` \
            non-valid frames.

        `init_min` : *(int, default=0)*
            Minimum number of consecutive valid frames that must be **initially** \
            gathered before any sequence of non-valid frames can be tolerated. This
            option is not always needed; it can be used to drop non-valid tokens as
            early as possible. **Default = 0** means that the option is
            ineffective by default.

        `init_max_silence` : *(int, default=0)*
            Maximum number of tolerated consecutive non-valid frames while the \
            number of already gathered valid frames has not yet reached `init_min`.
            This argument is normally used together with `init_min`. **Default = 0**
            means that by default this argument is not taken into consideration.

        `mode` : *(int, default=0)*
            `mode` can be:

        1. `StreamTokenizer.STRICT_MIN_LENGTH`:
        if token *i* is delivered because `max_length`
        is reached, and token *i+1* is immediately adjacent to
        token *i* (i.e. token *i* ends at frame *k* and token *i+1* starts
        at frame *k+1*), then accept token *i+1* only if it has a length of at
        least `min_length`. The default behavior is to accept token *i+1*
        even if it is shorter than `min_length` (provided, of course, that the
        above conditions are fulfilled).

        :Examples:

        In the following code, without `STRICT_MIN_LENGTH`, the 'BB' token is
        accepted although it is shorter than `min_length` (3), because it immediately
        follows the latest delivered token:

        .. code:: python

            from auditok import StreamTokenizer, StringDataSource, DataValidator

            class UpperCaseChecker(DataValidator):
                def is_valid(self, frame):
                    return frame.isupper()


            dsource = StringDataSource("aaaAAAABBbbb")
            tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                        min_length=3,
                                        max_length=4,
                                        max_continuous_silence=0)

            tokenizer.tokenize(dsource)


        :output:

        .. code:: python

            [(['A', 'A', 'A', 'A'], 3, 6), (['B', 'B'], 7, 8)]


        The following tokenizer will, however, reject the 'BB' token:

        .. code:: python

            dsource = StringDataSource("aaaAAAABBbbb")
            tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                        min_length=3, max_length=4,
                                        max_continuous_silence=0,
                                        mode=StreamTokenizer.STRICT_MIN_LENGTH)
            tokenizer.tokenize(dsource)

        :output:

        .. code:: python

            [(['A', 'A', 'A', 'A'], 3, 6)]

        2. `StreamTokenizer.DROP_TRAILING_SILENCE`: drop all trailing non-valid frames
        from a token to be delivered if and only if it is not **truncated**.
        This can be a bit tricky. A token is actually delivered if:

        - a. `max_continuous_silence` is reached

        :or:

        - b. Its length reaches `max_length`. This is called a **truncated** token.

        In the current implementation, a `StreamTokenizer`'s decision is only based on
        already seen data and on incoming data. Thus, if a token is truncated at a
        non-valid but tolerated frame (`max_length` is reached but `max_continuous_silence`
        is not yet), any trailing silence will be kept because it can potentially be
        part of a valid token (had `max_length` been larger). But if
        `max_continuous_silence` is reached before `max_length`, the delivered token is
        not considered truncated but the result of a *normal* end of detection
        (i.e. no more valid data). In that case the trailing silence can be removed
        if you use the `StreamTokenizer.DROP_TRAILING_SILENCE` mode.

        :Example:

        .. code:: python

             tokenizer = StreamTokenizer(validator=UpperCaseChecker(), min_length=3,
                                         max_length=6, max_continuous_silence=3,
                                         mode=StreamTokenizer.DROP_TRAILING_SILENCE)

             dsource = StringDataSource("aaaAAAaaaBBbbbb")
             tokenizer.tokenize(dsource)

        :output:

        .. code:: python

            [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B'], 9, 10)]

        The first token is delivered with its trailing silence because it is truncated,
        while the second one has its trailing frames removed.

        Without `StreamTokenizer.DROP_TRAILING_SILENCE` the output would be:

        .. code:: python

            [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8), (['B', 'B', 'b', 'b', 'b'], 9, 13)]


        3. `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`:
        use both options. That means: first remove trailing silence, then check whether
        the token still has a length of at least `min_length` (see the sketch below).
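
        A sketch combining both flags, reusing `UpperCaseChecker` and the input
        string from the previous example: the trailing 'b' frames of the second
        token are removed first, after which 'BB' is shorter than `min_length`
        and is therefore rejected:

        .. code:: python

            tokenizer = StreamTokenizer(validator=UpperCaseChecker(),
                                        min_length=3, max_length=6,
                                        max_continuous_silence=3,
                                        mode=StreamTokenizer.STRICT_MIN_LENGTH
                                        | StreamTokenizer.DROP_TRAILING_SILENCE)

            dsource = StringDataSource("aaaAAAaaaBBbbbb")
            tokenizer.tokenize(dsource)

        :output:

        .. code:: python

            [(['A', 'A', 'A', 'a', 'a', 'a'], 3, 8)]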
    """

    # automaton states
    SILENCE = 0            # not inside a token, waiting for a valid frame
    POSSIBLE_SILENCE = 1   # inside a token, reading tolerated non-valid frames
    POSSIBLE_NOISE = 2     # some valid frames gathered, but fewer than `init_min`
    NOISE = 3              # inside a token, reading valid frames

    # mode flags
    STRICT_MIN_LENGTH = 2
    DROP_TRAILING_SILENCE = 4
    # alias kept for backward compatibility
    DROP_TAILING_SILENCE = 4

    def __init__(self, validator,
                 min_length, max_length, max_continuous_silence,
                 init_min=0, init_max_silence=0,
                 mode=0):

        if not isinstance(validator, DataValidator):
            raise TypeError("'validator' must be an instance of 'DataValidator'")

        if max_length <= 0:
            raise ValueError("'max_length' must be > 0 (value={0})".format(max_length))

        if min_length <= 0 or min_length > max_length:
            raise ValueError("'min_length' must be > 0 and <= 'max_length' (value={0})".format(min_length))

        if max_continuous_silence >= max_length:
            raise ValueError("'max_continuous_silence' must be < 'max_length' (value={0})".format(max_continuous_silence))

        if init_min >= max_length:
            raise ValueError("'init_min' must be < 'max_length' (value={0})".format(init_min))

        self.validator = validator
        self.min_length = min_length
        self.max_length = max_length
        self.max_continuous_silence = max_continuous_silence
        self.init_min = init_min
        self.init_max_silence = init_max_silence

        # `set_mode` also sets `_strict_min_length` and `_drop_trailing_silence`
        self._mode = None
        self.set_mode(mode)

        self._deliver = None
        self._tokens = None
        self._state = None
        self._data = None
        self._contiguous_token = False

        self._init_count = 0
        self._silence_length = 0
        self._start_frame = 0
        self._current_frame = 0

    def set_mode(self, mode):
        """
        :Parameters:

            `mode` : *(int)*
                New mode, must be one of:

            - `StreamTokenizer.STRICT_MIN_LENGTH`

            - `StreamTokenizer.DROP_TRAILING_SILENCE`

            - `StreamTokenizer.STRICT_MIN_LENGTH | StreamTokenizer.DROP_TRAILING_SILENCE`

            - `0`

        See `StreamTokenizer.__init__` for more information about the mode.
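
        :Example:

        .. code:: python

            # an illustrative sketch, assuming `tokenizer` is an existing
            # StreamTokenizer instance: activate both modes at once
            tokenizer.set_mode(StreamTokenizer.STRICT_MIN_LENGTH |
                               StreamTokenizer.DROP_TRAILING_SILENCE)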
        """

        if mode not in (self.STRICT_MIN_LENGTH, self.DROP_TRAILING_SILENCE,
                        self.STRICT_MIN_LENGTH | self.DROP_TRAILING_SILENCE, 0):

            raise ValueError("Wrong value for mode")

        self._mode = mode
        self._strict_min_length = (mode & self.STRICT_MIN_LENGTH) != 0
        self._drop_trailing_silence = (mode & self.DROP_TRAILING_SILENCE) != 0

    def get_mode(self):
        """
        Return the current mode. To check whether a specific mode is activated use
        the bitwise 'and' operator `&`. Example:

        .. code:: python

            if tokenizer.get_mode() & StreamTokenizer.STRICT_MIN_LENGTH != 0:
               do_something()
        """
        return self._mode

    def _reinitialize(self):
        self._contiguous_token = False
        self._data = []
        self._tokens = []
        self._state = self.SILENCE
        self._current_frame = -1
        self._deliver = self._append_token

    def tokenize(self, data_source, callback=None):
        """
        Read data from `data_source`, one frame at a time, and process each read
        frame in order to detect sequences of frames that make up valid tokens.

        :Parameters:
           `data_source` : instance of the :class:`DataSource` class that implements a `read` method.
               `read` should return a slice of the signal, i.e. a frame (of whatever \
               type as long as it can be processed by the validator), and None if \
               there is no more signal.

           `callback` : an optional 3-argument function.
               If a `callback` function is given, it will be called each time a valid token
               is found.


        :Returns:
           A list of tokens if `callback` is None. Each token is a tuple with the
           following elements:

            .. code:: python

                (data, start, end)

           where `data` is a list of read frames, `start` is the index of the first
           frame in the original data, and `end` is the index of the last frame.
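
        As an illustration, a minimal sketch of the callback form, assuming the
        `UpperCaseChecker` validator and `dsource` data source from the class
        examples above:

        .. code:: python

            def print_token(data, start, end):
                # receives the same (data, start, end) triple that would
                # otherwise be appended to the returned list
                print("token: {0}, start: {1}, end: {2}".format(data, start, end))

            tokenizer.tokenize(dsource, callback=print_token)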
        """

        self._reinitialize()

        if callback is not None:
            self._deliver = callback

        while True:
            frame = data_source.read()
            if frame is None:
                break
            self._current_frame += 1
            self._process(frame)

        self._post_process()

        if callback is None:
            _ret = self._tokens
            self._tokens = None
            return _ret

    def _process(self, frame):

        frame_is_valid = self.validator.is_valid(frame)

        if self._state == self.SILENCE:

            if frame_is_valid:
                # seems we got a valid frame after a silence
                self._init_count = 1
                self._silence_length = 0
                self._start_frame = self._current_frame
                self._data.append(frame)

                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        self._process_end_of_detection(True)
                else:
                    self._state = self.POSSIBLE_NOISE

        elif self._state == self.POSSIBLE_NOISE:

            if frame_is_valid:
                self._silence_length = 0
                self._init_count += 1
                self._data.append(frame)
                if self._init_count >= self.init_min:
                    self._state = self.NOISE
                    if len(self._data) >= self.max_length:
                        self._process_end_of_detection(True)

            else:
                self._silence_length += 1
                if self._silence_length > self.init_max_silence or \
                   len(self._data) + 1 >= self.max_length:
                    # either init_max_silence or max_length is reached
                    # before _init_count, back to silence
                    self._data = []
                    self._state = self.SILENCE
                else:
                    self._data.append(frame)

        elif self._state == self.NOISE:

            if frame_is_valid:
                self._data.append(frame)
                if len(self._data) >= self.max_length:
                    self._process_end_of_detection(True)

            elif self.max_continuous_silence <= 0:
                # max token reached at this frame will _deliver if _contiguous_token
                # and not _strict_min_length
                self._process_end_of_detection()
                self._state = self.SILENCE

            else:
                # this is the first silent frame following a valid one
                # and it is tolerated
                self._silence_length = 1
                self._data.append(frame)
                self._state = self.POSSIBLE_SILENCE
                if len(self._data) == self.max_length:
                    self._process_end_of_detection(True)
                    # don't reset _silence_length because we still
                    # need to know the total number of silent frames

        elif self._state == self.POSSIBLE_SILENCE:

            if frame_is_valid:
                self._data.append(frame)
                self._silence_length = 0
                self._state = self.NOISE
                if len(self._data) >= self.max_length:
                    self._process_end_of_detection(True)

            else:
                if self._silence_length >= self.max_continuous_silence:
                    if self._silence_length < len(self._data):
                        # deliver only if the gathered frames aren't all silent
                        self._process_end_of_detection()
                    else:
                        self._data = []
                    self._state = self.SILENCE
                    self._silence_length = 0
                else:
                    self._data.append(frame)
                    self._silence_length += 1
                    if len(self._data) >= self.max_length:
                        self._process_end_of_detection(True)
                        # don't reset _silence_length because we still
                        # need to know the total number of silent frames

    def _post_process(self):
        if self._state == self.NOISE or self._state == self.POSSIBLE_SILENCE:
            if len(self._data) > 0 and len(self._data) > self._silence_length:
                self._process_end_of_detection()

    def _process_end_of_detection(self, truncated=False):

        if not truncated and self._drop_trailing_silence and self._silence_length > 0:
            # happens if max_continuous_silence is reached
            # or max_length is reached at a silent frame
            self._data = self._data[0:-self._silence_length]

        if (len(self._data) >= self.min_length) or \
           (len(self._data) > 0 and
            not self._strict_min_length and self._contiguous_token):

            _end_frame = self._start_frame + len(self._data) - 1
            self._deliver(self._data, self._start_frame, _end_frame)

            if truncated:
                # next token (if any) will start at _current_frame + 1
                self._start_frame = self._current_frame + 1
                # remember that it is contiguous with the just delivered one
                self._contiguous_token = True
            else:
                self._contiguous_token = False
        else:
            self._contiguous_token = False

        self._data = []

    def _append_token(self, data, start, end):
        self._tokens.append((data, start, end))