# Copyright (C) 2016, 2018  Olga Yakovleva <yakovleva.o.v@gmail.com>

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 2.1 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
15
# Consonant: exactly one consonant letter.
# The union is grouped as Cyrillic lowercase, Cyrillic uppercase,
# Latin lowercase and Latin uppercase. In the Latin groups y and w
# are listed among the consonants.
define Consonant [
  [ б | г | д | ж | җ | з | к | л | м | н | п | р | с | т | ф | х | һ | ц | ч | ш | щ | й | в | ң | ғ | қ ] |
  [ Б | Г | Д | Ж | Җ | З | К | Л | М | Н | П | Р | С | Т | Ф | Х | Һ | Ц | Ч | Ш | Щ | Й | В | Ң | Ғ | Қ ] |
  [ b | v | w | g | ğ | d | j | c | z | y | k | q | l | m | n | p | r | s | t | f | x | h ] |
  [ B | V | W | G | Ğ | D | J | C | Z | Y | K | Q | L | M | N | P | R | S | T | F | X | H ]
] ;
20
# Vowel: exactly one vowel letter.
# The union is grouped as Cyrillic lowercase, Cyrillic uppercase,
# Latin lowercase and Latin uppercase.
# NOTE(review): the Cyrillic uppercase group lists І twice. A repeated
# member does not change a union, so it is kept as written. Confirm that
# the second occurrence was not meant to be a different letter.
define Vowel [
  [ а | ә | е | и | о | у | ю | я | ы | э | ө | ү | і ] |
  [ А | Ә | Е | И | І | О | У | Ю | Я | Ы | Э | Ө | Ү | І ] |
  [ a | e | i | o | u ] |
  [ A | E | I | O | U ]
] ;
25
# Letter: one alphabetic symbol, that is a Vowel, a Consonant,
# or one of the two signs ъ and ь.
define Letter [ Vowel | Consonant | ъ | ь ] ;
28
# Word: a non-empty run of Letter symbols.
define Word [ Letter Letter* ] ;
31
# NZDigit: one decimal digit other than zero.
define NZDigit [ 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ] ;
# Digit: one decimal digit. Zero is written %0 because a bare 0
# denotes the empty string in the regular-expression syntax.
define Digit [ NZDigit | %0 ] ;
34
# Number: a decimal numeral without leading zeros, meaning either a
# non-zero digit followed by any digits, or the single digit zero.
define Number [ [ NZDigit Digit* ] | %0 ] ;
36
# ConsonantSequence: a non-empty run made only of Consonant symbols.
define ConsonantSequence [ Consonant Consonant* ] ;
39
# LetterSequence: a letter run that contains no vowel, meaning either a
# ConsonantSequence or a non-empty run of the signs ъ and ь.
# A run that mixes consonants with the signs matches neither alternative.
define LetterSequence [ [ ъ | ь ]+ | ConsonantSequence ] ;
42
# BasicTokenizer: parallel left-to-right longest-match markup rules.
# Each rule keeps the matched span and inserts a tag symbol after it:
#   word  after a run of letters
#   num   after a digit string that starts with a non-zero digit
#   dig   after a zero followed by one or more digits
#   num   after a zero that is not followed by a digit
#         (longest-match lets the dig rule win otherwise)
#   sym   after any single symbol that is neither a letter nor a digit
# The empty context on the first rule places no restriction on where
# it applies. It is kept exactly as the author wrote it.
define BasicTokenizer
  [ Word ]               @-> ... word || _ ,,
  [ NZDigit Digit* ]     @-> ... num ,,
  [ %0 Digit+ ]          @-> ... dig ,,
  [ %0 ]                 @-> ... num ,,
  [ \[ Letter | Digit ] ] @-> ... sym ;
49
# TagLetterSequences: rewrites a word tag to lseq when the material
# directly before it is a LetterSequence that itself starts at the
# beginning of the string or right after another tag symbol. In the
# tokenizer output this retags tokens that consist only of consonants
# or only of the signs ъ and ь.
define TagLetterSequences
  [ word ] -> [ lseq ] || [ .#. | sym | dig | num | word ] LetterSequence _ ;
52
# UntagLargeNumbers: rewrites a num tag to dig when thirteen digits stand
# directly before it. The left context is not anchored, so any digit
# string of thirteen or more digits is affected.
define UntagLargeNumbers [ num ] -> [ dig ] || Digit^13 _ ;
55
# Final transducer: tokenize first, then retag vowel-less letter runs,
# then retag very long digit strings. The stages are composed in that order.
regex [ BasicTokenizer .o. TagLetterSequences .o. UntagLargeNumbers ] ;
60