1#!/usr/bin/env perl
2# Resolve Serbian hybridized Cyrillic Ijekavian/Ekavian text.
3#
4# Hybridized Serbian Cyrillic text may contain alternatives directives
5# by script (~@) and by dialect (~#):
6#
7#   Поређано ~@/азбучним/abecednim/ редоследом.
8#   Можда и ~#/смеју/смију/ да се појаве.
9#
10# which are resolved into one of the alternatives depending on target
11# dialect and script combination.
12#
13# Alternatives directives by script are needed only when
14# direct Cyrillic to Latin transliteration is not sufficient;
15# for Latin combinations, text outside of alternatives by script
16# is automatically transliterated.
17#
18# Alternatives by dialect should be rare, as dialect hybridization is normally
19# performed by inserting jat-reflex ticks (›, ‹, ◃, ▹) into Ijekavian text:
20#
21#   Пром›јене ће одмах бити заб‹иљежене.
22#
23# Text with jat-reflex ticks is resolved to clean Ijekavian by simply
24# removing the marks, and to Ekavian by applying a mapping table.
25#
26# Text is input through standard output and output to standard output.
27# Input text must be UTF-8 encoded, and output is UTF-8 as well.
28#
29# Chusslove Illich <caslav.ilic@gmx.net>
30
31use strict;
32use warnings;
33use utf8;
34
35binmode(STDIN, ":utf8");
36binmode(STDOUT, ":utf8");
37
38$0 =~ s/.*\///;
39sub error { die "$0: @_\n"; }
40sub warning { warn "$0: @_\n"; }
41
42sub show_usage
43{
44    die "Usage: $0 [ec|el|ic|il]\n";
45}
46
47# Resolve alternatives directives in text,
48# given the alternative head, selected alternative (1-based)
49# and total number of alternatives per directive.
50sub resalts
51{
52    my ($text, $althead, $select, $total) = @_;
53
54    my $althlen = length($althead);
55
56    my $rtext;
57    my $malformed = 0;
58    my $p = -1;
59    my $pp;
60    my $errtext;
61    while (1) {
62        $pp = $p + 1;
63        $p = index($text, $althead, $pp);
64        if ($p < 0) {
65            $rtext .= substr($text, $pp);
66            last;
67        }
68        my $ps = $p;
69
70        # Append segment prior to alternatives directive to the result.
71        $rtext .= substr($text, $pp, $p - $pp);
72        $errtext = substr($text, $p, $p + 30); # text segment for error report
73
74        # Must have at least 2 characters after the head.
75        if (length($text) < $p + $althlen + 2) {
76            $malformed = 1;
77            last;
78        }
79
80        # Read the separating character.
81        $p += $althlen;
82        my $sep = substr($text, $p, 1);
83
84        # Parse requested number of inserts,
85        # choose the one with matching index for the result.
86        my @alts;
87        for (my $i = 0; $i < $total; ++$i) {
88            $pp = $p + 1;
89            $p = index($text, $sep, $pp);
90            # Must have exactly the given total number of alternatives.
91            if ($p < 0) {
92                $malformed = 1;
93                last;
94            }
95            push(@alts, substr($text, $pp, $p - $pp));
96        }
97        last if $malformed;
98
99        # Replace the alternative if admissible, or leave directive untouched.
100        my $isel = $select - 1;
101        if ($isel < @alts) {
102            $rtext .= $alts[$isel];
103        } else {
104            $rtext .= substr($text, $ps, $p + 1 - $ps);
105        }
106    }
107
108    if ($malformed) {
109        $rtext = $text;
110        warning("Malformed alternatives directive at '$errtext', skipped.");
111    }
112
113    return $rtext;
114}
115
116# Transliteration table Serbian Cyrillic->Latin.
117my %map_ctol = (
118    'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g', 'д' => 'd', 'ђ' => 'đ',
119    'е' => 'e', 'ж' => 'ž', 'з' => 'z', 'и' => 'i', 'ј' => 'j', 'к' => 'k',
120    'л' => 'l', 'љ' => 'lj','м' => 'm', 'н' => 'n', 'њ' => 'nj','о' => 'o',
121    'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't', 'ћ' => 'ć', 'у' => 'u',
122    'ф' => 'f', 'х' => 'h', 'ц' => 'c', 'ч' => 'č', 'џ' => 'dž','ш' => 'š',
123    'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G', 'Д' => 'D', 'Ђ' => 'Đ',
124    'Е' => 'E', 'Ж' => 'Ž', 'З' => 'Z', 'И' => 'I', 'Ј' => 'J', 'К' => 'K',
125    'Л' => 'L', 'Љ' => 'Lj','М' => 'M', 'Н' => 'N', 'Њ' => 'Nj','О' => 'O',
126    'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T', 'Ћ' => 'Ć', 'У' => 'U',
127    'Ф' => 'F', 'Х' => 'H', 'Ц' => 'C', 'Ч' => 'Č', 'Џ' => 'Dž','Ш' => 'Š',
128    # accented NFC:
129    'ѐ' => 'è', 'ѝ' => 'ì', 'ӣ' => 'ī', 'ӯ' => 'ū',
130    'Ѐ' => 'È', 'Ѝ' => 'Ì', 'Ӣ' => 'Ī', 'Ӯ' => 'Ū',
131    # frequent accented from NFD to NFC (keys now 2-char):
132    'а̂' => 'â', 'о̂' => 'ô', 'а̑' => 'ȃ', 'о̑' => 'ȏ',
133);
134
135# Transliterate Cyrillic text to Latin.
136sub ctol
137{
138    my ($text) = @_;
139    my $tlen = length($text);
140    my $ntext = "";
141    for (my $i = 0; $i < $tlen; ++$i) {
142        my $c = substr($text, $i, 1);
143        my $c2 = substr($text, $i, 2);
144        my $r = ($map_ctol{$c2} or $map_ctol{$c});
145        if ($r) {
146            my $cp = $i + 1 < $tlen ? substr($text, $i + 1, 1) : "";
147            my $cn = $i > 0 ? substr($text, $i - 1, 1) : "";
148            if (    length($r) > 1 and $c =~ /[[:upper:]]/
149                and ($cn =~ /[[:upper:]]/ or $cp =~ /[[:upper:]]/))
150            {
151                $ntext .= uc($r);
152            } else {
153                $ntext .= $r;
154            }
155        } else {
156            $ntext .= $c;
157        }
158    }
159    return $ntext;
160}
161
162# Resolve hybrid Cyrillic/Latin text into clean Cyrillic.
163sub hctoc
164{
165    my ($text) = @_;
166    my $ntext;
167    $ntext = resalts($text, '~@', 1, 2);
168    return $ntext;
169}
170
171# Resolve hybrid Cyrillic/Latin text into clean Latin.
172sub hctol
173{
174    my ($text) = @_;
175    my $ntext;
176    $ntext = ctol($text); # FIXME: Do not convert inside alt directives.
177    $ntext = resalts($ntext, '~@', 2, 2);
178    return $ntext;
179}
180
181# Ijekavian to Ekavian map (Latin script and letter cases derived afterwards).
182my @reflex_spec = (
183    ['›', {
184        'ије' => 'е',
185        'је' => 'е',
186    }],
187    ['‹', {
188        'иј' => 'еј',
189        'иљ' => 'ел',
190        'ио' => 'ео',
191        'ље' => 'ле',
192        'ње' => 'не',
193    }],
194    ['▹', {
195        'ије' => 'и',
196        'је' => 'и',
197    }],
198    ['◃', {
199        'ијел' => 'ео',
200        'ијен' => 'ењ',
201        'ил' => 'ел',
202        'ит' => 'ет',
203        'јел' => 'ео',
204        'тн' => 'тњ',
205        'шње' => 'сне',
206    }],
207);
208
209# Derive data for dehybridization.
210my @reflex_spec_dehyb;
211for my $refgrp (@reflex_spec) {
212    my $tick = $refgrp->[0];
213    my $refmap = $refgrp->[1];
214    # Derive Latin mappings (must be fully done before different cases).
215    for my $ijkfrm (keys %{$refmap}) {
216        my $ekvfrm = $refmap->{$ijkfrm};
217        $refmap->{ctol($ijkfrm)} = ctol($ekvfrm);
218    }
219    # Derive mappings with different cases.
220    for my $ijkfrm (keys %{$refmap}) {
221        my $ekvfrm = $refmap->{$ijkfrm};
222        $refmap->{ucfirst($ijkfrm)} = ucfirst($ekvfrm);
223        $refmap->{uc($ijkfrm)} = uc($ekvfrm);
224    }
225    # Compute minimum and maximum reflex lengths.
226    my $reflen_min = 0;
227    my $reflen_max = 0;
228    for my $ijkfrm (keys %{$refmap}) {
229        my $reflen = length($ijkfrm);
230        $reflen_max = $reflen if $reflen_max < $reflen;
231        $reflen_min = $reflen if $reflen_min > $reflen;
232    }
233    # Derivation for current group done.
234    push @reflex_spec_dehyb, [$tick, $refmap, $reflen_min, $reflen_max];
235}
236
237
238# Resolve hybrid Ijekavian text into clean Ekavian.
239sub hitoe
240{
241    my ($text) = @_;
242    return hito_w($text, 0);
243}
244
245# Resolve hybrid Ijekavian text into clean Ijekavian.
246sub hitoi
247{
248    my ($text) = @_;
249    return hito_w($text, 1);
250}
251
252sub hito_w
253{
254    my ($text, $toijek) = @_;
255
256    for my $refgrp (@reflex_spec_dehyb) {
257        $text = hito_w_simple($text, @{$refgrp}, $toijek);
258    }
259    $text = resalts($text, '~#', (!$toijek? 1 : 2), 2);
260
261    return $text;
262}
263
264sub hito_w_simple
265{
266    my ($text, $tick, $refmap, $reflen_min, $reflen_max, $toijek) = @_;
267
268    my $ntext;
269    my $p = 0;
270    while (1) {
271        my $pp = $p;
272        $p = index($text, $tick, $p);
273        if ($p < 0) {
274            $ntext .= substr($text, $pp);
275            last;
276        }
277        $ntext .= substr($text, $pp, $p - $pp);
278        $pp = $p;
279        $p += length($tick);
280        if ($p >= length($text) or substr($text, $p, 1) !~ /\w/) {
281            $ntext .= $tick;
282            next;
283        }
284
285        my $reflen = $reflen_min;
286        my ($ijkfrm, $ekvfrm);
287        while ($reflen <= $reflen_max and !$ekvfrm) {
288            $ijkfrm = substr($text, $p, $reflen);
289            $ekvfrm = $refmap->{$ijkfrm};
290            $reflen += 1;
291        }
292
293        if ($ekvfrm) {
294            $ntext .= (!$toijek ? $ekvfrm : $ijkfrm);
295            $p += length($ijkfrm);
296        } else {
297            $ntext .= $tick;
298            my $dtext = substr($text, $pp, 20);
299            warning("Unknown jat-reflex starting from '$dtext'.");
300        }
301    }
302    return $ntext;
303}
304
305
306sub main
307{
308    @ARGV == 1 or show_usage();
309    my $dstarget = shift @ARGV;
310    $dstarget =~ /^(ec|el|ic|il)$/ or show_usage();
311
312    my $resf;
313    if ($dstarget eq "ec") {
314        $resf = sub { return hitoe(hctoc($_[0])); }
315    } elsif ($dstarget eq "el") {
316        $resf = sub { return hitoe(hctol($_[0])); }
317    } elsif ($dstarget eq "ic") {
318        $resf = sub { return hitoi(hctoc($_[0])); }
319    } else {
320        $resf = sub { return hitoi(hctol($_[0])); }
321    }
322
323    while (<STDIN>) {
324        print $resf->($_);
325    }
326}
327
328main();
329