1#!/usr/bin/env perl 2# Resolve Serbian hybridized Cyrillic Ijekavian/Ekavian text. 3# 4# Hybridized Serbian Cyrillic text may contain alternatives directives 5# by script (~@) and by dialect (~#): 6# 7# Поређано ~@/азбучним/abecednim/ редоследом. 8# Можда и ~#/смеју/смију/ да се појаве. 9# 10# which are resolved into one of the alternatives depending on target 11# dialect and script combination. 12# 13# Alternatives directives by script are needed only when 14# direct Cyrillic to Latin transliteration is not sufficient; 15# for Latin combinations, text outside of alternatives by script 16# is automatically transliterated. 17# 18# Alternatives by dialect should be rare, as dialect hybridization is normally 19# performed by inserting jat-reflex ticks (›, ‹, ◃, ▹) into Ijekavian text: 20# 21# Пром›јене ће одмах бити заб‹иљежене. 22# 23# Text with jat-reflex ticks is resolved to clean Ijekavian by simply 24# removing the marks, and to Ekavian by applying a mapping table. 25# 26# Text is input through standard output and output to standard output. 27# Input text must be UTF-8 encoded, and output is UTF-8 as well. 28# 29# Chusslove Illich <caslav.ilic@gmx.net> 30 31use strict; 32use warnings; 33use utf8; 34 35binmode(STDIN, ":utf8"); 36binmode(STDOUT, ":utf8"); 37 38$0 =~ s/.*\///; 39sub error { die "$0: @_\n"; } 40sub warning { warn "$0: @_\n"; } 41 42sub show_usage 43{ 44 die "Usage: $0 [ec|el|ic|il]\n"; 45} 46 47# Resolve alternatives directives in text, 48# given the alternative head, selected alternative (1-based) 49# and total number of alternatives per directive. 50sub resalts 51{ 52 my ($text, $althead, $select, $total) = @_; 53 54 my $althlen = length($althead); 55 56 my $rtext; 57 my $malformed = 0; 58 my $p = -1; 59 my $pp; 60 my $errtext; 61 while (1) { 62 $pp = $p + 1; 63 $p = index($text, $althead, $pp); 64 if ($p < 0) { 65 $rtext .= substr($text, $pp); 66 last; 67 } 68 my $ps = $p; 69 70 # Append segment prior to alternatives directive to the result. 71 $rtext .= substr($text, $pp, $p - $pp); 72 $errtext = substr($text, $p, $p + 30); # text segment for error report 73 74 # Must have at least 2 characters after the head. 75 if (length($text) < $p + $althlen + 2) { 76 $malformed = 1; 77 last; 78 } 79 80 # Read the separating character. 81 $p += $althlen; 82 my $sep = substr($text, $p, 1); 83 84 # Parse requested number of inserts, 85 # choose the one with matching index for the result. 86 my @alts; 87 for (my $i = 0; $i < $total; ++$i) { 88 $pp = $p + 1; 89 $p = index($text, $sep, $pp); 90 # Must have exactly the given total number of alternatives. 91 if ($p < 0) { 92 $malformed = 1; 93 last; 94 } 95 push(@alts, substr($text, $pp, $p - $pp)); 96 } 97 last if $malformed; 98 99 # Replace the alternative if admissible, or leave directive untouched. 100 my $isel = $select - 1; 101 if ($isel < @alts) { 102 $rtext .= $alts[$isel]; 103 } else { 104 $rtext .= substr($text, $ps, $p + 1 - $ps); 105 } 106 } 107 108 if ($malformed) { 109 $rtext = $text; 110 warning("Malformed alternatives directive at '$errtext', skipped."); 111 } 112 113 return $rtext; 114} 115 116# Transliteration table Serbian Cyrillic->Latin. 117my %map_ctol = ( 118 'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g', 'д' => 'd', 'ђ' => 'đ', 119 'е' => 'e', 'ж' => 'ž', 'з' => 'z', 'и' => 'i', 'ј' => 'j', 'к' => 'k', 120 'л' => 'l', 'љ' => 'lj','м' => 'm', 'н' => 'n', 'њ' => 'nj','о' => 'o', 121 'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't', 'ћ' => 'ć', 'у' => 'u', 122 'ф' => 'f', 'х' => 'h', 'ц' => 'c', 'ч' => 'č', 'џ' => 'dž','ш' => 'š', 123 'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G', 'Д' => 'D', 'Ђ' => 'Đ', 124 'Е' => 'E', 'Ж' => 'Ž', 'З' => 'Z', 'И' => 'I', 'Ј' => 'J', 'К' => 'K', 125 'Л' => 'L', 'Љ' => 'Lj','М' => 'M', 'Н' => 'N', 'Њ' => 'Nj','О' => 'O', 126 'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T', 'Ћ' => 'Ć', 'У' => 'U', 127 'Ф' => 'F', 'Х' => 'H', 'Ц' => 'C', 'Ч' => 'Č', 'Џ' => 'Dž','Ш' => 'Š', 128 # accented NFC: 129 'ѐ' => 'è', 'ѝ' => 'ì', 'ӣ' => 'ī', 'ӯ' => 'ū', 130 'Ѐ' => 'È', 'Ѝ' => 'Ì', 'Ӣ' => 'Ī', 'Ӯ' => 'Ū', 131 # frequent accented from NFD to NFC (keys now 2-char): 132 'а̂' => 'â', 'о̂' => 'ô', 'а̑' => 'ȃ', 'о̑' => 'ȏ', 133); 134 135# Transliterate Cyrillic text to Latin. 136sub ctol 137{ 138 my ($text) = @_; 139 my $tlen = length($text); 140 my $ntext = ""; 141 for (my $i = 0; $i < $tlen; ++$i) { 142 my $c = substr($text, $i, 1); 143 my $c2 = substr($text, $i, 2); 144 my $r = ($map_ctol{$c2} or $map_ctol{$c}); 145 if ($r) { 146 my $cp = $i + 1 < $tlen ? substr($text, $i + 1, 1) : ""; 147 my $cn = $i > 0 ? substr($text, $i - 1, 1) : ""; 148 if ( length($r) > 1 and $c =~ /[[:upper:]]/ 149 and ($cn =~ /[[:upper:]]/ or $cp =~ /[[:upper:]]/)) 150 { 151 $ntext .= uc($r); 152 } else { 153 $ntext .= $r; 154 } 155 } else { 156 $ntext .= $c; 157 } 158 } 159 return $ntext; 160} 161 162# Resolve hybrid Cyrillic/Latin text into clean Cyrillic. 163sub hctoc 164{ 165 my ($text) = @_; 166 my $ntext; 167 $ntext = resalts($text, '~@', 1, 2); 168 return $ntext; 169} 170 171# Resolve hybrid Cyrillic/Latin text into clean Latin. 172sub hctol 173{ 174 my ($text) = @_; 175 my $ntext; 176 $ntext = ctol($text); # FIXME: Do not convert inside alt directives. 177 $ntext = resalts($ntext, '~@', 2, 2); 178 return $ntext; 179} 180 181# Ijekavian to Ekavian map (Latin script and letter cases derived afterwards). 182my @reflex_spec = ( 183 ['›', { 184 'ије' => 'е', 185 'је' => 'е', 186 }], 187 ['‹', { 188 'иј' => 'еј', 189 'иљ' => 'ел', 190 'ио' => 'ео', 191 'ље' => 'ле', 192 'ње' => 'не', 193 }], 194 ['▹', { 195 'ије' => 'и', 196 'је' => 'и', 197 }], 198 ['◃', { 199 'ијел' => 'ео', 200 'ијен' => 'ењ', 201 'ил' => 'ел', 202 'ит' => 'ет', 203 'јел' => 'ео', 204 'тн' => 'тњ', 205 'шње' => 'сне', 206 }], 207); 208 209# Derive data for dehybridization. 210my @reflex_spec_dehyb; 211for my $refgrp (@reflex_spec) { 212 my $tick = $refgrp->[0]; 213 my $refmap = $refgrp->[1]; 214 # Derive Latin mappings (must be fully done before different cases). 215 for my $ijkfrm (keys %{$refmap}) { 216 my $ekvfrm = $refmap->{$ijkfrm}; 217 $refmap->{ctol($ijkfrm)} = ctol($ekvfrm); 218 } 219 # Derive mappings with different cases. 220 for my $ijkfrm (keys %{$refmap}) { 221 my $ekvfrm = $refmap->{$ijkfrm}; 222 $refmap->{ucfirst($ijkfrm)} = ucfirst($ekvfrm); 223 $refmap->{uc($ijkfrm)} = uc($ekvfrm); 224 } 225 # Compute minimum and maximum reflex lengths. 226 my $reflen_min = 0; 227 my $reflen_max = 0; 228 for my $ijkfrm (keys %{$refmap}) { 229 my $reflen = length($ijkfrm); 230 $reflen_max = $reflen if $reflen_max < $reflen; 231 $reflen_min = $reflen if $reflen_min > $reflen; 232 } 233 # Derivation for current group done. 234 push @reflex_spec_dehyb, [$tick, $refmap, $reflen_min, $reflen_max]; 235} 236 237 238# Resolve hybrid Ijekavian text into clean Ekavian. 239sub hitoe 240{ 241 my ($text) = @_; 242 return hito_w($text, 0); 243} 244 245# Resolve hybrid Ijekavian text into clean Ijekavian. 246sub hitoi 247{ 248 my ($text) = @_; 249 return hito_w($text, 1); 250} 251 252sub hito_w 253{ 254 my ($text, $toijek) = @_; 255 256 for my $refgrp (@reflex_spec_dehyb) { 257 $text = hito_w_simple($text, @{$refgrp}, $toijek); 258 } 259 $text = resalts($text, '~#', (!$toijek? 1 : 2), 2); 260 261 return $text; 262} 263 264sub hito_w_simple 265{ 266 my ($text, $tick, $refmap, $reflen_min, $reflen_max, $toijek) = @_; 267 268 my $ntext; 269 my $p = 0; 270 while (1) { 271 my $pp = $p; 272 $p = index($text, $tick, $p); 273 if ($p < 0) { 274 $ntext .= substr($text, $pp); 275 last; 276 } 277 $ntext .= substr($text, $pp, $p - $pp); 278 $pp = $p; 279 $p += length($tick); 280 if ($p >= length($text) or substr($text, $p, 1) !~ /\w/) { 281 $ntext .= $tick; 282 next; 283 } 284 285 my $reflen = $reflen_min; 286 my ($ijkfrm, $ekvfrm); 287 while ($reflen <= $reflen_max and !$ekvfrm) { 288 $ijkfrm = substr($text, $p, $reflen); 289 $ekvfrm = $refmap->{$ijkfrm}; 290 $reflen += 1; 291 } 292 293 if ($ekvfrm) { 294 $ntext .= (!$toijek ? $ekvfrm : $ijkfrm); 295 $p += length($ijkfrm); 296 } else { 297 $ntext .= $tick; 298 my $dtext = substr($text, $pp, 20); 299 warning("Unknown jat-reflex starting from '$dtext'."); 300 } 301 } 302 return $ntext; 303} 304 305 306sub main 307{ 308 @ARGV == 1 or show_usage(); 309 my $dstarget = shift @ARGV; 310 $dstarget =~ /^(ec|el|ic|il)$/ or show_usage(); 311 312 my $resf; 313 if ($dstarget eq "ec") { 314 $resf = sub { return hitoe(hctoc($_[0])); } 315 } elsif ($dstarget eq "el") { 316 $resf = sub { return hitoe(hctol($_[0])); } 317 } elsif ($dstarget eq "ic") { 318 $resf = sub { return hitoi(hctoc($_[0])); } 319 } else { 320 $resf = sub { return hitoi(hctol($_[0])); } 321 } 322 323 while (<STDIN>) { 324 print $resf->($_); 325 } 326} 327 328main(); 329