#!/usr/bin/perl -w
# -*- coding: iso-8859-1 -*-
# $Id: ispellaff2myspell,v 1.2 2010/02/23 12:05:51 caolan Exp $
#
# (C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

use strict;
use warnings;
use Getopt::Long;

# -----------------------------------------------------------
#  File-scoped state shared by the subs and the main program
# -----------------------------------------------------------

# Option values, filled in by GetOptions below.
my $affixfile     = '';
my $bylocale      = '';
my $charset       = '';
my $debug         = '';
my $lowercase     = '';
my $myheader      = '';
my $printcomments = '';
my $replacements  = '';
my $split         = '';
my $uppercase     = '';
my $wordlist      = '';    # accepted on the command line, still unused
my $hasextraflags = '';
my %theextraflags = ();    # regex prefix => count, from --extraflags

# State of the flag whose entries are currently being collected.
my @flag_array  = ();      # formatted entry lines of the current flag
my $affix       = '';      # 'PFX' or 'SFX', '' while still in the preamble
my $myaffix     = '';      # $affix as it was when the current flag started
my $flagname    = '';
my $flagcombine = '';
my $inflags     = '';

# Uppercase/lowercase tr/// operands for the built-in charsets, as
# [ uppercase, lowercase ].  Both strings must expand to the same characters
# in the same order, only case changed.
#
# NOTE(review): the literal 8-bit characters of these tables were mis-encoded
# in the source this was rebuilt from, so they are written as hex escapes
# reconstructed from the ISO-8859 code charts.  latin0 carries the three extra
# pairs OE/oe, S-caron and Z-caron (Y-diaeresis is not included) -- please
# confirm against a pristine upstream copy.
my %case_tables = (
    latin0 => [ 'A-Z\xC0-\xD6\xD8-\xDE\xBC\xA6\xB4',
                'a-z\xE0-\xF6\xF8-\xFE\xBD\xA8\xB8' ],
    latin1 => [ 'A-Z\xC0-\xD6\xD8-\xDE',
                'a-z\xE0-\xF6\xF8-\xFE' ],
    latin2 => [ 'A-Z\xA1\xA3\xA5\xA6\xA9-\xAC\xAE\xAF\xC0-\xD6\xD8-\xDE',
                'a-z\xB1\xB3\xB5\xB6\xB9-\xBC\xBE\xBF\xE0-\xF6\xF8-\xFE' ],
    latin3 => [ 'A-Z\xA1\xA6\xAA-\xAC\xAF\xC0-\xD6\xD8-\xDE',
                'a-z\xB1\xB6\xBA-\xBC\xBF\xE0-\xF6\xF8-\xFE' ],
);

my $lowercaser;            # compiled tr/// closure, built on first use by mylc()

# Print the help text to STDOUT and terminate the program.
sub usage {
    print <<'END_USAGE';
ispellaff2myspell: A program to convert ispell affix tables to myspell format
(C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es> License: GPL

Usage:
    ispellaff2myspell [options] <affixfile>

    Options:
     --affixfile=s      Affix file
     --bylocale         Use current locale setup for upper/lowercase
                        conversion
     --charset=s        Use specified charset for upper/lowercase
                        conversion (defaults to latin1)
     --debug            Print debugging info
     --extraflags       Allow some non alphabetic flags
     --lowercase=s      Lowercase string
     --myheader=s       Header file
     --printcomments    Print commented lines in output
     --replacements=s   Replacements file
     --split=i          Split flags with more that i entries
     --uppercase=s      Uppercase string
     --wordlist=s       Still unused

    Currently allowed values for charset are: latin0, latin1, latin2, latin3

This script does not create the dict file. Something like

( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict

should do the work, with mydict.words+ being the ispell munched wordlist

END_USAGE
    exit;
}

# Print the arguments to STDERR, but only when --debug was given.
sub debugprint {
    print STDERR "@_" if $debug;
    return;
}

# Write the entries collected for the current flag to STDOUT, preceded by
# their "<affix> <flag> <combine> <count>" header, then reset the per-flag
# state.  With a positive --split value the entries are emitted in chunks of
# at most that many lines; every chunk but the last gets a trailing " S" on
# its header.
sub shipoutflag {
    my $flag_entries = scalar @flag_array;

    if ( $flag_entries != 0 ) {
        # A non-positive chunk size would make splice() remove nothing and
        # the loop below never end, so it is treated as "do not split".
        if ( $split and $split > 0 ) {
            while (@flag_array) {
                my @flag_subarray   = splice( @flag_array, 0, $split );
                my $subflag_entries = scalar @flag_subarray;
                my $more_marker     = @flag_array ? ' S' : '';
                print "$myaffix $flagname $flagcombine $subflag_entries$more_marker\n";
                print join( "\n", @flag_subarray );
                print "\n\n";
            }
        }
        else {
            print "$myaffix $flagname $flagcombine $flag_entries\n";
            print join( "\n", @flag_array );
            print "\n\n";
        }
    }
    @flag_array  = ();
    $flagname    = '';
    $flagcombine = '';
    return;
}

# Build the closure mylc() uses when --bylocale is not in effect.
# The operands come from %case_tables for a known --charset, otherwise from
# --uppercase/--lowercase.  tr/// does not interpolate variables, hence the
# string eval; it is done once here instead of once per rule line.
# Dies when no table is available or the operands do not compile.
sub _build_lowercaser {
    my ( $upper, $lower ) = ( $uppercase, $lowercase );

    if ( exists $case_tables{$charset} ) {
        ( $upper, $lower ) = @{ $case_tables{$charset} };
    }
    elsif ( not $lower and not $upper ) {
        die "Unsupported charset [$charset]

Explicitly use --lowercase=string and --uppercase=string
options. Remember that both string must match exactly, but
case changed.
";
    }

    # NOTE(review): $upper/$lower may come straight from the command line and
    # are compiled as Perl source here; a '/' in them breaks the tr///.
    my $converter = eval
        "sub { my \$text = shift; \$text =~ tr/$upper/$lower/; return \$text; }";
    die "Error: invalid uppercase/lowercase strings: $@" unless $converter;
    return $converter;
}

# Return a lowercased copy of the given string, using the current locale
# when --bylocale was given and the charset tables otherwise.
sub mylc {
    my $inputstring = shift;

    if ($bylocale) {
        use locale;    # lexically limited to this block
        return lc $inputstring;
    }

    $lowercaser = _build_lowercaser() unless $lowercaser;
    return $lowercaser->($inputstring);
}

# Return the flag name to use in the output, or '' when the flag is not
# acceptable.  A flag containing an ASCII letter is returned unchanged.
# With --extraflags, a flag matching one of the given regex prefixes is
# returned with that prefix stripped.
#
# (The former empty prototype "()" was dropped: the sub takes one argument
# and only worked because it was called with "&".)
sub validate_flag {
    my $flag = shift;

    # NOTE(review): unanchored on purpose to keep the historical behaviour;
    # any flag that merely contains a letter is accepted as it is.
    return $flag if $flag =~ m/[a-zA-Z]/;

    if ($hasextraflags) {
        # Longest prefix first, so the result does not depend on hash order.
        my @prefixes = sort { length($b) <=> length($a) or $a cmp $b }
                       keys %theextraflags;
        foreach my $prefix (@prefixes) {
            if ( $flag =~ m/^$prefix/ ) {
                $flag =~ s/^$prefix//;
                return $flag;
            }
        }
    }
    return '';
}

# Copy the replacement table found in $file to STDOUT: a "REP <count>" line
# followed by every line starting with REP, except those where REP is
# followed by whitespace and a number (an existing count line).
# Dies when the file cannot be opened.
sub process_replacements {
    my $file     = shift;
    my @replaces = ();

    open( my $replace_fh, '<', $file )
        or die "Error: Could not open replacements file: $file: $!\n";
    while ( my $line = <$replace_fh> ) {
        next unless $line =~ m/^REP\s*\D/;
        next if $line =~ m/^REP\s+[0-9]+/;
        $line =~ s/\015\012//;    # DOS line end
        $line =~ s/\015//;        # stray carriage return
        chomp $line;
        push @replaces, $line;
    }
    close $replace_fh;

    my $number = scalar @replaces;
    print "REP $number\n";
    foreach my $replace (@replaces) {
        print $replace . "\n";
    }
    return;
}

# -----------------------------------------------------------
#  Now the program starts, after the functions are defined
# -----------------------------------------------------------

GetOptions(
    'affixfile=s'    => \$affixfile,
    'bylocale'       => \$bylocale,
    'charset=s'      => \$charset,
    'debug'          => \$debug,
    'extraflags:s'   => sub {
        my ( undef, $prefix ) = @_;
        $hasextraflags = 1;
        $theextraflags{$prefix}++ if $prefix;
    },
    'lowercase=s'    => \$lowercase,
    'myheader=s'     => \$myheader,
    'printcomments'  => \$printcomments,
    'replacements=s' => \$replacements,
    'split=i'        => \$split,
    'uppercase=s'    => \$uppercase,
    'wordlist=s'     => \$wordlist,
) or usage();

# The affix file may also be given as a plain argument.
if ( not $affixfile ) {
    $affixfile = shift @ARGV or usage();
}

if ( $charset and ( $lowercase or $uppercase ) ) {
    die "Error: charset and lowercase/uppercase options
are incompatible. Use either charset or lowercase/uppercase options to
specify the patterns
";
}
elsif ( not $lowercase and not $uppercase and not $charset ) {
    $charset = "latin1";
}
elsif ( not $bylocale and ( not $lowercase or not $uppercase ) ) {
    # With only one of the two strings tr/// would silently change nothing.
    die "Error: --lowercase and --uppercase must be given together\n";
}

# --extraflags without a value: accept flags starting with a backslash.
# The key is the regex \\ , i.e. one literal backslash.
if ( scalar( keys %theextraflags ) == 0 && $hasextraflags ) {
    $theextraflags{"\\\\"}++;
}

debugprint("$affixfile $charset");

open( my $affix_fh, '<', $affixfile )
    or die "Error: Could not open affix file: $affixfile: $!\n";

if ($myheader) {
    # Read the header ourselves; the file name never goes through a shell.
    open( my $header_fh, '<', $myheader )
        or die "Error: Could not open header file: $myheader: $!\n";
    my $myspell_header = do { local $/; <$header_fh> };
    close $header_fh;
    $myspell_header = '' unless defined $myspell_header;
    print $myspell_header . "\n";
}

while ( my $line = <$affix_fh> ) {
    chomp $line;

    if ( $line =~ /^\s*\#/ ) {
        debugprint("Ignoring line $.\n");
        print "$line\n" if $printcomments;
    }
    elsif ( $line =~ /^\s*$/ ) {
        debugprint("Ignoring line $.\n");
    }
    elsif ( $line =~ /^\s*prefixes/ ) {
        debugprint("Prefixes starting in line $.\n");
        $affix = "PFX";
    }
    elsif ( $line =~ /^\s*suffixes/ ) {
        debugprint("Suffixes starting in line $.\n");
        $affix = "SFX";
    }
    elsif ( $line =~ /^\s*flag/ ) {
        next if not $affix;    # In case we are still in the preamble

        # Flush the previous flag before its header values are replaced.
        shipoutflag() if $inflags;
        $inflags = "yes";

        $line =~ s/^\s*flag\s*//;
        $line =~ s/\s*:.*$//;
        debugprint("Found flag $line in line $.\n");

        if ( $line =~ /\*/ ) {
            $line =~ s/[\*\s]//g;
            $flagcombine = "Y";
            debugprint("Flag renamed to $line with combine=$flagcombine\n");
        }
        else {
            $flagcombine = "N";
        }

        if ( $flagname = validate_flag($line) ) {
            $myaffix = $affix;
        }
        else {
            # Keep the entries, but commented out in the output.
            $myaffix  = "\# $affix";
            $flagname = $line;
            print STDERR "Ignoring invalid flag $flagname in line $.\n";
        }
    }
    elsif ( $affix and $inflags ) {
        # A rule line: "<condition> > [-<strip>,]<append>   # comment"
        my ( $rule, @comments ) = split( '#', $line );
        my $comment = '# ' . join( '#', @comments );

        $rule =~ s/\s+//g;
        $rule = mylc($rule);
        my ( $rootname, $addtoroot ) = split( '>', $rule );

        # Without both sides of '>' there is nothing sensible to emit.
        unless ( defined $rootname
            and length $rootname
            and defined $addtoroot )
        {
            print STDERR "Ignoring malformed affix rule in line $.\n";
            next;
        }

        my $rootremove = "0";
        if ( $addtoroot =~ s/^\-// ) {
            ( $rootremove, $addtoroot ) = split( ',', $addtoroot );
            $rootremove = "0"
                unless defined $rootremove and length $rootremove;
            $addtoroot = "0" unless $addtoroot;
            $addtoroot = "0" if ( $addtoroot eq "-" );
        }
        $addtoroot =~ s/\\\-/\-/g;    # prefix ANTI\- to anti-

        # An unconditional rule that strips characters gets the stripped
        # string as its condition.
        if ( $rootname eq '.' && $rootremove ne "0" ) {
            $rootname = $rootremove;
        }

        debugprint("$rootname, $addtoroot, $rootremove\n");

        my $affix_line;
        if ($printcomments) {
            $affix_line = sprintf( "%s %s %-5s %-11s %-24s %s",
                $myaffix, $flagname, $rootremove,
                $addtoroot, $rootname, $comment );
        }
        else {
            $affix_line = sprintf( "%s %s %-5s %-11s %s",
                $myaffix, $flagname, $rootremove,
                $addtoroot, $rootname );
        }
        push @flag_array, $affix_line;
        debugprint("$affix_line\n");
    }
}
shipoutflag();    # entries of the last flag in the file

close $affix_fh;

if ($replacements) {
    process_replacements($replacements);
}

__END__

=head1 NAME

B<ispellaff2myspell> - A program to convert ispell affix tables to myspell format.


=head1 SYNOPSIS

 ispellaff2myspell [options] <affixfile> --myheader your_header

   Options:

    --affixfile=s      Affix file
    --bylocale         Use current locale setup for upper/lowercase
                       conversion
    --charset=s        Use specified charset for upper/lowercase
                       conversion (defaults to latin1)
    --debug            Print debugging info
    --extraflags[=s]   Allow some non-alphabetic flags
    --lowercase=s      Lowercase string
    --myheader=s       Header file
    --printcomments    Print commented lines in output
    --replacements=s   Replacements file
    --split=i          Split flags with more than i entries
    --uppercase=s      Uppercase string

=head1 DESCRIPTION

B<ispellaff2myspell> is a script that will convert ispell affix tables
to myspell format in a more or less successful way.

This script does not create the dict file. Something like

( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict

should do the work, with mydict.words+ being the munched wordlist.

=head1 OPTIONS

=over 8

=item B<--affixfile=s>

Affix file. You can put it directly in the command line.

=item B<--bylocale>

Use current locale setup for upper/lowercase conversion. Make sure
that the selected locale matches the dictionary one, or you might get
into trouble.

=item B<--charset=s>

Use specified charset for upper/lowercase conversion (defaults to latin1).
Currently allowed values for charset are: latin0, latin1, latin2, latin3.

=item B<--debug>

Print some debugging info.

=item B<--extraflags:s>

Allows some non-alphabetic flags.

When invoked with no value the supported flags are currently those
corresponding to chars represented with the escape char B<\> as
first char. B<\> will be stripped.

When given a flag prefix, it will allow flags starting with that prefix
and strip the given prefix. Be careful when giving the prefix to properly
escape chars,
e.g.
you will need B<-e "\\\\"> or B<-e '\\'> for flags like B<\[> to be stripped to
B<[>. Otherwise you might even get errors. Use B<-e "^"> to allow all
flags and pass them unmodified.

You will need a call to -e for each flag type, e.g.,
B<-e "\\\\" -e "~\\\\"> (or B<-e '\\' -e '~\\'>).

When a prefix is explicitly set, the default value (anything starting with
B<\>) is disabled and you need to enable it explicitly as in the previous
example.

=item B<--lowercase=s>

Lowercase string. Manually set the string of lowercase chars. This
requires B<--uppercase> having exactly that string but uppercase.

=item B<--myheader=s>

Header file. The myspell aff header. You need to write it
manually. This can contain everything you want to be before the affix table.

=item B<--printcomments>

Print commented lines in output.

=item B<--replacements=file>

Add a pre-defined replacements table taken from 'file' to the .aff file.
Will skip lines not beginning with REP, and set the replacements number
appropriately.

=item B<--split=i>

Split flags with more than i entries. This can be of interest for flags
having a lot of entries. Will split the flag in chunks containing B<i>
entries.

=item B<--uppercase=s>

Uppercase string. Manually set the string of uppercase chars. This
requires B<--lowercase> having exactly that string but lowercase.

=back

If your encoding is currently unsupported you can send me a file with
the two strings of lower and uppercase chars. Note that they must match
exactly but case changed.
It will look something like this (the example is written with hex escapes,
which the script's tr/// conversion accepts, so that it does not depend on
the encoding this file is stored in):

  $lowercase='a-z\xE0-\xF6\xF8-\xFE';
  $uppercase='A-Z\xC0-\xD6\xD8-\xDE';

=head1 SEE ALSO

The OpenOffice.org Lingucomponent Project home page

L<http://lingucomponent.openoffice.org/index.html>

and the document

L<http://lingucomponent.openoffice.org/affix.readme>

that provides information about the basics of the myspell affix file format.

You can also take a look at

 /usr/share/doc/libmyspell-dev/affix.readme.gz
 /usr/share/doc/libmyspell-dev/README.compoundwords
 /usr/share/doc/libmyspell-dev/README.replacetable

in your Debian system.

=head1 AUTHORS

Agustin Martin <agustin.martin@hispalinux.es>

=cut
