1#!/usr/bin/perl -w
2# -*- coding: iso-8859-1 -*-
3# 	$Id: ispellaff2myspell,v 1.2 2010/02/23 12:05:51 caolan Exp $
4#
5#   (C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>
6#
7#    This program is free software; you can redistribute it and/or modify
8#    it under the terms of the GNU General Public License as published by
9#    the Free Software Foundation; either version 2 of the License, or
10#    (at your option) any later version.
11#
12#    This program is distributed in the hope that it will be useful,
13#    but WITHOUT ANY WARRANTY; without even the implied warranty of
14#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15#    GNU General Public License for more details.
16#
17#    You should have received a copy of the GNU General Public License
18#    along with this program; if not, write to the Free Software
19#    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20
21
# Print the command-line help text on STDOUT and terminate the program.
#
# Takes no arguments and never returns: it ends with a bare exit, so the
# process exit status is 0 even when it is reached because of bad options.
sub usage {
    # Single-quoted here-document: nothing is interpolated, so the '@' of
    # the e-mail address and the backticks of the shell example are literal.
    print <<'END_USAGE';
ispellaff2myspell: A program to convert ispell affix tables to myspell format
(C) 2002-2005 Agustin Martin Domingo <agustin.martin@hispalinux.es>         License: GPL

Usage:
	ispellaff2myspell [options] <affixfile>

      Options:
	--affixfile=s      Affix file
	--bylocale         Use current locale setup for upper/lowercase
                           conversion
	--charset=s        Use specified charset for upper/lowercase
                           conversion (defaults to latin1)
 	--debug            Print debugging info
 	--extraflags[=s]   Allow some non alphabetic flags
	--lowercase=s      Lowercase string
        --myheader=s       Header file
	--printcomments    Print commented lines in output
        --replacements=s   Replacements file
        --split=i          Split flags with more than i entries
	--uppercase=s      Uppercase string
	--wordlist=s       Still unused

  Currently allowed values for charset are: latin0, latin1, latin2, latin3

This script does not create the dict file. Something like

( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict

should do the work, with mydict.words+ being the ispell munched wordlist

END_USAGE
    exit;
}
56
# Write the given arguments (joined the way "@_" interpolates them) to
# STDERR, but only when the global $debug flag is true.
sub debugprint {
    print STDERR "@_" if $debug;
}
62
# Print the rules collected for the current flag and reset the per-flag state.
#
# Works entirely on globals:
#   @flag_array   already formatted rule lines of the current flag
#   $myaffix      text that starts the header line
#   $flagname     flag name printed in the header
#   $flagcombine  combine marker printed in the header
#   $split        when positive, maximum number of rules per printed chunk
#
# Nothing is printed when @flag_array is empty.  In split mode every chunk
# gets its own header carrying the chunk size, and every chunk except the
# last one gets a trailing " S" on its header.  On return @flag_array is
# empty and $flagname / $flagcombine are ''.
sub shipoutflag {
    my $flag_entries = scalar @flag_array;

    if ( $flag_entries ) {
        # splice() with a negative length keeps that many trailing elements,
        # so a negative chunk size would never drain @flag_array and the loop
        # below would spin forever.  Treat any non-positive size as "no split".
        my $chunk_size = ( $split and $split > 0 ) ? $split : 0;

        if ( $chunk_size ) {
            while ( @flag_array ) {
                my @flag_subarray   = splice( @flag_array, 0, $chunk_size );
                my $subflag_entries = scalar @flag_subarray;
                my $header = "$myaffix $flagname $flagcombine $subflag_entries";
                $header .= " S" if @flag_array;    # more chunks follow
                print "$header\n";
                print join( "\n", @flag_subarray );
                print "\n\n";
            }
        } else {
            print "$myaffix $flagname $flagcombine $flag_entries\n";
            print join( "\n", @flag_array );
            print "\n\n";
        }
    }
    @flag_array  = ();
    $flagname    = '';
    $flagcombine = '';
}
89
# Return a lowercased copy of the string passed as first argument.
#
# With the global $bylocale set, lc() is used under "use locale".
# Otherwise characters are mapped with tr/// from $uppercase to $lowercase.
# For the charsets known here (latin0 .. latin3) those two globals are
# overwritten with built-in tables on every call; for any other $charset
# value the tables already present in $lowercase / $uppercase are used,
# and the sub dies when both are empty.
#
# Dies as well when the tr/// built from the tables cannot be compiled
# (for instance a '/' or a reversed range in a table); previously such a
# failure was swallowed and the input came back unchanged.
#
# NOTE: the table literals below are raw 8-bit characters of the respective
# charset; both strings of a pair must list the same characters in the same
# order, differing only in case.
sub mylc {
    my $inputstring = shift;
    my $outputstring;

    if ( $bylocale ) {
        {
            use locale;
            $outputstring = lc $inputstring;
        }
    } else {
        if ( $charset eq "latin0" ) {
            $lowercase='a-z���������������������������������';
            $uppercase='A-Z�����������������������������޼��';
        } elsif ( $charset eq "latin1" ) {
            $lowercase='a-z������������������������������';
            $uppercase='A-Z������������������������������';
        } elsif ( $charset eq "latin2" ) {
            $lowercase='a-z����������������������������������������';
            $uppercase='A-Z����������������������������������������';
        } elsif ( $charset eq "latin3" ) {
            $lowercase='a-z������������������������������������';
            $uppercase='A-Z������������������������������������';
        } else {
            # Further charsets can be added as extra elsif branches above.
            if ( not $lowercase and not $uppercase ) {
                die "Unsupported charset [$charset]

Explicitly use --lowercase=string and --uppercase=string
options. Remember that both string must match exactly, but
case changed.
";
            }
        }
        $outputstring = $inputstring;

        # tr/// does not interpolate variables, hence the string eval.
        # NOTE(review): $uppercase / $lowercase may come straight from the
        # command line and are pasted into evaluated code here.
        eval "\$outputstring=~tr/$uppercase/$lowercase/";
        die "Error: could not apply the uppercase/lowercase tables: $@"
            if $@;
    }
    return $outputstring;
}
129
# Decide whether a flag name is usable and return the name to put in the
# output, or '' when the flag is rejected.
#
# Argument: the flag name as extracted from the "flag" line.
#
# A name made only of ASCII letters is returned unchanged.  Anything else
# is accepted only when the global $hasextraflags is set and the name
# starts with one of the prefixes stored as keys of %theextraflags; the
# matching prefix is stripped and the remainder returned.
#
# The sub used to be declared with an empty prototype although it takes an
# argument, which forced callers to use the &name(...) form; the prototype
# is gone, and the &-form keeps working.  The letter test is now anchored
# so that names merely containing a letter (e.g. "\a") are no longer
# passed through untouched.
sub validate_flag {
    my $flag = shift;

    return $flag if $flag =~ m/\A[a-zA-Z]+\z/;
    return ''    unless $hasextraflags;

    # Longest prefix first, ties broken alphabetically, so that the result
    # no longer depends on hash ordering when several prefixes match.
    my @prefixes = sort { length($b) <=> length($a) or $a cmp $b }
                   keys %theextraflags;

    foreach my $prefix ( @prefixes ) {
        # The prefix is deliberately used as a regular expression fragment
        # (the caller is expected to escape it; "^" matches every flag).
        if ( $flag =~ s/^$prefix// ) {
            return $flag;
        }
    }
    return '';
}
144
# Copy the replacement table found in a file to STDOUT.
#
# Argument: name of the replacements file.
#
# Only lines starting with "REP" are kept, and "REP <number>" count lines
# are dropped.  Line endings (CRLF, lone CR or LF) are stripped from the
# kept lines.  The output is a fresh "REP <count>" line followed by the
# kept lines, one per line.
#
# Dies when the file cannot be opened.  The file is opened with the
# three-argument form of open on a lexical handle, so the name is taken
# literally (no mode characters, no whitespace trimming).
sub process_replacements {
    my $file     = shift;
    my @replaces = ();

    open( my $replace_fh, '<', $file )
        or die "Error: Could not open replacements file: $file: $!\n";
    while ( my $line = <$replace_fh> ) {
        next unless $line =~ m/^REP[\s\t]*\D.*/;
        next if     $line =~ m/^REP\s+[0-9]+/;    # old entry counter
        $line =~ s/\015\012//;
        $line =~ s/\015//;
        chomp $line;
        push @replaces, $line;
    }
    close $replace_fh;

    my $number = scalar @replaces;
    print "REP $number\n";
    foreach my $entry ( @replaces ) {
        print $entry . "\n";
    }
}
166
167# -----------------------------------------------------------
# Now the program starts, after the functions are defined
169# -----------------------------------------------------------
170
171use Getopt::Long;
172
# Option values: all of them start out as the empty string.
$_ = '' for ( $affixfile, $bylocale,      $charset,      $debug,
              $lowercase, $myheader,      $printcomments, $replacements,
              $split,     $uppercase,     $wordlist,      $hasextraflags );
@flag_array    = ();
%theextraflags = ();

# State of the rule being parsed: nothing is stripped ("0") and every
# text field is empty.
$rootremove = "0";
$_ = '' for ( $rootname, $addtoroot, $comment );

# State of the flag being parsed.
$_ = '' for ( $flagname, $flagcombine, $inflags );
197
# Parse the command line into the option globals; on a parsing error the
# help text is shown through usage().
GetOptions(
    'affixfile=s'    => \$affixfile,
    'bylocale'       => \$bylocale,
    'charset=s'      => \$charset,
    'debug'          => \$debug,

    # The value is optional and the option may be repeated: every non-empty
    # value is recorded as a key of %theextraflags.  The value is tested for
    # length rather than truth so that a prefix of "0" is not dropped, and
    # it lives in a lexical instead of leaking into a global.
    'extraflags:s'   => sub {
        my ( undef, $prefix ) = @_;
        $hasextraflags = 1;
        $theextraflags{$prefix}++ if defined $prefix and length $prefix;
    },
    'lowercase=s'    => \$lowercase,
    'myheader=s'     => \$myheader,
    'printcomments'  => \$printcomments,
    'replacements=s' => \$replacements,
    'split=i'        => \$split,
    'uppercase=s'    => \$uppercase,
    'wordlist=s'     => \$wordlist,
) or usage();
214
# Without --affixfile the affix file is the first remaining argument; show
# the help text when there is none.
if ( not $affixfile ) {
    $affixfile = shift @ARGV or usage();
}

# --charset and explicit --lowercase/--uppercase tables exclude each other;
# when neither is given the charset defaults to latin1.
if ( $charset and ( $lowercase or $uppercase ) ) {
    die "Error: charset and lowercase/uppercase options
are incompatible. Use either charset or lowercase/uppercase options to
specify the patterns
"
} elsif ( not $lowercase and not $uppercase and not $charset ) {
    $charset = "latin1";
}

# Extra flags requested without any explicit prefix: register the default
# prefix, a regex fragment matching one literal backslash.
if ( scalar( keys %theextraflags ) == 0 && $hasextraflags ) {
    $theextraflags{"\\\\"}++;
}

debugprint("$affixfile $charset\n");

# Three-argument open: the file name is taken literally.  The bareword
# handle AFFIXFILE is kept because the rest of the script reads from it.
open( AFFIXFILE, '<', $affixfile )
    or die "Error: Could not open affix file: $affixfile: $!\n";

# Copy the user supplied header, followed by a newline, in front of the
# generated tables.  The file is read directly instead of through a shell
# "cat", so names containing blanks or shell metacharacters are safe.
if ( $myheader ) {
    open( my $header_fh, '<', $myheader )
        or die "Error: Could not open header file: $myheader: $!\n";
    my $myspell_header = do { local $/; <$header_fh> };
    close $header_fh;
    $myspell_header = '' unless defined $myspell_header;
    print $myspell_header . "\n";
}
241
# Main conversion loop: read the ispell affix file line by line.
#
#   * comment lines are skipped (echoed when $printcomments is set)
#   * blank lines are skipped
#   * "prefixes" / "suffixes" select the affix type, PFX or SFX
#   * "flag" lines flush the previous flag and start a new one
#   * any other line inside a flag is a rule; it is formatted and queued
#     in @flag_array until the flag is flushed by shipoutflag()
while (<AFFIXFILE>) {
    chomp;
    if ( /^\s*\#.*/ ) {
        debugprint("Ignoring line $.\n");
        print "$_\n" if $printcomments;
    } elsif ( /^\s*$/ ) {
        debugprint("Ignoring line $.\n");
    } elsif ( /^\s*prefixes/ ) {
        debugprint("Prefixes starting in line $.\n");
        $affix = "PFX";
    } elsif ( /^\s*suffixes/ ) {
        debugprint("Suffixes starting in line $.\n");
        $affix = "SFX";
    } elsif ( /^[\s\t]*flag.*/ ) {
        next if not $affix;         # In case we are still in the preamble
        shipoutflag() if $inflags;
        $inflags = "yes";

        # Reduce the line to the bare flag name.
        s/^[\s\t]*flag[\s\t]*//;
        s/[\s\t]*:.*$//;
        debugprint("Found flag $_ in line $.\n");

        # A '*' in the flag name turns the combine marker to "Y".
        if ( /\*/ ) {
            s/[\*\s]//g;
            $flagcombine = "Y";
            debugprint("Flag renamed to $_ with combine=$flagcombine\n");
        } else {
            $flagcombine = "N";
        }

        # Rejected flags are still written, but with a commented-out prefix.
        if ( $flagname = &validate_flag($_) ) {
            $myaffix  = $affix;
        } else {
            $myaffix  = "\# $affix";
            $flagname = $_;
            print STDERR "Ignoring invalid flag $flagname in line $.\n";
        }
    } elsif ( $affix and $inflags ) {
        # Rule line: "<condition> > [-<strip>,]<append>  # comment"
        my @comments;
        ( $rootname, @comments ) = split( '#', $_ );
        $comment = '# ' . join( '#', @comments );

        $rootname =~ s/\s*//g;
        $rootname = mylc($rootname);
        ( $rootname, $addtoroot ) = split( '>', $rootname );

        # Without both sides of the '>' there is nothing sensible to emit;
        # report the line and skip it instead of producing a broken rule.
        unless ( defined $rootname and defined $addtoroot ) {
            print STDERR "Ignoring malformed rule in line $.\n";
            $rootremove = "0";
            $rootname   = '';
            $addtoroot  = '';
            $comment    = '';
            next;
        }

        # A leading '-' introduces "<strip>,<append>".
        if ( $addtoroot =~ s/^\-// ) {
            ( $rootremove, $addtoroot ) = split( ',', $addtoroot );
            # An empty strip part means that nothing is stripped.
            $rootremove = "0"
                unless defined $rootremove and length $rootremove;
            $addtoroot  = "0" unless $addtoroot;
            $addtoroot  = "0" if ( $addtoroot eq "-" );
        } else {
            $rootremove = "0";
        }
        $addtoroot =~ s/\\\-/\-/g; # prefix ANTI\- to anti-

        # A '.' condition together with stripped characters: use the
        # stripped characters as the condition.
        if ( $rootname eq '.' && $rootremove ne "0" ) {
            $rootname = $rootremove;
        }

        debugprint("$rootname, $addtoroot, $rootremove\n");
        my $affix_line;
        if ( $printcomments ) {
            $affix_line = sprintf( "%s %s   %-5s %-11s %-24s %s",
                                   $myaffix, $flagname, $rootremove,
                                   $addtoroot, $rootname, $comment );
        } else {
            $affix_line = sprintf( "%s %s   %-5s %-11s %s",
                                   $myaffix, $flagname, $rootremove,
                                   $addtoroot, $rootname );
        }
        $rootremove = "0";
        $rootname   = '';
        $addtoroot  = '';
        $comment    = '';
        push @flag_array, $affix_line;
        debugprint("$affix_line\n");
    } else {
        # Any other line outside a flag definition is ignored.
    }
}

# Flush the rules of the last flag.
shipoutflag();
321
close AFFIXFILE;

# Append the replacements table when a replacements file was requested.
process_replacements($replacements) if $replacements;
327
328__END__
329
330=head1 NAME
331
332B<ispellaff2myspell> - A program to convert ispell affix tables to myspell format.
333
334=head1 SYNOPSIS
335
336 ispellaff2myspell [options] <affixfile> --myheader your_header
337
338   Options:
339
340    --affixfile=s      Affix file
341    --bylocale         Use current locale setup for upper/lowercase
342                       conversion
343    --charset=s        Use specified charset for upper/lowercase
344                       conversion (defaults to latin1)
345    --debug            Print debugging info
346    --extraflags=s     Allow some non alphabetic flags
347    --lowercase=s      Lowercase string
348    --myheader=s       Header file
349    --printcomments    Print commented lines in output
350    --replacements=s   Replacements file
    --split=i          Split flags with more than i entries
352    --uppercase=s      Uppercase string
353
354=head1 DESCRIPTION
355
356B<ispellaff2myspell> is a script that will convert ispell affix tables
357to myspell format in a more or less successful way.
358
359This script does not create the dict file. Something like
360
361( echo `cat mydict.words+ | wc -l`; cat mydict.words+ ) > mydict.dict
362
should do the work, with mydict.words+ being the munched wordlist.
364
365=head1 OPTIONS
366
367=over 8
368
369=item B<--affixfile=s>
370
371Affix file. You can put it directly in the command line.
372
373=item B<--bylocale>
374
Use current locale setup for upper/lowercase conversion. Make sure
that the selected locale matches the dictionary one, or you might get
into trouble.
378
379=item B<--charset=s>
380
381Use specified charset for upper/lowercase conversion (defaults to latin1).
382Currently allowed values for charset are: latin0, latin1, latin2, latin3.
383
384=item B<--debug>
385
386Print some debugging info.
387
388=item B<--extraflags:s>
389
390Allows some non alphabetic flags.
391
392When invoked with no value the supported flags are currently those
393corresponding to chars represented with the escape char B<\> as
394first char. B<\> will be stripped.
395
When given a flag prefix, flags starting with that prefix are allowed
and the prefix is stripped from them. Be careful to properly escape
chars when giving the prefix, e.g. you will need B<-e "\\\\"> or
B<-e '\\'> for flags like B<\[> to be stripped to B<[>. Otherwise you
might even get errors. Use B<-e "^"> to allow all flags and pass them
unmodified.
401
402You will need a call to -e for each flag type, e.g.,
403B<-e "\\\\" -e "~\\\\"> (or B<-e '\\' -e '~\\'>).
404
When a prefix is explicitly set, the default value (anything starting with B<\>)
is disabled and you need to enable it explicitly as in the previous example.
407
408=item B<--lowercase=s>
409
410Lowercase string. Manually set the string of lowercase chars. This
411requires B<--uppercase> having exactly that string but uppercase.
412
413=item B<--myheader=s>
414
Header file. The myspell aff header. You need to write it
manually. This can contain everything you want to be before the affix table.
417
418=item B<--printcomments>
419
420Print commented lines in output.
421
422=item B<--replacements=file>
423
424Add a pre-defined replacements table taken from 'file' to the .aff file.
425Will skip lines not beginning with REP, and set the replacements number
426appropriately.
427
428=item B<--split=i>
429
Split flags with more than i entries. This can be of interest for flags
having a lot of entries. Will split the flag in chunks containing B<i>
entries.
433
434=item B<--uppercase=s>
435
Uppercase string. Manually set the string of uppercase chars. This
requires B<--lowercase> having exactly that string but lowercase.
438
439=back
440
441If your encoding is currently unsupported you can send me a file with
442the two strings of lower and uppercase chars. Note that they must match
443exactly but case changed. It will look something like
444
  $lowercase='a-zàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþ';
  $uppercase='A-ZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ';
447
448=head1 SEE ALSO
449
450The OpenOffice.org Lingucomponent Project home page
451
452L<http://lingucomponent.openoffice.org/index.html>
453
454and the document
455
456L<http://lingucomponent.openoffice.org/affix.readme>
457
458that provides information about the basics of the myspell affix file format.
459
460You can also take a look at
461
462 /usr/share/doc/libmyspell-dev/affix.readme.gz
463 /usr/share/doc/libmyspell-dev/README.compoundwords
464 /usr/share/doc/libmyspell-dev/README.replacetable
465
466in your Debian system.
467
468=head1 AUTHORS
469
470Agustin Martin <agustin.martin@hispalinux.es>
471
472=cut
473