enca/enca-1.19/data

=== Programs ===
doit.sh -- Regenerates all */*.base and */*.c files from the source one
           (given as first parameter in */doit.sh), used by */doit.sh to
           regenerate stuff in individual directories too.  Uses many of
           the following scripts.

*/doit.sh -- Customized scripts for individual directories.  Once a directory
             contains doit.sh, it's run by the main one.

clean.sh -- Removes most auxiliary files from language subdirs.

basetoc.c -- [filter] Converts one .base file to .c file, used by doit.sh
              $ ./basetoc <CHARSET.base >CHARSET.c

totals.pl -- Reads generated .c files and computes significance data, weight
             sums and other summary data, writes file `totals.c'
             $ ./totals.pl CHARSET1.c ... CHARSETn.c

normalize.pl -- [filter] Does some kind of funny weight normalization, useful
                for producing CHARSET.base files, since the weights must fit
                into unsigned short int:
                $ ./normalize.pl <COUNTS >NORMALIZED_COUNTS
                Given a file on command line, it normalizes input to have
                exactly(!) the same weight sum:
                $ ./normalize.pl REFERENCE_COUNTS <COUNTS >RENORMALIZED_COUNTS
                This is not run by doit.sh.

extreme.pl -- Given two count files, it finds the characters most suitable
              for a hook deciding between the two, i.e. the characters with
              the biggest difference of occurrences:
              $ ./extreme.pl COUNT1 COUNT2

xlt.c -- [filter] Extremely simple charset converter, to be independent
         of the other, broken, converters:
         $ ./xlt SOURCE.map TARGET.map <TEXT >CONVERTED_TEXT

mystrings.c -- [filter] Extract text chunks from input (strings(1) doesn't
               seem to do a good job on 8bit files):
               $ ./mystrings <FILE | ...

countall.c -- [filter] Count character frequencies
              $ ./countall <TEXT >rawcounts.CHARSET

countpair.c -- [filter] Count 8bit letter pair frequencies and print a table
               containing as many pairs as needed to cover 95% of all pair
               occurrences:
               $ ./countpair CHARSET.letters <TEXT >paircounts.CHARSET

findletters.c -- [filter] Find which 8bit characters from a charset map are
                 letters:
                 $ ./findletters CHARSET.map <Letters >CHARSET.letters

map2letters.sh -- Run findletters.c for all charsets in maps/.

=== Data ===
Letters -- Unicode characters assumed to be letters, excluding 7bits.  Also
           excluding non-European scripts, to keep it small.

maps/ -- 8bit charset -> UCS2 maps, notable ones:
  ibm866-bad.map -- Translates Latin `i' and `I' to Cyrillic 0x0456 and 0x0406,
                    thus approximates them the opposite way when used as
                    TARGET.
  maccyr.map -- It's Macintosh Cyrillic after Apple unification of Russian
                and Ukrainian variants and adding the Euro symbol there, in
                Mac OS 9.0 or so (recode uses the old Russian maccyr -- FIXME
                with iconv it doesn't?).
  macce.map -- Macintosh Central European encoding, the real one, not the
               crappy one used by recode.
  koi8u.map -- KOI8-U (Ukrainian) (recode uses some strange mapping?).
  koi8uni.map -- KOI8-Unified.
  koi8ub.map -- KOI8-UB (Ukrainian/Belarusian).
  cork.map -- T1 Cork encoding (recode uses some strange mapping?).
  iso885913.map -- ISO-8859-13 map (recode uses some strange mapping?).

letters/ -- lists of 8bit characters that are letters (generated) for various
            charsets; run map2letters.sh to create them
Name		Date	Size	#Lines	LOC
..		03-May-2022	-
belarusian/	H	07-May-2022	-	1,166	1,092
bulgarian/	H	07-May-2022	-	829	771
chinese/	H	03-May-2022	-	1,493	1,422
croatian/	H	07-May-2022	-	388	355
czech/	H	07-May-2022	-	479	440
estonian/	H	07-May-2022	-	419	383
hungarian/	H	07-May-2022	-	394	361
latvian/	H	07-May-2022	-	430	394
lithuanian/	H	07-May-2022	-	454	418
maps/	H	05-Sep-2016	-	6,425	6,400
polish/	H	07-May-2022	-	527	485
russian/	H	07-May-2022	-	944	886
slovak/	H	07-May-2022	-	476	437
slovene/	H	07-May-2022	-	350	317
ukrainian/	H	07-May-2022	-	1,036	970
Letters	H A D	14-Jan-2010	5.5 KiB	1,118	1,117
Makefile.am	H A D	04-Jan-2016	3.1 KiB	146	129
Makefile.in	H A D	03-May-2022	23.5 KiB	804	696
README	H A D	04-Jan-2016	3.5 KiB	77	60
basetoc.c	H A D	14-Jan-2010	744	38	30
clean.sh	H A D	04-Jan-2016	306	13	11
countall.c	H A D	14-Jan-2010	440	29	21
countpair.c	H A D	14-Jan-2010	2.3 KiB	116	92
doit.sh	H A D	14-Jan-2010	1.6 KiB	80	67
extreme.pl	H A D	14-Jan-2010	924	62	53
findletters.c	H A D	14-Jan-2010	466	29	22
makepaircounts.sh	H A D	04-Jan-2016	547	25	19
map2letters.sh	H A D	14-Jan-2010	160	8	6
mystrings.c	H A D	14-Jan-2010	805	46	38
normalize.pl	H A D	14-Jan-2010	1.2 KiB	70	58
pairtoc.c	H A D	14-Jan-2010	2 KiB	88	73
totals.pl	H A D	14-Jan-2010	2.4 KiB	92	77
xlt.c	H A D	14-Jan-2010	950	50	37