1#!/usr/local/bin/perl
2
3#################################################################
4#
5# unilook - improved version of look(1) program for Unicode
6#
7#################################################################
8
9use strict;
10use 5.010_000;
11use if $] > 5.010, "autodie";
12use warnings;  # qw[ FATAL all ];
13
14our $VERSION = v0.6.0;
15
16# In case custom charnames files are in bin not lib...
17use FindBin;
18use lib $FindBin::Bin;
19
20# XXX: These aliases should be in separate files
21#      but that would require a more elaborate deployment strategy.
22#      Note that these are ordered, and that therefore dupes that
23#      occur later are meant to override earlier entries.
24
25use charnames (
26		  ":full"   ,
27		  ":short"  ,
28
29		   "latin"  ,
30		   "greek"  ,
31
32		  ":alias"  =>
33    {
34
35    "Aacu" => "LATIN CAPITAL LETTER A WITH ACUTE",        # Á U+00C1
36    "aacu" => "LATIN SMALL LETTER A WITH ACUTE",          # á U+00E1
37   "Acirc" => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",   # Â U+00C2
38   "acirc" => "LATIN SMALL LETTER A WITH CIRCUMFLEX",     # â U+00E2
39     "acu" => "COMBINING ACUTE ACCENT",                   # ́ U+0301
40      "AE" => "LATIN CAPITAL LETTER AE",                  # Æ U+00C6
41      "Ae" => "LATIN CAPITAL LETTER AE",                  # Æ U+00C6
42      "ae" => "LATIN SMALL LETTER AE",                    # æ U+00E6
43  "Agrave" => "LATIN CAPITAL LETTER A WITH GRAVE",        # À U+00C0
44  "agrave" => "LATIN SMALL LETTER A WITH GRAVE",          # à U+00E0
45   "Alpha" => "GREEK CAPITAL LETTER ALPHA WITH TONOS",    # Ά U+0386
46   "alpha" => "GREEK SMALL LETTER ALPHA",                 # α U+03B1
47     "ang" => "LATIN SMALL LETTER A WITH RING ABOVE",     # å U+00E5
48     "Asg" => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",   # Â U+00C2
49     "asg" => "LATIN SMALL LETTER G WITH DOT ABOVE",      # ġ U+0121
50   "asper" => "COMBINING REVERSED COMMA ABOVE",           # ̔ U+0314
51    "Auml" => "LATIN CAPITAL LETTER A WITH DIAERESIS",    # Ä U+00C4
52    "auml" => "LATIN SMALL LETTER A WITH DIAERESIS",      # ä U+00E4
53    "bbar" => "LATIN SMALL LETTER B WITH STROKE",         # ƀ U+0180
54    "Beta" => "GREEK CAPITAL LETTER BETA",                # Β U+0392
55    "beta" => "GREEK SMALL LETTER BETA",                  # β U+03B2
56   "breve" => "COMBINING BREVE",                          # ̆ U+0306
57   "Ccdil" => "LATIN CAPITAL LETTER C WITH CEDILLA",      # Ç U+00C7
58   "ccdil" => "LATIN SMALL LETTER C WITH CEDILLA",        # ç U+00E7
59    "cdil" => "COMBINING CEDILLA",                        # ̧ U+0327
60     "cdl" => "LATIN SMALL LETTER C WITH CEDILLA",        # ç U+00E7
61    "cent" => "CENT SIGN",                                # ¢ U+00A2
62     "Chi" => "GREEK CAPITAL LETTER CHI",                 # Χ U+03A7
63     "chi" => "GREEK SMALL LETTER CHI",                   # χ U+03C7
64    "circ" => "COMBINING CIRCUMFLEX ACCENT",              # ̂ U+0302
65  "circbl" => "COMBINING CIRCUMFLEX ACCENT BELOW",        # ̭ U+032D
66     "dag" => "DAGGER",                                   # † U+2020
67     "deg" => "DEGREE SIGN",                              # ° U+00B0
68   "Delta" => "GREEK CAPITAL LETTER DELTA",               # Δ U+0394
69   "delta" => "GREEK SMALL LETTER DELTA",                 # δ U+03B4
70     "div" => "DIVISION SLASH",                           # ∕ U+2215
71  "dollar" => "DOLLAR SIGN",                              # $ U+0024
72   "dotab" => "COMBINING DOT ABOVE",                      # ̇ U+0307
73   "dotbl" => "COMBINING DOT BELOW",                      # ̣ U+0323
74    "dubh" => "HYPHEN",                                   # ‐ U+2010
75    "Eacu" => "LATIN CAPITAL LETTER E WITH ACUTE",        # É U+00C9
76    "eacu" => "LATIN SMALL LETTER E WITH ACUTE",          # é U+00E9
77   "Ecirc" => "LATIN CAPITAL LETTER E WITH CIRCUMFLEX",   # Ê U+00CA
78   "ecirc" => "LATIN SMALL LETTER E WITH CIRCUMFLEX",     # ê U+00EA
79     "Edh" => "LATIN CAPITAL LETTER ETH",                 # Ð U+00D0
80     "edh" => "LATIN SMALL LETTER ETH",                   # ð U+00F0
81  "Egrave" => "LATIN CAPITAL LETTER E WITH GRAVE",        # È U+00C8
82  "egrave" => "LATIN SMALL LETTER E WITH GRAVE",          # è U+00E8
83 "Epsilon" => "GREEK CAPITAL LETTER EPSILON",             # Ε U+0395
84 "epsilon" => "GREEK SMALL LETTER EPSILON",               # ε U+03B5
85     "Eta" => "GREEK CAPITAL LETTER ETA",                 # Η U+0397
86     "eta" => "GREEK SMALL LETTER BETA",                  # β U+03B2
87     "Eth" => "LATIN CAPITAL LETTER ETH",                 # Ð U+00D0
88     "eth" => "LATIN SMALL LETTER ETH",                   # ð U+00F0
89    "Euml" => "LATIN CAPITAL LETTER E WITH DIAERESIS",    # Ë U+00CB
90    "euml" => "LATIN SMALL LETTER E WITH DIAERESIS",      # ë U+00EB
91    "fata" => "LATIN SMALL LETTER ALPHA",                 # ɑ U+0251
92   "fatax" => "LATIN SMALL LETTER ALPHA",                 # ɑ U+0251
93 "fatpara" => "DOUBLE-STRUCK CAPITAL P",                  # ℙ U+2119
94   "frown" => "COMBINING BREVE",                          # ̆ U+0306
95   "Gamma" => "GREEK CAPITAL LETTER GAMMA",               # Γ U+0393
96   "gamma" => "GREEK SMALL LETTER GAMMA",                 # γ U+03B3
97      "ge" => "GREATER-THAN OR EQUAL TO",                 # ≥ U+2265
98   "grave" => "COMBINING GRAVE ACCENT",                   # ̀ U+0300
99      "gt" => "GREATER-THAN SIGN",                        # > U+003E
100     "h01" => "HEBREW LETTER HET",                        # ח U+05D7
101     "h02" => "ARABIC SHADDA",                            # ّ U+0651
102   "hacek" => "COMBINING CARON",                          # ̌ U+030C
103    "hash" => "NUMBER SIGN",                              # # U+0023
104    "hbar" => "LATIN SMALL LETTER H WITH STROKE",         # ħ U+0127
105     "hgz" => "LATIN SMALL LETTER Z WITH HOOK",           # ȥ U+0225
106    "hook" => "COMBINING CEDILLA",                        # ̧ U+0327
107      "ia" => "LATIN SMALL LETTER ALPHA",                 # ɑ U+0251
108    "Iacu" => "LATIN CAPITAL LETTER I WITH ACUTE",        # Í U+00CD
109    "iacu" => "LATIN SMALL LETTER I WITH ACUTE",          # í U+00ED
110      "ib" => "GREEK SMALL LETTER BETA",                  # β U+03B2
111   "Icirc" => "LATIN CAPITAL LETTER I WITH CIRCUMFLEX",   # Î U+00CE
112   "icirc" => "LATIN SMALL LETTER I WITH CIRCUMFLEX",     # î U+00EE
113      "id" => "GREEK SMALL LETTER DELTA",                 # δ U+03B4
114      "ie" => "LATIN SMALL LETTER OPEN E",                # ɛ U+025B
115      "ig" => "LATIN SMALL LETTER GAMMA",                 # ɣ U+0263
116  "Igrave" => "LATIN CAPITAL LETTER I WITH GRAVE",        # Ì U+00CC
117  "igrave" => "LATIN SMALL LETTER I WITH GRAVE",          # ì U+00EC
118      "ih" => "GREEK SMALL LETTER ETA",                   # η U+03B7
119      "ii" => "LATIN SMALL LETTER IOTA",                  # ɩ U+0269
120   "infin" => "INFINITY",                                 # ∞ U+221E
121    "Iota" => "GREEK CAPITAL LETTER IOTA",                # Ι U+0399
122    "iota" => "GREEK CAPITAL LETTER CHI",                 # Χ U+03A7
123      "iq" => "GREEK SMALL LETTER THETA",                 # θ U+03B8
124    "isub" => "COMBINING GREEK YPOGEGRAMMENI",            # ͅ U+0345
125    "Iuml" => "LATIN CAPITAL LETTER I WITH DIAERESIS",    # Ï U+00CF
126    "iuml" => "LATIN SMALL LETTER I WITH DIAERESIS",      # ï U+00EF
127      "iz" => "GREEK SMALL LETTER ZETA",                  # ζ U+03B6
128   "Kappa" => "GREEK CAPITAL LETTER KAPPA",               # Κ U+039A
129   "kappa" => "GREEK SMALL LETTER KAPPA",                 # κ U+03BA
130  "Lambda" => "GREEK CAPITAL LETTER LAMDA",               # Λ U+039B
131  "lambda" => "GREEK SMALL LETTER LAMDA",                 # λ U+03BB
132     "lar" => "LEFTWARDS ARROW",                          # ← U+2190
133    "Lbar" => "LATIN CAPITAL LETTER L WITH STROKE",       # Ł U+0141
134    "lbar" => "LATIN CAPITAL LETTER O WITH MACRON",       # Ō U+014C
135      "le" => "LESS-THAN OVER EQUAL TO",                  # ≦ U+2266
136   "lenis" => "COMBINING REVERSED COMMA ABOVE",           # ̔ U+0314
137      "lm" => "MODIFIER LETTER TRIANGULAR COLON",         # ː U+02D0
138      "lt" => "LESS-THAN SIGN",                           # < U+003C
139     "mac" => "COMBINING MACRON",                         # ̄ U+0304
140     "min" => "MINUS SIGN",                               # − U+2212
141      "Mu" => "GREEK CAPITAL LETTER MU",                  # Μ U+039C
142      "mu" => "GREEK SMALL LETTER MU",                    # μ U+03BC
143      "ng" => "LATIN SMALL LETTER ENG",                   # ŋ U+014B
144     "ngx" => "LATIN SMALL LETTER ENG",                   # ŋ U+014B
145      "Nu" => "GREEK CAPITAL LETTER NU",                  # Ν U+039D
146      "nu" => "GREEK SMALL LETTER NU",                    # ν U+03BD
147    "Oacu" => "LATIN CAPITAL LETTER O WITH ACUTE",        # Ó U+00D3
148    "oacu" => "LATIN SMALL LETTER O WITH ACUTE",          # ó U+00F3
149    "Obar" => "LATIN CAPITAL LETTER O WITH STROKE",       # Ø U+00D8
150    "obar" => "LATIN SMALL LETTER O WITH STROKE",         # ø U+00F8
151   "Ocirc" => "LATIN CAPITAL LETTER O WITH CIRCUMFLEX",   # Ô U+00D4
152   "ocirc" => "LATIN SMALL LETTER O WITH CIRCUMFLEX",     # ô U+00F4
153      "OE" => "LATIN CAPITAL LIGATURE OE",                # Œ U+0152
154      "Oe" => "LATIN CAPITAL LIGATURE OE",                # Œ U+0152
155      "oe" => "LATIN SMALL LIGATURE OE",                  # œ U+0153
156  "Ograve" => "LATIN CAPITAL LETTER O WITH GRAVE",        # Ò U+00D2
157  "ograve" => "LATIN SMALL LETTER O WITH GRAVE",          # ò U+00F2
158   "Omega" => "GREEK CAPITAL LETTER OMEGA",               # Ω U+03A9
159   "omega" => "GREEK SMALL LETTER OMEGA",                 # ω U+03C9
160 "Omicron" => "GREEK CAPITAL LETTER OMICRON",             # Ο U+039F
161 "omicron" => "GREEK SMALL LETTER OMICRON",               # ο U+03BF
162     "ope" => "LATIN SMALL LETTER OPEN E",                # ɛ U+025B
163    "Ouml" => "LATIN CAPITAL LETTER O WITH DIAERESIS",    # Ö U+00D6
164    "ouml" => "LATIN SMALL LETTER A WITH DIAERESIS",      # ä U+00E4
165      "pa" => "GREEK LETTER ARCHAIC KOPPA",               # Ϙ U+03D8
166    "pall" => "LATIN SMALL LETTER TURNED Y",              # ʎ U+028E
167    "paln" => "LATIN SMALL LETTER N WITH LEFT HOOK",      # ɲ U+0272
168    "para" => "REVERSED PILCROW SIGN",                             # ⁋ U+204B
169     "Phi" => "GREEK CAPITAL LETTER PHI",                 # Φ U+03A6
170     "phi" => "LATIN SMALL LETTER PHI",                   # ɸ U+0278
171      "Pi" => "GREEK CAPITAL LETTER PI",                  # Π U+03A0
172      "pi" => "GREEK SMALL LETTER PI",                    # π U+03C0
173      "pm" => "PLUS-MINUS SIGN",                          # ± U+00B1
174      "pp" => "DOUBLE PRIME",                             # ″ U+2033
175     "Psi" => "GREEK CAPITAL LETTER PSI",                 # Ψ U+03A8
176     "psi" => "GREEK SMALL LETTER PSI",                   # ψ U+03C8
177   "pstlg" => "POUND SIGN",                               # £ U+00A3
178     "rar" => "RIGHTWARDS ARROW",                         # → U+2192
179    "revc" => "LATIN SMALL LETTER OPEN O",                # ɔ U+0254
180  "revope" => "LATIN SMALL LETTER REVERSED OPEN E",       # ɜ U+025C
181    "revr" => "MODIFIER LETTER RHOTIC HOOK",              # ˞ U+02DE
182   "revrx" => "LATIN SMALL LETTER TURNED R",              # ɹ U+0279
183    "revv" => "LATIN SMALL LETTER TURNED V",              # ʌ U+028C
184     "rfa" => "LATIN SMALL LETTER TURNED ALPHA",          # ɒ U+0252
185     "Rho" => "GREEK CAPITAL LETTER RHO",                 # Ρ U+03A1
186     "rho" => "GREEK SMALL LETTER RHO",                   # ρ U+03C1
187   "schwa" => "LATIN SMALL LETTER SCHWA",                 # ə U+0259
188  "schwax" => "LATIN SMALL LETTER SCHWA",                 # ə U+0259
189    "sect" => "SECTION SIGN",                             # § U+00A7
190      "sh" => "LATIN SMALL LETTER ESH",                   # ʃ U+0283
191    "shti" => "LATIN LETTER SMALL CAPITAL I",             # ɪ U+026A
192    "shtu" => "LATIN SMALL LETTER UPSILON",               # ʊ U+028A
193    "shty" => "LATIN LETTER SMALL CAPITAL Y",             # ʏ U+028F
194     "shx" => "LATIN SMALL LETTER ESH",                   # ʃ U+0283
195   "Sigma" => "GREEK CAPITAL LETTER SIGMA",               # Σ U+03A3
196   "sigma" => "GREEK SMALL LETTER SIGMA",                 # σ U+03C3
197      "sm" => "MODIFIER LETTER VERTICAL LINE",            # ˈ U+02C8
198     "smm" => "MODIFIER LETTER LOW VERTICAL LINE",        # ˌ U+02CC
199    "sqrt" => "SQUARE ROOT",                              # √ U+221A
200     "Tau" => "GREEK CAPITAL LETTER TAU",                 # Τ U+03A4
201     "tau" => "GREEK SMALL LETTER TAU",                   # τ U+03C4
202      "Th" => "LATIN CAPITAL LETTER THORN",               # Þ U+00DE
203      "th" => "LATIN SMALL LETTER THORN",                 # þ U+00FE
204   "Theta" => "GREEK CAPITAL LETTER THETA",               # Θ U+0398
205   "theta" => "GREEK SMALL LETTER THETA",                 # θ U+03B8
206   "tilde" => "COMBINING TILDE",                          # ̃ U+0303
207   "times" => "MULTIPLICATION SIGN",                      # × U+00D7
208    "trli" => "PARALLEL TO",                              # ∥ U+2225
209    "Uacu" => "LATIN CAPITAL LETTER U WITH ACUTE",        # Ú U+00DA
210    "uacu" => "LATIN SMALL LETTER U WITH ACUTE",          # ú U+00FA
211   "Ucirc" => "LATIN CAPITAL LETTER U WITH CIRCUMFLEX",   # Û U+00DB
212   "ucirc" => "LATIN SMALL LETTER U WITH CIRCUMFLEX",     # û U+00FB
213    "udtr" => "NABLA",                                    # ∇ U+2207
214  "Ugrave" => "LATIN CAPITAL LETTER U WITH GRAVE",        # Ù U+00D9
215  "ugrave" => "LATIN SMALL LETTER U WITH GRAVE",          # ù U+00F9
216     "uml" => "COMBINING DIAERESIS",                      # ̈ U+0308
217    "undl" => "COMBINING MINUS SIGN BELOW",               # ̠ U+0320
218 "Upsilon" => "GREEK CAPITAL LETTER UPSILON",             # Υ U+03A5
219 "upsilon" => "LATIN SMALL LETTER UPSILON",               # ʊ U+028A
220    "Uuml" => "LATIN CAPITAL LETTER U WITH DIAERESIS",    # Ü U+00DC
221    "uuml" => "LATIN SMALL LETTER U WITH DIAERESIS",      # ü U+00FC
222      "vb" => "VERTICAL LINE",                            # | U+007C
223     "vvf" => "LATIN SMALL LETTER GAMMA",                 # ɣ U+0263
224      "Xi" => "GREEK CAPITAL LETTER XI",                  # Ξ U+039E
225      "xi" => "GREEK SMALL LETTER XI",                    # ξ U+03BE
226    "Yacu" => "LATIN SMALL LETTER Y WITH ACUTE",          # ý U+00FD
227    "yacu" => "LATIN SMALL LETTER Y WITH ACUTE",          # ý U+00FD
228     "Ygh" => "LATIN CAPITAL LETTER YOGH",                # Ȝ U+021C
229     "ygh" => "LATIN SMALL LETTER YOGH",                  # ȝ U+021D
230    "yuml" => "LATIN SMALL LETTER Y WITH DIAERESIS",      # ÿ U+00FF
231    "Zeta" => "GREEK CAPITAL LETTER ZETA",                # Ζ U+0396
232    "zeta" => "GREEK SMALL LETTER ZETA",                  # ζ U+03B6
233      "zh" => "LATIN SMALL LETTER EZH",                   # ʒ U+0292
234
235# Number aliases: these are \p{Other_Number}
236      "sup1" => "SUPERSCRIPT ONE",                            # ¹ U+00B9
237      "sup2" => "SUPERSCRIPT TWO",                            # ² U+00B2
238      "sup3" => "SUPERSCRIPT THREE",                          # ³ U+00B3
239    "frac12" => "VULGAR FRACTION ONE HALF",                   # ½ U+00BD
240    "frac14" => "VULGAR FRACTION ONE QUARTER",                # ¼ U+00BC
241    "frac34" => "VULGAR FRACTION THREE QUARTERS",             # ¾ U+00BE
242
243# Currency sign aliases: \p{Currency_Symbol}
244
245    "curren" => "CURRENCY SIGN",                              # ¤ U+00A4
246      "cent" => "CENT SIGN",                                  # ¢ U+00A2
247     "pound" => "POUND SIGN",                                 # £ U+00A3
248       "yen" => "YEN SIGN",                                   # ¥ U+00A5
249      "euro" => "EURO SIGN",                                  # € U+20AC
250
251# Latin letter aliases in NFC and grouped by first letter
252#
253#   NOTE: some like BLACK LETTER blah and the trademark
254#         symbol are only Latin in NFKD form.
255
256      "ordf" => "FEMININE ORDINAL INDICATOR",                 # ª U+00AA
257    "Oacute" => "LATIN CAPITAL LETTER O WITH ACUTE",          # Ó U+00D3
258    "Aacute" => "LATIN CAPITAL LETTER A WITH ACUTE",          # Á U+00C1
259    "aacute" => "LATIN SMALL LETTER A WITH ACUTE",            # á U+00E1
260    "Agrave" => "LATIN CAPITAL LETTER A WITH GRAVE",          # À U+00C0
261    "agrave" => "LATIN SMALL LETTER A WITH GRAVE",            # à U+00E0
262     "Acirc" => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",     # Â U+00C2
263     "acirc" => "LATIN SMALL LETTER A WITH CIRCUMFLEX",       # â U+00E2
264     "Aring" => "LATIN CAPITAL LETTER A WITH RING ABOVE",     # Å U+00C5
265     "aring" => "LATIN SMALL LETTER A WITH RING ABOVE",       # å U+00E5
266      "Auml" => "LATIN CAPITAL LETTER A WITH DIAERESIS",      # Ä U+00C4
267      "auml" => "LATIN SMALL LETTER A WITH DIAERESIS",        # ä U+00E4
268    "Atilde" => "LATIN CAPITAL LETTER A WITH TILDE",          # Ã U+00C3
269    "atilde" => "LATIN SMALL LETTER A WITH TILDE",            # ã U+00E3
270     "AElig" => "LATIN CAPITAL LETTER AE",                    # Æ U+00C6
271     "aelig" => "LATIN SMALL LETTER AE",                      # æ U+00E6
272
273    "Ccedil" => "LATIN CAPITAL LETTER C WITH CEDILLA",        # Ç U+00C7
274    "ccedil" => "LATIN SMALL LETTER C WITH CEDILLA",          # ç U+00E7
275
276       "ETH" => "LATIN CAPITAL LETTER ETH",                   # Ð U+00D0
277       "eth" => "LATIN SMALL LETTER ETH",                     # ð U+00F0
278
279    "Eacute" => "LATIN CAPITAL LETTER E WITH ACUTE",          # É U+00C9
280    "eacute" => "LATIN SMALL LETTER E WITH ACUTE",            # é U+00E9
281    "Egrave" => "LATIN CAPITAL LETTER E WITH GRAVE",          # È U+00C8
282    "egrave" => "LATIN SMALL LETTER E WITH GRAVE",            # è U+00E8
283     "Ecirc" => "LATIN CAPITAL LETTER E WITH CIRCUMFLEX",     # Ê U+00CA
284     "ecirc" => "LATIN SMALL LETTER E WITH CIRCUMFLEX",       # ê U+00EA
285      "Euml" => "LATIN CAPITAL LETTER E WITH DIAERESIS",      # Ë U+00CB
286      "euml" => "LATIN SMALL LETTER E WITH DIAERESIS",        # ë U+00EB
287
288      "fnof" => "LATIN SMALL LETTER F WITH HOOK",             # ƒ U+0192
289
290     "image" => "BLACK-LETTER CAPITAL I",                     # ℑ U+2111
291    "Iacute" => "LATIN CAPITAL LETTER I WITH ACUTE",          # Í U+00CD
292    "iacute" => "LATIN SMALL LETTER I WITH ACUTE",            # í U+00ED
293    "Igrave" => "LATIN CAPITAL LETTER I WITH GRAVE",          # Ì U+00CC
294    "igrave" => "LATIN SMALL LETTER I WITH GRAVE",            # ì U+00EC
295     "Icirc" => "LATIN CAPITAL LETTER I WITH CIRCUMFLEX",     # Î U+00CE
296     "icirc" => "LATIN SMALL LETTER I WITH CIRCUMFLEX",       # î U+00EE
297      "Iuml" => "LATIN CAPITAL LETTER I WITH DIAERESIS",      # Ï U+00CF
298      "iuml" => "LATIN SMALL LETTER I WITH DIAERESIS",        # ï U+00EF
299
300    "Ntilde" => "LATIN CAPITAL LETTER N WITH TILDE",          # Ñ U+00D1
301    "ntilde" => "LATIN SMALL LETTER N WITH TILDE",            # ñ U+00F1
302
303      "ordm" => "MASCULINE ORDINAL INDICATOR",                # º U+00BA
304    "oacute" => "LATIN SMALL LETTER O WITH ACUTE",            # ó U+00F3
305    "Ograve" => "LATIN CAPITAL LETTER O WITH GRAVE",          # Ò U+00D2
306    "ograve" => "LATIN SMALL LETTER O WITH GRAVE",            # ò U+00F2
307     "Ocirc" => "LATIN CAPITAL LETTER O WITH CIRCUMFLEX",     # Ô U+00D4
308     "ocirc" => "LATIN SMALL LETTER O WITH CIRCUMFLEX",       # ô U+00F4
309      "Ouml" => "LATIN CAPITAL LETTER O WITH DIAERESIS",      # Ö U+00D6
310      "ouml" => "LATIN SMALL LETTER O WITH DIAERESIS",        # ö U+00F6
311    "Otilde" => "LATIN CAPITAL LETTER O WITH TILDE",          # Õ U+00D5
312    "otilde" => "LATIN SMALL LETTER O WITH TILDE",            # õ U+00F5
313    "Oslash" => "LATIN CAPITAL LETTER O WITH STROKE",         # Ø U+00D8
314    "oslash" => "LATIN SMALL LETTER O WITH STROKE",           # ø U+00F8
315     "OElig" => "LATIN CAPITAL LIGATURE OE",                  # Œ U+0152
316     "oelig" => "LATIN SMALL LIGATURE OE",                    # œ U+0153
317
318      "real" => "BLACK-LETTER CAPITAL R",                     # ℜ U+211C
319
320    "Scaron" => "LATIN CAPITAL LETTER S WITH CARON",          # Š U+0160
321    "scaron" => "LATIN SMALL LETTER S WITH CARON",            # š U+0161
322     "szlig" => "LATIN SMALL LETTER SHARP S",                 # ß U+00DF
323
324     "trade" => "TRADE MARK SIGN",                            # ™ U+2122
325
326    "Uacute" => "LATIN CAPITAL LETTER U WITH ACUTE",          # Ú U+00DA
327    "uacute" => "LATIN SMALL LETTER U WITH ACUTE",            # ú U+00FA
328    "Ugrave" => "LATIN CAPITAL LETTER U WITH GRAVE",          # Ù U+00D9
329    "ugrave" => "LATIN SMALL LETTER U WITH GRAVE",            # ù U+00F9
330     "Ucirc" => "LATIN CAPITAL LETTER U WITH CIRCUMFLEX",     # Û U+00DB
331     "ucirc" => "LATIN SMALL LETTER U WITH CIRCUMFLEX",       # û U+00FB
332      "Uuml" => "LATIN CAPITAL LETTER U WITH DIAERESIS",      # Ü U+00DC
333      "uuml" => "LATIN SMALL LETTER U WITH DIAERESIS",        # ü U+00FC
334
335    "Yacute" => "LATIN CAPITAL LETTER Y WITH ACUTE",          # Ý U+00DD
336    "yacute" => "LATIN SMALL LETTER Y WITH ACUTE",            # ý U+00FD
337      "Yuml" => "LATIN CAPITAL LETTER Y WITH DIAERESIS",      # Ÿ U+0178
338      "yuml" => "LATIN SMALL LETTER Y WITH DIAERESIS",        # ÿ U+00FF
339
340     "THORN" => "LATIN CAPITAL LETTER THORN",                 # Þ U+00DE
341     "thorn" => "LATIN SMALL LETTER THORN",                   # þ U+00FE
342
343# This is *not* the same as the HEBREW LETTER ALEF (aleph),
344# although it is a \p{Other_Letter} not a \p{Symbol}.
345   "alefsym" => "ALEF SYMBOL",                                # ℵ U+2135
346
347# Greek letter aliases, or things that sort with them
348
349     "Alpha" => "GREEK CAPITAL LETTER ALPHA",                 # Α U+0391
350     "alpha" => "GREEK SMALL LETTER ALPHA",                   # α U+03B1
351      "Beta" => "GREEK CAPITAL LETTER BETA",                  # Β U+0392
352      "beta" => "GREEK SMALL LETTER BETA",                    # β U+03B2
353     "Gamma" => "GREEK CAPITAL LETTER GAMMA",                 # Γ U+0393
354     "gamma" => "GREEK SMALL LETTER GAMMA",                   # γ U+03B3
355     "Delta" => "GREEK CAPITAL LETTER DELTA",                 # Δ U+0394
356     "delta" => "GREEK SMALL LETTER DELTA",                   # δ U+03B4
357   "Epsilon" => "GREEK CAPITAL LETTER EPSILON",               # Ε U+0395
358   "epsilon" => "GREEK SMALL LETTER EPSILON",                 # ε U+03B5
359      "Zeta" => "GREEK CAPITAL LETTER ZETA",                  # Ζ U+0396
360      "zeta" => "GREEK SMALL LETTER ZETA",                    # ζ U+03B6
361       "Eta" => "GREEK CAPITAL LETTER ETA",                   # Η U+0397
362       "eta" => "GREEK SMALL LETTER ETA",                     # η U+03B7
363     "Theta" => "GREEK CAPITAL LETTER THETA",                 # Θ U+0398
364  "thetasym" => "GREEK THETA SYMBOL",                         # ϑ U+03D1
365     "theta" => "GREEK SMALL LETTER THETA",                   # θ U+03B8
366      "Iota" => "GREEK CAPITAL LETTER IOTA",                  # Ι U+0399
367      "iota" => "GREEK SMALL LETTER IOTA",                    # ι U+03B9
368     "Kappa" => "GREEK CAPITAL LETTER KAPPA",                 # Κ U+039A
369     "kappa" => "GREEK SMALL LETTER KAPPA",                   # κ U+03BA
370    "Lambda" => "GREEK CAPITAL LETTER LAMDA",                 # Λ U+039B
371    "lambda" => "GREEK SMALL LETTER LAMDA",                   # λ U+03BB
372        "Mu" => "GREEK CAPITAL LETTER MU",                    # Μ U+039C
373     "micro" => "MICRO SIGN",                                 # µ U+00B5
374        "mu" => "GREEK SMALL LETTER MU",                      # μ U+03BC
375        "Nu" => "GREEK CAPITAL LETTER NU",                    # Ν U+039D
376        "nu" => "GREEK SMALL LETTER NU",                      # ν U+03BD
377        "Xi" => "GREEK CAPITAL LETTER XI",                    # Ξ U+039E
378        "xi" => "GREEK SMALL LETTER XI",                      # ξ U+03BE
379   "Omicron" => "GREEK CAPITAL LETTER OMICRON",               # Ο U+039F
380   "omicron" => "GREEK SMALL LETTER OMICRON",                 # ο U+03BF
381        "Pi" => "GREEK CAPITAL LETTER PI",                    # Π U+03A0
382       "piv" => "GREEK PI SYMBOL",                            # ϖ U+03D6
383        "pi" => "GREEK SMALL LETTER PI",                      # π U+03C0
384       "Rho" => "GREEK CAPITAL LETTER RHO",                   # Ρ U+03A1
385       "rho" => "GREEK SMALL LETTER RHO",                     # ρ U+03C1
386     "sigma" => "GREEK SMALL LETTER SIGMA",                   # σ U+03C3
387    "sigmaf" => "GREEK SMALL LETTER FINAL SIGMA",             # ς U+03C2
388       "Tau" => "GREEK CAPITAL LETTER TAU",                   # Τ U+03A4
389       "tau" => "GREEK SMALL LETTER TAU",                     # τ U+03C4
390     "upsih" => "GREEK UPSILON WITH HOOK SYMBOL",             # ϒ U+03D2
391   "Upsilon" => "GREEK CAPITAL LETTER UPSILON",               # Υ U+03A5
392   "upsilon" => "GREEK SMALL LETTER UPSILON",                 # υ U+03C5
393       "Phi" => "GREEK CAPITAL LETTER PHI",                   # Φ U+03A6
394       "phi" => "GREEK SMALL LETTER PHI",                     # φ U+03C6
395       "Chi" => "GREEK CAPITAL LETTER CHI",                   # Χ U+03A7
396       "chi" => "GREEK SMALL LETTER CHI",                     # χ U+03C7
397       "Psi" => "GREEK CAPITAL LETTER PSI",                   # Ψ U+03A8
398       "psi" => "GREEK SMALL LETTER PSI",                     # ψ U+03C8
399     "Omega" => "GREEK CAPITAL LETTER OMEGA",                 # Ω U+03A9
400     "omega" => "GREEK SMALL LETTER OMEGA",                   # ω U+03C9
401
402# \p{Format} characters
403
404       "zwj" => "ZERO WIDTH JOINER",                          # ‍ U+200D
405      "zwnj" => "ZERO WIDTH NON-JOINER",                      # ‌ U+200C
406       "rlm" => "RIGHT-TO-LEFT MARK",                         # ‏ U+200F
407       "lrm" => "LEFT-TO-RIGHT MARK",                         # ‎ U+200E
408
409# Various punctuation and symbols in UCA order.
410# None of these is a combining Mark.
411
412     "oline" => "OVERLINE",                                   # ‾ U+203E
413      "ensp" => "EN SPACE",                                   #   U+2002
414      "nbsp" => "NO-BREAK SPACE",                             #   U+00A0
415     "cedil" => "CEDILLA",                                    # ¸ U+00B8
416       "uml" => "DIAERESIS",                                  # ¨ U+00A8
417     "acute" => "ACUTE ACCENT",                               # ´ U+00B4
418     "tilde" => "SMALL TILDE",                                # ˜ U+02DC
419      "emsp" => "EM SPACE",                                   #   U+2003
420      "macr" => "MACRON",                                     # ¯ U+00AF
421    "thinsp" => "THIN SPACE",                                 #   U+2009
422       "shy" => "SOFT HYPHEN",                                # ­ U+00AD
423     "ndash" => "EN DASH",                                    # – U+2013
424     "mdash" => "EM DASH",                                    # — U+2014
425     "iexcl" => "INVERTED EXCLAMATION MARK",                  # ¡ U+00A1
426    "iquest" => "INVERTED QUESTION MARK",                     # ¿ U+00BF
427    "hellip" => "HORIZONTAL ELLIPSIS",                        # … U+2026
428    "middot" => "MIDDLE DOT",                                 # · U+00B7
429      "apos" => "APOSTROPHE",                                 # ' U+0027
430     "lsquo" => "LEFT SINGLE QUOTATION MARK",                 # ‘ U+2018
431     "rsquo" => "RIGHT SINGLE QUOTATION MARK",                # ’ U+2019
432     "sbquo" => "SINGLE LOW-9 QUOTATION MARK",                # ‚ U+201A
433    "lsaquo" => "SINGLE LEFT-POINTING ANGLE QUOTATION MARK",  # ‹ U+2039
434    "rsaquo" => "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK", # › U+203A
435      "quot" => "QUOTATION MARK",                             # " U+0022
436     "ldquo" => "LEFT DOUBLE QUOTATION MARK",                 # “ U+201C
437     "rdquo" => "RIGHT DOUBLE QUOTATION MARK",                # ” U+201D
438     "bdquo" => "DOUBLE LOW-9 QUOTATION MARK",                # „ U+201E
439     "laquo" => "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK",  # « U+00AB
440     "raquo" => "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK", # » U+00BB
441      "lang" => "LEFT-POINTING ANGLE BRACKET",                # 〈 U+2329
442      "rang" => "RIGHT-POINTING ANGLE BRACKET",               # 〉 U+232A
443      "sect" => "SECTION SIGN",                               # § U+00A7
444      "para" => "PILCROW SIGN",                               # ¶ U+00B6
445      "copy" => "COPYRIGHT SIGN",                             # © U+00A9
446       "reg" => "REGISTERED SIGN",                            # ® U+00AE
447     "frasl" => "FRACTION SLASH",                             # ⁄ U+2044
448       "amp" => "AMPERSAND",                                  # & U+0026
449    "permil" => "PER MILLE SIGN",                             # ‰ U+2030
450    "dagger" => "DAGGER",                                     # † U+2020
451    "Dagger" => "DOUBLE DAGGER",                              # ‡ U+2021
452      "bull" => "BULLET",                                     # • U+2022
453     "prime" => "PRIME",                                      # ′ U+2032
454     "Prime" => "DOUBLE PRIME",                               # ″ U+2033
455      "circ" => "MODIFIER LETTER CIRCUMFLEX ACCENT",          # ˆ U+02C6
456       "deg" => "DEGREE SIGN",                                # ° U+00B0
457    "weierp" => "SCRIPT CAPITAL P",                           # ℘ U+2118
458      "larr" => "LEFTWARDS ARROW",                            # ← U+2190
459      "rarr" => "RIGHTWARDS ARROW",                           # → U+2192
460      "uarr" => "UPWARDS ARROW",                              # ↑ U+2191
461      "darr" => "DOWNWARDS ARROW",                            # ↓ U+2193
462      "harr" => "LEFT RIGHT ARROW",                           # ↔ U+2194
463     "crarr" => "DOWNWARDS ARROW WITH CORNER LEFTWARDS",      # ↵ U+21B5
464      "lArr" => "LEFTWARDS DOUBLE ARROW",                     # ⇐ U+21D0
465      "uArr" => "UPWARDS DOUBLE ARROW",                       # ⇑ U+21D1
466      "rArr" => "RIGHTWARDS DOUBLE ARROW",                    # ⇒ U+21D2
467      "dArr" => "DOWNWARDS DOUBLE ARROW",                     # ⇓ U+21D3
468      "hArr" => "LEFT RIGHT DOUBLE ARROW",                    # ⇔ U+21D4
469    "forall" => "FOR ALL",                                    # ∀ U+2200
470      "part" => "PARTIAL DIFFERENTIAL",                       # ∂ U+2202
471     "exist" => "THERE EXISTS",                               # ∃ U+2203
472     "empty" => "EMPTY SET",                                  # ∅ U+2205
473     "nabla" => "NABLA",                                      # ∇ U+2207
474      "isin" => "ELEMENT OF",                                 # ∈ U+2208
475     "notin" => "NOT AN ELEMENT OF",                          # ∉ U+2209
476        "ni" => "CONTAINS AS MEMBER",                         # ∋ U+220B
477      "prod" => "N-ARY PRODUCT",                              # ∏ U+220F
478       "sum" => "N-ARY SUMMATION",                            # ∑ U+2211
479    "plusmn" => "PLUS-MINUS SIGN",                            # ± U+00B1
480    "divide" => "DIVISION SIGN",                              # ÷ U+00F7
481     "times" => "MULTIPLICATION SIGN",                        # × U+00D7
482        "lt" => "LESS-THAN SIGN",                             # < U+003C
483        "ne" => "NOT EQUAL TO",                               # ≠ U+2260
484        "gt" => "GREATER-THAN SIGN",                          # > U+003E
485       "not" => "NOT SIGN",                                   # ¬ U+00AC
486    "brvbar" => "BROKEN BAR",                                 # ¦ U+00A6
487     "minus" => "MINUS SIGN",                                 # − U+2212
488    "lowast" => "ASTERISK OPERATOR",                          # ∗ U+2217
489     "radic" => "SQUARE ROOT",                                # √ U+221A
490      "prop" => "PROPORTIONAL TO",                            # ∝ U+221D
491     "infin" => "INFINITY",                                   # ∞ U+221E
492       "ang" => "ANGLE",                                      # ∠ U+2220
493       "and" => "LOGICAL AND",                                # ∧ U+2227
494        "or" => "LOGICAL OR",                                 # ∨ U+2228
495       "cap" => "INTERSECTION",                               # ∩ U+2229
496       "cup" => "UNION",                                      # ∪ U+222A
497       "int" => "INTEGRAL",                                   # ∫ U+222B
498    "there4" => "THEREFORE",                                  # ∴ U+2234
499       "sim" => "TILDE OPERATOR",                             # ∼ U+223C
500      "cong" => "APPROXIMATELY EQUAL TO",                     # ≅ U+2245
501     "asymp" => "ALMOST EQUAL TO",                            # ≈ U+2248
502     "equiv" => "IDENTICAL TO",                               # ≡ U+2261
503        "le" => "LESS-THAN OR EQUAL TO",                      # ≤ U+2264
504        "ge" => "GREATER-THAN OR EQUAL TO",                   # ≥ U+2265
505       "sub" => "SUBSET OF",                                  # ⊂ U+2282
506      "nsub" => "NOT A SUBSET OF",                            # ⊄ U+2284
507       "sup" => "SUPERSET OF",                                # ⊃ U+2283
508      "sube" => "SUBSET OF OR EQUAL TO",                      # ⊆ U+2286
509      "supe" => "SUPERSET OF OR EQUAL TO",                    # ⊇ U+2287
510     "oplus" => "CIRCLED PLUS",                               # ⊕ U+2295
511    "otimes" => "CIRCLED TIMES",                              # ⊗ U+2297
512      "perp" => "UP TACK",                                    # ⊥ U+22A5
513      "sdot" => "DOT OPERATOR",                               # ⋅ U+22C5
514     "lceil" => "LEFT CEILING",                               # ⌈ U+2308
515     "rceil" => "RIGHT CEILING",                              # ⌉ U+2309
516    "lfloor" => "LEFT FLOOR",                                 # ⌊ U+230A
517    "rfloor" => "RIGHT FLOOR",                                # ⌋ U+230B
518       "loz" => "LOZENGE",                                    # ◊ U+25CA
519    "spades" => "BLACK SPADE SUIT",                           # ♠ U+2660
520     "clubs" => "BLACK CLUB SUIT",                            # ♣ U+2663
521    "hearts" => "BLACK HEART SUIT",                           # ♥ U+2665
522     "diams" => "BLACK DIAMOND SUIT",                         # ♦ U+2666
523
524    #
525    # override non-combining forms
526    #
527
528        "ACUTE"                 => "COMBINING ACUTE ACCENT",
529        "acute"                 => "COMBINING ACUTE ACCENT",
530
531        "GRAVE"                 => "COMBINING GRAVE ACCENT",
532        "grave"                 => "COMBINING GRAVE ACCENT",
533
534        "CIRCUMFLEX"            => "COMBINING CIRCUMFLEX ACCENT",
535        "CIRCUM"                => "COMBINING CIRCUMFLEX ACCENT",
536        "CIRC"                  => "COMBINING CIRCUMFLEX ACCENT",
537        "circumflex"            => "COMBINING CIRCUMFLEX ACCENT",
538        "circum"                => "COMBINING CIRCUMFLEX ACCENT",
539        "circ"                  => "COMBINING CIRCUMFLEX ACCENT",
540
541                        # typo protection
542
543        "COMBINING DIERESIS"    => "COMBINING DIAERESIS",
544        "COMBINING DIEARESIS"   => "COMBINING DIAERESIS",
545        "DIERESIS"              => "COMBINING DIAERESIS",
546        "DIEARESIS"             => "COMBINING DIAERESIS",
547        "DIAERESIS"             => "COMBINING DIAERESIS",
548        "dieresis"              => "COMBINING DIAERESIS",
549        "diearesis"             => "COMBINING DIAERESIS",
550        "diaeresis"             => "COMBINING DIAERESIS",
551        "diaer"                 => "COMBINING DIAERESIS",
552        "diear"                 => "COMBINING DIAERESIS",
553        "dier"                  => "COMBINING DIAERESIS",
554
555        "TILDE"                 => "COMBINING TILDE",
556        "tilde"                 => "COMBINING TILDE",
557        "til"                   => "COMBINING TILDE",
558
559        "CEDILLE"               => "COMBINING CEDILLA",
560        "CEDILLA"               => "COMBINING CEDILLA",
561        "CEDIL"                 => "COMBINING CEDILLA",
562        "cedille"               => "COMBINING CEDILLA",
563        "cedilla"               => "COMBINING CEDILLA",
564        "cedil"                 => "COMBINING CEDILLA",
565
566        "MACRON"                => "COMBINING MACRON",
567        "macron"                => "COMBINING MACRON",
568
569	"CARON"			=> "COMBINING CARON",
570	"caron"			=> "COMBINING CARON",
571
572    #
573    # special glyphs
574    #
575
576	# Hawaiʻi, aloha ʻoe
577    	"okina"			=> "MODIFIER LETTER TURNED COMMA",
578
579        # * transliteration of Arabic ain (voiced pharyngeal fricative)
580        "ain"                   => "MODIFIER LETTER LEFT HALF RING",
581
582        "stress"                => "MODIFIER LETTER VERTICAL LINE",
583        "stress1"               => "MODIFIER LETTER VERTICAL LINE",
584        "primary_stress"        => "MODIFIER LETTER VERTICAL LINE",
585        "pstress"               => "MODIFIER LETTER VERTICAL LINE",
586        "pstr"                  => "MODIFIER LETTER VERTICAL LINE",
587
588        "secondary_stress"      => "MODIFIER LETTER LOW VERTICAL LINE",
589        "stress2"               => "MODIFIER LETTER LOW VERTICAL LINE",
590        "sstress"               => "MODIFIER LETTER LOW VERTICAL LINE",
591        "sstr"                  => "MODIFIER LETTER LOW VERTICAL LINE",
592
593    #
594    # classification glyphs
595    #
596
597        # OBSOLETE
598        "obs"                   => "DAGGER",
599        "obsolete"              => "DAGGER",
600        "dagger"                => "DAGGER",
601
602        # ALIEN
603        "ali"                   => "DOUBLE VERTICAL LINE",
604        "alien"                 => "DOUBLE VERTICAL LINE",
605        "foreign"               => "DOUBLE VERTICAL LINE",
606        "unassimilated"         => "DOUBLE VERTICAL LINE",
607
608        # ERRONEOUS
609        "err"                   => "CURVED STEM PARAGRAPH SIGN ORNAMENT",
610        "erron"                 => "CURVED STEM PARAGRAPH SIGN ORNAMENT",
611        "erroneous"             => "CURVED STEM PARAGRAPH SIGN ORNAMENT",
612
613        # CATACHRESTIC
614        "spu"                   => "PILCROW SIGN",
615        "spurious"              => "PILCROW SIGN",
616        "catachrestic"          => "PILCROW SIGN",
617        "catach"                => "PILCROW SIGN",
618        "cata"                  => "PILCROW SIGN",
619
620        # CROSS REFERENCE
621        "xref"                  => "MULTIPLICATION SIGN",
622
623        # ILLUSTRATIVE
624        "ill"                   => "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK",
625        "illus"                 => "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK",
626        "illustrative"          => "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK",
627
628    },
629
630);  # end use charnames
631
632use constant DATABASE_NAME => "words.utf8";
633
634use subs qw[ dump ];  # like I really want a SIGABORT, not!
635
636#################################################################
637
638use Carp;
639use File::Spec;
640use English qw[ -no_match_vars ];
641use Getopt::Long  qw[ GetOptions ];
642use Pod::Usage;
643# use Search::Dict;
644use Unicode::Normalize;
645
646use Encode qw( encode decode );
647
648#################################################################
649
650sub deQ($);
651
652#################################################################
653
# Parsed command-line options; filled in by GetOptions() in handle_options().
our %Opt;
# Resolved pathname of the word database; set by validate_database().
our $DB_Name;
# Number of entries actually displayed; main() turns this into the exit status.
our $Shown_Count = 0;

#################################################################

# main() always leaves through exit(), so falling through to the next
# statement is an internal error and panics.
main();
NOT_REACHED();
662
663#################################################################
664
# Program entry point: initialise, run exactly one search mode, and exit
# with status 0 if at least one entry was displayed, 1 otherwise.
sub main {
    init();

    # Fuzzy mode wins over grep mode, which wins over plain look mode.
    my $count;
    if    ($Opt{fuzzy})   { $count = run_agrep() }
    elsif ($Opt{pattern}) { $count = run_grep()  }
    else                  { $count = run_look()  }

    debug("found $count matches");

    # The exit status reflects what was shown, not what was matched:
    # matches can still be filtered out before display.
    exit($Shown_Count == 0 ? 1 : 0);
}
680
681#################################################################
682
# One-time start-up: set up output handling, decode the command line as
# UTF-8, parse the options, and make sure a database can be found.
sub init {

    # Register an END block at run time that closes STDOUT, ignoring
    # any error from the close.
    eval q{ END { eval { close STDOUT } } };

    # Leave via exit() when the reader of our output goes away.
    $SIG{PIPE} = sub { exit };

    $| = 1;

    binmode($_, ":utf8") for \*STDOUT, \*STDERR;

    # Command-line arguments arrive as bytes; turn them into characters.
    $_ = decode("UTF-8", $_) for @ARGV;

    handle_options();
    validate_database();

}
701
702#################################################################
703
# Resolve the word database to a readable file and remember its path in
# $DB_Name.  Dies with "no database ..." if nothing suitable is found.
# Safe to call repeatedly: an already-resolved, still-readable database
# is kept as is.
sub validate_database {

    return if $DB_Name && locate_textfile($DB_Name);

    # GetOptions() files --datafile / -D under the key "datafile", so
    # that is where the user's choice lives.  The "database" key is
    # still consulted for anything that sets it directly.
    my $database = $Opt{datafile} || $Opt{database} || DATABASE_NAME;

    unless ($DB_Name = locate_textfile($database)) {
        die "$0: no database $database\n";
    }

}
715
716#################################################################
717
# Parse the command line into %Opt, derive the implied settings, and
# decide whether the first operand is a look-mode string or a pattern.
# Leaves through pod2usage() on usage errors and for --help / --man.
#
# The negated form of --foreign/--alien accepts --noalien; the historical
# misspelling --noalian is kept so existing invocations keep working.
sub handle_options {

    pod2usage("$0: usage error: expected arguments\n") if @ARGV == 0;

    Getopt::Long::Configure qw[ bundling auto_version no_ignore_case ];

    dump("pre getopt options are:", \%Opt);

    GetOptions(\%Opt => qw[

        help|?
        man|m
        debug|d

        datafile|D=s
        pattern|grep|g=s

        nopager
        sort|s

        verbose|v+
        showkey|raw|V

        everything|all|a
        all-verbose|A

        headwords-only|h

        regular|normal|n
        foreign|alien|f
        catachrestic|erroneous|e
        obsolete|old|o
        crossreference|xref|x
        illustrations|i

        noregular|nonormal|N
        noforeign|noalien|noalian|F
        nocatachrestic|noerroneous|E
        noobsolete|noold|O
        nocrossreference|noxref|X
        noillustrations|I

        part-of-speech|partofspeech|speech|pos|p=s
        nopart-of-speech|nopartofspeech|nospeech|nopos|P=s

        fuzzy|z
        all-fuzzy|Z

    ]) || pod2usage(2);

    $Opt{verbose} ||= 0;

    # --all-fuzzy implies --fuzzy plus --all-verbose ...
    if ($Opt{"all-fuzzy"}) {
        $Opt{"fuzzy"}++;
        $Opt{"all-verbose"}++;
        # FALLTHROUGH
    }

    # ... and --all-verbose implies --everything at verbosity 2.
    if ($Opt{"all-verbose"}) {
        $Opt{"everything"}++;
        $Opt{"verbose"} = 2;
    }

    my @yes_types = qw{
        foreign
        catachrestic
        obsolete
        crossreference
        illustrations
        regular
    };

    # --showkey displays the raw entry, which is verbosity level 3.
    if ($Opt{"showkey"}) {
        $Opt{"verbose"} = 3;
    }

    my @no_types = map { "no$_" } @yes_types;

    if ( ( grep { exists $Opt{$_} } @no_types ) && ( grep { exists $Opt{$_} } @yes_types ) ) {
        # can't have both
        pod2usage("Usage error: incompatible mix of yes and no options");
    }

    if (my @no_opts = grep { $Opt{$_} } @no_types) {
        # Exclusions given: start from every type, then switch off the
        # ones that were named.
        s/^no// for @no_opts;
        debug("opt set 1");
        @Opt{ @yes_types } = (1) x @yes_types;
        @Opt{ @no_opts   } = (0) x @no_opts;
    }
    elsif (grep { $Opt{$_} } @yes_types) {
        debug("opt set 2");
        # then we're fine, use only these
    } else {
        debug("opt set 3");
        # neither yes nor no, so turn all yeses on
        @Opt{ @yes_types } = (1) x @yes_types;
        unless ($Opt{everything}) {
            $Opt{"illustrations"}  = 0;
            $Opt{"obsolete"}       = 0;
            $Opt{"catachrestic"}   = 0;
            $Opt{"crossreference"} = 0;
        }
    }

    if ($Opt{"headwords-only"}) {
        $Opt{"illustrations"} = 0;
        $Opt{"crossreference"} = 0;
    }

    dump("post getopt options are", \%Opt);

    pod2usage(0)                                 if $Opt{help};
    pod2usage(-exitstatus => 0, -verbose => 2)   if $Opt{man};

    # Only options were given: fall back to "." as the sole operand.
    unless ($Opt{pattern} || @ARGV) {
        @ARGV = (".");
        # pod2usage("$0: expected arguments\n");
    }

    # An operand containing any non-letter is a pattern, not a look
    # string; a leading slash (with an optional trailing one) is dropped.
    if (!$Opt{pattern} && $ARGV[0] =~ /\PL/) {
        $Opt{pattern} = shift @ARGV;
        $Opt{pattern} =~ s#^/## && $Opt{pattern} =~ s#/$##;
    }

}
845
846
847#################################################################
848
# Look mode: hand the letters of the operands to the system look(1)
# program and pass every line it prints through idem_print().
# Takes no arguments; returns the number of lines look produced.
sub run_look {
    ARGCOUNT() if @_;

    validate_database();

    # Decompose, lowercase, and keep nothing but letters.
    (my $look_word = lc(NFD("@ARGV"))) =~ s/\PL+//g;

    # because otherwise the look program misbehaves;
    #   	env LC_ALL=C
    #
    $ENV{LC_ALL} = "C";

    my $lookpath = locate_program("look")
        or die "no look program";

    my $look_fh;
    if ($] < 5.013_000) {
        # can't do this many arguments in old perls
        open($look_fh,            "$lookpath '$look_word' '$DB_Name' |");
        binmode($look_fh, ":utf8");
    }
    else {
        open($look_fh, "-| :utf8", $lookpath, $look_word,  $DB_Name,   );
    }

    my $found = 0;

    while (defined(my $line = readline $look_fh)) {
        idem_print($line);
        $found++;
    }

    # The exit status of look is deliberately not treated as an error.
    eval { close $look_fh };

    all_done();

    debug("returning $found matched");

    return $found;
}
894
895
896#################################################################
897
# Grep mode: compile $Opt{pattern} as a Perl regex and pass every
# database line that matches it, as read or after NFC normalisation,
# through idem_print().  Returns the number of matching lines.
sub run_grep {
    validate_database();

    my $search_string = NFD $Opt{pattern};

    unless (length $search_string) {
        die "$0: bad search string $search_string\n";
    }

    # ASCII quotes in the pattern stand for the typographic ones.
    $search_string =~ tr/`'/\N{lsquo}\N{rsquo}/;

    # Any warning raised from here to the end of this sub is fatal.
    local $SIG{__WARN__} = sub { die "FATALIZED WARNING: @_" };

    # Compiled through a string eval so that \N{...} names and the
    # file's charnames aliases are resolved inside the user's pattern.
    my $pattern = eval qq{ qr{$search_string} };
    die if $@;

    open(my $raw_db, "< :utf8", $DB_Name);

    my $found = 0;

    while (defined(my $line = readline $raw_db)) {
        unless ($line =~ $pattern || NFC($line) =~ $pattern) {
            next;
        }
        $found++;
        idem_print($line);
    }
    close $raw_db;

    all_done();

    return $found;
}
929
930#################################################################
931
# Fuzzy mode: run agrep(1) with -B on the letters of the operands, with
# yes(1) piped into its standard input, and pass each output line
# through idem_print().  Takes no arguments; returns the number of
# lines agrep printed.
sub run_agrep {
    ARGCOUNT() if @_;

    validate_database();

    # Decompose, lowercase, and keep nothing but letters.
    (my $agrep_word = lc(NFD("@ARGV"))) =~ s/\PL+//g;

    my $agrep_path = locate_program("agrep")
        or die "no agrep program";

    my $yes_path = locate_program("yes")
        or die "no yes program";

    my $arg_string = join " ",
        $yes_path, "|",
        $agrep_path, "-B", "'$agrep_word'", "'$DB_Name'",
        "2>/dev/null", "|";
    debug("running  $arg_string");

    open(my $agrep_fh, $arg_string);
    binmode($agrep_fh, ":utf8");

    my $found = 0;

    while (defined(my $line = readline $agrep_fh)) {
        idem_print($line);
        $found++;
    }

    # The exit status of the pipeline is deliberately not checked.
    eval { close $agrep_fh };

    all_done();

    debug("returning $found matched");

    return $found;
}
972
973#################################################################
974
# Filter and format one raw database line, then hand it to display().
#
# Takes exactly one argument, the raw line.  Everything up to the last
# tab is discarded; a line without a tab is an internal error.  The
# entry is then dropped or kept according to the entry-type options and
# the part-of-speech options, trimmed according to the verbosity level,
# and displayed.  At verbosity 3 the untrimmed (NFC) line is displayed.
#
# --pos keeps only entries carrying ALL listed parts of speech;
# --nopos drops entries carrying ANY listed part of speech.
sub idem_print {
    ARGCOUNT() unless @_ == 1;

    my $entry = NFC shift();

    local $_ = $entry;
    s/.*\t// || panic("malformed input");

    if (/\N{LEFTWARDS ARROW}/) {
        debug("filter left arrow");
        return if     $Opt{"headwords-only"};
    }

    if (/\N{RIGHTWARDS ARROW}/) {
        debug("filter right arrow");
        return if     $Opt{"headwords-only"};
    }

    # Entry type is recognised by the classification glyph that follows
    # the leading space.
    if (/^ \N{ill} /) {
        debug("filter ill");
        return if     $Opt{"headwords-only"};
        return unless $Opt{illustrations};
    }
    elsif (/^ \N{ali}/) {
        debug("filter ali");
        return unless $Opt{"foreign"};
    }
    elsif (/^ \N{xref}/) {
        debug("filter xref");
        return unless $Opt{"crossreference"};
    }
    elsif (/^ [\N{spu}\N{err}]/) {
        debug("filter spu");
        return unless $Opt{"catachrestic"};
    }
    elsif (/^ \N{obs}/) {
        debug("filter obs");
        return unless $Opt{"obsolete"};
    }
    else {
        debug("filter regular");
        return unless $Opt{"regular"};
    }

    if ($Opt{"part-of-speech"}) {
        debug("filter pos yes");
        # pos_filtered() is true when some requested part is missing.
        return if pos_filtered($_, $Opt{"part-of-speech"});
    }

    if ($Opt{"nopart-of-speech"}) {
        debug("filter pos no");
        # Test the excluded parts one at a time: pos_filtered() is false
        # only when every part it is given is present, so a single call
        # with the whole list would drop only entries having ALL of them.
        my @unwanted = grep { length }
                       split /[.,\h]+/ => $Opt{"nopart-of-speech"};
        for my $unwanted (@unwanted) {
            return unless pos_filtered($_, $unwanted);
        }
    }

    debug("FILTER FALLTHRU");

    unless ($Opt{verbose}) {
        s/\h\[.*//;
        s/\h\N{LEFTWARDS ARROW}.*//;
        s/\h\N{RIGHTWARDS ARROW}.*//;
    }

    {
        # A bare block counts as a loop that runs once, so "next" here
        # simply skips the ligature rewriting below for the listed
        # words, which keep their ligatures.
        next if m{
	    \b (?:

		     \N{ae}lfe?

                 |   \N{oe}il
                 |   \N{oe}illade
                 |   \N{oe}ufs?
                 |   \N{oe}uvres?
                 |  b\N{oe}ufs?
                 |  c\N{oe}urs?
                 | ch\N{oe}nix
                 |  m\N{oe}urs
                 |  v\N{oe}ux?

	    ) \b
	  |  (?<!man|ped)\N{oe}uvr
	  | c\N{oe}ur
	}xi;

        unless ( /\N{ae}\N{acute}|[\N{eth}\N{thorn}]/ ) {
            s{ \N{AE} }{Ae}xg;
            s{ \N{ae} }{ae}xg;
        }
        s{ \N{OE} }{Oe}xg;
        s{ \N{oe} }{oe}xg;
    }

    # Below verbosity 2, strip the classification, stress and dot marks.
    unless ($Opt{verbose} > 1)  {
        s/^\h+//;
        s/[\N{ali}\N{xref}\N{spu}\N{err}\N{obs}\N{ill}]\h*//g;
        s/[\N{stress1}\N{stress2}]//g;
        s/[\N{MIDDLE DOT}\N{ONE DOT LEADER}]//g;
    }

    display($Opt{verbose} < 3 ? $_ : $entry);

}
1076
# Decide whether an entry lacks any of the requested parts of speech.
#
#   $entry     the entry text; its parts of speech are whatever sits
#              between the first "[" and the last "]" on the line
#   $pos_list  part-of-speech names separated by commas, dots, or
#              horizontal whitespace
#
# Returns 1 as soon as one requested part is missing from the entry,
# 0 when all of them are present.  Dies on an unknown part name.
#
# No prototype is declared: the only call site precedes this
# definition, so a prototype could not be applied there and merely
# triggered a "called too early to check prototype" warning.
sub pos_filtered {
    my ($entry, $pos_list) = @_;

    # Built once: every accepted name mapped to the pattern for the
    # abbreviation used in the database.  Each row is one pattern
    # followed by all the names that select it.  "comparitive" is a
    # historical misspelling kept next to the correct "comparative".
    state $pos_map = do {
        my @table = (
            [ qr{ \b abbr     \. }x => qw( abbreviation abbrev abbr ) ],
            [ qr{ \b absol    \. }x => qw( absolute absol abs ) ],
            [ qr{ \b adj      \. }x => qw( adjective adj a ) ],
            [ qr{ \b adv      \. }x => qw( adverb adv ) ],
            [ qr{ \b advb     \. }x => qw( adverbial advb ) ],
            [ qr{ \b attrib   \. }x => qw( attributive attrib attr ) ],
            [ qr{ \b comb     \. }x => qw( combining comb ) ],
            [ qr{ \b compar   \. }x => qw( comparative comparitive compar ) ],
            [ qr{ \b comp     \. }x => qw( compound comp ) ],
            [ qr{ \b conj     \. }x => qw( conjunction conj ) ],
            [ qr{ \b contr    \. }x => qw( contraction contr cont ) ],
            [ qr{ \b dem      \. }x => qw( demonstrative demon dem ) ],
            [ qr{ \b fem      \. }x => qw( feminine fem ) ],
            [ qr{ \b imp      \. }x => qw( impersonal impers imp ) ],
            [ qr{ \b indef    \. }x => qw( indefinite indef ind ) ],
            [ qr{ \b inf      \. }x => qw( infinitive infin inf ) ],
            [ qr{ \b int      \. }x => qw( interjection interj int ) ],
            [ qr{ \b interrog \. }x => qw( interrogative interrog interr inter ) ],
            [ qr{ \b intr     \. }x => qw( intransitive intrans intr ) ],
            [ qr{ \b masc     \. }x => qw( masculine masc ) ],
            [ qr{ \b name     \b }x => qw( name ) ],
            [ qr{ \b n        \. }x => qw( noun n ) ],
            [ qr{ \b numeral  \b }x => qw( numeral num ) ],
            [ qr{ \b pple?    \. }x => qw( participial part pple ppl participle ) ],
            [ qr{ \b particle \b }x => qw( particle ) ],
            [ qr{ \b pa       \. }x => qw( past pa ) ],
            [ qr{ \b pers     \. }x => qw( personal pers ) ],
            [ qr{ \b phr      \. }x => qw( phrasal phr ) ],
            [ qr{ \b phrase   \b }x => qw( phrase ) ],
            [ qr{ \b pl       \. }x => qw( plural pl ) ],
            [ qr{ \b poss     \. }x => qw( possessive poss ) ],
            [ qr{ \b pred     \. }x => qw( predicate pred ) ],
            [ qr{ \b pref     \. }x => qw( prefix pref ) ],
            [ qr{ \b prep     \. }x => qw( preposition prep ) ],
            [ qr{ \b pres     \. }x => qw( present pres pr ) ],
            [ qr{ \b (?:pron|pers) \. }x => qw( pronoun pron pro ) ],
            [ qr{ \b rel      \. }x => qw( relative rel ) ],
            [ qr{ \b sing     \. }x => qw( singular sing sg ) ],
            [ qr{ \b suff     \. }x => qw( suffix suff ) ],
            [ qr{ \b superl   \. }x => qw( superlative superl super ) ],
            [ qr{ \b trans    \. }x => qw( transitive trans tr ) ],
            [ qr{ \b v        \. }x => qw( verb v ) ],
            [ qr{ \b vbl      \. }x => qw( verbal vbl ) ],

            # affix => qr{ \b (?: suf | pre ) f     \. }x,
        );

        my %pattern_for;
        for my $row (@table) {
            my ($pattern, @names) = @$row;
            @pattern_for{@names} = ($pattern) x @names;
        }
        \%pattern_for;
    };

    my @want_parts = split /[.,\h]+/ => $pos_list;
    my $have_parts = $entry =~ m{ \[ (.+) \] }x ? $1 : q();

    for my $want (@want_parts) {
        my $pat = $pos_map->{$want};
        die "$0: No such part of speech as <$want>.\n" unless defined $pat;
        return 1 unless $have_parts =~ $pat;
    }
    return 0;
}
1199
# Show one formatted entry, once.  A string that has been displayed
# before is silently skipped.  The pager is started before the first
# entry; with --sort the entry is put aside for all_done() instead of
# being printed straight away.
sub display {
    ARGCOUNT() unless @_ == 1;

    my ($line) = @_;

    state $already_shown = {};
    state $pager_started;

    return if $already_shown->{$line}++;

    start_pager() unless $pager_started++;

    $Shown_Count++;

    if (!$Opt{sort}) {
        print $line;
    }
    else {
        treasure_up($line);
    }

}
1221
{   # Lines put aside by treasure_up() until all_done() sorts them.
    my @saved_lines;

    # Remember one line for sorted output later.
    sub treasure_up {
        ARGCOUNT() unless @_ == 1;
        my ($line) = @_;
        push @saved_lines, $line;
    }

    # Print everything that treasure_up() collected, in Unicode
    # collation order.  Does nothing when no lines were saved.
    sub all_done {
        ARGCOUNT() unless @_ == 0;

        return unless @saved_lines;

        require Unicode::Collate;

        # Custom collation entries for "[n.", "[" and "]"; each sort key
        # is first passed through reduce_for_sorting().
        my $sorter = Unicode::Collate->new(
            upper_before_lower => 1,
            preprocess         => \&reduce_for_sorting,
            entry              => deQ(<<'END_OF_OVERRIDE'),
             |Q|        005B 006E 002E ; [.0200.0020.0002.0391] # [n.
             |Q|        005B           ; [.0220.0020.0002.0392] # [
             |Q|        005D           ; [.0225.0020.0002.0395] # ]
END_OF_OVERRIDE
        );

        for my $line ($sorter->sort(@saved_lines)) {
            print $line;
        }
    }

}
1250
# Collation preprocessor: turn a line into the text it should sort by.
# Takes one string and returns the reduced copy.
sub reduce_for_sorting {
    ARGCOUNT() unless @_ == 1;

    my ($key) = @_;

    # Ignore everything from the first arrow onward.
    $key =~ s/[\N{LEFTWARDS ARROW}\N{RIGHTWARDS ARROW}].*//;

    # Zero-pad runs of digits so that numbers compare by value.
    $key =~ s/(\d+)/sprintf("%020d", $1)/ge;

    # With --showkey the line still has its key field; drop it.
    $key =~ s/^.*\t// if $Opt{showkey};

    return $key;
}
1264
1265#################################################################
1266
# True when %DB::sub has any entries, which this program takes to mean
# that it is running under the Perl debugger.
sub am_running_perldb {
    no warnings "once";
    my $known_subs = keys %DB::sub;
    return $known_subs > 0;
}
1271
# Redirect STDOUT into a pager.  Nothing happens under the debugger,
# with --nopager, when STDOUT is not a terminal, or when no pager
# program can be located.
sub start_pager {
    ARGCOUNT() unless @_ == 0;

    return if am_running_perldb() || $Opt{nopager} || !-t STDOUT;

    # First choice is $PAGER, then the usual suspects in order.
    my $his_pager;
    for my $candidate ($ENV{PAGER}, qw(less more type)) {
        last if $his_pager = locate_program($candidate);
    }

    return unless $his_pager;

    # Set only while the pager is being launched; it inherits the value.
    local $ENV{LESSCHARSET} = "utf-8" if $his_pager =~ /\bless\b/i;
    open(STDOUT, "|- :utf8", $his_pager);
}
1291
1292#################################################################
1293
# Find a readable text file by name.  An absolute name is checked as
# given; a relative one is searched for in a list of dictionary and
# configuration directories, then @INC, then the PATH directories, then
# the home directory (or "." when none is known).
# Returns the pathname found, or nothing.
sub locate_textfile {
    ARGCOUNT() unless @_ == 1;

    my ($textfile) = @_;

    return unless defined $textfile && length $textfile;

    return is_legible($textfile)
        if File::Spec->file_name_is_absolute($textfile);

    my @maybe_dirs = (
        qw{
            /usr/local/share/dict
            /usr/share/dict
            /usr/local/etc
            /etc
            /opt/local/etc
            /opt/local/etc/dict
        },
        @INC,
        File::Spec->path(),
        $ENV{HOME} || $ENV{LOGDIR} || ".",
    );

    for my $dir (@maybe_dirs) {
        my $dbpath = is_legible(File::Spec->catfile($dir, $textfile));
        return $dbpath if $dbpath;
    }

    return;
}
1326
# Find an executable by name.  An absolute name is checked as given; a
# relative one is looked up in each PATH directory in turn.
# Returns the pathname found, or nothing (also for an undefined or
# empty name).
sub locate_program {
    ARGCOUNT() unless @_ == 1;

    my ($program) = @_;

    return unless defined $program && length $program;

    return is_runnable($program)
        if File::Spec->file_name_is_absolute($program);

    for my $dir (File::Spec->path()) {
        my $runpath = is_runnable(File::Spec->catfile($dir, $program));
        return $runpath if $runpath;
    }

    return;
}
1349
# Return $fullpath if it names a plain, readable text file.  On the
# platforms listed in stupid_evil_and_wrong(), a name without a ".txt"
# suffix is retried with that suffix appended.  Returns an empty list
# otherwise.
sub is_legible {
    ARGCOUNT() unless @_ == 1;
    my $fullpath = $_[0];

    if (-f $fullpath && -r _ && -T $fullpath) {
        return $fullpath;
    }
    elsif (stupid_evil_and_wrong()  &&  $fullpath !~ /\.txt\z/i) {
        # The retry must apply the same readable-text test; going
        # through is_runnable() would demand an executable instead and
        # then go looking for "NAME.txt.exe".
        return is_legible("$fullpath.txt");
    }
    else {
        return ();
    }

    NOT_REACHED();
}
1366
# Return $candidate if it is executable and not a directory.  On the
# platforms listed in stupid_evil_and_wrong(), a name without an ".exe"
# suffix is retried with that suffix appended.  Returns an empty list
# otherwise.
sub is_runnable {
    ARGCOUNT() unless @_ == 1;
    my ($candidate) = @_;

    return $candidate
        if -x $candidate && ! -d _;

    return is_runnable("$candidate.exe")
        if stupid_evil_and_wrong()  &&  $candidate !~ /\.exe\z/i;

    return ();
}
1383
# Does the lowercased operating system name appear in the list of
# platforms that get the suffix fallbacks (".exe", ".txt")?
# Returns the matching names (their count in scalar context).
sub stupid_evil_and_wrong {
    my @suffix_platforms = qw<dos os2 netware symbian mswin32>;
    my $this_os          = lc $OSNAME;
    return grep { $_ eq $this_os } @suffix_platforms;
}
1388
1389#################################################################
1390
# Print the arguments, space-separated and newline-terminated, to
# STDERR, but only when --debug is on.  At least one argument is
# required.
sub debug {
    my @words = @_;

    ARGCOUNT() if @words == 0;

    return unless $Opt{debug};

    print STDERR "@words\n" if @words;
}
1396
# Debugging aid: print a message followed by a dump of the referenced
# data, but only when --debug is on.  Takes exactly two arguments, the
# message and the reference.
#
# The output goes to STDERR, like debug(), so that diagnostics never
# end up mixed into the results on STDOUT.
sub dump {
    ARGCOUNT() unless @_ == 2;
    state $dumper;

    return unless $Opt{debug};

    require Dumpvalue;
    $dumper ||= Dumpvalue->new;

    my($message, $ref) = @_;

    # Dumpvalue writes to the currently selected handle, so point that
    # at STDERR for the duration and put it back afterwards.
    my $previous_handle = select(STDERR);
    say "$message: ";
    $dumper->dumpValue($ref);
    say "";
    select($previous_handle);
}
1414
# Abort with a stack trace; for conditions that indicate a bug in this
# program rather than a user error.
sub panic {
    my $complaint = "@_";
    confess("$0: INTERNAL ERROR: $complaint");
}
1418
# Marks a spot that control flow should never arrive at; panics if it does.
sub NOT_REACHED {
    my $complaint = "NOT REACHED";
    panic($complaint);
}
1422
# Called by the other subs when they are handed the wrong number of
# arguments; panics.
sub ARGCOUNT {
    my $complaint = "wrong arguments to function";
    panic($complaint);
}
1426
1427
# Strip a margin marker from every line of a block of text: leading
# whitespace, the literal $leader, and at most one space after it.
# Returns the stripped copy.
sub dequeue($$) {
    my $leader = shift;
    my $body   = shift;

    my $margin = qr/^\s*\Q$leader\E ?/m;
    $body =~ s/$margin//g;

    return $body;
}
1433
# Strip the "|Q|" margin marker from each line of the given text.
sub deQ($) {
    my ($text) = @_;
    my $marker = q<|Q|>;
    return dequeue($marker, $text);
}
1438
# Strip the "|QQ|" margin marker from each line of the given text.
sub deQQ($) {
    my ($text) = @_;
    my $marker = qq<|QQ|>;
    return dequeue($marker, $text);
}
1443
1444#################################################################
1445#################################################################
1446#################################################################
1447
1448__END__
1449
1450#################################################################
1451
1452=head1 NAME
1453
1454word - display words starting or matching a string or pattern
1455
1456=head1 SYNOPSIS
1457
1458word [options] [string | pattern]
1459
1460Given a string, show all words starting with that string (look mode).
1461Given a pattern, show all lines matching that pattern (grep mode).
1462
1463An argument with non-alphabetic characters is always a pattern.
1464Force grep mode with B<--grep=pattern> or by starting the pattern
1465with a slash, which will be ignored.
1466
1467Use B<--man> to get the full manpage.
1468
1469=head1 DESCRIPTION
1470
1471Search a large list of words in one of two modes.  In look mode,
1472only words starting with the given string are displayed.  This
1473mode runs very quickly.  Only purely alphabetic strings are allowed.
1474The system look(1) program is co-opted into helping.
1475
1476In grep mode, any entries matching the pattern are shown.  This
1477takes much longer to run, because the entire 26 megabyte file must
1478be grepped through.  The pattern is not a grep(1) pattern, but
1479rather a perl(1) pattern.  You may use Unicode named characters,
1480plus several custom aliases, in your pattern.
1481
1482=head1 EXAMPLES
1483
1484Look up terms starting with "cat":
1485
1486    % word cat
1487
1488The same, but bump verbose display level to see parts of speech:
1489
1490    % word -v cat
1491
1492Look at only verbs starting with cat:
1493
1494    % word -pv cat
1495
1496Look at all "cat" entries, with verbose set high:
1497
1498    % word -A cat
1499
1500Look for all (irregular) plurals that start with "ex":
1501
1502    % word -ppl ex
1503
1504Look for obsolete prefixes that start with "s":
1505
1506    % word -o -ppref s
1507
1508Grep terms with "cat" anywhere at all:
1509
1510    % word --grep cat
1511    % word /cat
1512
1513Grep terms containing "cat" or "cats" surrounded by
1514word boundaries:
1515
1516    % word '\bcats?\b'
1517
1518Grep terms with the Unicode "Mark" property:
1519
1520    % word '\pM'
1521
1522Grep all plurals ending in "-ata":
1523
1524    % word -A -ppl 'ata\b'
1525
1526Grep terms with the Unicode "Dash" property:
1527
1528    % word '\p{Dash}'
1529
1530Grep for an "e" with an acute accent:
1531
1532    % word '\N{eacute}'
1533
1534Grep for any acute accents no matter the letter:
1535
1536    % word '\N{acute}'
1537
Grep for terms containing an "a", "o", or "u" in any case, followed
by a diaeresis:
1540
1541    % word '(?i)[oau]\N{dier}'
1542
1543=head1 OPTIONS
1544
1545Display options are:
1546
1547    --verbose / -v	use up to three times for more verbosity
1548
1549	level 0 is just the word, like look
1550	level 1 includes parts of speech
1551	level 2 also includes assorted markings
1552	level 3 is the entire original entry
1553
1554    --nopager		never call the pager
1555
1556Part of speech filtering options are:
1557
1558    --pos /   -p POS    only entries matching all POS shown
1559    --nopos / -P POS    no   entries matching any POS shown
1560
1561    POS is a comma-separated list of parts of speech like
1562    n/noun, v/verb, a/adjective, adv/adverb, pro/pronoun,
1563    and pl/plural.
1564
1565Type of entry filtering options are:
1566
1567    --headwords      -h	 show headwords only
1568    --everything     -a	 include all types of entry
1569    --all-verbose    -A  all entries, plus sets verbose to 2
1570
Some entries contain markings telling what kind of entry they are.
1572Include or exclude such entries using:
1573
1574    --normal         -n  normal entries (on by default)
1575    --foreign        -f  unassimilated entries (on by default)
1576
1577    --obsolete       -o  obsolete entries (off by default)
    --catachrestic   -e  catachrestic entries (off by default)
1579    --illustrations  -i  illustrative examples (off by default)
1580    --crossref       -x  crossrefs w/old spellings (off by default)
1581
1582The previous six entry types can be excluded using the corresponding
1583B<--noXXX> long option or the capitalized short option; e.g.,
1584B<--noforeign> is equivalent to B<-F>.
1585
1586Other options:
1587
1588    --version		print version info and exit
1589    --help		this help page
1590    --man		the full manpage
1591    --debug		internal debugging
1592
1593    --fuzzy          -z use agrep(1) fuzzy matching in "best mode"
1594    --all-fuzzy      -Z like -zavv
1595
1596=head1 PATTERN SHORTCUTS
1597
Besides all normal Perl pattern syntax, an extensive set of
named characters is provided for mnemonic convenience so you
don't have to write numeric code points like C<\x{3b2}>
for non-ASCII characters.
1602
1603=over
1604
1605=item *
1606
1607The full Unicode name, like
1608C<\N{EN DASH}> or
1609C<\N{LATIN SMALL LETTER THORN}>, or
1610Latin or Greek letter names, like
1611C<\N{thorn}> or
1612C<\N{alpha}>.
1613
1614=item *
1615
HTML abbreviations like
1617C<\N{eacute}>,
1618C<\N{ccedil}>,
1619C<\N{iuml}>.
1620
1621=item *
1622
1623Diacritic abbreviations:
1624C<\N{macron}>,
1625C<\N{acute}>,
1626C<\N{grave}>,
C<\N{diaeresis}>,
C<\N{dier}>,
C<\N{circumflex}>,
1630C<\N{circ}>,
1631and
1632C<\N{tilde}>;
1633C<\N{stress1}> and
1634C<\N{stress2}>.
1635
1636=item *
1637
1638Abbreviations for the type of entry:
1639
1640C<\N{ali}> (unassimilated),
1641C<\N{obs}> (obsolete),
1642C<\N{xref}> (crossreference),
1643C<\N{ill}> (illustrative),
1644C<\N{spu}> (catachrestic), and
1645C<\N{err}> (erroneous).
1646
1647=back
1648
1649=head1 ERRORS
1650
1651TO BE WRITTEN: ERRORS
1652
1653=head1 ENVIRONMENT
1654
1655PAGER
1656
1657=head1 FILES
1658
1659F<words.utf8>
1660
1661=head1 PROGRAMS
1662
1663F<look>, F<agrep>
1664
1665=head1 BUGS
1666
1667TO BE WRITTEN: BUGS
1668
1669=head1 SEE ALSO
1670
1671perlre(1), perlunicode(1)
1672
1673=head1 AUTHOR
1674
1675TO BE WRITTEN: AUTHOR
1676
1677=head1 COPYRIGHT AND LICENCE
1678
1679TO BE WRITTEN: COPYRIGHT AND LICENCE
1680