1#!/usr/local/bin/perl 2 3################################################################# 4# 5# unilook - improved version of look(1) program for Unicode 6# 7################################################################# 8 9use strict; 10use 5.010_000; 11use if $] > 5.010, "autodie"; 12use warnings; # qw[ FATAL all ]; 13 14our $VERSION = v0.6.0; 15 16# In case customer charnames files are in bin not lib... 17use FindBin; 18use lib $FindBin::Bin; 19 20# XXX: These aliases should be in separate files 21# but that would require a more elaborate deployment strategy. 22# Note that these are ordered, and that therefore dupes that 23# occur later are meant to override earlier entries. 24 25use charnames ( 26 ":full" , 27 ":short" , 28 29 "latin" , 30 "greek" , 31 32 ":alias" => 33 { 34 35 "Aacu" => "LATIN CAPITAL LETTER A WITH ACUTE", # Á U+00C1 36 "aacu" => "LATIN SMALL LETTER A WITH ACUTE", # á U+00E1 37 "Acirc" => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", #  U+00C2 38 "acirc" => "LATIN SMALL LETTER A WITH CIRCUMFLEX", # â U+00E2 39 "acu" => "COMBINING ACUTE ACCENT", # ́ U+0301 40 "AE" => "LATIN CAPITAL LETTER AE", # Æ U+00C6 41 "Ae" => "LATIN CAPITAL LETTER AE", # Æ U+00C6 42 "ae" => "LATIN SMALL LETTER AE", # æ U+00E6 43 "Agrave" => "LATIN CAPITAL LETTER A WITH GRAVE", # À U+00C0 44 "agrave" => "LATIN SMALL LETTER A WITH GRAVE", # à U+00E0 45 "Alpha" => "GREEK CAPITAL LETTER ALPHA WITH TONOS", # Ά U+0386 46 "alpha" => "GREEK SMALL LETTER ALPHA", # α U+03B1 47 "ang" => "LATIN SMALL LETTER A WITH RING ABOVE", # å U+00E5 48 "Asg" => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", #  U+00C2 49 "asg" => "LATIN SMALL LETTER G WITH DOT ABOVE", # ġ U+0121 50 "asper" => "COMBINING REVERSED COMMA ABOVE", # ̔ U+0314 51 "Auml" => "LATIN CAPITAL LETTER A WITH DIAERESIS", # Ä U+00C4 52 "auml" => "LATIN SMALL LETTER A WITH DIAERESIS", # ä U+00E4 53 "bbar" => "LATIN SMALL LETTER B WITH STROKE", # ƀ U+0180 54 "Beta" => "GREEK CAPITAL LETTER BETA", # Β U+0392 55 "beta" => "GREEK SMALL LETTER BETA", # β U+03B2 56 "breve" => "COMBINING BREVE", # ̆ U+0306 57 "Ccdil" => "LATIN CAPITAL LETTER C WITH CEDILLA", # Ç U+00C7 58 "ccdil" => "LATIN SMALL LETTER C WITH CEDILLA", # ç U+00E7 59 "cdil" => "COMBINING CEDILLA", # ̧ U+0327 60 "cdl" => "LATIN SMALL LETTER C WITH CEDILLA", # ç U+00E7 61 "cent" => "CENT SIGN", # ¢ U+00A2 62 "Chi" => "GREEK CAPITAL LETTER CHI", # Χ U+03A7 63 "chi" => "GREEK SMALL LETTER CHI", # χ U+03C7 64 "circ" => "COMBINING CIRCUMFLEX ACCENT", # ̂ U+0302 65 "circbl" => "COMBINING CIRCUMFLEX ACCENT BELOW", # ̭ U+032D 66 "dag" => "DAGGER", # † U+2020 67 "deg" => "DEGREE SIGN", # ° U+00B0 68 "Delta" => "GREEK CAPITAL LETTER DELTA", # Δ U+0394 69 "delta" => "GREEK SMALL LETTER DELTA", # δ U+03B4 70 "div" => "DIVISION SLASH", # ∕ U+2215 71 "dollar" => "DOLLAR SIGN", # $ U+0024 72 "dotab" => "COMBINING DOT ABOVE", # ̇ U+0307 73 "dotbl" => "COMBINING DOT BELOW", # ̣ U+0323 74 "dubh" => "HYPHEN", # ‐ U+2010 75 "Eacu" => "LATIN CAPITAL LETTER E WITH ACUTE", # É U+00C9 76 "eacu" => "LATIN SMALL LETTER E WITH ACUTE", # é U+00E9 77 "Ecirc" => "LATIN CAPITAL LETTER E WITH CIRCUMFLEX", # Ê U+00CA 78 "ecirc" => "LATIN SMALL LETTER E WITH CIRCUMFLEX", # ê U+00EA 79 "Edh" => "LATIN CAPITAL LETTER ETH", # Ð U+00D0 80 "edh" => "LATIN SMALL LETTER ETH", # ð U+00F0 81 "Egrave" => "LATIN CAPITAL LETTER E WITH GRAVE", # È U+00C8 82 "egrave" => "LATIN SMALL LETTER E WITH GRAVE", # è U+00E8 83 "Epsilon" => "GREEK CAPITAL LETTER EPSILON", # Ε U+0395 84 "epsilon" => "GREEK SMALL LETTER EPSILON", # ε U+03B5 85 "Eta" => "GREEK CAPITAL LETTER ETA", # Η U+0397 86 "eta" => "GREEK SMALL LETTER BETA", # β U+03B2 87 "Eth" => "LATIN CAPITAL LETTER ETH", # Ð U+00D0 88 "eth" => "LATIN SMALL LETTER ETH", # ð U+00F0 89 "Euml" => "LATIN CAPITAL LETTER E WITH DIAERESIS", # Ë U+00CB 90 "euml" => "LATIN SMALL LETTER E WITH DIAERESIS", # ë U+00EB 91 "fata" => "LATIN SMALL LETTER ALPHA", # ɑ U+0251 92 "fatax" => "LATIN SMALL LETTER ALPHA", # ɑ U+0251 93 "fatpara" => "DOUBLE-STRUCK CAPITAL P", # ℙ U+2119 94 "frown" => "COMBINING BREVE", # ̆ U+0306 95 "Gamma" => "GREEK CAPITAL LETTER GAMMA", # Γ U+0393 96 "gamma" => "GREEK SMALL LETTER GAMMA", # γ U+03B3 97 "ge" => "GREATER-THAN OR EQUAL TO", # ≥ U+2265 98 "grave" => "COMBINING GRAVE ACCENT", # ̀ U+0300 99 "gt" => "GREATER-THAN SIGN", # > U+003E 100 "h01" => "HEBREW LETTER HET", # ח U+05D7 101 "h02" => "ARABIC SHADDA", # ّ U+0651 102 "hacek" => "COMBINING CARON", # ̌ U+030C 103 "hash" => "NUMBER SIGN", # # U+0023 104 "hbar" => "LATIN SMALL LETTER H WITH STROKE", # ħ U+0127 105 "hgz" => "LATIN SMALL LETTER Z WITH HOOK", # ȥ U+0225 106 "hook" => "COMBINING CEDILLA", # ̧ U+0327 107 "ia" => "LATIN SMALL LETTER ALPHA", # ɑ U+0251 108 "Iacu" => "LATIN CAPITAL LETTER I WITH ACUTE", # Í U+00CD 109 "iacu" => "LATIN SMALL LETTER I WITH ACUTE", # í U+00ED 110 "ib" => "GREEK SMALL LETTER BETA", # β U+03B2 111 "Icirc" => "LATIN CAPITAL LETTER I WITH CIRCUMFLEX", # Î U+00CE 112 "icirc" => "LATIN SMALL LETTER I WITH CIRCUMFLEX", # î U+00EE 113 "id" => "GREEK SMALL LETTER DELTA", # δ U+03B4 114 "ie" => "LATIN SMALL LETTER OPEN E", # ɛ U+025B 115 "ig" => "LATIN SMALL LETTER GAMMA", # ɣ U+0263 116 "Igrave" => "LATIN CAPITAL LETTER I WITH GRAVE", # Ì U+00CC 117 "igrave" => "LATIN SMALL LETTER I WITH GRAVE", # ì U+00EC 118 "ih" => "GREEK SMALL LETTER ETA", # η U+03B7 119 "ii" => "LATIN SMALL LETTER IOTA", # ɩ U+0269 120 "infin" => "INFINITY", # ∞ U+221E 121 "Iota" => "GREEK CAPITAL LETTER IOTA", # Ι U+0399 122 "iota" => "GREEK CAPITAL LETTER CHI", # Χ U+03A7 123 "iq" => "GREEK SMALL LETTER THETA", # θ U+03B8 124 "isub" => "COMBINING GREEK YPOGEGRAMMENI", # ͅ U+0345 125 "Iuml" => "LATIN CAPITAL LETTER I WITH DIAERESIS", # Ï U+00CF 126 "iuml" => "LATIN SMALL LETTER I WITH DIAERESIS", # ï U+00EF 127 "iz" => "GREEK SMALL LETTER ZETA", # ζ U+03B6 128 "Kappa" => "GREEK CAPITAL LETTER KAPPA", # Κ U+039A 129 "kappa" => "GREEK SMALL LETTER KAPPA", # κ U+03BA 130 "Lambda" => "GREEK CAPITAL LETTER LAMDA", # Λ U+039B 131 "lambda" => "GREEK SMALL LETTER LAMDA", # λ U+03BB 132 "lar" => "LEFTWARDS ARROW", # ← U+2190 133 "Lbar" => "LATIN CAPITAL LETTER L WITH STROKE", # Ł U+0141 134 "lbar" => "LATIN CAPITAL LETTER O WITH MACRON", # Ō U+014C 135 "le" => "LESS-THAN OVER EQUAL TO", # ≦ U+2266 136 "lenis" => "COMBINING REVERSED COMMA ABOVE", # ̔ U+0314 137 "lm" => "MODIFIER LETTER TRIANGULAR COLON", # ː U+02D0 138 "lt" => "LESS-THAN SIGN", # < U+003C 139 "mac" => "COMBINING MACRON", # ̄ U+0304 140 "min" => "MINUS SIGN", # − U+2212 141 "Mu" => "GREEK CAPITAL LETTER MU", # Μ U+039C 142 "mu" => "GREEK SMALL LETTER MU", # μ U+03BC 143 "ng" => "LATIN SMALL LETTER ENG", # ŋ U+014B 144 "ngx" => "LATIN SMALL LETTER ENG", # ŋ U+014B 145 "Nu" => "GREEK CAPITAL LETTER NU", # Ν U+039D 146 "nu" => "GREEK SMALL LETTER NU", # ν U+03BD 147 "Oacu" => "LATIN CAPITAL LETTER O WITH ACUTE", # Ó U+00D3 148 "oacu" => "LATIN SMALL LETTER O WITH ACUTE", # ó U+00F3 149 "Obar" => "LATIN CAPITAL LETTER O WITH STROKE", # Ø U+00D8 150 "obar" => "LATIN SMALL LETTER O WITH STROKE", # ø U+00F8 151 "Ocirc" => "LATIN CAPITAL LETTER O WITH CIRCUMFLEX", # Ô U+00D4 152 "ocirc" => "LATIN SMALL LETTER O WITH CIRCUMFLEX", # ô U+00F4 153 "OE" => "LATIN CAPITAL LIGATURE OE", # Œ U+0152 154 "Oe" => "LATIN CAPITAL LIGATURE OE", # Œ U+0152 155 "oe" => "LATIN SMALL LIGATURE OE", # œ U+0153 156 "Ograve" => "LATIN CAPITAL LETTER O WITH GRAVE", # Ò U+00D2 157 "ograve" => "LATIN SMALL LETTER O WITH GRAVE", # ò U+00F2 158 "Omega" => "GREEK CAPITAL LETTER OMEGA", # Ω U+03A9 159 "omega" => "GREEK SMALL LETTER OMEGA", # ω U+03C9 160 "Omicron" => "GREEK CAPITAL LETTER OMICRON", # Ο U+039F 161 "omicron" => "GREEK SMALL LETTER OMICRON", # ο U+03BF 162 "ope" => "LATIN SMALL LETTER OPEN E", # ɛ U+025B 163 "Ouml" => "LATIN CAPITAL LETTER O WITH DIAERESIS", # Ö U+00D6 164 "ouml" => "LATIN SMALL LETTER A WITH DIAERESIS", # ä U+00E4 165 "pa" => "GREEK LETTER ARCHAIC KOPPA", # Ϙ U+03D8 166 "pall" => "LATIN SMALL LETTER TURNED Y", # ʎ U+028E 167 "paln" => "LATIN SMALL LETTER N WITH LEFT HOOK", # ɲ U+0272 168 "para" => "REVERSED PILCROW SIGN", # ⁋ U+204B 169 "Phi" => "GREEK CAPITAL LETTER PHI", # Φ U+03A6 170 "phi" => "LATIN SMALL LETTER PHI", # ɸ U+0278 171 "Pi" => "GREEK CAPITAL LETTER PI", # Π U+03A0 172 "pi" => "GREEK SMALL LETTER PI", # π U+03C0 173 "pm" => "PLUS-MINUS SIGN", # ± U+00B1 174 "pp" => "DOUBLE PRIME", # ″ U+2033 175 "Psi" => "GREEK CAPITAL LETTER PSI", # Ψ U+03A8 176 "psi" => "GREEK SMALL LETTER PSI", # ψ U+03C8 177 "pstlg" => "POUND SIGN", # £ U+00A3 178 "rar" => "RIGHTWARDS ARROW", # → U+2192 179 "revc" => "LATIN SMALL LETTER OPEN O", # ɔ U+0254 180 "revope" => "LATIN SMALL LETTER REVERSED OPEN E", # ɜ U+025C 181 "revr" => "MODIFIER LETTER RHOTIC HOOK", # ˞ U+02DE 182 "revrx" => "LATIN SMALL LETTER TURNED R", # ɹ U+0279 183 "revv" => "LATIN SMALL LETTER TURNED V", # ʌ U+028C 184 "rfa" => "LATIN SMALL LETTER TURNED ALPHA", # ɒ U+0252 185 "Rho" => "GREEK CAPITAL LETTER RHO", # Ρ U+03A1 186 "rho" => "GREEK SMALL LETTER RHO", # ρ U+03C1 187 "schwa" => "LATIN SMALL LETTER SCHWA", # ə U+0259 188 "schwax" => "LATIN SMALL LETTER SCHWA", # ə U+0259 189 "sect" => "SECTION SIGN", # § U+00A7 190 "sh" => "LATIN SMALL LETTER ESH", # ʃ U+0283 191 "shti" => "LATIN LETTER SMALL CAPITAL I", # ɪ U+026A 192 "shtu" => "LATIN SMALL LETTER UPSILON", # ʊ U+028A 193 "shty" => "LATIN LETTER SMALL CAPITAL Y", # ʏ U+028F 194 "shx" => "LATIN SMALL LETTER ESH", # ʃ U+0283 195 "Sigma" => "GREEK CAPITAL LETTER SIGMA", # Σ U+03A3 196 "sigma" => "GREEK SMALL LETTER SIGMA", # σ U+03C3 197 "sm" => "MODIFIER LETTER VERTICAL LINE", # ˈ U+02C8 198 "smm" => "MODIFIER LETTER LOW VERTICAL LINE", # ˌ U+02CC 199 "sqrt" => "SQUARE ROOT", # √ U+221A 200 "Tau" => "GREEK CAPITAL LETTER TAU", # Τ U+03A4 201 "tau" => "GREEK SMALL LETTER TAU", # τ U+03C4 202 "Th" => "LATIN CAPITAL LETTER THORN", # Þ U+00DE 203 "th" => "LATIN SMALL LETTER THORN", # þ U+00FE 204 "Theta" => "GREEK CAPITAL LETTER THETA", # Θ U+0398 205 "theta" => "GREEK SMALL LETTER THETA", # θ U+03B8 206 "tilde" => "COMBINING TILDE", # ̃ U+0303 207 "times" => "MULTIPLICATION SIGN", # × U+00D7 208 "trli" => "PARALLEL TO", # ∥ U+2225 209 "Uacu" => "LATIN CAPITAL LETTER U WITH ACUTE", # Ú U+00DA 210 "uacu" => "LATIN SMALL LETTER U WITH ACUTE", # ú U+00FA 211 "Ucirc" => "LATIN CAPITAL LETTER U WITH CIRCUMFLEX", # Û U+00DB 212 "ucirc" => "LATIN SMALL LETTER U WITH CIRCUMFLEX", # û U+00FB 213 "udtr" => "NABLA", # ∇ U+2207 214 "Ugrave" => "LATIN CAPITAL LETTER U WITH GRAVE", # Ù U+00D9 215 "ugrave" => "LATIN SMALL LETTER U WITH GRAVE", # ù U+00F9 216 "uml" => "COMBINING DIAERESIS", # ̈ U+0308 217 "undl" => "COMBINING MINUS SIGN BELOW", # ̠ U+0320 218 "Upsilon" => "GREEK CAPITAL LETTER UPSILON", # Υ U+03A5 219 "upsilon" => "LATIN SMALL LETTER UPSILON", # ʊ U+028A 220 "Uuml" => "LATIN CAPITAL LETTER U WITH DIAERESIS", # Ü U+00DC 221 "uuml" => "LATIN SMALL LETTER U WITH DIAERESIS", # ü U+00FC 222 "vb" => "VERTICAL LINE", # | U+007C 223 "vvf" => "LATIN SMALL LETTER GAMMA", # ɣ U+0263 224 "Xi" => "GREEK CAPITAL LETTER XI", # Ξ U+039E 225 "xi" => "GREEK SMALL LETTER XI", # ξ U+03BE 226 "Yacu" => "LATIN SMALL LETTER Y WITH ACUTE", # ý U+00FD 227 "yacu" => "LATIN SMALL LETTER Y WITH ACUTE", # ý U+00FD 228 "Ygh" => "LATIN CAPITAL LETTER YOGH", # Ȝ U+021C 229 "ygh" => "LATIN SMALL LETTER YOGH", # ȝ U+021D 230 "yuml" => "LATIN SMALL LETTER Y WITH DIAERESIS", # ÿ U+00FF 231 "Zeta" => "GREEK CAPITAL LETTER ZETA", # Ζ U+0396 232 "zeta" => "GREEK SMALL LETTER ZETA", # ζ U+03B6 233 "zh" => "LATIN SMALL LETTER EZH", # ʒ U+0292 234 235# Number aliases: these are \p{Other_Number} 236 "sup1" => "SUPERSCRIPT ONE", # ¹ U+00B9 237 "sup2" => "SUPERSCRIPT TWO", # ² U+00B2 238 "sup3" => "SUPERSCRIPT THREE", # ³ U+00B3 239 "frac12" => "VULGAR FRACTION ONE HALF", # ½ U+00BD 240 "frac14" => "VULGAR FRACTION ONE QUARTER", # ¼ U+00BC 241 "frac34" => "VULGAR FRACTION THREE QUARTERS", # ¾ U+00BE 242 243# Currency sign aliases: \p{Currency_Symbol} 244 245 "curren" => "CURRENCY SIGN", # ¤ U+00A4 246 "cent" => "CENT SIGN", # ¢ U+00A2 247 "pound" => "POUND SIGN", # £ U+00A3 248 "yen" => "YEN SIGN", # ¥ U+00A5 249 "euro" => "EURO SIGN", # € U+20AC 250 251# Latin letter aliases in NFC and grouped by first letter 252# 253# NOTE: some like BLACK LETTER blah and the trademark 254# symbol are only Latin in NFKD form. 255 256 "ordf" => "FEMININE ORDINAL INDICATOR", # ª U+00AA 257 "Oacute" => "LATIN CAPITAL LETTER O WITH ACUTE", # Ó U+00D3 258 "Aacute" => "LATIN CAPITAL LETTER A WITH ACUTE", # Á U+00C1 259 "aacute" => "LATIN SMALL LETTER A WITH ACUTE", # á U+00E1 260 "Agrave" => "LATIN CAPITAL LETTER A WITH GRAVE", # À U+00C0 261 "agrave" => "LATIN SMALL LETTER A WITH GRAVE", # à U+00E0 262 "Acirc" => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX", #  U+00C2 263 "acirc" => "LATIN SMALL LETTER A WITH CIRCUMFLEX", # â U+00E2 264 "Aring" => "LATIN CAPITAL LETTER A WITH RING ABOVE", # Å U+00C5 265 "aring" => "LATIN SMALL LETTER A WITH RING ABOVE", # å U+00E5 266 "Auml" => "LATIN CAPITAL LETTER A WITH DIAERESIS", # Ä U+00C4 267 "auml" => "LATIN SMALL LETTER A WITH DIAERESIS", # ä U+00E4 268 "Atilde" => "LATIN CAPITAL LETTER A WITH TILDE", # à U+00C3 269 "atilde" => "LATIN SMALL LETTER A WITH TILDE", # ã U+00E3 270 "AElig" => "LATIN CAPITAL LETTER AE", # Æ U+00C6 271 "aelig" => "LATIN SMALL LETTER AE", # æ U+00E6 272 273 "Ccedil" => "LATIN CAPITAL LETTER C WITH CEDILLA", # Ç U+00C7 274 "ccedil" => "LATIN SMALL LETTER C WITH CEDILLA", # ç U+00E7 275 276 "ETH" => "LATIN CAPITAL LETTER ETH", # Ð U+00D0 277 "eth" => "LATIN SMALL LETTER ETH", # ð U+00F0 278 279 "Eacute" => "LATIN CAPITAL LETTER E WITH ACUTE", # É U+00C9 280 "eacute" => "LATIN SMALL LETTER E WITH ACUTE", # é U+00E9 281 "Egrave" => "LATIN CAPITAL LETTER E WITH GRAVE", # È U+00C8 282 "egrave" => "LATIN SMALL LETTER E WITH GRAVE", # è U+00E8 283 "Ecirc" => "LATIN CAPITAL LETTER E WITH CIRCUMFLEX", # Ê U+00CA 284 "ecirc" => "LATIN SMALL LETTER E WITH CIRCUMFLEX", # ê U+00EA 285 "Euml" => "LATIN CAPITAL LETTER E WITH DIAERESIS", # Ë U+00CB 286 "euml" => "LATIN SMALL LETTER E WITH DIAERESIS", # ë U+00EB 287 288 "fnof" => "LATIN SMALL LETTER F WITH HOOK", # ƒ U+0192 289 290 "image" => "BLACK-LETTER CAPITAL I", # ℑ U+2111 291 "Iacute" => "LATIN CAPITAL LETTER I WITH ACUTE", # Í U+00CD 292 "iacute" => "LATIN SMALL LETTER I WITH ACUTE", # í U+00ED 293 "Igrave" => "LATIN CAPITAL LETTER I WITH GRAVE", # Ì U+00CC 294 "igrave" => "LATIN SMALL LETTER I WITH GRAVE", # ì U+00EC 295 "Icirc" => "LATIN CAPITAL LETTER I WITH CIRCUMFLEX", # Î U+00CE 296 "icirc" => "LATIN SMALL LETTER I WITH CIRCUMFLEX", # î U+00EE 297 "Iuml" => "LATIN CAPITAL LETTER I WITH DIAERESIS", # Ï U+00CF 298 "iuml" => "LATIN SMALL LETTER I WITH DIAERESIS", # ï U+00EF 299 300 "Ntilde" => "LATIN CAPITAL LETTER N WITH TILDE", # Ñ U+00D1 301 "ntilde" => "LATIN SMALL LETTER N WITH TILDE", # ñ U+00F1 302 303 "ordm" => "MASCULINE ORDINAL INDICATOR", # º U+00BA 304 "oacute" => "LATIN SMALL LETTER O WITH ACUTE", # ó U+00F3 305 "Ograve" => "LATIN CAPITAL LETTER O WITH GRAVE", # Ò U+00D2 306 "ograve" => "LATIN SMALL LETTER O WITH GRAVE", # ò U+00F2 307 "Ocirc" => "LATIN CAPITAL LETTER O WITH CIRCUMFLEX", # Ô U+00D4 308 "ocirc" => "LATIN SMALL LETTER O WITH CIRCUMFLEX", # ô U+00F4 309 "Ouml" => "LATIN CAPITAL LETTER O WITH DIAERESIS", # Ö U+00D6 310 "ouml" => "LATIN SMALL LETTER O WITH DIAERESIS", # ö U+00F6 311 "Otilde" => "LATIN CAPITAL LETTER O WITH TILDE", # Õ U+00D5 312 "otilde" => "LATIN SMALL LETTER O WITH TILDE", # õ U+00F5 313 "Oslash" => "LATIN CAPITAL LETTER O WITH STROKE", # Ø U+00D8 314 "oslash" => "LATIN SMALL LETTER O WITH STROKE", # ø U+00F8 315 "OElig" => "LATIN CAPITAL LIGATURE OE", # Œ U+0152 316 "oelig" => "LATIN SMALL LIGATURE OE", # œ U+0153 317 318 "real" => "BLACK-LETTER CAPITAL R", # ℜ U+211C 319 320 "Scaron" => "LATIN CAPITAL LETTER S WITH CARON", # Š U+0160 321 "scaron" => "LATIN SMALL LETTER S WITH CARON", # š U+0161 322 "szlig" => "LATIN SMALL LETTER SHARP S", # ß U+00DF 323 324 "trade" => "TRADE MARK SIGN", # ™ U+2122 325 326 "Uacute" => "LATIN CAPITAL LETTER U WITH ACUTE", # Ú U+00DA 327 "uacute" => "LATIN SMALL LETTER U WITH ACUTE", # ú U+00FA 328 "Ugrave" => "LATIN CAPITAL LETTER U WITH GRAVE", # Ù U+00D9 329 "ugrave" => "LATIN SMALL LETTER U WITH GRAVE", # ù U+00F9 330 "Ucirc" => "LATIN CAPITAL LETTER U WITH CIRCUMFLEX", # Û U+00DB 331 "ucirc" => "LATIN SMALL LETTER U WITH CIRCUMFLEX", # û U+00FB 332 "Uuml" => "LATIN CAPITAL LETTER U WITH DIAERESIS", # Ü U+00DC 333 "uuml" => "LATIN SMALL LETTER U WITH DIAERESIS", # ü U+00FC 334 335 "Yacute" => "LATIN CAPITAL LETTER Y WITH ACUTE", # Ý U+00DD 336 "yacute" => "LATIN SMALL LETTER Y WITH ACUTE", # ý U+00FD 337 "Yuml" => "LATIN CAPITAL LETTER Y WITH DIAERESIS", # Ÿ U+0178 338 "yuml" => "LATIN SMALL LETTER Y WITH DIAERESIS", # ÿ U+00FF 339 340 "THORN" => "LATIN CAPITAL LETTER THORN", # Þ U+00DE 341 "thorn" => "LATIN SMALL LETTER THORN", # þ U+00FE 342 343# This is *not* the same as the HEBREW LETTER ALEF (aleph), 344# although it is a \p{Other_Letter} not a \p{Symbol}. 345 "alefsym" => "ALEF SYMBOL", # ℵ U+2135 346 347# Greek letter aliases, or things that sort with them 348 349 "Alpha" => "GREEK CAPITAL LETTER ALPHA", # Α U+0391 350 "alpha" => "GREEK SMALL LETTER ALPHA", # α U+03B1 351 "Beta" => "GREEK CAPITAL LETTER BETA", # Β U+0392 352 "beta" => "GREEK SMALL LETTER BETA", # β U+03B2 353 "Gamma" => "GREEK CAPITAL LETTER GAMMA", # Γ U+0393 354 "gamma" => "GREEK SMALL LETTER GAMMA", # γ U+03B3 355 "Delta" => "GREEK CAPITAL LETTER DELTA", # Δ U+0394 356 "delta" => "GREEK SMALL LETTER DELTA", # δ U+03B4 357 "Epsilon" => "GREEK CAPITAL LETTER EPSILON", # Ε U+0395 358 "epsilon" => "GREEK SMALL LETTER EPSILON", # ε U+03B5 359 "Zeta" => "GREEK CAPITAL LETTER ZETA", # Ζ U+0396 360 "zeta" => "GREEK SMALL LETTER ZETA", # ζ U+03B6 361 "Eta" => "GREEK CAPITAL LETTER ETA", # Η U+0397 362 "eta" => "GREEK SMALL LETTER ETA", # η U+03B7 363 "Theta" => "GREEK CAPITAL LETTER THETA", # Θ U+0398 364 "thetasym" => "GREEK THETA SYMBOL", # ϑ U+03D1 365 "theta" => "GREEK SMALL LETTER THETA", # θ U+03B8 366 "Iota" => "GREEK CAPITAL LETTER IOTA", # Ι U+0399 367 "iota" => "GREEK SMALL LETTER IOTA", # ι U+03B9 368 "Kappa" => "GREEK CAPITAL LETTER KAPPA", # Κ U+039A 369 "kappa" => "GREEK SMALL LETTER KAPPA", # κ U+03BA 370 "Lambda" => "GREEK CAPITAL LETTER LAMDA", # Λ U+039B 371 "lambda" => "GREEK SMALL LETTER LAMDA", # λ U+03BB 372 "Mu" => "GREEK CAPITAL LETTER MU", # Μ U+039C 373 "micro" => "MICRO SIGN", # µ U+00B5 374 "mu" => "GREEK SMALL LETTER MU", # μ U+03BC 375 "Nu" => "GREEK CAPITAL LETTER NU", # Ν U+039D 376 "nu" => "GREEK SMALL LETTER NU", # ν U+03BD 377 "Xi" => "GREEK CAPITAL LETTER XI", # Ξ U+039E 378 "xi" => "GREEK SMALL LETTER XI", # ξ U+03BE 379 "Omicron" => "GREEK CAPITAL LETTER OMICRON", # Ο U+039F 380 "omicron" => "GREEK SMALL LETTER OMICRON", # ο U+03BF 381 "Pi" => "GREEK CAPITAL LETTER PI", # Π U+03A0 382 "piv" => "GREEK PI SYMBOL", # ϖ U+03D6 383 "pi" => "GREEK SMALL LETTER PI", # π U+03C0 384 "Rho" => "GREEK CAPITAL LETTER RHO", # Ρ U+03A1 385 "rho" => "GREEK SMALL LETTER RHO", # ρ U+03C1 386 "sigma" => "GREEK SMALL LETTER SIGMA", # σ U+03C3 387 "sigmaf" => "GREEK SMALL LETTER FINAL SIGMA", # ς U+03C2 388 "Tau" => "GREEK CAPITAL LETTER TAU", # Τ U+03A4 389 "tau" => "GREEK SMALL LETTER TAU", # τ U+03C4 390 "upsih" => "GREEK UPSILON WITH HOOK SYMBOL", # ϒ U+03D2 391 "Upsilon" => "GREEK CAPITAL LETTER UPSILON", # Υ U+03A5 392 "upsilon" => "GREEK SMALL LETTER UPSILON", # υ U+03C5 393 "Phi" => "GREEK CAPITAL LETTER PHI", # Φ U+03A6 394 "phi" => "GREEK SMALL LETTER PHI", # φ U+03C6 395 "Chi" => "GREEK CAPITAL LETTER CHI", # Χ U+03A7 396 "chi" => "GREEK SMALL LETTER CHI", # χ U+03C7 397 "Psi" => "GREEK CAPITAL LETTER PSI", # Ψ U+03A8 398 "psi" => "GREEK SMALL LETTER PSI", # ψ U+03C8 399 "Omega" => "GREEK CAPITAL LETTER OMEGA", # Ω U+03A9 400 "omega" => "GREEK SMALL LETTER OMEGA", # ω U+03C9 401 402# \p{Format} characters 403 404 "zwj" => "ZERO WIDTH JOINER", # U+200D 405 "zwnj" => "ZERO WIDTH NON-JOINER", # U+200C 406 "rlm" => "RIGHT-TO-LEFT MARK", # U+200F 407 "lrm" => "LEFT-TO-RIGHT MARK", # U+200E 408 409# Various punctuation and symbols in UCA order. 410# None of these is a combining Mark. 411 412 "oline" => "OVERLINE", # ‾ U+203E 413 "ensp" => "EN SPACE", # U+2002 414 "nbsp" => "NO-BREAK SPACE", # U+00A0 415 "cedil" => "CEDILLA", # ¸ U+00B8 416 "uml" => "DIAERESIS", # ¨ U+00A8 417 "acute" => "ACUTE ACCENT", # ´ U+00B4 418 "tilde" => "SMALL TILDE", # ˜ U+02DC 419 "emsp" => "EM SPACE", # U+2003 420 "macr" => "MACRON", # ¯ U+00AF 421 "thinsp" => "THIN SPACE", # U+2009 422 "shy" => "SOFT HYPHEN", # U+00AD 423 "ndash" => "EN DASH", # – U+2013 424 "mdash" => "EM DASH", # — U+2014 425 "iexcl" => "INVERTED EXCLAMATION MARK", # ¡ U+00A1 426 "iquest" => "INVERTED QUESTION MARK", # ¿ U+00BF 427 "hellip" => "HORIZONTAL ELLIPSIS", # … U+2026 428 "middot" => "MIDDLE DOT", # · U+00B7 429 "apos" => "APOSTROPHE", # ' U+0027 430 "lsquo" => "LEFT SINGLE QUOTATION MARK", # ‘ U+2018 431 "rsquo" => "RIGHT SINGLE QUOTATION MARK", # ’ U+2019 432 "sbquo" => "SINGLE LOW-9 QUOTATION MARK", # ‚ U+201A 433 "lsaquo" => "SINGLE LEFT-POINTING ANGLE QUOTATION MARK", # ‹ U+2039 434 "rsaquo" => "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK", # › U+203A 435 "quot" => "QUOTATION MARK", # " U+0022 436 "ldquo" => "LEFT DOUBLE QUOTATION MARK", # “ U+201C 437 "rdquo" => "RIGHT DOUBLE QUOTATION MARK", # ” U+201D 438 "bdquo" => "DOUBLE LOW-9 QUOTATION MARK", # „ U+201E 439 "laquo" => "LEFT-POINTING DOUBLE ANGLE QUOTATION MARK", # « U+00AB 440 "raquo" => "RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK", # » U+00BB 441 "lang" => "LEFT-POINTING ANGLE BRACKET", # 〈 U+2329 442 "rang" => "RIGHT-POINTING ANGLE BRACKET", # 〉 U+232A 443 "sect" => "SECTION SIGN", # § U+00A7 444 "para" => "PILCROW SIGN", # ¶ U+00B6 445 "copy" => "COPYRIGHT SIGN", # © U+00A9 446 "reg" => "REGISTERED SIGN", # ® U+00AE 447 "frasl" => "FRACTION SLASH", # ⁄ U+2044 448 "amp" => "AMPERSAND", # & U+0026 449 "permil" => "PER MILLE SIGN", # ‰ U+2030 450 "dagger" => "DAGGER", # † U+2020 451 "Dagger" => "DOUBLE DAGGER", # ‡ U+2021 452 "bull" => "BULLET", # • U+2022 453 "prime" => "PRIME", # ′ U+2032 454 "Prime" => "DOUBLE PRIME", # ″ U+2033 455 "circ" => "MODIFIER LETTER CIRCUMFLEX ACCENT", # ˆ U+02C6 456 "deg" => "DEGREE SIGN", # ° U+00B0 457 "weierp" => "SCRIPT CAPITAL P", # ℘ U+2118 458 "larr" => "LEFTWARDS ARROW", # ← U+2190 459 "rarr" => "RIGHTWARDS ARROW", # → U+2192 460 "uarr" => "UPWARDS ARROW", # ↑ U+2191 461 "darr" => "DOWNWARDS ARROW", # ↓ U+2193 462 "harr" => "LEFT RIGHT ARROW", # ↔ U+2194 463 "crarr" => "DOWNWARDS ARROW WITH CORNER LEFTWARDS", # ↵ U+21B5 464 "lArr" => "LEFTWARDS DOUBLE ARROW", # ⇐ U+21D0 465 "uArr" => "UPWARDS DOUBLE ARROW", # ⇑ U+21D1 466 "rArr" => "RIGHTWARDS DOUBLE ARROW", # ⇒ U+21D2 467 "dArr" => "DOWNWARDS DOUBLE ARROW", # ⇓ U+21D3 468 "hArr" => "LEFT RIGHT DOUBLE ARROW", # ⇔ U+21D4 469 "forall" => "FOR ALL", # ∀ U+2200 470 "part" => "PARTIAL DIFFERENTIAL", # ∂ U+2202 471 "exist" => "THERE EXISTS", # ∃ U+2203 472 "empty" => "EMPTY SET", # ∅ U+2205 473 "nabla" => "NABLA", # ∇ U+2207 474 "isin" => "ELEMENT OF", # ∈ U+2208 475 "notin" => "NOT AN ELEMENT OF", # ∉ U+2209 476 "ni" => "CONTAINS AS MEMBER", # ∋ U+220B 477 "prod" => "N-ARY PRODUCT", # ∏ U+220F 478 "sum" => "N-ARY SUMMATION", # ∑ U+2211 479 "plusmn" => "PLUS-MINUS SIGN", # ± U+00B1 480 "divide" => "DIVISION SIGN", # ÷ U+00F7 481 "times" => "MULTIPLICATION SIGN", # × U+00D7 482 "lt" => "LESS-THAN SIGN", # < U+003C 483 "ne" => "NOT EQUAL TO", # ≠ U+2260 484 "gt" => "GREATER-THAN SIGN", # > U+003E 485 "not" => "NOT SIGN", # ¬ U+00AC 486 "brvbar" => "BROKEN BAR", # ¦ U+00A6 487 "minus" => "MINUS SIGN", # − U+2212 488 "lowast" => "ASTERISK OPERATOR", # ∗ U+2217 489 "radic" => "SQUARE ROOT", # √ U+221A 490 "prop" => "PROPORTIONAL TO", # ∝ U+221D 491 "infin" => "INFINITY", # ∞ U+221E 492 "ang" => "ANGLE", # ∠ U+2220 493 "and" => "LOGICAL AND", # ∧ U+2227 494 "or" => "LOGICAL OR", # ∨ U+2228 495 "cap" => "INTERSECTION", # ∩ U+2229 496 "cup" => "UNION", # ∪ U+222A 497 "int" => "INTEGRAL", # ∫ U+222B 498 "there4" => "THEREFORE", # ∴ U+2234 499 "sim" => "TILDE OPERATOR", # ∼ U+223C 500 "cong" => "APPROXIMATELY EQUAL TO", # ≅ U+2245 501 "asymp" => "ALMOST EQUAL TO", # ≈ U+2248 502 "equiv" => "IDENTICAL TO", # ≡ U+2261 503 "le" => "LESS-THAN OR EQUAL TO", # ≤ U+2264 504 "ge" => "GREATER-THAN OR EQUAL TO", # ≥ U+2265 505 "sub" => "SUBSET OF", # ⊂ U+2282 506 "nsub" => "NOT A SUBSET OF", # ⊄ U+2284 507 "sup" => "SUPERSET OF", # ⊃ U+2283 508 "sube" => "SUBSET OF OR EQUAL TO", # ⊆ U+2286 509 "supe" => "SUPERSET OF OR EQUAL TO", # ⊇ U+2287 510 "oplus" => "CIRCLED PLUS", # ⊕ U+2295 511 "otimes" => "CIRCLED TIMES", # ⊗ U+2297 512 "perp" => "UP TACK", # ⊥ U+22A5 513 "sdot" => "DOT OPERATOR", # ⋅ U+22C5 514 "lceil" => "LEFT CEILING", # ⌈ U+2308 515 "rceil" => "RIGHT CEILING", # ⌉ U+2309 516 "lfloor" => "LEFT FLOOR", # ⌊ U+230A 517 "rfloor" => "RIGHT FLOOR", # ⌋ U+230B 518 "loz" => "LOZENGE", # ◊ U+25CA 519 "spades" => "BLACK SPADE SUIT", # ♠ U+2660 520 "clubs" => "BLACK CLUB SUIT", # ♣ U+2663 521 "hearts" => "BLACK HEART SUIT", # ♥ U+2665 522 "diams" => "BLACK DIAMOND SUIT", # ♦ U+2666 523 524 # 525 # override non-combining forms 526 # 527 528 "ACUTE" => "COMBINING ACUTE ACCENT", 529 "acute" => "COMBINING ACUTE ACCENT", 530 531 "GRAVE" => "COMBINING GRAVE ACCENT", 532 "grave" => "COMBINING GRAVE ACCENT", 533 534 "CIRCUMFLEX" => "COMBINING CIRCUMFLEX ACCENT", 535 "CIRCUM" => "COMBINING CIRCUMFLEX ACCENT", 536 "CIRC" => "COMBINING CIRCUMFLEX ACCENT", 537 "circumflex" => "COMBINING CIRCUMFLEX ACCENT", 538 "circum" => "COMBINING CIRCUMFLEX ACCENT", 539 "circ" => "COMBINING CIRCUMFLEX ACCENT", 540 541 # typo protection 542 543 "COMBINING DIERESIS" => "COMBINING DIAERESIS", 544 "COMBINING DIEARESIS" => "COMBINING DIAERESIS", 545 "DIERESIS" => "COMBINING DIAERESIS", 546 "DIEARESIS" => "COMBINING DIAERESIS", 547 "DIAERESIS" => "COMBINING DIAERESIS", 548 "dieresis" => "COMBINING DIAERESIS", 549 "diearesis" => "COMBINING DIAERESIS", 550 "diaeresis" => "COMBINING DIAERESIS", 551 "diaer" => "COMBINING DIAERESIS", 552 "diear" => "COMBINING DIAERESIS", 553 "dier" => "COMBINING DIAERESIS", 554 555 "TILDE" => "COMBINING TILDE", 556 "tilde" => "COMBINING TILDE", 557 "til" => "COMBINING TILDE", 558 559 "CEDILLE" => "COMBINING CEDILLA", 560 "CEDILLA" => "COMBINING CEDILLA", 561 "CEDIL" => "COMBINING CEDILLA", 562 "cedille" => "COMBINING CEDILLA", 563 "cedilla" => "COMBINING CEDILLA", 564 "cedil" => "COMBINING CEDILLA", 565 566 "MACRON" => "COMBINING MACRON", 567 "macron" => "COMBINING MACRON", 568 569 "CARON" => "COMBINING CARON", 570 "caron" => "COMBINING CARON", 571 572 # 573 # special glyphs 574 # 575 576 # Hawaiʻi, aloha ʻoe 577 "okina" => "MODIFIER LETTER TURNED COMMA", 578 579 # * transliteration of Arabic ain (voiced pharyngeal fricative) 580 "ain" => "MODIFIER LETTER LEFT HALF RING", 581 582 "stress" => "MODIFIER LETTER VERTICAL LINE", 583 "stress1" => "MODIFIER LETTER VERTICAL LINE", 584 "primary_stress" => "MODIFIER LETTER VERTICAL LINE", 585 "pstress" => "MODIFIER LETTER VERTICAL LINE", 586 "pstr" => "MODIFIER LETTER VERTICAL LINE", 587 588 "secondary_stress" => "MODIFIER LETTER LOW VERTICAL LINE", 589 "stress2" => "MODIFIER LETTER LOW VERTICAL LINE", 590 "sstress" => "MODIFIER LETTER LOW VERTICAL LINE", 591 "sstr" => "MODIFIER LETTER LOW VERTICAL LINE", 592 593 # 594 # classification glyphs 595 # 596 597 # OBSOLETE 598 "obs" => "DAGGER", 599 "obsolete" => "DAGGER", 600 "dagger" => "DAGGER", 601 602 # ALIEN 603 "ali" => "DOUBLE VERTICAL LINE", 604 "alien" => "DOUBLE VERTICAL LINE", 605 "foreign" => "DOUBLE VERTICAL LINE", 606 "unassimilated" => "DOUBLE VERTICAL LINE", 607 608 # ERRONEOUS 609 "err" => "CURVED STEM PARAGRAPH SIGN ORNAMENT", 610 "erron" => "CURVED STEM PARAGRAPH SIGN ORNAMENT", 611 "erroneous" => "CURVED STEM PARAGRAPH SIGN ORNAMENT", 612 613 # CATACHRESTIC 614 "spu" => "PILCROW SIGN", 615 "spurious" => "PILCROW SIGN", 616 "catachrestic" => "PILCROW SIGN", 617 "catach" => "PILCROW SIGN", 618 "cata" => "PILCROW SIGN", 619 620 # CROSS REFERENCE 621 "xref" => "MULTIPLICATION SIGN", 622 623 # ILLUSTRATIVE 624 "ill" => "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK", 625 "illus" => "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK", 626 "illustrative" => "SINGLE RIGHT-POINTING ANGLE QUOTATION MARK", 627 628 }, 629 630); # end use charnames 631 632use constant DATABASE_NAME => "words.utf8"; 633 634use subs qw[ dump ]; # like I really want a SIGABORT, not! 635 636################################################################# 637 638use Carp; 639use File::Spec; 640use English qw[ -no_match_vars ]; 641use Getopt::Long qw[ GetOptions ]; 642use Pod::Usage; 643# use Search::Dict; 644use Unicode::Normalize; 645 646use Encode qw( encode decode ); 647 648################################################################# 649 650sub deQ($); 651 652################################################################# 653 654our %Opt; 655our $DB_Name; 656our $Shown_Count = 0; 657 658################################################################# 659 660main(); 661NOT_REACHED(); 662 663################################################################# 664 665sub main { 666 init(); 667 668 my $count = $Opt{fuzzy} ? run_agrep() 669 : $Opt{pattern} ? run_grep() 670 : run_look(); 671 672 debug("found $count matches"); 673 674 if ($Shown_Count == 0) { 675 exit 1; 676 } else { 677 exit 0; 678 } 679} 680 681################################################################# 682 683sub init { 684 685 eval q{ END { eval { close STDOUT } } }; 686 687 $SIG{PIPE} = sub { exit }; 688 689 $| = 1; 690 691 binmode(STDOUT, ":utf8"); 692 binmode(STDERR, ":utf8"); 693 694 @ARGV = map { decode("UTF-8", $_) } @ARGV; 695 696 handle_options(); 697 698 validate_database(); 699 700} 701 702################################################################# 703 704sub validate_database { 705 706 return if $DB_Name && locate_textfile($DB_Name); 707 708 my $database = $Opt{database} || DATABASE_NAME; 709 710 unless ($DB_Name = locate_textfile($database)) { 711 die "$0: no database $database\n"; 712 } 713 714} 715 716################################################################# 717 718sub handle_options { 719 720 pod2usage("$0: usage error: expected arguments\n") if @ARGV == 0; 721 722 Getopt::Long::Configure qw[ bundling auto_version no_ignore_case ]; 723 724 dump("pre getopt options are:", \%Opt); 725 726 GetOptions(\%Opt => qw[ 727 728 help|? 729 man|m 730 debug|d 731 732 datafile|D=s 733 pattern|grep|g=s 734 735 nopager 736 sort|s 737 738 verbose|v+ 739 showkey|raw|V 740 741 everything|all|a 742 all-verbose|A 743 744 headwords-only|h 745 746 regular|normal|n 747 foreign|alien|f 748 catachrestic|erroneous|e 749 obsolete|old|o 750 crossreference|xref|x 751 illustrations|i 752 753 noregular|nonormal|N 754 noforeign|noalian|F 755 nocatachrestic|noerroneous|E 756 noobsolete|noold|O 757 nocrossreference|noxref|X 758 noillustrations|I 759 760 part-of-speech|partofspeech|speech|pos|p=s 761 nopart-of-speech|nopartofspeech|nospeech|nopos|P=s 762 763 fuzzy|z 764 all-fuzzy|Z 765 766 ]) || pod2usage(2); 767 768 $Opt{verbose} ||= 0; 769 770 if ($Opt{"all-fuzzy"}) { 771 $Opt{"fuzzy"}++; 772 $Opt{"all-verbose"}++; 773 # FALLTHROUGH 774 } 775 776 if ($Opt{"all-verbose"}) { 777 $Opt{"everything"}++; 778 $Opt{"verbose"} = 2; 779 } 780 781 my @yes_types = qw{ 782 foreign 783 catachrestic 784 obsolete 785 crossreference 786 illustrations 787 regular 788 }; 789 790 if ($Opt{"showkey"}) { 791 $Opt{"verbose"} = 3; 792 } 793 794 my @no_types = map { "no$_" } @yes_types; 795 796 if ( ( grep { exists $Opt{$_} } @no_types ) && ( grep { exists $Opt{$_} } @yes_types ) ) { 797 # can't have both 798 pod2usage("Usage error: incompatible mix of yes and no options"); 799 } 800 801 # if (my @no_opts = @no_types ~~ %Opt) { 802 if (my @no_opts = grep { $Opt{$_} } @no_types) { 803 s/^no// for @no_opts; 804 debug("opt set 1"); 805 @Opt{ @yes_types } = (1) x @yes_types; 806 @Opt{ @no_opts } = (0) x @no_opts; 807 } 808 # elsif (@yes_types ~~ %Opt) { 809 elsif (grep { $Opt{$_} } @yes_types) { 810 debug("opt set 2"); 811 # then we're fine, use only these 812 } else { 813 debug("opt set 3"); 814 # neither yes nor no, so turn all yeses on 815 @Opt{ @yes_types } = (1) x @yes_types; 816 unless ($Opt{everything}) { 817 $Opt{"illustrations"} = 0; 818 $Opt{"obsolete"} = 0; 819 $Opt{"catachrestic"} = 0; 820 $Opt{"crossreference"} = 0; 821 } 822 } 823 824 if ($Opt{"headwords-only"}) { 825 $Opt{"illustrations"} = 0; 826 $Opt{"crossreference"} = 0; 827 } 828 829 dump("post getopt options are", \%Opt); 830 831 pod2usage(0) if $Opt{help}; 832 pod2usage(-exitstatus => 0, -verbose => 2) if $Opt{man}; 833 834 unless ($Opt{pattern} || @ARGV) { 835 @ARGV = ("."); 836 # pod2usage("$0: expected arguments\n"); 837 } 838 839 if (!$Opt{pattern} && $ARGV[0] =~ /\PL/) { 840 $Opt{pattern} = shift @ARGV; 841 $Opt{pattern} =~ s#^/## && $Opt{pattern} =~ s#/$##; 842 } 843 844} 845 846 847################################################################# 848 849sub run_look { 850 ARGCOUNT() if @_; 851 852 validate_database(); 853 854 my $look_word = lc NFD "@ARGV"; 855 $look_word =~ s/\PL+//g; 856 857 my $look_fh; 858 859 # because otherwise the look program misbehaves; 860 # env LC_ALL=C 861 # 862 $ENV{LC_ALL} = "C"; 863 864 my $lookpath = locate_program("look"); 865 die "no look program" unless $lookpath; 866 867 # can't do this many arguments in old perls 868 if ($] >= 5.013_000) { 869 open($look_fh, "-| :utf8", $lookpath, $look_word, $DB_Name, ); 870 } else { 871 open($look_fh, "$lookpath '$look_word' '$DB_Name' |"); 872 binmode($look_fh, ":utf8"); 873 } 874 875 my $found = 0; 876 877 local $_; 878 879 while (<$look_fh>) { 880 idem_print($_); 881 $found++; 882 } 883 884 eval { close $look_fh }; 885 886 # die "look failed: $?" if $?; 887 888 all_done(); 889 890 debug("returning $found matched"); 891 892 return $found; 893} 894 895 896################################################################# 897 898sub run_grep { 899 validate_database(); 900 901 my $search_string = NFD $Opt{pattern}; 902 903 die "$0: bad search string $search_string\n" 904 unless length $search_string; 905 906 $search_string =~ tr/`'/\N{lsquo}\N{rsquo}/; 907 908 local $SIG{__WARN__} = sub { die "FATALIZED WARNING: @_" }; 909 my $pattern = eval qq{ qr{$search_string} }; 910 die if $@; 911 912 open(my $raw_db, "< :utf8", $DB_Name); 913 914 my $found = 0; 915 916 local $_; 917 918 while (<$raw_db>) { 919 next unless /$pattern/ || NFC($_) =~ /$pattern/; 920 $found++; 921 idem_print($_); 922 } 923 close $raw_db; 924 925 all_done(); 926 927 return $found; 928} 929 930################################################################# 931 932sub run_agrep { 933 ARGCOUNT() if @_; 934 935 validate_database(); 936 937 my $agrep_word = lc NFD "@ARGV"; 938 $agrep_word =~ s/\PL+//g; 939 940 my $agrep_fh; 941 942 my $agrep_path = locate_program("agrep"); 943 die "no agrep program" unless $agrep_path; 944 945 my $yes_path = locate_program("yes"); 946 die "no yes program" unless $yes_path; 947 948 my $arg_string = "$yes_path | $agrep_path -B '$agrep_word' '$DB_Name' 2>/dev/null |"; 949 debug("running $arg_string"); 950 951 open($agrep_fh, $arg_string); 952 binmode($agrep_fh, ":utf8"); 953 954 my $found = 0; 955 956 local $_; 957 while (<$agrep_fh>) { 958 idem_print($_); 959 $found++; 960 } 961 962 eval { close $agrep_fh }; 963 964 # die "agrep failed: $?" if $?; 965 966 all_done(); 967 968 debug("returning $found matched"); 969 970 return $found; 971} 972 973################################################################# 974 975sub idem_print { 976 ARGCOUNT() unless @_ == 1; 977 978 my $entry = NFC shift(); 979 980 local $_ = $entry; 981 s/.*\t// || panic("malformed input"); 982 983 if (/\N{LEFTWARDS ARROW}/) { 984 debug("filter left arrow"); 985 return if $Opt{"headwords-only"}; 986 } 987 988 if (/\N{RIGHTWARDS ARROW}/) { 989 debug("filter left arrow"); 990 return if $Opt{"headwords-only"}; 991 } 992 993 if (/^ \N{ill} /) { 994 debug("filter ill"); 995 return if $Opt{"headwords-only"}; 996 return unless $Opt{illustrations}; 997 } 998 elsif (/^ \N{ali}/) { 999 debug("filter ali"); 1000 return unless $Opt{"foreign"}; 1001 } 1002 elsif (/^ \N{xref}/) { 1003 debug("filter xref"); 1004 return unless $Opt{"crossreference"}; 1005 } 1006 elsif (/^ [\N{spu}\N{err}]/) { 1007 debug("filter spu"); 1008 return unless $Opt{"catachrestic"}; 1009 } 1010 elsif (/^ \N{obs}/) { 1011 debug("filter obs"); 1012 return unless $Opt{"obsolete"}; 1013 } 1014 else { 1015 debug("filter regular"); 1016 return unless $Opt{"regular"}; 1017 } 1018 1019 if ($Opt{"part-of-speech"}) { 1020 debug("filter pos yes"); 1021 return if pos_filtered($_, $Opt{"part-of-speech"}); 1022 } 1023 1024 if ($Opt{"nopart-of-speech"}) { 1025 debug("filter pos yes"); 1026 return unless pos_filtered($_, $Opt{"nopart-of-speech"}); 1027 } 1028 1029 debug("FILTER FALLTHRU"); 1030 1031 unless ($Opt{verbose}) { 1032 s/\h\[.*//; 1033 s/\h\N{LEFTWARDS ARROW}.*//; 1034 s/\h\N{RIGHTWARDS ARROW}.*//; 1035 } 1036 1037 { 1038 next if m{ 1039 \b (?: 1040 1041 \N{ae}lfe? 1042 1043 | \N{oe}il 1044 | \N{oe}illade 1045 | \N{oe}ufs? 1046 | \N{oe}uvres? 1047 | b\N{oe}ufs? 1048 | c\N{oe}urs? 1049 | ch\N{oe}nix 1050 | m\N{oe}urs 1051 | v\N{oe}ux? 1052 1053 ) \b 1054 | (?<!man|ped)\N{oe}uvr 1055 | c\N{oe}ur 1056 }xi; 1057 1058 unless ( /\N{ae}\N{acute}|[\N{eth}\N{thorn}]/ ) { 1059 s{ \N{AE} }{Ae}xg; 1060 s{ \N{ae} }{ae}xg; 1061 } 1062 s{ \N{OE} }{Oe}xg; 1063 s{ \N{oe} }{oe}xg; 1064 } 1065 1066 unless ($Opt{verbose} > 1) { 1067 s/^\h+//; 1068 s/[\N{ali}\N{xref}\N{spu}\N{err}\N{obs}\N{ill}]\h*//g; 1069 s/[\N{stress1}\N{stress2}]//g; 1070 s/[\N{MIDDLE DOT}\N{ONE DOT LEADER}]//g; 1071 } 1072 1073 display($Opt{verbose} < 3 ? $_ : $entry); 1074 1075} 1076 1077sub pos_filtered($$) { 1078 my ($entry, $pos_list) = @_; 1079 1080 state $pos_map = { 1081 abbreviation => qr{ \b abbr \. }x, 1082 abbrev => qr{ \b abbr \. }x, 1083 abbr => qr{ \b abbr \. }x, 1084 absolute => qr{ \b absol \. }x, 1085 absol => qr{ \b absol \. }x, 1086 abs => qr{ \b absol \. }x, 1087 adjective => qr{ \b adj \. }x, 1088 adj => qr{ \b adj \. }x, 1089 a => qr{ \b adj \. }x, 1090 adverb => qr{ \b adv \. }x, 1091 adv => qr{ \b adv \. }x, 1092 adverbial => qr{ \b advb \. }x, 1093 advb => qr{ \b advb \. }x, 1094 attributive => qr{ \b attrib \. }x, 1095 attrib => qr{ \b attrib \. }x, 1096 attr => qr{ \b attrib \. }x, 1097 combining => qr{ \b comb \. }x, 1098 comb => qr{ \b comb \. }x, 1099 comparitive => qr{ \b compar \. }x, 1100 compar => qr{ \b compar \. }x, 1101 compound => qr{ \b comp \. }x, 1102 comp => qr{ \b comp \. }x, 1103 conjunction => qr{ \b conj \. }x, 1104 conj => qr{ \b conj \. }x, 1105 contraction => qr{ \b contr \. }x, 1106 contr => qr{ \b contr \. }x, 1107 cont => qr{ \b contr \. }x, 1108 demonstrative => qr{ \b dem \. }x, 1109 demon => qr{ \b dem \. }x, 1110 dem => qr{ \b dem \. }x, 1111 feminine => qr{ \b fem \. }x, 1112 fem => qr{ \b fem \. }x, 1113 impersonal => qr{ \b imp \. }x, 1114 impers => qr{ \b imp \. }x, 1115 imp => qr{ \b imp \. }x, 1116 indefinite => qr{ \b indef \. }x, 1117 indef => qr{ \b indef \. }x, 1118 ind => qr{ \b indef \. }x, 1119 infinitive => qr{ \b inf \. }x, 1120 infin => qr{ \b inf \. }x, 1121 inf => qr{ \b inf \. }x, 1122 interjection => qr{ \b int \. }x, 1123 interj => qr{ \b int \. }x, 1124 int => qr{ \b int \. }x, 1125 interrogative => qr{ \b interrog \. }x, 1126 interrog => qr{ \b interrog \. }x, 1127 interr => qr{ \b interrog \. }x, 1128 inter => qr{ \b interrog \. }x, 1129 intransitive => qr{ \b intr \. }x, 1130 intrans => qr{ \b intr \. }x, 1131 intr => qr{ \b intr \. }x, 1132 masculine => qr{ \b masc \. }x, 1133 masc => qr{ \b masc \. }x, 1134 name => qr{ \b name \b }x, 1135 noun => qr{ \b n \. }x, 1136 n => qr{ \b n \. }x, 1137 numeral => qr{ \b numeral \b }x, 1138 num => qr{ \b numeral \b }x, 1139 participial => qr{ \b pple? \. }x, 1140 part => qr{ \b pple? \. }x, 1141 pple => qr{ \b pple? \. }x, 1142 ppl => qr{ \b pple? \. }x, 1143 participle => qr{ \b pple? \. }x, 1144 particle => qr{ \b particle \b }x, 1145 past => qr{ \b pa \. }x, 1146 pa => qr{ \b pa \. }x, 1147 personal => qr{ \b pers \. }x, 1148 pers => qr{ \b pers \. }x, 1149 phrasal => qr{ \b phr \. }x, 1150 phr => qr{ \b phr \. }x, 1151 phrase => qr{ \b phrase \b }x, 1152 plural => qr{ \b pl \. }x, 1153 pl => qr{ \b pl \. }x, 1154 possessive => qr{ \b poss \. }x, 1155 poss => qr{ \b poss \. }x, 1156 predicate => qr{ \b pred \. }x, 1157 pred => qr{ \b pred \. }x, 1158 prefix => qr{ \b pref \. }x, 1159 pref => qr{ \b pref \. }x, 1160 preposition => qr{ \b prep \. }x, 1161 prep => qr{ \b prep \. }x, 1162 present => qr{ \b pres \. }x, 1163 pres => qr{ \b pres \. }x, 1164 pr => qr{ \b pres \. }x, 1165 pronoun => qr{ \b (?:pron|pers) \. }x, 1166 pron => qr{ \b (?:pron|pers) \. }x, 1167 pro => qr{ \b (?:pron|pers) \. }x, 1168 relative => qr{ \b rel \. }x, 1169 rel => qr{ \b rel \. }x, 1170 singular => qr{ \b sing \. }x, 1171 sing => qr{ \b sing \. }x, 1172 sg => qr{ \b sing \. }x, 1173 suffix => qr{ \b suff \. }x, 1174 suff => qr{ \b suff \. }x, 1175 superlative => qr{ \b superl \. }x, 1176 superl => qr{ \b superl \. }x, 1177 super => qr{ \b superl \. }x, 1178 transitive => qr{ \b trans \. }x, 1179 trans => qr{ \b trans \. }x, 1180 tr => qr{ \b trans \. }x, 1181 verb => qr{ \b v \. }x, 1182 v => qr{ \b v \. }x, 1183 verbal => qr{ \b vbl \. }x, 1184 vbl => qr{ \b vbl \. }x, 1185 1186 # affix => qr{ \b (?: suf | pre ) f \. }x, 1187 }; 1188 1189 my @want_parts = split /[.,\h]+/ => $pos_list; 1190 my $have_parts = $entry =~ m{ \[ (.+) \] }x ? $1 : q(); 1191 1192 for my $want (@want_parts) { 1193 my $pat = $pos_map->{$want}; 1194 die "$0: No such part of speech as <$want>.\n" unless defined $pat; 1195 return 1 unless $have_parts =~ $pat; 1196 } 1197 return 0; 1198} 1199 1200sub display { 1201 ARGCOUNT() unless @_ == 1; 1202 1203 my $string = $_[0]; 1204 1205 state $seen = {}; 1206 1207 return if $seen->{$string}++; 1208 1209 state $begun_pager; 1210 start_pager() unless $begun_pager++; 1211 1212 $Shown_Count++; 1213 1214 if ($Opt{sort}) { 1215 treasure_up($string); 1216 } else { 1217 print $string; 1218 } 1219 1220} 1221 1222{ my @saved_lines; 1223 1224 sub treasure_up { 1225 ARGCOUNT() unless @_ == 1; 1226 push(@saved_lines, $_[0]); 1227 } 1228 1229 sub all_done { 1230 ARGCOUNT() unless @_ == 0; 1231 1232 return unless @saved_lines; 1233 1234 require Unicode::Collate; 1235 1236 my $sorter = new Unicode::Collate:: 1237 upper_before_lower => 1, 1238 preprocess => \&reduce_for_sorting, 1239 entry => deQ<<'END_OF_OVERRIDE' 1240 |Q| 005B 006E 002E ; [.0200.0020.0002.0391] # [n. 1241 |Q| 005B ; [.0220.0020.0002.0392] # [ 1242 |Q| 005D ; [.0225.0020.0002.0395] # ] 1243END_OF_OVERRIDE 1244 ; 1245 1246 print for $sorter->sort(@saved_lines); 1247 } 1248 1249} 1250 1251sub reduce_for_sorting { 1252 ARGCOUNT() unless @_ == 1; 1253 1254 local $_ = $_[0]; 1255 1256 s/[\N{LEFTWARDS ARROW}\N{RIGHTWARDS ARROW}].*//; 1257 1258 s/(\d+)/sprintf("%020d", $1)/ge; 1259 1260 s/^.*\t// if $Opt{showkey}; 1261 1262 return $_; 1263} 1264 1265################################################################# 1266 1267sub am_running_perldb { 1268 no warnings "once"; 1269 return keys(%DB::sub) > 0; 1270} 1271 1272sub start_pager { 1273 ARGCOUNT() unless @_ == 0; 1274 1275 return if am_running_perldb(); 1276 1277 return if $Opt{nopager}; 1278 1279 return unless -t STDOUT; 1280 1281 my $his_pager = locate_program($ENV{PAGER}) 1282 || locate_program("less") 1283 || locate_program("more") 1284 || locate_program("type") 1285 ; 1286 1287 return unless $his_pager; 1288 local $ENV{LESSCHARSET} = "utf-8" if $his_pager =~ /\bless\b/i; 1289 open(STDOUT, "|- :utf8", $his_pager); 1290} 1291 1292################################################################# 1293 1294sub locate_textfile { 1295 ARGCOUNT() unless @_ == 1; 1296 1297 my $textfile = $_[0]; 1298 1299 return unless grep { defined && length } $textfile; 1300 1301 if (File::Spec->file_name_is_absolute($textfile)) { 1302 return is_legible($textfile); 1303 } 1304 1305 my @maybe_dirs = qw{ 1306 /usr/local/share/dict 1307 /usr/share/dict 1308 /usr/local/etc 1309 /etc 1310 /opt/local/etc 1311 /opt/local/etc/dict 1312 }; 1313 1314 push @maybe_dirs, @INC; 1315 push @maybe_dirs, File::Spec->path(); 1316 push @maybe_dirs, $ENV{HOME} || $ENV{LOGDIR} || "."; 1317 1318 for my $dir (@maybe_dirs) { 1319 my $pathname = File::Spec->catfile($dir, $textfile); 1320 my $dbpath; 1321 return $dbpath if $dbpath = is_legible($pathname); 1322 } 1323 1324 return; 1325} 1326 1327sub locate_program { 1328 ARGCOUNT() unless @_ == 1; 1329 1330 my $program = $_[0]; 1331 1332 return unless defined $program 1333 && length $program; 1334 1335 if (File::Spec->file_name_is_absolute($program)) { 1336 return is_runnable($program); 1337 } 1338 1339 my @path_dirs = File::Spec->path(); 1340 1341 for my $dir (@path_dirs) { 1342 my $pathname = File::Spec->catfile($dir, $program); 1343 my $runpath; 1344 return $runpath if $runpath = is_runnable($pathname); 1345 } 1346 1347 return; 1348} 1349 1350sub is_legible { 1351 ARGCOUNT() unless @_ == 1; 1352 my $fullpath = $_[0]; 1353 1354 if (-f $fullpath && -r _ && -T $fullpath) { 1355 return $fullpath; 1356 } 1357 elsif (stupid_evil_and_wrong() && $fullpath !~ /\.txt\z/i) { 1358 return is_runnable("$fullpath.txt") 1359 } 1360 else { 1361 return (); 1362 } 1363 1364 NOT_REACHED(); 1365} 1366 1367sub is_runnable { 1368 ARGCOUNT() unless @_ == 1; 1369 my $fullpath = $_[0]; 1370 1371 if (-x $fullpath && ! -d _) { 1372 return $fullpath; 1373 } 1374 elsif (stupid_evil_and_wrong() && $fullpath !~ /\.exe\z/i) { 1375 return is_runnable("$fullpath.exe") 1376 } 1377 else { 1378 return (); 1379 } 1380 1381 NOT_REACHED(); 1382} 1383 1384sub stupid_evil_and_wrong { 1385 my $name = lc $OSNAME; 1386 return grep { $name eq $_ } qw<dos os2 netware symbian mswin32>; 1387} 1388 1389################################################################# 1390 1391sub debug { 1392 ARGCOUNT() unless @_ > 0; 1393 return unless $Opt{debug}; 1394 print STDERR "@_\n" if @_; 1395} 1396 1397sub dump { 1398 ARGCOUNT() unless @_ == 2; 1399 state $dumper; 1400 1401 return unless $Opt{debug}; 1402 1403 require Dumpvalue; 1404 unless ($dumper) { 1405 $dumper = new Dumpvalue:: ; 1406 } 1407 1408 my($message, $ref) = @_; 1409 1410 say "$message: "; 1411 dumpValue $dumper $ref; 1412 say ""; 1413} 1414 1415sub panic { 1416 confess "$0: INTERNAL ERROR: @_"; 1417} 1418 1419sub NOT_REACHED { 1420 panic("NOT REACHED"); 1421} 1422 1423sub ARGCOUNT { 1424 panic("wrong arguments to function"); 1425} 1426 1427 1428sub dequeue($$) { 1429 my($leader, $body) = @_; 1430 $body =~ s/^\s*\Q$leader\E ?//gm; 1431 return $body; 1432} 1433 1434sub deQ($) { 1435 my $text = $_[0]; 1436 return dequeue q<|Q|>, $text; 1437} 1438 1439sub deQQ($) { 1440 my $text = $_[0]; 1441 return dequeue qq<|QQ|>, $text; 1442} 1443 1444################################################################# 1445################################################################# 1446################################################################# 1447 1448__END__ 1449 1450################################################################# 1451 1452=head1 NAME 1453 1454word - display words starting or matching a string or pattern 1455 1456=head1 SYNOPSIS 1457 1458word [options] [string | pattern] 1459 1460Given a string, show all words starting with that string (look mode). 1461Given a pattern, show all lines matching that pattern (grep mode). 1462 1463An argument with non-alphabetic characters is always a pattern. 1464Force grep mode with B<--grep=pattern> or by starting the pattern 1465with a slash, which will be ignored. 1466 1467Use B<--man> to get the full manpage. 1468 1469=head1 DESCRIPTION 1470 1471Search a large list of words in one of two modes. In look mode, 1472only words starting with the given string are displayed. This 1473mode runs very quickly. Only purely alphabetic strings are allowed. 1474The system look(1) program is co-opted into helping. 1475 1476In grep mode, any entries matching the pattern are shown. This 1477takes much longer to run, because the entire 26 megabyte file must 1478be grepped through. The pattern is not a grep(1) pattern, but 1479rather a perl(1) pattern. You may use Unicode named characters, 1480plus several custom aliases, in your pattern. 1481 1482=head1 EXAMPLES 1483 1484Look up terms starting with "cat": 1485 1486 % word cat 1487 1488The same, but bump verbose display level to see parts of speech: 1489 1490 % word -v cat 1491 1492Look at only verbs starting with cat: 1493 1494 % word -pv cat 1495 1496Look at all "cat" entries, with verbose set high: 1497 1498 % word -A cat 1499 1500Look for all (irregular) plurals that start with "ex": 1501 1502 % word -ppl ex 1503 1504Look for obsolete prefixes that start with "s": 1505 1506 % word -o -ppref s 1507 1508Grep terms with "cat" anywhere at all: 1509 1510 % word --grep cat 1511 % word /cat 1512 1513Grep terms containing "cat" or "cats" surrounded by 1514word boundaries: 1515 1516 % word '\bcats?\b' 1517 1518Grep terms with the Unicode "Mark" property: 1519 1520 % word '\pM' 1521 1522Grep all plurals ending in "-ata": 1523 1524 % word -A -ppl 'ata\b' 1525 1526Grep terms with the Unicode "Dash" property: 1527 1528 % word '\p{Dash}' 1529 1530Grep for an "e" with an acute accent: 1531 1532 % word '\N{eacute}' 1533 1534Grep for any acute accents no matter the letter: 1535 1536 % word '\N{acute}' 1537 1538Grep for terms containing an "a", "o", "u" in any case, followed 1539by a diaeresis: 1540 1541 % word '(?i)[oau]\N{dier}' 1542 1543=head1 OPTIONS 1544 1545Display options are: 1546 1547 --verbose / -v use up to three times for more verbosity 1548 1549 level 0 is just the word, like look 1550 level 1 includes parts of speech 1551 level 2 also includes assorted markings 1552 level 3 is the entire original entry 1553 1554 --nopager never call the pager 1555 1556Part of speech filtering options are: 1557 1558 --pos / -p POS only entries matching all POS shown 1559 --nopos / -P POS no entries matching any POS shown 1560 1561 POS is a comma-separated list of parts of speech like 1562 n/noun, v/verb, a/adjective, adv/adverb, pro/pronoun, 1563 and pl/plural. 1564 1565Type of entry filtering options are: 1566 1567 --headwords -h show headwords only 1568 --everything -a include all types of entry 1569 --all-verbose -A all entries, plus sets verbose to 2 1570 1571Some entries contain markings telling what kind it is. 1572Include or exclude such entries using: 1573 1574 --normal -n normal entries (on by default) 1575 --foreign -f unassimilated entries (on by default) 1576 1577 --obsolete -o obsolete entries (off by default) 1578 --catachrestic -e catechrestic entries (off by default) 1579 --illustrations -i illustrative examples (off by default) 1580 --crossref -x crossrefs w/old spellings (off by default) 1581 1582The previous six entry types can be excluded using the corresponding 1583B<--noXXX> long option or the capitalized short option; e.g., 1584B<--noforeign> is equivalent to B<-F>. 1585 1586Other options: 1587 1588 --version print version info and exit 1589 --help this help page 1590 --man the full manpage 1591 --debug internal debugging 1592 1593 --fuzzy -z use agrep(1) fuzzy matching in "best mode" 1594 --all-fuzzy -Z like -zavv 1595 1596=head1 PATTERN SHORTCUTS 1597 1598Besides all normal Perl pattern syntax, an extensive set of 1599named characters is provide for nmemonic convenience so you 1600don't have to write numeric code points like C<\x{3b2}> 1601for non-ASCII characters. 1602 1603=over 1604 1605=item * 1606 1607The full Unicode name, like 1608C<\N{EN DASH}> or 1609C<\N{LATIN SMALL LETTER THORN}>, or 1610Latin or Greek letter names, like 1611C<\N{thorn}> or 1612C<\N{alpha}>. 1613 1614=item * 1615 1616HTML abbrevations like 1617C<\N{eacute}>, 1618C<\N{ccedil}>, 1619C<\N{iuml}>. 1620 1621=item * 1622 1623Diacritic abbreviations: 1624C<\N{macron}>, 1625C<\N{acute}>, 1626C<\N{grave}>, 1627C<\N{diaeresis }>, 1628C<\N{dier}>, 1629C<\N{circumflex }>, 1630C<\N{circ}>, 1631and 1632C<\N{tilde}>; 1633C<\N{stress1}> and 1634C<\N{stress2}>. 1635 1636=item * 1637 1638Abbreviations for the type of entry: 1639 1640C<\N{ali}> (unassimilated), 1641C<\N{obs}> (obsolete), 1642C<\N{xref}> (crossreference), 1643C<\N{ill}> (illustrative), 1644C<\N{spu}> (catachrestic), and 1645C<\N{err}> (erroneous). 1646 1647=back 1648 1649=head1 ERRORS 1650 1651TO BE WRITTEN: ERRORS 1652 1653=head1 ENVIRONMENT 1654 1655PAGER 1656 1657=head1 FILES 1658 1659F<words.utf8> 1660 1661=head1 PROGRAMS 1662 1663F<look>, F<agrep> 1664 1665=head1 BUGS 1666 1667TO BE WRITTEN: BUGS 1668 1669=head1 SEE ALSO 1670 1671perlre(1), perlunicode(1) 1672 1673=head1 AUTHOR 1674 1675TO BE WRITTEN: AUTHOR 1676 1677=head1 COPYRIGHT AND LICENCE 1678 1679TO BE WRITTEN: COPYRIGHT AND LICENCE 1680