1 // This is an open source non-commercial project. Dear PVS-Studio, please check
2 // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3
4 // spellfile.c: code for reading and writing spell files.
5 //
6 // See spell.c for information about spell checking.
7
8 // Vim spell file format: <HEADER>
9 // <SECTIONS>
10 // <LWORDTREE>
11 // <KWORDTREE>
12 // <PREFIXTREE>
13 //
14 // <HEADER>: <fileID> <versionnr>
15 //
16 // <fileID> 8 bytes "VIMspell"
17 // <versionnr> 1 byte VIMSPELLVERSION
18 //
19 //
20 // Sections make it possible to add information to the .spl file without
21 // making it incompatible with previous versions. There are two kinds of
22 // sections:
23 // 1. Not essential for correct spell checking. E.g. for making suggestions.
24 // These are skipped when not supported.
25 // 2. Optional information, but essential for spell checking when present.
26 // E.g. conditions for affixes. When this section is present but not
27 // supported an error message is given.
28 //
29 // <SECTIONS>: <section> ... <sectionend>
30 //
31 // <section>: <sectionID> <sectionflags> <sectionlen> (section contents)
32 //
33 // <sectionID> 1 byte number from 0 to 254 identifying the section
34 //
35 // <sectionflags> 1 byte SNF_REQUIRED: this section is required for correct
36 // spell checking
37 //
38 // <sectionlen> 4 bytes length of section contents, MSB first
39 //
40 // <sectionend> 1 byte SN_END
41 //
42 //
43 // sectionID == SN_INFO: <infotext>
44 // <infotext> N bytes free format text with spell file info (version,
45 // website, etc)
46 //
47 // sectionID == SN_REGION: <regionname> ...
48 // <regionname> 2 bytes Up to MAXREGIONS region names: ca, au, etc.
49 // Lower case.
50 // First <regionname> is region 1.
51 //
52 // sectionID == SN_CHARFLAGS: <charflagslen> <charflags>
53 // <folcharslen> <folchars>
54 // <charflagslen> 1 byte Number of bytes in <charflags> (should be 128).
55 // <charflags> N bytes List of flags (first one is for character 128):
56 // 0x01 word character CF_WORD
57 // 0x02 upper-case character CF_UPPER
58 // <folcharslen> 2 bytes Number of bytes in <folchars>.
59 // <folchars> N bytes Folded characters, first one is for character 128.
60 //
61 // sectionID == SN_MIDWORD: <midword>
62 // <midword> N bytes Characters that are word characters only when used
63 // in the middle of a word.
64 //
65 // sectionID == SN_PREFCOND: <prefcondcnt> <prefcond> ...
66 // <prefcondcnt> 2 bytes Number of <prefcond> items following.
67 // <prefcond> : <condlen> <condstr>
68 // <condlen> 1 byte Length of <condstr>.
69 // <condstr> N bytes Condition for the prefix.
70 //
71 // sectionID == SN_REP: <repcount> <rep> ...
72 // <repcount> 2 bytes number of <rep> items, MSB first.
73 // <rep> : <repfromlen> <repfrom> <reptolen> <repto>
74 // <repfromlen> 1 byte length of <repfrom>
75 // <repfrom> N bytes "from" part of replacement
76 // <reptolen> 1 byte length of <repto>
77 // <repto> N bytes "to" part of replacement
78 //
79 // sectionID == SN_REPSAL: <repcount> <rep> ...
80 // just like SN_REP but for soundfolded words
81 //
82 // sectionID == SN_SAL: <salflags> <salcount> <sal> ...
83 // <salflags> 1 byte flags for soundsalike conversion:
84 // SAL_F0LLOWUP
85 // SAL_COLLAPSE
86 // SAL_REM_ACCENTS
87 // <salcount> 2 bytes number of <sal> items following
88 // <sal> : <salfromlen> <salfrom> <saltolen> <salto>
89 // <salfromlen> 1 byte length of <salfrom>
90 // <salfrom> N bytes "from" part of soundsalike
91 // <saltolen> 1 byte length of <salto>
92 // <salto> N bytes "to" part of soundsalike
93 //
94 // sectionID == SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
95 // <sofofromlen> 2 bytes length of <sofofrom>
96 // <sofofrom> N bytes "from" part of soundfold
97 // <sofotolen> 2 bytes length of <sofoto>
98 // <sofoto> N bytes "to" part of soundfold
99 //
100 // sectionID == SN_SUGFILE: <timestamp>
101 // <timestamp> 8 bytes time in seconds that must match with .sug file
102 //
103 // sectionID == SN_NOSPLITSUGS: nothing
104 //
105 // sectionID == SN_NOCOMPOUNDSUGS: nothing
106 //
107 // sectionID == SN_WORDS: <word> ...
108 // <word> N bytes NUL terminated common word
109 //
110 // sectionID == SN_MAP: <mapstr>
111 // <mapstr> N bytes String with sequences of similar characters,
112 // separated by slashes.
113 //
114 // sectionID == SN_COMPOUND: <compmax> <compminlen> <compsylmax> <compoptions>
115 // <comppatcount> <comppattern> ... <compflags>
116 // <compmax> 1 byte Maximum nr of words in compound word.
117 // <compminlen> 1 byte Minimal word length for compounding.
118 // <compsylmax> 1 byte Maximum nr of syllables in compound word.
119 // <compoptions> 2 bytes COMP_ flags.
120 // <comppatcount> 2 bytes number of <comppattern> following
121 // <compflags> N bytes Flags from COMPOUNDRULE items, separated by
122 // slashes.
123 //
124 // <comppattern>: <comppatlen> <comppattext>
125 // <comppatlen> 1 byte length of <comppattext>
126 // <comppattext> N bytes end or begin chars from CHECKCOMPOUNDPATTERN
127 //
128 // sectionID == SN_NOBREAK: (empty, its presence is what matters)
129 //
130 // sectionID == SN_SYLLABLE: <syllable>
131 // <syllable> N bytes String from SYLLABLE item.
132 //
133 // <LWORDTREE>: <wordtree>
134 //
135 // <KWORDTREE>: <wordtree>
136 //
137 // <PREFIXTREE>: <wordtree>
138 //
139 //
140 // <wordtree>: <nodecount> <nodedata> ...
141 //
142 // <nodecount> 4 bytes Number of nodes following. MSB first.
143 //
144 // <nodedata>: <siblingcount> <sibling> ...
145 //
146 // <siblingcount> 1 byte Number of siblings in this node. The siblings
147 // follow in sorted order.
148 //
149 // <sibling>: <byte> [ <nodeidx> <xbyte>
150 // | <flags> [<flags2>] [<region>] [<affixID>]
151 // | [<pflags>] <affixID> <prefcondnr> ]
152 //
153 // <byte> 1 byte Byte value of the sibling. Special cases:
154 // BY_NOFLAGS: End of word without flags and for all
155 // regions.
156 // For PREFIXTREE <affixID> and
157 // <prefcondnr> follow.
158 // BY_FLAGS: End of word, <flags> follow.
159 // For PREFIXTREE <pflags>, <affixID>
160 // and <prefcondnr> follow.
161 // BY_FLAGS2: End of word, <flags> and <flags2>
162 // follow. Not used in PREFIXTREE.
163 // BY_INDEX: Child of sibling is shared, <nodeidx>
164 // and <xbyte> follow.
165 //
166 // <nodeidx> 3 bytes Index of child for this sibling, MSB first.
167 //
168 // <xbyte> 1 byte Byte value of the sibling.
169 //
170 // <flags> 1 byte Bitmask of:
171 // WF_ALLCAP word must have only capitals
172 // WF_ONECAP first char of word must be capital
173 // WF_KEEPCAP keep-case word
174 // WF_FIXCAP keep-case word, all caps not allowed
175 // WF_RARE rare word
176 // WF_BANNED bad word
177 // WF_REGION <region> follows
178 // WF_AFX <affixID> follows
179 //
180 // <flags2> 1 byte Bitmask of:
181 // WF_HAS_AFF >> 8 word includes affix
182 // WF_NEEDCOMP >> 8 word only valid in compound
183 // WF_NOSUGGEST >> 8 word not used for suggestions
184 // WF_COMPROOT >> 8 word already a compound
185 // WF_NOCOMPBEF >> 8 no compounding before this word
186 // WF_NOCOMPAFT >> 8 no compounding after this word
187 //
188 // <pflags> 1 byte Bitmask of:
189 // WFP_RARE rare prefix
190 // WFP_NC non-combining prefix
191 // WFP_UP letter after prefix made upper case
192 //
193 // <region> 1 byte Bitmask for regions in which word is valid. When
194 // omitted it's valid in all regions.
195 // Lowest bit is for region 1.
196 //
197 // <affixID> 1 byte ID of affix that can be used with this word. In
198 // PREFIXTREE used for the required prefix ID.
199 //
200 // <prefcondnr> 2 bytes Prefix condition number, index in <prefcond> list
201 // from HEADER.
202 //
203 // All text characters are in 'encoding', but stored as single bytes.
204
205 // Vim .sug file format: <SUGHEADER>
206 // <SUGWORDTREE>
207 // <SUGTABLE>
208 //
209 // <SUGHEADER>: <fileID> <versionnr> <timestamp>
210 //
211 // <fileID> 6 bytes "VIMsug"
212 // <versionnr> 1 byte VIMSUGVERSION
213 // <timestamp> 8 bytes timestamp that must match with .spl file
214 //
215 //
216 // <SUGWORDTREE>: <wordtree> (see above, no flags or region used)
217 //
218 //
219 // <SUGTABLE>: <sugwcount> <sugline> ...
220 //
221 // <sugwcount> 4 bytes number of <sugline> following
222 //
223 // <sugline>: <sugnr> ... NUL
224 //
// <sugnr>: X bytes word number that results in this soundfolded word,
//                  stored as an offset to the previous number in as
//                  few bytes as possible (see offset2bytes())
228
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <wctype.h>
232
233 #include "nvim/ascii.h"
234 #include "nvim/buffer.h"
235 #include "nvim/charset.h"
236 #include "nvim/ex_cmds2.h"
237 #include "nvim/fileio.h"
238 #include "nvim/memline.h"
239 #include "nvim/memory.h"
240 #include "nvim/misc1.h"
241 #include "nvim/option.h"
242 #include "nvim/os/os.h"
243 #include "nvim/path.h"
244 #include "nvim/regexp.h"
245 #include "nvim/screen.h"
246 #include "nvim/spell.h"
247 #include "nvim/spell_defs.h"
248 #include "nvim/spellfile.h"
249 #include "nvim/ui.h"
250 #include "nvim/undo.h"
251 #include "nvim/vim.h"
252
253 #ifndef UNIX // it's in os/unix_defs.h for Unix
254 # include <time.h> // for time_t
255 #endif
256
// Special byte values for <byte>.  Some are only used in the tree for
// postponed prefixes, some only in the other trees.  This is a bit messy...
// All of them are <= BY_SPECIAL, which marks the highest special value.
#define BY_NOFLAGS 0             // end of word without flags or region; for
                                 // postponed prefix: no <pflags>
#define BY_INDEX 1               // child is shared, index follows
#define BY_FLAGS 2               // end of word, <flags> byte follows; for
                                 // postponed prefix: <pflags> follows
#define BY_FLAGS2 3              // end of word, <flags> and <flags2> bytes
                                 // follow; never used in prefix tree
#define BY_SPECIAL BY_FLAGS2     // highest special byte value

#define ZERO_FLAG 65009          // used when flag is zero: "0"

// Flags used in .spl file for soundsalike flags (the <salflags> byte of the
// SN_SAL section).  Each is a separate bit.
// Note: SAL_F0LLOWUP really is spelled with a zero instead of an "O".
#define SAL_F0LLOWUP 1
#define SAL_COLLAPSE 2
#define SAL_REM_ACCENTS 4

#define VIMSPELLMAGIC "VIMspell"  // string at start of Vim spell file
// Length of the magic string, not counting the terminating NUL.
#define VIMSPELLMAGICL (sizeof(VIMSPELLMAGIC) - 1)
// Value of the <versionnr> byte; spell_load_file() rejects any other value.
#define VIMSPELLVERSION 50

// Section IDs.  Only renumber them when VIMSPELLVERSION changes!
// These are the values of the <sectionID> byte described at the top of the
// file.
#define SN_REGION 0              // <regionname> section
#define SN_CHARFLAGS 1           // charflags section
#define SN_MIDWORD 2             // <midword> section
#define SN_PREFCOND 3            // <prefcond> section
#define SN_REP 4                 // REP items section
#define SN_SAL 5                 // SAL items section
#define SN_SOFO 6                // soundfolding section
#define SN_MAP 7                 // MAP items section
#define SN_COMPOUND 8            // compound words section
#define SN_SYLLABLE 9            // syllable section
#define SN_NOBREAK 10            // NOBREAK section
#define SN_SUGFILE 11            // timestamp for .sug file
#define SN_REPSAL 12             // REPSAL items section
#define SN_WORDS 13              // common words
#define SN_NOSPLITSUGS 14        // don't split word for suggestions
#define SN_INFO 15               // info section
#define SN_NOCOMPOUNDSUGS 16     // don't compound for suggestions
#define SN_END 255               // end of sections

#define SNF_REQUIRED 1           // <sectionflags>: required section

// Bits of a <charflags> byte in the SN_CHARFLAGS section.
#define CF_WORD 0x01             // word character
#define CF_UPPER 0x02            // upper-case character

// Message texts.  They are wrapped in N_() here; e_spell_trunc is passed
// through _() at the point where it is displayed.
static char *e_spell_trunc = N_("E758: Truncated spell file");
static char *e_afftrailing = N_("Trailing text in %s line %d: %s");
static char *e_affname = N_("Affix name too long in %s line %d: %s");
static char *msg_compressing = N_("Compressing word tree...");

#define MAXLINELEN 500           // Maximum length in bytes of a line in a .aff
                                 // and .dic file.
// Main structure to store the contents of a ".aff" file.
// The three hashtables hold the affix headers and compound flags; their
// items are converted back to structures with HI2AH() and HI2CI().
typedef struct afffile_S {
  char_u *af_enc;          // "SET", normalized, alloc'ed string or NULL
  int af_flagtype;         // AFT_CHAR, AFT_LONG, AFT_NUM or AFT_CAPLONG
  unsigned af_rare;        // RARE ID for rare word
  unsigned af_keepcase;    // KEEPCASE ID for keep-case word
  unsigned af_bad;         // BAD ID for banned word
  unsigned af_needaffix;   // NEEDAFFIX ID
  unsigned af_circumfix;   // CIRCUMFIX ID
  unsigned af_needcomp;    // NEEDCOMPOUND ID
  unsigned af_comproot;    // COMPOUNDROOT ID
  unsigned af_compforbid;  // COMPOUNDFORBIDFLAG ID
  unsigned af_comppermit;  // COMPOUNDPERMITFLAG ID
  unsigned af_nosuggest;   // NOSUGGEST ID
  int af_pfxpostpone;      // postpone prefixes without chop string and
                           // without flags
  bool af_ignoreextra;     // IGNOREEXTRA present
  hashtab_T af_pref;       // hashtable for prefixes, affheader_T
  hashtab_T af_suff;       // hashtable for suffixes, affheader_T
  hashtab_T af_comp;       // hashtable for compound flags, compitem_T
} afffile_T;

// Possible values for "af_flagtype": how flag names are written.
#define AFT_CHAR 0         // flags are one character
#define AFT_LONG 1         // flags are two characters
#define AFT_CAPLONG 2      // flags are one or two characters
#define AFT_NUM 3          // flags are numbers, comma separated
337
typedef struct affentry_S affentry_T;
// Affix entry from ".aff" file.  Used for prefixes and suffixes.
// Entries that share a name/number are chained through "ae_next"; the head
// of the chain is stored in affheader_T.ah_first.
struct affentry_S {
  affentry_T *ae_next;   // next affix with same name/number
  char_u *ae_chop;       // text to chop off basic word (can be NULL)
  char_u *ae_add;        // text to add to basic word (can be NULL)
  char_u *ae_flags;      // flags on the affix (can be NULL)
  char_u *ae_cond;       // condition (NULL for ".")
  regprog_T *ae_prog;    // regexp program for ae_cond or NULL
  char ae_compforbid;    // COMPOUNDFORBIDFLAG found
  char ae_comppermit;    // COMPOUNDPERMITFLAG found
};
350
// Size of the key buffers "ah_key" and "ci_key".
#define AH_KEY_LEN 17           // 2 x 8 bytes + NUL

// Affix header from ".aff" file.  Used for af_pref and af_suff.
// "ah_key" must stay the first member: HI2AH() casts the hashtab key pointer
// straight to an affheader_T pointer.
typedef struct affheader_S {
  char_u ah_key[AH_KEY_LEN];    // key for hashtab == name of affix
  unsigned ah_flag;             // affix name as number, uses "af_flagtype"
  int ah_newID;                 // prefix ID after renumbering; 0 if not used
  int ah_combine;               // suffix may combine with prefix
  int ah_follows;               // another affix block should be following
  affentry_T *ah_first;         // first affix entry
} affheader_T;

// Convert a hashtab item to the affheader_T that holds its key.
#define HI2AH(hi) ((affheader_T *)(hi)->hi_key)
364
// Flag used in compound items.  Stored in afffile_T.af_comp.
// "ci_key" must stay the first member: HI2CI() casts the hashtab key pointer
// straight to a compitem_T pointer.
typedef struct compitem_S {
  char_u ci_key[AH_KEY_LEN];    // key for hashtab == name of compound
  unsigned ci_flag;             // affix name as number, uses "af_flagtype"
  int ci_newID;                 // affix ID after renumbering.
} compitem_T;

// Convert a hashtab item to the compitem_T that holds its key.
#define HI2CI(hi) ((compitem_T *)(hi)->hi_key)
373
// Structure that is used to store the items in the word tree.  This avoids
// the need to keep track of each allocated thing, everything is freed all at
// once after ":mkspell" is done.
// Note: "sb_next" must be just before "sb_data" to make sure the alignment of
// "sb_data" is correct for systems where pointers must be aligned on
// pointer-size boundaries and sizeof(pointer) > sizeof(int) (e.g., Sparc).
#define SBLOCKSIZE 16000        // size of sb_data
typedef struct sblock_S sblock_T;
struct sblock_S {
  int sb_used;                  // nr of bytes already in use
  sblock_T *sb_next;            // next block in list
  // Declared with one element only; the allocation is presumably made larger
  // so that SBLOCKSIZE bytes are usable (allocation code not shown here).
  char_u sb_data[1];            // data, actually longer
};
387
// A node in the tree.
// Nodes are linked two ways: "wn_child" leads to the next byte of the word,
// "wn_sibling" to an alternative byte at the same position.
typedef struct wordnode_S wordnode_T;
struct wordnode_S {
  union {                   // shared to save space
    char_u hashkey[6];      // the hash key, only used while compressing
    int index;              // index in written nodes (valid after first
                            // round)
  } wn_u1;
  union {                   // shared to save space
    wordnode_T *next;       // next node with same hash key
    wordnode_T *wnode;      // parent node that will write this node
  } wn_u2;
  wordnode_T *wn_child;     // child (next byte in word)
  wordnode_T *wn_sibling;   // next sibling (alternate byte in word,
                            // always sorted)
  int wn_refs;              // Nr. of references to this node.  Only
                            // relevant for first node in a list of
                            // siblings, in following siblings it is
                            // always one.
  char_u wn_byte;           // Byte for this node. NUL for word end

  // Info for when "wn_byte" is NUL.
  // In PREFIXTREE "wn_region" is used for the prefcondnr.
  // In the soundfolded word tree "wn_flags" has the MSW of the wordnr and
  // "wn_region" the LSW of the wordnr.
  char_u wn_affixID;        // supported/required prefix ID or 0
  uint16_t wn_flags;        // WF_ flags
  short wn_region;          // region mask

#ifdef SPELL_PRINTTREE
  int wn_nr;                // sequence nr for printing
#endif
};

// 16 bits, the same width as the uint16_t "wn_flags" member.
#define WN_MASK 0xffff      // mask relevant bits of "wn_flags"
423
// Convert a hashtab item to the wordnode_T its key points at.
// The whole expansion is parenthesized, like HI2AH() and HI2CI(), so that
// the result can be used directly in a larger expression such as
// "HI2WN(hi)->wn_child"; without the outer parentheses "->" would bind to
// the key pointer before the cast is applied.
#define HI2WN(hi) ((wordnode_T *)((hi)->hi_key))
425
// Info used while reading the spell files.
// Gathers the word trees being built, the memory blocks backing them and
// everything collected from the .aff file that ends up in a section of the
// .spl file.
typedef struct spellinfo_S {
  wordnode_T *si_foldroot;      // tree with case-folded words
  long si_foldwcount;           // nr of words in si_foldroot

  wordnode_T *si_keeproot;      // tree with keep-case words
  long si_keepwcount;           // nr of words in si_keeproot

  wordnode_T *si_prefroot;      // tree with postponed prefixes

  long si_sugtree;              // creating the soundfolding trie

  sblock_T *si_blocks;          // memory blocks used
  long si_blocks_cnt;           // memory blocks allocated
  int si_did_emsg;              // TRUE when ran out of memory

  long si_compress_cnt;         // words to add before lowering
                                // compression limit
  wordnode_T *si_first_free;    // List of nodes that have been freed during
                                // compression, linked by "wn_child" field.
  long si_free_count;           // number of nodes in si_first_free
#ifdef SPELL_PRINTTREE
  int si_wordnode_nr;           // sequence nr for nodes
#endif
  buf_T *si_spellbuf;           // buffer used to store soundfold word table

  int si_ascii;                 // handling only ASCII words
  int si_add;                   // addition file
  int si_clear_chartab;         // when TRUE clear char tables
  int si_region;                // region mask
  vimconv_T si_conv;            // for conversion to 'encoding'
  int si_memtot;                // runtime memory used
  int si_verbose;               // verbose messages
  int si_msg_count;             // number of words added since last message
  char_u *si_info;              // info text chars or NULL
  int si_region_count;          // number of regions supported (1 when there
                                // are no regions)
  // Two bytes per region name plus a terminating NUL.
  char_u si_region_name[MAXREGIONS * 2 + 1];
                                // region names; used only if
                                // si_region_count > 1)

  garray_T si_rep;              // list of fromto_T entries from REP lines
  garray_T si_repsal;           // list of fromto_T entries from REPSAL lines
  garray_T si_sal;              // list of fromto_T entries from SAL lines
  char_u *si_sofofr;            // SOFOFROM text
  char_u *si_sofoto;            // SOFOTO text
  int si_nosugfile;             // NOSUGFILE item found
  int si_nosplitsugs;           // NOSPLITSUGS item found
  int si_nocompoundsugs;        // NOCOMPOUNDSUGS item found
  int si_followup;              // soundsalike: ?
                                // NOTE(review): presumably pairs with
                                // SAL_F0LLOWUP -- confirm at the writer.
  int si_collapse;              // soundsalike: ?
                                // NOTE(review): presumably pairs with
                                // SAL_COLLAPSE -- confirm at the writer.
  hashtab_T si_commonwords;     // hashtable for common words
  time_t si_sugtime;            // timestamp for .sug file
  int si_rem_accents;           // soundsalike: remove accents
  garray_T si_map;              // MAP info concatenated
  char_u *si_midword;           // MIDWORD chars or NULL
  int si_compmax;               // max nr of words for compounding
  int si_compminlen;            // minimal length for compounding
  int si_compsylmax;            // max nr of syllables for compounding
  int si_compoptions;           // COMP_ flags
  garray_T si_comppat;          // CHECKCOMPOUNDPATTERN items, each stored as
                                // a string
  char_u *si_compflags;         // flags used for compounding
  char_u si_nobreak;            // NOBREAK
  char_u *si_syllable;          // syllable string
  garray_T si_prefcond;         // table with conditions for postponed
                                // prefixes, each stored as a string
  int si_newprefID;             // current value for ah_newID
  int si_newcompID;             // current value for compound ID
} spellinfo_T;
496
497 #ifdef INCLUDE_GENERATED_DECLARATIONS
498 # include "spellfile.c.generated.h"
499 #endif
500
/// Read exactly n bytes from fd into buf, leaving the calling function on
/// failure.
///
/// The expansion contains a `return` statement, so this can only be used in
/// a function that returns an int SP_*ERROR code.  Each argument is
/// evaluated exactly once.
///
/// @param[out]  buf  Buffer to read to, must be at least n bytes long.
/// @param[in]  n  Amount of bytes to read.
/// @param  fd  FILE* to read from.
/// @param  exit_code  Code to run before returning (e.g. cleanup).
///
/// @return Execution continues after the macro when all bytes were read.
///         Otherwise the calling function returns SP_TRUNCERROR when the
///         end of the file was reached and SP_OTHERERROR when reading
///         failed.
#define SPELL_READ_BYTES(buf, n, fd, exit_code) \
  do { \
    const size_t want__SPRB = (n); \
    FILE *const stream__SPRB = (fd); \
    char *const dest__SPRB = (buf); \
    if (fread(dest__SPRB, 1, want__SPRB, stream__SPRB) != want__SPRB) { \
      exit_code; \
      return feof(stream__SPRB) ? SP_TRUNCERROR : SP_OTHERERROR; \
    } \
  } while (0)
521
/// Like #SPELL_READ_BYTES, but additionally reject data containing a NUL.
///
/// The expansion contains `return` statements, so this can only be used in
/// a function that returns an int SP_*ERROR code.
///
/// @return Execution continues after the macro when all bytes were read and
///         none of them is NUL.  Otherwise the calling function returns
///         SP_TRUNCERROR when there are not enough bytes, SP_OTHERERROR when
///         reading failed and SP_FORMERROR when a NUL byte was read.
#define SPELL_READ_NONNUL_BYTES(buf, n, fd, exit_code) \
  do { \
    const size_t len__SPRNB = (n); \
    FILE *const stream__SPRNB = (fd); \
    char *const dest__SPRNB = (buf); \
    SPELL_READ_BYTES(dest__SPRNB, len__SPRNB, stream__SPRNB, exit_code); \
    if (memchr(dest__SPRNB, NUL, (size_t)len__SPRNB) != NULL) { \
      exit_code; \
      return SP_FORMERROR; \
    } \
  } while (0)
538
539 /// Check that spell file starts with a magic string
540 ///
541 /// Does not check for version of the file.
542 ///
543 /// @param fd File to check.
544 ///
545 /// @return 0 in case of success, SP_TRUNCERROR if file contains not enough
546 /// bytes, SP_FORMERROR if it does not match magic string and
547 /// SP_OTHERERROR if reading file failed.
spell_check_magic_string(FILE * const fd)548 static inline int spell_check_magic_string(FILE *const fd)
549 FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
550 {
551 char buf[VIMSPELLMAGICL];
552 SPELL_READ_BYTES(buf, VIMSPELLMAGICL, fd,; );
553 if (memcmp(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0) {
554 return SP_FORMERROR;
555 }
556 return 0;
557 }
558
559 /// Load one spell file and store the info into a slang_T.
560 ///
561 /// This is invoked in three ways:
562 /// - From spell_load_cb() to load a spell file for the first time. "lang" is
563 /// the language name, "old_lp" is NULL. Will allocate an slang_T.
564 /// - To reload a spell file that was changed. "lang" is NULL and "old_lp"
565 /// points to the existing slang_T.
566 /// - Just after writing a .spl file; it's read back to produce the .sug file.
567 /// "old_lp" is NULL and "lang" is NULL. Will allocate an slang_T.
568 ///
569 /// @param silent no error if file doesn't exist
570 ///
571 /// @return the slang_T the spell file was loaded into. NULL for error.
spell_load_file(char_u * fname,char_u * lang,slang_T * old_lp,bool silent)572 slang_T *spell_load_file(char_u *fname, char_u *lang, slang_T *old_lp, bool silent)
573 {
574 FILE *fd;
575 char_u *p;
576 int n;
577 int len;
578 char_u *save_sourcing_name = sourcing_name;
579 linenr_T save_sourcing_lnum = sourcing_lnum;
580 slang_T *lp = NULL;
581 int c = 0;
582 int res;
583
584 fd = os_fopen((char *)fname, "r");
585 if (fd == NULL) {
586 if (!silent) {
587 semsg(_(e_notopen), fname);
588 } else if (p_verbose > 2) {
589 verbose_enter();
590 smsg((char *)e_notopen, fname);
591 verbose_leave();
592 }
593 goto endFAIL;
594 }
595 if (p_verbose > 2) {
596 verbose_enter();
597 smsg(_("Reading spell file \"%s\""), fname);
598 verbose_leave();
599 }
600
601 if (old_lp == NULL) {
602 lp = slang_alloc(lang);
603
604 // Remember the file name, used to reload the file when it's updated.
605 lp->sl_fname = vim_strsave(fname);
606
607 // Check for .add.spl.
608 lp->sl_add = strstr((char *)path_tail(fname), SPL_FNAME_ADD) != NULL;
609 } else {
610 lp = old_lp;
611 }
612
613 // Set sourcing_name, so that error messages mention the file name.
614 sourcing_name = fname;
615 sourcing_lnum = 0;
616
617 // <HEADER>: <fileID>
618 const int scms_ret = spell_check_magic_string(fd);
619 switch (scms_ret) {
620 case SP_FORMERROR:
621 case SP_TRUNCERROR:
622 semsg("%s", _("E757: This does not look like a spell file"));
623 goto endFAIL;
624 case SP_OTHERERROR:
625 semsg(_("E5042: Failed to read spell file %s: %s"),
626 fname, strerror(ferror(fd)));
627 goto endFAIL;
628 case 0:
629 break;
630 }
631 c = getc(fd); // <versionnr>
632 if (c < VIMSPELLVERSION) {
633 emsg(_("E771: Old spell file, needs to be updated"));
634 goto endFAIL;
635 } else if (c > VIMSPELLVERSION) {
636 emsg(_("E772: Spell file is for newer version of Vim"));
637 goto endFAIL;
638 }
639
640
641 // <SECTIONS>: <section> ... <sectionend>
642 // <section>: <sectionID> <sectionflags> <sectionlen> (section contents)
643 for (;;) {
644 n = getc(fd); // <sectionID> or <sectionend>
645 if (n == SN_END) {
646 break;
647 }
648 c = getc(fd); // <sectionflags>
649 len = get4c(fd); // <sectionlen>
650 if (len < 0) {
651 goto truncerr;
652 }
653
654 res = 0;
655 switch (n) {
656 case SN_INFO:
657 lp->sl_info = READ_STRING(fd, len); // <infotext>
658 if (lp->sl_info == NULL) {
659 goto endFAIL;
660 }
661 break;
662
663 case SN_REGION:
664 res = read_region_section(fd, lp, len);
665 break;
666
667 case SN_CHARFLAGS:
668 res = read_charflags_section(fd);
669 break;
670
671 case SN_MIDWORD:
672 lp->sl_midword = READ_STRING(fd, len); // <midword>
673 if (lp->sl_midword == NULL) {
674 goto endFAIL;
675 }
676 break;
677
678 case SN_PREFCOND:
679 res = read_prefcond_section(fd, lp);
680 break;
681
682 case SN_REP:
683 res = read_rep_section(fd, &lp->sl_rep, lp->sl_rep_first);
684 break;
685
686 case SN_REPSAL:
687 res = read_rep_section(fd, &lp->sl_repsal, lp->sl_repsal_first);
688 break;
689
690 case SN_SAL:
691 res = read_sal_section(fd, lp);
692 break;
693
694 case SN_SOFO:
695 res = read_sofo_section(fd, lp);
696 break;
697
698 case SN_MAP:
699 p = READ_STRING(fd, len); // <mapstr>
700 if (p == NULL) {
701 goto endFAIL;
702 }
703 set_map_str(lp, p);
704 xfree(p);
705 break;
706
707 case SN_WORDS:
708 res = read_words_section(fd, lp, len);
709 break;
710
711 case SN_SUGFILE:
712 lp->sl_sugtime = get8ctime(fd); // <timestamp>
713 break;
714
715 case SN_NOSPLITSUGS:
716 lp->sl_nosplitsugs = true;
717 break;
718
719 case SN_NOCOMPOUNDSUGS:
720 lp->sl_nocompoundsugs = true;
721 break;
722
723 case SN_COMPOUND:
724 res = read_compound(fd, lp, len);
725 break;
726
727 case SN_NOBREAK:
728 lp->sl_nobreak = true;
729 break;
730
731 case SN_SYLLABLE:
732 lp->sl_syllable = READ_STRING(fd, len); // <syllable>
733 if (lp->sl_syllable == NULL) {
734 goto endFAIL;
735 }
736 if (init_syl_tab(lp) == FAIL) {
737 goto endFAIL;
738 }
739 break;
740
741 default:
742 // Unsupported section. When it's required give an error
743 // message. When it's not required skip the contents.
744 if (c & SNF_REQUIRED) {
745 emsg(_("E770: Unsupported section in spell file"));
746 goto endFAIL;
747 }
748 while (--len >= 0) {
749 if (getc(fd) < 0) {
750 goto truncerr;
751 }
752 }
753 break;
754 }
755 someerror:
756 if (res == SP_FORMERROR) {
757 emsg(_(e_format));
758 goto endFAIL;
759 }
760 if (res == SP_TRUNCERROR) {
761 truncerr:
762 emsg(_(e_spell_trunc));
763 goto endFAIL;
764 }
765 if (res == SP_OTHERERROR) {
766 goto endFAIL;
767 }
768 }
769
770 // <LWORDTREE>
771 res = spell_read_tree(fd, &lp->sl_fbyts, &lp->sl_fbyts_len,
772 &lp->sl_fidxs, false, 0);
773 if (res != 0) {
774 goto someerror;
775 }
776
777 // <KWORDTREE>
778 res = spell_read_tree(fd, &lp->sl_kbyts, NULL, &lp->sl_kidxs, false, 0);
779 if (res != 0) {
780 goto someerror;
781 }
782
783 // <PREFIXTREE>
784 res = spell_read_tree(fd, &lp->sl_pbyts, NULL, &lp->sl_pidxs, true,
785 lp->sl_prefixcnt);
786 if (res != 0) {
787 goto someerror;
788 }
789
790 // For a new file link it in the list of spell files.
791 if (old_lp == NULL && lang != NULL) {
792 lp->sl_next = first_lang;
793 first_lang = lp;
794 }
795
796 goto endOK;
797
798 endFAIL:
799 if (lang != NULL) {
800 // truncating the name signals the error to spell_load_lang()
801 *lang = NUL;
802 }
803 if (lp != NULL && old_lp == NULL) {
804 slang_free(lp);
805 }
806 lp = NULL;
807
808 endOK:
809 if (fd != NULL) {
810 fclose(fd);
811 }
812 sourcing_name = save_sourcing_name;
813 sourcing_lnum = save_sourcing_lnum;
814
815 return lp;
816 }
817
818 // Fill in the wordcount fields for a trie.
819 // Returns the total number of words.
tree_count_words(char_u * byts,idx_T * idxs)820 static void tree_count_words(char_u *byts, idx_T *idxs)
821 {
822 int depth;
823 idx_T arridx[MAXWLEN];
824 int curi[MAXWLEN];
825 int c;
826 idx_T n;
827 int wordcount[MAXWLEN];
828
829 arridx[0] = 0;
830 curi[0] = 1;
831 wordcount[0] = 0;
832 depth = 0;
833 while (depth >= 0 && !got_int) {
834 if (curi[depth] > byts[arridx[depth]]) {
835 // Done all bytes at this node, go up one level.
836 idxs[arridx[depth]] = wordcount[depth];
837 if (depth > 0) {
838 wordcount[depth - 1] += wordcount[depth];
839 }
840
841 --depth;
842 fast_breakcheck();
843 } else {
844 // Do one more byte at this node.
845 n = arridx[depth] + curi[depth];
846 ++curi[depth];
847
848 c = byts[n];
849 if (c == 0) {
850 // End of word, count it.
851 ++wordcount[depth];
852
853 // Skip over any other NUL bytes (same word with different
854 // flags).
855 while (byts[n + 1] == 0) {
856 ++n;
857 ++curi[depth];
858 }
859 } else {
860 // Normal char, go one level deeper to count the words.
861 ++depth;
862 arridx[depth] = idxs[n];
863 curi[depth] = 1;
864 wordcount[depth] = 0;
865 }
866 }
867 }
868 }
869
870 // Load the .sug files for languages that have one and weren't loaded yet.
suggest_load_files(void)871 void suggest_load_files(void)
872 {
873 langp_T *lp;
874 slang_T *slang;
875 char_u *dotp;
876 FILE *fd;
877 char_u buf[MAXWLEN];
878 int i;
879 time_t timestamp;
880 int wcount;
881 int wordnr;
882 garray_T ga;
883 int c;
884
885 // Do this for all languages that support sound folding.
886 for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
887 lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
888 slang = lp->lp_slang;
889 if (slang->sl_sugtime != 0 && !slang->sl_sugloaded) {
890 // Change ".spl" to ".sug" and open the file. When the file isn't
891 // found silently skip it. Do set "sl_sugloaded" so that we
892 // don't try again and again.
893 slang->sl_sugloaded = true;
894
895 dotp = STRRCHR(slang->sl_fname, '.');
896 if (dotp == NULL || fnamecmp(dotp, ".spl") != 0) {
897 continue;
898 }
899 STRCPY(dotp, ".sug");
900 fd = os_fopen((char *)slang->sl_fname, "r");
901 if (fd == NULL) {
902 goto nextone;
903 }
904
905 // <SUGHEADER>: <fileID> <versionnr> <timestamp>
906 for (i = 0; i < VIMSUGMAGICL; ++i) {
907 buf[i] = getc(fd); // <fileID>
908 }
909 if (STRNCMP(buf, VIMSUGMAGIC, VIMSUGMAGICL) != 0) {
910 semsg(_("E778: This does not look like a .sug file: %s"),
911 slang->sl_fname);
912 goto nextone;
913 }
914 c = getc(fd); // <versionnr>
915 if (c < VIMSUGVERSION) {
916 semsg(_("E779: Old .sug file, needs to be updated: %s"),
917 slang->sl_fname);
918 goto nextone;
919 } else if (c > VIMSUGVERSION) {
920 semsg(_("E780: .sug file is for newer version of Vim: %s"),
921 slang->sl_fname);
922 goto nextone;
923 }
924
925 // Check the timestamp, it must be exactly the same as the one in
926 // the .spl file. Otherwise the word numbers won't match.
927 timestamp = get8ctime(fd); // <timestamp>
928 if (timestamp != slang->sl_sugtime) {
929 semsg(_("E781: .sug file doesn't match .spl file: %s"),
930 slang->sl_fname);
931 goto nextone;
932 }
933
934 // <SUGWORDTREE>: <wordtree>
935 // Read the trie with the soundfolded words.
936 if (spell_read_tree(fd, &slang->sl_sbyts, NULL, &slang->sl_sidxs,
937 false, 0) != 0) {
938 someerror:
939 semsg(_("E782: error while reading .sug file: %s"),
940 slang->sl_fname);
941 slang_clear_sug(slang);
942 goto nextone;
943 }
944
945 // <SUGTABLE>: <sugwcount> <sugline> ...
946 //
947 // Read the table with word numbers. We use a file buffer for
948 // this, because it's so much like a file with lines. Makes it
949 // possible to swap the info and save on memory use.
950 slang->sl_sugbuf = open_spellbuf();
951
952 // <sugwcount>
953 wcount = get4c(fd);
954 if (wcount < 0) {
955 goto someerror;
956 }
957
958 // Read all the wordnr lists into the buffer, one NUL terminated
959 // list per line.
960 ga_init(&ga, 1, 100);
961 for (wordnr = 0; wordnr < wcount; ++wordnr) {
962 ga.ga_len = 0;
963 for (;;) {
964 c = getc(fd); // <sugline>
965 if (c < 0) {
966 goto someerror;
967 }
968 GA_APPEND(char_u, &ga, c);
969 if (c == NUL) {
970 break;
971 }
972 }
973 if (ml_append_buf(slang->sl_sugbuf, (linenr_T)wordnr,
974 ga.ga_data, ga.ga_len, true) == FAIL) {
975 goto someerror;
976 }
977 }
978 ga_clear(&ga);
979
980 // Need to put word counts in the word tries, so that we can find
981 // a word by its number.
982 tree_count_words(slang->sl_fbyts, slang->sl_fidxs);
983 tree_count_words(slang->sl_sbyts, slang->sl_sidxs);
984
985 nextone:
986 if (fd != NULL) {
987 fclose(fd);
988 }
989 STRCPY(dotp, ".spl");
990 }
991 }
992 }
993
994
995 // Read a length field from "fd" in "cnt_bytes" bytes.
996 // Allocate memory, read the string into it and add a NUL at the end.
997 // Returns NULL when the count is zero.
998 // Sets "*cntp" to SP_*ERROR when there is an error, length of the result
999 // otherwise.
read_cnt_string(FILE * fd,int cnt_bytes,int * cntp)1000 static char_u *read_cnt_string(FILE *fd, int cnt_bytes, int *cntp)
1001 {
1002 int cnt = 0;
1003 char_u *str;
1004
1005 // read the length bytes, MSB first
1006 for (int i = 0; i < cnt_bytes; i++) {
1007 const int c = getc(fd);
1008
1009 if (c == EOF) {
1010 *cntp = SP_TRUNCERROR;
1011 return NULL;
1012 }
1013 cnt = (cnt << 8) + (unsigned)c;
1014 }
1015 *cntp = cnt;
1016 if (cnt == 0) {
1017 return NULL; // nothing to read, return NULL
1018 }
1019 str = READ_STRING(fd, cnt);
1020 if (str == NULL) {
1021 *cntp = SP_OTHERERROR;
1022 }
1023 return str;
1024 }
1025
1026 // Read SN_REGION: <regionname> ...
1027 // Return SP_*ERROR flags.
read_region_section(FILE * fd,slang_T * lp,int len)1028 static int read_region_section(FILE *fd, slang_T *lp, int len)
1029 {
1030 if (len > MAXREGIONS * 2) {
1031 return SP_FORMERROR;
1032 }
1033 SPELL_READ_NONNUL_BYTES((char *)lp->sl_regions, (size_t)len, fd,; );
1034 lp->sl_regions[len] = NUL;
1035 return 0;
1036 }
1037
1038 // Read SN_CHARFLAGS section: <charflagslen> <charflags>
1039 // <folcharslen> <folchars>
1040 // Return SP_*ERROR flags.
read_charflags_section(FILE * fd)1041 static int read_charflags_section(FILE *fd)
1042 {
1043 char_u *flags;
1044 char_u *fol;
1045 int flagslen, follen;
1046
1047 // <charflagslen> <charflags>
1048 flags = read_cnt_string(fd, 1, &flagslen);
1049 if (flagslen < 0) {
1050 return flagslen;
1051 }
1052
1053 // <folcharslen> <folchars>
1054 fol = read_cnt_string(fd, 2, &follen);
1055 if (follen < 0) {
1056 xfree(flags);
1057 return follen;
1058 }
1059
1060 // Set the word-char flags and fill SPELL_ISUPPER() table.
1061 if (flags != NULL && fol != NULL) {
1062 set_spell_charflags(flags, flagslen, fol);
1063 }
1064
1065 xfree(flags);
1066 xfree(fol);
1067
1068 // When <charflagslen> is zero then <fcharlen> must also be zero.
1069 if ((flags == NULL) != (fol == NULL)) {
1070 return SP_FORMERROR;
1071 }
1072 return 0;
1073 }
1074
1075 // Read SN_PREFCOND section.
1076 // Return SP_*ERROR flags.
read_prefcond_section(FILE * fd,slang_T * lp)1077 static int read_prefcond_section(FILE *fd, slang_T *lp)
1078 {
1079 // <prefcondcnt> <prefcond> ...
1080 const int cnt = get2c(fd); // <prefcondcnt>
1081 if (cnt <= 0) {
1082 return SP_FORMERROR;
1083 }
1084
1085 lp->sl_prefprog = xcalloc(cnt, sizeof(regprog_T *));
1086 lp->sl_prefixcnt = cnt;
1087
1088 for (int i = 0; i < cnt; i++) {
1089 // <prefcond> : <condlen> <condstr>
1090 const int n = getc(fd); // <condlen>
1091 if (n < 0 || n >= MAXWLEN) {
1092 return SP_FORMERROR;
1093 }
1094
1095 // When <condlen> is zero we have an empty condition. Otherwise
1096 // compile the regexp program used to check for the condition.
1097 if (n > 0) {
1098 char buf[MAXWLEN + 1];
1099 buf[0] = '^'; // always match at one position only
1100 SPELL_READ_NONNUL_BYTES(buf + 1, (size_t)n, fd,; );
1101 buf[n + 1] = NUL;
1102 lp->sl_prefprog[i] = vim_regcomp((char_u *)buf, RE_MAGIC | RE_STRING);
1103 }
1104 }
1105 return 0;
1106 }
1107
1108 // Read REP or REPSAL items section from "fd": <repcount> <rep> ...
1109 // Return SP_*ERROR flags.
read_rep_section(FILE * fd,garray_T * gap,int16_t * first)1110 static int read_rep_section(FILE *fd, garray_T *gap, int16_t *first)
1111 {
1112 int cnt;
1113 fromto_T *ftp;
1114
1115 cnt = get2c(fd); // <repcount>
1116 if (cnt < 0) {
1117 return SP_TRUNCERROR;
1118 }
1119
1120 ga_grow(gap, cnt);
1121
1122 // <rep> : <repfromlen> <repfrom> <reptolen> <repto>
1123 for (; gap->ga_len < cnt; ++gap->ga_len) {
1124 int c;
1125 ftp = &((fromto_T *)gap->ga_data)[gap->ga_len];
1126 ftp->ft_from = read_cnt_string(fd, 1, &c);
1127 if (c < 0) {
1128 return c;
1129 }
1130 if (c == 0) {
1131 return SP_FORMERROR;
1132 }
1133 ftp->ft_to = read_cnt_string(fd, 1, &c);
1134 if (c <= 0) {
1135 xfree(ftp->ft_from);
1136 if (c < 0) {
1137 return c;
1138 }
1139 return SP_FORMERROR;
1140 }
1141 }
1142
1143 // Fill the first-index table.
1144 for (int i = 0; i < 256; ++i) {
1145 first[i] = -1;
1146 }
1147 for (int i = 0; i < gap->ga_len; ++i) {
1148 ftp = &((fromto_T *)gap->ga_data)[i];
1149 if (first[*ftp->ft_from] == -1) {
1150 first[*ftp->ft_from] = i;
1151 }
1152 }
1153 return 0;
1154 }
1155
// Read SN_SAL section: <salflags> <salcount> <sal> ...
//
// Resets "slang->sl_sofo" and, depending on <salflags>, sets "sl_followup",
// "sl_collapse" and "sl_rem_accents" (they are only ever set here, never
// cleared).  Every <sal> item becomes a salitem_T in "slang->sl_sal"; when at
// least one item was read an extra entry with an empty "sm_lead" is appended
// to mark the end.  Finally set_sal_first() is called to fill the first-index
// table.
//
// Return SP_*ERROR flags, zero when OK.
static int read_sal_section(FILE *fd, slang_T *slang)
{
  int cnt;
  garray_T *gap;
  salitem_T *smp;
  int ccnt;
  char_u *p;

  slang->sl_sofo = false;

  // NOTE(review): the getc() result is used without an EOF check.
  const int flags = getc(fd);  // <salflags>
  if (flags & SAL_F0LLOWUP) {
    slang->sl_followup = true;
  }
  if (flags & SAL_COLLAPSE) {
    slang->sl_collapse = true;
  }
  if (flags & SAL_REM_ACCENTS) {
    slang->sl_rem_accents = true;
  }

  cnt = get2c(fd);  // <salcount>
  if (cnt < 0) {
    return SP_TRUNCERROR;
  }

  gap = &slang->sl_sal;
  ga_init(gap, sizeof(salitem_T), 10);
  // One extra slot for the terminating entry added below.
  ga_grow(gap, cnt + 1);

  // <sal> : <salfromlen> <salfrom> <saltolen> <salto>
  // "ga_len" is the loop counter, so it only counts completed items when
  // returning early with an error.
  for (; gap->ga_len < cnt; gap->ga_len++) {
    int c = NUL;

    smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
    ccnt = getc(fd);  // <salfromlen>
    if (ccnt < 0) {
      return SP_TRUNCERROR;
    }
    // A single allocation backs sm_lead, sm_oneof and sm_rules; each part
    // is NUL terminated and they are stored back to back.
    p = xmalloc(ccnt + 2);
    smp->sm_lead = p;

    // Read up to the first special char into sm_lead.
    // "i" counts the bytes of <salfrom> consumed so far.
    // NOTE(review): the bytes read with getc() in this loop and the next one
    // are not checked for EOF.
    int i = 0;
    for (; i < ccnt; ++i) {
      c = getc(fd);  // <salfrom>
      if (vim_strchr((char_u *)"0123456789(-<^$", c) != NULL) {
        break;
      }
      *p++ = c;
    }
    // Byte length for now; replaced by the character count further down.
    smp->sm_leadlen = (int)(p - smp->sm_lead);
    *p++ = NUL;

    // Put (abc) chars in sm_oneof, if any.
    if (c == '(') {
      smp->sm_oneof = p;
      for (++i; i < ccnt; ++i) {
        c = getc(fd);  // <salfrom>
        if (c == ')') {
          break;
        }
        *p++ = c;
      }
      *p++ = NUL;
      // Fetch the byte following ')' so that, like in the case without
      // "(...)", "c" holds the first byte of the rules.
      if (++i < ccnt) {
        c = getc(fd);
      }
    } else {
      smp->sm_oneof = NULL;
    }

    // Any following chars go in sm_rules.
    smp->sm_rules = p;
    if (i < ccnt) {
      // store the char we got while checking for end of sm_lead
      *p++ = c;
    }
    i++;
    if (i < ccnt) {
      // The remainder of <salfrom> is read in one go; on failure the macro
      // frees the shared buffer through its last argument.
      SPELL_READ_NONNUL_BYTES(  // <salfrom>
                              (char *)p, (size_t)(ccnt - i), fd,
                              xfree(smp->sm_lead));
      p += (ccnt - i);
    }
    *p++ = NUL;

    // <saltolen> <salto>
    smp->sm_to = read_cnt_string(fd, 1, &ccnt);
    if (ccnt < 0) {
      xfree(smp->sm_lead);
      return ccnt;
    }

    // convert the multi-byte strings to wide char strings
    smp->sm_lead_w = mb_str2wide(smp->sm_lead);
    smp->sm_leadlen = mb_charlen(smp->sm_lead);
    if (smp->sm_oneof == NULL) {
      smp->sm_oneof_w = NULL;
    } else {
      smp->sm_oneof_w = mb_str2wide(smp->sm_oneof);
    }
    if (smp->sm_to == NULL) {
      smp->sm_to_w = NULL;
    } else {
      smp->sm_to_w = mb_str2wide(smp->sm_to);
    }
  }

  if (!GA_EMPTY(gap)) {
    // Add one extra entry to mark the end with an empty sm_lead.  Avoids
    // that we need to check the index every time.
    smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
    p = xmalloc(1);
    p[0] = NUL;
    smp->sm_lead = p;
    smp->sm_lead_w = mb_str2wide(smp->sm_lead);
    smp->sm_leadlen = 0;
    smp->sm_oneof = NULL;
    smp->sm_oneof_w = NULL;
    // sm_rules points at the same empty string as sm_lead.
    smp->sm_rules = p;
    smp->sm_to = NULL;
    smp->sm_to_w = NULL;
    gap->ga_len++;
  }

  // Fill the first-index table.
  set_sal_first(slang);

  return 0;
}
1289
1290 // Read SN_WORDS: <word> ...
1291 // Return SP_*ERROR flags.
read_words_section(FILE * fd,slang_T * lp,int len)1292 static int read_words_section(FILE *fd, slang_T *lp, int len)
1293 {
1294 int done = 0;
1295 int i;
1296 int c;
1297 char_u word[MAXWLEN];
1298
1299 while (done < len) {
1300 // Read one word at a time.
1301 for (i = 0;; ++i) {
1302 c = getc(fd);
1303 if (c == EOF) {
1304 return SP_TRUNCERROR;
1305 }
1306 word[i] = c;
1307 if (word[i] == NUL) {
1308 break;
1309 }
1310 if (i == MAXWLEN - 1) {
1311 return SP_FORMERROR;
1312 }
1313 }
1314
1315 // Init the count to 10.
1316 count_common_word(lp, word, -1, 10);
1317 done += i + 1;
1318 }
1319 return 0;
1320 }
1321
1322 // SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
1323 // Return SP_*ERROR flags.
read_sofo_section(FILE * fd,slang_T * slang)1324 static int read_sofo_section(FILE *fd, slang_T *slang)
1325 {
1326 int cnt;
1327 char_u *from, *to;
1328 int res;
1329
1330 slang->sl_sofo = true;
1331
1332 // <sofofromlen> <sofofrom>
1333 from = read_cnt_string(fd, 2, &cnt);
1334 if (cnt < 0) {
1335 return cnt;
1336 }
1337
1338 // <sofotolen> <sofoto>
1339 to = read_cnt_string(fd, 2, &cnt);
1340 if (cnt < 0) {
1341 xfree(from);
1342 return cnt;
1343 }
1344
1345 // Store the info in slang->sl_sal and/or slang->sl_sal_first.
1346 if (from != NULL && to != NULL) {
1347 res = set_sofo(slang, from, to);
1348 } else if (from != NULL || to != NULL) {
1349 res = SP_FORMERROR; // only one of two strings is an error
1350 } else {
1351 res = 0;
1352 }
1353
1354 xfree(from);
1355 xfree(to);
1356 return res;
1357 }
1358
// Read the compound section from the .spl file:
//      <compmax> <compminlen> <compsylmax> <compoptions> <compflags>
//
// "len" is the length of the section; "todo" tracks how many of those bytes
// are still expected.  Fills "sl_compmax", "sl_compminlen", "sl_compsylmax",
// optionally "sl_compoptions" and "sl_comppat", and builds from <compflags>:
//   - "sl_compprog":       regexp program for the combined compound rules
//   - "sl_compstartflags": flags that can appear at the start of a rule
//   - "sl_compallflags":   all flags that are used
//   - "sl_comprules":      the rules in their original form, or NULL when a
//                          wildcard was encountered
//
// Returns SP_*ERROR flags, zero when OK.
static int read_compound(FILE *fd, slang_T *slang, int len)
{
  int todo = len;
  int c;
  int atstart;
  char_u *pat;
  char_u *pp;
  char_u *cp;
  char_u *ap;
  char_u *crp;
  int cnt;
  garray_T *gap;

  if (todo < 2) {
    return SP_FORMERROR;        // need at least two bytes
  }
  // The three header bytes below are read with getc() without an explicit
  // EOF check; an EOF (negative) value falls into the "c < N" fallbacks.
  --todo;
  c = getc(fd);                                         // <compmax>
  if (c < 2) {
    c = MAXWLEN;
  }
  slang->sl_compmax = c;

  --todo;
  c = getc(fd);                                         // <compminlen>
  if (c < 1) {
    c = 0;
  }
  slang->sl_compminlen = c;

  // "todo" may become negative here when "len" was 2; that is caught by the
  // "todo < 0" check further down.
  --todo;
  c = getc(fd);                                         // <compsylmax>
  if (c < 1) {
    c = MAXWLEN;
  }
  slang->sl_compsylmax = c;

  // A zero byte here introduces <compoptions> and the compound patterns;
  // any other byte already belongs to <compflags> and is pushed back.
  c = getc(fd);                                         // <compoptions>
  if (c != 0) {
    ungetc(c, fd);          // be backwards compatible with Vim 7.0b
  } else {
    --todo;
    c = getc(fd);           // only use the lower byte for now
    --todo;
    slang->sl_compoptions = c;

    gap = &slang->sl_comppat;
    c = get2c(fd);                                      // <comppatcount>
    if (c < 0) {
      return SP_TRUNCERROR;
    }
    todo -= 2;
    ga_init(gap, sizeof(char_u *), c);
    ga_grow(gap, c);
    while (--c >= 0) {
      ((char_u **)(gap->ga_data))[gap->ga_len++] =
        read_cnt_string(fd, 1, &cnt);
      // <comppatlen> <comppattext>
      if (cnt < 0) {
        return cnt;
      }
      // One length byte plus the text itself.
      todo -= cnt + 1;
    }
  }
  if (todo < 0) {
    return SP_FORMERROR;
  }

  // Turn the COMPOUNDRULE items into a regexp pattern:
  // "a[bc]/a*b+" -> "^\(a[bc]\|a*b\+\)$".
  // Inserting backslashes may double the length, "^\(\)$<Nul>" is 7 bytes.
  // Conversion to utf-8 may double the size.
  // NOTE(review): the size is computed in "int" from a file-supplied length;
  // confirm that callers bound "len" so that this cannot overflow.
  c = todo * 2 + 7;
  c += todo * 2;
  pat = xmalloc(c);

  // We also need a list of all flags that can appear at the start and one
  // for all flags.
  cp = xmalloc(todo + 1);
  slang->sl_compstartflags = cp;
  *cp = NUL;

  ap = xmalloc(todo + 1);
  slang->sl_compallflags = ap;
  *ap = NUL;

  // And a list of all patterns in their original form, for checking whether
  // compounding may work in match_compoundrule().  This is freed when we
  // encounter a wildcard, the check doesn't work then.
  crp = xmalloc(todo + 1);
  slang->sl_comprules = crp;

  pp = pat;
  *pp++ = '^';
  *pp++ = '\\';
  *pp++ = '(';

  // "atstart" is 1 at the start of an item, 2 while inside a leading
  // "[...]" and 0 otherwise.
  atstart = 1;
  while (todo-- > 0) {
    c = getc(fd);                                       // <compflags>
    if (c == EOF) {
      // Only "pat" is freed here; the other buffers are already stored in
      // "slang".
      xfree(pat);
      return SP_TRUNCERROR;
    }

    // Add all flags to "sl_compallflags".
    if (vim_strchr((char_u *)"?*+[]/", c) == NULL
        && !byte_in_str(slang->sl_compallflags, c)) {
      *ap++ = c;
      *ap = NUL;
    }

    if (atstart != 0) {
      // At start of item: copy flags to "sl_compstartflags".  For a
      // [abc] item set "atstart" to 2 and copy up to the ']'.
      if (c == '[') {
        atstart = 2;
      } else if (c == ']') {
        atstart = 0;
      } else {
        if (!byte_in_str(slang->sl_compstartflags, c)) {
          *cp++ = c;
          *cp = NUL;
        }
        if (atstart == 1) {
          atstart = 0;
        }
      }
    }

    // Copy flag to "sl_comprules", unless we run into a wildcard.
    if (crp != NULL) {
      if (c == '?' || c == '+' || c == '*') {
        XFREE_CLEAR(slang->sl_comprules);
        crp = NULL;
      } else {
        *crp++ = c;
      }
    }

    if (c == '/') {         // slash separates two items
      *pp++ = '\\';
      *pp++ = '|';
      atstart = 1;
    } else {              // normal char, "[abc]" and '*' are copied as-is
      if (c == '?' || c == '+' || c == '~') {
        *pp++ = '\\';               // "a?" becomes "a\?", "a+" becomes "a\+"
      }
      pp += utf_char2bytes(c, pp);
    }
  }

  *pp++ = '\\';
  *pp++ = ')';
  *pp++ = '$';
  *pp = NUL;

  // Terminate the list of original rules, if it was kept.
  if (crp != NULL) {
    *crp = NUL;
  }

  slang->sl_compprog = vim_regcomp(pat, RE_MAGIC + RE_STRING + RE_STRICT);
  xfree(pat);
  if (slang->sl_compprog == NULL) {
    return SP_FORMERROR;
  }

  return 0;
}
1531
1532 // Set the SOFOFROM and SOFOTO items in language "lp".
1533 // Returns SP_*ERROR flags when there is something wrong.
set_sofo(slang_T * lp,char_u * from,char_u * to)1534 static int set_sofo(slang_T *lp, char_u *from, char_u *to)
1535 {
1536 char_u *s;
1537 char_u *p;
1538
1539 // Use "sl_sal" as an array with 256 pointers to a list of wide
1540 // characters. The index is the low byte of the character.
1541 // The list contains from-to pairs with a terminating NUL.
1542 // sl_sal_first[] is used for latin1 "from" characters.
1543 garray_T *gap = &lp->sl_sal;
1544 ga_init(gap, sizeof(int *), 1);
1545 ga_grow(gap, 256);
1546 memset(gap->ga_data, 0, sizeof(int *) * 256);
1547 gap->ga_len = 256;
1548
1549 // First count the number of items for each list. Temporarily use
1550 // sl_sal_first[] for this.
1551 for (p = from, s = to; *p != NUL && *s != NUL;) {
1552 const int c = mb_cptr2char_adv((const char_u **)&p);
1553 MB_CPTR_ADV(s);
1554 if (c >= 256) {
1555 lp->sl_sal_first[c & 0xff]++;
1556 }
1557 }
1558 if (*p != NUL || *s != NUL) { // lengths differ
1559 return SP_FORMERROR;
1560 }
1561
1562 // Allocate the lists.
1563 for (int i = 0; i < 256; i++) {
1564 if (lp->sl_sal_first[i] > 0) {
1565 p = xmalloc(sizeof(int) * (lp->sl_sal_first[i] * 2 + 1));
1566 ((int **)gap->ga_data)[i] = (int *)p;
1567 *(int *)p = 0;
1568 }
1569 }
1570
1571 // Put the characters up to 255 in sl_sal_first[] the rest in a sl_sal
1572 // list.
1573 memset(lp->sl_sal_first, 0, sizeof(salfirst_T) * 256);
1574 for (p = from, s = to; *p != NUL && *s != NUL;) {
1575 const int c = mb_cptr2char_adv((const char_u **)&p);
1576 const int i = mb_cptr2char_adv((const char_u **)&s);
1577 if (c >= 256) {
1578 // Append the from-to chars at the end of the list with
1579 // the low byte.
1580 int *inp = ((int **)gap->ga_data)[c & 0xff];
1581 while (*inp != 0) {
1582 inp++;
1583 }
1584 *inp++ = c; // from char
1585 *inp++ = i; // to char
1586 *inp++ = NUL; // NUL at the end
1587 } else {
1588 // mapping byte to char is done in sl_sal_first[]
1589 lp->sl_sal_first[c] = i;
1590 }
1591 }
1592
1593 return 0;
1594 }
1595
// Fill the first-index table for "lp".
//
// After this "lp->sl_sal_first[b]" is the index in "lp->sl_sal" of the first
// item whose first wide character has low byte "b", or -1 when there is no
// such item.  Items that share an index byte are moved so that they follow
// each other, keeping their relative order.
static void set_sal_first(slang_T *lp)
{
  salfirst_T *sfirst;
  salitem_T *smp;
  int c;
  garray_T *gap = &lp->sl_sal;

  sfirst = lp->sl_sal_first;
  for (int i = 0; i < 256; ++i) {
    sfirst[i] = -1;
  }
  smp = (salitem_T *)gap->ga_data;
  for (int i = 0; i < gap->ga_len; i++) {
    // Use the lowest byte of the first character.  For latin1 it's
    // the character, for other encodings it should differ for most
    // characters.
    c = *smp[i].sm_lead_w & 0xff;
    if (sfirst[c] == -1) {
      sfirst[c] = i;

      // Make sure all entries with this byte are following each
      // other.  Move the ones that are in the wrong position.  Do
      // keep the same ordering!
      while (i + 1 < gap->ga_len
             && (*smp[i + 1].sm_lead_w & 0xff) == c) {
        // Skip over entry with same index byte.
        i++;
      }

      // Here "i" is the last entry of the group collected so far; "n" is
      // the distance from "i" to the entry being examined.
      for (int n = 1; i + n < gap->ga_len; n++) {
        if ((*smp[i + n].sm_lead_w & 0xff) == c) {
          salitem_T tsal;

          // Move entry with same index byte after the entries
          // we already found.
          // After "i++; n--" the sum "i + n" still refers to the entry that
          // matched; the "n" entries in between are shifted up by one and
          // the match is stored at the new end of the group.
          i++;
          n--;
          tsal = smp[i + n];
          memmove(smp + i + 1, smp + i, sizeof(salitem_T) * n);
          smp[i] = tsal;
        }
      }
    }
  }
}
1642
1643 // Turn a multi-byte string into a wide character string.
1644 // Return it in allocated memory.
mb_str2wide(char_u * s)1645 static int *mb_str2wide(char_u *s)
1646 {
1647 int i = 0;
1648
1649 int *res = xmalloc((mb_charlen(s) + 1) * sizeof(int));
1650 for (char_u *p = s; *p != NUL;) {
1651 res[i++] = mb_ptr2char_adv((const char_u **)&p);
1652 }
1653 res[i] = NUL;
1654
1655 return res;
1656 }
1657
1658 /// Reads a tree from the .spl or .sug file.
1659 /// Allocates the memory and stores pointers in "bytsp" and "idxsp".
1660 /// This is skipped when the tree has zero length.
1661 ///
1662 /// @param prefixtree true for the prefix tree
1663 /// @param prefixcnt when "prefixtree" is true: prefix count
1664 ///
1665 /// @return zero when OK, SP_ value for an error.
spell_read_tree(FILE * fd,char_u ** bytsp,long * bytsp_len,idx_T ** idxsp,bool prefixtree,int prefixcnt)1666 static int spell_read_tree(FILE *fd, char_u **bytsp, long *bytsp_len, idx_T **idxsp,
1667 bool prefixtree, int prefixcnt)
1668 FUNC_ATTR_NONNULL_ARG(1, 2, 4)
1669 {
1670 int idx;
1671 char_u *bp;
1672 idx_T *ip;
1673
1674 // The tree size was computed when writing the file, so that we can
1675 // allocate it as one long block. <nodecount>
1676 long len = get4c(fd);
1677 if (len < 0) {
1678 return SP_TRUNCERROR;
1679 }
1680 if ((size_t)len >= SIZE_MAX / sizeof(int)) { // -V547
1681 // Invalid length, multiply with sizeof(int) would overflow.
1682 return SP_FORMERROR;
1683 }
1684 if (len > 0) {
1685 // Allocate the byte array.
1686 bp = xmalloc(len);
1687 *bytsp = bp;
1688 if (bytsp_len != NULL) {
1689 *bytsp_len = len;
1690 }
1691
1692 // Allocate the index array.
1693 ip = xcalloc(len, sizeof(*ip));
1694 *idxsp = ip;
1695
1696 // Recursively read the tree and store it in the array.
1697 idx = read_tree_node(fd, bp, ip, len, 0, prefixtree, prefixcnt);
1698 if (idx < 0) {
1699 return idx;
1700 }
1701 }
1702 return 0;
1703 }
1704
/// Read one row of siblings from the spell file and store it in the byte array
/// "byts" and index array "idxs".  Recursively read the children.
///
/// The row occupies "len" + 1 consecutive entries starting at "startidx":
/// byts[startidx] holds the sibling count, the following entries hold one
/// byte value each, with the matching flags or child index in "idxs".
///
/// NOTE: The code here must match put_node()!
///
/// Returns the index (>= 0) following the siblings.
/// Returns SP_TRUNCERROR if the file is shorter than expected.
/// Returns SP_FORMERROR if there is a format error.
///
/// @param fd             file to read from
/// @param byts           byte array to fill
/// @param idxs           index array to fill
/// @param maxidx         size of arrays
/// @param startidx       current index in "byts" and "idxs"
/// @param prefixtree     true for reading PREFIXTREE
/// @param maxprefcondnr  maximum for <prefcondnr>
static idx_T read_tree_node(FILE *fd, char_u *byts, idx_T *idxs, int maxidx, idx_T startidx,
                            bool prefixtree, int maxprefcondnr)
{
  int len;
  int i;
  int n;
  idx_T idx = startidx;
  int c;
  int c2;
  // Marks an idxs[] entry as a reference to a shared node while the row is
  // being read; the mark is removed again in the second loop below.
#define SHARED_MASK     0x8000000

  len = getc(fd);                                       // <siblingcount>
  if (len <= 0) {
    return SP_TRUNCERROR;
  }

  // Entries startidx .. startidx + len are written below, they must all fit.
  if (startidx + len >= maxidx) {
    return SP_FORMERROR;
  }
  byts[idx++] = len;

  // Read the byte values, flag/region bytes and shared indexes.
  for (i = 1; i <= len; ++i) {
    c = getc(fd);                                       // <byte>
    if (c < 0) {
      return SP_TRUNCERROR;
    }
    if (c <= BY_SPECIAL) {
      if (c == BY_NOFLAGS && !prefixtree) {
        // No flags, all regions.
        idxs[idx] = 0;
      } else if (c != BY_INDEX) {
        // NOTE(review): the getc()/get2c() results in this branch are used
        // without a check for EOF (negative value).
        if (prefixtree) {
          // Read the optional pflags byte, the prefix ID and the
          // condition nr.  In idxs[] store the prefix ID in the low
          // byte, the condition index shifted up 8 bits, the flags
          // shifted up 24 bits.
          if (c == BY_FLAGS) {
            c = getc(fd) << 24;                         // <pflags>
          } else {
            c = 0;
          }

          c |= getc(fd);                                // <affixID>

          n = get2c(fd);                                // <prefcondnr>
          if (n >= maxprefcondnr) {
            return SP_FORMERROR;
          }
          c |= (n << 8);
        } else {      // c must be BY_FLAGS or BY_FLAGS2
          // Read flags and optional region and prefix ID.  In
          // idxs[] the flags go in the low two bytes, region above
          // that and prefix ID above the region.
          c2 = c;
          c = getc(fd);                                 // <flags>
          if (c2 == BY_FLAGS2) {
            c = (getc(fd) << 8) + c;                    // <flags2>
          }
          if (c & WF_REGION) {
            c = (getc(fd) << 16) + c;                   // <region>
          }
          if (c & WF_AFX) {
            c = (getc(fd) << 24) + c;                   // <affixID>
          }
        }

        idxs[idx] = c;
        // The byte value stored for this sibling is zero.
        c = 0;
      } else {      // c == BY_INDEX
        // <nodeidx>
        n = get3c(fd);
        if (n < 0 || n >= maxidx) {
          return SP_FORMERROR;
        }
        idxs[idx] = n + SHARED_MASK;
        c = getc(fd);                                   // <xbyte>
      }
    }
    byts[idx++] = c;
  }

  // Recursively read the children for non-shared siblings.
  // Skip the end-of-word ones (zero byte value) and the shared ones (and
  // remove SHARED_MASK)
  for (i = 1; i <= len; ++i) {
    if (byts[startidx + i] != 0) {
      if (idxs[startidx + i] & SHARED_MASK) {
        idxs[startidx + i] &= ~SHARED_MASK;
      } else {
        // The child row is stored right after everything read so far.
        idxs[startidx + i] = idx;
        idx = read_tree_node(fd, byts, idxs, maxidx, idx,
                             prefixtree, maxprefcondnr);
        if (idx < 0) {
          // Error: "idx" holds the SP_*ERROR value and is returned below.
          break;
        }
      }
    }
  }

  return idx;
}
1820
1821 /// Reload the spell file "fname" if it's loaded.
1822 ///
1823 /// @param added_word invoked through "zg"
spell_reload_one(char_u * fname,bool added_word)1824 static void spell_reload_one(char_u *fname, bool added_word)
1825 {
1826 slang_T *slang;
1827 bool didit = false;
1828
1829 for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
1830 if (path_full_compare(fname, slang->sl_fname, false, true) == kEqualFiles) {
1831 slang_clear(slang);
1832 if (spell_load_file(fname, NULL, slang, false) == NULL) {
1833 // reloading failed, clear the language
1834 slang_clear(slang);
1835 }
1836 redraw_all_later(SOME_VALID);
1837 didit = true;
1838 }
1839 }
1840
1841 // When "zg" was used and the file wasn't loaded yet, should redo
1842 // 'spelllang' to load it now.
1843 if (added_word && !didit) {
1844 did_set_spelllang(curwin);
1845 }
1846 }
1847
// Functions for ":mkspell".

// In the postponed prefixes tree wn_flags is used to store the WFP_ flags,
// but it must be negative to indicate the prefix tree to tree_add_word().
// Use a negative number with the lower 8 bits zero.
#define PFX_FLAGS       -256

// flags for "condit" argument of store_aff_word()
// Each value is a separate bit.
#define CONDIT_COMB     1       // affix must combine
#define CONDIT_CFIX     2       // affix must have CIRCUMFIX flag
#define CONDIT_SUF      4       // add a suffix for matching flags
#define CONDIT_AFF      8       // word already has an affix

// Tunable parameters for when the tree is compressed.  Filled from the
// 'mkspellmem' option by spell_check_msm(); the initializers are the values
// used until then.
static long compress_start = 30000;     // memory / SBLOCKSIZE
static long compress_inc = 100;         // memory / SBLOCKSIZE
static long compress_added = 500000;    // word count
1866
1867 // Check the 'mkspellmem' option. Return FAIL if it's wrong.
1868 // Sets "sps_flags".
spell_check_msm(void)1869 int spell_check_msm(void)
1870 {
1871 char_u *p = p_msm;
1872 long start = 0;
1873 long incr = 0;
1874 long added = 0;
1875
1876 if (!ascii_isdigit(*p)) {
1877 return FAIL;
1878 }
1879 // block count = (value * 1024) / SBLOCKSIZE (but avoid overflow)
1880 start = (getdigits_long(&p, true, 0) * 10) / (SBLOCKSIZE / 102);
1881 if (*p != ',') {
1882 return FAIL;
1883 }
1884 p++;
1885 if (!ascii_isdigit(*p)) {
1886 return FAIL;
1887 }
1888 incr = (getdigits_long(&p, true, 0) * 102) / (SBLOCKSIZE / 10);
1889 if (*p != ',') {
1890 return FAIL;
1891 }
1892 p++;
1893 if (!ascii_isdigit(*p)) {
1894 return FAIL;
1895 }
1896 added = getdigits_long(&p, true, 0) * 1024;
1897 if (*p != NUL) {
1898 return FAIL;
1899 }
1900
1901 if (start == 0 || incr == 0 || added == 0 || incr > start) {
1902 return FAIL;
1903 }
1904
1905 compress_start = start;
1906 compress_inc = incr;
1907 compress_added = added;
1908 return OK;
1909 }
1910
#ifdef SPELL_PRINTTREE
// For debugging the tree code: print the current tree in a (more or less)
// readable format, so that we can see what happens when adding a word and/or
// compressing the tree.
// Based on code from Olaf Seibert.
# define PRINTLINESIZE  1000    // size of each of the three line buffers
# define PRINTWIDTH     6       // number of columns used per tree level

// Format "a1" and "a2" with "fmt" into line buffer "l", starting at the
// column that belongs to "depth" and limited to the space that remains.
# define PRINTSOME(l, depth, fmt, a1, a2) vim_snprintf(l + depth * PRINTWIDTH, \
                                                       PRINTLINESIZE - PRINTWIDTH * depth, fmt, a1, \
                                                       a2)

// The three text lines that spell_print_node() builds up and prints.
static char line1[PRINTLINESIZE];
static char line2[PRINTLINESIZE];
static char line3[PRINTLINESIZE];
1926
spell_clear_flags(wordnode_T * node)1927 static void spell_clear_flags(wordnode_T *node)
1928 {
1929 wordnode_T *np;
1930
1931 for (np = node; np != NULL; np = np->wn_sibling) {
1932 np->wn_u1.index = FALSE;
1933 spell_clear_flags(np->wn_child);
1934 }
1935 }
1936
spell_print_node(wordnode_T * node,int depth)1937 static void spell_print_node(wordnode_T *node, int depth)
1938 {
1939 if (node->wn_u1.index) {
1940 // Done this node before, print the reference.
1941 PRINTSOME(line1, depth, "(%d)", node->wn_nr, 0);
1942 PRINTSOME(line2, depth, " ", 0, 0);
1943 PRINTSOME(line3, depth, " ", 0, 0);
1944 msg((char_u *)line1);
1945 msg((char_u *)line2);
1946 msg((char_u *)line3);
1947 } else {
1948 node->wn_u1.index = TRUE;
1949
1950 if (node->wn_byte != NUL) {
1951 if (node->wn_child != NULL) {
1952 PRINTSOME(line1, depth, " %c -> ", node->wn_byte, 0);
1953 } else {
1954 // Cannot happen?
1955 PRINTSOME(line1, depth, " %c ???", node->wn_byte, 0);
1956 }
1957 } else {
1958 PRINTSOME(line1, depth, " $ ", 0, 0);
1959 }
1960
1961 PRINTSOME(line2, depth, "%d/%d ", node->wn_nr, node->wn_refs);
1962
1963 if (node->wn_sibling != NULL) {
1964 PRINTSOME(line3, depth, " | ", 0, 0);
1965 } else {
1966 PRINTSOME(line3, depth, " ", 0, 0);
1967 }
1968
1969 if (node->wn_byte == NUL) {
1970 msg((char_u *)line1);
1971 msg((char_u *)line2);
1972 msg((char_u *)line3);
1973 }
1974
1975 // do the children
1976 if (node->wn_byte != NUL && node->wn_child != NULL) {
1977 spell_print_node(node->wn_child, depth + 1);
1978 }
1979
1980 // do the siblings
1981 if (node->wn_sibling != NULL) {
1982 // get rid of all parent details except |
1983 STRCPY(line1, line3);
1984 STRCPY(line2, line3);
1985 spell_print_node(node->wn_sibling, depth);
1986 }
1987 }
1988 }
1989
spell_print_tree(wordnode_T * root)1990 static void spell_print_tree(wordnode_T *root)
1991 {
1992 if (root != NULL) {
1993 // Clear the "wn_u1.index" fields, used to remember what has been
1994 // done.
1995 spell_clear_flags(root);
1996
1997 // Recursively print the tree.
1998 spell_print_node(root, 0);
1999 }
2000 }
2001
2002 #endif // SPELL_PRINTTREE
2003
2004 // Reads the affix file "fname".
2005 // Returns an afffile_T, NULL for complete failure.
spell_read_aff(spellinfo_T * spin,char_u * fname)2006 static afffile_T *spell_read_aff(spellinfo_T *spin, char_u *fname)
2007 {
2008 FILE *fd;
2009 char_u rline[MAXLINELEN];
2010 char_u *line;
2011 char_u *pc = NULL;
2012 #define MAXITEMCNT 30
2013 char_u *(items[MAXITEMCNT]);
2014 int itemcnt;
2015 char_u *p;
2016 int lnum = 0;
2017 affheader_T *cur_aff = NULL;
2018 bool did_postpone_prefix = false;
2019 int aff_todo = 0;
2020 hashtab_T *tp;
2021 char_u *low = NULL;
2022 char_u *fol = NULL;
2023 char_u *upp = NULL;
2024 int do_rep;
2025 int do_repsal;
2026 int do_sal;
2027 int do_mapline;
2028 bool found_map = false;
2029 hashitem_T *hi;
2030 int l;
2031 int compminlen = 0; // COMPOUNDMIN value
2032 int compsylmax = 0; // COMPOUNDSYLMAX value
2033 int compoptions = 0; // COMP_ flags
2034 int compmax = 0; // COMPOUNDWORDMAX value
2035 char_u *compflags = NULL; // COMPOUNDFLAG and COMPOUNDRULE
2036 // concatenated
2037 char_u *midword = NULL; // MIDWORD value
2038 char_u *syllable = NULL; // SYLLABLE value
2039 char_u *sofofrom = NULL; // SOFOFROM value
2040 char_u *sofoto = NULL; // SOFOTO value
2041
2042 // Open the file.
2043 fd = os_fopen((char *)fname, "r");
2044 if (fd == NULL) {
2045 semsg(_(e_notopen), fname);
2046 return NULL;
2047 }
2048
2049 vim_snprintf((char *)IObuff, IOSIZE, _("Reading affix file %s..."), fname);
2050 spell_message(spin, IObuff);
2051
2052 // Only do REP lines when not done in another .aff file already.
2053 do_rep = GA_EMPTY(&spin->si_rep);
2054
2055 // Only do REPSAL lines when not done in another .aff file already.
2056 do_repsal = GA_EMPTY(&spin->si_repsal);
2057
2058 // Only do SAL lines when not done in another .aff file already.
2059 do_sal = GA_EMPTY(&spin->si_sal);
2060
2061 // Only do MAP lines when not done in another .aff file already.
2062 do_mapline = GA_EMPTY(&spin->si_map);
2063
2064 // Allocate and init the afffile_T structure.
2065 afffile_T *aff = getroom(spin, sizeof(*aff), true);
2066 hash_init(&aff->af_pref);
2067 hash_init(&aff->af_suff);
2068 hash_init(&aff->af_comp);
2069
2070 // Read all the lines in the file one by one.
2071 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int) {
2072 line_breakcheck();
2073 ++lnum;
2074
2075 // Skip comment lines.
2076 if (*rline == '#') {
2077 continue;
2078 }
2079
2080 // Convert from "SET" to 'encoding' when needed.
2081 xfree(pc);
2082 if (spin->si_conv.vc_type != CONV_NONE) {
2083 pc = string_convert(&spin->si_conv, rline, NULL);
2084 if (pc == NULL) {
2085 smsg(_("Conversion failure for word in %s line %d: %s"),
2086 fname, lnum, rline);
2087 continue;
2088 }
2089 line = pc;
2090 } else {
2091 pc = NULL;
2092 line = rline;
2093 }
2094
2095 // Split the line up in white separated items. Put a NUL after each
2096 // item.
2097 itemcnt = 0;
2098 for (p = line;;) {
2099 while (*p != NUL && *p <= ' ') { // skip white space and CR/NL
2100 ++p;
2101 }
2102 if (*p == NUL) {
2103 break;
2104 }
2105 if (itemcnt == MAXITEMCNT) { // too many items
2106 break;
2107 }
2108 items[itemcnt++] = p;
2109 // A few items have arbitrary text argument, don't split them.
2110 if (itemcnt == 2 && spell_info_item(items[0])) {
2111 while (*p >= ' ' || *p == TAB) { // skip until CR/NL
2112 ++p;
2113 }
2114 } else {
2115 while (*p > ' ') { // skip until white space or CR/NL
2116 ++p;
2117 }
2118 }
2119 if (*p == NUL) {
2120 break;
2121 }
2122 *p++ = NUL;
2123 }
2124
2125 // Handle non-empty lines.
2126 if (itemcnt > 0) {
2127 if (is_aff_rule(items, itemcnt, "SET", 2) && aff->af_enc == NULL) {
2128 // Setup for conversion from "ENC" to 'encoding'.
2129 aff->af_enc = enc_canonize(items[1]);
2130 if (!spin->si_ascii
2131 && convert_setup(&spin->si_conv, aff->af_enc,
2132 p_enc) == FAIL) {
2133 smsg(_("Conversion in %s not supported: from %s to %s"),
2134 fname, aff->af_enc, p_enc);
2135 }
2136 spin->si_conv.vc_fail = true;
2137 } else if (is_aff_rule(items, itemcnt, "FLAG", 2)
2138 && aff->af_flagtype == AFT_CHAR) {
2139 if (STRCMP(items[1], "long") == 0) {
2140 aff->af_flagtype = AFT_LONG;
2141 } else if (STRCMP(items[1], "num") == 0) {
2142 aff->af_flagtype = AFT_NUM;
2143 } else if (STRCMP(items[1], "caplong") == 0) {
2144 aff->af_flagtype = AFT_CAPLONG;
2145 } else {
2146 smsg(_("Invalid value for FLAG in %s line %d: %s"),
2147 fname, lnum, items[1]);
2148 }
2149 if (aff->af_rare != 0
2150 || aff->af_keepcase != 0
2151 || aff->af_bad != 0
2152 || aff->af_needaffix != 0
2153 || aff->af_circumfix != 0
2154 || aff->af_needcomp != 0
2155 || aff->af_comproot != 0
2156 || aff->af_nosuggest != 0
2157 || compflags != NULL
2158 || aff->af_suff.ht_used > 0
2159 || aff->af_pref.ht_used > 0) {
2160 smsg(_("FLAG after using flags in %s line %d: %s"),
2161 fname, lnum, items[1]);
2162 }
2163 } else if (spell_info_item(items[0]) && itemcnt > 1) {
2164 p = getroom(spin,
2165 (spin->si_info == NULL ? 0 : STRLEN(spin->si_info))
2166 + STRLEN(items[0])
2167 + STRLEN(items[1]) + 3, false);
2168 if (spin->si_info != NULL) {
2169 STRCPY(p, spin->si_info);
2170 STRCAT(p, "\n");
2171 }
2172 STRCAT(p, items[0]);
2173 STRCAT(p, " ");
2174 STRCAT(p, items[1]);
2175 spin->si_info = p;
2176 } else if (is_aff_rule(items, itemcnt, "MIDWORD", 2)
2177 && midword == NULL) {
2178 midword = getroom_save(spin, items[1]);
2179 } else if (is_aff_rule(items, itemcnt, "TRY", 2)) {
2180 // ignored, we look in the tree for what chars may appear
2181 }
2182 // TODO: remove "RAR" later
2183 else if ((is_aff_rule(items, itemcnt, "RAR", 2)
2184 || is_aff_rule(items, itemcnt, "RARE", 2))
2185 && aff->af_rare == 0) {
2186 aff->af_rare = affitem2flag(aff->af_flagtype, items[1],
2187 fname, lnum);
2188 }
2189 // TODO: remove "KEP" later
2190 else if ((is_aff_rule(items, itemcnt, "KEP", 2)
2191 || is_aff_rule(items, itemcnt, "KEEPCASE", 2))
2192 && aff->af_keepcase == 0) {
2193 aff->af_keepcase = affitem2flag(aff->af_flagtype, items[1],
2194 fname, lnum);
2195 } else if ((is_aff_rule(items, itemcnt, "BAD", 2)
2196 || is_aff_rule(items, itemcnt, "FORBIDDENWORD", 2))
2197 && aff->af_bad == 0) {
2198 aff->af_bad = affitem2flag(aff->af_flagtype, items[1],
2199 fname, lnum);
2200 } else if (is_aff_rule(items, itemcnt, "NEEDAFFIX", 2)
2201 && aff->af_needaffix == 0) {
2202 aff->af_needaffix = affitem2flag(aff->af_flagtype, items[1],
2203 fname, lnum);
2204 } else if (is_aff_rule(items, itemcnt, "CIRCUMFIX", 2)
2205 && aff->af_circumfix == 0) {
2206 aff->af_circumfix = affitem2flag(aff->af_flagtype, items[1],
2207 fname, lnum);
2208 } else if (is_aff_rule(items, itemcnt, "NOSUGGEST", 2)
2209 && aff->af_nosuggest == 0) {
2210 aff->af_nosuggest = affitem2flag(aff->af_flagtype, items[1],
2211 fname, lnum);
2212 } else if ((is_aff_rule(items, itemcnt, "NEEDCOMPOUND", 2)
2213 || is_aff_rule(items, itemcnt, "ONLYINCOMPOUND", 2))
2214 && aff->af_needcomp == 0) {
2215 aff->af_needcomp = affitem2flag(aff->af_flagtype, items[1],
2216 fname, lnum);
2217 } else if (is_aff_rule(items, itemcnt, "COMPOUNDROOT", 2)
2218 && aff->af_comproot == 0) {
2219 aff->af_comproot = affitem2flag(aff->af_flagtype, items[1],
2220 fname, lnum);
2221 } else if (is_aff_rule(items, itemcnt, "COMPOUNDFORBIDFLAG", 2)
2222 && aff->af_compforbid == 0) {
2223 aff->af_compforbid = affitem2flag(aff->af_flagtype, items[1],
2224 fname, lnum);
2225 if (aff->af_pref.ht_used > 0) {
2226 smsg(_("Defining COMPOUNDFORBIDFLAG after PFX item may give wrong results in %s line %d"),
2227 fname, lnum);
2228 }
2229 } else if (is_aff_rule(items, itemcnt, "COMPOUNDPERMITFLAG", 2)
2230 && aff->af_comppermit == 0) {
2231 aff->af_comppermit = affitem2flag(aff->af_flagtype, items[1],
2232 fname, lnum);
2233 if (aff->af_pref.ht_used > 0) {
2234 smsg(_("Defining COMPOUNDPERMITFLAG after PFX item may give wrong results in %s line %d"),
2235 fname, lnum);
2236 }
2237 } else if (is_aff_rule(items, itemcnt, "COMPOUNDFLAG", 2)
2238 && compflags == NULL) {
2239 // Turn flag "c" into COMPOUNDRULE compatible string "c+",
2240 // "Na" into "Na+", "1234" into "1234+".
2241 p = getroom(spin, STRLEN(items[1]) + 2, false);
2242 STRCPY(p, items[1]);
2243 STRCAT(p, "+");
2244 compflags = p;
2245 } else if (is_aff_rule(items, itemcnt, "COMPOUNDRULES", 2)) {
2246 // We don't use the count, but do check that it's a number and
2247 // not COMPOUNDRULE mistyped.
2248 if (atoi((char *)items[1]) == 0) {
2249 smsg(_("Wrong COMPOUNDRULES value in %s line %d: %s"),
2250 fname, lnum, items[1]);
2251 }
2252 } else if (is_aff_rule(items, itemcnt, "COMPOUNDRULE", 2)) {
2253 // Don't use the first rule if it is a number.
2254 if (compflags != NULL || *skipdigits(items[1]) != NUL) {
2255 // Concatenate this string to previously defined ones,
2256 // using a slash to separate them.
2257 l = (int)STRLEN(items[1]) + 1;
2258 if (compflags != NULL) {
2259 l += (int)STRLEN(compflags) + 1;
2260 }
2261 p = getroom(spin, l, false);
2262 if (compflags != NULL) {
2263 STRCPY(p, compflags);
2264 STRCAT(p, "/");
2265 }
2266 STRCAT(p, items[1]);
2267 compflags = p;
2268 }
2269 } else if (is_aff_rule(items, itemcnt, "COMPOUNDWORDMAX", 2)
2270 && compmax == 0) {
2271 compmax = atoi((char *)items[1]);
2272 if (compmax == 0) {
2273 smsg(_("Wrong COMPOUNDWORDMAX value in %s line %d: %s"),
2274 fname, lnum, items[1]);
2275 }
2276 } else if (is_aff_rule(items, itemcnt, "COMPOUNDMIN", 2)
2277 && compminlen == 0) {
2278 compminlen = atoi((char *)items[1]);
2279 if (compminlen == 0) {
2280 smsg(_("Wrong COMPOUNDMIN value in %s line %d: %s"),
2281 fname, lnum, items[1]);
2282 }
2283 } else if (is_aff_rule(items, itemcnt, "COMPOUNDSYLMAX", 2)
2284 && compsylmax == 0) {
2285 compsylmax = atoi((char *)items[1]);
2286 if (compsylmax == 0) {
2287 smsg(_("Wrong COMPOUNDSYLMAX value in %s line %d: %s"),
2288 fname, lnum, items[1]);
2289 }
2290 } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDDUP", 1)) {
2291 compoptions |= COMP_CHECKDUP;
2292 } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDREP", 1)) {
2293 compoptions |= COMP_CHECKREP;
2294 } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDCASE", 1)) {
2295 compoptions |= COMP_CHECKCASE;
2296 } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDTRIPLE", 1)) {
2297 compoptions |= COMP_CHECKTRIPLE;
2298 } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDPATTERN", 2)) {
2299 if (atoi((char *)items[1]) == 0) {
2300 smsg(_("Wrong CHECKCOMPOUNDPATTERN value in %s line %d: %s"),
2301 fname, lnum, items[1]);
2302 }
2303 } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDPATTERN", 3)) {
2304 garray_T *gap = &spin->si_comppat;
2305 int i;
2306
2307 // Only add the couple if it isn't already there.
2308 for (i = 0; i < gap->ga_len - 1; i += 2) {
2309 if (STRCMP(((char_u **)(gap->ga_data))[i], items[1]) == 0
2310 && STRCMP(((char_u **)(gap->ga_data))[i + 1],
2311 items[2]) == 0) {
2312 break;
2313 }
2314 }
2315 if (i >= gap->ga_len) {
2316 ga_grow(gap, 2);
2317 ((char_u **)(gap->ga_data))[gap->ga_len++]
2318 = getroom_save(spin, items[1]);
2319 ((char_u **)(gap->ga_data))[gap->ga_len++]
2320 = getroom_save(spin, items[2]);
2321 }
2322 } else if (is_aff_rule(items, itemcnt, "SYLLABLE", 2)
2323 && syllable == NULL) {
2324 syllable = getroom_save(spin, items[1]);
2325 } else if (is_aff_rule(items, itemcnt, "NOBREAK", 1)) {
2326 spin->si_nobreak = true;
2327 } else if (is_aff_rule(items, itemcnt, "NOSPLITSUGS", 1)) {
2328 spin->si_nosplitsugs = true;
2329 } else if (is_aff_rule(items, itemcnt, "NOCOMPOUNDSUGS", 1)) {
2330 spin->si_nocompoundsugs = true;
2331 } else if (is_aff_rule(items, itemcnt, "NOSUGFILE", 1)) {
2332 spin->si_nosugfile = true;
2333 } else if (is_aff_rule(items, itemcnt, "PFXPOSTPONE", 1)) {
2334 aff->af_pfxpostpone = true;
2335 } else if (is_aff_rule(items, itemcnt, "IGNOREEXTRA", 1)) {
2336 aff->af_ignoreextra = true;
2337 } else if ((STRCMP(items[0], "PFX") == 0
2338 || STRCMP(items[0], "SFX") == 0)
2339 && aff_todo == 0
2340 && itemcnt >= 4) {
2341 int lasti = 4;
2342 char_u key[AH_KEY_LEN];
2343
2344 if (*items[0] == 'P') {
2345 tp = &aff->af_pref;
2346 } else {
2347 tp = &aff->af_suff;
2348 }
2349
2350 // Myspell allows the same affix name to be used multiple
2351 // times. The affix files that do this have an undocumented
2352 // "S" flag on all but the last block, thus we check for that
2353 // and store it in ah_follows.
2354 STRLCPY(key, items[1], AH_KEY_LEN);
2355 hi = hash_find(tp, key);
2356 if (!HASHITEM_EMPTY(hi)) {
2357 cur_aff = HI2AH(hi);
2358 if (cur_aff->ah_combine != (*items[2] == 'Y')) {
2359 smsg(_("Different combining flag in continued affix block in %s line %d: %s"),
2360 fname, lnum, items[1]);
2361 }
2362 if (!cur_aff->ah_follows) {
2363 smsg(_("Duplicate affix in %s line %d: %s"),
2364 fname, lnum, items[1]);
2365 }
2366 } else {
2367 // New affix letter.
2368 cur_aff = getroom(spin, sizeof(*cur_aff), true);
2369 cur_aff->ah_flag = affitem2flag(aff->af_flagtype, items[1],
2370 fname, lnum);
2371 if (cur_aff->ah_flag == 0 || STRLEN(items[1]) >= AH_KEY_LEN) {
2372 break;
2373 }
2374 if (cur_aff->ah_flag == aff->af_bad
2375 || cur_aff->ah_flag == aff->af_rare
2376 || cur_aff->ah_flag == aff->af_keepcase
2377 || cur_aff->ah_flag == aff->af_needaffix
2378 || cur_aff->ah_flag == aff->af_circumfix
2379 || cur_aff->ah_flag == aff->af_nosuggest
2380 || cur_aff->ah_flag == aff->af_needcomp
2381 || cur_aff->ah_flag == aff->af_comproot) {
2382 smsg(_("Affix also used for "
2383 "BAD/RARE/KEEPCASE/NEEDAFFIX/NEEDCOMPOUND/NOSUGGEST"
2384 "in %s line %d: %s"),
2385 fname, lnum, items[1]);
2386 }
2387 STRCPY(cur_aff->ah_key, items[1]);
2388 hash_add(tp, cur_aff->ah_key);
2389
2390 cur_aff->ah_combine = (*items[2] == 'Y');
2391 }
2392
2393 // Check for the "S" flag, which apparently means that another
2394 // block with the same affix name is following.
2395 if (itemcnt > lasti && STRCMP(items[lasti], "S") == 0) {
2396 ++lasti;
2397 cur_aff->ah_follows = true;
2398 } else {
2399 cur_aff->ah_follows = false;
2400 }
2401
2402 // Myspell allows extra text after the item, but that might
2403 // mean mistakes go unnoticed. Require a comment-starter,
2404 // unless IGNOREEXTRA is used. Hunspell uses a "-" item.
2405 if (itemcnt > lasti
2406 && !aff->af_ignoreextra
2407 && *items[lasti] != '#') {
2408 smsg(_(e_afftrailing), fname, lnum, items[lasti]);
2409 }
2410
2411 if (STRCMP(items[2], "Y") != 0 && STRCMP(items[2], "N") != 0) {
2412 smsg(_("Expected Y or N in %s line %d: %s"),
2413 fname, lnum, items[2]);
2414 }
2415
2416 if (*items[0] == 'P' && aff->af_pfxpostpone) {
2417 if (cur_aff->ah_newID == 0) {
2418 // Use a new number in the .spl file later, to be able
2419 // to handle multiple .aff files.
2420 check_renumber(spin);
2421 cur_aff->ah_newID = ++spin->si_newprefID;
2422
2423 // We only really use ah_newID if the prefix is
2424 // postponed. We know that only after handling all
2425 // the items.
2426 did_postpone_prefix = false;
2427 } else {
2428 // Did use the ID in a previous block.
2429 did_postpone_prefix = true;
2430 }
2431 }
2432
2433 aff_todo = atoi((char *)items[3]);
2434 } else if ((STRCMP(items[0], "PFX") == 0
2435 || STRCMP(items[0], "SFX") == 0)
2436 && aff_todo > 0
2437 && STRCMP(cur_aff->ah_key, items[1]) == 0
2438 && itemcnt >= 5) {
2439 affentry_T *aff_entry;
2440 bool upper = false;
2441 int lasti = 5;
2442
2443 // Myspell allows extra text after the item, but that might
2444 // mean mistakes go unnoticed. Require a comment-starter.
2445 // Hunspell uses a "-" item.
2446 if (itemcnt > lasti && *items[lasti] != '#'
2447 && (STRCMP(items[lasti], "-") != 0
2448 || itemcnt != lasti + 1)) {
2449 smsg(_(e_afftrailing), fname, lnum, items[lasti]);
2450 }
2451
2452 // New item for an affix letter.
2453 aff_todo--;
2454 aff_entry = getroom(spin, sizeof(*aff_entry), true);
2455
2456 if (STRCMP(items[2], "0") != 0) {
2457 aff_entry->ae_chop = getroom_save(spin, items[2]);
2458 }
2459 if (STRCMP(items[3], "0") != 0) {
2460 aff_entry->ae_add = getroom_save(spin, items[3]);
2461
2462 // Recognize flags on the affix: abcd/XYZ
2463 aff_entry->ae_flags = vim_strchr(aff_entry->ae_add, '/');
2464 if (aff_entry->ae_flags != NULL) {
2465 *aff_entry->ae_flags++ = NUL;
2466 aff_process_flags(aff, aff_entry);
2467 }
2468 }
2469
2470 // Don't use an affix entry with non-ASCII characters when
2471 // "spin->si_ascii" is true.
2472 if (!spin->si_ascii || !(has_non_ascii(aff_entry->ae_chop)
2473 || has_non_ascii(aff_entry->ae_add))) {
2474 aff_entry->ae_next = cur_aff->ah_first;
2475 cur_aff->ah_first = aff_entry;
2476
2477 if (STRCMP(items[4], ".") != 0) {
2478 char_u buf[MAXLINELEN];
2479
2480 aff_entry->ae_cond = getroom_save(spin, items[4]);
2481 if (*items[0] == 'P') {
2482 sprintf((char *)buf, "^%s", items[4]);
2483 } else {
2484 sprintf((char *)buf, "%s$", items[4]);
2485 }
2486 aff_entry->ae_prog = vim_regcomp(buf,
2487 RE_MAGIC + RE_STRING + RE_STRICT);
2488 if (aff_entry->ae_prog == NULL) {
2489 smsg(_("Broken condition in %s line %d: %s"),
2490 fname, lnum, items[4]);
2491 }
2492 }
2493
2494 // For postponed prefixes we need an entry in si_prefcond
2495 // for the condition. Use an existing one if possible.
2496 // Can't be done for an affix with flags, ignoring
2497 // COMPOUNDFORBIDFLAG and COMPOUNDPERMITFLAG.
2498 if (*items[0] == 'P' && aff->af_pfxpostpone
2499 && aff_entry->ae_flags == NULL) {
2500 // When the chop string is one lower-case letter and
2501 // the add string ends in the upper-case letter we set
2502 // the "upper" flag, clear "ae_chop" and remove the
2503 // letters from "ae_add". The condition must either
2504 // be empty or start with the same letter.
2505 if (aff_entry->ae_chop != NULL
2506 && aff_entry->ae_add != NULL
2507 && aff_entry->ae_chop[utfc_ptr2len(aff_entry->ae_chop)] ==
2508 NUL) {
2509 int c, c_up;
2510
2511 c = utf_ptr2char(aff_entry->ae_chop);
2512 c_up = SPELL_TOUPPER(c);
2513 if (c_up != c
2514 && (aff_entry->ae_cond == NULL
2515 || utf_ptr2char(aff_entry->ae_cond) == c)) {
2516 p = aff_entry->ae_add
2517 + STRLEN(aff_entry->ae_add);
2518 MB_PTR_BACK(aff_entry->ae_add, p);
2519 if (utf_ptr2char(p) == c_up) {
2520 upper = true;
2521 aff_entry->ae_chop = NULL;
2522 *p = NUL;
2523
2524 // The condition is matched with the
2525 // actual word, thus must check for the
2526 // upper-case letter.
2527 if (aff_entry->ae_cond != NULL) {
2528 char_u buf[MAXLINELEN];
2529 onecap_copy(items[4], buf, true);
2530 aff_entry->ae_cond = getroom_save(spin, buf);
2531 if (aff_entry->ae_cond != NULL) {
2532 sprintf((char *)buf, "^%s",
2533 aff_entry->ae_cond);
2534 vim_regfree(aff_entry->ae_prog);
2535 aff_entry->ae_prog = vim_regcomp(buf, RE_MAGIC + RE_STRING);
2536 }
2537 }
2538 }
2539 }
2540 }
2541
2542 if (aff_entry->ae_chop == NULL) {
2543 int idx;
2544 char_u **pp;
2545 int n;
2546
2547 // Find a previously used condition.
2548 for (idx = spin->si_prefcond.ga_len - 1; idx >= 0;
2549 --idx) {
2550 p = ((char_u **)spin->si_prefcond.ga_data)[idx];
2551 if (str_equal(p, aff_entry->ae_cond)) {
2552 break;
2553 }
2554 }
2555 if (idx < 0) {
2556 // Not found, add a new condition.
2557 idx = spin->si_prefcond.ga_len;
2558 pp = GA_APPEND_VIA_PTR(char_u *, &spin->si_prefcond);
2559 *pp = (aff_entry->ae_cond == NULL) ?
2560 NULL : getroom_save(spin, aff_entry->ae_cond);
2561 }
2562
2563 // Add the prefix to the prefix tree.
2564 if (aff_entry->ae_add == NULL) {
2565 p = (char_u *)"";
2566 } else {
2567 p = aff_entry->ae_add;
2568 }
2569
2570 // PFX_FLAGS is a negative number, so that
2571 // tree_add_word() knows this is the prefix tree.
2572 n = PFX_FLAGS;
2573 if (!cur_aff->ah_combine) {
2574 n |= WFP_NC;
2575 }
2576 if (upper) {
2577 n |= WFP_UP;
2578 }
2579 if (aff_entry->ae_comppermit) {
2580 n |= WFP_COMPPERMIT;
2581 }
2582 if (aff_entry->ae_compforbid) {
2583 n |= WFP_COMPFORBID;
2584 }
2585 tree_add_word(spin, p, spin->si_prefroot, n,
2586 idx, cur_aff->ah_newID);
2587 did_postpone_prefix = true;
2588 }
2589
2590 // Didn't actually use ah_newID, backup si_newprefID.
2591 if (aff_todo == 0 && !did_postpone_prefix) {
2592 --spin->si_newprefID;
2593 cur_aff->ah_newID = 0;
2594 }
2595 }
2596 }
2597 } else if (is_aff_rule(items, itemcnt, "FOL", 2) && fol == NULL) {
2598 fol = vim_strsave(items[1]);
2599 } else if (is_aff_rule(items, itemcnt, "LOW", 2) && low == NULL) {
2600 low = vim_strsave(items[1]);
2601 } else if (is_aff_rule(items, itemcnt, "UPP", 2) && upp == NULL) {
2602 upp = vim_strsave(items[1]);
2603 } else if (is_aff_rule(items, itemcnt, "REP", 2)
2604 || is_aff_rule(items, itemcnt, "REPSAL", 2)) {
2605 // Ignore REP/REPSAL count
2606 if (!isdigit(*items[1])) {
2607 smsg(_("Expected REP(SAL) count in %s line %d"),
2608 fname, lnum);
2609 }
2610 } else if ((STRCMP(items[0], "REP") == 0
2611 || STRCMP(items[0], "REPSAL") == 0)
2612 && itemcnt >= 3) {
2613 // REP/REPSAL item
2614 // Myspell ignores extra arguments, we require it starts with
2615 // # to detect mistakes.
2616 if (itemcnt > 3 && items[3][0] != '#') {
2617 smsg(_(e_afftrailing), fname, lnum, items[3]);
2618 }
2619 if (items[0][3] == 'S' ? do_repsal : do_rep) {
2620 // Replace underscore with space (can't include a space
2621 // directly).
2622 for (p = items[1]; *p != NUL; MB_PTR_ADV(p)) {
2623 if (*p == '_') {
2624 *p = ' ';
2625 }
2626 }
2627 for (p = items[2]; *p != NUL; MB_PTR_ADV(p)) {
2628 if (*p == '_') {
2629 *p = ' ';
2630 }
2631 }
2632 add_fromto(spin, items[0][3] == 'S'
2633 ? &spin->si_repsal
2634 : &spin->si_rep, items[1], items[2]);
2635 }
2636 } else if (is_aff_rule(items, itemcnt, "MAP", 2)) {
2637 // MAP item or count
2638 if (!found_map) {
2639 // First line contains the count.
2640 found_map = true;
2641 if (!isdigit(*items[1])) {
2642 smsg(_("Expected MAP count in %s line %d"),
2643 fname, lnum);
2644 }
2645 } else if (do_mapline) {
2646 int c;
2647
2648 // Check that every character appears only once.
2649 for (p = items[1]; *p != NUL;) {
2650 c = mb_ptr2char_adv((const char_u **)&p);
2651 if ((!GA_EMPTY(&spin->si_map)
2652 && vim_strchr(spin->si_map.ga_data, c)
2653 != NULL)
2654 || vim_strchr(p, c) != NULL) {
2655 smsg(_("Duplicate character in MAP in %s line %d"),
2656 fname, lnum);
2657 }
2658 }
2659
2660 // We simply concatenate all the MAP strings, separated by
2661 // slashes.
2662 ga_concat(&spin->si_map, (char *)items[1]);
2663 ga_append(&spin->si_map, '/');
2664 }
2665 }
2666 // Accept "SAL from to" and "SAL from to #comment".
2667 else if (is_aff_rule(items, itemcnt, "SAL", 3)) {
2668 if (do_sal) {
2669 // SAL item (sounds-a-like)
2670 // Either one of the known keys or a from-to pair.
2671 if (STRCMP(items[1], "followup") == 0) {
2672 spin->si_followup = sal_to_bool(items[2]);
2673 } else if (STRCMP(items[1], "collapse_result") == 0) {
2674 spin->si_collapse = sal_to_bool(items[2]);
2675 } else if (STRCMP(items[1], "remove_accents") == 0) {
2676 spin->si_rem_accents = sal_to_bool(items[2]);
2677 } else {
2678 // when "to" is "_" it means empty
2679 add_fromto(spin, &spin->si_sal, items[1],
2680 STRCMP(items[2], "_") == 0 ? (char_u *)""
2681 : items[2]);
2682 }
2683 }
2684 } else if (is_aff_rule(items, itemcnt, "SOFOFROM", 2)
2685 && sofofrom == NULL) {
2686 sofofrom = getroom_save(spin, items[1]);
2687 } else if (is_aff_rule(items, itemcnt, "SOFOTO", 2)
2688 && sofoto == NULL) {
2689 sofoto = getroom_save(spin, items[1]);
2690 } else if (STRCMP(items[0], "COMMON") == 0) {
2691 int i;
2692
2693 for (i = 1; i < itemcnt; ++i) {
2694 if (HASHITEM_EMPTY(hash_find(&spin->si_commonwords,
2695 items[i]))) {
2696 p = vim_strsave(items[i]);
2697 hash_add(&spin->si_commonwords, p);
2698 }
2699 }
2700 } else {
2701 smsg(_("Unrecognized or duplicate item in %s line %d: %s"),
2702 fname, lnum, items[0]);
2703 }
2704 }
2705 }
2706
2707 if (fol != NULL || low != NULL || upp != NULL) {
2708 if (spin->si_clear_chartab) {
2709 // Clear the char type tables, don't want to use any of the
2710 // currently used spell properties.
2711 init_spell_chartab();
2712 spin->si_clear_chartab = false;
2713 }
2714
2715 xfree(fol);
2716 xfree(low);
2717 xfree(upp);
2718 }
2719
2720 // Use compound specifications of the .aff file for the spell info.
2721 if (compmax != 0) {
2722 aff_check_number(spin->si_compmax, compmax, "COMPOUNDWORDMAX");
2723 spin->si_compmax = compmax;
2724 }
2725
2726 if (compminlen != 0) {
2727 aff_check_number(spin->si_compminlen, compminlen, "COMPOUNDMIN");
2728 spin->si_compminlen = compminlen;
2729 }
2730
2731 if (compsylmax != 0) {
2732 if (syllable == NULL) {
2733 smsg("%s", _("COMPOUNDSYLMAX used without SYLLABLE"));
2734 }
2735 aff_check_number(spin->si_compsylmax, compsylmax, "COMPOUNDSYLMAX");
2736 spin->si_compsylmax = compsylmax;
2737 }
2738
2739 if (compoptions != 0) {
2740 aff_check_number(spin->si_compoptions, compoptions, "COMPOUND options");
2741 spin->si_compoptions |= compoptions;
2742 }
2743
2744 if (compflags != NULL) {
2745 process_compflags(spin, aff, compflags);
2746 }
2747
2748 // Check that we didn't use too many renumbered flags.
2749 if (spin->si_newcompID < spin->si_newprefID) {
2750 if (spin->si_newcompID == 127 || spin->si_newcompID == 255) {
2751 msg(_("Too many postponed prefixes"));
2752 } else if (spin->si_newprefID == 0 || spin->si_newprefID == 127) {
2753 msg(_("Too many compound flags"));
2754 } else {
2755 msg(_("Too many postponed prefixes and/or compound flags"));
2756 }
2757 }
2758
2759 if (syllable != NULL) {
2760 aff_check_string(spin->si_syllable, syllable, "SYLLABLE");
2761 spin->si_syllable = syllable;
2762 }
2763
2764 if (sofofrom != NULL || sofoto != NULL) {
2765 if (sofofrom == NULL || sofoto == NULL) {
2766 smsg(_("Missing SOFO%s line in %s"),
2767 sofofrom == NULL ? "FROM" : "TO", fname);
2768 } else if (!GA_EMPTY(&spin->si_sal)) {
2769 smsg(_("Both SAL and SOFO lines in %s"), fname);
2770 } else {
2771 aff_check_string(spin->si_sofofr, sofofrom, "SOFOFROM");
2772 aff_check_string(spin->si_sofoto, sofoto, "SOFOTO");
2773 spin->si_sofofr = sofofrom;
2774 spin->si_sofoto = sofoto;
2775 }
2776 }
2777
2778 if (midword != NULL) {
2779 aff_check_string(spin->si_midword, midword, "MIDWORD");
2780 spin->si_midword = midword;
2781 }
2782
2783 xfree(pc);
2784 fclose(fd);
2785 return aff;
2786 }
2787
2788 // Returns true when items[0] equals "rulename", there are "mincount" items or
2789 // a comment is following after item "mincount".
is_aff_rule(char_u ** items,int itemcnt,char * rulename,int mincount)2790 static bool is_aff_rule(char_u **items, int itemcnt, char *rulename, int mincount)
2791 {
2792 return STRCMP(items[0], rulename) == 0
2793 && (itemcnt == mincount
2794 || (itemcnt > mincount && items[mincount][0] == '#'));
2795 }
2796
2797 // For affix "entry" move COMPOUNDFORBIDFLAG and COMPOUNDPERMITFLAG from
2798 // ae_flags to ae_comppermit and ae_compforbid.
aff_process_flags(afffile_T * affile,affentry_T * entry)2799 static void aff_process_flags(afffile_T *affile, affentry_T *entry)
2800 {
2801 char_u *p;
2802 char_u *prevp;
2803 unsigned flag;
2804
2805 if (entry->ae_flags != NULL
2806 && (affile->af_compforbid != 0 || affile->af_comppermit != 0)) {
2807 for (p = entry->ae_flags; *p != NUL;) {
2808 prevp = p;
2809 flag = get_affitem(affile->af_flagtype, &p);
2810 if (flag == affile->af_comppermit || flag == affile->af_compforbid) {
2811 STRMOVE(prevp, p);
2812 p = prevp;
2813 if (flag == affile->af_comppermit) {
2814 entry->ae_comppermit = true;
2815 } else {
2816 entry->ae_compforbid = true;
2817 }
2818 }
2819 if (affile->af_flagtype == AFT_NUM && *p == ',') {
2820 ++p;
2821 }
2822 }
2823 if (*entry->ae_flags == NUL) {
2824 entry->ae_flags = NULL; // nothing left
2825 }
2826 }
2827 }
2828
2829 // Returns true if "s" is the name of an info item in the affix file.
spell_info_item(char_u * s)2830 static bool spell_info_item(char_u *s)
2831 {
2832 return STRCMP(s, "NAME") == 0
2833 || STRCMP(s, "HOME") == 0
2834 || STRCMP(s, "VERSION") == 0
2835 || STRCMP(s, "AUTHOR") == 0
2836 || STRCMP(s, "EMAIL") == 0
2837 || STRCMP(s, "COPYRIGHT") == 0;
2838 }
2839
2840 // Turn an affix flag name into a number, according to the FLAG type.
2841 // returns zero for failure.
affitem2flag(int flagtype,char_u * item,char_u * fname,int lnum)2842 static unsigned affitem2flag(int flagtype, char_u *item, char_u *fname, int lnum)
2843 {
2844 unsigned res;
2845 char_u *p = item;
2846
2847 res = get_affitem(flagtype, &p);
2848 if (res == 0) {
2849 if (flagtype == AFT_NUM) {
2850 smsg(_("Flag is not a number in %s line %d: %s"),
2851 fname, lnum, item);
2852 } else {
2853 smsg(_("Illegal flag in %s line %d: %s"),
2854 fname, lnum, item);
2855 }
2856 }
2857 if (*p != NUL) {
2858 smsg(_(e_affname), fname, lnum, item);
2859 return 0;
2860 }
2861
2862 return res;
2863 }
2864
2865 // Get one affix name from "*pp" and advance the pointer.
2866 // Returns ZERO_FLAG for "0".
2867 // Returns zero for an error, still advances the pointer then.
get_affitem(int flagtype,char_u ** pp)2868 static unsigned get_affitem(int flagtype, char_u **pp)
2869 {
2870 int res;
2871
2872 if (flagtype == AFT_NUM) {
2873 if (!ascii_isdigit(**pp)) {
2874 ++*pp; // always advance, avoid getting stuck
2875 return 0;
2876 }
2877 res = getdigits_int(pp, true, 0);
2878 if (res == 0) {
2879 res = ZERO_FLAG;
2880 }
2881 } else {
2882 res = mb_ptr2char_adv((const char_u **)pp);
2883 if (flagtype == AFT_LONG || (flagtype == AFT_CAPLONG
2884 && res >= 'A' && res <= 'Z')) {
2885 if (**pp == NUL) {
2886 return 0;
2887 }
2888 res = mb_ptr2char_adv((const char_u **)pp) + (res << 16);
2889 }
2890 }
2891 return res;
2892 }
2893
2894 // Process the "compflags" string used in an affix file and append it to
2895 // spin->si_compflags.
2896 // The processing involves changing the affix names to ID numbers, so that
2897 // they fit in one byte.
process_compflags(spellinfo_T * spin,afffile_T * aff,char_u * compflags)2898 static void process_compflags(spellinfo_T *spin, afffile_T *aff, char_u *compflags)
2899 {
2900 char_u *p;
2901 char_u *prevp;
2902 unsigned flag;
2903 compitem_T *ci;
2904 int id;
2905 int len;
2906 char_u *tp;
2907 char_u key[AH_KEY_LEN];
2908 hashitem_T *hi;
2909
2910 // Make room for the old and the new compflags, concatenated with a / in
2911 // between. Processing it makes it shorter, but we don't know by how
2912 // much, thus allocate the maximum.
2913 len = (int)STRLEN(compflags) + 1;
2914 if (spin->si_compflags != NULL) {
2915 len += (int)STRLEN(spin->si_compflags) + 1;
2916 }
2917 p = getroom(spin, len, false);
2918 if (spin->si_compflags != NULL) {
2919 STRCPY(p, spin->si_compflags);
2920 STRCAT(p, "/");
2921 }
2922 spin->si_compflags = p;
2923 tp = p + STRLEN(p);
2924
2925 for (p = compflags; *p != NUL;) {
2926 if (vim_strchr((char_u *)"/?*+[]", *p) != NULL) {
2927 // Copy non-flag characters directly.
2928 *tp++ = *p++;
2929 } else {
2930 // First get the flag number, also checks validity.
2931 prevp = p;
2932 flag = get_affitem(aff->af_flagtype, &p);
2933 if (flag != 0) {
2934 // Find the flag in the hashtable. If it was used before, use
2935 // the existing ID. Otherwise add a new entry.
2936 STRLCPY(key, prevp, p - prevp + 1);
2937 hi = hash_find(&aff->af_comp, key);
2938 if (!HASHITEM_EMPTY(hi)) {
2939 id = HI2CI(hi)->ci_newID;
2940 } else {
2941 ci = getroom(spin, sizeof(compitem_T), true);
2942 STRCPY(ci->ci_key, key);
2943 ci->ci_flag = flag;
2944 // Avoid using a flag ID that has a special meaning in a
2945 // regexp (also inside []).
2946 do {
2947 check_renumber(spin);
2948 id = spin->si_newcompID--;
2949 } while (vim_strchr((char_u *)"/?*+[]\\-^", id) != NULL);
2950 ci->ci_newID = id;
2951 hash_add(&aff->af_comp, ci->ci_key);
2952 }
2953 *tp++ = id;
2954 }
2955 if (aff->af_flagtype == AFT_NUM && *p == ',') {
2956 ++p;
2957 }
2958 }
2959 }
2960
2961 *tp = NUL;
2962 }
2963
2964 // Check that the new IDs for postponed affixes and compounding don't overrun
2965 // each other. We have almost 255 available, but start at 0-127 to avoid
2966 // using two bytes for utf-8. When the 0-127 range is used up go to 128-255.
2967 // When that is used up an error message is given.
check_renumber(spellinfo_T * spin)2968 static void check_renumber(spellinfo_T *spin)
2969 {
2970 if (spin->si_newprefID == spin->si_newcompID && spin->si_newcompID < 128) {
2971 spin->si_newprefID = 127;
2972 spin->si_newcompID = 255;
2973 }
2974 }
2975
2976 // Returns true if flag "flag" appears in affix list "afflist".
flag_in_afflist(int flagtype,char_u * afflist,unsigned flag)2977 static bool flag_in_afflist(int flagtype, char_u *afflist, unsigned flag)
2978 {
2979 char_u *p;
2980 unsigned n;
2981
2982 switch (flagtype) {
2983 case AFT_CHAR:
2984 return vim_strchr(afflist, flag) != NULL;
2985
2986 case AFT_CAPLONG:
2987 case AFT_LONG:
2988 for (p = afflist; *p != NUL;) {
2989 n = mb_ptr2char_adv((const char_u **)&p);
2990 if ((flagtype == AFT_LONG || (n >= 'A' && n <= 'Z'))
2991 && *p != NUL) {
2992 n = mb_ptr2char_adv((const char_u **)&p) + (n << 16);
2993 }
2994 if (n == flag) {
2995 return true;
2996 }
2997 }
2998 break;
2999
3000 case AFT_NUM:
3001 for (p = afflist; *p != NUL;) {
3002 int digits = getdigits_int(&p, true, 0);
3003 assert(digits >= 0);
3004 n = (unsigned int)digits;
3005 if (n == 0) {
3006 n = ZERO_FLAG;
3007 }
3008 if (n == flag) {
3009 return true;
3010 }
3011 if (*p != NUL) { // skip over comma
3012 p++;
3013 }
3014 }
3015 break;
3016 }
3017 return false;
3018 }
3019
// Give a warning when the numbers "spinval" and "affval" are both set and
// not the same.  "spinval" being zero means it was not set, nothing is
// checked then.  "name" is the name of the item, used in the message.
static void aff_check_number(int spinval, int affval, char *name)
{
  // Not set before, or the values agree: nothing to report.
  if (spinval == 0 || spinval == affval) {
    return;
  }

  smsg(_("%s value differs from what is used in another .aff file"),
       name);
}
3028
3029 // Give a warning when "spinval" and "affval" strings are set and not the same.
aff_check_string(char_u * spinval,char_u * affval,char * name)3030 static void aff_check_string(char_u *spinval, char_u *affval, char *name)
3031 {
3032 if (spinval != NULL && STRCMP(spinval, affval) != 0) {
3033 smsg(_("%s value differs from what is used in another .aff file"),
3034 name);
3035 }
3036 }
3037
3038 // Returns true if strings "s1" and "s2" are equal. Also consider both being
3039 // NULL as equal.
str_equal(char_u * s1,char_u * s2)3040 static bool str_equal(char_u *s1, char_u *s2)
3041 {
3042 if (s1 == NULL || s2 == NULL) {
3043 return s1 == s2;
3044 }
3045 return STRCMP(s1, s2) == 0;
3046 }
3047
3048 // Add a from-to item to "gap". Used for REP and SAL items.
3049 // They are stored case-folded.
add_fromto(spellinfo_T * spin,garray_T * gap,char_u * from,char_u * to)3050 static void add_fromto(spellinfo_T *spin, garray_T *gap, char_u *from, char_u *to)
3051 {
3052 char_u word[MAXWLEN];
3053
3054 fromto_T *ftp = GA_APPEND_VIA_PTR(fromto_T, gap);
3055 (void)spell_casefold(curwin, from, (int)STRLEN(from), word, MAXWLEN);
3056 ftp->ft_from = getroom_save(spin, word);
3057 (void)spell_casefold(curwin, to, (int)STRLEN(to), word, MAXWLEN);
3058 ftp->ft_to = getroom_save(spin, word);
3059 }
3060
3061 // Converts a boolean argument in a SAL line to true or false;
sal_to_bool(char_u * s)3062 static bool sal_to_bool(char_u *s)
3063 {
3064 return STRCMP(s, "1") == 0 || STRCMP(s, "true") == 0;
3065 }
3066
3067 // Free the structure filled by spell_read_aff().
spell_free_aff(afffile_T * aff)3068 static void spell_free_aff(afffile_T *aff)
3069 {
3070 hashtab_T *ht;
3071 hashitem_T *hi;
3072 int todo;
3073 affheader_T *ah;
3074 affentry_T *ae;
3075
3076 xfree(aff->af_enc);
3077
3078 // All this trouble to free the "ae_prog" items...
3079 for (ht = &aff->af_pref;; ht = &aff->af_suff) {
3080 todo = (int)ht->ht_used;
3081 for (hi = ht->ht_array; todo > 0; ++hi) {
3082 if (!HASHITEM_EMPTY(hi)) {
3083 --todo;
3084 ah = HI2AH(hi);
3085 for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next) {
3086 vim_regfree(ae->ae_prog);
3087 }
3088 }
3089 }
3090 if (ht == &aff->af_suff) {
3091 break;
3092 }
3093 }
3094
3095 hash_clear(&aff->af_pref);
3096 hash_clear(&aff->af_suff);
3097 hash_clear(&aff->af_comp);
3098 }
3099
3100 // Read dictionary file "fname".
3101 // Returns OK or FAIL;
spell_read_dic(spellinfo_T * spin,char_u * fname,afffile_T * affile)3102 static int spell_read_dic(spellinfo_T *spin, char_u *fname, afffile_T *affile)
3103 {
3104 hashtab_T ht;
3105 char_u line[MAXLINELEN];
3106 char_u *p;
3107 char_u *afflist;
3108 char_u store_afflist[MAXWLEN];
3109 int pfxlen;
3110 bool need_affix;
3111 char_u *dw;
3112 char_u *pc;
3113 char_u *w;
3114 int l;
3115 hash_T hash;
3116 hashitem_T *hi;
3117 FILE *fd;
3118 int lnum = 1;
3119 int non_ascii = 0;
3120 int retval = OK;
3121 char_u message[MAXLINELEN + MAXWLEN];
3122 int flags;
3123 int duplicate = 0;
3124 Timestamp last_msg_time = 0;
3125
3126 // Open the file.
3127 fd = os_fopen((char *)fname, "r");
3128 if (fd == NULL) {
3129 semsg(_(e_notopen), fname);
3130 return FAIL;
3131 }
3132
3133 // The hashtable is only used to detect duplicated words.
3134 hash_init(&ht);
3135
3136 vim_snprintf((char *)IObuff, IOSIZE,
3137 _("Reading dictionary file %s..."), fname);
3138 spell_message(spin, IObuff);
3139
3140 // start with a message for the first line
3141 spin->si_msg_count = 999999;
3142
3143 // Read and ignore the first line: word count.
3144 if (vim_fgets(line, MAXLINELEN, fd) || !ascii_isdigit(*skipwhite(line))) {
3145 semsg(_("E760: No word count in %s"), fname);
3146 }
3147
3148 // Read all the lines in the file one by one.
3149 // The words are converted to 'encoding' here, before being added to
3150 // the hashtable.
3151 while (!vim_fgets(line, MAXLINELEN, fd) && !got_int) {
3152 line_breakcheck();
3153 ++lnum;
3154 if (line[0] == '#' || line[0] == '/') {
3155 continue; // comment line
3156 }
3157 // Remove CR, LF and white space from the end. White space halfway through
3158 // the word is kept to allow multi-word terms like "et al.".
3159 l = (int)STRLEN(line);
3160 while (l > 0 && line[l - 1] <= ' ') {
3161 --l;
3162 }
3163 if (l == 0) {
3164 continue; // empty line
3165 }
3166 line[l] = NUL;
3167
3168 // Convert from "SET" to 'encoding' when needed.
3169 if (spin->si_conv.vc_type != CONV_NONE) {
3170 pc = string_convert(&spin->si_conv, line, NULL);
3171 if (pc == NULL) {
3172 smsg(_("Conversion failure for word in %s line %d: %s"),
3173 fname, lnum, line);
3174 continue;
3175 }
3176 w = pc;
3177 } else {
3178 pc = NULL;
3179 w = line;
3180 }
3181
3182 // Truncate the word at the "/", set "afflist" to what follows.
3183 // Replace "\/" by "/" and "\\" by "\".
3184 afflist = NULL;
3185 for (p = w; *p != NUL; MB_PTR_ADV(p)) {
3186 if (*p == '\\' && (p[1] == '\\' || p[1] == '/')) {
3187 STRMOVE(p, p + 1);
3188 } else if (*p == '/') {
3189 *p = NUL;
3190 afflist = p + 1;
3191 break;
3192 }
3193 }
3194
3195 // Skip non-ASCII words when "spin->si_ascii" is true.
3196 if (spin->si_ascii && has_non_ascii(w)) {
3197 ++non_ascii;
3198 xfree(pc);
3199 continue;
3200 }
3201
3202 // This takes time, print a message every 10000 words, but not more
3203 // often than once per second.
3204 if (spin->si_verbose && spin->si_msg_count > 10000) {
3205 spin->si_msg_count = 0;
3206 if (os_time() > last_msg_time) {
3207 last_msg_time = os_time();
3208 vim_snprintf((char *)message, sizeof(message),
3209 _("line %6d, word %6ld - %s"),
3210 lnum, spin->si_foldwcount + spin->si_keepwcount, w);
3211 msg_start();
3212 msg_outtrans_long_attr(message, 0);
3213 msg_clr_eos();
3214 msg_didout = false;
3215 msg_col = 0;
3216 ui_flush();
3217 }
3218 }
3219
3220 // Store the word in the hashtable to be able to find duplicates.
3221 dw = getroom_save(spin, w);
3222 if (dw == NULL) {
3223 retval = FAIL;
3224 xfree(pc);
3225 break;
3226 }
3227
3228 hash = hash_hash(dw);
3229 hi = hash_lookup(&ht, (const char *)dw, STRLEN(dw), hash);
3230 if (!HASHITEM_EMPTY(hi)) {
3231 if (p_verbose > 0) {
3232 smsg(_("Duplicate word in %s line %d: %s"),
3233 fname, lnum, dw);
3234 } else if (duplicate == 0) {
3235 smsg(_("First duplicate word in %s line %d: %s"),
3236 fname, lnum, dw);
3237 }
3238 ++duplicate;
3239 } else {
3240 hash_add_item(&ht, hi, dw, hash);
3241 }
3242
3243 flags = 0;
3244 store_afflist[0] = NUL;
3245 pfxlen = 0;
3246 need_affix = false;
3247 if (afflist != NULL) {
3248 // Extract flags from the affix list.
3249 flags |= get_affix_flags(affile, afflist);
3250
3251 if (affile->af_needaffix != 0
3252 && flag_in_afflist(affile->af_flagtype, afflist,
3253 affile->af_needaffix)) {
3254 need_affix = true;
3255 }
3256
3257 if (affile->af_pfxpostpone) {
3258 // Need to store the list of prefix IDs with the word.
3259 pfxlen = get_pfxlist(affile, afflist, store_afflist);
3260 }
3261
3262 if (spin->si_compflags != NULL) {
3263 // Need to store the list of compound flags with the word.
3264 // Concatenate them to the list of prefix IDs.
3265 get_compflags(affile, afflist, store_afflist + pfxlen);
3266 }
3267 }
3268
3269 // Add the word to the word tree(s).
3270 if (store_word(spin, dw, flags, spin->si_region,
3271 store_afflist, need_affix) == FAIL) {
3272 retval = FAIL;
3273 }
3274
3275 if (afflist != NULL) {
3276 // Find all matching suffixes and add the resulting words.
3277 // Additionally do matching prefixes that combine.
3278 if (store_aff_word(spin, dw, afflist, affile,
3279 &affile->af_suff, &affile->af_pref,
3280 CONDIT_SUF, flags, store_afflist, pfxlen) == FAIL) {
3281 retval = FAIL;
3282 }
3283
3284 // Find all matching prefixes and add the resulting words.
3285 if (store_aff_word(spin, dw, afflist, affile,
3286 &affile->af_pref, NULL,
3287 CONDIT_SUF, flags, store_afflist, pfxlen) == FAIL) {
3288 retval = FAIL;
3289 }
3290 }
3291
3292 xfree(pc);
3293 }
3294
3295 if (duplicate > 0) {
3296 smsg(_("%d duplicate word(s) in %s"), duplicate, fname);
3297 }
3298 if (spin->si_ascii && non_ascii > 0) {
3299 smsg(_("Ignored %d word(s) with non-ASCII characters in %s"),
3300 non_ascii, fname);
3301 }
3302 hash_clear(&ht);
3303
3304 fclose(fd);
3305 return retval;
3306 }
3307
3308 // Check for affix flags in "afflist" that are turned into word flags.
3309 // Return WF_ flags.
get_affix_flags(afffile_T * affile,char_u * afflist)3310 static int get_affix_flags(afffile_T *affile, char_u *afflist)
3311 {
3312 int flags = 0;
3313
3314 if (affile->af_keepcase != 0
3315 && flag_in_afflist(affile->af_flagtype, afflist,
3316 affile->af_keepcase)) {
3317 flags |= WF_KEEPCAP | WF_FIXCAP;
3318 }
3319 if (affile->af_rare != 0
3320 && flag_in_afflist(affile->af_flagtype, afflist, affile->af_rare)) {
3321 flags |= WF_RARE;
3322 }
3323 if (affile->af_bad != 0
3324 && flag_in_afflist(affile->af_flagtype, afflist, affile->af_bad)) {
3325 flags |= WF_BANNED;
3326 }
3327 if (affile->af_needcomp != 0
3328 && flag_in_afflist(affile->af_flagtype, afflist,
3329 affile->af_needcomp)) {
3330 flags |= WF_NEEDCOMP;
3331 }
3332 if (affile->af_comproot != 0
3333 && flag_in_afflist(affile->af_flagtype, afflist,
3334 affile->af_comproot)) {
3335 flags |= WF_COMPROOT;
3336 }
3337 if (affile->af_nosuggest != 0
3338 && flag_in_afflist(affile->af_flagtype, afflist,
3339 affile->af_nosuggest)) {
3340 flags |= WF_NOSUGGEST;
3341 }
3342 return flags;
3343 }
3344
3345 // Get the list of prefix IDs from the affix list "afflist".
3346 // Used for PFXPOSTPONE.
3347 // Put the resulting flags in "store_afflist[MAXWLEN]" with a terminating NUL
3348 // and return the number of affixes.
get_pfxlist(afffile_T * affile,char_u * afflist,char_u * store_afflist)3349 static int get_pfxlist(afffile_T *affile, char_u *afflist, char_u *store_afflist)
3350 {
3351 char_u *p;
3352 char_u *prevp;
3353 int cnt = 0;
3354 int id;
3355 char_u key[AH_KEY_LEN];
3356 hashitem_T *hi;
3357
3358 for (p = afflist; *p != NUL;) {
3359 prevp = p;
3360 if (get_affitem(affile->af_flagtype, &p) != 0) {
3361 // A flag is a postponed prefix flag if it appears in "af_pref"
3362 // and its ID is not zero.
3363 STRLCPY(key, prevp, p - prevp + 1);
3364 hi = hash_find(&affile->af_pref, key);
3365 if (!HASHITEM_EMPTY(hi)) {
3366 id = HI2AH(hi)->ah_newID;
3367 if (id != 0) {
3368 store_afflist[cnt++] = id;
3369 }
3370 }
3371 }
3372 if (affile->af_flagtype == AFT_NUM && *p == ',') {
3373 ++p;
3374 }
3375 }
3376
3377 store_afflist[cnt] = NUL;
3378 return cnt;
3379 }
3380
3381 // Get the list of compound IDs from the affix list "afflist" that are used
3382 // for compound words.
3383 // Puts the flags in "store_afflist[]".
get_compflags(afffile_T * affile,char_u * afflist,char_u * store_afflist)3384 static void get_compflags(afffile_T *affile, char_u *afflist, char_u *store_afflist)
3385 {
3386 char_u *p;
3387 char_u *prevp;
3388 int cnt = 0;
3389 char_u key[AH_KEY_LEN];
3390 hashitem_T *hi;
3391
3392 for (p = afflist; *p != NUL;) {
3393 prevp = p;
3394 if (get_affitem(affile->af_flagtype, &p) != 0) {
3395 // A flag is a compound flag if it appears in "af_comp".
3396 STRLCPY(key, prevp, p - prevp + 1);
3397 hi = hash_find(&affile->af_comp, key);
3398 if (!HASHITEM_EMPTY(hi)) {
3399 store_afflist[cnt++] = HI2CI(hi)->ci_newID;
3400 }
3401 }
3402 if (affile->af_flagtype == AFT_NUM && *p == ',') {
3403 ++p;
3404 }
3405 }
3406
3407 store_afflist[cnt] = NUL;
3408 }
3409
3410 /// Apply affixes to a word and store the resulting words.
3411 /// "ht" is the hashtable with affentry_T that need to be applied, either
3412 /// prefixes or suffixes.
3413 /// "xht", when not NULL, is the prefix hashtable, to be used additionally on
3414 /// the resulting words for combining affixes.
3415 ///
3416 /// @param spin spell info
3417 /// @param word basic word start
3418 /// @param afflist list of names of supported affixes
3419 /// @param condit CONDIT_SUF et al.
3420 /// @param flags flags for the word
3421 /// @param pfxlist list of prefix IDs
3422 /// @param pfxlen nr of flags in "pfxlist" for prefixes, rest is compound flags
3423 ///
3424 /// @return FAIL when out of memory.
store_aff_word(spellinfo_T * spin,char_u * word,char_u * afflist,afffile_T * affile,hashtab_T * ht,hashtab_T * xht,int condit,int flags,char_u * pfxlist,int pfxlen)3425 static int store_aff_word(spellinfo_T *spin, char_u *word, char_u *afflist, afffile_T *affile,
3426 hashtab_T *ht, hashtab_T *xht, int condit, int flags, char_u *pfxlist,
3427 int pfxlen)
3428 {
3429 int todo;
3430 hashitem_T *hi;
3431 affheader_T *ah;
3432 affentry_T *ae;
3433 char_u newword[MAXWLEN];
3434 int retval = OK;
3435 int i, j;
3436 char_u *p;
3437 int use_flags;
3438 char_u *use_pfxlist;
3439 int use_pfxlen;
3440 bool need_affix;
3441 char_u store_afflist[MAXWLEN];
3442 char_u pfx_pfxlist[MAXWLEN];
3443 size_t wordlen = STRLEN(word);
3444 int use_condit;
3445
3446 todo = (int)ht->ht_used;
3447 for (hi = ht->ht_array; todo > 0 && retval == OK; ++hi) {
3448 if (!HASHITEM_EMPTY(hi)) {
3449 --todo;
3450 ah = HI2AH(hi);
3451
3452 // Check that the affix combines, if required, and that the word
3453 // supports this affix.
3454 if (((condit & CONDIT_COMB) == 0 || ah->ah_combine)
3455 && flag_in_afflist(affile->af_flagtype, afflist,
3456 ah->ah_flag)) {
3457 // Loop over all affix entries with this name.
3458 for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next) {
3459 // Check the condition. It's not logical to match case
3460 // here, but it is required for compatibility with
3461 // Myspell.
3462 // Another requirement from Myspell is that the chop
3463 // string is shorter than the word itself.
3464 // For prefixes, when "PFXPOSTPONE" was used, only do
3465 // prefixes with a chop string and/or flags.
3466 // When a previously added affix had CIRCUMFIX this one
3467 // must have it too, if it had not then this one must not
3468 // have one either.
3469 if ((xht != NULL || !affile->af_pfxpostpone
3470 || ae->ae_chop != NULL
3471 || ae->ae_flags != NULL)
3472 && (ae->ae_chop == NULL
3473 || STRLEN(ae->ae_chop) < wordlen)
3474 && (ae->ae_prog == NULL
3475 || vim_regexec_prog(&ae->ae_prog, false, word, (colnr_T)0))
3476 && (((condit & CONDIT_CFIX) == 0)
3477 == ((condit & CONDIT_AFF) == 0
3478 || ae->ae_flags == NULL
3479 || !flag_in_afflist(affile->af_flagtype,
3480 ae->ae_flags, affile->af_circumfix)))) {
3481 // Match. Remove the chop and add the affix.
3482 if (xht == NULL) {
3483 // prefix: chop/add at the start of the word
3484 if (ae->ae_add == NULL) {
3485 *newword = NUL;
3486 } else {
3487 STRLCPY(newword, ae->ae_add, MAXWLEN);
3488 }
3489 p = word;
3490 if (ae->ae_chop != NULL) {
3491 // Skip chop string.
3492 i = mb_charlen(ae->ae_chop);
3493 for (; i > 0; i--) {
3494 MB_PTR_ADV(p);
3495 }
3496 }
3497 STRCAT(newword, p);
3498 } else {
3499 // suffix: chop/add at the end of the word
3500 STRLCPY(newword, word, MAXWLEN);
3501 if (ae->ae_chop != NULL) {
3502 // Remove chop string.
3503 p = newword + STRLEN(newword);
3504 i = mb_charlen(ae->ae_chop);
3505 for (; i > 0; i--) {
3506 MB_PTR_BACK(newword, p);
3507 }
3508 *p = NUL;
3509 }
3510 if (ae->ae_add != NULL) {
3511 STRCAT(newword, ae->ae_add);
3512 }
3513 }
3514
3515 use_flags = flags;
3516 use_pfxlist = pfxlist;
3517 use_pfxlen = pfxlen;
3518 need_affix = false;
3519 use_condit = condit | CONDIT_COMB | CONDIT_AFF;
3520 if (ae->ae_flags != NULL) {
3521 // Extract flags from the affix list.
3522 use_flags |= get_affix_flags(affile, ae->ae_flags);
3523
3524 if (affile->af_needaffix != 0 && flag_in_afflist(affile->af_flagtype, ae->ae_flags,
3525 affile->af_needaffix)) {
3526 need_affix = true;
3527 }
3528
3529 // When there is a CIRCUMFIX flag the other affix
3530 // must also have it and we don't add the word
3531 // with one affix.
3532 if (affile->af_circumfix != 0 && flag_in_afflist(affile->af_flagtype, ae->ae_flags,
3533 affile->af_circumfix)) {
3534 use_condit |= CONDIT_CFIX;
3535 if ((condit & CONDIT_CFIX) == 0) {
3536 need_affix = true;
3537 }
3538 }
3539
3540 if (affile->af_pfxpostpone
3541 || spin->si_compflags != NULL) {
3542 if (affile->af_pfxpostpone) {
3543 // Get prefix IDS from the affix list.
3544 use_pfxlen = get_pfxlist(affile,
3545 ae->ae_flags, store_afflist);
3546 } else {
3547 use_pfxlen = 0;
3548 }
3549 use_pfxlist = store_afflist;
3550
3551 // Combine the prefix IDs. Avoid adding the
3552 // same ID twice.
3553 for (i = 0; i < pfxlen; ++i) {
3554 for (j = 0; j < use_pfxlen; ++j) {
3555 if (pfxlist[i] == use_pfxlist[j]) {
3556 break;
3557 }
3558 }
3559 if (j == use_pfxlen) {
3560 use_pfxlist[use_pfxlen++] = pfxlist[i];
3561 }
3562 }
3563
3564 if (spin->si_compflags != NULL) {
3565 // Get compound IDS from the affix list.
3566 get_compflags(affile, ae->ae_flags,
3567 use_pfxlist + use_pfxlen);
3568 } else {
3569 use_pfxlist[use_pfxlen] = NUL;
3570 }
3571
3572 // Combine the list of compound flags.
3573 // Concatenate them to the prefix IDs list.
3574 // Avoid adding the same ID twice.
3575 for (i = pfxlen; pfxlist[i] != NUL; ++i) {
3576 for (j = use_pfxlen;
3577 use_pfxlist[j] != NUL; ++j) {
3578 if (pfxlist[i] == use_pfxlist[j]) {
3579 break;
3580 }
3581 }
3582 if (use_pfxlist[j] == NUL) {
3583 use_pfxlist[j++] = pfxlist[i];
3584 use_pfxlist[j] = NUL;
3585 }
3586 }
3587 }
3588 }
3589
3590 // Obey a "COMPOUNDFORBIDFLAG" of the affix: don't
3591 // use the compound flags.
3592 if (use_pfxlist != NULL && ae->ae_compforbid) {
3593 STRLCPY(pfx_pfxlist, use_pfxlist, use_pfxlen + 1);
3594 use_pfxlist = pfx_pfxlist;
3595 }
3596
3597 // When there are postponed prefixes...
3598 if (spin->si_prefroot != NULL
3599 && spin->si_prefroot->wn_sibling != NULL) {
3600 // ... add a flag to indicate an affix was used.
3601 use_flags |= WF_HAS_AFF;
3602
3603 // ... don't use a prefix list if combining
3604 // affixes is not allowed. But do use the
3605 // compound flags after them.
3606 if (!ah->ah_combine && use_pfxlist != NULL) {
3607 use_pfxlist += use_pfxlen;
3608 }
3609 }
3610
3611 // When compounding is supported and there is no
3612 // "COMPOUNDPERMITFLAG" then forbid compounding on the
3613 // side where the affix is applied.
3614 if (spin->si_compflags != NULL && !ae->ae_comppermit) {
3615 if (xht != NULL) {
3616 use_flags |= WF_NOCOMPAFT;
3617 } else {
3618 use_flags |= WF_NOCOMPBEF;
3619 }
3620 }
3621
3622 // Store the modified word.
3623 if (store_word(spin, newword, use_flags,
3624 spin->si_region, use_pfxlist,
3625 need_affix) == FAIL) {
3626 retval = FAIL;
3627 }
3628
3629 // When added a prefix or a first suffix and the affix
3630 // has flags may add a(nother) suffix. RECURSIVE!
3631 if ((condit & CONDIT_SUF) && ae->ae_flags != NULL) {
3632 if (store_aff_word(spin, newword, ae->ae_flags,
3633 affile, &affile->af_suff, xht,
3634 use_condit & (xht == NULL
3635 ? ~0 : ~CONDIT_SUF),
3636 use_flags, use_pfxlist, pfxlen) == FAIL) {
3637 retval = FAIL;
3638 }
3639 }
3640
3641 // When added a suffix and combining is allowed also
3642 // try adding a prefix additionally. Both for the
3643 // word flags and for the affix flags. RECURSIVE!
3644 if (xht != NULL && ah->ah_combine) {
3645 if (store_aff_word(spin, newword,
3646 afflist, affile,
3647 xht, NULL, use_condit,
3648 use_flags, use_pfxlist,
3649 pfxlen) == FAIL
3650 || (ae->ae_flags != NULL
3651 && store_aff_word(spin, newword,
3652 ae->ae_flags, affile,
3653 xht, NULL, use_condit,
3654 use_flags, use_pfxlist,
3655 pfxlen) == FAIL)) {
3656 retval = FAIL;
3657 }
3658 }
3659 }
3660 }
3661 }
3662 }
3663 }
3664
3665 return retval;
3666 }
3667
3668 // Read a file with a list of words.
spell_read_wordfile(spellinfo_T * spin,char_u * fname)3669 static int spell_read_wordfile(spellinfo_T *spin, char_u *fname)
3670 {
3671 FILE *fd;
3672 long lnum = 0;
3673 char_u rline[MAXLINELEN];
3674 char_u *line;
3675 char_u *pc = NULL;
3676 char_u *p;
3677 int l;
3678 int retval = OK;
3679 bool did_word = false;
3680 int non_ascii = 0;
3681 int flags;
3682 int regionmask;
3683
3684 // Open the file.
3685 fd = os_fopen((char *)fname, "r");
3686 if (fd == NULL) {
3687 semsg(_(e_notopen), fname);
3688 return FAIL;
3689 }
3690
3691 vim_snprintf((char *)IObuff, IOSIZE, _("Reading word file %s..."), fname);
3692 spell_message(spin, IObuff);
3693
3694 // Read all the lines in the file one by one.
3695 while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int) {
3696 line_breakcheck();
3697 ++lnum;
3698
3699 // Skip comment lines.
3700 if (*rline == '#') {
3701 continue;
3702 }
3703
3704 // Remove CR, LF and white space from the end.
3705 l = (int)STRLEN(rline);
3706 while (l > 0 && rline[l - 1] <= ' ') {
3707 --l;
3708 }
3709 if (l == 0) {
3710 continue; // empty or blank line
3711 }
3712 rline[l] = NUL;
3713
3714 // Convert from "/encoding={encoding}" to 'encoding' when needed.
3715 xfree(pc);
3716 if (spin->si_conv.vc_type != CONV_NONE) {
3717 pc = string_convert(&spin->si_conv, rline, NULL);
3718 if (pc == NULL) {
3719 smsg(_("Conversion failure for word in %s line %ld: %s"),
3720 fname, lnum, rline);
3721 continue;
3722 }
3723 line = pc;
3724 } else {
3725 pc = NULL;
3726 line = rline;
3727 }
3728
3729 if (*line == '/') {
3730 ++line;
3731 if (STRNCMP(line, "encoding=", 9) == 0) {
3732 if (spin->si_conv.vc_type != CONV_NONE) {
3733 smsg(_("Duplicate /encoding= line ignored in %s line %ld: %s"),
3734 fname, lnum, line - 1);
3735 } else if (did_word) {
3736 smsg(_("/encoding= line after word ignored in %s line %ld: %s"),
3737 fname, lnum, line - 1);
3738 } else {
3739 char_u *enc;
3740
3741 // Setup for conversion to 'encoding'.
3742 line += 9;
3743 enc = enc_canonize(line);
3744 if (!spin->si_ascii
3745 && convert_setup(&spin->si_conv, enc,
3746 p_enc) == FAIL) {
3747 smsg(_("Conversion in %s not supported: from %s to %s"),
3748 fname, line, p_enc);
3749 }
3750 xfree(enc);
3751 spin->si_conv.vc_fail = true;
3752 }
3753 continue;
3754 }
3755
3756 if (STRNCMP(line, "regions=", 8) == 0) {
3757 if (spin->si_region_count > 1) {
3758 smsg(_("Duplicate /regions= line ignored in %s line %ld: %s"),
3759 fname, lnum, line);
3760 } else {
3761 line += 8;
3762 if (STRLEN(line) > MAXREGIONS * 2) {
3763 smsg(_("Too many regions in %s line %ld: %s"),
3764 fname, lnum, line);
3765 } else {
3766 spin->si_region_count = (int)STRLEN(line) / 2;
3767 STRCPY(spin->si_region_name, line);
3768
3769 // Adjust the mask for a word valid in all regions.
3770 spin->si_region = (1 << spin->si_region_count) - 1;
3771 }
3772 }
3773 continue;
3774 }
3775
3776 smsg(_("/ line ignored in %s line %ld: %s"),
3777 fname, lnum, line - 1);
3778 continue;
3779 }
3780
3781 flags = 0;
3782 regionmask = spin->si_region;
3783
3784 // Check for flags and region after a slash.
3785 p = vim_strchr(line, '/');
3786 if (p != NULL) {
3787 *p++ = NUL;
3788 while (*p != NUL) {
3789 if (*p == '=') { // keep-case word
3790 flags |= WF_KEEPCAP | WF_FIXCAP;
3791 } else if (*p == '!') { // Bad, bad, wicked word.
3792 flags |= WF_BANNED;
3793 } else if (*p == '?') { // Rare word.
3794 flags |= WF_RARE;
3795 } else if (ascii_isdigit(*p)) { // region number(s)
3796 if ((flags & WF_REGION) == 0) { // first one
3797 regionmask = 0;
3798 }
3799 flags |= WF_REGION;
3800
3801 l = *p - '0';
3802 if (l == 0 || l > spin->si_region_count) {
3803 smsg(_("Invalid region nr in %s line %ld: %s"),
3804 fname, lnum, p);
3805 break;
3806 }
3807 regionmask |= 1 << (l - 1);
3808 } else {
3809 smsg(_("Unrecognized flags in %s line %ld: %s"),
3810 fname, lnum, p);
3811 break;
3812 }
3813 ++p;
3814 }
3815 }
3816
3817 // Skip non-ASCII words when "spin->si_ascii" is true.
3818 if (spin->si_ascii && has_non_ascii(line)) {
3819 ++non_ascii;
3820 continue;
3821 }
3822
3823 // Normal word: store it.
3824 if (store_word(spin, line, flags, regionmask, NULL, false) == FAIL) {
3825 retval = FAIL;
3826 break;
3827 }
3828 did_word = true;
3829 }
3830
3831 xfree(pc);
3832 fclose(fd);
3833
3834 if (spin->si_ascii && non_ascii > 0) {
3835 vim_snprintf((char *)IObuff, IOSIZE,
3836 _("Ignored %d words with non-ASCII characters"), non_ascii);
3837 spell_message(spin, IObuff);
3838 }
3839
3840 return retval;
3841 }
3842
3843 /// Get part of an sblock_T, "len" bytes long.
3844 /// This avoids calling free() for every little struct we use (and keeping
3845 /// track of them).
3846 /// The memory is cleared to all zeros.
3847 ///
3848 /// @param len Length needed (<= SBLOCKSIZE).
3849 /// @param align Align for pointer.
3850 /// @return Pointer into block data.
getroom(spellinfo_T * spin,size_t len,bool align)3851 static void *getroom(spellinfo_T *spin, size_t len, bool align)
3852 FUNC_ATTR_NONNULL_RET
3853 {
3854 char_u *p;
3855 sblock_T *bl = spin->si_blocks;
3856
3857 assert(len <= SBLOCKSIZE);
3858
3859 if (align && bl != NULL) {
3860 // Round size up for alignment. On some systems structures need to be
3861 // aligned to the size of a pointer (e.g., SPARC).
3862 bl->sb_used = (bl->sb_used + sizeof(char *) - 1)
3863 & ~(sizeof(char *) - 1);
3864 }
3865
3866 if (bl == NULL || bl->sb_used + len > SBLOCKSIZE) {
3867 // Allocate a block of memory. It is not freed until much later.
3868 bl = xcalloc(1, (sizeof(sblock_T) + SBLOCKSIZE));
3869 bl->sb_next = spin->si_blocks;
3870 spin->si_blocks = bl;
3871 bl->sb_used = 0;
3872 ++spin->si_blocks_cnt;
3873 }
3874
3875 p = bl->sb_data + bl->sb_used;
3876 bl->sb_used += (int)len;
3877
3878 return p;
3879 }
3880
3881 // Make a copy of a string into memory allocated with getroom().
3882 // Returns NULL when out of memory.
getroom_save(spellinfo_T * spin,char_u * s)3883 static char_u *getroom_save(spellinfo_T *spin, char_u *s)
3884 {
3885 const size_t s_size = STRLEN(s) + 1;
3886 return memcpy(getroom(spin, s_size, false), s, s_size);
3887 }
3888
3889
3890 // Free the list of allocated sblock_T.
free_blocks(sblock_T * bl)3891 static void free_blocks(sblock_T *bl)
3892 {
3893 sblock_T *next;
3894
3895 while (bl != NULL) {
3896 next = bl->sb_next;
3897 xfree(bl);
3898 bl = next;
3899 }
3900 }
3901
3902 // Allocate the root of a word tree.
3903 // Returns NULL when out of memory.
wordtree_alloc(spellinfo_T * spin)3904 static wordnode_T *wordtree_alloc(spellinfo_T *spin)
3905 FUNC_ATTR_NONNULL_RET
3906 {
3907 return (wordnode_T *)getroom(spin, sizeof(wordnode_T), true);
3908 }
3909
3910 /// Store a word in the tree(s).
3911 /// Always store it in the case-folded tree. For a keep-case word this is
3912 /// useful when the word can also be used with all caps (no WF_FIXCAP flag) and
3913 /// used to find suggestions.
3914 /// For a keep-case word also store it in the keep-case tree.
3915 /// When "pfxlist" is not NULL store the word for each postponed prefix ID and
3916 /// compound flag.
3917 ///
3918 /// @param flags extra flags, wf_banned
3919 /// @param region supported region(s)
3920 /// @param pfxlist list of prefix ids or null
3921 /// @param need_affix only store word with affix id
store_word(spellinfo_T * spin,char_u * word,int flags,int region,const char_u * pfxlist,bool need_affix)3922 static int store_word(spellinfo_T *spin, char_u *word, int flags, int region, const char_u *pfxlist,
3923 bool need_affix)
3924 {
3925 int len = (int)STRLEN(word);
3926 int ct = captype(word, word + len);
3927 char_u foldword[MAXWLEN];
3928 int res = OK;
3929
3930 (void)spell_casefold(curwin, word, len, foldword, MAXWLEN);
3931 for (const char_u *p = pfxlist; res == OK; p++) {
3932 if (!need_affix || (p != NULL && *p != NUL)) {
3933 res = tree_add_word(spin, foldword, spin->si_foldroot, ct | flags,
3934 region, p == NULL ? 0 : *p);
3935 }
3936 if (p == NULL || *p == NUL) {
3937 break;
3938 }
3939 }
3940 ++spin->si_foldwcount;
3941
3942 if (res == OK && (ct == WF_KEEPCAP || (flags & WF_KEEPCAP))) {
3943 for (const char_u *p = pfxlist; res == OK; p++) {
3944 if (!need_affix || (p != NULL && *p != NUL)) {
3945 res = tree_add_word(spin, word, spin->si_keeproot, flags,
3946 region, p == NULL ? 0 : *p);
3947 }
3948 if (p == NULL || *p == NUL) {
3949 break;
3950 }
3951 }
3952 ++spin->si_keepwcount;
3953 }
3954 return res;
3955 }
3956
3957 // Add word "word" to a word tree at "root".
3958 // When "flags" < 0 we are adding to the prefix tree where "flags" is used for
3959 // "rare" and "region" is the condition nr.
3960 // Returns FAIL when out of memory.
tree_add_word(spellinfo_T * spin,char_u * word,wordnode_T * root,int flags,int region,int affixID)3961 static int tree_add_word(spellinfo_T *spin, char_u *word, wordnode_T *root, int flags, int region,
3962 int affixID)
3963 {
3964 wordnode_T *node = root;
3965 wordnode_T *np;
3966 wordnode_T *copyp, **copyprev;
3967 wordnode_T **prev = NULL;
3968 int i;
3969
3970 // Add each byte of the word to the tree, including the NUL at the end.
3971 for (i = 0;; ++i) {
3972 // When there is more than one reference to this node we need to make
3973 // a copy, so that we can modify it. Copy the whole list of siblings
3974 // (we don't optimize for a partly shared list of siblings).
3975 if (node != NULL && node->wn_refs > 1) {
3976 --node->wn_refs;
3977 copyprev = prev;
3978 for (copyp = node; copyp != NULL; copyp = copyp->wn_sibling) {
3979 // Allocate a new node and copy the info.
3980 np = get_wordnode(spin);
3981 if (np == NULL) {
3982 return FAIL;
3983 }
3984 np->wn_child = copyp->wn_child;
3985 if (np->wn_child != NULL) {
3986 ++np->wn_child->wn_refs; // child gets extra ref
3987 }
3988 np->wn_byte = copyp->wn_byte;
3989 if (np->wn_byte == NUL) {
3990 np->wn_flags = copyp->wn_flags;
3991 np->wn_region = copyp->wn_region;
3992 np->wn_affixID = copyp->wn_affixID;
3993 }
3994
3995 // Link the new node in the list, there will be one ref.
3996 np->wn_refs = 1;
3997 if (copyprev != NULL) {
3998 *copyprev = np;
3999 }
4000 copyprev = &np->wn_sibling;
4001
4002 // Let "node" point to the head of the copied list.
4003 if (copyp == node) {
4004 node = np;
4005 }
4006 }
4007 }
4008
4009 // Look for the sibling that has the same character. They are sorted
4010 // on byte value, thus stop searching when a sibling is found with a
4011 // higher byte value. For zero bytes (end of word) the sorting is
4012 // done on flags and then on affixID.
4013 while (node != NULL
4014 && (node->wn_byte < word[i]
4015 || (node->wn_byte == NUL
4016 && (flags < 0
4017 ? node->wn_affixID < (unsigned)affixID
4018 : (node->wn_flags < (unsigned)(flags & WN_MASK)
4019 || (node->wn_flags == (flags & WN_MASK)
4020 && (spin->si_sugtree
4021 ? (node->wn_region & 0xffff) < region
4022 : node->wn_affixID
4023 < (unsigned)affixID))))))) {
4024 prev = &node->wn_sibling;
4025 node = *prev;
4026 }
4027 if (node == NULL
4028 || node->wn_byte != word[i]
4029 || (word[i] == NUL
4030 && (flags < 0
4031 || spin->si_sugtree
4032 || node->wn_flags != (flags & WN_MASK)
4033 || node->wn_affixID != affixID))) {
4034 // Allocate a new node.
4035 np = get_wordnode(spin);
4036 if (np == NULL) {
4037 return FAIL;
4038 }
4039 np->wn_byte = word[i];
4040
4041 // If "node" is NULL this is a new child or the end of the sibling
4042 // list: ref count is one. Otherwise use ref count of sibling and
4043 // make ref count of sibling one (matters when inserting in front
4044 // of the list of siblings).
4045 if (node == NULL) {
4046 np->wn_refs = 1;
4047 } else {
4048 np->wn_refs = node->wn_refs;
4049 node->wn_refs = 1;
4050 }
4051 if (prev != NULL) {
4052 *prev = np;
4053 }
4054 np->wn_sibling = node;
4055 node = np;
4056 }
4057
4058 if (word[i] == NUL) {
4059 node->wn_flags = flags;
4060 node->wn_region |= region;
4061 node->wn_affixID = affixID;
4062 break;
4063 }
4064 prev = &node->wn_child;
4065 node = *prev;
4066 }
4067 #ifdef SPELL_PRINTTREE
4068 smsg((char_u *)"Added \"%s\"", word);
4069 spell_print_tree(root->wn_sibling);
4070 #endif
4071
4072 // count nr of words added since last message
4073 ++spin->si_msg_count;
4074
4075 if (spin->si_compress_cnt > 1) {
4076 if (--spin->si_compress_cnt == 1) {
4077 // Did enough words to lower the block count limit.
4078 spin->si_blocks_cnt += compress_inc;
4079 }
4080 }
4081
4082 // When we have allocated lots of memory we need to compress the word tree
4083 // to free up some room. But compression is slow, and we might actually
4084 // need that room, thus only compress in the following situations:
4085 // 1. When not compressed before (si_compress_cnt == 0): when using
4086 // "compress_start" blocks.
4087 // 2. When compressed before and used "compress_inc" blocks before
4088 // adding "compress_added" words (si_compress_cnt > 1).
4089 // 3. When compressed before, added "compress_added" words
4090 // (si_compress_cnt == 1) and the number of free nodes drops below the
4091 // maximum word length.
4092 #ifndef SPELL_COMPRESS_ALLWAYS
4093 if (spin->si_compress_cnt == 1 // NOLINT(readability/braces)
4094 ? spin->si_free_count < MAXWLEN
4095 : spin->si_blocks_cnt >= compress_start)
4096 #endif
4097 {
4098 // Decrement the block counter. The effect is that we compress again
4099 // when the freed up room has been used and another "compress_inc"
4100 // blocks have been allocated. Unless "compress_added" words have
4101 // been added, then the limit is put back again.
4102 spin->si_blocks_cnt -= compress_inc;
4103 spin->si_compress_cnt = compress_added;
4104
4105 if (spin->si_verbose) {
4106 msg_start();
4107 msg_puts(_(msg_compressing));
4108 msg_clr_eos();
4109 msg_didout = false;
4110 msg_col = 0;
4111 ui_flush();
4112 }
4113
4114 // Compress both trees. Either they both have many nodes, which makes
4115 // compression useful, or one of them is small, which means
4116 // compression goes fast. But when filling the soundfold word tree
4117 // there is no keep-case tree.
4118 wordtree_compress(spin, spin->si_foldroot, "case-folded");
4119 if (affixID >= 0) {
4120 wordtree_compress(spin, spin->si_keeproot, "keep-case");
4121 }
4122 }
4123
4124 return OK;
4125 }
4126
4127 // Get a wordnode_T, either from the list of previously freed nodes or
4128 // allocate a new one.
4129 // Returns NULL when out of memory.
get_wordnode(spellinfo_T * spin)4130 static wordnode_T *get_wordnode(spellinfo_T *spin)
4131 {
4132 wordnode_T *n;
4133
4134 if (spin->si_first_free == NULL) {
4135 n = (wordnode_T *)getroom(spin, sizeof(wordnode_T), true);
4136 } else {
4137 n = spin->si_first_free;
4138 spin->si_first_free = n->wn_child;
4139 memset(n, 0, sizeof(wordnode_T));
4140 --spin->si_free_count;
4141 }
4142 #ifdef SPELL_PRINTTREE
4143 if (n != NULL) {
4144 n->wn_nr = ++spin->si_wordnode_nr;
4145 }
4146 #endif
4147 return n;
4148 }
4149
4150 // Decrement the reference count on a node (which is the head of a list of
4151 // siblings). If the reference count becomes zero free the node and its
4152 // siblings.
4153 // Returns the number of nodes actually freed.
deref_wordnode(spellinfo_T * spin,wordnode_T * node)4154 static int deref_wordnode(spellinfo_T *spin, wordnode_T *node)
4155 FUNC_ATTR_NONNULL_ALL
4156 {
4157 wordnode_T *np;
4158 int cnt = 0;
4159
4160 if (--node->wn_refs == 0) {
4161 for (np = node; np != NULL; np = np->wn_sibling) {
4162 if (np->wn_child != NULL) {
4163 cnt += deref_wordnode(spin, np->wn_child);
4164 }
4165 free_wordnode(spin, np);
4166 ++cnt;
4167 }
4168 ++cnt; // length field
4169 }
4170 return cnt;
4171 }
4172
4173 // Free a wordnode_T for re-use later.
4174 // Only the "wn_child" field becomes invalid.
free_wordnode(spellinfo_T * spin,wordnode_T * n)4175 static void free_wordnode(spellinfo_T *spin, wordnode_T *n)
4176 FUNC_ATTR_NONNULL_ALL
4177 {
4178 n->wn_child = spin->si_first_free;
4179 spin->si_first_free = n;
4180 ++spin->si_free_count;
4181 }
4182
4183 // Compress a tree: find tails that are identical and can be shared.
wordtree_compress(spellinfo_T * spin,wordnode_T * root,const char * name)4184 static void wordtree_compress(spellinfo_T *spin, wordnode_T *root, const char *name)
4185 FUNC_ATTR_NONNULL_ALL
4186 {
4187 hashtab_T ht;
4188 long tot = 0;
4189 long perc;
4190
4191 // Skip the root itself, it's not actually used. The first sibling is the
4192 // start of the tree.
4193 if (root->wn_sibling != NULL) {
4194 hash_init(&ht);
4195 const long n = node_compress(spin, root->wn_sibling, &ht, &tot);
4196
4197 #ifndef SPELL_PRINTTREE
4198 if (spin->si_verbose || p_verbose > 2)
4199 #endif
4200 {
4201 if (tot > 1000000) {
4202 perc = (tot - n) / (tot / 100);
4203 } else if (tot == 0) {
4204 perc = 0;
4205 } else {
4206 perc = (tot - n) * 100 / tot;
4207 }
4208 vim_snprintf((char *)IObuff, IOSIZE,
4209 _("Compressed %s of %ld nodes; %ld (%ld%%) remaining"),
4210 name, tot, tot - n, perc);
4211 spell_message(spin, IObuff);
4212 }
4213 #ifdef SPELL_PRINTTREE
4214 spell_print_tree(root->wn_sibling);
4215 #endif
4216 hash_clear(&ht);
4217 }
4218 }
4219
/// Compress a node, its siblings and its children, depth first.
///
/// For every sibling that has a child, the child list is compressed
/// recursively and then looked up in "ht" by its hash key.  When an identical
/// list was encountered before, the sibling is made to point at that earlier
/// list and the duplicate is released with deref_wordnode().  Otherwise the
/// child list is remembered in "ht".  At the end a hash key for "node" and
/// its siblings is stored in "node", for use by the caller.
///
/// The sibling loop stops early when "got_int" is set.
///
/// @param spin  spell info, passed on to deref_wordnode()
/// @param node  first node of the list of siblings to compress
/// @param ht    hashtable with the child lists encountered so far
/// @param tot   total count of nodes before compressing, incremented while going through the tree
///
/// @return the number of compressed nodes.
static long node_compress(spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, long *tot)
  FUNC_ATTR_NONNULL_ALL
{
  wordnode_T *np;
  wordnode_T *tp;
  wordnode_T *child;
  hash_T hash;
  hashitem_T *hi;
  long len = 0;
  unsigned nr, n;
  long compressed = 0;

  // Go through the list of siblings.  Compress each child and then try
  // finding an identical child to replace it.
  // Note that with "child" we mean not just the node that is pointed to,
  // but the whole list of siblings of which the child node is the first.
  for (np = node; np != NULL && !got_int; np = np->wn_sibling) {
    ++len;
    if ((child = np->wn_child) != NULL) {
      // Compress the child first.  This fills hashkey.
      compressed += node_compress(spin, child, ht, tot);

      // Try to find an identical child.
      hash = hash_hash(child->wn_u1.hashkey);
      hi = hash_lookup(ht, (const char *)child->wn_u1.hashkey,
                       STRLEN(child->wn_u1.hashkey), hash);
      if (!HASHITEM_EMPTY(hi)) {
        // There are children we encountered before with a hash value
        // identical to the current child.  Now check if there is one
        // that is really identical.
        for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_u2.next) {
          if (node_equal(child, tp)) {
            // Found one!  Now use that child in place of the
            // current one.  This means the current child and all
            // its siblings is unlinked from the tree.
            ++tp->wn_refs;
            compressed += deref_wordnode(spin, child);
            np->wn_child = tp;
            break;
          }
        }
        if (tp == NULL) {
          // No other child with this hash value equals the child of
          // the node, add it to the linked list after the first
          // item.
          tp = HI2WN(hi);
          child->wn_u2.next = tp->wn_u2.next;
          tp->wn_u2.next = child;
        }
      } else {
        // No other child has this hash value, add it to the
        // hashtable.
        hash_add_item(ht, hi, child->wn_u1.hashkey, hash);
      }
    }
  }
  *tot += len + 1;      // add one for the node that stores the length

  // Make a hash key for the node and its siblings, so that we can quickly
  // find a lookalike node.  This must be done after compressing the sibling
  // list, otherwise the hash key would become invalid by the compression.
  // The key is: the sibling count, four bytes derived from "nr", and a
  // terminating NUL.
  // NOTE(review): the sibling count is stored in a single key byte without
  // the NUL avoidance applied to the other bytes below; a count whose low
  // byte is zero would give an empty key.  Equality is still decided by
  // node_equal(), so this looks like a hashing-quality matter only - confirm.
  node->wn_u1.hashkey[0] = len;
  nr = 0;
  for (np = node; np != NULL; np = np->wn_sibling) {
    if (np->wn_byte == NUL) {
      // end node: use wn_flags, wn_region and wn_affixID
      n = np->wn_flags + (np->wn_region << 8) + (np->wn_affixID << 16);
    } else {
      // byte node: use the byte value and the child pointer
      n = (unsigned)(np->wn_byte + ((uintptr_t)np->wn_child << 8));
    }
    nr = nr * 101 + n;
  }

  // Avoid NUL bytes, it terminates the hash key.
  n = nr & 0xff;
  node->wn_u1.hashkey[1] = n == 0 ? 1 : n;
  n = (nr >> 8) & 0xff;
  node->wn_u1.hashkey[2] = n == 0 ? 1 : n;
  n = (nr >> 16) & 0xff;
  node->wn_u1.hashkey[3] = n == 0 ? 1 : n;
  n = (nr >> 24) & 0xff;
  node->wn_u1.hashkey[4] = n == 0 ? 1 : n;
  node->wn_u1.hashkey[5] = NUL;

  // Check for CTRL-C pressed now and then.
  veryfast_breakcheck();

  return compressed;
}
4314
4315 // Returns true when two nodes have identical siblings and children.
node_equal(wordnode_T * n1,wordnode_T * n2)4316 static bool node_equal(wordnode_T *n1, wordnode_T *n2)
4317 {
4318 wordnode_T *p1;
4319 wordnode_T *p2;
4320
4321 for (p1 = n1, p2 = n2; p1 != NULL && p2 != NULL;
4322 p1 = p1->wn_sibling, p2 = p2->wn_sibling) {
4323 if (p1->wn_byte != p2->wn_byte
4324 || (p1->wn_byte == NUL
4325 ? (p1->wn_flags != p2->wn_flags
4326 || p1->wn_region != p2->wn_region
4327 || p1->wn_affixID != p2->wn_affixID)
4328 : (p1->wn_child != p2->wn_child))) {
4329 break;
4330 }
4331 }
4332
4333 return p1 == NULL && p2 == NULL;
4334 }
4335
4336
4337 // Function given to qsort() to sort the REP items on "from" string.
rep_compare(const void * s1,const void * s2)4338 static int rep_compare(const void *s1, const void *s2)
4339 {
4340 fromto_T *p1 = (fromto_T *)s1;
4341 fromto_T *p2 = (fromto_T *)s2;
4342
4343 return STRCMP(p1->ft_from, p2->ft_from);
4344 }
4345
// Write the Vim .spl file "fname", using the information collected in "spin".
//
// The file is written in this order: <HEADER>, the optional <SECTIONS>
// terminated by SN_END, then the three word trees (case-folded, keep-case and
// prefix tree).  Also sets "spin->si_memtot" and, when the SN_SUGFILE section
// is written, "spin->si_sugtime".
//
// "fwv" collects the result of the fwrite() calls: every call writes one
// item, thus returns 1 on success, and the results are and-ed together.
// NOTE(review): fwrite() returns zero when the item size is zero.  Only the
// REP/SAL loop guards against an empty string with "l > 0"; the other calls
// rely on their strings not being empty - confirm.
// NOTE(review): the results of most putc() calls are not checked, only
// the extra byte written at the very end is.
//
// Return OK/FAIL.
static int write_vim_spell(spellinfo_T *spin, char_u *fname)
{
  int retval = OK;
  int regionmask;

  FILE *fd = os_fopen((char *)fname, "w");
  if (fd == NULL) {
    semsg(_(e_notopen), fname);
    return FAIL;
  }

  // <HEADER>: <fileID> <versionnr>
  // <fileID>
  size_t fwv = fwrite(VIMSPELLMAGIC, VIMSPELLMAGICL, 1, fd);
  if (fwv != (size_t)1) {
    // Catch first write error, don't try writing more.
    goto theend;
  }

  putc(VIMSPELLVERSION, fd);                                // <versionnr>

  // <SECTIONS>: <section> ... <sectionend>

  // SN_INFO: <infotext>
  if (spin->si_info != NULL) {
    putc(SN_INFO, fd);                                      // <sectionID>
    putc(0, fd);                                            // <sectionflags>
    size_t i = STRLEN(spin->si_info);
    put_bytes(fd, i, 4);                                    // <sectionlen>
    fwv &= fwrite(spin->si_info, i, 1, fd);                 // <infotext>
  }

  // SN_REGION: <regionname> ...
  // Write the region names only if there is more than one.
  if (spin->si_region_count > 1) {
    putc(SN_REGION, fd);                                    // <sectionID>
    putc(SNF_REQUIRED, fd);                                 // <sectionflags>
    size_t l = (size_t)spin->si_region_count * 2;
    put_bytes(fd, l, 4);                                    // <sectionlen>
    fwv &= fwrite(spin->si_region_name, l, 1, fd);
    // <regionname> ...
    // One bit for every region; used when writing the word trees below.
    regionmask = (1 << spin->si_region_count) - 1;
  } else {
    regionmask = 0;
  }

  // SN_CHARFLAGS: <charflagslen> <charflags> <folcharslen> <folchars>
  //
  // The table with character flags and the table for case folding.
  // This makes sure the same characters are recognized as word characters
  // when generating and when using a spell file.
  // Skip this for ASCII, the table may conflict with the one used for
  // 'encoding'.
  // Also skip this for an .add.spl file, the main spell file must contain
  // the table (avoids that it conflicts).  File is shorter too.
  if (!spin->si_ascii && !spin->si_add) {
    char_u folchars[128 * 8];
    int flags;

    putc(SN_CHARFLAGS, fd);                                 // <sectionID>
    putc(SNF_REQUIRED, fd);                                 // <sectionflags>

    // Form the <folchars> string first, we need to know its length.
    size_t l = 0;
    for (size_t i = 128; i < 256; i++) {
      l += (size_t)utf_char2bytes(spelltab.st_fold[i], folchars + l);
    }
    // 1 for <charflagslen>, 128 for <charflags>, 2 for <folcharslen>
    put_bytes(fd, 1 + 128 + 2 + l, 4);                      // <sectionlen>

    fputc(128, fd);                                         // <charflagslen>
    for (size_t i = 128; i < 256; ++i) {
      flags = 0;
      if (spelltab.st_isw[i]) {
        flags |= CF_WORD;
      }
      if (spelltab.st_isu[i]) {
        flags |= CF_UPPER;
      }
      fputc(flags, fd);                                     // <charflags>
    }

    put_bytes(fd, l, 2);                                    // <folcharslen>
    fwv &= fwrite(folchars, l, 1, fd);                      // <folchars>
  }

  // SN_MIDWORD: <midword>
  if (spin->si_midword != NULL) {
    putc(SN_MIDWORD, fd);                                   // <sectionID>
    putc(SNF_REQUIRED, fd);                                 // <sectionflags>

    size_t i = STRLEN(spin->si_midword);
    put_bytes(fd, i, 4);                                    // <sectionlen>
    fwv &= fwrite(spin->si_midword, i, 1, fd);
    // <midword>
  }

  // SN_PREFCOND: <prefcondcnt> <prefcond> ...
  if (!GA_EMPTY(&spin->si_prefcond)) {
    putc(SN_PREFCOND, fd);                                  // <sectionID>
    putc(SNF_REQUIRED, fd);                                 // <sectionflags>

    // Called with a NULL file first, the result is used as the length.
    size_t l = (size_t)write_spell_prefcond(NULL, &spin->si_prefcond);
    put_bytes(fd, l, 4);                                    // <sectionlen>

    write_spell_prefcond(fd, &spin->si_prefcond);
  }

  // SN_REP: <repcount> <rep> ...
  // SN_SAL: <salflags> <salcount> <sal> ...
  // SN_REPSAL: <repcount> <rep> ...

  // round 1: SN_REP section
  // round 2: SN_SAL section (unless SN_SOFO is used)
  // round 3: SN_REPSAL section
  for (unsigned int round = 1; round <= 3; ++round) {
    garray_T *gap;
    if (round == 1) {
      gap = &spin->si_rep;
    } else if (round == 2) {
      // Don't write SN_SAL when using a SN_SOFO section
      if (spin->si_sofofr != NULL && spin->si_sofoto != NULL) {
        continue;
      }
      gap = &spin->si_sal;
    } else {
      gap = &spin->si_repsal;
    }

    // Don't write the section if there are no items.
    if (GA_EMPTY(gap)) {
      continue;
    }

    // Sort the REP/REPSAL items.
    if (round != 2) {
      qsort(gap->ga_data, (size_t)gap->ga_len,
            sizeof(fromto_T), rep_compare);
    }

    int sect_id = round == 1 ? SN_REP : (round == 2 ? SN_SAL : SN_REPSAL);
    putc(sect_id, fd);                                      // <sectionID>

    // This is for making suggestions, section is not required.
    putc(0, fd);                                            // <sectionflags>

    // Compute the length of what follows.
    size_t l = 2;           // count <repcount> or <salcount>
    assert(gap->ga_len >= 0);
    for (size_t i = 0; i < (size_t)gap->ga_len; ++i) {
      fromto_T *ftp = &((fromto_T *)gap->ga_data)[i];
      l += 1 + STRLEN(ftp->ft_from);  // count <*fromlen> and <*from>
      l += 1 + STRLEN(ftp->ft_to);    // count <*tolen> and <*to>
    }
    if (round == 2) {
      ++l;              // count <salflags>
    }
    put_bytes(fd, l, 4);                                    // <sectionlen>

    if (round == 2) {
      int i = 0;
      if (spin->si_followup) {
        i |= SAL_F0LLOWUP;
      }
      if (spin->si_collapse) {
        i |= SAL_COLLAPSE;
      }
      if (spin->si_rem_accents) {
        i |= SAL_REM_ACCENTS;
      }
      putc(i, fd);                              // <salflags>
    }

    put_bytes(fd, (uintmax_t)gap->ga_len, 2);   // <repcount> or <salcount>
    for (size_t i = 0; i < (size_t)gap->ga_len; ++i) {
      // <rep> : <repfromlen> <repfrom> <reptolen> <repto>
      // <sal> : <salfromlen> <salfrom> <saltolen> <salto>
      fromto_T *ftp = &((fromto_T *)gap->ga_data)[i];
      // rr 1 writes the "from" string, rr 2 the "to" string.
      for (unsigned int rr = 1; rr <= 2; ++rr) {
        char_u *p = rr == 1 ? ftp->ft_from : ftp->ft_to;
        l = STRLEN(p);
        assert(l < INT_MAX);
        putc((int)l, fd);
        // Skip fwrite() for an empty string, it would return zero.
        if (l > 0) {
          fwv &= fwrite(p, l, 1, fd);
        }
      }
    }
  }

  // SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
  // This is for making suggestions, section is not required.
  if (spin->si_sofofr != NULL && spin->si_sofoto != NULL) {
    putc(SN_SOFO, fd);                                      // <sectionID>
    putc(0, fd);                                            // <sectionflags>

    size_t l = STRLEN(spin->si_sofofr);
    // The 4 is for the two length fields of two bytes each.
    put_bytes(fd, l + STRLEN(spin->si_sofoto) + 4, 4);      // <sectionlen>

    put_bytes(fd, l, 2);                                    // <sofofromlen>
    fwv &= fwrite(spin->si_sofofr, l, 1, fd);               // <sofofrom>

    l = STRLEN(spin->si_sofoto);
    put_bytes(fd, l, 2);                                    // <sofotolen>
    fwv &= fwrite(spin->si_sofoto, l, 1, fd);               // <sofoto>
  }

  // SN_WORDS: <word> ...
  // This is for making suggestions, section is not required.
  if (spin->si_commonwords.ht_used > 0) {
    putc(SN_WORDS, fd);                                     // <sectionID>
    putc(0, fd);                                            // <sectionflags>

    // round 1: count the bytes
    // round 2: write the bytes
    for (unsigned int round = 1; round <= 2; ++round) {
      size_t todo;
      size_t len = 0;
      hashitem_T *hi;

      todo = spin->si_commonwords.ht_used;
      for (hi = spin->si_commonwords.ht_array; todo > 0; ++hi) {
        if (!HASHITEM_EMPTY(hi)) {
          // The length includes the terminating NUL, it is written too.
          size_t l = STRLEN(hi->hi_key) + 1;
          len += l;
          if (round == 2) {                                 // <word>
            fwv &= fwrite(hi->hi_key, l, 1, fd);
          }
          --todo;
        }
      }
      if (round == 1) {
        put_bytes(fd, len, 4);                              // <sectionlen>
      }
    }
  }

  // SN_MAP: <mapstr>
  // This is for making suggestions, section is not required.
  if (!GA_EMPTY(&spin->si_map)) {
    putc(SN_MAP, fd);                                       // <sectionID>
    putc(0, fd);                                            // <sectionflags>
    size_t l = (size_t)spin->si_map.ga_len;
    put_bytes(fd, l, 4);                                    // <sectionlen>
    fwv &= fwrite(spin->si_map.ga_data, l, 1, fd);          // <mapstr>
  }

  // SN_SUGFILE: <timestamp>
  // This is used to notify that a .sug file may be available and at the
  // same time allows for checking that a .sug file that is found matches
  // with this .spl file.  That's because the word numbers must be exactly
  // right.
  if (!spin->si_nosugfile
      && (!GA_EMPTY(&spin->si_sal)
          || (spin->si_sofofr != NULL && spin->si_sofoto != NULL))) {
    putc(SN_SUGFILE, fd);                                   // <sectionID>
    putc(0, fd);                                            // <sectionflags>
    put_bytes(fd, 8, 4);                                    // <sectionlen>

    // Set si_sugtime and write it to the file.
    spin->si_sugtime = time(NULL);
    put_time(fd, spin->si_sugtime);                         // <timestamp>
  }

  // SN_NOSPLITSUGS: nothing
  // This is used to notify that no suggestions with word splits are to be
  // made.
  if (spin->si_nosplitsugs) {
    putc(SN_NOSPLITSUGS, fd);                               // <sectionID>
    putc(0, fd);                                            // <sectionflags>
    put_bytes(fd, 0, 4);                                    // <sectionlen>
  }

  // SN_NOCOMPOUNDSUGS: nothing
  // This is used to notify that no suggestions with compounds are to be
  // made.
  if (spin->si_nocompoundsugs) {
    putc(SN_NOCOMPOUNDSUGS, fd);                            // <sectionID>
    putc(0, fd);                                            // <sectionflags>
    put_bytes(fd, 0, 4);                                    // <sectionlen>
  }

  // SN_COMPOUND: compound info.
  // We don't mark it required, when not supported all compound words will
  // be bad words.
  if (spin->si_compflags != NULL) {
    putc(SN_COMPOUND, fd);                                  // <sectionID>
    putc(0, fd);                                            // <sectionflags>

    size_t l = STRLEN(spin->si_compflags);
    assert(spin->si_comppat.ga_len >= 0);
    for (size_t i = 0; i < (size_t)spin->si_comppat.ga_len; ++i) {
      l += STRLEN(((char_u **)(spin->si_comppat.ga_data))[i]) + 1;
    }
    // The 7 is for the five single bytes and the two bytes of
    // <comppatcount> written below.
    put_bytes(fd, l + 7, 4);                                // <sectionlen>

    putc(spin->si_compmax, fd);                             // <compmax>
    putc(spin->si_compminlen, fd);                          // <compminlen>
    putc(spin->si_compsylmax, fd);                          // <compsylmax>
    putc(0, fd);                // for Vim 7.0b compatibility
    putc(spin->si_compoptions, fd);                         // <compoptions>
    put_bytes(fd, (uintmax_t)spin->si_comppat.ga_len, 2);   // <comppatcount>
    for (size_t i = 0; i < (size_t)spin->si_comppat.ga_len; ++i) {
      char_u *p = ((char_u **)(spin->si_comppat.ga_data))[i];
      assert(STRLEN(p) < INT_MAX);
      putc((int)STRLEN(p), fd);                             // <comppatlen>
      fwv &= fwrite(p, STRLEN(p), 1, fd);                   // <comppattext>
    }
    // <compflags>
    fwv &= fwrite(spin->si_compflags, STRLEN(spin->si_compflags), 1, fd);
  }

  // SN_NOBREAK: NOBREAK flag
  if (spin->si_nobreak) {
    putc(SN_NOBREAK, fd);                                   // <sectionID>
    putc(0, fd);                                            // <sectionflags>

    // It's empty, the presence of the section flags the feature.
    put_bytes(fd, 0, 4);                                    // <sectionlen>
  }

  // SN_SYLLABLE: syllable info.
  // We don't mark it required, when not supported syllables will not be
  // counted.
  if (spin->si_syllable != NULL) {
    putc(SN_SYLLABLE, fd);                                  // <sectionID>
    putc(0, fd);                                            // <sectionflags>

    size_t l = STRLEN(spin->si_syllable);
    put_bytes(fd, l, 4);                                    // <sectionlen>
    fwv &= fwrite(spin->si_syllable, l, 1, fd);             // <syllable>
  }

  // end of <SECTIONS>
  putc(SN_END, fd);                                         // <sectionend>


  // <LWORDTREE>  <KWORDTREE>  <PREFIXTREE>
  // round 1: case-folded tree, round 2: keep-case tree, round 3: prefix tree
  spin->si_memtot = 0;
  for (unsigned int round = 1; round <= 3; ++round) {
    wordnode_T *tree;
    if (round == 1) {
      tree = spin->si_foldroot->wn_sibling;
    } else if (round == 2) {
      tree = spin->si_keeproot->wn_sibling;
    } else {
      tree = spin->si_prefroot->wn_sibling;
    }

    // Clear the index and wnode fields in the tree.
    clear_node(tree);

    // Count the number of nodes.  Needed to be able to allocate the
    // memory when reading the nodes.  Also fills in index for shared
    // nodes.
    size_t nodecount = (size_t)put_node(NULL, tree, 0, regionmask, round == 3);

    // number of nodes in 4 bytes
    put_bytes(fd, nodecount, 4);                // <nodecount>
    assert(nodecount + nodecount * sizeof(int) < INT_MAX);
    spin->si_memtot += (int)(nodecount + nodecount * sizeof(int));

    // Write the nodes.
    (void)put_node(fd, tree, 0, regionmask, round == 3);
  }

  // Write another byte to check for errors (file system full).
  if (putc(0, fd) == EOF) {
    retval = FAIL;
  }
theend:
  if (fclose(fd) == EOF) {
    retval = FAIL;
  }

  // Any fwrite() call that did not return 1 makes the whole write fail.
  if (fwv != (size_t)1) {
    retval = FAIL;
  }
  if (retval == FAIL) {
    emsg(_(e_write));
  }

  return retval;
}
4731
4732 // Clear the index and wnode fields of "node", it siblings and its
4733 // children. This is needed because they are a union with other items to save
4734 // space.
clear_node(wordnode_T * node)4735 static void clear_node(wordnode_T *node)
4736 {
4737 wordnode_T *np;
4738
4739 if (node != NULL) {
4740 for (np = node; np != NULL; np = np->wn_sibling) {
4741 np->wn_u1.index = 0;
4742 np->wn_u2.wnode = NULL;
4743
4744 if (np->wn_byte != NUL) {
4745 clear_node(np->wn_child);
4746 }
4747 }
4748 }
4749 }
4750
4751
/// Dump a word tree at node "node".
///
/// This first writes the list of possible bytes (siblings).  Then for each
/// byte recursively write the children.
///
/// A child list that is shared is written only once: the first parent that
/// reaches it claims it by storing itself in the child's "wn_u2.wnode" and
/// writes it; other parents write a BY_INDEX reference to the index stored in
/// the child's "wn_u1.index".
///
/// NOTE: The code here must match the code in read_tree_node(), since
/// assumptions are made about the indexes (so that we don't have to write them
/// in the file).
///
/// @param fd  NULL when only counting
/// @param node  first node of the list of siblings; NULL for an empty tree
/// @param idx  index at which this list of siblings is stored
/// @param regionmask  mask with all regions, zero when regions are not used
/// @param prefixtree  true for PREFIXTREE
///
/// @return the index just after the last node used, which is the number of
///         nodes used when called with "idx" zero.  Zero for an empty tree
///         and when writing a byte failed.
static int put_node(FILE *fd, wordnode_T *node, int idx, int regionmask, bool prefixtree)
{
  // If "node" is zero the tree is empty.
  if (node == NULL) {
    return 0;
  }

  // Store the index where this node is written.
  node->wn_u1.index = idx;

  // Count the number of siblings.
  int siblingcount = 0;
  for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
    ++siblingcount;
  }

  // Write the sibling count.
  // NOTE(review): putc() writes a single byte, a count above 255 would not
  // fit - presumably that cannot happen; confirm.
  if (fd != NULL) {
    putc(siblingcount, fd);                                     // <siblingcount>
  }
  // Write each sibling byte and optionally extra info.
  for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
    if (np->wn_byte == 0) {
      if (fd != NULL) {
        // For a NUL byte (end of word) write the flags etc.
        if (prefixtree) {
          // In PREFIXTREE write the required affixID and the
          // associated condition nr (stored in wn_region).  The
          // byte value is misused to store the "rare" and "not
          // combining" flags
          if (np->wn_flags == (uint16_t)PFX_FLAGS) {
            putc(BY_NOFLAGS, fd);                               // <byte>
          } else {
            putc(BY_FLAGS, fd);                                 // <byte>
            putc(np->wn_flags, fd);                             // <pflags>
          }
          putc(np->wn_affixID, fd);                             // <affixID>
          put_bytes(fd, (uintmax_t)np->wn_region, 2);           // <prefcondnr>
        } else {
          // For word trees we write the flag/region items.
          int flags = np->wn_flags;
          // The region is only written when it is not "all regions".
          if (regionmask != 0 && np->wn_region != regionmask) {
            flags |= WF_REGION;
          }
          if (np->wn_affixID != 0) {
            flags |= WF_AFX;
          }
          if (flags == 0) {
            // word without flags or region
            putc(BY_NOFLAGS, fd);                               // <byte>
          } else {
            // Flags that need more than one byte get a second byte.
            if (np->wn_flags >= 0x100) {
              putc(BY_FLAGS2, fd);                              // <byte>
              putc(flags, fd);                                  // <flags>
              putc((int)((unsigned)flags >> 8), fd);            // <flags2>
            } else {
              putc(BY_FLAGS, fd);                               // <byte>
              putc(flags, fd);                                  // <flags>
            }
            if (flags & WF_REGION) {
              putc(np->wn_region, fd);                          // <region>
            }
            if (flags & WF_AFX) {
              putc(np->wn_affixID, fd);                         // <affixID>
            }
          }
        }
      }
    } else {
      // A byte node; its "wn_child" is dereferenced without a NULL check.
      if (np->wn_child->wn_u1.index != 0
          && np->wn_child->wn_u2.wnode != node) {
        // The child is written elsewhere, write the reference.
        if (fd != NULL) {
          putc(BY_INDEX, fd);                                   // <byte>
          put_bytes(fd, (uintmax_t)np->wn_child->wn_u1.index, 3);  // <nodeidx>
        }
      } else if (np->wn_child->wn_u2.wnode == NULL) {
        // We will write the child below and give it an index.
        np->wn_child->wn_u2.wnode = node;
      }

      if (fd != NULL) {
        if (putc(np->wn_byte, fd) == EOF) {                     // <byte> or <xbyte>
          emsg(_(e_write));
          return 0;
        }
      }
    }
  }

  // Space used in the array when reading: one for each sibling and one for
  // the count.
  int newindex = idx + siblingcount + 1;

  // Recursively dump the children of each sibling.
  // Only the children claimed by this list of siblings are written here.
  for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
    if (np->wn_byte != 0 && np->wn_child->wn_u2.wnode == node) {
      newindex = put_node(fd, np->wn_child, newindex, regionmask,
                          prefixtree);
    }
  }

  return newindex;
}
4869
4870
4871 // ":mkspell [-ascii] outfile infile ..."
4872 // ":mkspell [-ascii] addfile"
ex_mkspell(exarg_T * eap)4873 void ex_mkspell(exarg_T *eap)
4874 {
4875 int fcount;
4876 char_u **fnames;
4877 char_u *arg = eap->arg;
4878 bool ascii = false;
4879
4880 if (STRNCMP(arg, "-ascii", 6) == 0) {
4881 ascii = true;
4882 arg = skipwhite(arg + 6);
4883 }
4884
4885 // Expand all the remaining arguments (e.g., $VIMRUNTIME).
4886 if (get_arglist_exp(arg, &fcount, &fnames, false) == OK) {
4887 mkspell(fcount, fnames, ascii, eap->forceit, false);
4888 FreeWild(fcount, fnames);
4889 }
4890 }
4891
4892 // Create the .sug file.
4893 // Uses the soundfold info in "spin".
4894 // Writes the file with the name "wfname", with ".spl" changed to ".sug".
spell_make_sugfile(spellinfo_T * spin,char_u * wfname)4895 static void spell_make_sugfile(spellinfo_T *spin, char_u *wfname)
4896 {
4897 char_u *fname = NULL;
4898 int len;
4899 slang_T *slang;
4900 bool free_slang = false;
4901
4902 // Read back the .spl file that was written. This fills the required
4903 // info for soundfolding. This also uses less memory than the
4904 // pointer-linked version of the trie. And it avoids having two versions
4905 // of the code for the soundfolding stuff.
4906 // It might have been done already by spell_reload_one().
4907 for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
4908 if (path_full_compare(wfname, slang->sl_fname, false, true)
4909 == kEqualFiles) {
4910 break;
4911 }
4912 }
4913 if (slang == NULL) {
4914 spell_message(spin, (char_u *)_("Reading back spell file..."));
4915 slang = spell_load_file(wfname, NULL, NULL, false);
4916 if (slang == NULL) {
4917 return;
4918 }
4919 free_slang = true;
4920 }
4921
4922 // Clear the info in "spin" that is used.
4923 spin->si_blocks = NULL;
4924 spin->si_blocks_cnt = 0;
4925 spin->si_compress_cnt = 0; // will stay at 0 all the time
4926 spin->si_free_count = 0;
4927 spin->si_first_free = NULL;
4928 spin->si_foldwcount = 0;
4929
4930 // Go through the trie of good words, soundfold each word and add it to
4931 // the soundfold trie.
4932 spell_message(spin, (char_u *)_("Performing soundfolding..."));
4933 if (sug_filltree(spin, slang) == FAIL) {
4934 goto theend;
4935 }
4936
4937 // Create the table which links each soundfold word with a list of the
4938 // good words it may come from. Creates buffer "spin->si_spellbuf".
4939 // This also removes the wordnr from the NUL byte entries to make
4940 // compression possible.
4941 if (sug_maketable(spin) == FAIL) {
4942 goto theend;
4943 }
4944
4945 smsg(_("Number of words after soundfolding: %" PRId64),
4946 (int64_t)spin->si_spellbuf->b_ml.ml_line_count);
4947
4948 // Compress the soundfold trie.
4949 spell_message(spin, (char_u *)_(msg_compressing));
4950 wordtree_compress(spin, spin->si_foldroot, "case-folded");
4951
4952 // Write the .sug file.
4953 // Make the file name by changing ".spl" to ".sug".
4954 fname = xmalloc(MAXPATHL);
4955 STRLCPY(fname, wfname, MAXPATHL);
4956 len = (int)STRLEN(fname);
4957 fname[len - 2] = 'u';
4958 fname[len - 1] = 'g';
4959 sug_write(spin, fname);
4960
4961 theend:
4962 xfree(fname);
4963 if (free_slang) {
4964 slang_free(slang);
4965 }
4966 free_blocks(spin->si_blocks);
4967 close_spellbuf(spin->si_spellbuf);
4968 }
4969
// Build the soundfold trie for language "slang".
//
// Walks the case-folded word tree of "slang" ("sl_fbyts" / "sl_fidxs")
// without recursion: "depth" is the current level, "arridx[]" the array index
// of the node at each level and "curi[]" the next byte to try there.  Every
// word found is sound-folded and added to "spin->si_foldroot" with its word
// number stored in the flags and region fields.
//
// Side effect: "sl_fidxs" of each visited node is overwritten with the
// number of words counted below that node.
//
// Returns FAIL when adding a word fails, OK otherwise (also when the walk was
// stopped early because "got_int" was set).
static int sug_filltree(spellinfo_T *spin, slang_T *slang)
{
  char_u *byts;
  idx_T *idxs;
  int depth;
  idx_T arridx[MAXWLEN];
  int curi[MAXWLEN];
  char_u tword[MAXWLEN];
  char_u tsalword[MAXWLEN];
  int c;
  idx_T n;
  unsigned words_done = 0;
  int wordcount[MAXWLEN];

  // We use si_foldroot for the soundfolded trie.
  spin->si_foldroot = wordtree_alloc(spin);

  // Let tree_add_word() know we're adding to the soundfolded tree
  spin->si_sugtree = true;

  // Go through the whole case-folded tree, soundfold each word and put it
  // in the trie.
  byts = slang->sl_fbyts;
  idxs = slang->sl_fidxs;

  arridx[0] = 0;
  curi[0] = 1;
  wordcount[0] = 0;

  depth = 0;
  while (depth >= 0 && !got_int) {
    if (curi[depth] > byts[arridx[depth]]) {
      // Done all bytes at this node, go up one level.
      // Store the word count in the node and add it to the parent's count.
      idxs[arridx[depth]] = wordcount[depth];
      if (depth > 0) {
        wordcount[depth - 1] += wordcount[depth];
      }

      --depth;
      line_breakcheck();
    } else {
      // Do one more byte at this node.
      n = arridx[depth] + curi[depth];
      ++curi[depth];

      c = byts[n];
      if (c == 0) {
        // Sound-fold the word.
        tword[depth] = NUL;
        spell_soundfold(slang, tword, true, tsalword);

        // We use the "flags" field for the MSB of the wordnr,
        // "region" for the LSB of the wordnr.
        if (tree_add_word(spin, tsalword, spin->si_foldroot,
                          words_done >> 16, words_done & 0xffff,
                          0) == FAIL) {
          return FAIL;
        }

        ++words_done;
        ++wordcount[depth];

        // Reset the block count each time to avoid compression
        // kicking in.
        spin->si_blocks_cnt = 0;

        // Skip over any other NUL bytes (same word with different
        // flags).  But don't go over the end.
        while (n + 1 < slang->sl_fbyts_len && byts[n + 1] == 0) {
          n++;
          curi[depth]++;
        }
      } else {
        // Normal char, go one level deeper.
        // NOTE(review): "depth" is not checked against MAXWLEN here,
        // presumably the words in a loaded spell file are short enough -
        // confirm.
        tword[depth++] = c;
        arridx[depth] = idxs[n];
        curi[depth] = 1;
        wordcount[depth] = 0;
      }
    }
  }

  smsg(_("Total number of words: %d"), words_done);

  return OK;
}
5057
5058 // Make the table that links each word in the soundfold trie to the words it
5059 // can be produced from.
5060 // This is not unlike lines in a file, thus use a memfile to be able to access
5061 // the table efficiently.
5062 // Returns FAIL when out of memory.
sug_maketable(spellinfo_T * spin)5063 static int sug_maketable(spellinfo_T *spin)
5064 {
5065 garray_T ga;
5066 int res = OK;
5067
5068 // Allocate a buffer, open a memline for it and create the swap file
5069 // (uses a temp file, not a .swp file).
5070 spin->si_spellbuf = open_spellbuf();
5071
5072 // Use a buffer to store the line info, avoids allocating many small
5073 // pieces of memory.
5074 ga_init(&ga, 1, 100);
5075
5076 // recursively go through the tree
5077 if (sug_filltable(spin, spin->si_foldroot->wn_sibling, 0, &ga) == -1) {
5078 res = FAIL;
5079 }
5080
5081 ga_clear(&ga);
5082 return res;
5083 }
5084
5085 /// Fill the table for one node and its children.
5086 /// Returns the wordnr at the start of the node.
5087 /// Returns -1 when out of memory.
5088 ///
5089 /// @param gap place to store line of numbers
sug_filltable(spellinfo_T * spin,wordnode_T * node,int startwordnr,garray_T * gap)5090 static int sug_filltable(spellinfo_T *spin, wordnode_T *node, int startwordnr, garray_T *gap)
5091 {
5092 wordnode_T *p, *np;
5093 int wordnr = startwordnr;
5094 int nr;
5095 int prev_nr;
5096
5097 for (p = node; p != NULL; p = p->wn_sibling) {
5098 if (p->wn_byte == NUL) {
5099 gap->ga_len = 0;
5100 prev_nr = 0;
5101 for (np = p; np != NULL && np->wn_byte == NUL; np = np->wn_sibling) {
5102 ga_grow(gap, 10);
5103
5104 nr = (np->wn_flags << 16) + (np->wn_region & 0xffff);
5105 // Compute the offset from the previous nr and store the
5106 // offset in a way that it takes a minimum number of bytes.
5107 // It's a bit like utf-8, but without the need to mark
5108 // following bytes.
5109 nr -= prev_nr;
5110 prev_nr += nr;
5111 gap->ga_len += offset2bytes(nr,
5112 (char_u *)gap->ga_data + gap->ga_len);
5113 }
5114
5115 // add the NUL byte
5116 ((char_u *)gap->ga_data)[gap->ga_len++] = NUL;
5117
5118 if (ml_append_buf(spin->si_spellbuf, (linenr_T)wordnr,
5119 gap->ga_data, gap->ga_len, true) == FAIL) {
5120 return -1;
5121 }
5122 wordnr++;
5123
5124 // Remove extra NUL entries, we no longer need them. We don't
5125 // bother freeing the nodes, the won't be reused anyway.
5126 while (p->wn_sibling != NULL && p->wn_sibling->wn_byte == NUL) {
5127 p->wn_sibling = p->wn_sibling->wn_sibling;
5128 }
5129
5130 // Clear the flags on the remaining NUL node, so that compression
5131 // works a lot better.
5132 p->wn_flags = 0;
5133 p->wn_region = 0;
5134 } else {
5135 wordnr = sug_filltable(spin, p->wn_child, wordnr, gap);
5136 if (wordnr == -1) {
5137 return -1;
5138 }
5139 }
5140 }
5141 return wordnr;
5142 }
5143
/// Convert an offset into a minimal number of bytes.
///
/// Similar to utf_char2bytes(), but all 8 bits are used in the follow-up
/// bytes.  The number is split into base-255 digits and one is added to each
/// digit, so that for a non-negative "nr" none of the stored bytes is NUL.
///
/// @param nr   offset to encode
/// @param buf  receives the encoded bytes, at most four are written
///
/// @return  the number of bytes stored in "buf": 1, 2, 3 or 4.
static int offset2bytes(int nr, char_u *buf)
{
  // Base-255 digits, least significant first, each one incremented.
  const int d0 = nr % 255 + 1;
  const int q1 = nr / 255;
  const int d1 = q1 % 255 + 1;
  const int q2 = q1 / 255;
  const int d2 = q2 % 255 + 1;
  const int d3 = q2 / 255 + 1;

  // Pick the shortest form.  The lead byte carries the most significant
  // digit plus a prefix that tells how many bytes there are.
  int count;
  int lead;
  if (d3 > 1 || d2 > 0x1f) {
    count = 4;
    lead = 0xe0 + d3;
  } else if (d2 > 1 || d1 > 0x3f) {
    count = 3;
    lead = 0xc0 + d2;
  } else if (d1 > 1 || d0 > 0x7f) {
    count = 2;
    lead = 0x80 + d1;
  } else {
    count = 1;
    lead = d0;
  }

  // Lead byte first, then the remaining digits from high to low.
  char_u *p = buf;
  *p++ = (char_u)lead;
  if (count > 3) {
    *p++ = (char_u)d2;
  }
  if (count > 2) {
    *p++ = (char_u)d1;
  }
  if (count > 1) {
    *p++ = (char_u)d0;
  }
  return count;
}
5182
5183 // Write the .sug file in "fname".
sug_write(spellinfo_T * spin,char_u * fname)5184 static void sug_write(spellinfo_T *spin, char_u *fname)
5185 {
5186 // Create the file. Note that an existing file is silently overwritten!
5187 FILE *fd = os_fopen((char *)fname, "w");
5188 if (fd == NULL) {
5189 semsg(_(e_notopen), fname);
5190 return;
5191 }
5192
5193 vim_snprintf((char *)IObuff, IOSIZE,
5194 _("Writing suggestion file %s..."), fname);
5195 spell_message(spin, IObuff);
5196
5197 // <SUGHEADER>: <fileID> <versionnr> <timestamp>
5198 if (fwrite(VIMSUGMAGIC, VIMSUGMAGICL, (size_t)1, fd) != 1) { // <fileID>
5199 emsg(_(e_write));
5200 goto theend;
5201 }
5202 putc(VIMSUGVERSION, fd); // <versionnr>
5203
5204 // Write si_sugtime to the file.
5205 put_time(fd, spin->si_sugtime); // <timestamp>
5206
5207 // <SUGWORDTREE>
5208 spin->si_memtot = 0;
5209 wordnode_T *tree = spin->si_foldroot->wn_sibling;
5210
5211 // Clear the index and wnode fields in the tree.
5212 clear_node(tree);
5213
5214 // Count the number of nodes. Needed to be able to allocate the
5215 // memory when reading the nodes. Also fills in index for shared
5216 // nodes.
5217 size_t nodecount = (size_t)put_node(NULL, tree, 0, 0, false);
5218
5219 // number of nodes in 4 bytes
5220 put_bytes(fd, nodecount, 4); // <nodecount>
5221 assert(nodecount + nodecount * sizeof(int) < INT_MAX);
5222 spin->si_memtot += (int)(nodecount + nodecount * sizeof(int));
5223
5224 // Write the nodes.
5225 (void)put_node(fd, tree, 0, 0, false);
5226
5227 // <SUGTABLE>: <sugwcount> <sugline> ...
5228 linenr_T wcount = spin->si_spellbuf->b_ml.ml_line_count;
5229 assert(wcount >= 0);
5230 put_bytes(fd, (uintmax_t)wcount, 4); // <sugwcount>
5231
5232 for (linenr_T lnum = 1; lnum <= wcount; ++lnum) {
5233 // <sugline>: <sugnr> ... NUL
5234 char_u *line = ml_get_buf(spin->si_spellbuf, lnum, false);
5235 size_t len = STRLEN(line) + 1;
5236 if (fwrite(line, len, 1, fd) == 0) {
5237 emsg(_(e_write));
5238 goto theend;
5239 }
5240 assert((size_t)spin->si_memtot + len <= INT_MAX);
5241 spin->si_memtot += (int)len;
5242 }
5243
5244 // Write another byte to check for errors.
5245 if (putc(0, fd) == EOF) {
5246 emsg(_(e_write));
5247 }
5248
5249 vim_snprintf((char *)IObuff, IOSIZE,
5250 _("Estimated runtime memory use: %d bytes"), spin->si_memtot);
5251 spell_message(spin, IObuff);
5252
5253 theend:
5254 // close the file
5255 fclose(fd);
5256 }
5257
5258
5259 /// Create a Vim spell file from one or more word lists.
5260 /// "fnames[0]" is the output file name.
5261 /// "fnames[fcount - 1]" is the last input file name.
5262 /// Exception: when "fnames[0]" ends in ".add" it's used as the input file name
5263 /// and ".spl" is appended to make the output file name.
5264 ///
5265 /// @param ascii -ascii argument given
5266 /// @param over_write overwrite existing output file
5267 /// @param added_word invoked through "zg"
mkspell(int fcount,char_u ** fnames,bool ascii,bool over_write,bool added_word)5268 static void mkspell(int fcount, char_u **fnames, bool ascii, bool over_write, bool added_word)
5269 {
5270 char_u *fname = NULL;
5271 char_u *wfname;
5272 char_u **innames;
5273 int incount;
5274 afffile_T *(afile[MAXREGIONS]);
5275 int i;
5276 int len;
5277 bool error = false;
5278 spellinfo_T spin;
5279
5280 memset(&spin, 0, sizeof(spin));
5281 spin.si_verbose = !added_word;
5282 spin.si_ascii = ascii;
5283 spin.si_followup = true;
5284 spin.si_rem_accents = true;
5285 ga_init(&spin.si_rep, (int)sizeof(fromto_T), 20);
5286 ga_init(&spin.si_repsal, (int)sizeof(fromto_T), 20);
5287 ga_init(&spin.si_sal, (int)sizeof(fromto_T), 20);
5288 ga_init(&spin.si_map, (int)sizeof(char_u), 100);
5289 ga_init(&spin.si_comppat, (int)sizeof(char_u *), 20);
5290 ga_init(&spin.si_prefcond, (int)sizeof(char_u *), 50);
5291 hash_init(&spin.si_commonwords);
5292 spin.si_newcompID = 127; // start compound ID at first maximum
5293
5294 // default: fnames[0] is output file, following are input files
5295 // When "fcount" is 1 there is only one file.
5296 innames = &fnames[fcount == 1 ? 0 : 1];
5297 incount = fcount - 1;
5298
5299 wfname = xmalloc(MAXPATHL);
5300
5301 if (fcount >= 1) {
5302 len = (int)STRLEN(fnames[0]);
5303 if (fcount == 1 && len > 4 && STRCMP(fnames[0] + len - 4, ".add") == 0) {
5304 // For ":mkspell path/en.latin1.add" output file is
5305 // "path/en.latin1.add.spl".
5306 incount = 1;
5307 vim_snprintf((char *)wfname, MAXPATHL, "%s.spl", fnames[0]);
5308 } else if (fcount == 1) {
5309 // For ":mkspell path/vim" output file is "path/vim.latin1.spl".
5310 incount = 1;
5311 vim_snprintf((char *)wfname, MAXPATHL, SPL_FNAME_TMPL,
5312 fnames[0], spin.si_ascii ? (char_u *)"ascii" : spell_enc());
5313 } else if (len > 4 && STRCMP(fnames[0] + len - 4, ".spl") == 0) {
5314 // Name ends in ".spl", use as the file name.
5315 STRLCPY(wfname, fnames[0], MAXPATHL);
5316 } else {
5317 // Name should be language, make the file name from it.
5318 vim_snprintf((char *)wfname, MAXPATHL, SPL_FNAME_TMPL,
5319 fnames[0], spin.si_ascii ? (char_u *)"ascii" : spell_enc());
5320 }
5321
5322 // Check for .ascii.spl.
5323 if (strstr((char *)path_tail(wfname), SPL_FNAME_ASCII) != NULL) {
5324 spin.si_ascii = true;
5325 }
5326
5327 // Check for .add.spl.
5328 if (strstr((char *)path_tail(wfname), SPL_FNAME_ADD) != NULL) {
5329 spin.si_add = true;
5330 }
5331 }
5332
5333 if (incount <= 0) {
5334 emsg(_(e_invarg)); // need at least output and input names
5335 } else if (vim_strchr(path_tail(wfname), '_') != NULL) {
5336 emsg(_("E751: Output file name must not have region name"));
5337 } else if (incount > MAXREGIONS) {
5338 semsg(_("E754: Only up to %d regions supported"), MAXREGIONS);
5339 } else {
5340 // Check for overwriting before doing things that may take a lot of
5341 // time.
5342 if (!over_write && os_path_exists(wfname)) {
5343 emsg(_(e_exists));
5344 goto theend;
5345 }
5346 if (os_isdir(wfname)) {
5347 semsg(_(e_isadir2), wfname);
5348 goto theend;
5349 }
5350
5351 fname = xmalloc(MAXPATHL);
5352
5353 // Init the aff and dic pointers.
5354 // Get the region names if there are more than 2 arguments.
5355 for (i = 0; i < incount; ++i) {
5356 afile[i] = NULL;
5357
5358 if (incount > 1) {
5359 len = (int)STRLEN(innames[i]);
5360 if (STRLEN(path_tail(innames[i])) < 5
5361 || innames[i][len - 3] != '_') {
5362 semsg(_("E755: Invalid region in %s"), innames[i]);
5363 goto theend;
5364 }
5365 spin.si_region_name[i * 2] = TOLOWER_ASC(innames[i][len - 2]);
5366 spin.si_region_name[i * 2 + 1] =
5367 TOLOWER_ASC(innames[i][len - 1]);
5368 }
5369 }
5370 spin.si_region_count = incount;
5371
5372 spin.si_foldroot = wordtree_alloc(&spin);
5373 spin.si_keeproot = wordtree_alloc(&spin);
5374 spin.si_prefroot = wordtree_alloc(&spin);
5375
5376 // When not producing a .add.spl file clear the character table when
5377 // we encounter one in the .aff file. This means we dump the current
5378 // one in the .spl file if the .aff file doesn't define one. That's
5379 // better than guessing the contents, the table will match a
5380 // previously loaded spell file.
5381 if (!spin.si_add) {
5382 spin.si_clear_chartab = true;
5383 }
5384
5385 // Read all the .aff and .dic files.
5386 // Text is converted to 'encoding'.
5387 // Words are stored in the case-folded and keep-case trees.
5388 for (i = 0; i < incount && !error; ++i) {
5389 spin.si_conv.vc_type = CONV_NONE;
5390 spin.si_region = 1 << i;
5391
5392 vim_snprintf((char *)fname, MAXPATHL, "%s.aff", innames[i]);
5393 if (os_path_exists(fname)) {
5394 // Read the .aff file. Will init "spin->si_conv" based on the
5395 // "SET" line.
5396 afile[i] = spell_read_aff(&spin, fname);
5397 if (afile[i] == NULL) {
5398 error = true;
5399 } else {
5400 // Read the .dic file and store the words in the trees.
5401 vim_snprintf((char *)fname, MAXPATHL, "%s.dic",
5402 innames[i]);
5403 if (spell_read_dic(&spin, fname, afile[i]) == FAIL) {
5404 error = true;
5405 }
5406 }
5407 } else {
5408 // No .aff file, try reading the file as a word list. Store
5409 // the words in the trees.
5410 if (spell_read_wordfile(&spin, innames[i]) == FAIL) {
5411 error = true;
5412 }
5413 }
5414
5415 // Free any conversion stuff.
5416 convert_setup(&spin.si_conv, NULL, NULL);
5417 }
5418
5419 if (spin.si_compflags != NULL && spin.si_nobreak) {
5420 msg(_("Warning: both compounding and NOBREAK specified"));
5421 }
5422
5423 if (!error && !got_int) {
5424 // Combine tails in the tree.
5425 spell_message(&spin, (char_u *)_(msg_compressing));
5426 wordtree_compress(&spin, spin.si_foldroot, "case-folded");
5427 wordtree_compress(&spin, spin.si_keeproot, "keep-case");
5428 wordtree_compress(&spin, spin.si_prefroot, "prefixes");
5429 }
5430
5431 if (!error && !got_int) {
5432 // Write the info in the spell file.
5433 vim_snprintf((char *)IObuff, IOSIZE,
5434 _("Writing spell file %s..."), wfname);
5435 spell_message(&spin, IObuff);
5436
5437 error = write_vim_spell(&spin, wfname) == FAIL;
5438
5439 spell_message(&spin, (char_u *)_("Done!"));
5440 vim_snprintf((char *)IObuff, IOSIZE,
5441 _("Estimated runtime memory use: %d bytes"), spin.si_memtot);
5442 spell_message(&spin, IObuff);
5443
5444 // If the file is loaded need to reload it.
5445 if (!error) {
5446 spell_reload_one(wfname, added_word);
5447 }
5448 }
5449
5450 // Free the allocated memory.
5451 ga_clear(&spin.si_rep);
5452 ga_clear(&spin.si_repsal);
5453 ga_clear(&spin.si_sal);
5454 ga_clear(&spin.si_map);
5455 ga_clear(&spin.si_comppat);
5456 ga_clear(&spin.si_prefcond);
5457 hash_clear_all(&spin.si_commonwords, 0);
5458
5459 // Free the .aff file structures.
5460 for (i = 0; i < incount; ++i) {
5461 if (afile[i] != NULL) {
5462 spell_free_aff(afile[i]);
5463 }
5464 }
5465
5466 // Free all the bits and pieces at once.
5467 free_blocks(spin.si_blocks);
5468
5469 // If there is soundfolding info and no NOSUGFILE item create the
5470 // .sug file with the soundfolded word trie.
5471 if (spin.si_sugtime != 0 && !error && !got_int) {
5472 spell_make_sugfile(&spin, wfname);
5473 }
5474 }
5475
5476 theend:
5477 xfree(fname);
5478 xfree(wfname);
5479 }
5480
5481 // Display a message for spell file processing when 'verbose' is set or using
5482 // ":mkspell". "str" can be IObuff.
spell_message(const spellinfo_T * spin,char_u * str)5483 static void spell_message(const spellinfo_T *spin, char_u *str)
5484 FUNC_ATTR_NONNULL_ALL
5485 {
5486 if (spin->si_verbose || p_verbose > 2) {
5487 if (!spin->si_verbose) {
5488 verbose_enter();
5489 }
5490 msg((char *)str);
5491 ui_flush();
5492 if (!spin->si_verbose) {
5493 verbose_leave();
5494 }
5495 }
5496 }
5497
5498 // ":[count]spellgood {word}"
5499 // ":[count]spellwrong {word}"
5500 // ":[count]spellundo {word}"
5501 // ":[count]spellrare {word}"
ex_spell(exarg_T * eap)5502 void ex_spell(exarg_T *eap)
5503 {
5504 spell_add_word(eap->arg, (int)STRLEN(eap->arg),
5505 eap->cmdidx == CMD_spellwrong ? SPELL_ADD_BAD :
5506 eap->cmdidx == CMD_spellrare ? SPELL_ADD_RARE : SPELL_ADD_GOOD,
5507 eap->forceit ? 0 : (int)eap->line2,
5508 eap->cmdidx == CMD_spellundo);
5509 }
5510
5511 /// Add "word[len]" to 'spellfile' as a good or bad word.
5512 ///
5513 /// @param what SPELL_ADD_ values
5514 /// @param idx "zG" and "zW": zero, otherwise index in 'spellfile'
5515 /// @param bool // true for "zug", "zuG", "zuw" and "zuW"
spell_add_word(char_u * word,int len,SpellAddType what,int idx,bool undo)5516 void spell_add_word(char_u *word, int len, SpellAddType what, int idx, bool undo)
5517 {
5518 FILE *fd = NULL;
5519 buf_T *buf = NULL;
5520 bool new_spf = false;
5521 char_u *fname;
5522 char_u *fnamebuf = NULL;
5523 char_u line[MAXWLEN * 2];
5524 long fpos, fpos_next = 0;
5525 int i;
5526 char_u *spf;
5527
5528 if (idx == 0) { // use internal wordlist
5529 if (int_wordlist == NULL) {
5530 int_wordlist = vim_tempname();
5531 if (int_wordlist == NULL) {
5532 return;
5533 }
5534 }
5535 fname = int_wordlist;
5536 } else {
5537 // If 'spellfile' isn't set figure out a good default value.
5538 if (*curwin->w_s->b_p_spf == NUL) {
5539 init_spellfile();
5540 new_spf = true;
5541 }
5542
5543 if (*curwin->w_s->b_p_spf == NUL) {
5544 semsg(_(e_notset), "spellfile");
5545 return;
5546 }
5547 fnamebuf = xmalloc(MAXPATHL);
5548
5549 for (spf = curwin->w_s->b_p_spf, i = 1; *spf != NUL; ++i) {
5550 copy_option_part(&spf, fnamebuf, MAXPATHL, ",");
5551 if (i == idx) {
5552 break;
5553 }
5554 if (*spf == NUL) {
5555 semsg(_("E765: 'spellfile' does not have %" PRId64 " entries"), (int64_t)idx);
5556 xfree(fnamebuf);
5557 return;
5558 }
5559 }
5560
5561 // Check that the user isn't editing the .add file somewhere.
5562 buf = buflist_findname_exp(fnamebuf);
5563 if (buf != NULL && buf->b_ml.ml_mfp == NULL) {
5564 buf = NULL;
5565 }
5566 if (buf != NULL && bufIsChanged(buf)) {
5567 emsg(_(e_bufloaded));
5568 xfree(fnamebuf);
5569 return;
5570 }
5571
5572 fname = fnamebuf;
5573 }
5574
5575 if (what == SPELL_ADD_BAD || undo) {
5576 // When the word appears as good word we need to remove that one,
5577 // since its flags sort before the one with WF_BANNED.
5578 fd = os_fopen((char *)fname, "r");
5579 if (fd != NULL) {
5580 while (!vim_fgets(line, MAXWLEN * 2, fd)) {
5581 fpos = fpos_next;
5582 fpos_next = ftell(fd);
5583 if (STRNCMP(word, line, len) == 0
5584 && (line[len] == '/' || line[len] < ' ')) {
5585 // Found duplicate word. Remove it by writing a '#' at
5586 // the start of the line. Mixing reading and writing
5587 // doesn't work for all systems, close the file first.
5588 fclose(fd);
5589 fd = os_fopen((char *)fname, "r+");
5590 if (fd == NULL) {
5591 break;
5592 }
5593 if (fseek(fd, fpos, SEEK_SET) == 0) {
5594 fputc('#', fd);
5595 if (undo) {
5596 home_replace(NULL, fname, NameBuff, MAXPATHL, TRUE);
5597 smsg(_("Word '%.*s' removed from %s"),
5598 len, word, NameBuff);
5599 }
5600 }
5601 if (fseek(fd, fpos_next, SEEK_SET) != 0) {
5602 PERROR(_("Seek error in spellfile"));
5603 break;
5604 }
5605 }
5606 }
5607 if (fd != NULL) {
5608 fclose(fd);
5609 }
5610 }
5611 }
5612
5613 if (!undo) {
5614 fd = os_fopen((char *)fname, "a");
5615 if (fd == NULL && new_spf) {
5616 char_u *p;
5617
5618 // We just initialized the 'spellfile' option and can't open the
5619 // file. We may need to create the "spell" directory first. We
5620 // already checked the runtime directory is writable in
5621 // init_spellfile().
5622 if (!dir_of_file_exists(fname) && (p = path_tail_with_sep(fname)) != fname) {
5623 int c = *p;
5624
5625 // The directory doesn't exist. Try creating it and opening
5626 // the file again.
5627 *p = NUL;
5628 os_mkdir((char *)fname, 0755);
5629 *p = c;
5630 fd = os_fopen((char *)fname, "a");
5631 }
5632 }
5633
5634 if (fd == NULL) {
5635 semsg(_(e_notopen), fname);
5636 } else {
5637 if (what == SPELL_ADD_BAD) {
5638 fprintf(fd, "%.*s/!\n", len, word);
5639 } else if (what == SPELL_ADD_RARE) {
5640 fprintf(fd, "%.*s/?\n", len, word);
5641 } else {
5642 fprintf(fd, "%.*s\n", len, word);
5643 }
5644 fclose(fd);
5645
5646 home_replace(NULL, fname, NameBuff, MAXPATHL, TRUE);
5647 smsg(_("Word '%.*s' added to %s"), len, word, NameBuff);
5648 }
5649 }
5650
5651 if (fd != NULL) {
5652 // Update the .add.spl file.
5653 mkspell(1, &fname, false, true, true);
5654
5655 // If the .add file is edited somewhere, reload it.
5656 if (buf != NULL) {
5657 buf_reload(buf, buf->b_orig_mode);
5658 }
5659
5660 redraw_all_later(SOME_VALID);
5661 }
5662 xfree(fnamebuf);
5663 }
5664
5665 // Initialize 'spellfile' for the current buffer.
init_spellfile(void)5666 static void init_spellfile(void)
5667 {
5668 char_u *buf;
5669 int l;
5670 char_u *fname;
5671 char_u *rtp;
5672 char_u *lend;
5673 bool aspath = false;
5674 char_u *lstart = curbuf->b_s.b_p_spl;
5675
5676 if (*curwin->w_s->b_p_spl != NUL && !GA_EMPTY(&curwin->w_s->b_langp)) {
5677 buf = xmalloc(MAXPATHL);
5678
5679 // Find the end of the language name. Exclude the region. If there
5680 // is a path separator remember the start of the tail.
5681 for (lend = curwin->w_s->b_p_spl; *lend != NUL
5682 && vim_strchr((char_u *)",._", *lend) == NULL; ++lend) {
5683 if (vim_ispathsep(*lend)) {
5684 aspath = true;
5685 lstart = lend + 1;
5686 }
5687 }
5688
5689 // Loop over all entries in 'runtimepath'. Use the first one where we
5690 // are allowed to write.
5691 rtp = p_rtp;
5692 while (*rtp != NUL) {
5693 if (aspath) {
5694 // Use directory of an entry with path, e.g., for
5695 // "/dir/lg.utf-8.spl" use "/dir".
5696 STRLCPY(buf, curbuf->b_s.b_p_spl,
5697 lstart - curbuf->b_s.b_p_spl);
5698 } else {
5699 // Copy the path from 'runtimepath' to buf[].
5700 copy_option_part(&rtp, buf, MAXPATHL, ",");
5701 }
5702 if (os_file_is_writable((char *)buf) == 2) {
5703 // Use the first language name from 'spelllang' and the
5704 // encoding used in the first loaded .spl file.
5705 if (aspath) {
5706 STRLCPY(buf, curbuf->b_s.b_p_spl,
5707 lend - curbuf->b_s.b_p_spl + 1);
5708 } else {
5709 // Create the "spell" directory if it doesn't exist yet.
5710 l = (int)STRLEN(buf);
5711 vim_snprintf((char *)buf + l, MAXPATHL - l, "/spell");
5712 if (os_file_is_writable((char *)buf) != 2) {
5713 os_mkdir((char *)buf, 0755);
5714 }
5715
5716 l = (int)STRLEN(buf);
5717 vim_snprintf((char *)buf + l, MAXPATHL - l,
5718 "/%.*s", (int)(lend - lstart), lstart);
5719 }
5720 l = (int)STRLEN(buf);
5721 fname = LANGP_ENTRY(curwin->w_s->b_langp, 0)
5722 ->lp_slang->sl_fname;
5723 vim_snprintf((char *)buf + l, MAXPATHL - l, ".%s.add",
5724 ((fname != NULL
5725 && strstr((char *)path_tail(fname), ".ascii.") != NULL)
5726 ? "ascii"
5727 : (const char *)spell_enc()));
5728 set_option_value("spellfile", 0L, (const char *)buf, OPT_LOCAL);
5729 break;
5730 }
5731 aspath = false;
5732 }
5733
5734 xfree(buf);
5735 }
5736 }
5737
5738 /// Set the spell character tables from strings in the .spl file.
5739 ///
5740 /// @param cnt length of "flags"
set_spell_charflags(char_u * flags,int cnt,char_u * fol)5741 static void set_spell_charflags(char_u *flags, int cnt, char_u *fol)
5742 {
5743 // We build the new tables here first, so that we can compare with the
5744 // previous one.
5745 spelltab_T new_st;
5746 int i;
5747 char_u *p = fol;
5748 int c;
5749
5750 clear_spell_chartab(&new_st);
5751
5752 for (i = 0; i < 128; ++i) {
5753 if (i < cnt) {
5754 new_st.st_isw[i + 128] = (flags[i] & CF_WORD) != 0;
5755 new_st.st_isu[i + 128] = (flags[i] & CF_UPPER) != 0;
5756 }
5757
5758 if (*p != NUL) {
5759 c = mb_ptr2char_adv((const char_u **)&p);
5760 new_st.st_fold[i + 128] = c;
5761 if (i + 128 != c && new_st.st_isu[i + 128] && c < 256) {
5762 new_st.st_upper[c] = i + 128;
5763 }
5764 }
5765 }
5766
5767 (void)set_spell_finish(&new_st);
5768 }
5769
/// Make "new_st" the spell character table in use, or, when a table was set
/// before, check that "new_st" is equal to it.
///
/// @param new_st  the table that was just built
///
/// @return  OK when the table was installed or matches the one in use, FAIL
///          (after giving error E763) when it differs.
static int set_spell_finish(spelltab_T *new_st)
{
  if (!did_set_spelltab) {
    // First table: copy the new spelltab into the one being used.
    spelltab = *new_st;
    did_set_spelltab = true;
    return OK;
  }

  // A table is already in use, check that it's the same table.
  for (int c = 0; c < 256; c++) {
    const bool same = spelltab.st_isw[c] == new_st->st_isw[c]
                      && spelltab.st_isu[c] == new_st->st_isu[c]
                      && spelltab.st_fold[c] == new_st->st_fold[c]
                      && spelltab.st_upper[c] == new_st->st_upper[c];
    if (!same) {
      emsg(_("E763: Word characters differ between spell files"));
      return FAIL;
    }
  }

  return OK;
}
5793
5794 // Write the table with prefix conditions to the .spl file.
5795 // When "fd" is NULL only count the length of what is written.
write_spell_prefcond(FILE * fd,garray_T * gap)5796 static int write_spell_prefcond(FILE *fd, garray_T *gap)
5797 {
5798 assert(gap->ga_len >= 0);
5799
5800 if (fd != NULL) {
5801 put_bytes(fd, (uintmax_t)gap->ga_len, 2); // <prefcondcnt>
5802 }
5803 size_t totlen = 2 + (size_t)gap->ga_len; // <prefcondcnt> and <condlen> bytes
5804 size_t x = 1; // collect return value of fwrite()
5805 for (int i = 0; i < gap->ga_len; ++i) {
5806 // <prefcond> : <condlen> <condstr>
5807 char_u *p = ((char_u **)gap->ga_data)[i];
5808 if (p != NULL) {
5809 size_t len = STRLEN(p);
5810 if (fd != NULL) {
5811 assert(len <= INT_MAX);
5812 fputc((int)len, fd);
5813 x &= fwrite(p, len, 1, fd);
5814 }
5815 totlen += len;
5816 } else if (fd != NULL) {
5817 fputc(0, fd);
5818 }
5819 }
5820
5821 assert(totlen <= INT_MAX);
5822 return (int)totlen;
5823 }
5824
5825 // Use map string "map" for languages "lp".
set_map_str(slang_T * lp,char_u * map)5826 static void set_map_str(slang_T *lp, char_u *map)
5827 {
5828 char_u *p;
5829 int headc = 0;
5830 int c;
5831 int i;
5832
5833 if (*map == NUL) {
5834 lp->sl_has_map = false;
5835 return;
5836 }
5837 lp->sl_has_map = true;
5838
5839 // Init the array and hash tables empty.
5840 for (i = 0; i < 256; ++i) {
5841 lp->sl_map_array[i] = 0;
5842 }
5843 hash_init(&lp->sl_map_hash);
5844
5845 // The similar characters are stored separated with slashes:
5846 // "aaa/bbb/ccc/". Fill sl_map_array[c] with the character before c and
5847 // before the same slash. For characters above 255 sl_map_hash is used.
5848 for (p = map; *p != NUL;) {
5849 c = mb_cptr2char_adv((const char_u **)&p);
5850 if (c == '/') {
5851 headc = 0;
5852 } else {
5853 if (headc == 0) {
5854 headc = c;
5855 }
5856
5857 // Characters above 255 don't fit in sl_map_array[], put them in
5858 // the hash table. Each entry is the char, a NUL the headchar and
5859 // a NUL.
5860 if (c >= 256) {
5861 int cl = utf_char2len(c);
5862 int headcl = utf_char2len(headc);
5863 char_u *b;
5864 hash_T hash;
5865 hashitem_T *hi;
5866
5867 b = xmalloc(cl + headcl + 2);
5868 utf_char2bytes(c, b);
5869 b[cl] = NUL;
5870 utf_char2bytes(headc, b + cl + 1);
5871 b[cl + 1 + headcl] = NUL;
5872 hash = hash_hash(b);
5873 hi = hash_lookup(&lp->sl_map_hash, (const char *)b, STRLEN(b), hash);
5874 if (HASHITEM_EMPTY(hi)) {
5875 hash_add_item(&lp->sl_map_hash, hi, b, hash);
5876 } else {
5877 // This should have been checked when generating the .spl
5878 // file.
5879 emsg(_("E783: duplicate char in MAP entry"));
5880 xfree(b);
5881 }
5882 } else {
5883 lp->sl_map_array[c] = headc;
5884 }
5885 }
5886 }
5887 }
5888
5889