1 // This is an open source non-commercial project. Dear PVS-Studio, please check
2 // it. PVS-Studio Static Code Analyzer for C, C++ and C#: http://www.viva64.com
3 
4 // spellfile.c: code for reading and writing spell files.
5 //
6 // See spell.c for information about spell checking.
7 
8 // Vim spell file format: <HEADER>
9 //                        <SECTIONS>
10 //                        <LWORDTREE>
11 //                        <KWORDTREE>
12 //                        <PREFIXTREE>
13 //
14 // <HEADER>: <fileID> <versionnr>
15 //
16 // <fileID>     8 bytes    "VIMspell"
17 // <versionnr>  1 byte      VIMSPELLVERSION
18 //
19 //
20 // Sections make it possible to add information to the .spl file without
21 // making it incompatible with previous versions.  There are two kinds of
22 // sections:
23 // 1. Not essential for correct spell checking.  E.g. for making suggestions.
24 //    These are skipped when not supported.
25 // 2. Optional information, but essential for spell checking when present.
26 //    E.g. conditions for affixes.  When this section is present but not
27 //    supported an error message is given.
28 //
29 // <SECTIONS>: <section> ... <sectionend>
30 //
31 // <section>: <sectionID> <sectionflags> <sectionlen> (section contents)
32 //
33 // <sectionID>    1 byte    number from 0 to 254 identifying the section
34 //
35 // <sectionflags> 1 byte    SNF_REQUIRED: this section is required for correct
36 //                                          spell checking
37 //
38 // <sectionlen>   4 bytes   length of section contents, MSB first
39 //
40 // <sectionend>   1 byte    SN_END
41 //
42 //
43 // sectionID == SN_INFO: <infotext>
44 // <infotext>    N bytes    free format text with spell file info (version,
45 //                          website, etc)
46 //
47 // sectionID == SN_REGION: <regionname> ...
48 // <regionname>  2 bytes    Up to MAXREGIONS region names: ca, au, etc.
49 //                          Lower case.
50 //                          First <regionname> is region 1.
51 //
52 // sectionID == SN_CHARFLAGS: <charflagslen> <charflags>
53 //                              <folcharslen> <folchars>
54 // <charflagslen> 1 byte    Number of bytes in <charflags> (should be 128).
55 // <charflags>  N bytes     List of flags (first one is for character 128):
56 //                          0x01  word character        CF_WORD
57 //                          0x02  upper-case character  CF_UPPER
58 // <folcharslen>  2 bytes   Number of bytes in <folchars>.
59 // <folchars>     N bytes   Folded characters, first one is for character 128.
60 //
61 // sectionID == SN_MIDWORD: <midword>
62 // <midword>     N bytes    Characters that are word characters only when used
63 //                          in the middle of a word.
64 //
65 // sectionID == SN_PREFCOND: <prefcondcnt> <prefcond> ...
66 // <prefcondcnt> 2 bytes    Number of <prefcond> items following.
67 // <prefcond> : <condlen> <condstr>
68 // <condlen>    1 byte      Length of <condstr>.
69 // <condstr>    N bytes     Condition for the prefix.
70 //
71 // sectionID == SN_REP: <repcount> <rep> ...
72 // <repcount>    2 bytes    number of <rep> items, MSB first.
73 // <rep> : <repfromlen> <repfrom> <reptolen> <repto>
74 // <repfromlen>  1 byte     length of <repfrom>
75 // <repfrom>     N bytes    "from" part of replacement
76 // <reptolen>    1 byte     length of <repto>
77 // <repto>       N bytes    "to" part of replacement
78 //
79 // sectionID == SN_REPSAL: <repcount> <rep> ...
80 //   just like SN_REP but for soundfolded words
81 //
82 // sectionID == SN_SAL: <salflags> <salcount> <sal> ...
83 // <salflags>    1 byte     flags for soundsalike conversion:
84 //                          SAL_F0LLOWUP
85 //                          SAL_COLLAPSE
86 //                          SAL_REM_ACCENTS
87 // <salcount>    2 bytes    number of <sal> items following
88 // <sal> : <salfromlen> <salfrom> <saltolen> <salto>
89 // <salfromlen>  1 byte     length of <salfrom>
90 // <salfrom>     N bytes    "from" part of soundsalike
91 // <saltolen>    1 byte     length of <salto>
92 // <salto>       N bytes    "to" part of soundsalike
93 //
94 // sectionID == SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
95 // <sofofromlen> 2 bytes    length of <sofofrom>
96 // <sofofrom>    N bytes    "from" part of soundfold
97 // <sofotolen>   2 bytes    length of <sofoto>
98 // <sofoto>      N bytes    "to" part of soundfold
99 //
100 // sectionID == SN_SUGFILE: <timestamp>
101 // <timestamp>   8 bytes    time in seconds that must match with .sug file
102 //
103 // sectionID == SN_NOSPLITSUGS: nothing
104 //
105 // sectionID == SN_NOCOMPOUNDSUGS: nothing
106 //
107 // sectionID == SN_WORDS: <word> ...
108 // <word>        N bytes    NUL terminated common word
109 //
110 // sectionID == SN_MAP: <mapstr>
111 // <mapstr>      N bytes    String with sequences of similar characters,
112 //                          separated by slashes.
113 //
114 // sectionID == SN_COMPOUND: <compmax> <compminlen> <compsylmax> <compoptions>
115 //                              <comppatcount> <comppattern> ... <compflags>
116 // <compmax>     1 byte     Maximum nr of words in compound word.
117 // <compminlen>  1 byte     Minimal word length for compounding.
118 // <compsylmax>  1 byte     Maximum nr of syllables in compound word.
119 // <compoptions> 2 bytes    COMP_ flags.
120 // <comppatcount> 2 bytes   number of <comppattern> following
121 // <compflags>   N bytes    Flags from COMPOUNDRULE items, separated by
122 //                          slashes.
123 //
124 // <comppattern>: <comppatlen> <comppattext>
125 // <comppatlen>  1 byte     length of <comppattext>
126 // <comppattext> N bytes    end or begin chars from CHECKCOMPOUNDPATTERN
127 //
128 // sectionID == SN_NOBREAK: (empty, its presence is what matters)
129 //
130 // sectionID == SN_SYLLABLE: <syllable>
131 // <syllable>    N bytes    String from SYLLABLE item.
132 //
133 // <LWORDTREE>: <wordtree>
134 //
135 // <KWORDTREE>: <wordtree>
136 //
137 // <PREFIXTREE>: <wordtree>
138 //
139 //
140 // <wordtree>: <nodecount> <nodedata> ...
141 //
142 // <nodecount>  4 bytes     Number of nodes following.  MSB first.
143 //
144 // <nodedata>: <siblingcount> <sibling> ...
145 //
146 // <siblingcount> 1 byte    Number of siblings in this node.  The siblings
147 //                          follow in sorted order.
148 //
149 // <sibling>: <byte> [ <nodeidx> <xbyte>
150 //                    | <flags> [<flags2>] [<region>] [<affixID>]
151 //                    | [<pflags>] <affixID> <prefcondnr> ]
152 //
153 // <byte>       1 byte      Byte value of the sibling.  Special cases:
154 //                          BY_NOFLAGS: End of word without flags and for all
155 //                                      regions.
156 //                                      For PREFIXTREE <affixID> and
157 //                                      <prefcondnr> follow.
158 //                          BY_FLAGS:   End of word, <flags> follow.
159 //                                      For PREFIXTREE <pflags>, <affixID>
160 //                                      and <prefcondnr> follow.
161 //                          BY_FLAGS2:  End of word, <flags> and <flags2>
162 //                                      follow.  Not used in PREFIXTREE.
163 //                          BY_INDEX:   Child of sibling is shared, <nodeidx>
164 //                                      and <xbyte> follow.
165 //
166 // <nodeidx>    3 bytes     Index of child for this sibling, MSB first.
167 //
168 // <xbyte>      1 byte      Byte value of the sibling.
169 //
170 // <flags>      1 byte      Bitmask of:
171 //                          WF_ALLCAP   word must have only capitals
172 //                          WF_ONECAP   first char of word must be capital
173 //                          WF_KEEPCAP  keep-case word
174 //                          WF_FIXCAP   keep-case word, all caps not allowed
175 //                          WF_RARE     rare word
176 //                          WF_BANNED   bad word
177 //                          WF_REGION   <region> follows
178 //                          WF_AFX      <affixID> follows
179 //
180 // <flags2>     1 byte      Bitmask of:
181 //                          WF_HAS_AFF >> 8   word includes affix
182 //                          WF_NEEDCOMP >> 8  word only valid in compound
183 //                          WF_NOSUGGEST >> 8  word not used for suggestions
184 //                          WF_COMPROOT >> 8  word already a compound
185 //                          WF_NOCOMPBEF >> 8 no compounding before this word
186 //                          WF_NOCOMPAFT >> 8 no compounding after this word
187 //
188 // <pflags>     1 byte      Bitmask of:
189 //                          WFP_RARE    rare prefix
190 //                          WFP_NC      non-combining prefix
191 //                          WFP_UP      letter after prefix made upper case
192 //
193 // <region>     1 byte      Bitmask for regions in which word is valid.  When
194 //                          omitted it's valid in all regions.
195 //                          Lowest bit is for region 1.
196 //
197 // <affixID>    1 byte      ID of affix that can be used with this word.  In
198 //                          PREFIXTREE used for the required prefix ID.
199 //
200 // <prefcondnr> 2 bytes     Prefix condition number, index in <prefcond> list
201 //                          from HEADER.
202 //
203 // All text characters are in 'encoding', but stored as single bytes.
204 
205 // Vim .sug file format:  <SUGHEADER>
206 //                        <SUGWORDTREE>
207 //                        <SUGTABLE>
208 //
209 // <SUGHEADER>: <fileID> <versionnr> <timestamp>
210 //
211 // <fileID>     6 bytes     "VIMsug"
212 // <versionnr>  1 byte      VIMSUGVERSION
213 // <timestamp>  8 bytes     timestamp that must match with .spl file
214 //
215 //
216 // <SUGWORDTREE>: <wordtree>  (see above, no flags or region used)
217 //
218 //
219 // <SUGTABLE>: <sugwcount> <sugline> ...
220 //
221 // <sugwcount>  4 bytes     number of <sugline> following
222 //
223 // <sugline>: <sugnr> ... NUL
224 //
// <sugnr>:     X bytes     word number that results in this soundfolded word,
//                          stored as an offset to the previous number in as
//                          few bytes as possible (see offset2bytes())
228 
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <wctype.h>
232 
233 #include "nvim/ascii.h"
234 #include "nvim/buffer.h"
235 #include "nvim/charset.h"
236 #include "nvim/ex_cmds2.h"
237 #include "nvim/fileio.h"
238 #include "nvim/memline.h"
239 #include "nvim/memory.h"
240 #include "nvim/misc1.h"
241 #include "nvim/option.h"
242 #include "nvim/os/os.h"
243 #include "nvim/path.h"
244 #include "nvim/regexp.h"
245 #include "nvim/screen.h"
246 #include "nvim/spell.h"
247 #include "nvim/spell_defs.h"
248 #include "nvim/spellfile.h"
249 #include "nvim/ui.h"
250 #include "nvim/undo.h"
251 #include "nvim/vim.h"
252 
253 #ifndef UNIX            // it's in os/unix_defs.h for Unix
254 # include <time.h>      // for time_t
255 #endif
256 
257 // Special byte values for <byte>.  Some are only used in the tree for
258 // postponed prefixes, some only in the other trees.  This is a bit messy...
259 #define BY_NOFLAGS      0       // end of word without flags or region; for
260                                 // postponed prefix: no <pflags>
261 #define BY_INDEX        1       // child is shared, index follows
262 #define BY_FLAGS        2       // end of word, <flags> byte follows; for
263                                 // postponed prefix: <pflags> follows
264 #define BY_FLAGS2       3       // end of word, <flags> and <flags2> bytes
265                                 // follow; never used in prefix tree
266 #define BY_SPECIAL  BY_FLAGS2   // highest special byte value
267 
268 #define ZERO_FLAG   65009       // used when flag is zero: "0"
269 
// Flags used in .spl file for soundsalike flags.  These are the bits stored
// in the <salflags> byte of the SN_SAL section.
// Note: SAL_F0LLOWUP is spelled with a digit zero, not the letter "O".
#define SAL_F0LLOWUP            1
#define SAL_COLLAPSE            2
#define SAL_REM_ACCENTS         4
274 
275 #define VIMSPELLMAGIC "VIMspell"  // string at start of Vim spell file
276 #define VIMSPELLMAGICL (sizeof(VIMSPELLMAGIC) - 1)
277 #define VIMSPELLVERSION 50
278 
279 // Section IDs.  Only renumber them when VIMSPELLVERSION changes!
280 #define SN_REGION       0       // <regionname> section
281 #define SN_CHARFLAGS    1       // charflags section
282 #define SN_MIDWORD      2       // <midword> section
283 #define SN_PREFCOND     3       // <prefcond> section
284 #define SN_REP          4       // REP items section
285 #define SN_SAL          5       // SAL items section
286 #define SN_SOFO         6       // soundfolding section
287 #define SN_MAP          7       // MAP items section
288 #define SN_COMPOUND     8       // compound words section
289 #define SN_SYLLABLE     9       // syllable section
290 #define SN_NOBREAK      10      // NOBREAK section
291 #define SN_SUGFILE      11      // timestamp for .sug file
292 #define SN_REPSAL       12      // REPSAL items section
293 #define SN_WORDS        13      // common words
294 #define SN_NOSPLITSUGS  14      // don't split word for suggestions
295 #define SN_INFO         15      // info section
296 #define SN_NOCOMPOUNDSUGS 16    // don't compound for suggestions
297 #define SN_END          255     // end of sections
298 
299 #define SNF_REQUIRED    1       // <sectionflags>: required section
300 
// Bits used in each <charflags> byte of the SN_CHARFLAGS section.
#define CF_WORD         0x01    // word character
#define CF_UPPER        0x02    // upper-case character
303 
// Message texts shared within this file.  They are wrapped in N_() here;
// e_spell_trunc is passed through _() at the point where it is given.
// e_afftrailing and e_affname take a file name, a line number and a text
// (see their "%s line %d: %s" format).
static char *e_spell_trunc = N_("E758: Truncated spell file");
static char *e_afftrailing = N_("Trailing text in %s line %d: %s");
static char *e_affname = N_("Affix name too long in %s line %d: %s");
static char *msg_compressing = N_("Compressing word tree...");
308 
309 #define MAXLINELEN  500         // Maximum length in bytes of a line in a .aff
310                                 // and .dic file.
// Main structure to store the contents of a ".aff" file.
// The "unsigned" fields hold the ID used for the item named in their comment.
// The hashtables hold affheader_T (prefixes, suffixes) and compitem_T
// (compound flags) entries.
typedef struct afffile_S {
  char_u *af_enc;          // "SET", normalized, alloc'ed string or NULL
  int af_flagtype;              // AFT_CHAR, AFT_LONG, AFT_NUM or AFT_CAPLONG
  unsigned af_rare;             // RARE ID for rare word
  unsigned af_keepcase;         // KEEPCASE ID for keep-case word
  unsigned af_bad;              // BAD ID for banned word
  unsigned af_needaffix;        // NEEDAFFIX ID
  unsigned af_circumfix;        // CIRCUMFIX ID
  unsigned af_needcomp;         // NEEDCOMPOUND ID
  unsigned af_comproot;         // COMPOUNDROOT ID
  unsigned af_compforbid;       // COMPOUNDFORBIDFLAG ID
  unsigned af_comppermit;       // COMPOUNDPERMITFLAG ID
  unsigned af_nosuggest;        // NOSUGGEST ID
  int af_pfxpostpone;           // postpone prefixes without chop string and
                                // without flags
  bool af_ignoreextra;          // IGNOREEXTRA present
  hashtab_T af_pref;            // hashtable for prefixes, affheader_T
  hashtab_T af_suff;            // hashtable for suffixes, affheader_T
  hashtab_T af_comp;            // hashtable for compound flags, compitem_T
} afffile_T;
332 
333 #define AFT_CHAR        0       // flags are one character
334 #define AFT_LONG        1       // flags are two characters
335 #define AFT_CAPLONG     2       // flags are one or two characters
336 #define AFT_NUM         3       // flags are numbers, comma separated
337 
typedef struct affentry_S affentry_T;
// Affix entry from ".aff" file.  Used for prefixes and suffixes.
// Entries for the same affix name/number are chained through "ae_next"; the
// head of the chain is "ah_first" in affheader_T.
struct affentry_S {
  affentry_T *ae_next;         // next affix with same name/number
  char_u *ae_chop;         // text to chop off basic word (can be NULL)
  char_u *ae_add;          // text to add to basic word (can be NULL)
  char_u *ae_flags;        // flags on the affix (can be NULL)
  char_u *ae_cond;         // condition (NULL for ".")
  regprog_T *ae_prog;         // regexp program for ae_cond or NULL
  char ae_compforbid;           // COMPOUNDFORBIDFLAG found
  char ae_comppermit;           // COMPOUNDPERMITFLAG found
};
350 
351 #define AH_KEY_LEN 17          // 2 x 8 bytes + NUL
352 
// Affix header from ".aff" file.  Used for af_pref and af_suff.
// "ah_key" must hold the NUL-terminated affix name, thus at most
// AH_KEY_LEN - 1 bytes of name.  HI2AH() converts a hashtable item back to
// this structure by casting its "hi_key" pointer.
typedef struct affheader_S {
  char_u ah_key[AH_KEY_LEN];    // key for hashtab == name of affix
  unsigned ah_flag;             // affix name as number, uses "af_flagtype"
  int ah_newID;                 // prefix ID after renumbering; 0 if not used
  int ah_combine;               // suffix may combine with prefix
  int ah_follows;               // another affix block should be following
  affentry_T *ah_first;        // first affix entry
} affheader_T;
362 
363 #define HI2AH(hi)   ((affheader_T *)(hi)->hi_key)
364 
// Flag used in compound items.  Stored in the "af_comp" hashtable of
// afffile_T.  HI2CI() converts a hashtable item back to this structure by
// casting its "hi_key" pointer.
typedef struct compitem_S {
  char_u ci_key[AH_KEY_LEN];    // key for hashtab == name of compound
  unsigned ci_flag;             // affix name as number, uses "af_flagtype"
  int ci_newID;                 // affix ID after renumbering.
} compitem_T;
371 
372 #define HI2CI(hi)   ((compitem_T *)(hi)->hi_key)
373 
// Structure that is used to store the items in the word tree.  This avoids
// the need to keep track of each allocated thing, everything is freed all at
// once after ":mkspell" is done.
// Note: "sb_next" must be just before "sb_data" to make sure the alignment of
// "sb_data" is correct for systems where pointers must be aligned on
// pointer-size boundaries and sizeof(pointer) > sizeof(int) (e.g., Sparc).
// Note: "sb_data" is declared with a single element but is used as a longer
// array; presumably each block is allocated with room for SBLOCKSIZE bytes
// of data (the allocation is not in this part of the file -- TODO confirm).
#define  SBLOCKSIZE 16000       // size of sb_data
typedef struct sblock_S sblock_T;
struct sblock_S {
  int sb_used;                  // nr of bytes already in use
  sblock_T *sb_next;         // next block in list
  char_u sb_data[1];            // data, actually longer
};
387 
// A node in the tree.
// Each node stands for one byte of a word.  "wn_child" leads to the next byte
// of the word, "wn_sibling" to an alternative byte at the same position.
// The two unions overlay fields that are documented as being needed at
// different times, to keep the node small.
struct wordnode_S;
typedef struct wordnode_S wordnode_T;
struct wordnode_S {
  union {   // shared to save space
    char_u hashkey[6];          // the hash key, only used while compressing
    int index;                  // index in written nodes (valid after first
                                // round)
  } wn_u1;
  union {   // shared to save space
    wordnode_T *next;           // next node with same hash key
    wordnode_T *wnode;          // parent node that will write this node
  } wn_u2;
  wordnode_T *wn_child;        // child (next byte in word)
  wordnode_T *wn_sibling;      // next sibling (alternate byte in word,
                               //   always sorted)
  int wn_refs;                  // Nr. of references to this node.  Only
                                //   relevant for first node in a list of
                                //   siblings, in following siblings it is
                                //   always one.
  char_u wn_byte;               // Byte for this node. NUL for word end

  // Info for when "wn_byte" is NUL.
  // In PREFIXTREE "wn_region" is used for the prefcondnr.
  // In the soundfolded word tree "wn_flags" has the MSW of the wordnr and
  // "wn_region" the LSW of the wordnr.
  char_u wn_affixID;            // supported/required prefix ID or 0
  uint16_t wn_flags;            // WF_ flags
  short wn_region;              // region mask

#ifdef SPELL_PRINTTREE
  int wn_nr;                    // sequence nr for printing
#endif
};
421 
422 #define WN_MASK  0xffff         // mask relevant bits of "wn_flags"
423 
// Convert a hashtable item to the wordnode_T it refers to, by casting the
// "hi_key" pointer of the item.  The expansion is fully parenthesized, like
// HI2AH() and HI2CI(), so that it can be used inside a larger expression
// such as HI2WN(hi)->wn_child without the cast binding to the wrong operand.
#define HI2WN(hi)    ((wordnode_T *)((hi)->hi_key))
425 
// Info used while reading the spell files.
// Collects the word trees, the memory blocks they are stored in and the items
// found in the .aff file, until everything can be written to the .spl file.
typedef struct spellinfo_S {
  wordnode_T *si_foldroot;     // tree with case-folded words
  long si_foldwcount;           // nr of words in si_foldroot

  wordnode_T *si_keeproot;     // tree with keep-case words
  long si_keepwcount;           // nr of words in si_keeproot

  wordnode_T *si_prefroot;     // tree with postponed prefixes

  long si_sugtree;              // creating the soundfolding trie

  sblock_T *si_blocks;       // memory blocks used
  long si_blocks_cnt;           // memory blocks allocated
  int si_did_emsg;              // TRUE when ran out of memory

  long si_compress_cnt;         // words to add before lowering
                                // compression limit
  wordnode_T *si_first_free;   // List of nodes that have been freed during
                               // compression, linked by "wn_child" field.
  long si_free_count;           // number of nodes in si_first_free
#ifdef SPELL_PRINTTREE
  int si_wordnode_nr;           // sequence nr for nodes
#endif
  buf_T *si_spellbuf;     // buffer used to store soundfold word table

  int si_ascii;                 // handling only ASCII words
  int si_add;                   // addition file
  int si_clear_chartab;             // when TRUE clear char tables
  int si_region;                // region mask
  vimconv_T si_conv;            // for conversion to 'encoding'
  int si_memtot;                // runtime memory used
  int si_verbose;               // verbose messages
  int si_msg_count;             // number of words added since last message
  char_u *si_info;         // info text chars or NULL
  int si_region_count;          // number of regions supported (1 when there
                                // are no regions)
  char_u si_region_name[MAXREGIONS * 2 + 1];
  // region names, two bytes each plus a terminating NUL (used only if
  // si_region_count > 1)

  garray_T si_rep;              // list of fromto_T entries from REP lines
  garray_T si_repsal;           // list of fromto_T entries from REPSAL lines
  garray_T si_sal;              // list of fromto_T entries from SAL lines
  char_u *si_sofofr;       // SOFOFROM text
  char_u *si_sofoto;       // SOFOTO text
  int si_nosugfile;             // NOSUGFILE item found
  int si_nosplitsugs;           // NOSPLITSUGS item found
  int si_nocompoundsugs;        // NOCOMPOUNDSUGS item found
  int si_followup;              // soundsalike: presumably what is written as
                                // SAL_F0LLOWUP in <salflags> (TODO confirm)
  int si_collapse;              // soundsalike: presumably what is written as
                                // SAL_COLLAPSE in <salflags> (TODO confirm)
  hashtab_T si_commonwords;     // hashtable for common words
  time_t si_sugtime;            // timestamp for .sug file
  int si_rem_accents;           // soundsalike: remove accents
  garray_T si_map;              // MAP info concatenated
  char_u *si_midword;      // MIDWORD chars or NULL
  int si_compmax;               // max nr of words for compounding
  int si_compminlen;            // minimal length for compounding
  int si_compsylmax;            // max nr of syllables for compounding
  int si_compoptions;           // COMP_ flags
  garray_T si_comppat;          // CHECKCOMPOUNDPATTERN items, each stored as
                                // a string
  char_u *si_compflags;    // flags used for compounding
  char_u si_nobreak;            // NOBREAK
  char_u *si_syllable;     // syllable string
  garray_T si_prefcond;         // table with conditions for postponed
                                // prefixes, each stored as a string
  int si_newprefID;             // current value for ah_newID
  int si_newcompID;             // current value for compound ID
} spellinfo_T;
496 
497 #ifdef INCLUDE_GENERATED_DECLARATIONS
498 # include "spellfile.c.generated.h"
499 #endif
500 
/// Read exactly n bytes from fd into buf, or leave the calling function
///
/// Each argument is evaluated once.  When fewer than n bytes could be read,
/// exit_code is run and then the function that uses this macro returns.
///
/// @param[out]  buf  Buffer to read to, must be at least n bytes long.
/// @param[in]  n  Amount of bytes to read.
/// @param  fd  FILE* to read from.
/// @param  exit_code  Code to run before returning.
///
/// @return Allows to proceed if everything is OK, returns SP_TRUNCERROR if
///         the end of the file was reached before n bytes were read, returns
///         SP_OTHERERROR if the read fell short for another reason.
#define SPELL_READ_BYTES(buf, n, fd, exit_code) \
  do { \
    const size_t wanted__SPRB = (n); \
    FILE *const stream__SPRB = (fd); \
    char *const dest__SPRB = (buf); \
    if (fread(dest__SPRB, 1, wanted__SPRB, stream__SPRB) != wanted__SPRB) { \
      exit_code; \
      return feof(stream__SPRB) ? SP_TRUNCERROR : SP_OTHERERROR; \
    } \
  } while (0)
521 
/// Like #SPELL_READ_BYTES, but additionally reject data containing a NUL byte
///
/// Each argument is evaluated once.  exit_code is run before returning from
/// the calling function, both when reading fails and when a NUL byte is found.
///
/// @return Allows to proceed if everything is OK, returns SP_TRUNCERROR if
///         there are not enough bytes, returns SP_OTHERERROR if reading failed,
///         returns SP_FORMERROR if read out a NUL byte.
#define SPELL_READ_NONNUL_BYTES(buf, n, fd, exit_code) \
  do { \
    const size_t wanted__SPRNB = (n); \
    FILE *const stream__SPRNB = (fd); \
    char *const dest__SPRNB = (buf); \
    SPELL_READ_BYTES(dest__SPRNB, wanted__SPRNB, stream__SPRNB, exit_code); \
    if (memchr(dest__SPRNB, NUL, wanted__SPRNB) != NULL) { \
      exit_code; \
      return SP_FORMERROR; \
    } \
  } while (0)
538 
539 /// Check that spell file starts with a magic string
540 ///
541 /// Does not check for version of the file.
542 ///
543 /// @param  fd  File to check.
544 ///
545 /// @return 0 in case of success, SP_TRUNCERROR if file contains not enough
546 ///         bytes, SP_FORMERROR if it does not match magic string and
547 ///         SP_OTHERERROR if reading file failed.
spell_check_magic_string(FILE * const fd)548 static inline int spell_check_magic_string(FILE *const fd)
549   FUNC_ATTR_NONNULL_ALL FUNC_ATTR_WARN_UNUSED_RESULT FUNC_ATTR_ALWAYS_INLINE
550 {
551   char buf[VIMSPELLMAGICL];
552   SPELL_READ_BYTES(buf, VIMSPELLMAGICL, fd,; );
553   if (memcmp(buf, VIMSPELLMAGIC, VIMSPELLMAGICL) != 0) {
554     return SP_FORMERROR;
555   }
556   return 0;
557 }
558 
559 /// Load one spell file and store the info into a slang_T.
560 ///
561 /// This is invoked in three ways:
562 /// - From spell_load_cb() to load a spell file for the first time.  "lang" is
563 ///   the language name, "old_lp" is NULL.  Will allocate an slang_T.
564 /// - To reload a spell file that was changed.  "lang" is NULL and "old_lp"
565 ///   points to the existing slang_T.
566 /// - Just after writing a .spl file; it's read back to produce the .sug file.
567 ///   "old_lp" is NULL and "lang" is NULL.  Will allocate an slang_T.
568 ///
569 /// @param silent  no error if file doesn't exist
570 ///
571 /// @return  the slang_T the spell file was loaded into.  NULL for error.
spell_load_file(char_u * fname,char_u * lang,slang_T * old_lp,bool silent)572 slang_T *spell_load_file(char_u *fname, char_u *lang, slang_T *old_lp, bool silent)
573 {
574   FILE *fd;
575   char_u *p;
576   int n;
577   int len;
578   char_u *save_sourcing_name = sourcing_name;
579   linenr_T save_sourcing_lnum = sourcing_lnum;
580   slang_T *lp = NULL;
581   int c = 0;
582   int res;
583 
584   fd = os_fopen((char *)fname, "r");
585   if (fd == NULL) {
586     if (!silent) {
587       semsg(_(e_notopen), fname);
588     } else if (p_verbose > 2) {
589       verbose_enter();
590       smsg((char *)e_notopen, fname);
591       verbose_leave();
592     }
593     goto endFAIL;
594   }
595   if (p_verbose > 2) {
596     verbose_enter();
597     smsg(_("Reading spell file \"%s\""), fname);
598     verbose_leave();
599   }
600 
601   if (old_lp == NULL) {
602     lp = slang_alloc(lang);
603 
604     // Remember the file name, used to reload the file when it's updated.
605     lp->sl_fname = vim_strsave(fname);
606 
607     // Check for .add.spl.
608     lp->sl_add = strstr((char *)path_tail(fname), SPL_FNAME_ADD) != NULL;
609   } else {
610     lp = old_lp;
611   }
612 
613   // Set sourcing_name, so that error messages mention the file name.
614   sourcing_name = fname;
615   sourcing_lnum = 0;
616 
617   // <HEADER>: <fileID>
618   const int scms_ret = spell_check_magic_string(fd);
619   switch (scms_ret) {
620   case SP_FORMERROR:
621   case SP_TRUNCERROR:
622     semsg("%s", _("E757: This does not look like a spell file"));
623     goto endFAIL;
624   case SP_OTHERERROR:
625     semsg(_("E5042: Failed to read spell file %s: %s"),
626           fname, strerror(ferror(fd)));
627     goto endFAIL;
628   case 0:
629     break;
630   }
631   c = getc(fd);                                         // <versionnr>
632   if (c < VIMSPELLVERSION) {
633     emsg(_("E771: Old spell file, needs to be updated"));
634     goto endFAIL;
635   } else if (c > VIMSPELLVERSION) {
636     emsg(_("E772: Spell file is for newer version of Vim"));
637     goto endFAIL;
638   }
639 
640 
641   // <SECTIONS>: <section> ... <sectionend>
642   // <section>: <sectionID> <sectionflags> <sectionlen> (section contents)
643   for (;;) {
644     n = getc(fd);                           // <sectionID> or <sectionend>
645     if (n == SN_END) {
646       break;
647     }
648     c = getc(fd);                                       // <sectionflags>
649     len = get4c(fd);                                    // <sectionlen>
650     if (len < 0) {
651       goto truncerr;
652     }
653 
654     res = 0;
655     switch (n) {
656     case SN_INFO:
657       lp->sl_info = READ_STRING(fd, len);               // <infotext>
658       if (lp->sl_info == NULL) {
659         goto endFAIL;
660       }
661       break;
662 
663     case SN_REGION:
664       res = read_region_section(fd, lp, len);
665       break;
666 
667     case SN_CHARFLAGS:
668       res = read_charflags_section(fd);
669       break;
670 
671     case SN_MIDWORD:
672       lp->sl_midword = READ_STRING(fd, len);            // <midword>
673       if (lp->sl_midword == NULL) {
674         goto endFAIL;
675       }
676       break;
677 
678     case SN_PREFCOND:
679       res = read_prefcond_section(fd, lp);
680       break;
681 
682     case SN_REP:
683       res = read_rep_section(fd, &lp->sl_rep, lp->sl_rep_first);
684       break;
685 
686     case SN_REPSAL:
687       res = read_rep_section(fd, &lp->sl_repsal, lp->sl_repsal_first);
688       break;
689 
690     case SN_SAL:
691       res = read_sal_section(fd, lp);
692       break;
693 
694     case SN_SOFO:
695       res = read_sofo_section(fd, lp);
696       break;
697 
698     case SN_MAP:
699       p = READ_STRING(fd, len);                         // <mapstr>
700       if (p == NULL) {
701         goto endFAIL;
702       }
703       set_map_str(lp, p);
704       xfree(p);
705       break;
706 
707     case SN_WORDS:
708       res = read_words_section(fd, lp, len);
709       break;
710 
711     case SN_SUGFILE:
712       lp->sl_sugtime = get8ctime(fd);                   // <timestamp>
713       break;
714 
715     case SN_NOSPLITSUGS:
716       lp->sl_nosplitsugs = true;
717       break;
718 
719     case SN_NOCOMPOUNDSUGS:
720       lp->sl_nocompoundsugs = true;
721       break;
722 
723     case SN_COMPOUND:
724       res = read_compound(fd, lp, len);
725       break;
726 
727     case SN_NOBREAK:
728       lp->sl_nobreak = true;
729       break;
730 
731     case SN_SYLLABLE:
732       lp->sl_syllable = READ_STRING(fd, len);           // <syllable>
733       if (lp->sl_syllable == NULL) {
734         goto endFAIL;
735       }
736       if (init_syl_tab(lp) == FAIL) {
737         goto endFAIL;
738       }
739       break;
740 
741     default:
742       // Unsupported section.  When it's required give an error
743       // message.  When it's not required skip the contents.
744       if (c & SNF_REQUIRED) {
745         emsg(_("E770: Unsupported section in spell file"));
746         goto endFAIL;
747       }
748       while (--len >= 0) {
749         if (getc(fd) < 0) {
750           goto truncerr;
751         }
752       }
753       break;
754     }
755 someerror:
756     if (res == SP_FORMERROR) {
757       emsg(_(e_format));
758       goto endFAIL;
759     }
760     if (res == SP_TRUNCERROR) {
761 truncerr:
762       emsg(_(e_spell_trunc));
763       goto endFAIL;
764     }
765     if (res == SP_OTHERERROR) {
766       goto endFAIL;
767     }
768   }
769 
770   // <LWORDTREE>
771   res = spell_read_tree(fd, &lp->sl_fbyts, &lp->sl_fbyts_len,
772                         &lp->sl_fidxs, false, 0);
773   if (res != 0) {
774     goto someerror;
775   }
776 
777   // <KWORDTREE>
778   res = spell_read_tree(fd, &lp->sl_kbyts, NULL, &lp->sl_kidxs, false, 0);
779   if (res != 0) {
780     goto someerror;
781   }
782 
783   // <PREFIXTREE>
784   res = spell_read_tree(fd, &lp->sl_pbyts, NULL, &lp->sl_pidxs, true,
785                         lp->sl_prefixcnt);
786   if (res != 0) {
787     goto someerror;
788   }
789 
790   // For a new file link it in the list of spell files.
791   if (old_lp == NULL && lang != NULL) {
792     lp->sl_next = first_lang;
793     first_lang = lp;
794   }
795 
796   goto endOK;
797 
798 endFAIL:
799   if (lang != NULL) {
800     // truncating the name signals the error to spell_load_lang()
801     *lang = NUL;
802   }
803   if (lp != NULL && old_lp == NULL) {
804     slang_free(lp);
805   }
806   lp = NULL;
807 
808 endOK:
809   if (fd != NULL) {
810     fclose(fd);
811   }
812   sourcing_name = save_sourcing_name;
813   sourcing_lnum = save_sourcing_lnum;
814 
815   return lp;
816 }
817 
818 // Fill in the wordcount fields for a trie.
819 // Returns the total number of words.
tree_count_words(char_u * byts,idx_T * idxs)820 static void tree_count_words(char_u *byts, idx_T *idxs)
821 {
822   int depth;
823   idx_T arridx[MAXWLEN];
824   int curi[MAXWLEN];
825   int c;
826   idx_T n;
827   int wordcount[MAXWLEN];
828 
829   arridx[0] = 0;
830   curi[0] = 1;
831   wordcount[0] = 0;
832   depth = 0;
833   while (depth >= 0 && !got_int) {
834     if (curi[depth] > byts[arridx[depth]]) {
835       // Done all bytes at this node, go up one level.
836       idxs[arridx[depth]] = wordcount[depth];
837       if (depth > 0) {
838         wordcount[depth - 1] += wordcount[depth];
839       }
840 
841       --depth;
842       fast_breakcheck();
843     } else {
844       // Do one more byte at this node.
845       n = arridx[depth] + curi[depth];
846       ++curi[depth];
847 
848       c = byts[n];
849       if (c == 0) {
850         // End of word, count it.
851         ++wordcount[depth];
852 
853         // Skip over any other NUL bytes (same word with different
854         // flags).
855         while (byts[n + 1] == 0) {
856           ++n;
857           ++curi[depth];
858         }
859       } else {
860         // Normal char, go one level deeper to count the words.
861         ++depth;
862         arridx[depth] = idxs[n];
863         curi[depth] = 1;
864         wordcount[depth] = 0;
865       }
866     }
867   }
868 }
869 
870 // Load the .sug files for languages that have one and weren't loaded yet.
suggest_load_files(void)871 void suggest_load_files(void)
872 {
873   langp_T *lp;
874   slang_T *slang;
875   char_u *dotp;
876   FILE *fd;
877   char_u buf[MAXWLEN];
878   int i;
879   time_t timestamp;
880   int wcount;
881   int wordnr;
882   garray_T ga;
883   int c;
884 
885   // Do this for all languages that support sound folding.
886   for (int lpi = 0; lpi < curwin->w_s->b_langp.ga_len; ++lpi) {
887     lp = LANGP_ENTRY(curwin->w_s->b_langp, lpi);
888     slang = lp->lp_slang;
889     if (slang->sl_sugtime != 0 && !slang->sl_sugloaded) {
890       // Change ".spl" to ".sug" and open the file.  When the file isn't
891       // found silently skip it.  Do set "sl_sugloaded" so that we
892       // don't try again and again.
893       slang->sl_sugloaded = true;
894 
895       dotp = STRRCHR(slang->sl_fname, '.');
896       if (dotp == NULL || fnamecmp(dotp, ".spl") != 0) {
897         continue;
898       }
899       STRCPY(dotp, ".sug");
900       fd = os_fopen((char *)slang->sl_fname, "r");
901       if (fd == NULL) {
902         goto nextone;
903       }
904 
905       // <SUGHEADER>: <fileID> <versionnr> <timestamp>
906       for (i = 0; i < VIMSUGMAGICL; ++i) {
907         buf[i] = getc(fd);                              // <fileID>
908       }
909       if (STRNCMP(buf, VIMSUGMAGIC, VIMSUGMAGICL) != 0) {
910         semsg(_("E778: This does not look like a .sug file: %s"),
911               slang->sl_fname);
912         goto nextone;
913       }
914       c = getc(fd);                                     // <versionnr>
915       if (c < VIMSUGVERSION) {
916         semsg(_("E779: Old .sug file, needs to be updated: %s"),
917               slang->sl_fname);
918         goto nextone;
919       } else if (c > VIMSUGVERSION) {
920         semsg(_("E780: .sug file is for newer version of Vim: %s"),
921               slang->sl_fname);
922         goto nextone;
923       }
924 
925       // Check the timestamp, it must be exactly the same as the one in
926       // the .spl file.  Otherwise the word numbers won't match.
927       timestamp = get8ctime(fd);                        // <timestamp>
928       if (timestamp != slang->sl_sugtime) {
929         semsg(_("E781: .sug file doesn't match .spl file: %s"),
930               slang->sl_fname);
931         goto nextone;
932       }
933 
934       // <SUGWORDTREE>: <wordtree>
935       // Read the trie with the soundfolded words.
936       if (spell_read_tree(fd, &slang->sl_sbyts, NULL, &slang->sl_sidxs,
937                           false, 0) != 0) {
938 someerror:
939         semsg(_("E782: error while reading .sug file: %s"),
940               slang->sl_fname);
941         slang_clear_sug(slang);
942         goto nextone;
943       }
944 
945       // <SUGTABLE>: <sugwcount> <sugline> ...
946       //
947       // Read the table with word numbers.  We use a file buffer for
948       // this, because it's so much like a file with lines.  Makes it
949       // possible to swap the info and save on memory use.
950       slang->sl_sugbuf = open_spellbuf();
951 
952       // <sugwcount>
953       wcount = get4c(fd);
954       if (wcount < 0) {
955         goto someerror;
956       }
957 
958       // Read all the wordnr lists into the buffer, one NUL terminated
959       // list per line.
960       ga_init(&ga, 1, 100);
961       for (wordnr = 0; wordnr < wcount; ++wordnr) {
962         ga.ga_len = 0;
963         for (;;) {
964           c = getc(fd);                                     // <sugline>
965           if (c < 0) {
966             goto someerror;
967           }
968           GA_APPEND(char_u, &ga, c);
969           if (c == NUL) {
970             break;
971           }
972         }
973         if (ml_append_buf(slang->sl_sugbuf, (linenr_T)wordnr,
974                           ga.ga_data, ga.ga_len, true) == FAIL) {
975           goto someerror;
976         }
977       }
978       ga_clear(&ga);
979 
980       // Need to put word counts in the word tries, so that we can find
981       // a word by its number.
982       tree_count_words(slang->sl_fbyts, slang->sl_fidxs);
983       tree_count_words(slang->sl_sbyts, slang->sl_sidxs);
984 
985 nextone:
986       if (fd != NULL) {
987         fclose(fd);
988       }
989       STRCPY(dotp, ".spl");
990     }
991   }
992 }
993 
994 
995 // Read a length field from "fd" in "cnt_bytes" bytes.
996 // Allocate memory, read the string into it and add a NUL at the end.
997 // Returns NULL when the count is zero.
998 // Sets "*cntp" to SP_*ERROR when there is an error, length of the result
999 // otherwise.
read_cnt_string(FILE * fd,int cnt_bytes,int * cntp)1000 static char_u *read_cnt_string(FILE *fd, int cnt_bytes, int *cntp)
1001 {
1002   int cnt = 0;
1003   char_u *str;
1004 
1005   // read the length bytes, MSB first
1006   for (int i = 0; i < cnt_bytes; i++) {
1007     const int c = getc(fd);
1008 
1009     if (c == EOF) {
1010       *cntp = SP_TRUNCERROR;
1011       return NULL;
1012     }
1013     cnt = (cnt << 8) + (unsigned)c;
1014   }
1015   *cntp = cnt;
1016   if (cnt == 0) {
1017     return NULL;            // nothing to read, return NULL
1018   }
1019   str = READ_STRING(fd, cnt);
1020   if (str == NULL) {
1021     *cntp = SP_OTHERERROR;
1022   }
1023   return str;
1024 }
1025 
1026 // Read SN_REGION: <regionname> ...
1027 // Return SP_*ERROR flags.
read_region_section(FILE * fd,slang_T * lp,int len)1028 static int read_region_section(FILE *fd, slang_T *lp, int len)
1029 {
1030   if (len > MAXREGIONS * 2) {
1031     return SP_FORMERROR;
1032   }
1033   SPELL_READ_NONNUL_BYTES((char *)lp->sl_regions, (size_t)len, fd,; );
1034   lp->sl_regions[len] = NUL;
1035   return 0;
1036 }
1037 
1038 // Read SN_CHARFLAGS section: <charflagslen> <charflags>
1039 //                              <folcharslen> <folchars>
1040 // Return SP_*ERROR flags.
read_charflags_section(FILE * fd)1041 static int read_charflags_section(FILE *fd)
1042 {
1043   char_u *flags;
1044   char_u *fol;
1045   int flagslen, follen;
1046 
1047   // <charflagslen> <charflags>
1048   flags = read_cnt_string(fd, 1, &flagslen);
1049   if (flagslen < 0) {
1050     return flagslen;
1051   }
1052 
1053   // <folcharslen> <folchars>
1054   fol = read_cnt_string(fd, 2, &follen);
1055   if (follen < 0) {
1056     xfree(flags);
1057     return follen;
1058   }
1059 
1060   // Set the word-char flags and fill SPELL_ISUPPER() table.
1061   if (flags != NULL && fol != NULL) {
1062     set_spell_charflags(flags, flagslen, fol);
1063   }
1064 
1065   xfree(flags);
1066   xfree(fol);
1067 
1068   // When <charflagslen> is zero then <fcharlen> must also be zero.
1069   if ((flags == NULL) != (fol == NULL)) {
1070     return SP_FORMERROR;
1071   }
1072   return 0;
1073 }
1074 
1075 // Read SN_PREFCOND section.
1076 // Return SP_*ERROR flags.
read_prefcond_section(FILE * fd,slang_T * lp)1077 static int read_prefcond_section(FILE *fd, slang_T *lp)
1078 {
1079   // <prefcondcnt> <prefcond> ...
1080   const int cnt = get2c(fd);  // <prefcondcnt>
1081   if (cnt <= 0) {
1082     return SP_FORMERROR;
1083   }
1084 
1085   lp->sl_prefprog = xcalloc(cnt, sizeof(regprog_T *));
1086   lp->sl_prefixcnt = cnt;
1087 
1088   for (int i = 0; i < cnt; i++) {
1089     // <prefcond> : <condlen> <condstr>
1090     const int n = getc(fd);  // <condlen>
1091     if (n < 0 || n >= MAXWLEN) {
1092       return SP_FORMERROR;
1093     }
1094 
1095     // When <condlen> is zero we have an empty condition.  Otherwise
1096     // compile the regexp program used to check for the condition.
1097     if (n > 0) {
1098       char buf[MAXWLEN + 1];
1099       buf[0] = '^';  // always match at one position only
1100       SPELL_READ_NONNUL_BYTES(buf + 1, (size_t)n, fd,; );
1101       buf[n + 1] = NUL;
1102       lp->sl_prefprog[i] = vim_regcomp((char_u *)buf, RE_MAGIC | RE_STRING);
1103     }
1104   }
1105   return 0;
1106 }
1107 
1108 // Read REP or REPSAL items section from "fd": <repcount> <rep> ...
1109 // Return SP_*ERROR flags.
read_rep_section(FILE * fd,garray_T * gap,int16_t * first)1110 static int read_rep_section(FILE *fd, garray_T *gap, int16_t *first)
1111 {
1112   int cnt;
1113   fromto_T *ftp;
1114 
1115   cnt = get2c(fd);                                      // <repcount>
1116   if (cnt < 0) {
1117     return SP_TRUNCERROR;
1118   }
1119 
1120   ga_grow(gap, cnt);
1121 
1122   // <rep> : <repfromlen> <repfrom> <reptolen> <repto>
1123   for (; gap->ga_len < cnt; ++gap->ga_len) {
1124     int c;
1125     ftp = &((fromto_T *)gap->ga_data)[gap->ga_len];
1126     ftp->ft_from = read_cnt_string(fd, 1, &c);
1127     if (c < 0) {
1128       return c;
1129     }
1130     if (c == 0) {
1131       return SP_FORMERROR;
1132     }
1133     ftp->ft_to = read_cnt_string(fd, 1, &c);
1134     if (c <= 0) {
1135       xfree(ftp->ft_from);
1136       if (c < 0) {
1137         return c;
1138       }
1139       return SP_FORMERROR;
1140     }
1141   }
1142 
1143   // Fill the first-index table.
1144   for (int i = 0; i < 256; ++i) {
1145     first[i] = -1;
1146   }
1147   for (int i = 0; i < gap->ga_len; ++i) {
1148     ftp = &((fromto_T *)gap->ga_data)[i];
1149     if (first[*ftp->ft_from] == -1) {
1150       first[*ftp->ft_from] = i;
1151     }
1152   }
1153   return 0;
1154 }
1155 
1156 // Read SN_SAL section: <salflags> <salcount> <sal> ...
1157 // Return SP_*ERROR flags.
read_sal_section(FILE * fd,slang_T * slang)1158 static int read_sal_section(FILE *fd, slang_T *slang)
1159 {
1160   int cnt;
1161   garray_T *gap;
1162   salitem_T *smp;
1163   int ccnt;
1164   char_u *p;
1165 
1166   slang->sl_sofo = false;
1167 
1168   const int flags = getc(fd);                   // <salflags>
1169   if (flags & SAL_F0LLOWUP) {
1170     slang->sl_followup = true;
1171   }
1172   if (flags & SAL_COLLAPSE) {
1173     slang->sl_collapse = true;
1174   }
1175   if (flags & SAL_REM_ACCENTS) {
1176     slang->sl_rem_accents = true;
1177   }
1178 
1179   cnt = get2c(fd);                              // <salcount>
1180   if (cnt < 0) {
1181     return SP_TRUNCERROR;
1182   }
1183 
1184   gap = &slang->sl_sal;
1185   ga_init(gap, sizeof(salitem_T), 10);
1186   ga_grow(gap, cnt + 1);
1187 
1188   // <sal> : <salfromlen> <salfrom> <saltolen> <salto>
1189   for (; gap->ga_len < cnt; gap->ga_len++) {
1190     int c = NUL;
1191 
1192     smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
1193     ccnt = getc(fd);                            // <salfromlen>
1194     if (ccnt < 0) {
1195       return SP_TRUNCERROR;
1196     }
1197     p = xmalloc(ccnt + 2);
1198     smp->sm_lead = p;
1199 
1200     // Read up to the first special char into sm_lead.
1201     int i = 0;
1202     for (; i < ccnt; ++i) {
1203       c = getc(fd);                             // <salfrom>
1204       if (vim_strchr((char_u *)"0123456789(-<^$", c) != NULL) {
1205         break;
1206       }
1207       *p++ = c;
1208     }
1209     smp->sm_leadlen = (int)(p - smp->sm_lead);
1210     *p++ = NUL;
1211 
1212     // Put (abc) chars in sm_oneof, if any.
1213     if (c == '(') {
1214       smp->sm_oneof = p;
1215       for (++i; i < ccnt; ++i) {
1216         c = getc(fd);                           // <salfrom>
1217         if (c == ')') {
1218           break;
1219         }
1220         *p++ = c;
1221       }
1222       *p++ = NUL;
1223       if (++i < ccnt) {
1224         c = getc(fd);
1225       }
1226     } else {
1227       smp->sm_oneof = NULL;
1228     }
1229 
1230     // Any following chars go in sm_rules.
1231     smp->sm_rules = p;
1232     if (i < ccnt) {
1233       // store the char we got while checking for end of sm_lead
1234       *p++ = c;
1235     }
1236     i++;
1237     if (i < ccnt) {
1238       SPELL_READ_NONNUL_BYTES(                  // <salfrom>
1239                                                 (char *)p, (size_t)(ccnt - i), fd,
1240                                                 xfree(smp->sm_lead));
1241       p += (ccnt - i);
1242     }
1243     *p++ = NUL;
1244 
1245     // <saltolen> <salto>
1246     smp->sm_to = read_cnt_string(fd, 1, &ccnt);
1247     if (ccnt < 0) {
1248       xfree(smp->sm_lead);
1249       return ccnt;
1250     }
1251 
1252     // convert the multi-byte strings to wide char strings
1253     smp->sm_lead_w = mb_str2wide(smp->sm_lead);
1254     smp->sm_leadlen = mb_charlen(smp->sm_lead);
1255     if (smp->sm_oneof == NULL) {
1256       smp->sm_oneof_w = NULL;
1257     } else {
1258       smp->sm_oneof_w = mb_str2wide(smp->sm_oneof);
1259     }
1260     if (smp->sm_to == NULL) {
1261       smp->sm_to_w = NULL;
1262     } else {
1263       smp->sm_to_w = mb_str2wide(smp->sm_to);
1264     }
1265   }
1266 
1267   if (!GA_EMPTY(gap)) {
1268     // Add one extra entry to mark the end with an empty sm_lead.  Avoids
1269     // that we need to check the index every time.
1270     smp = &((salitem_T *)gap->ga_data)[gap->ga_len];
1271     p = xmalloc(1);
1272     p[0] = NUL;
1273     smp->sm_lead = p;
1274     smp->sm_lead_w = mb_str2wide(smp->sm_lead);
1275     smp->sm_leadlen = 0;
1276     smp->sm_oneof = NULL;
1277     smp->sm_oneof_w = NULL;
1278     smp->sm_rules = p;
1279     smp->sm_to = NULL;
1280     smp->sm_to_w = NULL;
1281     gap->ga_len++;
1282   }
1283 
1284   // Fill the first-index table.
1285   set_sal_first(slang);
1286 
1287   return 0;
1288 }
1289 
1290 // Read SN_WORDS: <word> ...
1291 // Return SP_*ERROR flags.
read_words_section(FILE * fd,slang_T * lp,int len)1292 static int read_words_section(FILE *fd, slang_T *lp, int len)
1293 {
1294   int done = 0;
1295   int i;
1296   int c;
1297   char_u word[MAXWLEN];
1298 
1299   while (done < len) {
1300     // Read one word at a time.
1301     for (i = 0;; ++i) {
1302       c = getc(fd);
1303       if (c == EOF) {
1304         return SP_TRUNCERROR;
1305       }
1306       word[i] = c;
1307       if (word[i] == NUL) {
1308         break;
1309       }
1310       if (i == MAXWLEN - 1) {
1311         return SP_FORMERROR;
1312       }
1313     }
1314 
1315     // Init the count to 10.
1316     count_common_word(lp, word, -1, 10);
1317     done += i + 1;
1318   }
1319   return 0;
1320 }
1321 
1322 // SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
1323 // Return SP_*ERROR flags.
read_sofo_section(FILE * fd,slang_T * slang)1324 static int read_sofo_section(FILE *fd, slang_T *slang)
1325 {
1326   int cnt;
1327   char_u *from, *to;
1328   int res;
1329 
1330   slang->sl_sofo = true;
1331 
1332   // <sofofromlen> <sofofrom>
1333   from = read_cnt_string(fd, 2, &cnt);
1334   if (cnt < 0) {
1335     return cnt;
1336   }
1337 
1338   // <sofotolen> <sofoto>
1339   to = read_cnt_string(fd, 2, &cnt);
1340   if (cnt < 0) {
1341     xfree(from);
1342     return cnt;
1343   }
1344 
1345   // Store the info in slang->sl_sal and/or slang->sl_sal_first.
1346   if (from != NULL && to != NULL) {
1347     res = set_sofo(slang, from, to);
1348   } else if (from != NULL || to != NULL) {
1349     res = SP_FORMERROR;        // only one of two strings is an error
1350   } else {
1351     res = 0;
1352   }
1353 
1354   xfree(from);
1355   xfree(to);
1356   return res;
1357 }
1358 
1359 // Read the compound section from the .spl file:
1360 //      <compmax> <compminlen> <compsylmax> <compoptions> <compflags>
1361 // Returns SP_*ERROR flags.
read_compound(FILE * fd,slang_T * slang,int len)1362 static int read_compound(FILE *fd, slang_T *slang, int len)
1363 {
1364   int todo = len;
1365   int c;
1366   int atstart;
1367   char_u *pat;
1368   char_u *pp;
1369   char_u *cp;
1370   char_u *ap;
1371   char_u *crp;
1372   int cnt;
1373   garray_T *gap;
1374 
1375   if (todo < 2) {
1376     return SP_FORMERROR;        // need at least two bytes
1377   }
1378   --todo;
1379   c = getc(fd);                                         // <compmax>
1380   if (c < 2) {
1381     c = MAXWLEN;
1382   }
1383   slang->sl_compmax = c;
1384 
1385   --todo;
1386   c = getc(fd);                                         // <compminlen>
1387   if (c < 1) {
1388     c = 0;
1389   }
1390   slang->sl_compminlen = c;
1391 
1392   --todo;
1393   c = getc(fd);                                         // <compsylmax>
1394   if (c < 1) {
1395     c = MAXWLEN;
1396   }
1397   slang->sl_compsylmax = c;
1398 
1399   c = getc(fd);                                         // <compoptions>
1400   if (c != 0) {
1401     ungetc(c, fd);          // be backwards compatible with Vim 7.0b
1402   } else {
1403     --todo;
1404     c = getc(fd);           // only use the lower byte for now
1405     --todo;
1406     slang->sl_compoptions = c;
1407 
1408     gap = &slang->sl_comppat;
1409     c = get2c(fd);                                      // <comppatcount>
1410     if (c < 0) {
1411       return SP_TRUNCERROR;
1412     }
1413     todo -= 2;
1414     ga_init(gap, sizeof(char_u *), c);
1415     ga_grow(gap, c);
1416     while (--c >= 0) {
1417       ((char_u **)(gap->ga_data))[gap->ga_len++] =
1418         read_cnt_string(fd, 1, &cnt);
1419       // <comppatlen> <comppattext>
1420       if (cnt < 0) {
1421         return cnt;
1422       }
1423       todo -= cnt + 1;
1424     }
1425   }
1426   if (todo < 0) {
1427     return SP_FORMERROR;
1428   }
1429 
1430   // Turn the COMPOUNDRULE items into a regexp pattern:
1431   // "a[bc]/a*b+" -> "^\(a[bc]\|a*b\+\)$".
1432   // Inserting backslashes may double the length, "^\(\)$<Nul>" is 7 bytes.
1433   // Conversion to utf-8 may double the size.
1434   c = todo * 2 + 7;
1435   c += todo * 2;
1436   pat = xmalloc(c);
1437 
1438   // We also need a list of all flags that can appear at the start and one
1439   // for all flags.
1440   cp = xmalloc(todo + 1);
1441   slang->sl_compstartflags = cp;
1442   *cp = NUL;
1443 
1444   ap = xmalloc(todo + 1);
1445   slang->sl_compallflags = ap;
1446   *ap = NUL;
1447 
1448   // And a list of all patterns in their original form, for checking whether
1449   // compounding may work in match_compoundrule().  This is freed when we
1450   // encounter a wildcard, the check doesn't work then.
1451   crp = xmalloc(todo + 1);
1452   slang->sl_comprules = crp;
1453 
1454   pp = pat;
1455   *pp++ = '^';
1456   *pp++ = '\\';
1457   *pp++ = '(';
1458 
1459   atstart = 1;
1460   while (todo-- > 0) {
1461     c = getc(fd);                                       // <compflags>
1462     if (c == EOF) {
1463       xfree(pat);
1464       return SP_TRUNCERROR;
1465     }
1466 
1467     // Add all flags to "sl_compallflags".
1468     if (vim_strchr((char_u *)"?*+[]/", c) == NULL
1469         && !byte_in_str(slang->sl_compallflags, c)) {
1470       *ap++ = c;
1471       *ap = NUL;
1472     }
1473 
1474     if (atstart != 0) {
1475       // At start of item: copy flags to "sl_compstartflags".  For a
1476       // [abc] item set "atstart" to 2 and copy up to the ']'.
1477       if (c == '[') {
1478         atstart = 2;
1479       } else if (c == ']') {
1480         atstart = 0;
1481       } else {
1482         if (!byte_in_str(slang->sl_compstartflags, c)) {
1483           *cp++ = c;
1484           *cp = NUL;
1485         }
1486         if (atstart == 1) {
1487           atstart = 0;
1488         }
1489       }
1490     }
1491 
1492     // Copy flag to "sl_comprules", unless we run into a wildcard.
1493     if (crp != NULL) {
1494       if (c == '?' || c == '+' || c == '*') {
1495         XFREE_CLEAR(slang->sl_comprules);
1496         crp = NULL;
1497       } else {
1498         *crp++ = c;
1499       }
1500     }
1501 
1502     if (c == '/') {         // slash separates two items
1503       *pp++ = '\\';
1504       *pp++ = '|';
1505       atstart = 1;
1506     } else {              // normal char, "[abc]" and '*' are copied as-is
1507       if (c == '?' || c == '+' || c == '~') {
1508         *pp++ = '\\';               // "a?" becomes "a\?", "a+" becomes "a\+"
1509       }
1510       pp += utf_char2bytes(c, pp);
1511     }
1512   }
1513 
1514   *pp++ = '\\';
1515   *pp++ = ')';
1516   *pp++ = '$';
1517   *pp = NUL;
1518 
1519   if (crp != NULL) {
1520     *crp = NUL;
1521   }
1522 
1523   slang->sl_compprog = vim_regcomp(pat, RE_MAGIC + RE_STRING + RE_STRICT);
1524   xfree(pat);
1525   if (slang->sl_compprog == NULL) {
1526     return SP_FORMERROR;
1527   }
1528 
1529   return 0;
1530 }
1531 
1532 // Set the SOFOFROM and SOFOTO items in language "lp".
1533 // Returns SP_*ERROR flags when there is something wrong.
set_sofo(slang_T * lp,char_u * from,char_u * to)1534 static int set_sofo(slang_T *lp, char_u *from, char_u *to)
1535 {
1536   char_u *s;
1537   char_u *p;
1538 
1539   // Use "sl_sal" as an array with 256 pointers to a list of wide
1540   // characters.  The index is the low byte of the character.
1541   // The list contains from-to pairs with a terminating NUL.
1542   // sl_sal_first[] is used for latin1 "from" characters.
1543   garray_T *gap = &lp->sl_sal;
1544   ga_init(gap, sizeof(int *), 1);
1545   ga_grow(gap, 256);
1546   memset(gap->ga_data, 0, sizeof(int *) * 256);
1547   gap->ga_len = 256;
1548 
1549   // First count the number of items for each list.  Temporarily use
1550   // sl_sal_first[] for this.
1551   for (p = from, s = to; *p != NUL && *s != NUL;) {
1552     const int c = mb_cptr2char_adv((const char_u **)&p);
1553     MB_CPTR_ADV(s);
1554     if (c >= 256) {
1555       lp->sl_sal_first[c & 0xff]++;
1556     }
1557   }
1558   if (*p != NUL || *s != NUL) {  // lengths differ
1559     return SP_FORMERROR;
1560   }
1561 
1562   // Allocate the lists.
1563   for (int i = 0; i < 256; i++) {
1564     if (lp->sl_sal_first[i] > 0) {
1565       p = xmalloc(sizeof(int) * (lp->sl_sal_first[i] * 2 + 1));
1566       ((int **)gap->ga_data)[i] = (int *)p;
1567       *(int *)p = 0;
1568     }
1569   }
1570 
1571   // Put the characters up to 255 in sl_sal_first[] the rest in a sl_sal
1572   // list.
1573   memset(lp->sl_sal_first, 0, sizeof(salfirst_T) * 256);
1574   for (p = from, s = to; *p != NUL && *s != NUL;) {
1575     const int c = mb_cptr2char_adv((const char_u **)&p);
1576     const int i = mb_cptr2char_adv((const char_u **)&s);
1577     if (c >= 256) {
1578       // Append the from-to chars at the end of the list with
1579       // the low byte.
1580       int *inp = ((int **)gap->ga_data)[c & 0xff];
1581       while (*inp != 0) {
1582         inp++;
1583       }
1584       *inp++ = c;                     // from char
1585       *inp++ = i;                     // to char
1586       *inp++ = NUL;                   // NUL at the end
1587     } else {
1588       // mapping byte to char is done in sl_sal_first[]
1589       lp->sl_sal_first[c] = i;
1590     }
1591   }
1592 
1593   return 0;
1594 }
1595 
1596 // Fill the first-index table for "lp".
set_sal_first(slang_T * lp)1597 static void set_sal_first(slang_T *lp)
1598 {
1599   salfirst_T *sfirst;
1600   salitem_T *smp;
1601   int c;
1602   garray_T *gap = &lp->sl_sal;
1603 
1604   sfirst = lp->sl_sal_first;
1605   for (int i = 0; i < 256; ++i) {
1606     sfirst[i] = -1;
1607   }
1608   smp = (salitem_T *)gap->ga_data;
1609   for (int i = 0; i < gap->ga_len; i++) {
1610     // Use the lowest byte of the first character.  For latin1 it's
1611     // the character, for other encodings it should differ for most
1612     // characters.
1613     c = *smp[i].sm_lead_w & 0xff;
1614     if (sfirst[c] == -1) {
1615       sfirst[c] = i;
1616 
1617       // Make sure all entries with this byte are following each
1618       // other.  Move the ones that are in the wrong position.  Do
1619       // keep the same ordering!
1620       while (i + 1 < gap->ga_len
1621              && (*smp[i + 1].sm_lead_w & 0xff) == c) {
1622         // Skip over entry with same index byte.
1623         i++;
1624       }
1625 
1626       for (int n = 1; i + n < gap->ga_len; n++) {
1627         if ((*smp[i + n].sm_lead_w & 0xff) == c) {
1628           salitem_T tsal;
1629 
1630           // Move entry with same index byte after the entries
1631           // we already found.
1632           i++;
1633           n--;
1634           tsal = smp[i + n];
1635           memmove(smp + i + 1, smp + i, sizeof(salitem_T) * n);
1636           smp[i] = tsal;
1637         }
1638       }
1639     }
1640   }
1641 }
1642 
1643 // Turn a multi-byte string into a wide character string.
1644 // Return it in allocated memory.
mb_str2wide(char_u * s)1645 static int *mb_str2wide(char_u *s)
1646 {
1647   int i = 0;
1648 
1649   int *res = xmalloc((mb_charlen(s) + 1) * sizeof(int));
1650   for (char_u *p = s; *p != NUL;) {
1651     res[i++] = mb_ptr2char_adv((const char_u **)&p);
1652   }
1653   res[i] = NUL;
1654 
1655   return res;
1656 }
1657 
1658 /// Reads a tree from the .spl or .sug file.
1659 /// Allocates the memory and stores pointers in "bytsp" and "idxsp".
1660 /// This is skipped when the tree has zero length.
1661 ///
1662 /// @param prefixtree  true for the prefix tree
1663 /// @param prefixcnt  when "prefixtree" is true: prefix count
1664 ///
1665 /// @return  zero when OK, SP_ value for an error.
spell_read_tree(FILE * fd,char_u ** bytsp,long * bytsp_len,idx_T ** idxsp,bool prefixtree,int prefixcnt)1666 static int spell_read_tree(FILE *fd, char_u **bytsp, long *bytsp_len, idx_T **idxsp,
1667                            bool prefixtree, int prefixcnt)
1668   FUNC_ATTR_NONNULL_ARG(1, 2, 4)
1669 {
1670   int idx;
1671   char_u *bp;
1672   idx_T *ip;
1673 
1674   // The tree size was computed when writing the file, so that we can
1675   // allocate it as one long block. <nodecount>
1676   long len = get4c(fd);
1677   if (len < 0) {
1678     return SP_TRUNCERROR;
1679   }
1680   if ((size_t)len >= SIZE_MAX / sizeof(int)) {  // -V547
1681     // Invalid length, multiply with sizeof(int) would overflow.
1682     return SP_FORMERROR;
1683   }
1684   if (len > 0) {
1685     // Allocate the byte array.
1686     bp = xmalloc(len);
1687     *bytsp = bp;
1688     if (bytsp_len != NULL) {
1689       *bytsp_len = len;
1690     }
1691 
1692     // Allocate the index array.
1693     ip = xcalloc(len, sizeof(*ip));
1694     *idxsp = ip;
1695 
1696     // Recursively read the tree and store it in the array.
1697     idx = read_tree_node(fd, bp, ip, len, 0, prefixtree, prefixcnt);
1698     if (idx < 0) {
1699       return idx;
1700     }
1701   }
1702   return 0;
1703 }
1704 
1705 /// Read one row of siblings from the spell file and store it in the byte array
1706 /// "byts" and index array "idxs".  Recursively read the children.
1707 ///
1708 /// NOTE: The code here must match put_node()!
1709 ///
1710 /// Returns the index (>= 0) following the siblings.
1711 /// Returns SP_TRUNCERROR if the file is shorter than expected.
1712 /// Returns SP_FORMERROR if there is a format error.
1713 ///
1714 /// @param maxidx  size of arrays
1715 /// @param startidx  current index in "byts" and "idxs"
1716 /// @param prefixtree  true for reading PREFIXTREE
1717 /// @param maxprefcondnr  maximum for <prefcondnr>
static idx_T read_tree_node(FILE *fd, char_u *byts, idx_T *idxs, int maxidx, idx_T startidx,
                            bool prefixtree, int maxprefcondnr)
{
  // Reads one row of siblings into byts[] and idxs[], starting at "startidx",
  // then recursively reads the children of every non-shared sibling.
  //   maxidx         bound for the indexes used in "byts" and "idxs"
  //   prefixtree     true when the entries use the prefix tree layout
  //   maxprefcondnr  exclusive upper bound for <prefcondnr>
  // Returns the index following everything that was read.  Returns
  // SP_TRUNCERROR when the file ends early and SP_FORMERROR when a value is
  // out of range; the recursion below treats any value < 0 as failure.
  idx_T idx = startidx;
#define SHARED_MASK     0x8000000

  const int len = getc(fd);                             // <siblingcount>
  if (len <= 0) {
    return SP_TRUNCERROR;
  }

  // byts[startidx] holds the count and is followed by "len" entries.
  if (startidx + len >= maxidx) {
    return SP_FORMERROR;
  }
  byts[idx++] = (char_u)len;

  // Read the byte values, flag/region bytes and shared indexes.
  for (int i = 1; i <= len; i++) {
    int c = getc(fd);                                   // <byte>
    if (c < 0) {
      return SP_TRUNCERROR;
    }
    if (c <= BY_SPECIAL) {
      if (c == BY_NOFLAGS && !prefixtree) {
        // No flags, all regions.
        idxs[idx] = 0;
      } else if (c != BY_INDEX) {
        if (prefixtree) {
          // Read the optional pflags byte, the prefix ID and the
          // condition nr.  In idxs[] store the prefix ID in the low
          // byte, the condition index shifted up 8 bits, the flags
          // shifted up 24 bits.
          int pflags = 0;
          if (c == BY_FLAGS) {
            pflags = getc(fd);                          // <pflags>
            if (pflags < 0) {
              return SP_TRUNCERROR;
            }
          }

          const int affixid = getc(fd);                 // <affixID>
          if (affixid < 0) {
            return SP_TRUNCERROR;
          }

          const int condnr = get2c(fd);                 // <prefcondnr>
          if (condnr < 0 || condnr >= maxprefcondnr) {
            return SP_FORMERROR;
          }

          // Combine with unsigned arithmetic: shifting a byte >= 0x80 up
          // 24 bits does not fit in a signed int.
          c = (int)(((unsigned)pflags << 24)
                    | ((unsigned)condnr << 8)
                    | (unsigned)affixid);
        } else {    // c must be BY_FLAGS or BY_FLAGS2
          // Read flags and optional region and prefix ID.  In
          // idxs[] the flags go in the low two bytes, region above
          // that and prefix ID above the region.
          const int kind = c;

          c = getc(fd);                                 // <flags>
          if (c < 0) {
            return SP_TRUNCERROR;
          }
          if (kind == BY_FLAGS2) {
            const int flags2 = getc(fd);                // <flags2>
            if (flags2 < 0) {
              return SP_TRUNCERROR;
            }
            c = (flags2 << 8) + c;
          }
          if (c & WF_REGION) {
            const int region = getc(fd);                // <region>
            if (region < 0) {
              return SP_TRUNCERROR;
            }
            c = (region << 16) + c;
          }
          if (c & WF_AFX) {
            const int affixid = getc(fd);               // <affixID>
            if (affixid < 0) {
              return SP_TRUNCERROR;
            }
            c = (int)(((unsigned)affixid << 24) + (unsigned)c);
          }
        }

        idxs[idx] = c;
        c = 0;    // stored as a zero byte: end-of-word entry
      } else {  // c == BY_INDEX
        // <nodeidx>
        const int n = get3c(fd);
        if (n < 0 || n >= maxidx) {
          return SP_FORMERROR;
        }
        // Mark as shared so the loop below does not read children for it.
        idxs[idx] = n + SHARED_MASK;
        c = getc(fd);                                   // <xbyte>
        if (c < 0) {
          return SP_TRUNCERROR;
        }
      }
    }
    byts[idx++] = (char_u)c;
  }

  // Recursively read the children for non-shared siblings.
  // Skip the end-of-word ones (zero byte value) and the shared ones (and
  // remove SHARED_MASK)
  for (int i = 1; i <= len; i++) {
    if (byts[startidx + i] == 0) {
      continue;
    }
    if (idxs[startidx + i] & SHARED_MASK) {
      idxs[startidx + i] &= ~SHARED_MASK;
    } else {
      idxs[startidx + i] = idx;
      idx = read_tree_node(fd, byts, idxs, maxidx, idx,
                           prefixtree, maxprefcondnr);
      if (idx < 0) {
        // Pass the error code up unchanged.
        return idx;
      }
    }
  }

  return idx;
}
1820 
1821 /// Reload the spell file "fname" if it's loaded.
1822 ///
1823 /// @param added_word  invoked through "zg"
spell_reload_one(char_u * fname,bool added_word)1824 static void spell_reload_one(char_u *fname, bool added_word)
1825 {
1826   slang_T *slang;
1827   bool didit = false;
1828 
1829   for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
1830     if (path_full_compare(fname, slang->sl_fname, false, true) == kEqualFiles) {
1831       slang_clear(slang);
1832       if (spell_load_file(fname, NULL, slang, false) == NULL) {
1833         // reloading failed, clear the language
1834         slang_clear(slang);
1835       }
1836       redraw_all_later(SOME_VALID);
1837       didit = true;
1838     }
1839   }
1840 
1841   // When "zg" was used and the file wasn't loaded yet, should redo
1842   // 'spelllang' to load it now.
1843   if (added_word && !didit) {
1844     did_set_spelllang(curwin);
1845   }
1846 }
1847 
// Functions for ":mkspell".

// In the postponed prefixes tree wn_flags is used to store the WFP_ flags,
// but it must be negative to indicate the prefix tree to tree_add_word().
// Use a negative number with the lower 8 bits zero.
// Parenthesized so the sign cannot bind to a neighbouring token when the
// macro is expanded inside a larger expression.
#define PFX_FLAGS       (-256)

// flags for "condit" argument of store_aff_word()
#define CONDIT_COMB     1       // affix must combine
#define CONDIT_CFIX     2       // affix must have CIRCUMFIX flag
#define CONDIT_SUF      4       // add a suffix for matching flags
#define CONDIT_AFF      8       // word already has an affix

// Tunable parameters for when the tree is compressed.  Filled from the
// 'mkspellmem' option.
static long compress_start = 30000;     // memory / SBLOCKSIZE
static long compress_inc = 100;         // memory / SBLOCKSIZE
static long compress_added = 500000;    // word count
1867 // Check the 'mkspellmem' option.  Return FAIL if it's wrong.
1868 // Sets "sps_flags".
spell_check_msm(void)1869 int spell_check_msm(void)
1870 {
1871   char_u *p = p_msm;
1872   long start = 0;
1873   long incr = 0;
1874   long added = 0;
1875 
1876   if (!ascii_isdigit(*p)) {
1877     return FAIL;
1878   }
1879   // block count = (value * 1024) / SBLOCKSIZE (but avoid overflow)
1880   start = (getdigits_long(&p, true, 0) * 10) / (SBLOCKSIZE / 102);
1881   if (*p != ',') {
1882     return FAIL;
1883   }
1884   p++;
1885   if (!ascii_isdigit(*p)) {
1886     return FAIL;
1887   }
1888   incr = (getdigits_long(&p, true, 0) * 102) / (SBLOCKSIZE / 10);
1889   if (*p != ',') {
1890     return FAIL;
1891   }
1892   p++;
1893   if (!ascii_isdigit(*p)) {
1894     return FAIL;
1895   }
1896   added = getdigits_long(&p, true, 0) * 1024;
1897   if (*p != NUL) {
1898     return FAIL;
1899   }
1900 
1901   if (start == 0 || incr == 0 || added == 0 || incr > start) {
1902     return FAIL;
1903   }
1904 
1905   compress_start = start;
1906   compress_inc = incr;
1907   compress_added = added;
1908   return OK;
1909 }
1910 
1911 #ifdef SPELL_PRINTTREE
// For debugging the tree code: print the current tree in a (more or less)
// readable format, so that we can see what happens when adding a word and/or
// compressing the tree.
// Based on code from Olaf Seibert.
# define PRINTLINESIZE   1000
# define PRINTWIDTH      6

// Format "a1" and "a2" with "fmt" into line buffer "l", starting at the
// column that belongs to "depth".  "l" and "depth" are parenthesized so that
// expression arguments (e.g. "depth + 1") give the intended offset and size.
# define PRINTSOME(l, depth, fmt, a1, a2) \
  vim_snprintf((l) + (depth) * PRINTWIDTH, \
               PRINTLINESIZE - PRINTWIDTH * (depth), fmt, a1, a2)

// The three text lines that make up one printed row of the tree.
static char line1[PRINTLINESIZE];
static char line2[PRINTLINESIZE];
static char line3[PRINTLINESIZE];
1926 
// Reset "wn_u1.index" in "node", in all of its siblings and, recursively, in
// everything reachable through their "wn_child" links.
static void spell_clear_flags(wordnode_T *node)
{
  wordnode_T *cur = node;

  while (cur != NULL) {
    cur->wn_u1.index = FALSE;
    spell_clear_flags(cur->wn_child);
    cur = cur->wn_sibling;
  }
}
1936 
// Print "node" at indent level "depth", followed by its children (one level
// deeper) and its siblings (same level).  "wn_u1.index" is used to mark nodes
// that were printed; such a node is only shown as a "(nr)" reference.
static void spell_print_node(wordnode_T *node, int depth)
{
  if (node->wn_u1.index) {
    // Done this node before, print the reference.
    PRINTSOME(line1, depth, "(%d)", node->wn_nr, 0);
    PRINTSOME(line2, depth, "    ", 0, 0);
    PRINTSOME(line3, depth, "    ", 0, 0);
    msg((char_u *)line1);
    msg((char_u *)line2);
    msg((char_u *)line3);
    return;
  }

  node->wn_u1.index = TRUE;

  const bool end_of_word = (node->wn_byte == NUL);
  const bool has_child = (node->wn_child != NULL);
  const bool has_sibling = (node->wn_sibling != NULL);

  // First line: the byte of this node.
  if (end_of_word) {
    PRINTSOME(line1, depth, " $    ", 0, 0);
  } else if (has_child) {
    PRINTSOME(line1, depth, " %c -> ", node->wn_byte, 0);
  } else {
    // Cannot happen?
    PRINTSOME(line1, depth, " %c ???", node->wn_byte, 0);
  }

  // Second line: node number and reference count.
  PRINTSOME(line2, depth, "%d/%d    ", node->wn_nr, node->wn_refs);

  // Third line: a bar when a sibling follows.
  if (has_sibling) {
    PRINTSOME(line3, depth, " |    ", 0, 0);
  } else {
    PRINTSOME(line3, depth, "      ", 0, 0);
  }

  if (end_of_word) {
    // End of a word: the row is complete, output it.
    msg((char_u *)line1);
    msg((char_u *)line2);
    msg((char_u *)line3);
  } else if (has_child) {
    // do the children
    spell_print_node(node->wn_child, depth + 1);
  }

  // do the siblings
  if (has_sibling) {
    // get rid of all parent details except |
    STRCPY(line1, line3);
    STRCPY(line2, line3);
    spell_print_node(node->wn_sibling, depth);
  }
}
1989 
// Print the whole tree starting at "root".  Does nothing when "root" is NULL.
static void spell_print_tree(wordnode_T *root)
{
  if (root == NULL) {
    return;
  }

  // Clear the "wn_u1.index" fields, used to remember what has been
  // done.
  spell_clear_flags(root);

  // Recursively print the tree.
  spell_print_node(root, 0);
}
2001 
2002 #endif // SPELL_PRINTTREE
2003 
2004 // Reads the affix file "fname".
2005 // Returns an afffile_T, NULL for complete failure.
spell_read_aff(spellinfo_T * spin,char_u * fname)2006 static afffile_T *spell_read_aff(spellinfo_T *spin, char_u *fname)
2007 {
2008   FILE *fd;
2009   char_u rline[MAXLINELEN];
2010   char_u *line;
2011   char_u *pc = NULL;
2012 #define MAXITEMCNT  30
2013   char_u *(items[MAXITEMCNT]);
2014   int itemcnt;
2015   char_u *p;
2016   int lnum = 0;
2017   affheader_T *cur_aff = NULL;
2018   bool did_postpone_prefix = false;
2019   int aff_todo = 0;
2020   hashtab_T *tp;
2021   char_u *low = NULL;
2022   char_u *fol = NULL;
2023   char_u *upp = NULL;
2024   int do_rep;
2025   int do_repsal;
2026   int do_sal;
2027   int do_mapline;
2028   bool found_map = false;
2029   hashitem_T *hi;
2030   int l;
2031   int compminlen = 0;                   // COMPOUNDMIN value
2032   int compsylmax = 0;                   // COMPOUNDSYLMAX value
2033   int compoptions = 0;                  // COMP_ flags
2034   int compmax = 0;                      // COMPOUNDWORDMAX value
2035   char_u *compflags = NULL;        // COMPOUNDFLAG and COMPOUNDRULE
2036                                    // concatenated
2037   char_u *midword = NULL;          // MIDWORD value
2038   char_u *syllable = NULL;         // SYLLABLE value
2039   char_u *sofofrom = NULL;         // SOFOFROM value
2040   char_u *sofoto = NULL;           // SOFOTO value
2041 
2042   // Open the file.
2043   fd = os_fopen((char *)fname, "r");
2044   if (fd == NULL) {
2045     semsg(_(e_notopen), fname);
2046     return NULL;
2047   }
2048 
2049   vim_snprintf((char *)IObuff, IOSIZE, _("Reading affix file %s..."), fname);
2050   spell_message(spin, IObuff);
2051 
2052   // Only do REP lines when not done in another .aff file already.
2053   do_rep = GA_EMPTY(&spin->si_rep);
2054 
2055   // Only do REPSAL lines when not done in another .aff file already.
2056   do_repsal = GA_EMPTY(&spin->si_repsal);
2057 
2058   // Only do SAL lines when not done in another .aff file already.
2059   do_sal = GA_EMPTY(&spin->si_sal);
2060 
2061   // Only do MAP lines when not done in another .aff file already.
2062   do_mapline = GA_EMPTY(&spin->si_map);
2063 
2064   // Allocate and init the afffile_T structure.
2065   afffile_T *aff = getroom(spin, sizeof(*aff), true);
2066   hash_init(&aff->af_pref);
2067   hash_init(&aff->af_suff);
2068   hash_init(&aff->af_comp);
2069 
2070   // Read all the lines in the file one by one.
2071   while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int) {
2072     line_breakcheck();
2073     ++lnum;
2074 
2075     // Skip comment lines.
2076     if (*rline == '#') {
2077       continue;
2078     }
2079 
2080     // Convert from "SET" to 'encoding' when needed.
2081     xfree(pc);
2082     if (spin->si_conv.vc_type != CONV_NONE) {
2083       pc = string_convert(&spin->si_conv, rline, NULL);
2084       if (pc == NULL) {
2085         smsg(_("Conversion failure for word in %s line %d: %s"),
2086              fname, lnum, rline);
2087         continue;
2088       }
2089       line = pc;
2090     } else {
2091       pc = NULL;
2092       line = rline;
2093     }
2094 
2095     // Split the line up in white separated items.  Put a NUL after each
2096     // item.
2097     itemcnt = 0;
2098     for (p = line;;) {
2099       while (*p != NUL && *p <= ' ') {  // skip white space and CR/NL
2100         ++p;
2101       }
2102       if (*p == NUL) {
2103         break;
2104       }
2105       if (itemcnt == MAXITEMCNT) {          // too many items
2106         break;
2107       }
2108       items[itemcnt++] = p;
2109       // A few items have arbitrary text argument, don't split them.
2110       if (itemcnt == 2 && spell_info_item(items[0])) {
2111         while (*p >= ' ' || *p == TAB) {  // skip until CR/NL
2112           ++p;
2113         }
2114       } else {
2115         while (*p > ' ') {  // skip until white space or CR/NL
2116           ++p;
2117         }
2118       }
2119       if (*p == NUL) {
2120         break;
2121       }
2122       *p++ = NUL;
2123     }
2124 
2125     // Handle non-empty lines.
2126     if (itemcnt > 0) {
2127       if (is_aff_rule(items, itemcnt, "SET", 2) && aff->af_enc == NULL) {
2128         // Setup for conversion from "ENC" to 'encoding'.
2129         aff->af_enc = enc_canonize(items[1]);
2130         if (!spin->si_ascii
2131             && convert_setup(&spin->si_conv, aff->af_enc,
2132                              p_enc) == FAIL) {
2133           smsg(_("Conversion in %s not supported: from %s to %s"),
2134                fname, aff->af_enc, p_enc);
2135         }
2136         spin->si_conv.vc_fail = true;
2137       } else if (is_aff_rule(items, itemcnt, "FLAG", 2)
2138                  && aff->af_flagtype == AFT_CHAR) {
2139         if (STRCMP(items[1], "long") == 0) {
2140           aff->af_flagtype = AFT_LONG;
2141         } else if (STRCMP(items[1], "num") == 0) {
2142           aff->af_flagtype = AFT_NUM;
2143         } else if (STRCMP(items[1], "caplong") == 0) {
2144           aff->af_flagtype = AFT_CAPLONG;
2145         } else {
2146           smsg(_("Invalid value for FLAG in %s line %d: %s"),
2147                fname, lnum, items[1]);
2148         }
2149         if (aff->af_rare != 0
2150             || aff->af_keepcase != 0
2151             || aff->af_bad != 0
2152             || aff->af_needaffix != 0
2153             || aff->af_circumfix != 0
2154             || aff->af_needcomp != 0
2155             || aff->af_comproot != 0
2156             || aff->af_nosuggest != 0
2157             || compflags != NULL
2158             || aff->af_suff.ht_used > 0
2159             || aff->af_pref.ht_used > 0) {
2160           smsg(_("FLAG after using flags in %s line %d: %s"),
2161                fname, lnum, items[1]);
2162         }
2163       } else if (spell_info_item(items[0]) && itemcnt > 1) {
2164         p = getroom(spin,
2165                     (spin->si_info == NULL ? 0 : STRLEN(spin->si_info))
2166                     + STRLEN(items[0])
2167                     + STRLEN(items[1]) + 3, false);
2168         if (spin->si_info != NULL) {
2169           STRCPY(p, spin->si_info);
2170           STRCAT(p, "\n");
2171         }
2172         STRCAT(p, items[0]);
2173         STRCAT(p, " ");
2174         STRCAT(p, items[1]);
2175         spin->si_info = p;
2176       } else if (is_aff_rule(items, itemcnt, "MIDWORD", 2)
2177                  && midword == NULL) {
2178         midword = getroom_save(spin, items[1]);
2179       } else if (is_aff_rule(items, itemcnt, "TRY", 2)) {
2180         // ignored, we look in the tree for what chars may appear
2181       }
2182       // TODO: remove "RAR" later
2183       else if ((is_aff_rule(items, itemcnt, "RAR", 2)
2184                 || is_aff_rule(items, itemcnt, "RARE", 2))
2185                && aff->af_rare == 0) {
2186         aff->af_rare = affitem2flag(aff->af_flagtype, items[1],
2187                                     fname, lnum);
2188       }
2189       // TODO: remove "KEP" later
2190       else if ((is_aff_rule(items, itemcnt, "KEP", 2)
2191                 || is_aff_rule(items, itemcnt, "KEEPCASE", 2))
2192                && aff->af_keepcase == 0) {
2193         aff->af_keepcase = affitem2flag(aff->af_flagtype, items[1],
2194                                         fname, lnum);
2195       } else if ((is_aff_rule(items, itemcnt, "BAD", 2)
2196                   || is_aff_rule(items, itemcnt, "FORBIDDENWORD", 2))
2197                  && aff->af_bad == 0) {
2198         aff->af_bad = affitem2flag(aff->af_flagtype, items[1],
2199                                    fname, lnum);
2200       } else if (is_aff_rule(items, itemcnt, "NEEDAFFIX", 2)
2201                  && aff->af_needaffix == 0) {
2202         aff->af_needaffix = affitem2flag(aff->af_flagtype, items[1],
2203                                          fname, lnum);
2204       } else if (is_aff_rule(items, itemcnt, "CIRCUMFIX", 2)
2205                  && aff->af_circumfix == 0) {
2206         aff->af_circumfix = affitem2flag(aff->af_flagtype, items[1],
2207                                          fname, lnum);
2208       } else if (is_aff_rule(items, itemcnt, "NOSUGGEST", 2)
2209                  && aff->af_nosuggest == 0) {
2210         aff->af_nosuggest = affitem2flag(aff->af_flagtype, items[1],
2211                                          fname, lnum);
2212       } else if ((is_aff_rule(items, itemcnt, "NEEDCOMPOUND", 2)
2213                   || is_aff_rule(items, itemcnt, "ONLYINCOMPOUND", 2))
2214                  && aff->af_needcomp == 0) {
2215         aff->af_needcomp = affitem2flag(aff->af_flagtype, items[1],
2216                                         fname, lnum);
2217       } else if (is_aff_rule(items, itemcnt, "COMPOUNDROOT", 2)
2218                  && aff->af_comproot == 0) {
2219         aff->af_comproot = affitem2flag(aff->af_flagtype, items[1],
2220                                         fname, lnum);
2221       } else if (is_aff_rule(items, itemcnt, "COMPOUNDFORBIDFLAG", 2)
2222                  && aff->af_compforbid == 0) {
2223         aff->af_compforbid = affitem2flag(aff->af_flagtype, items[1],
2224                                           fname, lnum);
2225         if (aff->af_pref.ht_used > 0) {
2226           smsg(_("Defining COMPOUNDFORBIDFLAG after PFX item may give wrong results in %s line %d"),
2227                fname, lnum);
2228         }
2229       } else if (is_aff_rule(items, itemcnt, "COMPOUNDPERMITFLAG", 2)
2230                  && aff->af_comppermit == 0) {
2231         aff->af_comppermit = affitem2flag(aff->af_flagtype, items[1],
2232                                           fname, lnum);
2233         if (aff->af_pref.ht_used > 0) {
2234           smsg(_("Defining COMPOUNDPERMITFLAG after PFX item may give wrong results in %s line %d"),
2235                fname, lnum);
2236         }
2237       } else if (is_aff_rule(items, itemcnt, "COMPOUNDFLAG", 2)
2238                  && compflags == NULL) {
2239         // Turn flag "c" into COMPOUNDRULE compatible string "c+",
2240         // "Na" into "Na+", "1234" into "1234+".
2241         p = getroom(spin, STRLEN(items[1]) + 2, false);
2242         STRCPY(p, items[1]);
2243         STRCAT(p, "+");
2244         compflags = p;
2245       } else if (is_aff_rule(items, itemcnt, "COMPOUNDRULES", 2)) {
2246         // We don't use the count, but do check that it's a number and
2247         // not COMPOUNDRULE mistyped.
2248         if (atoi((char *)items[1]) == 0) {
2249           smsg(_("Wrong COMPOUNDRULES value in %s line %d: %s"),
2250                fname, lnum, items[1]);
2251         }
2252       } else if (is_aff_rule(items, itemcnt, "COMPOUNDRULE", 2)) {
2253         // Don't use the first rule if it is a number.
2254         if (compflags != NULL || *skipdigits(items[1]) != NUL) {
2255           // Concatenate this string to previously defined ones,
2256           // using a slash to separate them.
2257           l = (int)STRLEN(items[1]) + 1;
2258           if (compflags != NULL) {
2259             l += (int)STRLEN(compflags) + 1;
2260           }
2261           p = getroom(spin, l, false);
2262           if (compflags != NULL) {
2263             STRCPY(p, compflags);
2264             STRCAT(p, "/");
2265           }
2266           STRCAT(p, items[1]);
2267           compflags = p;
2268         }
2269       } else if (is_aff_rule(items, itemcnt, "COMPOUNDWORDMAX", 2)
2270                  && compmax == 0) {
2271         compmax = atoi((char *)items[1]);
2272         if (compmax == 0) {
2273           smsg(_("Wrong COMPOUNDWORDMAX value in %s line %d: %s"),
2274                fname, lnum, items[1]);
2275         }
2276       } else if (is_aff_rule(items, itemcnt, "COMPOUNDMIN", 2)
2277                  && compminlen == 0) {
2278         compminlen = atoi((char *)items[1]);
2279         if (compminlen == 0) {
2280           smsg(_("Wrong COMPOUNDMIN value in %s line %d: %s"),
2281                fname, lnum, items[1]);
2282         }
2283       } else if (is_aff_rule(items, itemcnt, "COMPOUNDSYLMAX", 2)
2284                  && compsylmax == 0) {
2285         compsylmax = atoi((char *)items[1]);
2286         if (compsylmax == 0) {
2287           smsg(_("Wrong COMPOUNDSYLMAX value in %s line %d: %s"),
2288                fname, lnum, items[1]);
2289         }
2290       } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDDUP", 1)) {
2291         compoptions |= COMP_CHECKDUP;
2292       } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDREP", 1)) {
2293         compoptions |= COMP_CHECKREP;
2294       } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDCASE", 1)) {
2295         compoptions |= COMP_CHECKCASE;
2296       } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDTRIPLE", 1)) {
2297         compoptions |= COMP_CHECKTRIPLE;
2298       } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDPATTERN", 2)) {
2299         if (atoi((char *)items[1]) == 0) {
2300           smsg(_("Wrong CHECKCOMPOUNDPATTERN value in %s line %d: %s"),
2301                fname, lnum, items[1]);
2302         }
2303       } else if (is_aff_rule(items, itemcnt, "CHECKCOMPOUNDPATTERN", 3)) {
2304         garray_T *gap = &spin->si_comppat;
2305         int i;
2306 
2307         // Only add the couple if it isn't already there.
2308         for (i = 0; i < gap->ga_len - 1; i += 2) {
2309           if (STRCMP(((char_u **)(gap->ga_data))[i], items[1]) == 0
2310               && STRCMP(((char_u **)(gap->ga_data))[i + 1],
2311                         items[2]) == 0) {
2312             break;
2313           }
2314         }
2315         if (i >= gap->ga_len) {
2316           ga_grow(gap, 2);
2317           ((char_u **)(gap->ga_data))[gap->ga_len++]
2318             = getroom_save(spin, items[1]);
2319           ((char_u **)(gap->ga_data))[gap->ga_len++]
2320             = getroom_save(spin, items[2]);
2321         }
2322       } else if (is_aff_rule(items, itemcnt, "SYLLABLE", 2)
2323                  && syllable == NULL) {
2324         syllable = getroom_save(spin, items[1]);
2325       } else if (is_aff_rule(items, itemcnt, "NOBREAK", 1)) {
2326         spin->si_nobreak = true;
2327       } else if (is_aff_rule(items, itemcnt, "NOSPLITSUGS", 1)) {
2328         spin->si_nosplitsugs = true;
2329       } else if (is_aff_rule(items, itemcnt, "NOCOMPOUNDSUGS", 1)) {
2330         spin->si_nocompoundsugs = true;
2331       } else if (is_aff_rule(items, itemcnt, "NOSUGFILE", 1)) {
2332         spin->si_nosugfile = true;
2333       } else if (is_aff_rule(items, itemcnt, "PFXPOSTPONE", 1)) {
2334         aff->af_pfxpostpone = true;
2335       } else if (is_aff_rule(items, itemcnt, "IGNOREEXTRA", 1)) {
2336         aff->af_ignoreextra = true;
2337       } else if ((STRCMP(items[0], "PFX") == 0
2338                   || STRCMP(items[0], "SFX") == 0)
2339                  && aff_todo == 0
2340                  && itemcnt >= 4) {
2341         int lasti = 4;
2342         char_u key[AH_KEY_LEN];
2343 
2344         if (*items[0] == 'P') {
2345           tp = &aff->af_pref;
2346         } else {
2347           tp = &aff->af_suff;
2348         }
2349 
2350         // Myspell allows the same affix name to be used multiple
2351         // times.  The affix files that do this have an undocumented
2352         // "S" flag on all but the last block, thus we check for that
2353         // and store it in ah_follows.
2354         STRLCPY(key, items[1], AH_KEY_LEN);
2355         hi = hash_find(tp, key);
2356         if (!HASHITEM_EMPTY(hi)) {
2357           cur_aff = HI2AH(hi);
2358           if (cur_aff->ah_combine != (*items[2] == 'Y')) {
2359             smsg(_("Different combining flag in continued affix block in %s line %d: %s"),
2360                  fname, lnum, items[1]);
2361           }
2362           if (!cur_aff->ah_follows) {
2363             smsg(_("Duplicate affix in %s line %d: %s"),
2364                  fname, lnum, items[1]);
2365           }
2366         } else {
2367           // New affix letter.
2368           cur_aff = getroom(spin, sizeof(*cur_aff), true);
2369           cur_aff->ah_flag = affitem2flag(aff->af_flagtype, items[1],
2370                                           fname, lnum);
2371           if (cur_aff->ah_flag == 0 || STRLEN(items[1]) >= AH_KEY_LEN) {
2372             break;
2373           }
2374           if (cur_aff->ah_flag == aff->af_bad
2375               || cur_aff->ah_flag == aff->af_rare
2376               || cur_aff->ah_flag == aff->af_keepcase
2377               || cur_aff->ah_flag == aff->af_needaffix
2378               || cur_aff->ah_flag == aff->af_circumfix
2379               || cur_aff->ah_flag == aff->af_nosuggest
2380               || cur_aff->ah_flag == aff->af_needcomp
2381               || cur_aff->ah_flag == aff->af_comproot) {
2382             smsg(_("Affix also used for "
2383                    "BAD/RARE/KEEPCASE/NEEDAFFIX/NEEDCOMPOUND/NOSUGGEST"
2384                    "in %s line %d: %s"),
2385                  fname, lnum, items[1]);
2386           }
2387           STRCPY(cur_aff->ah_key, items[1]);
2388           hash_add(tp, cur_aff->ah_key);
2389 
2390           cur_aff->ah_combine = (*items[2] == 'Y');
2391         }
2392 
2393         // Check for the "S" flag, which apparently means that another
2394         // block with the same affix name is following.
2395         if (itemcnt > lasti && STRCMP(items[lasti], "S") == 0) {
2396           ++lasti;
2397           cur_aff->ah_follows = true;
2398         } else {
2399           cur_aff->ah_follows = false;
2400         }
2401 
2402         // Myspell allows extra text after the item, but that might
2403         // mean mistakes go unnoticed.  Require a comment-starter,
2404         // unless IGNOREEXTRA is used.  Hunspell uses a "-" item.
2405         if (itemcnt > lasti
2406             && !aff->af_ignoreextra
2407             && *items[lasti] != '#') {
2408           smsg(_(e_afftrailing), fname, lnum, items[lasti]);
2409         }
2410 
2411         if (STRCMP(items[2], "Y") != 0 && STRCMP(items[2], "N") != 0) {
2412           smsg(_("Expected Y or N in %s line %d: %s"),
2413                fname, lnum, items[2]);
2414         }
2415 
2416         if (*items[0] == 'P' && aff->af_pfxpostpone) {
2417           if (cur_aff->ah_newID == 0) {
2418             // Use a new number in the .spl file later, to be able
2419             // to handle multiple .aff files.
2420             check_renumber(spin);
2421             cur_aff->ah_newID = ++spin->si_newprefID;
2422 
2423             // We only really use ah_newID if the prefix is
2424             // postponed.  We know that only after handling all
2425             // the items.
2426             did_postpone_prefix = false;
2427           } else {
2428             // Did use the ID in a previous block.
2429             did_postpone_prefix = true;
2430           }
2431         }
2432 
2433         aff_todo = atoi((char *)items[3]);
2434       } else if ((STRCMP(items[0], "PFX") == 0
2435                   || STRCMP(items[0], "SFX") == 0)
2436                  && aff_todo > 0
2437                  && STRCMP(cur_aff->ah_key, items[1]) == 0
2438                  && itemcnt >= 5) {
2439         affentry_T *aff_entry;
2440         bool upper = false;
2441         int lasti = 5;
2442 
2443         // Myspell allows extra text after the item, but that might
2444         // mean mistakes go unnoticed.  Require a comment-starter.
2445         // Hunspell uses a "-" item.
2446         if (itemcnt > lasti && *items[lasti] != '#'
2447             && (STRCMP(items[lasti], "-") != 0
2448                 || itemcnt != lasti + 1)) {
2449           smsg(_(e_afftrailing), fname, lnum, items[lasti]);
2450         }
2451 
2452         // New item for an affix letter.
2453         aff_todo--;
2454         aff_entry = getroom(spin, sizeof(*aff_entry), true);
2455 
2456         if (STRCMP(items[2], "0") != 0) {
2457           aff_entry->ae_chop = getroom_save(spin, items[2]);
2458         }
2459         if (STRCMP(items[3], "0") != 0) {
2460           aff_entry->ae_add = getroom_save(spin, items[3]);
2461 
2462           // Recognize flags on the affix: abcd/XYZ
2463           aff_entry->ae_flags = vim_strchr(aff_entry->ae_add, '/');
2464           if (aff_entry->ae_flags != NULL) {
2465             *aff_entry->ae_flags++ = NUL;
2466             aff_process_flags(aff, aff_entry);
2467           }
2468         }
2469 
2470         // Don't use an affix entry with non-ASCII characters when
2471         // "spin->si_ascii" is true.
2472         if (!spin->si_ascii || !(has_non_ascii(aff_entry->ae_chop)
2473                                  || has_non_ascii(aff_entry->ae_add))) {
2474           aff_entry->ae_next = cur_aff->ah_first;
2475           cur_aff->ah_first = aff_entry;
2476 
2477           if (STRCMP(items[4], ".") != 0) {
2478             char_u buf[MAXLINELEN];
2479 
2480             aff_entry->ae_cond = getroom_save(spin, items[4]);
2481             if (*items[0] == 'P') {
2482               sprintf((char *)buf, "^%s", items[4]);
2483             } else {
2484               sprintf((char *)buf, "%s$", items[4]);
2485             }
2486             aff_entry->ae_prog = vim_regcomp(buf,
2487                                              RE_MAGIC + RE_STRING + RE_STRICT);
2488             if (aff_entry->ae_prog == NULL) {
2489               smsg(_("Broken condition in %s line %d: %s"),
2490                    fname, lnum, items[4]);
2491             }
2492           }
2493 
2494           // For postponed prefixes we need an entry in si_prefcond
2495           // for the condition.  Use an existing one if possible.
2496           // Can't be done for an affix with flags, ignoring
2497           // COMPOUNDFORBIDFLAG and COMPOUNDPERMITFLAG.
2498           if (*items[0] == 'P' && aff->af_pfxpostpone
2499               && aff_entry->ae_flags == NULL) {
2500             // When the chop string is one lower-case letter and
2501             // the add string ends in the upper-case letter we set
2502             // the "upper" flag, clear "ae_chop" and remove the
2503             // letters from "ae_add".  The condition must either
2504             // be empty or start with the same letter.
2505             if (aff_entry->ae_chop != NULL
2506                 && aff_entry->ae_add != NULL
2507                 && aff_entry->ae_chop[utfc_ptr2len(aff_entry->ae_chop)] ==
2508                 NUL) {
2509               int c, c_up;
2510 
2511               c = utf_ptr2char(aff_entry->ae_chop);
2512               c_up = SPELL_TOUPPER(c);
2513               if (c_up != c
2514                   && (aff_entry->ae_cond == NULL
2515                       || utf_ptr2char(aff_entry->ae_cond) == c)) {
2516                 p = aff_entry->ae_add
2517                     + STRLEN(aff_entry->ae_add);
2518                 MB_PTR_BACK(aff_entry->ae_add, p);
2519                 if (utf_ptr2char(p) == c_up) {
2520                   upper = true;
2521                   aff_entry->ae_chop = NULL;
2522                   *p = NUL;
2523 
2524                   // The condition is matched with the
2525                   // actual word, thus must check for the
2526                   // upper-case letter.
2527                   if (aff_entry->ae_cond != NULL) {
2528                     char_u buf[MAXLINELEN];
2529                     onecap_copy(items[4], buf, true);
2530                     aff_entry->ae_cond = getroom_save(spin, buf);
2531                     if (aff_entry->ae_cond != NULL) {
2532                       sprintf((char *)buf, "^%s",
2533                               aff_entry->ae_cond);
2534                       vim_regfree(aff_entry->ae_prog);
2535                       aff_entry->ae_prog = vim_regcomp(buf, RE_MAGIC + RE_STRING);
2536                     }
2537                   }
2538                 }
2539               }
2540             }
2541 
2542             if (aff_entry->ae_chop == NULL) {
2543               int idx;
2544               char_u **pp;
2545               int n;
2546 
2547               // Find a previously used condition.
2548               for (idx = spin->si_prefcond.ga_len - 1; idx >= 0;
2549                    --idx) {
2550                 p = ((char_u **)spin->si_prefcond.ga_data)[idx];
2551                 if (str_equal(p, aff_entry->ae_cond)) {
2552                   break;
2553                 }
2554               }
2555               if (idx < 0) {
2556                 // Not found, add a new condition.
2557                 idx = spin->si_prefcond.ga_len;
2558                 pp = GA_APPEND_VIA_PTR(char_u *, &spin->si_prefcond);
2559                 *pp = (aff_entry->ae_cond == NULL) ?
2560                       NULL : getroom_save(spin, aff_entry->ae_cond);
2561               }
2562 
2563               // Add the prefix to the prefix tree.
2564               if (aff_entry->ae_add == NULL) {
2565                 p = (char_u *)"";
2566               } else {
2567                 p = aff_entry->ae_add;
2568               }
2569 
2570               // PFX_FLAGS is a negative number, so that
2571               // tree_add_word() knows this is the prefix tree.
2572               n = PFX_FLAGS;
2573               if (!cur_aff->ah_combine) {
2574                 n |= WFP_NC;
2575               }
2576               if (upper) {
2577                 n |= WFP_UP;
2578               }
2579               if (aff_entry->ae_comppermit) {
2580                 n |= WFP_COMPPERMIT;
2581               }
2582               if (aff_entry->ae_compforbid) {
2583                 n |= WFP_COMPFORBID;
2584               }
2585               tree_add_word(spin, p, spin->si_prefroot, n,
2586                             idx, cur_aff->ah_newID);
2587               did_postpone_prefix = true;
2588             }
2589 
2590             // Didn't actually use ah_newID, backup si_newprefID.
2591             if (aff_todo == 0 && !did_postpone_prefix) {
2592               --spin->si_newprefID;
2593               cur_aff->ah_newID = 0;
2594             }
2595           }
2596         }
2597       } else if (is_aff_rule(items, itemcnt, "FOL", 2) && fol == NULL) {
2598         fol = vim_strsave(items[1]);
2599       } else if (is_aff_rule(items, itemcnt, "LOW", 2) && low == NULL) {
2600         low = vim_strsave(items[1]);
2601       } else if (is_aff_rule(items, itemcnt, "UPP", 2) && upp == NULL) {
2602         upp = vim_strsave(items[1]);
2603       } else if (is_aff_rule(items, itemcnt, "REP", 2)
2604                  || is_aff_rule(items, itemcnt, "REPSAL", 2)) {
2605         // Ignore REP/REPSAL count
2606         if (!isdigit(*items[1])) {
2607           smsg(_("Expected REP(SAL) count in %s line %d"),
2608                fname, lnum);
2609         }
2610       } else if ((STRCMP(items[0], "REP") == 0
2611                   || STRCMP(items[0], "REPSAL") == 0)
2612                  && itemcnt >= 3) {
2613         // REP/REPSAL item
2614         // Myspell ignores extra arguments, we require it starts with
2615         // # to detect mistakes.
2616         if (itemcnt > 3 && items[3][0] != '#') {
2617           smsg(_(e_afftrailing), fname, lnum, items[3]);
2618         }
2619         if (items[0][3] == 'S' ? do_repsal : do_rep) {
2620           // Replace underscore with space (can't include a space
2621           // directly).
2622           for (p = items[1]; *p != NUL; MB_PTR_ADV(p)) {
2623             if (*p == '_') {
2624               *p = ' ';
2625             }
2626           }
2627           for (p = items[2]; *p != NUL; MB_PTR_ADV(p)) {
2628             if (*p == '_') {
2629               *p = ' ';
2630             }
2631           }
2632           add_fromto(spin, items[0][3] == 'S'
2633               ? &spin->si_repsal
2634               : &spin->si_rep, items[1], items[2]);
2635         }
2636       } else if (is_aff_rule(items, itemcnt, "MAP", 2)) {
2637         // MAP item or count
2638         if (!found_map) {
2639           // First line contains the count.
2640           found_map = true;
2641           if (!isdigit(*items[1])) {
2642             smsg(_("Expected MAP count in %s line %d"),
2643                  fname, lnum);
2644           }
2645         } else if (do_mapline) {
2646           int c;
2647 
2648           // Check that every character appears only once.
2649           for (p = items[1]; *p != NUL;) {
2650             c = mb_ptr2char_adv((const char_u **)&p);
2651             if ((!GA_EMPTY(&spin->si_map)
2652                  && vim_strchr(spin->si_map.ga_data, c)
2653                  != NULL)
2654                 || vim_strchr(p, c) != NULL) {
2655               smsg(_("Duplicate character in MAP in %s line %d"),
2656                    fname, lnum);
2657             }
2658           }
2659 
2660           // We simply concatenate all the MAP strings, separated by
2661           // slashes.
2662           ga_concat(&spin->si_map, (char *)items[1]);
2663           ga_append(&spin->si_map, '/');
2664         }
2665       }
2666       // Accept "SAL from to" and "SAL from to  #comment".
2667       else if (is_aff_rule(items, itemcnt, "SAL", 3)) {
2668         if (do_sal) {
2669           // SAL item (sounds-a-like)
2670           // Either one of the known keys or a from-to pair.
2671           if (STRCMP(items[1], "followup") == 0) {
2672             spin->si_followup = sal_to_bool(items[2]);
2673           } else if (STRCMP(items[1], "collapse_result") == 0) {
2674             spin->si_collapse = sal_to_bool(items[2]);
2675           } else if (STRCMP(items[1], "remove_accents") == 0) {
2676             spin->si_rem_accents = sal_to_bool(items[2]);
2677           } else {
2678             // when "to" is "_" it means empty
2679             add_fromto(spin, &spin->si_sal, items[1],
2680                        STRCMP(items[2], "_") == 0 ? (char_u *)""
2681                                                   : items[2]);
2682           }
2683         }
2684       } else if (is_aff_rule(items, itemcnt, "SOFOFROM", 2)
2685                  && sofofrom == NULL) {
2686         sofofrom = getroom_save(spin, items[1]);
2687       } else if (is_aff_rule(items, itemcnt, "SOFOTO", 2)
2688                  && sofoto == NULL) {
2689         sofoto = getroom_save(spin, items[1]);
2690       } else if (STRCMP(items[0], "COMMON") == 0) {
2691         int i;
2692 
2693         for (i = 1; i < itemcnt; ++i) {
2694           if (HASHITEM_EMPTY(hash_find(&spin->si_commonwords,
2695                                        items[i]))) {
2696             p = vim_strsave(items[i]);
2697             hash_add(&spin->si_commonwords, p);
2698           }
2699         }
2700       } else {
2701         smsg(_("Unrecognized or duplicate item in %s line %d: %s"),
2702              fname, lnum, items[0]);
2703       }
2704     }
2705   }
2706 
2707   if (fol != NULL || low != NULL || upp != NULL) {
2708     if (spin->si_clear_chartab) {
2709       // Clear the char type tables, don't want to use any of the
2710       // currently used spell properties.
2711       init_spell_chartab();
2712       spin->si_clear_chartab = false;
2713     }
2714 
2715     xfree(fol);
2716     xfree(low);
2717     xfree(upp);
2718   }
2719 
2720   // Use compound specifications of the .aff file for the spell info.
2721   if (compmax != 0) {
2722     aff_check_number(spin->si_compmax, compmax, "COMPOUNDWORDMAX");
2723     spin->si_compmax = compmax;
2724   }
2725 
2726   if (compminlen != 0) {
2727     aff_check_number(spin->si_compminlen, compminlen, "COMPOUNDMIN");
2728     spin->si_compminlen = compminlen;
2729   }
2730 
2731   if (compsylmax != 0) {
2732     if (syllable == NULL) {
2733       smsg("%s", _("COMPOUNDSYLMAX used without SYLLABLE"));
2734     }
2735     aff_check_number(spin->si_compsylmax, compsylmax, "COMPOUNDSYLMAX");
2736     spin->si_compsylmax = compsylmax;
2737   }
2738 
2739   if (compoptions != 0) {
2740     aff_check_number(spin->si_compoptions, compoptions, "COMPOUND options");
2741     spin->si_compoptions |= compoptions;
2742   }
2743 
2744   if (compflags != NULL) {
2745     process_compflags(spin, aff, compflags);
2746   }
2747 
2748   // Check that we didn't use too many renumbered flags.
2749   if (spin->si_newcompID < spin->si_newprefID) {
2750     if (spin->si_newcompID == 127 || spin->si_newcompID == 255) {
2751       msg(_("Too many postponed prefixes"));
2752     } else if (spin->si_newprefID == 0 || spin->si_newprefID == 127) {
2753       msg(_("Too many compound flags"));
2754     } else {
2755       msg(_("Too many postponed prefixes and/or compound flags"));
2756     }
2757   }
2758 
2759   if (syllable != NULL) {
2760     aff_check_string(spin->si_syllable, syllable, "SYLLABLE");
2761     spin->si_syllable = syllable;
2762   }
2763 
2764   if (sofofrom != NULL || sofoto != NULL) {
2765     if (sofofrom == NULL || sofoto == NULL) {
2766       smsg(_("Missing SOFO%s line in %s"),
2767            sofofrom == NULL ? "FROM" : "TO", fname);
2768     } else if (!GA_EMPTY(&spin->si_sal)) {
2769       smsg(_("Both SAL and SOFO lines in %s"), fname);
2770     } else {
2771       aff_check_string(spin->si_sofofr, sofofrom, "SOFOFROM");
2772       aff_check_string(spin->si_sofoto, sofoto, "SOFOTO");
2773       spin->si_sofofr = sofofrom;
2774       spin->si_sofoto = sofoto;
2775     }
2776   }
2777 
2778   if (midword != NULL) {
2779     aff_check_string(spin->si_midword, midword, "MIDWORD");
2780     spin->si_midword = midword;
2781   }
2782 
2783   xfree(pc);
2784   fclose(fd);
2785   return aff;
2786 }
2787 
2788 // Returns true when items[0] equals "rulename", there are "mincount" items or
2789 // a comment is following after item "mincount".
is_aff_rule(char_u ** items,int itemcnt,char * rulename,int mincount)2790 static bool is_aff_rule(char_u **items, int itemcnt, char *rulename, int mincount)
2791 {
2792   return STRCMP(items[0], rulename) == 0
2793          && (itemcnt == mincount
2794              || (itemcnt > mincount && items[mincount][0] == '#'));
2795 }
2796 
2797 // For affix "entry" move COMPOUNDFORBIDFLAG and COMPOUNDPERMITFLAG from
2798 // ae_flags to ae_comppermit and ae_compforbid.
aff_process_flags(afffile_T * affile,affentry_T * entry)2799 static void aff_process_flags(afffile_T *affile, affentry_T *entry)
2800 {
2801   char_u *p;
2802   char_u *prevp;
2803   unsigned flag;
2804 
2805   if (entry->ae_flags != NULL
2806       && (affile->af_compforbid != 0 || affile->af_comppermit != 0)) {
2807     for (p = entry->ae_flags; *p != NUL;) {
2808       prevp = p;
2809       flag = get_affitem(affile->af_flagtype, &p);
2810       if (flag == affile->af_comppermit || flag == affile->af_compforbid) {
2811         STRMOVE(prevp, p);
2812         p = prevp;
2813         if (flag == affile->af_comppermit) {
2814           entry->ae_comppermit = true;
2815         } else {
2816           entry->ae_compforbid = true;
2817         }
2818       }
2819       if (affile->af_flagtype == AFT_NUM && *p == ',') {
2820         ++p;
2821       }
2822     }
2823     if (*entry->ae_flags == NUL) {
2824       entry->ae_flags = NULL;           // nothing left
2825     }
2826   }
2827 }
2828 
2829 // Returns true if "s" is the name of an info item in the affix file.
spell_info_item(char_u * s)2830 static bool spell_info_item(char_u *s)
2831 {
2832   return STRCMP(s, "NAME") == 0
2833          || STRCMP(s, "HOME") == 0
2834          || STRCMP(s, "VERSION") == 0
2835          || STRCMP(s, "AUTHOR") == 0
2836          || STRCMP(s, "EMAIL") == 0
2837          || STRCMP(s, "COPYRIGHT") == 0;
2838 }
2839 
2840 // Turn an affix flag name into a number, according to the FLAG type.
2841 // returns zero for failure.
affitem2flag(int flagtype,char_u * item,char_u * fname,int lnum)2842 static unsigned affitem2flag(int flagtype, char_u *item, char_u *fname, int lnum)
2843 {
2844   unsigned res;
2845   char_u *p = item;
2846 
2847   res = get_affitem(flagtype, &p);
2848   if (res == 0) {
2849     if (flagtype == AFT_NUM) {
2850       smsg(_("Flag is not a number in %s line %d: %s"),
2851            fname, lnum, item);
2852     } else {
2853       smsg(_("Illegal flag in %s line %d: %s"),
2854            fname, lnum, item);
2855     }
2856   }
2857   if (*p != NUL) {
2858     smsg(_(e_affname), fname, lnum, item);
2859     return 0;
2860   }
2861 
2862   return res;
2863 }
2864 
2865 // Get one affix name from "*pp" and advance the pointer.
2866 // Returns ZERO_FLAG for "0".
2867 // Returns zero for an error, still advances the pointer then.
get_affitem(int flagtype,char_u ** pp)2868 static unsigned get_affitem(int flagtype, char_u **pp)
2869 {
2870   int res;
2871 
2872   if (flagtype == AFT_NUM) {
2873     if (!ascii_isdigit(**pp)) {
2874       ++*pp;            // always advance, avoid getting stuck
2875       return 0;
2876     }
2877     res = getdigits_int(pp, true, 0);
2878     if (res == 0) {
2879       res = ZERO_FLAG;
2880     }
2881   } else {
2882     res = mb_ptr2char_adv((const char_u **)pp);
2883     if (flagtype == AFT_LONG || (flagtype == AFT_CAPLONG
2884                                  && res >= 'A' && res <= 'Z')) {
2885       if (**pp == NUL) {
2886         return 0;
2887       }
2888       res = mb_ptr2char_adv((const char_u **)pp) + (res << 16);
2889     }
2890   }
2891   return res;
2892 }
2893 
2894 // Process the "compflags" string used in an affix file and append it to
2895 // spin->si_compflags.
2896 // The processing involves changing the affix names to ID numbers, so that
2897 // they fit in one byte.
process_compflags(spellinfo_T * spin,afffile_T * aff,char_u * compflags)2898 static void process_compflags(spellinfo_T *spin, afffile_T *aff, char_u *compflags)
2899 {
2900   char_u *p;
2901   char_u *prevp;
2902   unsigned flag;
2903   compitem_T *ci;
2904   int id;
2905   int len;
2906   char_u *tp;
2907   char_u key[AH_KEY_LEN];
2908   hashitem_T *hi;
2909 
2910   // Make room for the old and the new compflags, concatenated with a / in
2911   // between.  Processing it makes it shorter, but we don't know by how
2912   // much, thus allocate the maximum.
2913   len = (int)STRLEN(compflags) + 1;
2914   if (spin->si_compflags != NULL) {
2915     len += (int)STRLEN(spin->si_compflags) + 1;
2916   }
2917   p = getroom(spin, len, false);
2918   if (spin->si_compflags != NULL) {
2919     STRCPY(p, spin->si_compflags);
2920     STRCAT(p, "/");
2921   }
2922   spin->si_compflags = p;
2923   tp = p + STRLEN(p);
2924 
2925   for (p = compflags; *p != NUL;) {
2926     if (vim_strchr((char_u *)"/?*+[]", *p) != NULL) {
2927       // Copy non-flag characters directly.
2928       *tp++ = *p++;
2929     } else {
2930       // First get the flag number, also checks validity.
2931       prevp = p;
2932       flag = get_affitem(aff->af_flagtype, &p);
2933       if (flag != 0) {
2934         // Find the flag in the hashtable.  If it was used before, use
2935         // the existing ID.  Otherwise add a new entry.
2936         STRLCPY(key, prevp, p - prevp + 1);
2937         hi = hash_find(&aff->af_comp, key);
2938         if (!HASHITEM_EMPTY(hi)) {
2939           id = HI2CI(hi)->ci_newID;
2940         } else {
2941           ci = getroom(spin, sizeof(compitem_T), true);
2942           STRCPY(ci->ci_key, key);
2943           ci->ci_flag = flag;
2944           // Avoid using a flag ID that has a special meaning in a
2945           // regexp (also inside []).
2946           do {
2947             check_renumber(spin);
2948             id = spin->si_newcompID--;
2949           } while (vim_strchr((char_u *)"/?*+[]\\-^", id) != NULL);
2950           ci->ci_newID = id;
2951           hash_add(&aff->af_comp, ci->ci_key);
2952         }
2953         *tp++ = id;
2954       }
2955       if (aff->af_flagtype == AFT_NUM && *p == ',') {
2956         ++p;
2957       }
2958     }
2959   }
2960 
2961   *tp = NUL;
2962 }
2963 
2964 // Check that the new IDs for postponed affixes and compounding don't overrun
2965 // each other.  We have almost 255 available, but start at 0-127 to avoid
2966 // using two bytes for utf-8.  When the 0-127 range is used up go to 128-255.
2967 // When that is used up an error message is given.
check_renumber(spellinfo_T * spin)2968 static void check_renumber(spellinfo_T *spin)
2969 {
2970   if (spin->si_newprefID == spin->si_newcompID && spin->si_newcompID < 128) {
2971     spin->si_newprefID = 127;
2972     spin->si_newcompID = 255;
2973   }
2974 }
2975 
2976 // Returns true if flag "flag" appears in affix list "afflist".
flag_in_afflist(int flagtype,char_u * afflist,unsigned flag)2977 static bool flag_in_afflist(int flagtype, char_u *afflist, unsigned flag)
2978 {
2979   char_u *p;
2980   unsigned n;
2981 
2982   switch (flagtype) {
2983   case AFT_CHAR:
2984     return vim_strchr(afflist, flag) != NULL;
2985 
2986   case AFT_CAPLONG:
2987   case AFT_LONG:
2988     for (p = afflist; *p != NUL;) {
2989       n = mb_ptr2char_adv((const char_u **)&p);
2990       if ((flagtype == AFT_LONG || (n >= 'A' && n <= 'Z'))
2991           && *p != NUL) {
2992         n = mb_ptr2char_adv((const char_u **)&p) + (n << 16);
2993       }
2994       if (n == flag) {
2995         return true;
2996       }
2997     }
2998     break;
2999 
3000   case AFT_NUM:
3001     for (p = afflist; *p != NUL;) {
3002       int digits = getdigits_int(&p, true, 0);
3003       assert(digits >= 0);
3004       n = (unsigned int)digits;
3005       if (n == 0) {
3006         n = ZERO_FLAG;
3007       }
3008       if (n == flag) {
3009         return true;
3010       }
3011       if (*p != NUL) {          // skip over comma
3012         p++;
3013       }
3014     }
3015     break;
3016   }
3017   return false;
3018 }
3019 
/// Give a warning when the "spinval" and "affval" numbers are set and not the
/// same.
///
/// @param spinval  value already stored in the spell info; zero means unset
/// @param affval   value found in the current .aff file
/// @param name     item name shown in the message
static void aff_check_number(int spinval, int affval, char *name)
{
  if (spinval == 0 || spinval == affval) {
    return;
  }
  smsg(_("%s value differs from what is used in another .aff file"),
       name);
}
3028 
3029 // Give a warning when "spinval" and "affval" strings are set and not the same.
aff_check_string(char_u * spinval,char_u * affval,char * name)3030 static void aff_check_string(char_u *spinval, char_u *affval, char *name)
3031 {
3032   if (spinval != NULL && STRCMP(spinval, affval) != 0) {
3033     smsg(_("%s value differs from what is used in another .aff file"),
3034          name);
3035   }
3036 }
3037 
3038 // Returns true if strings "s1" and "s2" are equal.  Also consider both being
3039 // NULL as equal.
str_equal(char_u * s1,char_u * s2)3040 static bool str_equal(char_u *s1, char_u *s2)
3041 {
3042   if (s1 == NULL || s2 == NULL) {
3043     return s1 == s2;
3044   }
3045   return STRCMP(s1, s2) == 0;
3046 }
3047 
3048 // Add a from-to item to "gap".  Used for REP and SAL items.
3049 // They are stored case-folded.
add_fromto(spellinfo_T * spin,garray_T * gap,char_u * from,char_u * to)3050 static void add_fromto(spellinfo_T *spin, garray_T *gap, char_u *from, char_u *to)
3051 {
3052   char_u word[MAXWLEN];
3053 
3054   fromto_T *ftp = GA_APPEND_VIA_PTR(fromto_T, gap);
3055   (void)spell_casefold(curwin, from, (int)STRLEN(from), word, MAXWLEN);
3056   ftp->ft_from = getroom_save(spin, word);
3057   (void)spell_casefold(curwin, to, (int)STRLEN(to), word, MAXWLEN);
3058   ftp->ft_to = getroom_save(spin, word);
3059 }
3060 
3061 // Converts a boolean argument in a SAL line to true or false;
sal_to_bool(char_u * s)3062 static bool sal_to_bool(char_u *s)
3063 {
3064   return STRCMP(s, "1") == 0 || STRCMP(s, "true") == 0;
3065 }
3066 
3067 // Free the structure filled by spell_read_aff().
spell_free_aff(afffile_T * aff)3068 static void spell_free_aff(afffile_T *aff)
3069 {
3070   hashtab_T *ht;
3071   hashitem_T *hi;
3072   int todo;
3073   affheader_T *ah;
3074   affentry_T *ae;
3075 
3076   xfree(aff->af_enc);
3077 
3078   // All this trouble to free the "ae_prog" items...
3079   for (ht = &aff->af_pref;; ht = &aff->af_suff) {
3080     todo = (int)ht->ht_used;
3081     for (hi = ht->ht_array; todo > 0; ++hi) {
3082       if (!HASHITEM_EMPTY(hi)) {
3083         --todo;
3084         ah = HI2AH(hi);
3085         for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next) {
3086           vim_regfree(ae->ae_prog);
3087         }
3088       }
3089     }
3090     if (ht == &aff->af_suff) {
3091       break;
3092     }
3093   }
3094 
3095   hash_clear(&aff->af_pref);
3096   hash_clear(&aff->af_suff);
3097   hash_clear(&aff->af_comp);
3098 }
3099 
3100 // Read dictionary file "fname".
3101 // Returns OK or FAIL;
spell_read_dic(spellinfo_T * spin,char_u * fname,afffile_T * affile)3102 static int spell_read_dic(spellinfo_T *spin, char_u *fname, afffile_T *affile)
3103 {
3104   hashtab_T ht;
3105   char_u line[MAXLINELEN];
3106   char_u *p;
3107   char_u *afflist;
3108   char_u store_afflist[MAXWLEN];
3109   int pfxlen;
3110   bool need_affix;
3111   char_u *dw;
3112   char_u *pc;
3113   char_u *w;
3114   int l;
3115   hash_T hash;
3116   hashitem_T *hi;
3117   FILE *fd;
3118   int lnum = 1;
3119   int non_ascii = 0;
3120   int retval = OK;
3121   char_u message[MAXLINELEN + MAXWLEN];
3122   int flags;
3123   int duplicate = 0;
3124   Timestamp last_msg_time = 0;
3125 
3126   // Open the file.
3127   fd = os_fopen((char *)fname, "r");
3128   if (fd == NULL) {
3129     semsg(_(e_notopen), fname);
3130     return FAIL;
3131   }
3132 
3133   // The hashtable is only used to detect duplicated words.
3134   hash_init(&ht);
3135 
3136   vim_snprintf((char *)IObuff, IOSIZE,
3137                _("Reading dictionary file %s..."), fname);
3138   spell_message(spin, IObuff);
3139 
3140   // start with a message for the first line
3141   spin->si_msg_count = 999999;
3142 
3143   // Read and ignore the first line: word count.
3144   if (vim_fgets(line, MAXLINELEN, fd) || !ascii_isdigit(*skipwhite(line))) {
3145     semsg(_("E760: No word count in %s"), fname);
3146   }
3147 
3148   // Read all the lines in the file one by one.
3149   // The words are converted to 'encoding' here, before being added to
3150   // the hashtable.
3151   while (!vim_fgets(line, MAXLINELEN, fd) && !got_int) {
3152     line_breakcheck();
3153     ++lnum;
3154     if (line[0] == '#' || line[0] == '/') {
3155       continue;         // comment line
3156     }
3157     // Remove CR, LF and white space from the end.  White space halfway through
3158     // the word is kept to allow multi-word terms like "et al.".
3159     l = (int)STRLEN(line);
3160     while (l > 0 && line[l - 1] <= ' ') {
3161       --l;
3162     }
3163     if (l == 0) {
3164       continue;         // empty line
3165     }
3166     line[l] = NUL;
3167 
3168     // Convert from "SET" to 'encoding' when needed.
3169     if (spin->si_conv.vc_type != CONV_NONE) {
3170       pc = string_convert(&spin->si_conv, line, NULL);
3171       if (pc == NULL) {
3172         smsg(_("Conversion failure for word in %s line %d: %s"),
3173              fname, lnum, line);
3174         continue;
3175       }
3176       w = pc;
3177     } else {
3178       pc = NULL;
3179       w = line;
3180     }
3181 
3182     // Truncate the word at the "/", set "afflist" to what follows.
3183     // Replace "\/" by "/" and "\\" by "\".
3184     afflist = NULL;
3185     for (p = w; *p != NUL; MB_PTR_ADV(p)) {
3186       if (*p == '\\' && (p[1] == '\\' || p[1] == '/')) {
3187         STRMOVE(p, p + 1);
3188       } else if (*p == '/') {
3189         *p = NUL;
3190         afflist = p + 1;
3191         break;
3192       }
3193     }
3194 
3195     // Skip non-ASCII words when "spin->si_ascii" is true.
3196     if (spin->si_ascii && has_non_ascii(w)) {
3197       ++non_ascii;
3198       xfree(pc);
3199       continue;
3200     }
3201 
3202     // This takes time, print a message every 10000 words, but not more
3203     // often than once per second.
3204     if (spin->si_verbose && spin->si_msg_count > 10000) {
3205       spin->si_msg_count = 0;
3206       if (os_time() > last_msg_time) {
3207         last_msg_time = os_time();
3208         vim_snprintf((char *)message, sizeof(message),
3209                      _("line %6d, word %6ld - %s"),
3210                      lnum, spin->si_foldwcount + spin->si_keepwcount, w);
3211         msg_start();
3212         msg_outtrans_long_attr(message, 0);
3213         msg_clr_eos();
3214         msg_didout = false;
3215         msg_col = 0;
3216         ui_flush();
3217       }
3218     }
3219 
3220     // Store the word in the hashtable to be able to find duplicates.
3221     dw = getroom_save(spin, w);
3222     if (dw == NULL) {
3223       retval = FAIL;
3224       xfree(pc);
3225       break;
3226     }
3227 
3228     hash = hash_hash(dw);
3229     hi = hash_lookup(&ht, (const char *)dw, STRLEN(dw), hash);
3230     if (!HASHITEM_EMPTY(hi)) {
3231       if (p_verbose > 0) {
3232         smsg(_("Duplicate word in %s line %d: %s"),
3233              fname, lnum, dw);
3234       } else if (duplicate == 0) {
3235         smsg(_("First duplicate word in %s line %d: %s"),
3236              fname, lnum, dw);
3237       }
3238       ++duplicate;
3239     } else {
3240       hash_add_item(&ht, hi, dw, hash);
3241     }
3242 
3243     flags = 0;
3244     store_afflist[0] = NUL;
3245     pfxlen = 0;
3246     need_affix = false;
3247     if (afflist != NULL) {
3248       // Extract flags from the affix list.
3249       flags |= get_affix_flags(affile, afflist);
3250 
3251       if (affile->af_needaffix != 0
3252           && flag_in_afflist(affile->af_flagtype, afflist,
3253                              affile->af_needaffix)) {
3254         need_affix = true;
3255       }
3256 
3257       if (affile->af_pfxpostpone) {
3258         // Need to store the list of prefix IDs with the word.
3259         pfxlen = get_pfxlist(affile, afflist, store_afflist);
3260       }
3261 
3262       if (spin->si_compflags != NULL) {
3263         // Need to store the list of compound flags with the word.
3264         // Concatenate them to the list of prefix IDs.
3265         get_compflags(affile, afflist, store_afflist + pfxlen);
3266       }
3267     }
3268 
3269     // Add the word to the word tree(s).
3270     if (store_word(spin, dw, flags, spin->si_region,
3271                    store_afflist, need_affix) == FAIL) {
3272       retval = FAIL;
3273     }
3274 
3275     if (afflist != NULL) {
3276       // Find all matching suffixes and add the resulting words.
3277       // Additionally do matching prefixes that combine.
3278       if (store_aff_word(spin, dw, afflist, affile,
3279                          &affile->af_suff, &affile->af_pref,
3280                          CONDIT_SUF, flags, store_afflist, pfxlen) == FAIL) {
3281         retval = FAIL;
3282       }
3283 
3284       // Find all matching prefixes and add the resulting words.
3285       if (store_aff_word(spin, dw, afflist, affile,
3286                          &affile->af_pref, NULL,
3287                          CONDIT_SUF, flags, store_afflist, pfxlen) == FAIL) {
3288         retval = FAIL;
3289       }
3290     }
3291 
3292     xfree(pc);
3293   }
3294 
3295   if (duplicate > 0) {
3296     smsg(_("%d duplicate word(s) in %s"), duplicate, fname);
3297   }
3298   if (spin->si_ascii && non_ascii > 0) {
3299     smsg(_("Ignored %d word(s) with non-ASCII characters in %s"),
3300          non_ascii, fname);
3301   }
3302   hash_clear(&ht);
3303 
3304   fclose(fd);
3305   return retval;
3306 }
3307 
3308 // Check for affix flags in "afflist" that are turned into word flags.
3309 // Return WF_ flags.
get_affix_flags(afffile_T * affile,char_u * afflist)3310 static int get_affix_flags(afffile_T *affile, char_u *afflist)
3311 {
3312   int flags = 0;
3313 
3314   if (affile->af_keepcase != 0
3315       && flag_in_afflist(affile->af_flagtype, afflist,
3316                          affile->af_keepcase)) {
3317     flags |= WF_KEEPCAP | WF_FIXCAP;
3318   }
3319   if (affile->af_rare != 0
3320       && flag_in_afflist(affile->af_flagtype, afflist, affile->af_rare)) {
3321     flags |= WF_RARE;
3322   }
3323   if (affile->af_bad != 0
3324       && flag_in_afflist(affile->af_flagtype, afflist, affile->af_bad)) {
3325     flags |= WF_BANNED;
3326   }
3327   if (affile->af_needcomp != 0
3328       && flag_in_afflist(affile->af_flagtype, afflist,
3329                          affile->af_needcomp)) {
3330     flags |= WF_NEEDCOMP;
3331   }
3332   if (affile->af_comproot != 0
3333       && flag_in_afflist(affile->af_flagtype, afflist,
3334                          affile->af_comproot)) {
3335     flags |= WF_COMPROOT;
3336   }
3337   if (affile->af_nosuggest != 0
3338       && flag_in_afflist(affile->af_flagtype, afflist,
3339                          affile->af_nosuggest)) {
3340     flags |= WF_NOSUGGEST;
3341   }
3342   return flags;
3343 }
3344 
3345 // Get the list of prefix IDs from the affix list "afflist".
3346 // Used for PFXPOSTPONE.
3347 // Put the resulting flags in "store_afflist[MAXWLEN]" with a terminating NUL
3348 // and return the number of affixes.
get_pfxlist(afffile_T * affile,char_u * afflist,char_u * store_afflist)3349 static int get_pfxlist(afffile_T *affile, char_u *afflist, char_u *store_afflist)
3350 {
3351   char_u *p;
3352   char_u *prevp;
3353   int cnt = 0;
3354   int id;
3355   char_u key[AH_KEY_LEN];
3356   hashitem_T *hi;
3357 
3358   for (p = afflist; *p != NUL;) {
3359     prevp = p;
3360     if (get_affitem(affile->af_flagtype, &p) != 0) {
3361       // A flag is a postponed prefix flag if it appears in "af_pref"
3362       // and its ID is not zero.
3363       STRLCPY(key, prevp, p - prevp + 1);
3364       hi = hash_find(&affile->af_pref, key);
3365       if (!HASHITEM_EMPTY(hi)) {
3366         id = HI2AH(hi)->ah_newID;
3367         if (id != 0) {
3368           store_afflist[cnt++] = id;
3369         }
3370       }
3371     }
3372     if (affile->af_flagtype == AFT_NUM && *p == ',') {
3373       ++p;
3374     }
3375   }
3376 
3377   store_afflist[cnt] = NUL;
3378   return cnt;
3379 }
3380 
3381 // Get the list of compound IDs from the affix list "afflist" that are used
3382 // for compound words.
3383 // Puts the flags in "store_afflist[]".
get_compflags(afffile_T * affile,char_u * afflist,char_u * store_afflist)3384 static void get_compflags(afffile_T *affile, char_u *afflist, char_u *store_afflist)
3385 {
3386   char_u *p;
3387   char_u *prevp;
3388   int cnt = 0;
3389   char_u key[AH_KEY_LEN];
3390   hashitem_T *hi;
3391 
3392   for (p = afflist; *p != NUL;) {
3393     prevp = p;
3394     if (get_affitem(affile->af_flagtype, &p) != 0) {
3395       // A flag is a compound flag if it appears in "af_comp".
3396       STRLCPY(key, prevp, p - prevp + 1);
3397       hi = hash_find(&affile->af_comp, key);
3398       if (!HASHITEM_EMPTY(hi)) {
3399         store_afflist[cnt++] = HI2CI(hi)->ci_newID;
3400       }
3401     }
3402     if (affile->af_flagtype == AFT_NUM && *p == ',') {
3403       ++p;
3404     }
3405   }
3406 
3407   store_afflist[cnt] = NUL;
3408 }
3409 
3410 /// Apply affixes to a word and store the resulting words.
3411 /// "ht" is the hashtable with affentry_T that need to be applied, either
3412 /// prefixes or suffixes.
3413 /// "xht", when not NULL, is the prefix hashtable, to be used additionally on
3414 /// the resulting words for combining affixes.
3415 ///
3416 /// @param spin  spell info
3417 /// @param word  basic word start
3418 /// @param afflist  list of names of supported affixes
3419 /// @param condit  CONDIT_SUF et al.
3420 /// @param flags  flags for the word
3421 /// @param pfxlist  list of prefix IDs
3422 /// @param pfxlen  nr of flags in "pfxlist" for prefixes, rest is compound flags
3423 ///
3424 /// @return  FAIL when out of memory.
store_aff_word(spellinfo_T * spin,char_u * word,char_u * afflist,afffile_T * affile,hashtab_T * ht,hashtab_T * xht,int condit,int flags,char_u * pfxlist,int pfxlen)3425 static int store_aff_word(spellinfo_T *spin, char_u *word, char_u *afflist, afffile_T *affile,
3426                           hashtab_T *ht, hashtab_T *xht, int condit, int flags, char_u *pfxlist,
3427                           int pfxlen)
3428 {
3429   int todo;
3430   hashitem_T *hi;
3431   affheader_T *ah;
3432   affentry_T *ae;
3433   char_u newword[MAXWLEN];
3434   int retval = OK;
3435   int i, j;
3436   char_u *p;
3437   int use_flags;
3438   char_u *use_pfxlist;
3439   int use_pfxlen;
3440   bool need_affix;
3441   char_u store_afflist[MAXWLEN];
3442   char_u pfx_pfxlist[MAXWLEN];
3443   size_t wordlen = STRLEN(word);
3444   int use_condit;
3445 
3446   todo = (int)ht->ht_used;
3447   for (hi = ht->ht_array; todo > 0 && retval == OK; ++hi) {
3448     if (!HASHITEM_EMPTY(hi)) {
3449       --todo;
3450       ah = HI2AH(hi);
3451 
3452       // Check that the affix combines, if required, and that the word
3453       // supports this affix.
3454       if (((condit & CONDIT_COMB) == 0 || ah->ah_combine)
3455           && flag_in_afflist(affile->af_flagtype, afflist,
3456                              ah->ah_flag)) {
3457         // Loop over all affix entries with this name.
3458         for (ae = ah->ah_first; ae != NULL; ae = ae->ae_next) {
3459           // Check the condition.  It's not logical to match case
3460           // here, but it is required for compatibility with
3461           // Myspell.
3462           // Another requirement from Myspell is that the chop
3463           // string is shorter than the word itself.
3464           // For prefixes, when "PFXPOSTPONE" was used, only do
3465           // prefixes with a chop string and/or flags.
3466           // When a previously added affix had CIRCUMFIX this one
3467           // must have it too, if it had not then this one must not
3468           // have one either.
3469           if ((xht != NULL || !affile->af_pfxpostpone
3470                || ae->ae_chop != NULL
3471                || ae->ae_flags != NULL)
3472               && (ae->ae_chop == NULL
3473                   || STRLEN(ae->ae_chop) < wordlen)
3474               && (ae->ae_prog == NULL
3475                   || vim_regexec_prog(&ae->ae_prog, false, word, (colnr_T)0))
3476               && (((condit & CONDIT_CFIX) == 0)
3477                   == ((condit & CONDIT_AFF) == 0
3478                       || ae->ae_flags == NULL
3479                       || !flag_in_afflist(affile->af_flagtype,
3480                                           ae->ae_flags, affile->af_circumfix)))) {
3481             // Match.  Remove the chop and add the affix.
3482             if (xht == NULL) {
3483               // prefix: chop/add at the start of the word
3484               if (ae->ae_add == NULL) {
3485                 *newword = NUL;
3486               } else {
3487                 STRLCPY(newword, ae->ae_add, MAXWLEN);
3488               }
3489               p = word;
3490               if (ae->ae_chop != NULL) {
3491                 // Skip chop string.
3492                 i = mb_charlen(ae->ae_chop);
3493                 for (; i > 0; i--) {
3494                   MB_PTR_ADV(p);
3495                 }
3496               }
3497               STRCAT(newword, p);
3498             } else {
3499               // suffix: chop/add at the end of the word
3500               STRLCPY(newword, word, MAXWLEN);
3501               if (ae->ae_chop != NULL) {
3502                 // Remove chop string.
3503                 p = newword + STRLEN(newword);
3504                 i = mb_charlen(ae->ae_chop);
3505                 for (; i > 0; i--) {
3506                   MB_PTR_BACK(newword, p);
3507                 }
3508                 *p = NUL;
3509               }
3510               if (ae->ae_add != NULL) {
3511                 STRCAT(newword, ae->ae_add);
3512               }
3513             }
3514 
3515             use_flags = flags;
3516             use_pfxlist = pfxlist;
3517             use_pfxlen = pfxlen;
3518             need_affix = false;
3519             use_condit = condit | CONDIT_COMB | CONDIT_AFF;
3520             if (ae->ae_flags != NULL) {
3521               // Extract flags from the affix list.
3522               use_flags |= get_affix_flags(affile, ae->ae_flags);
3523 
3524               if (affile->af_needaffix != 0 && flag_in_afflist(affile->af_flagtype, ae->ae_flags,
3525                                                                affile->af_needaffix)) {
3526                 need_affix = true;
3527               }
3528 
3529               // When there is a CIRCUMFIX flag the other affix
3530               // must also have it and we don't add the word
3531               // with one affix.
3532               if (affile->af_circumfix != 0 && flag_in_afflist(affile->af_flagtype, ae->ae_flags,
3533                                                                affile->af_circumfix)) {
3534                 use_condit |= CONDIT_CFIX;
3535                 if ((condit & CONDIT_CFIX) == 0) {
3536                   need_affix = true;
3537                 }
3538               }
3539 
3540               if (affile->af_pfxpostpone
3541                   || spin->si_compflags != NULL) {
3542                 if (affile->af_pfxpostpone) {
3543                   // Get prefix IDS from the affix list.
3544                   use_pfxlen = get_pfxlist(affile,
3545                                            ae->ae_flags, store_afflist);
3546                 } else {
3547                   use_pfxlen = 0;
3548                 }
3549                 use_pfxlist = store_afflist;
3550 
3551                 // Combine the prefix IDs. Avoid adding the
3552                 // same ID twice.
3553                 for (i = 0; i < pfxlen; ++i) {
3554                   for (j = 0; j < use_pfxlen; ++j) {
3555                     if (pfxlist[i] == use_pfxlist[j]) {
3556                       break;
3557                     }
3558                   }
3559                   if (j == use_pfxlen) {
3560                     use_pfxlist[use_pfxlen++] = pfxlist[i];
3561                   }
3562                 }
3563 
3564                 if (spin->si_compflags != NULL) {
3565                   // Get compound IDS from the affix list.
3566                   get_compflags(affile, ae->ae_flags,
3567                                 use_pfxlist + use_pfxlen);
3568                 } else {
3569                   use_pfxlist[use_pfxlen] = NUL;
3570                 }
3571 
3572                 // Combine the list of compound flags.
3573                 // Concatenate them to the prefix IDs list.
3574                 // Avoid adding the same ID twice.
3575                 for (i = pfxlen; pfxlist[i] != NUL; ++i) {
3576                   for (j = use_pfxlen;
3577                        use_pfxlist[j] != NUL; ++j) {
3578                     if (pfxlist[i] == use_pfxlist[j]) {
3579                       break;
3580                     }
3581                   }
3582                   if (use_pfxlist[j] == NUL) {
3583                     use_pfxlist[j++] = pfxlist[i];
3584                     use_pfxlist[j] = NUL;
3585                   }
3586                 }
3587               }
3588             }
3589 
3590             // Obey a "COMPOUNDFORBIDFLAG" of the affix: don't
3591             // use the compound flags.
3592             if (use_pfxlist != NULL && ae->ae_compforbid) {
3593               STRLCPY(pfx_pfxlist, use_pfxlist, use_pfxlen + 1);
3594               use_pfxlist = pfx_pfxlist;
3595             }
3596 
3597             // When there are postponed prefixes...
3598             if (spin->si_prefroot != NULL
3599                 && spin->si_prefroot->wn_sibling != NULL) {
3600               // ... add a flag to indicate an affix was used.
3601               use_flags |= WF_HAS_AFF;
3602 
3603               // ... don't use a prefix list if combining
3604               // affixes is not allowed.  But do use the
3605               // compound flags after them.
3606               if (!ah->ah_combine && use_pfxlist != NULL) {
3607                 use_pfxlist += use_pfxlen;
3608               }
3609             }
3610 
3611             // When compounding is supported and there is no
3612             // "COMPOUNDPERMITFLAG" then forbid compounding on the
3613             // side where the affix is applied.
3614             if (spin->si_compflags != NULL && !ae->ae_comppermit) {
3615               if (xht != NULL) {
3616                 use_flags |= WF_NOCOMPAFT;
3617               } else {
3618                 use_flags |= WF_NOCOMPBEF;
3619               }
3620             }
3621 
3622             // Store the modified word.
3623             if (store_word(spin, newword, use_flags,
3624                            spin->si_region, use_pfxlist,
3625                            need_affix) == FAIL) {
3626               retval = FAIL;
3627             }
3628 
3629             // When added a prefix or a first suffix and the affix
3630             // has flags may add a(nother) suffix.  RECURSIVE!
3631             if ((condit & CONDIT_SUF) && ae->ae_flags != NULL) {
3632               if (store_aff_word(spin, newword, ae->ae_flags,
3633                                  affile, &affile->af_suff, xht,
3634                                  use_condit & (xht == NULL
3635                                     ? ~0 :  ~CONDIT_SUF),
3636                                  use_flags, use_pfxlist, pfxlen) == FAIL) {
3637                 retval = FAIL;
3638               }
3639             }
3640 
3641             // When added a suffix and combining is allowed also
3642             // try adding a prefix additionally.  Both for the
3643             // word flags and for the affix flags.  RECURSIVE!
3644             if (xht != NULL && ah->ah_combine) {
3645               if (store_aff_word(spin, newword,
3646                                  afflist, affile,
3647                                  xht, NULL, use_condit,
3648                                  use_flags, use_pfxlist,
3649                                  pfxlen) == FAIL
3650                   || (ae->ae_flags != NULL
3651                       && store_aff_word(spin, newword,
3652                                         ae->ae_flags, affile,
3653                                         xht, NULL, use_condit,
3654                                         use_flags, use_pfxlist,
3655                                         pfxlen) == FAIL)) {
3656                 retval = FAIL;
3657               }
3658             }
3659           }
3660         }
3661       }
3662     }
3663   }
3664 
3665   return retval;
3666 }
3667 
3668 // Read a file with a list of words.
spell_read_wordfile(spellinfo_T * spin,char_u * fname)3669 static int spell_read_wordfile(spellinfo_T *spin, char_u *fname)
3670 {
3671   FILE *fd;
3672   long lnum = 0;
3673   char_u rline[MAXLINELEN];
3674   char_u *line;
3675   char_u *pc = NULL;
3676   char_u *p;
3677   int l;
3678   int retval = OK;
3679   bool did_word = false;
3680   int non_ascii = 0;
3681   int flags;
3682   int regionmask;
3683 
3684   // Open the file.
3685   fd = os_fopen((char *)fname, "r");
3686   if (fd == NULL) {
3687     semsg(_(e_notopen), fname);
3688     return FAIL;
3689   }
3690 
3691   vim_snprintf((char *)IObuff, IOSIZE, _("Reading word file %s..."), fname);
3692   spell_message(spin, IObuff);
3693 
3694   // Read all the lines in the file one by one.
3695   while (!vim_fgets(rline, MAXLINELEN, fd) && !got_int) {
3696     line_breakcheck();
3697     ++lnum;
3698 
3699     // Skip comment lines.
3700     if (*rline == '#') {
3701       continue;
3702     }
3703 
3704     // Remove CR, LF and white space from the end.
3705     l = (int)STRLEN(rline);
3706     while (l > 0 && rline[l - 1] <= ' ') {
3707       --l;
3708     }
3709     if (l == 0) {
3710       continue;         // empty or blank line
3711     }
3712     rline[l] = NUL;
3713 
3714     // Convert from "/encoding={encoding}" to 'encoding' when needed.
3715     xfree(pc);
3716     if (spin->si_conv.vc_type != CONV_NONE) {
3717       pc = string_convert(&spin->si_conv, rline, NULL);
3718       if (pc == NULL) {
3719         smsg(_("Conversion failure for word in %s line %ld: %s"),
3720              fname, lnum, rline);
3721         continue;
3722       }
3723       line = pc;
3724     } else {
3725       pc = NULL;
3726       line = rline;
3727     }
3728 
3729     if (*line == '/') {
3730       ++line;
3731       if (STRNCMP(line, "encoding=", 9) == 0) {
3732         if (spin->si_conv.vc_type != CONV_NONE) {
3733           smsg(_("Duplicate /encoding= line ignored in %s line %ld: %s"),
3734                fname, lnum, line - 1);
3735         } else if (did_word) {
3736           smsg(_("/encoding= line after word ignored in %s line %ld: %s"),
3737                fname, lnum, line - 1);
3738         } else {
3739           char_u *enc;
3740 
3741           // Setup for conversion to 'encoding'.
3742           line += 9;
3743           enc = enc_canonize(line);
3744           if (!spin->si_ascii
3745               && convert_setup(&spin->si_conv, enc,
3746                                p_enc) == FAIL) {
3747             smsg(_("Conversion in %s not supported: from %s to %s"),
3748                  fname, line, p_enc);
3749           }
3750           xfree(enc);
3751           spin->si_conv.vc_fail = true;
3752         }
3753         continue;
3754       }
3755 
3756       if (STRNCMP(line, "regions=", 8) == 0) {
3757         if (spin->si_region_count > 1) {
3758           smsg(_("Duplicate /regions= line ignored in %s line %ld: %s"),
3759                fname, lnum, line);
3760         } else {
3761           line += 8;
3762           if (STRLEN(line) > MAXREGIONS * 2) {
3763             smsg(_("Too many regions in %s line %ld: %s"),
3764                  fname, lnum, line);
3765           } else {
3766             spin->si_region_count = (int)STRLEN(line) / 2;
3767             STRCPY(spin->si_region_name, line);
3768 
3769             // Adjust the mask for a word valid in all regions.
3770             spin->si_region = (1 << spin->si_region_count) - 1;
3771           }
3772         }
3773         continue;
3774       }
3775 
3776       smsg(_("/ line ignored in %s line %ld: %s"),
3777            fname, lnum, line - 1);
3778       continue;
3779     }
3780 
3781     flags = 0;
3782     regionmask = spin->si_region;
3783 
3784     // Check for flags and region after a slash.
3785     p = vim_strchr(line, '/');
3786     if (p != NULL) {
3787       *p++ = NUL;
3788       while (*p != NUL) {
3789         if (*p == '=') {                // keep-case word
3790           flags |= WF_KEEPCAP | WF_FIXCAP;
3791         } else if (*p == '!') {                  // Bad, bad, wicked word.
3792           flags |= WF_BANNED;
3793         } else if (*p == '?') {                  // Rare word.
3794           flags |= WF_RARE;
3795         } else if (ascii_isdigit(*p)) {              // region number(s)
3796           if ((flags & WF_REGION) == 0) {           // first one
3797             regionmask = 0;
3798           }
3799           flags |= WF_REGION;
3800 
3801           l = *p - '0';
3802           if (l == 0 || l > spin->si_region_count) {
3803             smsg(_("Invalid region nr in %s line %ld: %s"),
3804                  fname, lnum, p);
3805             break;
3806           }
3807           regionmask |= 1 << (l - 1);
3808         } else {
3809           smsg(_("Unrecognized flags in %s line %ld: %s"),
3810                fname, lnum, p);
3811           break;
3812         }
3813         ++p;
3814       }
3815     }
3816 
3817     // Skip non-ASCII words when "spin->si_ascii" is true.
3818     if (spin->si_ascii && has_non_ascii(line)) {
3819       ++non_ascii;
3820       continue;
3821     }
3822 
3823     // Normal word: store it.
3824     if (store_word(spin, line, flags, regionmask, NULL, false) == FAIL) {
3825       retval = FAIL;
3826       break;
3827     }
3828     did_word = true;
3829   }
3830 
3831   xfree(pc);
3832   fclose(fd);
3833 
3834   if (spin->si_ascii && non_ascii > 0) {
3835     vim_snprintf((char *)IObuff, IOSIZE,
3836                  _("Ignored %d words with non-ASCII characters"), non_ascii);
3837     spell_message(spin, IObuff);
3838   }
3839 
3840   return retval;
3841 }
3842 
3843 /// Get part of an sblock_T, "len" bytes long.
3844 /// This avoids calling free() for every little struct we use (and keeping
3845 /// track of them).
3846 /// The memory is cleared to all zeros.
3847 ///
3848 /// @param len Length needed (<= SBLOCKSIZE).
3849 /// @param align Align for pointer.
3850 /// @return Pointer into block data.
getroom(spellinfo_T * spin,size_t len,bool align)3851 static void *getroom(spellinfo_T *spin, size_t len, bool align)
3852   FUNC_ATTR_NONNULL_RET
3853 {
3854   char_u *p;
3855   sblock_T *bl = spin->si_blocks;
3856 
3857   assert(len <= SBLOCKSIZE);
3858 
3859   if (align && bl != NULL) {
3860     // Round size up for alignment.  On some systems structures need to be
3861     // aligned to the size of a pointer (e.g., SPARC).
3862     bl->sb_used = (bl->sb_used + sizeof(char *) - 1)
3863                   & ~(sizeof(char *) - 1);
3864   }
3865 
3866   if (bl == NULL || bl->sb_used + len > SBLOCKSIZE) {
3867     // Allocate a block of memory. It is not freed until much later.
3868     bl = xcalloc(1, (sizeof(sblock_T) + SBLOCKSIZE));
3869     bl->sb_next = spin->si_blocks;
3870     spin->si_blocks = bl;
3871     bl->sb_used = 0;
3872     ++spin->si_blocks_cnt;
3873   }
3874 
3875   p = bl->sb_data + bl->sb_used;
3876   bl->sb_used += (int)len;
3877 
3878   return p;
3879 }
3880 
3881 // Make a copy of a string into memory allocated with getroom().
3882 // Returns NULL when out of memory.
getroom_save(spellinfo_T * spin,char_u * s)3883 static char_u *getroom_save(spellinfo_T *spin, char_u *s)
3884 {
3885   const size_t s_size = STRLEN(s) + 1;
3886   return memcpy(getroom(spin, s_size, false), s, s_size);
3887 }
3888 
3889 
3890 // Free the list of allocated sblock_T.
free_blocks(sblock_T * bl)3891 static void free_blocks(sblock_T *bl)
3892 {
3893   sblock_T *next;
3894 
3895   while (bl != NULL) {
3896     next = bl->sb_next;
3897     xfree(bl);
3898     bl = next;
3899   }
3900 }
3901 
3902 // Allocate the root of a word tree.
3903 // Returns NULL when out of memory.
wordtree_alloc(spellinfo_T * spin)3904 static wordnode_T *wordtree_alloc(spellinfo_T *spin)
3905   FUNC_ATTR_NONNULL_RET
3906 {
3907   return (wordnode_T *)getroom(spin, sizeof(wordnode_T), true);
3908 }
3909 
3910 /// Store a word in the tree(s).
3911 /// Always store it in the case-folded tree.  For a keep-case word this is
3912 /// useful when the word can also be used with all caps (no WF_FIXCAP flag) and
3913 /// used to find suggestions.
3914 /// For a keep-case word also store it in the keep-case tree.
3915 /// When "pfxlist" is not NULL store the word for each postponed prefix ID and
3916 /// compound flag.
3917 ///
3918 /// @param flags  extra flags, wf_banned
3919 /// @param region  supported region(s)
3920 /// @param pfxlist  list of prefix ids or null
3921 /// @param need_affix  only store word with affix id
store_word(spellinfo_T * spin,char_u * word,int flags,int region,const char_u * pfxlist,bool need_affix)3922 static int store_word(spellinfo_T *spin, char_u *word, int flags, int region, const char_u *pfxlist,
3923                       bool need_affix)
3924 {
3925   int len = (int)STRLEN(word);
3926   int ct = captype(word, word + len);
3927   char_u foldword[MAXWLEN];
3928   int res = OK;
3929 
3930   (void)spell_casefold(curwin, word, len, foldword, MAXWLEN);
3931   for (const char_u *p = pfxlist; res == OK; p++) {
3932     if (!need_affix || (p != NULL && *p != NUL)) {
3933       res = tree_add_word(spin, foldword, spin->si_foldroot, ct | flags,
3934                           region, p == NULL ? 0 : *p);
3935     }
3936     if (p == NULL || *p == NUL) {
3937       break;
3938     }
3939   }
3940   ++spin->si_foldwcount;
3941 
3942   if (res == OK && (ct == WF_KEEPCAP || (flags & WF_KEEPCAP))) {
3943     for (const char_u *p = pfxlist; res == OK; p++) {
3944       if (!need_affix || (p != NULL && *p != NUL)) {
3945         res = tree_add_word(spin, word, spin->si_keeproot, flags,
3946                             region, p == NULL ? 0 : *p);
3947       }
3948       if (p == NULL || *p == NUL) {
3949         break;
3950       }
3951     }
3952     ++spin->si_keepwcount;
3953   }
3954   return res;
3955 }
3956 
3957 // Add word "word" to a word tree at "root".
3958 // When "flags" < 0 we are adding to the prefix tree where "flags" is used for
3959 // "rare" and "region" is the condition nr.
3960 // Returns FAIL when out of memory.
tree_add_word(spellinfo_T * spin,char_u * word,wordnode_T * root,int flags,int region,int affixID)3961 static int tree_add_word(spellinfo_T *spin, char_u *word, wordnode_T *root, int flags, int region,
3962                          int affixID)
3963 {
3964   wordnode_T *node = root;
3965   wordnode_T *np;
3966   wordnode_T *copyp, **copyprev;
3967   wordnode_T **prev = NULL;
3968   int i;
3969 
3970   // Add each byte of the word to the tree, including the NUL at the end.
3971   for (i = 0;; ++i) {
3972     // When there is more than one reference to this node we need to make
3973     // a copy, so that we can modify it.  Copy the whole list of siblings
3974     // (we don't optimize for a partly shared list of siblings).
3975     if (node != NULL && node->wn_refs > 1) {
3976       --node->wn_refs;
3977       copyprev = prev;
3978       for (copyp = node; copyp != NULL; copyp = copyp->wn_sibling) {
3979         // Allocate a new node and copy the info.
3980         np = get_wordnode(spin);
3981         if (np == NULL) {
3982           return FAIL;
3983         }
3984         np->wn_child = copyp->wn_child;
3985         if (np->wn_child != NULL) {
3986           ++np->wn_child->wn_refs;              // child gets extra ref
3987         }
3988         np->wn_byte = copyp->wn_byte;
3989         if (np->wn_byte == NUL) {
3990           np->wn_flags = copyp->wn_flags;
3991           np->wn_region = copyp->wn_region;
3992           np->wn_affixID = copyp->wn_affixID;
3993         }
3994 
3995         // Link the new node in the list, there will be one ref.
3996         np->wn_refs = 1;
3997         if (copyprev != NULL) {
3998           *copyprev = np;
3999         }
4000         copyprev = &np->wn_sibling;
4001 
4002         // Let "node" point to the head of the copied list.
4003         if (copyp == node) {
4004           node = np;
4005         }
4006       }
4007     }
4008 
4009     // Look for the sibling that has the same character.  They are sorted
4010     // on byte value, thus stop searching when a sibling is found with a
4011     // higher byte value.  For zero bytes (end of word) the sorting is
4012     // done on flags and then on affixID.
4013     while (node != NULL
4014            && (node->wn_byte < word[i]
4015                || (node->wn_byte == NUL
4016                    && (flags < 0
4017                        ? node->wn_affixID < (unsigned)affixID
4018                        : (node->wn_flags < (unsigned)(flags & WN_MASK)
4019                           || (node->wn_flags == (flags & WN_MASK)
4020                               && (spin->si_sugtree
4021                                   ? (node->wn_region & 0xffff) < region
4022                                   : node->wn_affixID
4023                                   < (unsigned)affixID))))))) {
4024       prev = &node->wn_sibling;
4025       node = *prev;
4026     }
4027     if (node == NULL
4028         || node->wn_byte != word[i]
4029         || (word[i] == NUL
4030             && (flags < 0
4031                 || spin->si_sugtree
4032                 || node->wn_flags != (flags & WN_MASK)
4033                 || node->wn_affixID != affixID))) {
4034       // Allocate a new node.
4035       np = get_wordnode(spin);
4036       if (np == NULL) {
4037         return FAIL;
4038       }
4039       np->wn_byte = word[i];
4040 
4041       // If "node" is NULL this is a new child or the end of the sibling
4042       // list: ref count is one.  Otherwise use ref count of sibling and
4043       // make ref count of sibling one (matters when inserting in front
4044       // of the list of siblings).
4045       if (node == NULL) {
4046         np->wn_refs = 1;
4047       } else {
4048         np->wn_refs = node->wn_refs;
4049         node->wn_refs = 1;
4050       }
4051       if (prev != NULL) {
4052         *prev = np;
4053       }
4054       np->wn_sibling = node;
4055       node = np;
4056     }
4057 
4058     if (word[i] == NUL) {
4059       node->wn_flags = flags;
4060       node->wn_region |= region;
4061       node->wn_affixID = affixID;
4062       break;
4063     }
4064     prev = &node->wn_child;
4065     node = *prev;
4066   }
4067 #ifdef SPELL_PRINTTREE
4068   smsg((char_u *)"Added \"%s\"", word);
4069   spell_print_tree(root->wn_sibling);
4070 #endif
4071 
4072   // count nr of words added since last message
4073   ++spin->si_msg_count;
4074 
4075   if (spin->si_compress_cnt > 1) {
4076     if (--spin->si_compress_cnt == 1) {
4077       // Did enough words to lower the block count limit.
4078       spin->si_blocks_cnt += compress_inc;
4079     }
4080   }
4081 
4082   // When we have allocated lots of memory we need to compress the word tree
4083   // to free up some room.  But compression is slow, and we might actually
4084   // need that room, thus only compress in the following situations:
4085   // 1. When not compressed before (si_compress_cnt == 0): when using
4086   //    "compress_start" blocks.
4087   // 2. When compressed before and used "compress_inc" blocks before
4088   //    adding "compress_added" words (si_compress_cnt > 1).
4089   // 3. When compressed before, added "compress_added" words
4090   //    (si_compress_cnt == 1) and the number of free nodes drops below the
4091   //    maximum word length.
4092 #ifndef SPELL_COMPRESS_ALLWAYS
4093   if (spin->si_compress_cnt == 1       // NOLINT(readability/braces)
4094       ? spin->si_free_count < MAXWLEN
4095       : spin->si_blocks_cnt >= compress_start)
4096 #endif
4097   {
4098     // Decrement the block counter.  The effect is that we compress again
4099     // when the freed up room has been used and another "compress_inc"
4100     // blocks have been allocated.  Unless "compress_added" words have
4101     // been added, then the limit is put back again.
4102     spin->si_blocks_cnt -= compress_inc;
4103     spin->si_compress_cnt = compress_added;
4104 
4105     if (spin->si_verbose) {
4106       msg_start();
4107       msg_puts(_(msg_compressing));
4108       msg_clr_eos();
4109       msg_didout = false;
4110       msg_col = 0;
4111       ui_flush();
4112     }
4113 
4114     // Compress both trees.  Either they both have many nodes, which makes
4115     // compression useful, or one of them is small, which means
4116     // compression goes fast.  But when filling the soundfold word tree
4117     // there is no keep-case tree.
4118     wordtree_compress(spin, spin->si_foldroot, "case-folded");
4119     if (affixID >= 0) {
4120       wordtree_compress(spin, spin->si_keeproot, "keep-case");
4121     }
4122   }
4123 
4124   return OK;
4125 }
4126 
4127 // Get a wordnode_T, either from the list of previously freed nodes or
4128 // allocate a new one.
4129 // Returns NULL when out of memory.
get_wordnode(spellinfo_T * spin)4130 static wordnode_T *get_wordnode(spellinfo_T *spin)
4131 {
4132   wordnode_T *n;
4133 
4134   if (spin->si_first_free == NULL) {
4135     n = (wordnode_T *)getroom(spin, sizeof(wordnode_T), true);
4136   } else {
4137     n = spin->si_first_free;
4138     spin->si_first_free = n->wn_child;
4139     memset(n, 0, sizeof(wordnode_T));
4140     --spin->si_free_count;
4141   }
4142 #ifdef SPELL_PRINTTREE
4143   if (n != NULL) {
4144     n->wn_nr = ++spin->si_wordnode_nr;
4145   }
4146 #endif
4147   return n;
4148 }
4149 
4150 // Decrement the reference count on a node (which is the head of a list of
4151 // siblings).  If the reference count becomes zero free the node and its
4152 // siblings.
4153 // Returns the number of nodes actually freed.
deref_wordnode(spellinfo_T * spin,wordnode_T * node)4154 static int deref_wordnode(spellinfo_T *spin, wordnode_T *node)
4155   FUNC_ATTR_NONNULL_ALL
4156 {
4157   wordnode_T *np;
4158   int cnt = 0;
4159 
4160   if (--node->wn_refs == 0) {
4161     for (np = node; np != NULL; np = np->wn_sibling) {
4162       if (np->wn_child != NULL) {
4163         cnt += deref_wordnode(spin, np->wn_child);
4164       }
4165       free_wordnode(spin, np);
4166       ++cnt;
4167     }
4168     ++cnt;          // length field
4169   }
4170   return cnt;
4171 }
4172 
4173 // Free a wordnode_T for re-use later.
4174 // Only the "wn_child" field becomes invalid.
free_wordnode(spellinfo_T * spin,wordnode_T * n)4175 static void free_wordnode(spellinfo_T *spin, wordnode_T *n)
4176   FUNC_ATTR_NONNULL_ALL
4177 {
4178   n->wn_child = spin->si_first_free;
4179   spin->si_first_free = n;
4180   ++spin->si_free_count;
4181 }
4182 
4183 // Compress a tree: find tails that are identical and can be shared.
wordtree_compress(spellinfo_T * spin,wordnode_T * root,const char * name)4184 static void wordtree_compress(spellinfo_T *spin, wordnode_T *root, const char *name)
4185   FUNC_ATTR_NONNULL_ALL
4186 {
4187   hashtab_T ht;
4188   long tot = 0;
4189   long perc;
4190 
4191   // Skip the root itself, it's not actually used.  The first sibling is the
4192   // start of the tree.
4193   if (root->wn_sibling != NULL) {
4194     hash_init(&ht);
4195     const long n = node_compress(spin, root->wn_sibling, &ht, &tot);
4196 
4197 #ifndef SPELL_PRINTTREE
4198     if (spin->si_verbose || p_verbose > 2)
4199 #endif
4200     {
4201       if (tot > 1000000) {
4202         perc = (tot - n) / (tot / 100);
4203       } else if (tot == 0) {
4204         perc = 0;
4205       } else {
4206         perc = (tot - n) * 100 / tot;
4207       }
4208       vim_snprintf((char *)IObuff, IOSIZE,
4209                    _("Compressed %s of %ld nodes; %ld (%ld%%) remaining"),
4210                    name, tot, tot - n, perc);
4211       spell_message(spin, IObuff);
4212     }
4213 #ifdef SPELL_PRINTTREE
4214     spell_print_tree(root->wn_sibling);
4215 #endif
4216     hash_clear(&ht);
4217   }
4218 }
4219 
4220 /// Compress a node, its siblings and its children, depth first.
4221 /// Returns the number of compressed nodes.
4222 ///
4223 /// @param tot  total count of nodes before compressing, incremented while going through the tree
node_compress(spellinfo_T * spin,wordnode_T * node,hashtab_T * ht,long * tot)4224 static long node_compress(spellinfo_T *spin, wordnode_T *node, hashtab_T *ht, long *tot)
4225   FUNC_ATTR_NONNULL_ALL
4226 {
4227   wordnode_T *np;
4228   wordnode_T *tp;
4229   wordnode_T *child;
4230   hash_T hash;
4231   hashitem_T *hi;
4232   long len = 0;
4233   unsigned nr, n;
4234   long compressed = 0;
4235 
4236   // Go through the list of siblings.  Compress each child and then try
4237   // finding an identical child to replace it.
4238   // Note that with "child" we mean not just the node that is pointed to,
4239   // but the whole list of siblings of which the child node is the first.
4240   for (np = node; np != NULL && !got_int; np = np->wn_sibling) {
4241     ++len;
4242     if ((child = np->wn_child) != NULL) {
4243       // Compress the child first.  This fills hashkey.
4244       compressed += node_compress(spin, child, ht, tot);
4245 
4246       // Try to find an identical child.
4247       hash = hash_hash(child->wn_u1.hashkey);
4248       hi = hash_lookup(ht, (const char *)child->wn_u1.hashkey,
4249                        STRLEN(child->wn_u1.hashkey), hash);
4250       if (!HASHITEM_EMPTY(hi)) {
4251         // There are children we encountered before with a hash value
4252         // identical to the current child.  Now check if there is one
4253         // that is really identical.
4254         for (tp = HI2WN(hi); tp != NULL; tp = tp->wn_u2.next) {
4255           if (node_equal(child, tp)) {
4256             // Found one!  Now use that child in place of the
4257             // current one.  This means the current child and all
4258             // its siblings is unlinked from the tree.
4259             ++tp->wn_refs;
4260             compressed += deref_wordnode(spin, child);
4261             np->wn_child = tp;
4262             break;
4263           }
4264         }
4265         if (tp == NULL) {
4266           // No other child with this hash value equals the child of
4267           // the node, add it to the linked list after the first
4268           // item.
4269           tp = HI2WN(hi);
4270           child->wn_u2.next = tp->wn_u2.next;
4271           tp->wn_u2.next = child;
4272         }
4273       } else {
4274         // No other child has this hash value, add it to the
4275         // hashtable.
4276         hash_add_item(ht, hi, child->wn_u1.hashkey, hash);
4277       }
4278     }
4279   }
4280   *tot += len + 1;      // add one for the node that stores the length
4281 
4282   // Make a hash key for the node and its siblings, so that we can quickly
4283   // find a lookalike node.  This must be done after compressing the sibling
4284   // list, otherwise the hash key would become invalid by the compression.
4285   node->wn_u1.hashkey[0] = len;
4286   nr = 0;
4287   for (np = node; np != NULL; np = np->wn_sibling) {
4288     if (np->wn_byte == NUL) {
4289       // end node: use wn_flags, wn_region and wn_affixID
4290       n = np->wn_flags + (np->wn_region << 8) + (np->wn_affixID << 16);
4291     } else {
4292       // byte node: use the byte value and the child pointer
4293       n = (unsigned)(np->wn_byte + ((uintptr_t)np->wn_child << 8));
4294     }
4295     nr = nr * 101 + n;
4296   }
4297 
4298   // Avoid NUL bytes, it terminates the hash key.
4299   n = nr & 0xff;
4300   node->wn_u1.hashkey[1] = n == 0 ? 1 : n;
4301   n = (nr >> 8) & 0xff;
4302   node->wn_u1.hashkey[2] = n == 0 ? 1 : n;
4303   n = (nr >> 16) & 0xff;
4304   node->wn_u1.hashkey[3] = n == 0 ? 1 : n;
4305   n = (nr >> 24) & 0xff;
4306   node->wn_u1.hashkey[4] = n == 0 ? 1 : n;
4307   node->wn_u1.hashkey[5] = NUL;
4308 
4309   // Check for CTRL-C pressed now and then.
4310   veryfast_breakcheck();
4311 
4312   return compressed;
4313 }
4314 
4315 // Returns true when two nodes have identical siblings and children.
node_equal(wordnode_T * n1,wordnode_T * n2)4316 static bool node_equal(wordnode_T *n1, wordnode_T *n2)
4317 {
4318   wordnode_T *p1;
4319   wordnode_T *p2;
4320 
4321   for (p1 = n1, p2 = n2; p1 != NULL && p2 != NULL;
4322        p1 = p1->wn_sibling, p2 = p2->wn_sibling) {
4323     if (p1->wn_byte != p2->wn_byte
4324         || (p1->wn_byte == NUL
4325             ? (p1->wn_flags != p2->wn_flags
4326                || p1->wn_region != p2->wn_region
4327                || p1->wn_affixID != p2->wn_affixID)
4328             : (p1->wn_child != p2->wn_child))) {
4329       break;
4330     }
4331   }
4332 
4333   return p1 == NULL && p2 == NULL;
4334 }
4335 
4336 
4337 // Function given to qsort() to sort the REP items on "from" string.
rep_compare(const void * s1,const void * s2)4338 static int rep_compare(const void *s1, const void *s2)
4339 {
4340   fromto_T *p1 = (fromto_T *)s1;
4341   fromto_T *p2 = (fromto_T *)s2;
4342 
4343   return STRCMP(p1->ft_from, p2->ft_from);
4344 }
4345 
4346 // Write the Vim .spl file "fname".
4347 // Return OK/FAIL.
write_vim_spell(spellinfo_T * spin,char_u * fname)4348 static int write_vim_spell(spellinfo_T *spin, char_u *fname)
4349 {
4350   int retval = OK;
4351   int regionmask;
4352 
4353   FILE *fd = os_fopen((char *)fname, "w");
4354   if (fd == NULL) {
4355     semsg(_(e_notopen), fname);
4356     return FAIL;
4357   }
4358 
4359   // <HEADER>: <fileID> <versionnr>
4360   // <fileID>
4361   size_t fwv = fwrite(VIMSPELLMAGIC, VIMSPELLMAGICL, 1, fd);
4362   if (fwv != (size_t)1) {
4363     // Catch first write error, don't try writing more.
4364     goto theend;
4365   }
4366 
4367   putc(VIMSPELLVERSION, fd);                                // <versionnr>
4368 
4369   // <SECTIONS>: <section> ... <sectionend>
4370 
4371   // SN_INFO: <infotext>
4372   if (spin->si_info != NULL) {
4373     putc(SN_INFO, fd);                                  // <sectionID>
4374     putc(0, fd);                                        // <sectionflags>
4375     size_t i = STRLEN(spin->si_info);
4376     put_bytes(fd, i, 4);                                // <sectionlen>
4377     fwv &= fwrite(spin->si_info, i, 1, fd);             // <infotext>
4378   }
4379 
4380   // SN_REGION: <regionname> ...
4381   // Write the region names only if there is more than one.
4382   if (spin->si_region_count > 1) {
4383     putc(SN_REGION, fd);                                // <sectionID>
4384     putc(SNF_REQUIRED, fd);                             // <sectionflags>
4385     size_t l = (size_t)spin->si_region_count * 2;
4386     put_bytes(fd, l, 4);                                // <sectionlen>
4387     fwv &= fwrite(spin->si_region_name, l, 1, fd);
4388     // <regionname> ...
4389     regionmask = (1 << spin->si_region_count) - 1;
4390   } else {
4391     regionmask = 0;
4392   }
4393 
4394   // SN_CHARFLAGS: <charflagslen> <charflags> <folcharslen> <folchars>
4395   //
4396   // The table with character flags and the table for case folding.
4397   // This makes sure the same characters are recognized as word characters
4398   // when generating an when using a spell file.
4399   // Skip this for ASCII, the table may conflict with the one used for
4400   // 'encoding'.
4401   // Also skip this for an .add.spl file, the main spell file must contain
4402   // the table (avoids that it conflicts).  File is shorter too.
4403   if (!spin->si_ascii && !spin->si_add) {
4404     char_u folchars[128 * 8];
4405     int flags;
4406 
4407     putc(SN_CHARFLAGS, fd);                             // <sectionID>
4408     putc(SNF_REQUIRED, fd);                             // <sectionflags>
4409 
4410     // Form the <folchars> string first, we need to know its length.
4411     size_t l = 0;
4412     for (size_t i = 128; i < 256; i++) {
4413       l += (size_t)utf_char2bytes(spelltab.st_fold[i], folchars + l);
4414     }
4415     put_bytes(fd, 1 + 128 + 2 + l, 4);                  // <sectionlen>
4416 
4417     fputc(128, fd);                                     // <charflagslen>
4418     for (size_t i = 128; i < 256; ++i) {
4419       flags = 0;
4420       if (spelltab.st_isw[i]) {
4421         flags |= CF_WORD;
4422       }
4423       if (spelltab.st_isu[i]) {
4424         flags |= CF_UPPER;
4425       }
4426       fputc(flags, fd);                                 // <charflags>
4427     }
4428 
4429     put_bytes(fd, l, 2);                                // <folcharslen>
4430     fwv &= fwrite(folchars, l, 1, fd);                  // <folchars>
4431   }
4432 
4433   // SN_MIDWORD: <midword>
4434   if (spin->si_midword != NULL) {
4435     putc(SN_MIDWORD, fd);                               // <sectionID>
4436     putc(SNF_REQUIRED, fd);                             // <sectionflags>
4437 
4438     size_t i = STRLEN(spin->si_midword);
4439     put_bytes(fd, i, 4);                                // <sectionlen>
4440     fwv &= fwrite(spin->si_midword, i, 1, fd);
4441     // <midword>
4442   }
4443 
4444   // SN_PREFCOND: <prefcondcnt> <prefcond> ...
4445   if (!GA_EMPTY(&spin->si_prefcond)) {
4446     putc(SN_PREFCOND, fd);                              // <sectionID>
4447     putc(SNF_REQUIRED, fd);                             // <sectionflags>
4448 
4449     size_t l = (size_t)write_spell_prefcond(NULL, &spin->si_prefcond);
4450     put_bytes(fd, l, 4);                                // <sectionlen>
4451 
4452     write_spell_prefcond(fd, &spin->si_prefcond);
4453   }
4454 
4455   // SN_REP: <repcount> <rep> ...
4456   // SN_SAL: <salflags> <salcount> <sal> ...
4457   // SN_REPSAL: <repcount> <rep> ...
4458 
4459   // round 1: SN_REP section
4460   // round 2: SN_SAL section (unless SN_SOFO is used)
4461   // round 3: SN_REPSAL section
4462   for (unsigned int round = 1; round <= 3; ++round) {
4463     garray_T *gap;
4464     if (round == 1) {
4465       gap = &spin->si_rep;
4466     } else if (round == 2) {
4467       // Don't write SN_SAL when using a SN_SOFO section
4468       if (spin->si_sofofr != NULL && spin->si_sofoto != NULL) {
4469         continue;
4470       }
4471       gap = &spin->si_sal;
4472     } else {
4473       gap = &spin->si_repsal;
4474     }
4475 
4476     // Don't write the section if there are no items.
4477     if (GA_EMPTY(gap)) {
4478       continue;
4479     }
4480 
4481     // Sort the REP/REPSAL items.
4482     if (round != 2) {
4483       qsort(gap->ga_data, (size_t)gap->ga_len,
4484             sizeof(fromto_T), rep_compare);
4485     }
4486 
4487     int sect_id = round == 1 ? SN_REP : (round == 2 ? SN_SAL : SN_REPSAL);
4488     putc(sect_id, fd);                                  // <sectionID>
4489 
4490     // This is for making suggestions, section is not required.
4491     putc(0, fd);                                        // <sectionflags>
4492 
4493     // Compute the length of what follows.
4494     size_t l = 2;  // count <repcount> or <salcount>
4495     assert(gap->ga_len >= 0);
4496     for (size_t i = 0; i < (size_t)gap->ga_len; ++i) {
4497       fromto_T *ftp = &((fromto_T *)gap->ga_data)[i];
4498       l += 1 + STRLEN(ftp->ft_from);  // count <*fromlen> and <*from>
4499       l += 1 + STRLEN(ftp->ft_to);    // count <*tolen> and <*to>
4500     }
4501     if (round == 2) {
4502       ++l;                            // count <salflags>
4503     }
4504     put_bytes(fd, l, 4);                                // <sectionlen>
4505 
4506     if (round == 2) {
4507       int i = 0;
4508       if (spin->si_followup) {
4509         i |= SAL_F0LLOWUP;
4510       }
4511       if (spin->si_collapse) {
4512         i |= SAL_COLLAPSE;
4513       }
4514       if (spin->si_rem_accents) {
4515         i |= SAL_REM_ACCENTS;
4516       }
4517       putc(i, fd);                                      // <salflags>
4518     }
4519 
4520     put_bytes(fd, (uintmax_t)gap->ga_len, 2);    // <repcount> or <salcount>
4521     for (size_t i = 0; i < (size_t)gap->ga_len; ++i) {
4522       // <rep> : <repfromlen> <repfrom> <reptolen> <repto>
4523       // <sal> : <salfromlen> <salfrom> <saltolen> <salto>
4524       fromto_T *ftp = &((fromto_T *)gap->ga_data)[i];
4525       for (unsigned int rr = 1; rr <= 2; ++rr) {
4526         char_u *p = rr == 1 ? ftp->ft_from : ftp->ft_to;
4527         l = STRLEN(p);
4528         assert(l < INT_MAX);
4529         putc((int)l, fd);
4530         if (l > 0) {
4531           fwv &= fwrite(p, l, 1, fd);
4532         }
4533       }
4534     }
4535   }
4536 
4537   // SN_SOFO: <sofofromlen> <sofofrom> <sofotolen> <sofoto>
4538   // This is for making suggestions, section is not required.
4539   if (spin->si_sofofr != NULL && spin->si_sofoto != NULL) {
4540     putc(SN_SOFO, fd);                                  // <sectionID>
4541     putc(0, fd);                                        // <sectionflags>
4542 
4543     size_t l = STRLEN(spin->si_sofofr);
4544     put_bytes(fd, l + STRLEN(spin->si_sofoto) + 4, 4);  // <sectionlen>
4545 
4546     put_bytes(fd, l, 2);                                // <sofofromlen>
4547     fwv &= fwrite(spin->si_sofofr, l, 1, fd);           // <sofofrom>
4548 
4549     l = STRLEN(spin->si_sofoto);
4550     put_bytes(fd, l, 2);                                // <sofotolen>
4551     fwv &= fwrite(spin->si_sofoto, l, 1, fd);           // <sofoto>
4552   }
4553 
4554   // SN_WORDS: <word> ...
4555   // This is for making suggestions, section is not required.
4556   if (spin->si_commonwords.ht_used > 0) {
4557     putc(SN_WORDS, fd);                                 // <sectionID>
4558     putc(0, fd);                                        // <sectionflags>
4559 
4560     // round 1: count the bytes
4561     // round 2: write the bytes
4562     for (unsigned int round = 1; round <= 2; ++round) {
4563       size_t todo;
4564       size_t len = 0;
4565       hashitem_T *hi;
4566 
4567       todo = spin->si_commonwords.ht_used;
4568       for (hi = spin->si_commonwords.ht_array; todo > 0; ++hi) {
4569         if (!HASHITEM_EMPTY(hi)) {
4570           size_t l = STRLEN(hi->hi_key) + 1;
4571           len += l;
4572           if (round == 2) {                             // <word>
4573             fwv &= fwrite(hi->hi_key, l, 1, fd);
4574           }
4575           --todo;
4576         }
4577       }
4578       if (round == 1) {
4579         put_bytes(fd, len, 4);                          // <sectionlen>
4580       }
4581     }
4582   }
4583 
4584   // SN_MAP: <mapstr>
4585   // This is for making suggestions, section is not required.
4586   if (!GA_EMPTY(&spin->si_map)) {
4587     putc(SN_MAP, fd);                                   // <sectionID>
4588     putc(0, fd);                                        // <sectionflags>
4589     size_t l = (size_t)spin->si_map.ga_len;
4590     put_bytes(fd, l, 4);                                // <sectionlen>
4591     fwv &= fwrite(spin->si_map.ga_data, l, 1, fd);      // <mapstr>
4592   }
4593 
4594   // SN_SUGFILE: <timestamp>
4595   // This is used to notify that a .sug file may be available and at the
4596   // same time allows for checking that a .sug file that is found matches
4597   // with this .spl file.  That's because the word numbers must be exactly
4598   // right.
4599   if (!spin->si_nosugfile
4600       && (!GA_EMPTY(&spin->si_sal)
4601           || (spin->si_sofofr != NULL && spin->si_sofoto != NULL))) {
4602     putc(SN_SUGFILE, fd);                               // <sectionID>
4603     putc(0, fd);                                        // <sectionflags>
4604     put_bytes(fd, 8, 4);                                // <sectionlen>
4605 
4606     // Set si_sugtime and write it to the file.
4607     spin->si_sugtime = time(NULL);
4608     put_time(fd, spin->si_sugtime);                     // <timestamp>
4609   }
4610 
4611   // SN_NOSPLITSUGS: nothing
4612   // This is used to notify that no suggestions with word splits are to be
4613   // made.
4614   if (spin->si_nosplitsugs) {
4615     putc(SN_NOSPLITSUGS, fd);                           // <sectionID>
4616     putc(0, fd);                                        // <sectionflags>
4617     put_bytes(fd, 0, 4);                                // <sectionlen>
4618   }
4619 
4620   // SN_NOCOMPUNDSUGS: nothing
4621   // This is used to notify that no suggestions with compounds are to be
4622   // made.
4623   if (spin->si_nocompoundsugs) {
4624     putc(SN_NOCOMPOUNDSUGS, fd);                        // <sectionID>
4625     putc(0, fd);                                        // <sectionflags>
4626     put_bytes(fd, 0, 4);                                // <sectionlen>
4627   }
4628 
4629   // SN_COMPOUND: compound info.
4630   // We don't mark it required, when not supported all compound words will
4631   // be bad words.
4632   if (spin->si_compflags != NULL) {
4633     putc(SN_COMPOUND, fd);                              // <sectionID>
4634     putc(0, fd);                                        // <sectionflags>
4635 
4636     size_t l = STRLEN(spin->si_compflags);
4637     assert(spin->si_comppat.ga_len >= 0);
4638     for (size_t i = 0; i < (size_t)spin->si_comppat.ga_len; ++i) {
4639       l += STRLEN(((char_u **)(spin->si_comppat.ga_data))[i]) + 1;
4640     }
4641     put_bytes(fd, l + 7, 4);                            // <sectionlen>
4642 
4643     putc(spin->si_compmax, fd);                         // <compmax>
4644     putc(spin->si_compminlen, fd);                      // <compminlen>
4645     putc(spin->si_compsylmax, fd);                      // <compsylmax>
4646     putc(0, fd);                // for Vim 7.0b compatibility
4647     putc(spin->si_compoptions, fd);                     // <compoptions>
4648     put_bytes(fd, (uintmax_t)spin->si_comppat.ga_len, 2);  // <comppatcount>
4649     for (size_t i = 0; i < (size_t)spin->si_comppat.ga_len; ++i) {
4650       char_u *p = ((char_u **)(spin->si_comppat.ga_data))[i];
4651       assert(STRLEN(p) < INT_MAX);
4652       putc((int)STRLEN(p), fd);                         // <comppatlen>
4653       fwv &= fwrite(p, STRLEN(p), 1, fd);               // <comppattext>
4654     }
4655     // <compflags>
4656     fwv &= fwrite(spin->si_compflags, STRLEN(spin->si_compflags), 1, fd);
4657   }
4658 
4659   // SN_NOBREAK: NOBREAK flag
4660   if (spin->si_nobreak) {
4661     putc(SN_NOBREAK, fd);                               // <sectionID>
4662     putc(0, fd);                                        // <sectionflags>
4663 
4664     // It's empty, the presence of the section flags the feature.
4665     put_bytes(fd, 0, 4);                                // <sectionlen>
4666   }
4667 
4668   // SN_SYLLABLE: syllable info.
4669   // We don't mark it required, when not supported syllables will not be
4670   // counted.
4671   if (spin->si_syllable != NULL) {
4672     putc(SN_SYLLABLE, fd);                              // <sectionID>
4673     putc(0, fd);                                        // <sectionflags>
4674 
4675     size_t l = STRLEN(spin->si_syllable);
4676     put_bytes(fd, l, 4);                                // <sectionlen>
4677     fwv &= fwrite(spin->si_syllable, l, 1, fd);         // <syllable>
4678   }
4679 
4680   // end of <SECTIONS>
4681   putc(SN_END, fd);                                     // <sectionend>
4682 
4683 
4684   // <LWORDTREE>  <KWORDTREE>  <PREFIXTREE>
4685   spin->si_memtot = 0;
4686   for (unsigned int round = 1; round <= 3; ++round) {
4687     wordnode_T *tree;
4688     if (round == 1) {
4689       tree = spin->si_foldroot->wn_sibling;
4690     } else if (round == 2) {
4691       tree = spin->si_keeproot->wn_sibling;
4692     } else {
4693       tree = spin->si_prefroot->wn_sibling;
4694     }
4695 
4696     // Clear the index and wnode fields in the tree.
4697     clear_node(tree);
4698 
4699     // Count the number of nodes.  Needed to be able to allocate the
4700     // memory when reading the nodes.  Also fills in index for shared
4701     // nodes.
4702     size_t nodecount = (size_t)put_node(NULL, tree, 0, regionmask, round == 3);
4703 
4704     // number of nodes in 4 bytes
4705     put_bytes(fd, nodecount, 4);                        // <nodecount>
4706     assert(nodecount + nodecount * sizeof(int) < INT_MAX);
4707     spin->si_memtot += (int)(nodecount + nodecount * sizeof(int));
4708 
4709     // Write the nodes.
4710     (void)put_node(fd, tree, 0, regionmask, round == 3);
4711   }
4712 
4713   // Write another byte to check for errors (file system full).
4714   if (putc(0, fd) == EOF) {
4715     retval = FAIL;
4716   }
4717 theend:
4718   if (fclose(fd) == EOF) {
4719     retval = FAIL;
4720   }
4721 
4722   if (fwv != (size_t)1) {
4723     retval = FAIL;
4724   }
4725   if (retval == FAIL) {
4726     emsg(_(e_write));
4727   }
4728 
4729   return retval;
4730 }
4731 
4732 // Clear the index and wnode fields of "node", it siblings and its
4733 // children.  This is needed because they are a union with other items to save
4734 // space.
clear_node(wordnode_T * node)4735 static void clear_node(wordnode_T *node)
4736 {
4737   wordnode_T *np;
4738 
4739   if (node != NULL) {
4740     for (np = node; np != NULL; np = np->wn_sibling) {
4741       np->wn_u1.index = 0;
4742       np->wn_u2.wnode = NULL;
4743 
4744       if (np->wn_byte != NUL) {
4745         clear_node(np->wn_child);
4746       }
4747     }
4748   }
4749 }
4750 
4751 
4752 /// Dump a word tree at node "node".
4753 ///
4754 /// This first writes the list of possible bytes (siblings).  Then for each
4755 /// byte recursively write the children.
4756 ///
4757 /// NOTE: The code here must match the code in read_tree_node(), since
4758 /// assumptions are made about the indexes (so that we don't have to write them
4759 /// in the file).
4760 ///
4761 /// @param fd  NULL when only counting
4762 /// @param prefixtree  true for PREFIXTREE
4763 ///
4764 /// @return  the number of nodes used.
put_node(FILE * fd,wordnode_T * node,int idx,int regionmask,bool prefixtree)4765 static int put_node(FILE *fd, wordnode_T *node, int idx, int regionmask, bool prefixtree)
4766 {
4767   // If "node" is zero the tree is empty.
4768   if (node == NULL) {
4769     return 0;
4770   }
4771 
4772   // Store the index where this node is written.
4773   node->wn_u1.index = idx;
4774 
4775   // Count the number of siblings.
4776   int siblingcount = 0;
4777   for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
4778     ++siblingcount;
4779   }
4780 
4781   // Write the sibling count.
4782   if (fd != NULL) {
4783     putc(siblingcount, fd);                             // <siblingcount>
4784   }
4785   // Write each sibling byte and optionally extra info.
4786   for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
4787     if (np->wn_byte == 0) {
4788       if (fd != NULL) {
4789         // For a NUL byte (end of word) write the flags etc.
4790         if (prefixtree) {
4791           // In PREFIXTREE write the required affixID and the
4792           // associated condition nr (stored in wn_region).  The
4793           // byte value is misused to store the "rare" and "not
4794           // combining" flags
4795           if (np->wn_flags == (uint16_t)PFX_FLAGS) {
4796             putc(BY_NOFLAGS, fd);                       // <byte>
4797           } else {
4798             putc(BY_FLAGS, fd);                         // <byte>
4799             putc(np->wn_flags, fd);                     // <pflags>
4800           }
4801           putc(np->wn_affixID, fd);                     // <affixID>
4802           put_bytes(fd, (uintmax_t)np->wn_region, 2);   // <prefcondnr>
4803         } else {
4804           // For word trees we write the flag/region items.
4805           int flags = np->wn_flags;
4806           if (regionmask != 0 && np->wn_region != regionmask) {
4807             flags |= WF_REGION;
4808           }
4809           if (np->wn_affixID != 0) {
4810             flags |= WF_AFX;
4811           }
4812           if (flags == 0) {
4813             // word without flags or region
4814             putc(BY_NOFLAGS, fd);                               // <byte>
4815           } else {
4816             if (np->wn_flags >= 0x100) {
4817               putc(BY_FLAGS2, fd);                              // <byte>
4818               putc(flags, fd);                                  // <flags>
4819               putc((int)((unsigned)flags >> 8), fd);            // <flags2>
4820             } else {
4821               putc(BY_FLAGS, fd);                               // <byte>
4822               putc(flags, fd);                                  // <flags>
4823             }
4824             if (flags & WF_REGION) {
4825               putc(np->wn_region, fd);                          // <region>
4826             }
4827             if (flags & WF_AFX) {
4828               putc(np->wn_affixID, fd);                         // <affixID>
4829             }
4830           }
4831         }
4832       }
4833     } else {
4834       if (np->wn_child->wn_u1.index != 0
4835           && np->wn_child->wn_u2.wnode != node) {
4836         // The child is written elsewhere, write the reference.
4837         if (fd != NULL) {
4838           putc(BY_INDEX, fd);                                      // <byte>
4839           put_bytes(fd, (uintmax_t)np->wn_child->wn_u1.index, 3);  // <nodeidx>
4840         }
4841       } else if (np->wn_child->wn_u2.wnode == NULL) {
4842         // We will write the child below and give it an index.
4843         np->wn_child->wn_u2.wnode = node;
4844       }
4845 
4846       if (fd != NULL) {
4847         if (putc(np->wn_byte, fd) == EOF) {       // <byte> or <xbyte>
4848           emsg(_(e_write));
4849           return 0;
4850         }
4851       }
4852     }
4853   }
4854 
4855   // Space used in the array when reading: one for each sibling and one for
4856   // the count.
4857   int newindex = idx + siblingcount + 1;
4858 
4859   // Recursively dump the children of each sibling.
4860   for (wordnode_T *np = node; np != NULL; np = np->wn_sibling) {
4861     if (np->wn_byte != 0 && np->wn_child->wn_u2.wnode == node) {
4862       newindex = put_node(fd, np->wn_child, newindex, regionmask,
4863                           prefixtree);
4864     }
4865   }
4866 
4867   return newindex;
4868 }
4869 
4870 
4871 // ":mkspell [-ascii] outfile  infile ..."
4872 // ":mkspell [-ascii] addfile"
ex_mkspell(exarg_T * eap)4873 void ex_mkspell(exarg_T *eap)
4874 {
4875   int fcount;
4876   char_u **fnames;
4877   char_u *arg = eap->arg;
4878   bool ascii = false;
4879 
4880   if (STRNCMP(arg, "-ascii", 6) == 0) {
4881     ascii = true;
4882     arg = skipwhite(arg + 6);
4883   }
4884 
4885   // Expand all the remaining arguments (e.g., $VIMRUNTIME).
4886   if (get_arglist_exp(arg, &fcount, &fnames, false) == OK) {
4887     mkspell(fcount, fnames, ascii, eap->forceit, false);
4888     FreeWild(fcount, fnames);
4889   }
4890 }
4891 
4892 // Create the .sug file.
4893 // Uses the soundfold info in "spin".
4894 // Writes the file with the name "wfname", with ".spl" changed to ".sug".
spell_make_sugfile(spellinfo_T * spin,char_u * wfname)4895 static void spell_make_sugfile(spellinfo_T *spin, char_u *wfname)
4896 {
4897   char_u *fname = NULL;
4898   int len;
4899   slang_T *slang;
4900   bool free_slang = false;
4901 
4902   // Read back the .spl file that was written.  This fills the required
4903   // info for soundfolding.  This also uses less memory than the
4904   // pointer-linked version of the trie.  And it avoids having two versions
4905   // of the code for the soundfolding stuff.
4906   // It might have been done already by spell_reload_one().
4907   for (slang = first_lang; slang != NULL; slang = slang->sl_next) {
4908     if (path_full_compare(wfname, slang->sl_fname, false, true)
4909         == kEqualFiles) {
4910       break;
4911     }
4912   }
4913   if (slang == NULL) {
4914     spell_message(spin, (char_u *)_("Reading back spell file..."));
4915     slang = spell_load_file(wfname, NULL, NULL, false);
4916     if (slang == NULL) {
4917       return;
4918     }
4919     free_slang = true;
4920   }
4921 
4922   // Clear the info in "spin" that is used.
4923   spin->si_blocks = NULL;
4924   spin->si_blocks_cnt = 0;
4925   spin->si_compress_cnt = 0;        // will stay at 0 all the time
4926   spin->si_free_count = 0;
4927   spin->si_first_free = NULL;
4928   spin->si_foldwcount = 0;
4929 
4930   // Go through the trie of good words, soundfold each word and add it to
4931   // the soundfold trie.
4932   spell_message(spin, (char_u *)_("Performing soundfolding..."));
4933   if (sug_filltree(spin, slang) == FAIL) {
4934     goto theend;
4935   }
4936 
4937   // Create the table which links each soundfold word with a list of the
4938   // good words it may come from.  Creates buffer "spin->si_spellbuf".
4939   // This also removes the wordnr from the NUL byte entries to make
4940   // compression possible.
4941   if (sug_maketable(spin) == FAIL) {
4942     goto theend;
4943   }
4944 
4945   smsg(_("Number of words after soundfolding: %" PRId64),
4946        (int64_t)spin->si_spellbuf->b_ml.ml_line_count);
4947 
4948   // Compress the soundfold trie.
4949   spell_message(spin, (char_u *)_(msg_compressing));
4950   wordtree_compress(spin, spin->si_foldroot, "case-folded");
4951 
4952   // Write the .sug file.
4953   // Make the file name by changing ".spl" to ".sug".
4954   fname = xmalloc(MAXPATHL);
4955   STRLCPY(fname, wfname, MAXPATHL);
4956   len = (int)STRLEN(fname);
4957   fname[len - 2] = 'u';
4958   fname[len - 1] = 'g';
4959   sug_write(spin, fname);
4960 
4961 theend:
4962   xfree(fname);
4963   if (free_slang) {
4964     slang_free(slang);
4965   }
4966   free_blocks(spin->si_blocks);
4967   close_spellbuf(spin->si_spellbuf);
4968 }
4969 
4970 // Build the soundfold trie for language "slang".
sug_filltree(spellinfo_T * spin,slang_T * slang)4971 static int sug_filltree(spellinfo_T *spin, slang_T *slang)
4972 {
4973   char_u *byts;
4974   idx_T *idxs;
4975   int depth;
4976   idx_T arridx[MAXWLEN];
4977   int curi[MAXWLEN];
4978   char_u tword[MAXWLEN];
4979   char_u tsalword[MAXWLEN];
4980   int c;
4981   idx_T n;
4982   unsigned words_done = 0;
4983   int wordcount[MAXWLEN];
4984 
4985   // We use si_foldroot for the soundfolded trie.
4986   spin->si_foldroot = wordtree_alloc(spin);
4987 
4988   // Let tree_add_word() know we're adding to the soundfolded tree
4989   spin->si_sugtree = true;
4990 
4991   // Go through the whole case-folded tree, soundfold each word and put it
4992   // in the trie.
4993   byts = slang->sl_fbyts;
4994   idxs = slang->sl_fidxs;
4995 
4996   arridx[0] = 0;
4997   curi[0] = 1;
4998   wordcount[0] = 0;
4999 
5000   depth = 0;
5001   while (depth >= 0 && !got_int) {
5002     if (curi[depth] > byts[arridx[depth]]) {
5003       // Done all bytes at this node, go up one level.
5004       idxs[arridx[depth]] = wordcount[depth];
5005       if (depth > 0) {
5006         wordcount[depth - 1] += wordcount[depth];
5007       }
5008 
5009       --depth;
5010       line_breakcheck();
5011     } else {
5012       // Do one more byte at this node.
5013       n = arridx[depth] + curi[depth];
5014       ++curi[depth];
5015 
5016       c = byts[n];
5017       if (c == 0) {
5018         // Sound-fold the word.
5019         tword[depth] = NUL;
5020         spell_soundfold(slang, tword, true, tsalword);
5021 
5022         // We use the "flags" field for the MSB of the wordnr,
5023         // "region" for the LSB of the wordnr.
5024         if (tree_add_word(spin, tsalword, spin->si_foldroot,
5025                           words_done >> 16, words_done & 0xffff,
5026                           0) == FAIL) {
5027           return FAIL;
5028         }
5029 
5030         ++words_done;
5031         ++wordcount[depth];
5032 
5033         // Reset the block count each time to avoid compression
5034         // kicking in.
5035         spin->si_blocks_cnt = 0;
5036 
5037         // Skip over any other NUL bytes (same word with different
5038         // flags).  But don't go over the end.
5039         while (n + 1 < slang->sl_fbyts_len && byts[n + 1] == 0) {
5040           n++;
5041           curi[depth]++;
5042         }
5043       } else {
5044         // Normal char, go one level deeper.
5045         tword[depth++] = c;
5046         arridx[depth] = idxs[n];
5047         curi[depth] = 1;
5048         wordcount[depth] = 0;
5049       }
5050     }
5051   }
5052 
5053   smsg(_("Total number of words: %d"), words_done);
5054 
5055   return OK;
5056 }
5057 
5058 // Make the table that links each word in the soundfold trie to the words it
5059 // can be produced from.
5060 // This is not unlike lines in a file, thus use a memfile to be able to access
5061 // the table efficiently.
5062 // Returns FAIL when out of memory.
sug_maketable(spellinfo_T * spin)5063 static int sug_maketable(spellinfo_T *spin)
5064 {
5065   garray_T ga;
5066   int res = OK;
5067 
5068   // Allocate a buffer, open a memline for it and create the swap file
5069   // (uses a temp file, not a .swp file).
5070   spin->si_spellbuf = open_spellbuf();
5071 
5072   // Use a buffer to store the line info, avoids allocating many small
5073   // pieces of memory.
5074   ga_init(&ga, 1, 100);
5075 
5076   // recursively go through the tree
5077   if (sug_filltable(spin, spin->si_foldroot->wn_sibling, 0, &ga) == -1) {
5078     res = FAIL;
5079   }
5080 
5081   ga_clear(&ga);
5082   return res;
5083 }
5084 
5085 /// Fill the table for one node and its children.
5086 /// Returns the wordnr at the start of the node.
5087 /// Returns -1 when out of memory.
5088 ///
5089 /// @param gap  place to store line of numbers
sug_filltable(spellinfo_T * spin,wordnode_T * node,int startwordnr,garray_T * gap)5090 static int sug_filltable(spellinfo_T *spin, wordnode_T *node, int startwordnr, garray_T *gap)
5091 {
5092   wordnode_T *p, *np;
5093   int wordnr = startwordnr;
5094   int nr;
5095   int prev_nr;
5096 
5097   for (p = node; p != NULL; p = p->wn_sibling) {
5098     if (p->wn_byte == NUL) {
5099       gap->ga_len = 0;
5100       prev_nr = 0;
5101       for (np = p; np != NULL && np->wn_byte == NUL; np = np->wn_sibling) {
5102         ga_grow(gap, 10);
5103 
5104         nr = (np->wn_flags << 16) + (np->wn_region & 0xffff);
5105         // Compute the offset from the previous nr and store the
5106         // offset in a way that it takes a minimum number of bytes.
5107         // It's a bit like utf-8, but without the need to mark
5108         // following bytes.
5109         nr -= prev_nr;
5110         prev_nr += nr;
5111         gap->ga_len += offset2bytes(nr,
5112                                     (char_u *)gap->ga_data + gap->ga_len);
5113       }
5114 
5115       // add the NUL byte
5116       ((char_u *)gap->ga_data)[gap->ga_len++] = NUL;
5117 
5118       if (ml_append_buf(spin->si_spellbuf, (linenr_T)wordnr,
5119                         gap->ga_data, gap->ga_len, true) == FAIL) {
5120         return -1;
5121       }
5122       wordnr++;
5123 
5124       // Remove extra NUL entries, we no longer need them. We don't
5125       // bother freeing the nodes, the won't be reused anyway.
5126       while (p->wn_sibling != NULL && p->wn_sibling->wn_byte == NUL) {
5127         p->wn_sibling = p->wn_sibling->wn_sibling;
5128       }
5129 
5130       // Clear the flags on the remaining NUL node, so that compression
5131       // works a lot better.
5132       p->wn_flags = 0;
5133       p->wn_region = 0;
5134     } else {
5135       wordnr = sug_filltable(spin, p->wn_child, wordnr, gap);
5136       if (wordnr == -1) {
5137         return -1;
5138       }
5139     }
5140   }
5141   return wordnr;
5142 }
5143 
/// Convert an offset into a minimal number of bytes.
///
/// Similar to utf_char2bytes(), but uses 8 bits in followup bytes and avoids
/// NUL bytes: the number is split into base-255 digits and one is added to
/// each digit before it is stored.
///
/// @param nr  the offset to encode
/// @param[out] buf  receives the encoded bytes, room for 4 bytes is needed
///
/// @return  the number of bytes stored in "buf" (1 to 4).
static int offset2bytes(int nr, char_u *buf)
{
  // Base-255 digits, least significant first.  Each is incremented by one
  // so that a stored byte is never NUL.
  int digit[4];
  int rest = nr;
  for (int i = 0; i < 3; i++) {
    digit[i] = rest % 255 + 1;
    rest /= 255;
  }
  digit[3] = rest + 1;

  // Pick the shortest form; the high bits of the first byte tell how many
  // bytes follow.
  int count;
  int lead;
  if (digit[3] > 1 || digit[2] > 0x1f) {
    count = 4;
    lead = 0xe0;
  } else if (digit[2] > 1 || digit[1] > 0x3f) {
    count = 3;
    lead = 0xc0;
  } else if (digit[1] > 1 || digit[0] > 0x7f) {
    count = 2;
    lead = 0x80;
  } else {
    count = 1;
    lead = 0;
  }

  // Most significant digit first, combined with the length marker.
  buf[0] = lead + digit[count - 1];
  for (int i = 1; i < count; i++) {
    buf[i] = digit[count - 1 - i];
  }
  return count;
}
5182 
5183 // Write the .sug file in "fname".
sug_write(spellinfo_T * spin,char_u * fname)5184 static void sug_write(spellinfo_T *spin, char_u *fname)
5185 {
5186   // Create the file.  Note that an existing file is silently overwritten!
5187   FILE *fd = os_fopen((char *)fname, "w");
5188   if (fd == NULL) {
5189     semsg(_(e_notopen), fname);
5190     return;
5191   }
5192 
5193   vim_snprintf((char *)IObuff, IOSIZE,
5194                _("Writing suggestion file %s..."), fname);
5195   spell_message(spin, IObuff);
5196 
5197   // <SUGHEADER>: <fileID> <versionnr> <timestamp>
5198   if (fwrite(VIMSUGMAGIC, VIMSUGMAGICL, (size_t)1, fd) != 1) {  // <fileID>
5199     emsg(_(e_write));
5200     goto theend;
5201   }
5202   putc(VIMSUGVERSION, fd);                              // <versionnr>
5203 
5204   // Write si_sugtime to the file.
5205   put_time(fd, spin->si_sugtime);                       // <timestamp>
5206 
5207   // <SUGWORDTREE>
5208   spin->si_memtot = 0;
5209   wordnode_T *tree = spin->si_foldroot->wn_sibling;
5210 
5211   // Clear the index and wnode fields in the tree.
5212   clear_node(tree);
5213 
5214   // Count the number of nodes.  Needed to be able to allocate the
5215   // memory when reading the nodes.  Also fills in index for shared
5216   // nodes.
5217   size_t nodecount = (size_t)put_node(NULL, tree, 0, 0, false);
5218 
5219   // number of nodes in 4 bytes
5220   put_bytes(fd, nodecount, 4);                          // <nodecount>
5221   assert(nodecount + nodecount * sizeof(int) < INT_MAX);
5222   spin->si_memtot += (int)(nodecount + nodecount * sizeof(int));
5223 
5224   // Write the nodes.
5225   (void)put_node(fd, tree, 0, 0, false);
5226 
5227   // <SUGTABLE>: <sugwcount> <sugline> ...
5228   linenr_T wcount = spin->si_spellbuf->b_ml.ml_line_count;
5229   assert(wcount >= 0);
5230   put_bytes(fd, (uintmax_t)wcount, 4);                  // <sugwcount>
5231 
5232   for (linenr_T lnum = 1; lnum <= wcount; ++lnum) {
5233     // <sugline>: <sugnr> ... NUL
5234     char_u *line = ml_get_buf(spin->si_spellbuf, lnum, false);
5235     size_t len = STRLEN(line) + 1;
5236     if (fwrite(line, len, 1, fd) == 0) {
5237       emsg(_(e_write));
5238       goto theend;
5239     }
5240     assert((size_t)spin->si_memtot + len <= INT_MAX);
5241     spin->si_memtot += (int)len;
5242   }
5243 
5244   // Write another byte to check for errors.
5245   if (putc(0, fd) == EOF) {
5246     emsg(_(e_write));
5247   }
5248 
5249   vim_snprintf((char *)IObuff, IOSIZE,
5250                _("Estimated runtime memory use: %d bytes"), spin->si_memtot);
5251   spell_message(spin, IObuff);
5252 
5253 theend:
5254   // close the file
5255   fclose(fd);
5256 }
5257 
5258 
/// Create a Vim spell file from one or more word lists.
/// "fnames[0]" is the output file name.
/// "fnames[fcount - 1]" is the last input file name.
/// Exception: when "fcount" is 1, "fnames[0]" is the input file name and the
/// output file name is derived from it: when it ends in ".add" then ".spl" is
/// appended, otherwise SPL_FNAME_TMPL is used with the encoding name.
///
/// Errors are reported with emsg()/semsg(); nothing is returned.
///
/// @param fcount  number of entries in "fnames"
/// @param fnames  output file name followed by the input file names
/// @param ascii  -ascii argument given
/// @param over_write  overwrite existing output file
/// @param added_word  invoked through "zg"
static void mkspell(int fcount, char_u **fnames, bool ascii, bool over_write, bool added_word)
{
  char_u *fname = NULL;
  char_u *wfname;
  char_u **innames;
  int incount;
  afffile_T *(afile[MAXREGIONS]);
  int i;
  int len;
  bool error = false;
  spellinfo_T spin;

  // Start from an all-zero spellinfo_T and set the non-zero defaults.
  memset(&spin, 0, sizeof(spin));
  // When a word was added, don't give messages unless 'verbose' asks for
  // them, see spell_message().
  spin.si_verbose = !added_word;
  spin.si_ascii = ascii;
  spin.si_followup = true;
  spin.si_rem_accents = true;
  ga_init(&spin.si_rep, (int)sizeof(fromto_T), 20);
  ga_init(&spin.si_repsal, (int)sizeof(fromto_T), 20);
  ga_init(&spin.si_sal, (int)sizeof(fromto_T), 20);
  ga_init(&spin.si_map, (int)sizeof(char_u), 100);
  ga_init(&spin.si_comppat, (int)sizeof(char_u *), 20);
  ga_init(&spin.si_prefcond, (int)sizeof(char_u *), 50);
  hash_init(&spin.si_commonwords);
  spin.si_newcompID = 127;      // start compound ID at first maximum

  // default: fnames[0] is output file, following are input files
  // When "fcount" is 1 there is only one file.
  innames = &fnames[fcount == 1 ? 0 : 1];
  incount = fcount - 1;

  // Buffer for the name of the output file, freed at "theend".
  wfname = xmalloc(MAXPATHL);

  if (fcount >= 1) {
    len = (int)STRLEN(fnames[0]);
    if (fcount == 1 && len > 4 && STRCMP(fnames[0] + len - 4, ".add") == 0) {
      // For ":mkspell path/en.latin1.add" output file is
      // "path/en.latin1.add.spl".
      incount = 1;
      vim_snprintf((char *)wfname, MAXPATHL, "%s.spl", fnames[0]);
    } else if (fcount == 1) {
      // For ":mkspell path/vim" output file is "path/vim.latin1.spl".
      incount = 1;
      vim_snprintf((char *)wfname, MAXPATHL, SPL_FNAME_TMPL,
                   fnames[0], spin.si_ascii ? (char_u *)"ascii" : spell_enc());
    } else if (len > 4 && STRCMP(fnames[0] + len - 4, ".spl") == 0) {
      // Name ends in ".spl", use as the file name.
      STRLCPY(wfname, fnames[0], MAXPATHL);
    } else {
      // Name should be language, make the file name from it.
      vim_snprintf((char *)wfname, MAXPATHL, SPL_FNAME_TMPL,
                   fnames[0], spin.si_ascii ? (char_u *)"ascii" : spell_enc());
    }

    // Check for .ascii.spl.
    if (strstr((char *)path_tail(wfname), SPL_FNAME_ASCII) != NULL) {
      spin.si_ascii = true;
    }

    // Check for .add.spl.
    if (strstr((char *)path_tail(wfname), SPL_FNAME_ADD) != NULL) {
      spin.si_add = true;
    }
  }

  // Note: "wfname" is only filled in when "fcount" >= 1; in the other case
  // "incount" is negative and the first check below is the one that triggers.
  if (incount <= 0) {
    emsg(_(e_invarg));          // need at least output and input names
  } else if (vim_strchr(path_tail(wfname), '_') != NULL) {
    emsg(_("E751: Output file name must not have region name"));
  } else if (incount > MAXREGIONS) {
    semsg(_("E754: Only up to %d regions supported"), MAXREGIONS);
  } else {
    // Check for overwriting before doing things that may take a lot of
    // time.
    if (!over_write && os_path_exists(wfname)) {
      emsg(_(e_exists));
      goto theend;
    }
    if (os_isdir(wfname)) {
      semsg(_(e_isadir2), wfname);
      goto theend;
    }

    // Buffer for the names of the .aff and .dic files, freed at "theend".
    fname = xmalloc(MAXPATHL);

    // Init the aff and dic pointers.
    // Get the region names if there is more than one input file.
    for (i = 0; i < incount; ++i) {
      afile[i] = NULL;

      if (incount > 1) {
        // The input name must end in "_xx", where "xx" is used as the
        // (lower-cased) region name.
        len = (int)STRLEN(innames[i]);
        if (STRLEN(path_tail(innames[i])) < 5
            || innames[i][len - 3] != '_') {
          semsg(_("E755: Invalid region in %s"), innames[i]);
          goto theend;
        }
        spin.si_region_name[i * 2] = TOLOWER_ASC(innames[i][len - 2]);
        spin.si_region_name[i * 2 + 1] =
          TOLOWER_ASC(innames[i][len - 1]);
      }
    }
    spin.si_region_count = incount;

    spin.si_foldroot = wordtree_alloc(&spin);
    spin.si_keeproot = wordtree_alloc(&spin);
    spin.si_prefroot = wordtree_alloc(&spin);

    // When not producing a .add.spl file clear the character table when
    // we encounter one in the .aff file.  This means we dump the current
    // one in the .spl file if the .aff file doesn't define one.  That's
    // better than guessing the contents, the table will match a
    // previously loaded spell file.
    if (!spin.si_add) {
      spin.si_clear_chartab = true;
    }

    // Read all the .aff and .dic files.
    // Text is converted to 'encoding'.
    // Words are stored in the case-folded and keep-case trees.
    // Reading stops at the first input file that fails.
    for (i = 0; i < incount && !error; ++i) {
      spin.si_conv.vc_type = CONV_NONE;
      // Each input file gets its own bit.
      spin.si_region = 1 << i;

      vim_snprintf((char *)fname, MAXPATHL, "%s.aff", innames[i]);
      if (os_path_exists(fname)) {
        // Read the .aff file.  Will init "spin->si_conv" based on the
        // "SET" line.
        afile[i] = spell_read_aff(&spin, fname);
        if (afile[i] == NULL) {
          error = true;
        } else {
          // Read the .dic file and store the words in the trees.
          vim_snprintf((char *)fname, MAXPATHL, "%s.dic",
                       innames[i]);
          if (spell_read_dic(&spin, fname, afile[i]) == FAIL) {
            error = true;
          }
        }
      } else {
        // No .aff file, try reading the file as a word list.  Store
        // the words in the trees.
        if (spell_read_wordfile(&spin, innames[i]) == FAIL) {
          error = true;
        }
      }

      // Free any conversion stuff.
      convert_setup(&spin.si_conv, NULL, NULL);
    }

    if (spin.si_compflags != NULL && spin.si_nobreak) {
      msg(_("Warning: both compounding and NOBREAK specified"));
    }

    if (!error && !got_int) {
      // Combine tails in the tree.
      spell_message(&spin, (char_u *)_(msg_compressing));
      wordtree_compress(&spin, spin.si_foldroot, "case-folded");
      wordtree_compress(&spin, spin.si_keeproot, "keep-case");
      wordtree_compress(&spin, spin.si_prefroot, "prefixes");
    }

    // "got_int" is checked again, it may have been set in the meantime.
    if (!error && !got_int) {
      // Write the info in the spell file.
      vim_snprintf((char *)IObuff, IOSIZE,
                   _("Writing spell file %s..."), wfname);
      spell_message(&spin, IObuff);

      error = write_vim_spell(&spin, wfname) == FAIL;

      // These messages are given even when writing failed.
      spell_message(&spin, (char_u *)_("Done!"));
      vim_snprintf((char *)IObuff, IOSIZE,
                   _("Estimated runtime memory use: %d bytes"), spin.si_memtot);
      spell_message(&spin, IObuff);

      // If the file is loaded need to reload it.
      if (!error) {
        spell_reload_one(wfname, added_word);
      }
    }

    // Free the allocated memory.
    ga_clear(&spin.si_rep);
    ga_clear(&spin.si_repsal);
    ga_clear(&spin.si_sal);
    ga_clear(&spin.si_map);
    ga_clear(&spin.si_comppat);
    ga_clear(&spin.si_prefcond);
    hash_clear_all(&spin.si_commonwords, 0);

    // Free the .aff file structures.
    for (i = 0; i < incount; ++i) {
      if (afile[i] != NULL) {
        spell_free_aff(afile[i]);
      }
    }

    // Free all the bits and pieces at once.
    free_blocks(spin.si_blocks);

    // If there is soundfolding info and no NOSUGFILE item create the
    // .sug file with the soundfolded word trie.
    // This is done after the blocks were freed above.
    if (spin.si_sugtime != 0 && !error && !got_int) {
      spell_make_sugfile(&spin, wfname);
    }
  }

theend:
  // "fname" is still NULL when jumping here before it was allocated.
  xfree(fname);
  xfree(wfname);
}
5480 
5481 // Display a message for spell file processing when 'verbose' is set or using
5482 // ":mkspell".  "str" can be IObuff.
spell_message(const spellinfo_T * spin,char_u * str)5483 static void spell_message(const spellinfo_T *spin, char_u *str)
5484   FUNC_ATTR_NONNULL_ALL
5485 {
5486   if (spin->si_verbose || p_verbose > 2) {
5487     if (!spin->si_verbose) {
5488       verbose_enter();
5489     }
5490     msg((char *)str);
5491     ui_flush();
5492     if (!spin->si_verbose) {
5493       verbose_leave();
5494     }
5495   }
5496 }
5497 
5498 // ":[count]spellgood  {word}"
5499 // ":[count]spellwrong {word}"
5500 // ":[count]spellundo  {word}"
5501 // ":[count]spellrare  {word}"
ex_spell(exarg_T * eap)5502 void ex_spell(exarg_T *eap)
5503 {
5504   spell_add_word(eap->arg, (int)STRLEN(eap->arg),
5505                  eap->cmdidx == CMD_spellwrong ? SPELL_ADD_BAD :
5506                  eap->cmdidx == CMD_spellrare ? SPELL_ADD_RARE : SPELL_ADD_GOOD,
5507                  eap->forceit ? 0 : (int)eap->line2,
5508                  eap->cmdidx == CMD_spellundo);
5509 }
5510 
/// Add "word[len]" to 'spellfile' as a good, rare or bad word, or remove it.
///
/// When "what" is SPELL_ADD_BAD or "undo" is true, lines in the word list
/// that start with the word are disabled by overwriting their first byte
/// with '#'.  Unless "undo" is true the word is then appended to the word
/// list.  When the word list file was opened, mkspell() is called to update
/// the .spl file.
///
/// @param word  the word, does not need to be NUL terminated
/// @param len  length of "word" in bytes
/// @param what  SPELL_ADD_ values
/// @param idx  "zG" and "zW": zero, otherwise index in 'spellfile'
/// @param undo  true for "zug", "zuG", "zuw" and "zuW"
void spell_add_word(char_u *word, int len, SpellAddType what, int idx, bool undo)
{
  FILE *fd = NULL;
  buf_T *buf = NULL;
  bool new_spf = false;
  char_u *fname;
  char_u *fnamebuf = NULL;
  char_u line[MAXWLEN * 2];
  long fpos, fpos_next = 0;
  int i;
  char_u *spf;

  if (idx == 0) {           // use internal wordlist
    // The name of the internal word list is created on first use.
    if (int_wordlist == NULL) {
      int_wordlist = vim_tempname();
      if (int_wordlist == NULL) {
        return;
      }
    }
    fname = int_wordlist;
  } else {
    // If 'spellfile' isn't set figure out a good default value.
    if (*curwin->w_s->b_p_spf == NUL) {
      init_spellfile();
      new_spf = true;
    }

    // Still empty: init_spellfile() could not come up with a value.
    if (*curwin->w_s->b_p_spf == NUL) {
      semsg(_(e_notset), "spellfile");
      return;
    }
    fnamebuf = xmalloc(MAXPATHL);

    // Copy entry number "idx" (counting from one) of the comma separated
    // 'spellfile' value into "fnamebuf".
    for (spf = curwin->w_s->b_p_spf, i = 1; *spf != NUL; ++i) {
      copy_option_part(&spf, fnamebuf, MAXPATHL, ",");
      if (i == idx) {
        break;
      }
      if (*spf == NUL) {
        semsg(_("E765: 'spellfile' does not have %" PRId64 " entries"), (int64_t)idx);
        xfree(fnamebuf);
        return;
      }
    }

    // Check that the user isn't editing the .add file somewhere.
    buf = buflist_findname_exp(fnamebuf);
    // A buffer without a memfile is ignored.
    if (buf != NULL && buf->b_ml.ml_mfp == NULL) {
      buf = NULL;
    }
    if (buf != NULL && bufIsChanged(buf)) {
      emsg(_(e_bufloaded));
      xfree(fnamebuf);
      return;
    }

    fname = fnamebuf;
  }

  if (what == SPELL_ADD_BAD || undo) {
    // When the word appears as good word we need to remove that one,
    // since its flags sort before the one with WF_BANNED.
    fd = os_fopen((char *)fname, "r");
    if (fd != NULL) {
      while (!vim_fgets(line, MAXWLEN * 2, fd)) {
        // "fpos" is the offset where the line just read starts,
        // "fpos_next" where the following line starts.
        fpos = fpos_next;
        fpos_next = ftell(fd);
        // The line matches when it starts with the word and the word is
        // followed by '/' or a character below a space (e.g. the line end).
        if (STRNCMP(word, line, len) == 0
            && (line[len] == '/' || line[len] < ' ')) {
          // Found duplicate word.  Remove it by writing a '#' at
          // the start of the line.  Mixing reading and writing
          // doesn't work for all systems, close the file first.
          fclose(fd);
          fd = os_fopen((char *)fname, "r+");
          if (fd == NULL) {
            break;
          }
          if (fseek(fd, fpos, SEEK_SET) == 0) {
            fputc('#', fd);
            if (undo) {
              home_replace(NULL, fname, NameBuff, MAXPATHL, TRUE);
              smsg(_("Word '%.*s' removed from %s"),
                   len, word, NameBuff);
            }
          }
          // Go back to where reading stopped, to continue with the next
          // line.
          if (fseek(fd, fpos_next, SEEK_SET) != 0) {
            PERROR(_("Seek error in spellfile"));
            break;
          }
        }
      }
      // NOTE(review): "fd" is not reset to NULL after closing; further down
      // it is only compared against NULL, never dereferenced, in the "undo"
      // case.
      if (fd != NULL) {
        fclose(fd);
      }
    }
  }

  if (!undo) {
    fd = os_fopen((char *)fname, "a");
    if (fd == NULL && new_spf) {
      char_u *p;

      // We just initialized the 'spellfile' option and can't open the
      // file.  We may need to create the "spell" directory first.  We
      // already checked the runtime directory is writable in
      // init_spellfile().
      if (!dir_of_file_exists(fname) && (p = path_tail_with_sep(fname)) != fname) {
        int c = *p;

        // The directory doesn't exist.  Try creating it and opening
        // the file again.
        // Temporarily truncate "fname" at the tail to get the directory name.
        *p = NUL;
        os_mkdir((char *)fname, 0755);
        *p = c;
        fd = os_fopen((char *)fname, "a");
      }
    }

    if (fd == NULL) {
      semsg(_(e_notopen), fname);
    } else {
      // A bad word gets the "/!" suffix, a rare word "/?", a good word is
      // written without suffix.
      if (what == SPELL_ADD_BAD) {
        fprintf(fd, "%.*s/!\n", len, word);
      } else if (what == SPELL_ADD_RARE) {
        fprintf(fd, "%.*s/?\n", len, word);
      } else {
        fprintf(fd, "%.*s\n", len, word);
      }
      fclose(fd);

      home_replace(NULL, fname, NameBuff, MAXPATHL, TRUE);
      smsg(_("Word '%.*s' added to %s"), len, word, NameBuff);
    }
  }

  // "fd" was closed above; here it only tells whether the word list file
  // could be opened.
  if (fd != NULL) {
    // Update the .add.spl file.
    mkspell(1, &fname, false, true, true);

    // If the .add file is edited somewhere, reload it.
    if (buf != NULL) {
      buf_reload(buf, buf->b_orig_mode);
    }

    redraw_all_later(SOME_VALID);
  }
  xfree(fnamebuf);
}
5664 
5665 // Initialize 'spellfile' for the current buffer.
init_spellfile(void)5666 static void init_spellfile(void)
5667 {
5668   char_u *buf;
5669   int l;
5670   char_u *fname;
5671   char_u *rtp;
5672   char_u *lend;
5673   bool aspath = false;
5674   char_u *lstart = curbuf->b_s.b_p_spl;
5675 
5676   if (*curwin->w_s->b_p_spl != NUL && !GA_EMPTY(&curwin->w_s->b_langp)) {
5677     buf = xmalloc(MAXPATHL);
5678 
5679     // Find the end of the language name.  Exclude the region.  If there
5680     // is a path separator remember the start of the tail.
5681     for (lend = curwin->w_s->b_p_spl; *lend != NUL
5682          && vim_strchr((char_u *)",._", *lend) == NULL; ++lend) {
5683       if (vim_ispathsep(*lend)) {
5684         aspath = true;
5685         lstart = lend + 1;
5686       }
5687     }
5688 
5689     // Loop over all entries in 'runtimepath'.  Use the first one where we
5690     // are allowed to write.
5691     rtp = p_rtp;
5692     while (*rtp != NUL) {
5693       if (aspath) {
5694         // Use directory of an entry with path, e.g., for
5695         // "/dir/lg.utf-8.spl" use "/dir".
5696         STRLCPY(buf, curbuf->b_s.b_p_spl,
5697                 lstart - curbuf->b_s.b_p_spl);
5698       } else {
5699         // Copy the path from 'runtimepath' to buf[].
5700         copy_option_part(&rtp, buf, MAXPATHL, ",");
5701       }
5702       if (os_file_is_writable((char *)buf) == 2) {
5703         // Use the first language name from 'spelllang' and the
5704         // encoding used in the first loaded .spl file.
5705         if (aspath) {
5706           STRLCPY(buf, curbuf->b_s.b_p_spl,
5707                   lend - curbuf->b_s.b_p_spl + 1);
5708         } else {
5709           // Create the "spell" directory if it doesn't exist yet.
5710           l = (int)STRLEN(buf);
5711           vim_snprintf((char *)buf + l, MAXPATHL - l, "/spell");
5712           if (os_file_is_writable((char *)buf) != 2) {
5713             os_mkdir((char *)buf, 0755);
5714           }
5715 
5716           l = (int)STRLEN(buf);
5717           vim_snprintf((char *)buf + l, MAXPATHL - l,
5718                        "/%.*s", (int)(lend - lstart), lstart);
5719         }
5720         l = (int)STRLEN(buf);
5721         fname = LANGP_ENTRY(curwin->w_s->b_langp, 0)
5722                 ->lp_slang->sl_fname;
5723         vim_snprintf((char *)buf + l, MAXPATHL - l, ".%s.add",
5724                      ((fname != NULL
5725                        && strstr((char *)path_tail(fname), ".ascii.") != NULL)
5726                       ? "ascii"
5727                       : (const char *)spell_enc()));
5728         set_option_value("spellfile", 0L, (const char *)buf, OPT_LOCAL);
5729         break;
5730       }
5731       aspath = false;
5732     }
5733 
5734     xfree(buf);
5735   }
5736 }
5737 
5738 /// Set the spell character tables from strings in the .spl file.
5739 ///
5740 /// @param cnt  length of "flags"
set_spell_charflags(char_u * flags,int cnt,char_u * fol)5741 static void set_spell_charflags(char_u *flags, int cnt, char_u *fol)
5742 {
5743   // We build the new tables here first, so that we can compare with the
5744   // previous one.
5745   spelltab_T new_st;
5746   int i;
5747   char_u *p = fol;
5748   int c;
5749 
5750   clear_spell_chartab(&new_st);
5751 
5752   for (i = 0; i < 128; ++i) {
5753     if (i < cnt) {
5754       new_st.st_isw[i + 128] = (flags[i] & CF_WORD) != 0;
5755       new_st.st_isu[i + 128] = (flags[i] & CF_UPPER) != 0;
5756     }
5757 
5758     if (*p != NUL) {
5759       c = mb_ptr2char_adv((const char_u **)&p);
5760       new_st.st_fold[i + 128] = c;
5761       if (i + 128 != c && new_st.st_isu[i + 128] && c < 256) {
5762         new_st.st_upper[c] = i + 128;
5763       }
5764     }
5765   }
5766 
5767   (void)set_spell_finish(&new_st);
5768 }
5769 
/// Install "new_st" as the spell character table or, when a table was set
/// before, verify that "new_st" is equal to it.
///
/// @param new_st  the newly built table
///
/// @return  OK, or FAIL (after giving error E763) when a table was set
///          before and "new_st" differs from it.
static int set_spell_finish(spelltab_T *new_st)
{
  if (!did_set_spelltab) {
    // First table: copy the new spelltab into the one being used.
    spelltab = *new_st;
    did_set_spelltab = true;
    return OK;
  }

  // Check that it's the same table, for all 256 entries.
  for (int ch = 0; ch < 256; ch++) {
    const bool same = spelltab.st_isw[ch] == new_st->st_isw[ch]
                      && spelltab.st_isu[ch] == new_st->st_isu[ch]
                      && spelltab.st_fold[ch] == new_st->st_fold[ch]
                      && spelltab.st_upper[ch] == new_st->st_upper[ch];
    if (!same) {
      emsg(_("E763: Word characters differ between spell files"));
      return FAIL;
    }
  }

  return OK;
}
5793 
5794 // Write the table with prefix conditions to the .spl file.
5795 // When "fd" is NULL only count the length of what is written.
write_spell_prefcond(FILE * fd,garray_T * gap)5796 static int write_spell_prefcond(FILE *fd, garray_T *gap)
5797 {
5798   assert(gap->ga_len >= 0);
5799 
5800   if (fd != NULL) {
5801     put_bytes(fd, (uintmax_t)gap->ga_len, 2);           // <prefcondcnt>
5802   }
5803   size_t totlen = 2 + (size_t)gap->ga_len;  // <prefcondcnt> and <condlen> bytes
5804   size_t x = 1;  // collect return value of fwrite()
5805   for (int i = 0; i < gap->ga_len; ++i) {
5806     // <prefcond> : <condlen> <condstr>
5807     char_u *p = ((char_u **)gap->ga_data)[i];
5808     if (p != NULL) {
5809       size_t len = STRLEN(p);
5810       if (fd != NULL) {
5811         assert(len <= INT_MAX);
5812         fputc((int)len, fd);
5813         x &= fwrite(p, len, 1, fd);
5814       }
5815       totlen += len;
5816     } else if (fd != NULL) {
5817       fputc(0, fd);
5818     }
5819   }
5820 
5821   assert(totlen <= INT_MAX);
5822   return (int)totlen;
5823 }
5824 
5825 // Use map string "map" for languages "lp".
set_map_str(slang_T * lp,char_u * map)5826 static void set_map_str(slang_T *lp, char_u *map)
5827 {
5828   char_u *p;
5829   int headc = 0;
5830   int c;
5831   int i;
5832 
5833   if (*map == NUL) {
5834     lp->sl_has_map = false;
5835     return;
5836   }
5837   lp->sl_has_map = true;
5838 
5839   // Init the array and hash tables empty.
5840   for (i = 0; i < 256; ++i) {
5841     lp->sl_map_array[i] = 0;
5842   }
5843   hash_init(&lp->sl_map_hash);
5844 
5845   // The similar characters are stored separated with slashes:
5846   // "aaa/bbb/ccc/".  Fill sl_map_array[c] with the character before c and
5847   // before the same slash.  For characters above 255 sl_map_hash is used.
5848   for (p = map; *p != NUL;) {
5849     c = mb_cptr2char_adv((const char_u **)&p);
5850     if (c == '/') {
5851       headc = 0;
5852     } else {
5853       if (headc == 0) {
5854         headc = c;
5855       }
5856 
5857       // Characters above 255 don't fit in sl_map_array[], put them in
5858       // the hash table.  Each entry is the char, a NUL the headchar and
5859       // a NUL.
5860       if (c >= 256) {
5861         int cl = utf_char2len(c);
5862         int headcl = utf_char2len(headc);
5863         char_u *b;
5864         hash_T hash;
5865         hashitem_T *hi;
5866 
5867         b = xmalloc(cl + headcl + 2);
5868         utf_char2bytes(c, b);
5869         b[cl] = NUL;
5870         utf_char2bytes(headc, b + cl + 1);
5871         b[cl + 1 + headcl] = NUL;
5872         hash = hash_hash(b);
5873         hi = hash_lookup(&lp->sl_map_hash, (const char *)b, STRLEN(b), hash);
5874         if (HASHITEM_EMPTY(hi)) {
5875           hash_add_item(&lp->sl_map_hash, hi, b, hash);
5876         } else {
5877           // This should have been checked when generating the .spl
5878           // file.
5879           emsg(_("E783: duplicate char in MAP entry"));
5880           xfree(b);
5881         }
5882       } else {
5883         lp->sl_map_array[c] = headc;
5884       }
5885     }
5886   }
5887 }
5888 
5889