1% @Log: array.w,v @
2% Revision 1.4  1996/09/18 00:37:25  mike
3% 1) Fix stupid bozo in A[expr], expr is numeric and not integer.
4% 2) Add static for non-ansi compilers.
5% 3) Minor tweaks to documentation.
6%
7% Revision 1.3  1996/07/28 21:55:32  mike
8% trivial change -- add extra {}
9%
10% Revision 1.2  1996/02/25  23:42:25  mike
11% Fix zfree bug in array_clear.
12% Clean up documentation.
13%
14
15\input mwebmac
16\input ctmac
17
18\RCSID{$Id: array.w,v 1.18 2014/08/14 23:34:44 mike Exp $}
19
20\TOC{Mawk Arrays}
21
22\def\expr{{\it expr}}
23\def\Null{{\it null}}
24
25%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
26
27
28
29\section{Introduction}
30This is the source and documentation for the [[mawk]] implementation
31of awk arrays.  Arrays in awk are associations of strings to awk scalar
32values.   The mawk implementation stores the associations in
33hash tables.   The hash table scheme was influenced by
34and is similar to
35the design presented in Griswold and Townsend,
36{\sl The Design and Implementation of
37Dynamic Hashing Sets and Tables in Icon},
38{\bf Software Practice and Experience}, 23, 351-367, 1993.
39
40
41@
42\section{Data Structures}
43@
44\subsection{Array Structure}
45The type [[ARRAY]] is a pointer to a [[struct array]].
46The [[size]] field is the number of elements in the table.
47The meaning of the other fields depends on the [[type]] field.
48
49<<array typedefs and [[#defines]]>>=
50typedef struct array {
51   PTR ptr ;  /* What this points to depends on the type */
52   size_t size ; /* number of elts in the table */
53   size_t limit ; /* Meaning depends on type */
54   unsigned hmask ; /* bitwise and with hash value to get table index */
55   short type ;  /* values in AY_NULL .. AY_SPLIT */
56} *ARRAY ;
57@ %def ARRAY
58
59There are three types of arrays and these are distinguished by the
60[[type]] field in the structure.  The types are:
61
62\I[[AY_NULL]] The array is empty and the [[size]] field is always
63zero.  The other fields have no meaning.
64
65\I[[AY_SPLIT]] The array was created by the [[AWK]] built-in
66[[split]].  The return value from [[split]] is stored in the [[size]]
67field.  The [[ptr]] field points at a vector of [[CELLs]].  The number
68of [[CELLs]] is the [[limit]] field. It is always true that
69${\it size}\leq{\it limit}$.  The address of [[A[i]]] is [[(CELL*)A->ptr+i-1]]
70for $1\leq i\leq{\it size}$.  The [[hmask]] field has no meaning.
71
72\I{\bf Hash Table} The array is a hash table.  If the [[AY_STR]] bit
73in the [[type]] field is set, then the table is keyed on strings.
74If the [[AY_INT]] bit in the [[type]] field is set, then the table is
75keyed on integers.  Both bits can be set, and then the two keys are
76consistent, i.e., look up of [[A[-14]]] and [[A["-14"]]] will
77return identical [[CELL]] pointers although the look up methods will
78be different.  In this case, the [[size]] field is the number of hash nodes
79in the table.  When insertion of a new element would cause [[size]] to
80exceed [[limit]], the table grows by doubling the number of hash chains.
81The invariant,
82$({\it hmask}+1){\it max\_ave\_list\_length}={\it limit}$, is always true.
83{\it Max\_ave\_list\_length} is a tunable constant.
84\endhitems
85
86
87<<array typedefs and [[#defines]]>>=
88#define AY_NULL		0
89#define AY_INT		1
90#define AY_STR		2
91#define AY_SPLIT	4
92@ %def AY_NULL AY_INT AY_STR AY_SPLIT
93
94@
95\subsection{Hash Tables}
96The hash tables are linked lists of nodes, called [[ANODEs]].
97The number of lists is [[hmask+1]] which is always a power of two.
98The [[ptr]] field points at a vector of list heads.  Since there are
99potentially two types of lists, integer lists and strings lists,
100each list head is a structure, [[DUAL_LINK]].
101
102<<local constants, defs and prototypes>>=
103struct anode ;
104typedef struct {struct anode *slink, *ilink ;} DUAL_LINK ;
105@ %def anode DUAL_LINK
106
107@
108The string lists are chains connected by [[slinks]] and the integer
109lists are chains connected by [[ilinks]].  We sometimes refer to these
110lists as slists and ilists, respectively.
111The elements on the lists are [[ANODEs]].
112The fields of an [[ANODE]] are:
113
114\I[[slink]] The link field for slists.
115\I[[ilink]] The link field for ilists.
116\I[[sval]] If non-null, then [[sval]] is a pointer to a string
117key.  For a given table, if the [[AY_STR]] bit is set then every
118[[ANODE]] has a non-null [[sval]] field and conversely, if [[AY_STR]]
119is not set, then every [[sval]] field is null.
120
121\I[[hval]] The hash value of [[sval]].  This field has no
122meaning if [[sval]] is null.
123
124\I[[ival]] The integer key.  The field has no meaning if set
125to the constant, [[NOT_AN_IVALUE]].  If the [[AY_STR]] bit is off,
126then every [[ANODE]] will have a valid [[ival]] field.  If the
127[[AY_STR]] bit is on, then the [[ival]] field may or may not be
128valid.
129
130\I[[cell]] The data field in the hash table.
131\endhitems
132
133\noindent
134So the value of $A[\expr]$ is stored in the [[cell]] field, and if
135\expr{} is an integer, then \expr{} is stored in [[ival]], else it
136is stored in [[sval]].
137
138<<local constants, defs and prototypes>>=
139typedef struct anode {
140   struct anode *slink ;
141   struct anode  *ilink ;
142   STRING *sval ;
143   unsigned hval ;
144   Int     ival ;
145   CELL    cell ;
146} ANODE ;
147@ %def ANODE
148
149
150@
151\section{Array Operations}
152The functions that operate on arrays are,
153
154\I[[CELL* array_find(ARRAY A, CELL *cp, int create_flag)]] returns a
155pointer to $A[\expr]$ where [[cp]] is a pointer to the [[CELL]]
156holding \expr\/.  If the [[create_flag]] is on and \expr\/ is not
157an element of [[A]], then the element is created with value \Null\/.
158
159\I[[void array_delete(ARRAY A, CELL *cp)]] removes an element
160$A[\expr]$ from the array $A$.  [[cp]] points at the [[CELL]] holding
161\expr\/.
162
163\I[[void array_load(ARRAY A, size_t cnt)]] builds a split array.  The
164values [[A[1..cnt]]] are moved into [[A]] from an anonymous
165buffer with [[transfer_to_array()]] which is declared in
166[[split.h]].
167
168\I[[void array_clear(ARRAY A)]] removes all elements of $A$.  The
169type of $A$ is then [[AY_NULL]].
170
171\I[[STRING** array_loop_vector(ARRAY A, size_t *sizep)]]
172returns a pointer
173to a linear vector that holds all the strings that are indices of $A$.
174The size of the the vector is returned indirectly in [[*sizep]].
175If [[A->size==0]], a \Null{} pointer is returned.
176
177\I[[CELL* array_cat(CELL *sp, int cnt)]] concatenates the elements
178of ${\it sp}[1-cnt..0]$, with each element separated by [[SUBSEP]], to
179compute an array index.  For example, on a reference to $A[i,j]$,
180[[array_cat]] computes $i\circ{\it SUBSEP}\circ j$ where
181$\circ$ denotes concatenation.
182\endhitems
183
184<<interface prototypes>>=
185CELL* array_find(ARRAY, CELL*, int);
186void  array_delete(ARRAY, CELL*);
187void  array_load(ARRAY, size_t);
188void  array_clear(ARRAY);
189STRING** array_loop_vector(ARRAY, size_t*);
190CELL* array_cat(CELL*, int);
191
192@
193\subsection{Array Find}
194Any reference to $A[\expr]$ creates a call to
195[[array_find(A,cp,CREATE)]] where [[cp]] points at the cell holding
196\expr\/.  The test, $\expr \hbox{ in } A$, creates a call to
197[[array_find(A,cp,NO_CREATE)]].
198
199<<array typedefs and [[#defines]]>>=
200#define NO_CREATE  0
201#define CREATE     1
202@ %def NO_CREATE CREATE
203
204@
205[[Array_find]] is hash-table lookup that breaks into two cases:
206
207\list
208\item{(1)} If [[*cp]] is numeric and integer valued, then lookup by
209integer value using [[find_by_ival]].  If [[*cp]] is numeric, but not
210integer valued, then convert to string with [[sprintf(CONVFMT,...)]] and
211go to case~2.
212
213\item{(2)} If [[*cp]] is string valued, then lookup by string value
214using [[find_by_sval]].
215\endlist
216
217<<interface functions>>=
218CELL* array_find(
219   ARRAY A,
220   CELL *cp,
221   int create_flag)
222{
223   ANODE *ap ;
224   int redid ;
225   if (A->size == 0 && !create_flag)
226      /* eliminating this trivial case early avoids unnecessary conversions later */
227      return (CELL*) 0 ;
228   switch (cp->type) {
229      case C_DOUBLE:
230	 <<if the [[*cp]] is an integer, find by integer value else find by string value>>
231	 break ;
232      case C_NOINIT:
233	 ap = find_by_sval(A, &null_str, create_flag, &redid) ;
234	 break ;
235      default:
236	 ap = find_by_sval(A, string(cp), create_flag, &redid) ;
237	 break ;
238   }
239   return ap ? &ap->cell : (CELL *) 0 ;
240}
241@ %def array_find
242
243@
244To test whether [[cp->dval]] is integer, we convert to the nearest
245integer by rounding towards zero (done by [[do_to_I]]) and then cast
246back to double.  If we get the same number we started with, then
247[[cp->dval]] is integer valued.
248
249<<if the [[*cp]] is an integer, find by integer value else find by string value>>=
250{
251   double d = cp->dval ;
252   Int ival = d_to_I(d) ;
253   if ((double)ival == d) {
254      if (A->type == AY_SPLIT) {
255         if (ival >= 1 && ival <= (int) A->size)
256            return (CELL*)A->ptr+(ival-1) ;
257         if (!create_flag) return (CELL*) 0 ;
258         convert_split_array_to_table(A) ;
259      }
260      else if (A->type == AY_NULL) make_empty_table(A, AY_INT) ;
261      ap = find_by_ival(A, ival, create_flag, &redid) ;
262   }
263   else {
264      /* convert to string */
265      char buff[260] ;
266      STRING *sval ;
267      sprintf(buff, string(CONVFMT)->str, d) ;
268      sval = new_STRING(buff) ;
269      ap = find_by_sval(A, sval, create_flag, &redid) ;
270      free_STRING(sval) ;
271   }
272}
273
274@
275When we get to the function [[find_by_ival]], the search has been reduced
276to lookup in a hash table by integer value.
277
278<<local functions>>=
279static ANODE* find_by_ival(
280   ARRAY A ,
281   Int ival ,
282   int create_flag ,
283   int *redo )
284{
285   DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
286   unsigned indx = (unsigned) ival & A->hmask ;
287   ANODE *p = table[indx].ilink ; /* walks ilist */
288   ANODE *q = (ANODE*) 0 ; /* trails p */
289   while(1) {
290      if (!p) {
291	  /* search failed */
292	  <<search by string value if needed and create if needed>>
293	  break ;
294      }
295      else if (p->ival == ival) {
296	 /* found it, now move to the front */
297	 if (!q) /* already at the front */
298	    return p ;
299	 /* delete for insertion at the front */
300	 q->ilink = p->ilink ;
301	 break ;
302      }
303      q = p ; p = q->ilink ;
304   }
305   /* insert at the front */
306   p->ilink = table[indx].ilink ;
307   table[indx].ilink = p ;
308   return p ;
309}
310@ %def find_by_ival
311
312<<local constants, defs and prototypes>>=
313static ANODE* find_by_ival(ARRAY, Int, int, int*);
314
315@
316When a search by integer value fails, we have to check by string
317value to correctly
318handle the case insertion by [[A["123"]]] and later search as
319[[A[123]]].  This string search is necessary if and only if the
320[[AY_STR]] bit is set.  An important point is that all [[ANODEs]] get
321created with a valid [[sval]] if [[AY_STR]] is set, because then creation
322of new nodes always occurs in a call to [[find_by_sval]].
323
324<<search by string value if needed and create if needed>>=
325if (A->type & AY_STR) {
326   /* need to search by string */
327   char buff[256] ;
328   STRING *sval ;
329   sprintf(buff, INT_FMT, ival) ;
330   sval = new_STRING(buff) ;
331   p = find_by_sval(A, sval, create_flag, redo) ;
332   if (*redo) {
333      table = (DUAL_LINK*) A->ptr ;
334   }
335   free_STRING(sval) ;
336   if (!p) return (ANODE*) 0 ;
337}
338else if (create_flag) {
339   p = ZMALLOC(ANODE) ;
340   p->sval = (STRING*) 0 ;
341   p->cell.type = C_NOINIT ;
342   if (++A->size > A->limit) {
343      double_the_hash_table(A) ; /* changes table, may change index */
344      table = (DUAL_LINK*) A->ptr ;
345      indx = A->hmask & (unsigned) ival ;
346   }
347}
348else return (ANODE*) 0 ;
349p->ival = ival ;
350A->type |= AY_INT ;
351
352@
353Searching by string value is easier because [[AWK]] arrays are really
354string associations.  If the array does not have the [[AY_STR]] bit set,
355then we have to convert the array to a dual hash table with strings
356which is done by the function [[add_string_associations]].
357
358<<local functions>>=
359static ANODE* find_by_sval(
360   ARRAY A ,
361   STRING *sval ,
362   int create_flag ,
363   int *redo )
364{
365   unsigned hval = ahash(sval) ;
366   char *str = sval->str ;
367   DUAL_LINK *table ;
368   unsigned indx ;
369   ANODE *p ;  /* walks list */
370   ANODE *q = (ANODE*) 0 ; /* trails p */
371   if (! (A->type & AY_STR)) add_string_associations(A) ;
372   table = (DUAL_LINK*) A->ptr ;
373   indx = hval & A->hmask ;
374   p = table[indx].slink ;
375   *redo = 0 ;
376   while(1) {
377      if (!p)  {
378         if (create_flag) {
379	    <<create a new anode for [[sval]]>>
380	    break ;
381	 }
382	 return (ANODE*) 0 ;
383      }
384      else if (p->hval == hval) {
385	 if (strcmp(p->sval->str,str) == 0 ) {
386	    /* found */
387	    if (!q) /* already at the front */
388	       return p ;
389	    else { /* delete for move to the front */
390	       q->slink = p->slink ;
391	       break ;
392	    }
393	 }
394      }
395      q = p ; p = q->slink ;
396   }
397   p->slink = table[indx].slink ;
398   table[indx].slink = p ;
399   return p ;
400}
401@ %def find_by_sval
402
403<<local constants, defs and prototypes>>=
404static ANODE* find_by_sval(ARRAY, STRING*, int, int*);
405
406@
407One [[Int]] value is reserved to show that the [[ival]] field is invalid.
408This works because [[d_to_I]] returns a value in [[[-Max_Int, Max_Int]]].
409
410<<local constants, defs and prototypes>>=
411#define NOT_AN_IVALUE (-Max_Int-1)  /* usually 0x80000000 */
412@ %def NOT_AN_IVALUE
413
414<<create a new anode for [[sval]]>>=
415{
416   p = ZMALLOC(ANODE) ;
417   p->sval = sval ;
418   sval->ref_cnt++ ;
419   p->ival = NOT_AN_IVALUE ;
420   p->hval = hval ;
421   p->cell.type = C_NOINIT ;
422   if (++A->size > A->limit) {
423      double_the_hash_table(A) ; /* changes table, may change index */
424      table = (DUAL_LINK*) A->ptr ;
425      indx = hval & A->hmask ;
426      *redo = 1 ;
427   }
428}
429
430@
431On entry to [[add_string_associations]], we know that the [[AY_STR]] bit
432is not set. We convert to a dual hash table, then walk all the integer
433lists and put each [[ANODE]] on a string list.
434
435<<local functions>>=
436static void add_string_associations(ARRAY A)
437{
438   if (A->type == AY_NULL) make_empty_table(A, AY_STR) ;
439   else {
440      DUAL_LINK *table ;
441      int i ; /* walks table */
442      ANODE *p ; /* walks ilist */
443      char buff[256] ;
444      if (A->type == AY_SPLIT) convert_split_array_to_table(A) ;
445      table = (DUAL_LINK*) A->ptr ;
446      for(i=0; (unsigned) i <= A->hmask; i++) {
447	 p = table[i].ilink ;
448	 while(p) {
449	    sprintf(buff, INT_FMT, p->ival) ;
450	    p->sval = new_STRING(buff) ;
451	    p->hval = ahash(p->sval) ;
452	    p->slink = table[A->hmask&p->hval].slink ;
453	    table[A->hmask&p->hval].slink = p ;
454	    p = p->ilink ;
455	 }
456      }
457      A->type |= AY_STR ;
458   }
459}
460@ %def add_string_associations
461
462<<local constants, defs and prototypes>>=
463static void add_string_associations(ARRAY);
464
465@
466\subsection{Array Delete}
467The execution of the statement, $\hbox{\it delete }A[\expr]$, creates a
468call to{\hfil\break}[[array_delete(ARRAY A, CELL *cp)]].  Depending on the
469type of [[*cp]], the call is routed to [[find_by_sval]] or [[find_by_ival]].
470Each of these functions leaves its return value on the front of an
471slist or ilist, respectively, and then it is deleted from the front of
472the list.  The case where $A[\expr]$ is on two lists, e.g.,
473[[A[12]]] and [[A["12"]]] is checked by examining the [[sval]] and
474[[ival]] fields of the returned [[ANODE*]].
475
476<<interface functions>>=
477void array_delete(
478   ARRAY A,
479   CELL *cp)
480{
481   ANODE *ap ;
482   int redid ;
483   if (A->size == 0) return ;
484   switch(cp->type) {
485      case C_DOUBLE :
486	 {
487	    double d = cp->dval ;
488	    Int ival = d_to_I(d) ;
489	    if ((double)ival == d) <<delete by integer value and return>>
490	    else { /* get the string value */
491	       char buff[260] ;
492	       STRING *sval ;
493	       sprintf(buff, string(CONVFMT)->str, d) ;
494	       sval = new_STRING(buff) ;
495	       ap = find_by_sval(A, sval, NO_CREATE, &redid) ;
496	       free_STRING(sval) ;
497	    }
498	 }
499	 break ;
500      case C_NOINIT :
501	 ap = find_by_sval(A, &null_str, NO_CREATE, &redid) ;
502	 break ;
503      default :
504	 ap = find_by_sval(A, string(cp), NO_CREATE, &redid) ;
505	 break ;
506   }
507   if (ap) { /* remove from the front of the slist */
508      DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
509      table[ap->hval & A->hmask].slink = ap->slink ;
510      <<if [[ival]] is valid, remove [[ap]] from its ilist>>
511      free_STRING(ap->sval) ;
512      cell_destroy(&ap->cell) ;
513      ZFREE(ap) ;
514      <<decrement [[A->size]]>>
515   }
516}
517
518<<delete by integer value and return>>=
519{
520   if (A->type == AY_SPLIT)
521     {
522      if (ival >=1 && ival <= (int) A->size)
523          convert_split_array_to_table(A) ;
524      else return ; /* ival not in range */
525     }
526   ap = find_by_ival(A, ival, NO_CREATE, &redid) ;
527   if (ap) { /* remove from the front of the ilist */
528      DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
529      table[(unsigned) ap->ival & A->hmask].ilink = ap->ilink ;
530      <<if [[sval]] is valid, remove [[ap]] from its slist>>
531      cell_destroy(&ap->cell) ;
532      ZFREE(ap) ;
533      <<decrement [[A->size]]>>
534   }
535   return ;
536}
537
538@
539Even though we found a node by searching an ilist it might also
540be on an slist and vice-versa.
541
542<<if [[sval]] is valid, remove [[ap]] from its slist>>=
543if (ap->sval) {
544   ANODE *p, *q = 0 ;
545   unsigned indx = (unsigned) ap->hval & A->hmask ;
546   p = table[indx].slink ;
547   while(p != ap) { q = p ; p = q->slink ; }
548   if (q) q->slink = p->slink ;
549   else table[indx].slink = p->slink ;
550   free_STRING(ap->sval) ;
551}
552
553<<if [[ival]] is valid, remove [[ap]] from its ilist>>=
554if (ap->ival != NOT_AN_IVALUE) {
555   ANODE *p, *q = 0 ;
556   unsigned indx = (unsigned) ap->ival & A->hmask ;
557   p = table[indx].ilink ;
558   while(p != ap) { q = p ; p = q->ilink ; }
559   if (q) q->ilink = p->ilink ;
560   else table[indx].ilink = p->ilink ;
561}
562
563@
564When the size of a hash table drops below a certain value, it might
565be profitable to shrink the hash table.  Currently we don't do this,
566because our guess is that it would be a waste of time for most
567[[AWK]] applications.  However, we do convert an array to [[AY_NULL]]
568when the size goes to zero which would resize a large hash table
569that had been completely cleared by successive deletions.
570
571<<decrement [[A->size]]>>=
572if (--A->size == 0) array_clear(A) ;
573
574
575@
576\subsection{Building an Array with Split}
577A simple operation is to create an array with the [[AWK]]
578primitive [[split]].  The code that performs [[split]] puts the
579pieces in an anonymous buffer.
580[[array_load(A, cnt)]] moves the [[cnt]] elements from the anonymous
581buffer into [[A]].
582This is the only way an array of type [[AY_SPLIT]] is
583created.
584
585<<interface functions>>=
586void array_load(
587   ARRAY A,
588   size_t cnt)
589{
590   <<clean up the existing array and prepare an empty split array of size [[cnt]]>>
591   A->size = cnt ;
592   transfer_to_array((CELL*) A->ptr, cnt) ;
593}
594@ %def array_load
595
596@
597If the array [[A]] is a split array and big enough then we reuse it,
598otherwise we need to allocate a new split array.
599When we allocate a block of [[CELLs]] for a split array, we round up
600to a multiple of 4.
601
602<<clean up the existing array and prepare an empty split array of size [[cnt]]>>=
603if (A->type != AY_SPLIT || A->limit < cnt) {
604    array_clear(A) ;
605    A->limit = (cnt & (size_t) ~3) + 4 ;
606    A->ptr = zmalloc(A->limit*sizeof(CELL)) ;
607    A->type = AY_SPLIT ;
608}
609else
610{
611    /* reusing an existing AY_SPLIT array */
612    size_t i ;
613    for(i=0; i < A->size; i++) {
614       cell_destroy((CELL*)A->ptr + i) ;
615    }
616}
617
618@
619\subsection{Array Clear}
620The function [[array_clear(ARRAY A)]] converts [[A]] to type [[AY_NULL]]
621and frees all storage used by [[A]] except for the [[struct array]]
622itself.  This function gets called in three contexts:
623(1)~when an array local to a user function goes out of scope,
624(2)~execution of the [[AWK]] statement, [[delete A]] and
625(3)~when an existing changes type or size from [[split()]].
626
627<<interface functions>>=
628void array_clear(ARRAY A)
629{
630   unsigned i ;
631   ANODE *p, *q ;
632   if (A->type == AY_SPLIT) {
633      for(i = 0; i < A->size; i++)
634          cell_destroy((CELL*)A->ptr+i) ;
635      zfree(A->ptr, A->limit * sizeof(CELL)) ;
636   }
637   else if (A->type & AY_STR) {
638      DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
639      for(i=0; (unsigned) i <= A->hmask; i++) {
640	 p = table[i].slink ;
641	 while(p) {
642	    q = p ; p = q->slink ;
643	    free_STRING(q->sval) ;
644	    cell_destroy(&q->cell) ;
645	    ZFREE(q) ;
646	 }
647      }
648      zfree(A->ptr, (A->hmask+1)*sizeof(DUAL_LINK)) ;
649   }
650   else if (A->type & AY_INT) {
651      DUAL_LINK *table = (DUAL_LINK*) A->ptr ;
652      for(i=0; (unsigned) i <= A->hmask; i++) {
653	 p = table[i].ilink ;
654	 while(p) {
655	    q = p ; p = q->ilink ;
656	    cell_destroy(&q->cell) ;
657	    ZFREE(q) ;
658	 }
659      }
660      zfree(A->ptr, (A->hmask+1)*sizeof(DUAL_LINK)) ;
661   }
662   memset(A, 0, sizeof(*A)) ;
663}
664@ %def array_clear
665
666
667@
668\subsection{Constructor and Conversions}
669Arrays are always created as empty arrays of type [[AY_NULL]].
670Global arrays are never destroyed although they can go empty or have
671their type change by conversion.  The only constructor function is
672a macro.
673
674<<array typedefs and [[#defines]]>>=
675#define new_ARRAY()  ((ARRAY)memset(ZMALLOC(struct array),0,sizeof(struct array)))
676@ %def new_ARRAY
677
678@
679Hash tables only get constructed by conversion.  This happens in two
680ways.
681The function [[make_empty_table]] converts an empty array of type
682[[AY_NULL]] to an empty hash table.  The number of lists in the table
683is a power of 2 determined by the constant [[STARTING_HMASK]].
684The limit size of the table is determined by the constant
685[[MAX_AVE_LIST_LENGTH]] which is the largest average size of the hash
686lists that we are willing to tolerate before enlarging the table.
687When [[A->size]] exceeds [[A->limit]],
688the hash table grows in size by doubling the number of lists.
689[[A->limit]] is then reset to [[MAX_AVE_LIST_LENGTH]] times
690[[A->hmask+1]].
691
692<<local constants, defs and prototypes>>=
693#define STARTING_HMASK    63  /* 2^6-1, must have form 2^n-1 */
694#define MAX_AVE_LIST_LENGTH   12
695#define hmask_to_limit(x) (((x)+1)*MAX_AVE_LIST_LENGTH)
696#define ahash(sval) hash2((sval)->str, (sval)->len)
697@ %def STARTING_HMASK
698@ %def MAX_AVE_LIST_LENGTH
699@ %def hmask_to_limit
700@ %def ahash
701
702<<local functions>>=
703static void make_empty_table(
704   ARRAY A ,
705   int type ) /* AY_INT or AY_STR */
706{
707   size_t sz = (STARTING_HMASK+1)*sizeof(DUAL_LINK) ;
708   A->type = (short) type ;
709   A->hmask = STARTING_HMASK ;
710   A->limit = hmask_to_limit(STARTING_HMASK) ;
711   A->ptr = memset(zmalloc(sz), 0, sz) ;
712}
713@ %def make_empty_table
714
715<<local constants, defs and prototypes>>=
716static void make_empty_table(ARRAY, int);
717
718@
719The other way a hash table gets constructed is when a split array is
720converted to a hash table of type [[AY_INT]].
721
722<<local functions>>=
723static void convert_split_array_to_table(ARRAY A)
724{
725   CELL *cells = (CELL*) A->ptr ;
726   unsigned i ; /* walks cells */
727   DUAL_LINK *table ;
728   unsigned j ; /* walks table */
729   size_t entry_limit = A->limit ;
730   <<determine the size of the hash table and allocate>>
731   /* insert each cells[i] in the new hash table on an ilist */
732   for(i=0, j=1; i < A->size; i++) {
733      ANODE *p = ZMALLOC(ANODE) ;
734      p->sval = (STRING*) 0 ;
735      p->ival = (Int) (i + 1) ;
736      p->cell = cells[i] ;
737      p->ilink = table[j].ilink ;
738      table[j].ilink = p ;
739      j++ ; j &= A->hmask ;
740   }
741   A->type = AY_INT ;
742   zfree(cells, entry_limit*sizeof(CELL)) ;
743}
744@ %def convert_split_array_to_table
745
746<<local constants, defs and prototypes>>=
747static void convert_split_array_to_table(ARRAY);
748
749@
750To determine the size of the table, we set the initial size to
751[[STARTING_HMASK+1]] and then double the size until
752[[A->size <= A->limit]].
753
754<<determine the size of the hash table and allocate>>=
755A->hmask = STARTING_HMASK ;
756A->limit = hmask_to_limit(STARTING_HMASK) ;
757while(A->size > A->limit) {
758   A->hmask = (A->hmask<<1) + 1 ; /* double the size */
759   A->limit = hmask_to_limit(A->hmask) ;
760}
761{
762   size_t sz = (A->hmask+1)*sizeof(DUAL_LINK) ;
763   A->ptr = memset(zmalloc(sz), 0, sz) ;
764   table = (DUAL_LINK*) A->ptr ;
765}
766
767
768@
769\subsection{Doubling the Size of a Hash Table}
770The whole point of making the table size a power of two is to
771facilitate resizing the table.  If the table size is $2^n$ and
772$h$ is the hash key, then $h\bmod 2^n$ is the hash chain index
773which can be calculated with bit-wise and,
774{\mathchardef~="2026 $h ~ (2^n-1)$}.
775When the table size doubles, the new bit-mask has one more bit
776turned on.  Elements of an old hash chain whose hash value have this bit
777turned on get moved to a new chain. Elements with this bit turned off
778stay on the same chain.  On average only half the old chain moves to the
779new chain.  If the old chain is at ${\it table}[i],\ 0\le i < 2^n$,
780then the elements that move, all move to the new chain at
781${\it table}[i+2^n]$.
782
783<<local functions>>=
784static void double_the_hash_table(ARRAY A)
785{
786   unsigned old_hmask = A->hmask ;
787   unsigned new_hmask = (old_hmask<<1)+1 ;
788   DUAL_LINK *table ;
789   <<allocate the new hash table>>
790   <<if the old table has string lists, move about half the string nodes>>
791   <<if the old table has integer lists, move about half the integer nodes>>
792   A->hmask = new_hmask ;
793   A->limit = hmask_to_limit(new_hmask) ;
794}
795@ %def double_the_hash_table
796
797<<local constants, defs and prototypes>>=
798static void double_the_hash_table(ARRAY);
799
800<<allocate the new hash table>>=
801A->ptr = zrealloc(A->ptr, (old_hmask+1)*sizeof(DUAL_LINK),
802			  (new_hmask+1)*sizeof(DUAL_LINK)) ;
803table = (DUAL_LINK*) A->ptr ;
804/* zero out the new part which is the back half */
805memset(&table[old_hmask+1], 0, (old_hmask+1)*sizeof(DUAL_LINK)) ;
806
807<<if the old table has string lists, move about half the string nodes>>=
808if (A->type & AY_STR) {
809   unsigned i ; /* index to old lists */
810   unsigned j ; /* index to new lists */
811   ANODE *p ; /* walks an old list */
812   ANODE *q ; /* trails p for deletion */
813   ANODE *tail ; /* builds new list from the back */
814   ANODE dummy0, dummy1 ;
815   for(i=0, j=old_hmask+1; i <= old_hmask; i++, j++)
816      <<walk one old string list, creating one new string list>>
817}
818
819@
820As we walk an old string list with pointer [[p]], the expression
821[[p->hval & new_hmask]] takes one of two values.  If it is equal
822to [[p->hval & old_hmask]] (which equals [[i]]),
823then the node stays otherwise it gets moved
824to a new string list at [[j]].  The new string list preserves order so that
825the positions of the move-to-the-front heuristic are preserved.
826Nodes moving to the new list are appended at pointer [[tail]].
827The [[ANODEs]], [[dummy0]]~and [[dummy1]], are sentinels that remove
828special handling of boundary conditions.
829
830<<walk one old string list, creating one new string list>>=
831{
832   q = &dummy0 ;
833   q->slink = p = table[i].slink ;
834   tail = &dummy1 ;
835   while (p) {
836      if ((p->hval & new_hmask) != (unsigned) i) { /* move it */
837	 q->slink = p->slink ;
838	 tail = tail->slink = p ;
839      }
840      else q = p ;
841      p = q->slink ;
842   }
843   table[i].slink = dummy0.slink ;
844   tail->slink = (ANODE*) 0 ;
845   table[j].slink = dummy1.slink ;
846}
847
848@
849The doubling of the integer lists is exactly the same except that
850[[slink]] is replaced by [[ilink]] and [[hval]] is replaced by [[ival]].
851
852<<if the old table has integer lists, move about half the integer nodes>>=
853if (A->type & AY_INT) {
854   unsigned i ; /* index to old lists */
855   unsigned j ; /* index to new lists */
856   ANODE *p ; /* walks an old list */
857   ANODE *q ; /* trails p for deletion */
858   ANODE *tail ; /* builds new list from the back */
859   ANODE dummy0, dummy1 ;
860   for(i=0, j=old_hmask+1; i <= old_hmask; i++, j++)
861      <<walk one old integer list, creating one new integer list>>
862}
863
864<<walk one old integer list, creating one new integer list>>=
865{
866   q = &dummy0 ;
867   q->ilink = p = table[i].ilink ;
868   tail = &dummy1 ;
869   while (p) {
870      if (((unsigned) p->ival & new_hmask) != i) { /* move it */
871	 q->ilink = p->ilink ;
872	 tail = tail->ilink = p ;
873      }
874      else q = p ;
875      p = q->ilink ;
876   }
877   table[i].ilink = dummy0.ilink ;
878   tail->ilink = (ANODE*) 0 ;
879   table[j].ilink = dummy1.ilink ;
880}
881
882@ Initializing Array Loops
883Our mechanism for dealing with execution of the statement,
884\medskip
885\centerline{[[for(i in A) {]] {\it statements} [[}]]}
886\medskip
887\noindent
888is simple. We allocate a vector of [[STRING*]] of size,
889[[A->size]].  Each element of the vector is a string key for~[[A]].
890Note that if the [[AY_STR]] bit of [[A]] is not set, then [[A]]
891has to be converted to a string hash table, because the index
892[[i]] walks string indices.
893
894To execute the loop, the only state that needs to be saved is the
895address of [[i]] and an index into the vector of string keys.  Since
896nothing about [[A]] is saved as state, the user
897program can do anything to [[A]] inside the body of
898the loop, even [[delete A]], and the loop
899still works.  Essentially, we have traded data space (the string vector)
900in exchange for implementation simplicity.  On a 32-bit system, each
901[[ANODE]] is 36 bytes, so the extra memory needed for the array loop is
90211\% more than the memory consumed by the [[ANODEs]] of the array.
903Note that the large size of the [[ANODEs]] is indicative of our whole
904design which pays data space for integer lookup speed and algorithm
905simplicity.
906
907The only aspect of array loops that occurs in [[array.c]] is construction
908of the string vector.  The rest of the implementation
909is in the file [[execute.c]].
910
911<<interface functions>>=
912static int string_compare(
913   const void *l,
914   const void *r)
915{
916   STRING*const * a = (STRING *const *) l;
917   STRING*const * b = (STRING *const *) r;
918
919   return strcmp((*a)->str, (*b)->str);
920}
921@ %def string_compare
922
923<<interface functions>>=
924STRING** array_loop_vector(
925   ARRAY A,
926   size_t *sizep)
927{
928   STRING** ret ;
929   *sizep = A->size ;
930   if (A->size > 0) {
931      if (!(A->type & AY_STR)) add_string_associations(A) ;
932      ret = (STRING**) zmalloc(A->size*sizeof(STRING*)) ;
933      <<for each [[ANODE]] in [[A]], put one string in [[ret]]>>
934      if (getenv("WHINY_USERS") != NULL)	/* gawk compability */
935	qsort(ret, A->size, sizeof(STRING*), string_compare);
936      return ret ;
937   }
938   return (STRING**) 0 ;
939}
940@ %def array_loop_vector
941
942@
943As we walk over the hash table [[ANODEs]], putting each [[sval]] in
944[[ret]], we need to increment each reference count.  The user of the
945return value is responsible for these new reference counts.
946
947<<for each [[ANODE]] in [[A]], put one string in [[ret]]>>=
948{
949   int r = 0 ; /* indexes ret */
950   DUAL_LINK* table = (DUAL_LINK*) A->ptr ;
951   int i ; /* indexes table */
952   ANODE *p ; /* walks slists */
953   for(i=0; (unsigned) i <= A->hmask; i++) {
954      for(p = table[i].slink; p ; p = p->slink) {
955	 ret[r++] = p->sval ;
956	 p->sval->ref_cnt++ ;
957      }
958   }
959}
960
961@
962\subsection{Concatenating Array Indices}
963In [[AWK]], an array expression [[A[i,j]]] is equivalent to the
964expression [[A[i SUBSEP j]]], i.e., the index is the
965concatenation of the three
966elements [[i]], [[SUBSEP]] and [[j]].  This is performed by the
967function [[array_cat]].  On entry, [[sp]] points at the top of a
968stack of [[CELLs]].
969[[Cnt]] cells are popped off the stack and concatenated together
970separated by [[SUBSEP]] and the result is pushed back on the stack.
971On entry, the first multi-index is in [[sp[1-cnt]]] and the last is
972in [[sp[0]]].  The return value is the new stack top.
973(The stack is the run-time evaluation stack.
974This operation really has nothing to do with array structure, so
975logically this code belongs in [[execute.c]], but remains here for
976historical reasons.)
977
978
979<<interface functions>>=
980CELL *array_cat(
981   CELL *sp,
982   int cnt)
983{
984   CELL *p ;  /* walks the eval stack */
985   CELL subsep ;  /* local copy of SUBSEP */
986   <<subsep parts>>
987   size_t total_len ;  /* length of cat'ed expression */
988   CELL *top ;   /* value of sp at entry */
989   char *target ;  /* build cat'ed char* here */
990   STRING *sval ;  /* build cat'ed STRING here */
991   <<get subsep and compute parts>>
992   <<set [[top]] and return value of [[sp]]>>
993   <<cast cells to string and compute [[total_len]]>>
994   <<build the cat'ed [[STRING]] in [[sval]]>>
995   <<cleanup, set [[sp]] and return>>
996}
997@ %def array_cat
998
999@
1000We make a copy of [[SUBSEP]] which we can cast to string in the
1001unlikely event the user has assigned a number to [[SUBSEP]].
1002
1003<<subsep parts>>=
1004size_t subsep_len ; /* string length of subsep_str */
1005char *subsep_str ;
1006
1007<<get subsep and compute parts>>=
1008cellcpy(&subsep, SUBSEP) ;
1009if ( subsep.type < C_STRING ) cast1_to_s(&subsep) ;
1010subsep_len = string(&subsep)->len ;
1011subsep_str = string(&subsep)->str ;
1012
1013@
1014Set [[sp]] and [[top]] so the cells to concatenate are inclusively
1015between [[sp]] and [[top]].
1016
1017<<set [[top]] and return value of [[sp]]>>=
1018assert(cnt > 0);
1019
1020top = sp ; sp -= (cnt-1) ;
1021
1022@
1023The [[total_len]] is the sum of the lengths of the [[cnt]]
1024strings and the [[cnt-1]] copies of [[subsep]].
1025
1026<<cast cells to string and compute [[total_len]]>>=
1027total_len = ((size_t) (cnt-1)) * subsep_len ;
1028for(p = sp ; p <= top ; p++) {
1029   if ( p->type < C_STRING ) cast1_to_s(p) ;
1030   total_len += string(p)->len ;
1031}
1032
1033<<build the cat'ed [[STRING]] in [[sval]]>>=
1034sval = new_STRING0(total_len) ;
1035target = sval->str ;
1036for(p = sp ; p < top ; p++) {
1037   memcpy(target, string(p)->str, string(p)->len) ;
1038   target += string(p)->len ;
1039   memcpy(target, subsep_str, subsep_len) ;
1040   target += subsep_len ;
1041}
1042/* now p == top */
1043memcpy(target, string(p)->str, string(p)->len) ;
1044
1045@
1046The return value is [[sp]] and it is already set correctly.  We
1047just need to free the strings and set the contents of [[sp]].
1048
1049<<cleanup, set [[sp]] and return>>=
1050for(p = sp; p <= top ; p++) free_STRING(string(p)) ;
1051free_STRING(string(&subsep)) ;
1052/* set contents of sp , sp->type > C_STRING is possible so reset */
1053sp->type = C_STRING ;
1054sp->ptr = (PTR) sval ;
1055return sp ;
1056
1057
1058@
1059\section{Source Files}
1060
1061<<"array.h">>=
1062/* array.h */
1063<<blurb>>
1064#ifndef ARRAY_H
1065#define ARRAY_H 1
1066
1067#include "nstd.h"
1068#include "types.h"
1069
1070<<array typedefs and [[#defines]]>>
1071<<interface prototypes>>
1072#endif /* ARRAY_H */
1073
1074<<"array.c">>=
1075/* array.c */
1076<<blurb>>
1077#include "mawk.h"
1078#include "symtype.h"
1079#include "memory.h"
1080#include "split.h"
1081#include "field.h"
1082#include "bi_vars.h"
1083<<local constants, defs and prototypes>>
1084<<interface functions>>
1085<<local functions>>
1086
1087<<blurb>>=
1088/*
1089$MawkId: array.w,v 1.18 2014/08/14 23:34:44 mike Exp $
1090
1091copyright 2009,2010, Thomas E. Dickey
1092copyright 1991-1996,2014 Michael D. Brennan
1093
1094This is a source file for mawk, an implementation of
1095the AWK programming language.
1096
1097Mawk is distributed without warranty under the terms of
1098the GNU General Public License, version 2, 1991.
1099
1100array.c and array.h were generated with the commands
1101
1102   notangle -R'"array.c"' array.w > array.c
1103   notangle -R'"array.h"' array.w > array.h
1104
1105Notangle is part of Norman Ramsey's noweb literate programming package
1106available from CTAN(ftp.shsu.edu).
1107
1108It's easiest to read or modify this file by working with array.w.
1109*/
1110
1111@
1112\idindex
1113