1 // ================================================================
2 // This is a hashless implementation of insertion-ordered key-value pairs for
3 // Miller's fundamental record data structure.  It implements the same
4 // interface as the hashed version (see lhmss.h).
5 //
6 // Design:
7 //
8 // * It keeps a doubly-linked list of key-value pairs.
9 // * No hash functions are computed when the map is written to or read from.
10 // * Gets are implemented by sequential scan through the list: given a key,
11 //   the key-value pairs are scanned through until a match is (or is not) found.
12 // * Performance improvement of 10-15% percent over lhmss is found (for test data).
13 //
14 // Motivation:
15 //
16 // * The use case for records in Miller is that *all* fields are read from
17 //   strings & written to strings (split/join), while only *some* fields are
18 //   operated on.
19 //
20 // * Meanwhile there are few repeated accesses to a given record: the
21 //   access-to-construct ratio is quite low for Miller data records.  Miller
22 //   instantiates thousands, millions, billions of records (depending on the
23 //   input data) but accesses each record only once per mapping operation.
24 //   (This is in contrast to accumulator hashmaps which are repeatedly accessed
25 //   during a stats run.)
26 //
27 // * The hashed impl computes hashsums for *all* fields whether operated on or not,
28 //   for the benefit of the *few* fields looked up during the mapping operation.
29 //
30 // * The hashless impl only keeps string pointers.  Lookups are done at runtime
31 //   doing prefix search on the key names. Assuming field names are distinct,
32 //   this is just a few char-ptr accesses which (in experiments) turn out to
33 //   offer about a 10-15% performance improvement.
34 //
35 // * Added benefit: the field-rename operation (preserving field order) becomes
36 //   trivial.
37 //
38 // Notes:
39 // * null key is not supported.
40 // * null value is supported.
41 // ================================================================
42 
43 #ifndef LREC_H
44 #define LREC_H
45 
46 #include "lib/free_flags.h"
47 #include "containers/sllv.h"
48 #include "containers/slls.h"
49 #include "containers/hss.h"
50 #include "containers/header_keeper.h"
51 
52 #define FIELD_QUOTED_ON_INPUT 0x02
53 
54 struct _lrec_t; // forward reference
55 typedef struct _lrec_t lrec_t;
56 
57 typedef void lrec_free_func_t(lrec_t* prec);
58 
59 // ----------------------------------------------------------------
60 typedef struct _lrece_t {
61 	char* key;
62 	char* value;
63 	// These indicate whether the key/value should be freed on lrec_free().
64 	// Affirmative example: key/value is strdup of something.
65 	// Negative example: key/value are pointers into a line the memory
66 	// management of which is separately managed.
67 	// Another negative example: key/value is a string literal, e.g. "".
68 	char free_flags;
69 	char quote_flags;
70 
71 	struct _lrece_t *pprev;
72 	struct _lrece_t *pnext;
73 } lrece_t;
74 
75 struct _lrec_t {
76 	//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
77 	int      field_count;
78 	lrece_t* phead;
79 	lrece_t* ptail;
80 
81 	//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
82 	// See comments above free_flags. Used to track a mallocked pointer to be
83 	// freed at lrec_free().
84 
85 	// E.g. for NIDX, DKVP, and CSV formats (header handled separately in the
86 	// latter case).
87 	char* psingle_line;
88 
89 	// For XTAB format.
90 	slls_t* pxtab_lines;
91 
92 	//  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
93 	// Format-dependent virtual-function pointer:
94 	lrec_free_func_t* pfree_backing_func;
95 };
96 
97 // ----------------------------------------------------------------
98 lrec_t* lrec_unbacked_alloc();
99 lrec_t* lrec_dkvp_alloc(char* line);
100 lrec_t* lrec_nidx_alloc(char* line);
101 lrec_t* lrec_csvlite_alloc(char* data_line);
102 lrec_t* lrec_csv_alloc(char* data_line);
103 lrec_t* lrec_xtab_alloc(slls_t* pxtab_lines);
104 
105 void lrec_clear(lrec_t* prec);
106 void  lrec_free(lrec_t* prec);
107 lrec_t* lrec_copy(lrec_t* pinrec);
108 
109 // The only difference between lrec_put and lrec_prepend is that the latter
110 // adds to the end of the record, while the former adds to the beginning.
111 //
112 // For both, the key/value respectively will be freed by lrec_free if the
113 // corresponding bits are set in the free_flags.
114 //
115 // * If a string literal or other non-allocated pointer (e.g. mmapped memory
116 //   from a file reader) is passed in, the free flag should not be set.
117 //
118 // * If dynamically allocated pointers are passed in, then either:
119 //
120 //   o The respective free_flag(s) should be set and the caller should be sure
121 //     not to also free (else, there will be heap corruption due to
122 //     double-free), or
123 //
124 //   o The respective free_flag(s) should not be set and the caller should
125 //     free the memory (else, there will be a memory leak).
126 void  lrec_put(lrec_t* prec, char* key, char* value, char free_flags);
127 void  lrec_put_ext(lrec_t* prec, char* key, char* value, char free_flags, char quote_flags);
128 // Like lrec_put: if key is present, modify value. But if not, add new field at start of record, not at end.
129 void  lrec_prepend(lrec_t* prec, char* key, char* value, char free_flags);
130 // Like lrec_put: if key is present, modify value. But if not, add new field after specified entry, not at end.
131 // Returns a pointer to the added/modified node.
132 lrece_t*  lrec_put_after(lrec_t* prec, lrece_t* pd, char* key, char* value, char free_flags);
133 
134 char* lrec_get(lrec_t* prec, char* key);
135 lrece_t* lrec_get_pair_by_position(lrec_t* prec, int position); // 1-up not 0-up
136 char* lrec_get_key_by_position(lrec_t* prec, int position); // 1-up not 0-up
137 char* lrec_get_value_by_position(lrec_t* prec, int position); // 1-up not 0-up
138 
139 // This returns a pointer to the lrec's free-flags so that the caller can do ownership-transfer
140 // of about-to-be-removed key-value pairs.
141 char* lrec_get_pff(lrec_t* prec, char* key, char** ppfree_flags);
142 
143 // This returns a pointer to the entry so the caller can update it directly without needing
144 // to do another field-scan on subsequent lrec_put etc. This is a performance optimization;
145 // it also allows mlr nest --explode to do explode-in-place rather than explode-at-end.
146 char* lrec_get_ext(lrec_t* prec, char* key, lrece_t** ppentry);
147 
148 void  lrec_remove(lrec_t* prec, char* key);
149 void  lrec_remove_by_position(lrec_t* prec, int position); // 1-up not 0-up
150 void  lrec_rename(lrec_t* prec, char* old_key, char* new_key, int new_needs_freeing);
151 void  lrec_rename_at_position(lrec_t* prec, int position, char* new_key, int new_needs_freeing); // 1-up not 0-up
152 void  lrec_move_to_head(lrec_t* prec, char* key);
153 void  lrec_move_to_tail(lrec_t* prec, char* key);
154 // Renames the first n fields where n is the length of pnames.
155 // The hash-set argument is for efficient dedupe.
156 // Assumes as a precondition that pnames_as_list has no duplicates.
157 // If the new labels include any field names existing later on in the record, those are unset.
158 // For example, input record "a=1,b=2,c=3,d=4,e=5" with labels "d,x,f" results in output record "d=1,x=2,f=3,e=5".
159 void  lrec_label(lrec_t* prec, slls_t* pnames_as_list, hss_t* pnames_as_set);
160 
161 void lrece_update_value(lrece_t* pe, char* new_value, int new_needs_freeing);
162 
163 // For lrec-internal use:
164 void lrec_unlink(lrec_t* prec, lrece_t* pe);
165 // May be used for removing fields from a record while iterating over it:
166 void lrec_unlink_and_free(lrec_t* prec, lrece_t* pe);
167 
168 void lrec_print(lrec_t* prec);
169 void lrec_dump(lrec_t* prec);
170 void lrec_dump_fp(lrec_t* prec, FILE* fp);
171 void lrec_dump_titled(char* msg, lrec_t* prec);
172 void lrec_pointer_dump(lrec_t* prec);
173 // The caller should free the return value
174 char* lrec_sprint(lrec_t* prec, char* ors, char* ofs, char* ops);
175 
176 // NIDX data are keyed by one-up field index which is not explicitly contained
177 // in the file, e.g. line "a b c" splits to an lrec with "{"1" => "a", "2" =>
178 // "b", "3" => "c"}. This function creates the keys, avoiding redundant memory
179 // allocation for most-used keys such as "1", "2", ... up to 100 or so. In case
180 // of large idx, free_flags & FREE_ENTRY_KEY will indicate that the key
181 // was dynamically allocated.
182 char* low_int_to_string(int idx, char* pfree_flags);
183 
184 // For unit-test.
185 lrec_t* lrec_literal_1(char* k1, char* v1);
186 lrec_t* lrec_literal_2(char* k1, char* v1, char* k2, char* v2);
187 lrec_t* lrec_literal_3(char* k1, char* v1, char* k2, char* v2, char* k3, char* v3);
188 lrec_t* lrec_literal_4(char* k1, char* v1, char* k2, char* v2, char* k3, char* v3, char* k4, char* v4);
189 
190 #endif // LREC_H
191