1 // ================================================================ 2 // This is a hashless implementation of insertion-ordered key-value pairs for 3 // Miller's fundamental record data structure. It implements the same 4 // interface as the hashed version (see lhmss.h). 5 // 6 // Design: 7 // 8 // * It keeps a doubly-linked list of key-value pairs. 9 // * No hash functions are computed when the map is written to or read from. 10 // * Gets are implemented by sequential scan through the list: given a key, 11 // the key-value pairs are scanned through until a match is (or is not) found. 12 // * Performance improvement of 10-15% percent over lhmss is found (for test data). 13 // 14 // Motivation: 15 // 16 // * The use case for records in Miller is that *all* fields are read from 17 // strings & written to strings (split/join), while only *some* fields are 18 // operated on. 19 // 20 // * Meanwhile there are few repeated accesses to a given record: the 21 // access-to-construct ratio is quite low for Miller data records. Miller 22 // instantiates thousands, millions, billions of records (depending on the 23 // input data) but accesses each record only once per mapping operation. 24 // (This is in contrast to accumulator hashmaps which are repeatedly accessed 25 // during a stats run.) 26 // 27 // * The hashed impl computes hashsums for *all* fields whether operated on or not, 28 // for the benefit of the *few* fields looked up during the mapping operation. 29 // 30 // * The hashless impl only keeps string pointers. Lookups are done at runtime 31 // doing prefix search on the key names. Assuming field names are distinct, 32 // this is just a few char-ptr accesses which (in experiments) turn out to 33 // offer about a 10-15% performance improvement. 34 // 35 // * Added benefit: the field-rename operation (preserving field order) becomes 36 // trivial. 37 // 38 // Notes: 39 // * null key is not supported. 40 // * null value is supported. 41 // ================================================================ 42 43 #ifndef LREC_H 44 #define LREC_H 45 46 #include "lib/free_flags.h" 47 #include "containers/sllv.h" 48 #include "containers/slls.h" 49 #include "containers/hss.h" 50 #include "containers/header_keeper.h" 51 52 #define FIELD_QUOTED_ON_INPUT 0x02 53 54 struct _lrec_t; // forward reference 55 typedef struct _lrec_t lrec_t; 56 57 typedef void lrec_free_func_t(lrec_t* prec); 58 59 // ---------------------------------------------------------------- 60 typedef struct _lrece_t { 61 char* key; 62 char* value; 63 // These indicate whether the key/value should be freed on lrec_free(). 64 // Affirmative example: key/value is strdup of something. 65 // Negative example: key/value are pointers into a line the memory 66 // management of which is separately managed. 67 // Another negative example: key/value is a string literal, e.g. "". 68 char free_flags; 69 char quote_flags; 70 71 struct _lrece_t *pprev; 72 struct _lrece_t *pnext; 73 } lrece_t; 74 75 struct _lrec_t { 76 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 77 int field_count; 78 lrece_t* phead; 79 lrece_t* ptail; 80 81 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 82 // See comments above free_flags. Used to track a mallocked pointer to be 83 // freed at lrec_free(). 84 85 // E.g. for NIDX, DKVP, and CSV formats (header handled separately in the 86 // latter case). 87 char* psingle_line; 88 89 // For XTAB format. 90 slls_t* pxtab_lines; 91 92 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 93 // Format-dependent virtual-function pointer: 94 lrec_free_func_t* pfree_backing_func; 95 }; 96 97 // ---------------------------------------------------------------- 98 lrec_t* lrec_unbacked_alloc(); 99 lrec_t* lrec_dkvp_alloc(char* line); 100 lrec_t* lrec_nidx_alloc(char* line); 101 lrec_t* lrec_csvlite_alloc(char* data_line); 102 lrec_t* lrec_csv_alloc(char* data_line); 103 lrec_t* lrec_xtab_alloc(slls_t* pxtab_lines); 104 105 void lrec_clear(lrec_t* prec); 106 void lrec_free(lrec_t* prec); 107 lrec_t* lrec_copy(lrec_t* pinrec); 108 109 // The only difference between lrec_put and lrec_prepend is that the latter 110 // adds to the end of the record, while the former adds to the beginning. 111 // 112 // For both, the key/value respectively will be freed by lrec_free if the 113 // corresponding bits are set in the free_flags. 114 // 115 // * If a string literal or other non-allocated pointer (e.g. mmapped memory 116 // from a file reader) is passed in, the free flag should not be set. 117 // 118 // * If dynamically allocated pointers are passed in, then either: 119 // 120 // o The respective free_flag(s) should be set and the caller should be sure 121 // not to also free (else, there will be heap corruption due to 122 // double-free), or 123 // 124 // o The respective free_flag(s) should not be set and the caller should 125 // free the memory (else, there will be a memory leak). 126 void lrec_put(lrec_t* prec, char* key, char* value, char free_flags); 127 void lrec_put_ext(lrec_t* prec, char* key, char* value, char free_flags, char quote_flags); 128 // Like lrec_put: if key is present, modify value. But if not, add new field at start of record, not at end. 129 void lrec_prepend(lrec_t* prec, char* key, char* value, char free_flags); 130 // Like lrec_put: if key is present, modify value. But if not, add new field after specified entry, not at end. 131 // Returns a pointer to the added/modified node. 132 lrece_t* lrec_put_after(lrec_t* prec, lrece_t* pd, char* key, char* value, char free_flags); 133 134 char* lrec_get(lrec_t* prec, char* key); 135 lrece_t* lrec_get_pair_by_position(lrec_t* prec, int position); // 1-up not 0-up 136 char* lrec_get_key_by_position(lrec_t* prec, int position); // 1-up not 0-up 137 char* lrec_get_value_by_position(lrec_t* prec, int position); // 1-up not 0-up 138 139 // This returns a pointer to the lrec's free-flags so that the caller can do ownership-transfer 140 // of about-to-be-removed key-value pairs. 141 char* lrec_get_pff(lrec_t* prec, char* key, char** ppfree_flags); 142 143 // This returns a pointer to the entry so the caller can update it directly without needing 144 // to do another field-scan on subsequent lrec_put etc. This is a performance optimization; 145 // it also allows mlr nest --explode to do explode-in-place rather than explode-at-end. 146 char* lrec_get_ext(lrec_t* prec, char* key, lrece_t** ppentry); 147 148 void lrec_remove(lrec_t* prec, char* key); 149 void lrec_remove_by_position(lrec_t* prec, int position); // 1-up not 0-up 150 void lrec_rename(lrec_t* prec, char* old_key, char* new_key, int new_needs_freeing); 151 void lrec_rename_at_position(lrec_t* prec, int position, char* new_key, int new_needs_freeing); // 1-up not 0-up 152 void lrec_move_to_head(lrec_t* prec, char* key); 153 void lrec_move_to_tail(lrec_t* prec, char* key); 154 // Renames the first n fields where n is the length of pnames. 155 // The hash-set argument is for efficient dedupe. 156 // Assumes as a precondition that pnames_as_list has no duplicates. 157 // If the new labels include any field names existing later on in the record, those are unset. 158 // For example, input record "a=1,b=2,c=3,d=4,e=5" with labels "d,x,f" results in output record "d=1,x=2,f=3,e=5". 159 void lrec_label(lrec_t* prec, slls_t* pnames_as_list, hss_t* pnames_as_set); 160 161 void lrece_update_value(lrece_t* pe, char* new_value, int new_needs_freeing); 162 163 // For lrec-internal use: 164 void lrec_unlink(lrec_t* prec, lrece_t* pe); 165 // May be used for removing fields from a record while iterating over it: 166 void lrec_unlink_and_free(lrec_t* prec, lrece_t* pe); 167 168 void lrec_print(lrec_t* prec); 169 void lrec_dump(lrec_t* prec); 170 void lrec_dump_fp(lrec_t* prec, FILE* fp); 171 void lrec_dump_titled(char* msg, lrec_t* prec); 172 void lrec_pointer_dump(lrec_t* prec); 173 // The caller should free the return value 174 char* lrec_sprint(lrec_t* prec, char* ors, char* ofs, char* ops); 175 176 // NIDX data are keyed by one-up field index which is not explicitly contained 177 // in the file, e.g. line "a b c" splits to an lrec with "{"1" => "a", "2" => 178 // "b", "3" => "c"}. This function creates the keys, avoiding redundant memory 179 // allocation for most-used keys such as "1", "2", ... up to 100 or so. In case 180 // of large idx, free_flags & FREE_ENTRY_KEY will indicate that the key 181 // was dynamically allocated. 182 char* low_int_to_string(int idx, char* pfree_flags); 183 184 // For unit-test. 185 lrec_t* lrec_literal_1(char* k1, char* v1); 186 lrec_t* lrec_literal_2(char* k1, char* v1, char* k2, char* v2); 187 lrec_t* lrec_literal_3(char* k1, char* v1, char* k2, char* v2, char* k3, char* v3); 188 lrec_t* lrec_literal_4(char* k1, char* v1, char* k2, char* v2, char* k3, char* v3, char* k4, char* v4); 189 190 #endif // LREC_H 191