1 #ifndef __PLINK2_TEXT_H__
2 #define __PLINK2_TEXT_H__
3
4 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
5 // Christopher Chang.
6 //
7 // This library is free software: you can redistribute it and/or modify it
8 // under the terms of the GNU Lesser General Public License as published by the
9 // Free Software Foundation, either version 3 of the License, or (at your
10 // option) any later version.
11 //
12 // This library is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
15 // for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public License
18 // along with this library. If not, see <http://www.gnu.org/licenses/>.
19
20
21 // Scanning one line at a time from a text file is one of the most common
22 // workflows in all of computing.
23 //
24 // Usually, text files are small; the obvious reason to choose text over binary
25 // is human-readability, after all, and humans can't read multi-gigabyte files
26 // in a reasonable amount of time. As a consequence, the most commonly used C
27 // and C++ text-processing library functions sacrifice a substantial amount of
28 // performance in favor of ease-of-use.
29 //
30 // However, plink2 is frequently asked to load a multi-gigabyte text file and
31 // then do something very simple with it. Often, the file is in the operating
32 // system's page cache, since the user or script is doing multiple things with
33 // the file and they're split across multiple invocations of plink2 and other
34 // programs. In this setting, the usual "I/O cost > processing cost, it isn't
35 // worth worrying much about the latter" assumption is very, very wrong, and it
36 // is worth going to great lengths to keep baseline text-processing cost to a
37 // minimum.
38 //
39 // In addition, multi-gigabyte text files are practically guaranteed to
40 // compress well, and gzipped and bgzipped text files are widely used in
41 // bioinformatics practice. Ordinarily, when sequentially processing a text
42 // file, there's little to gain from spawning a separate thread to issue
43 // file-read requests, since a modern operating system will recognize the
44 // access pattern and read-ahead from the disk on its own. However, the
45 // operating system can't *decompress-ahead* for you; and when decompression
46 // has comparable latency to processing, decompress-ahead reduces runtime by up
47 // to 50%.
48 //
49 // Thus, this library provides a text reader that
50 // 1. allows the caller to treat gzipped and Zstd-compressed text files as if
51 // they were uncompressed. This is functionally identical to Zstd's
52 // zlibWrapper, which was used for most of plink2's alpha testing period,
53 // but I've decided to phase out zlibWrapper thanks to compilation headaches
54 // and its static-linking requirement.
55 // 2. decompresses-ahead, potentially with multiple threads.
56 // a. For now, multithreaded decompression can only kick in for bgzipped
57 // files. However, if a clear use case exists, it should be possible to
58 // build a multithreaded Zstd decoder that isn't restricted to a Zstd
59 // sub-format; see
60 // https://github.com/facebook/zstd/issues/1702#issuecomment-515124700
61 // If you have such a use case, I recommend responding to Yann Collet in
62 // that GitHub issue.
63 // b. Tabix-based seek support was considered and rejected, since the tabix
64 // index only stores CHROM/POS, while plink2 also needs record numbers in
65 // its most critical use case (.pvar loading). A suitable index format
66 // may be adopted or developed later, and if/when that's supported, there
67 // will probably also be a way for callers which don't need record
68 // numbers to exploit tabix indexes. (In that event, plink2's Zstd
69 // compressor will be modified to support seekable-Zstd output and index
70 // generation.)
71 // 3. has line-reader functions that don't force the user to provide their own
72 // buffer to put the line in. Instead, they just return a (possibly const)
73 // pointer to the beginning of the line and expose a pointer to the end of
74 // the line. This simultaneously saves memory and reduces overhead.
75 // a. Since this reuses a single buffer, the string-view is invalidated when
76 // the next line is read.
77 // b. When the last line in the file is not terminated by '\n', this text
78 // reader automatically appends '\n'. Thus, while C library functions
79 // that assume null-termination can't be used here (unless you're using a
80 // wrapper function that replaces the terminating '\n' with '\0' after
81 // the line-read function call, which is a totally valid thing to do),
82 // plink2_string.h functions which either iterate to any end-of-line
83 // character (ASCII code < 32 and unequal to 9=tab) or explicitly assume
84 // '\n'-termination can be used, and these use essentially the same
85 // optimizations as modern memchr implementations.
86 // (A C++17 interface that returns std::string_view objects was considered,
87 // but then rejected since std::string_view's design makes it much better
88 // suited to be a function input-parameter type than a return type. It is
89 // easy enough to efficiently construct a string_view using the current
90 // interface when it's time to call a function that accepts one. The rest
91 // of the time, there's no meaningful advantage over plain C pointers.)
92 // 4. can be used with either a single fixed-size memory buffer (this plays
93 // well with plink2's memory allocation strategy), or dynamic resizing with
94 // malloc()/realloc() calls.
95 //
96 // Two other readers are provided:
97 // - A decompress-ahead token reader. This also shards the tokens, for the
98 // common use case where the tokens don't need to be parsed in order (e.g.
99 // --extract/--exclude).
100 // - A simpler single-threaded (no decompress-ahead) reader.
101
102 #ifdef STATIC_ZLIB
103 # include "../../zlib-1.2.11/zlib.h"
104 #else
105 # include <zlib.h>
106 # if !defined(ZLIB_VERNUM) || (ZLIB_VERNUM < 0x1240)
107 # error "plink2_text requires zlib 1.2.4 or later."
108 # endif
109 #endif
110
111 #include "plink2_bgzf.h"
112 #include "plink2_zstfile.h"
113
114 #ifdef __cplusplus
115 namespace plink2 {
116 #endif
117
// Reports the FileCompressionType of fname through *ftype_ptr.
// NOTE(review): presumably this inspects the file's leading bytes, and can
// return open-fail/read-fail -- confirm against the implementation.
PglErr GetFileType(const char* fname, FileCompressionType* ftype_ptr);
119
// State common to textFILE and TextStream: the consumer's cursor into the
// line buffer, the most recent status/error message, the underlying FILE*,
// and the buffer itself.
typedef struct TextFileBaseStruct {
  // Not noncopyable, since this is copied wholesale by the TextStream
  // move-constructor.

  // Positioned first so the compiler doesn't need to add an offset to access
  // it.
  // The inline line readers return lines starting here, and then advance this
  // past the line's terminating '\n'.
  char* consume_iter;

  char* consume_stop;  // should always point after the last loaded \n

  // Returned by TextFileError() / TextStreamError().
  const char* errmsg;
  // Current status code.  kPglRetEof is stored here at end-of-file; the
  // ...Errcode() accessors translate that to kPglRetSuccess.
  PglErr reterr;

  FileCompressionType file_type;
  // Non-null iff TextFileIsOpen() / TextIsOpen() report the reader as open.
  FILE* ff;  // could use e.g. htslib for some network support later
  uint32_t dst_owned_by_consumer;
  uint32_t enforced_max_line_blen;
  // Note that when dst_owned_by_consumer isn't true, the reader thread can
  // alter the values below.  However, this can only happen when the consumer
  // thread is blocked waiting for the next line, or before it's attempted to
  // read the first line.  So there's no advantage to placing these in a
  // different cacheline from consume_iter (which is constantly altered by the
  // consumer).
  char* dst;
  // Returned by TextFileLinebufLen().
  uint32_t dst_len;
  uint32_t dst_capacity;
} TextFileBase;
147
// Decompression state for generic (non-bgzf) gzip input, built around a zlib
// z_stream.
typedef struct GzRawDecompressStreamStruct {
  // Copied by TextStream move-constructor.
  // NOTE(review): presumably the buffer holding still-compressed bytes that
  // feed ds -- confirm against the implementation.
  unsigned char* in;
  z_stream ds;
  // NOTE(review): presumably nonzero once ds has been set up and therefore
  // needs teardown -- confirm.
  uint32_t ds_initialized;
} GzRawDecompressStream;
154
155 // BgzfRawDecompressStream declared in plink2_bgzf.h.
156
// Decompression state for Zstd-compressed input: a Zstd streaming
// decompression context plus its input-buffer descriptor.
typedef struct ZstRawDecompressStreamStruct {
  // Copied by TextStream move-constructor.
  ZSTD_DStream* ds;
  ZSTD_inBuffer ib;
} ZstRawDecompressStream;
162
// Per-format decompression state for the single-threaded reader (textFILE).
// NOTE(review): presumably the active member is selected by
// TextFileBase.file_type -- confirm.
typedef union {
  GzRawDecompressStream gz;
  // Even in the single-threaded case, it's worth distinguishing bgzf from
  // generic .gz, since with bgzf we can use libdeflate.
  BgzfRawDecompressStream bgzf;
  ZstRawDecompressStream zst;
} RawDecompressStream;
170
// Actual contents of a textFILE: the shared reader state followed by the
// format-specific decompression state.
typedef struct textFILEMainStruct {
  TextFileBase base;
  RawDecompressStream rds;
} textFILEMain;
175
// Single-threaded (no decompress-ahead) text reader handle.  In C++ builds
// the payload is private, and is reached through the GET_PRIVATE_m()
// accessors that the GET_PRIVATE() macro is used with throughout this header.
typedef struct textFILEStruct {
#ifdef __cplusplus
  textFILEMain& GET_PRIVATE_m() { return m; }
  textFILEMain const& GET_PRIVATE_m() const { return m; }
 private:
#endif
  textFILEMain m;
} textFILE;
184
// NOTE(review): presumably puts *txf_ptr into a known "not open" state so
// that TextFileIsOpen() and CleanupTextFile() are safe to call before any
// open attempt -- confirm against the implementation.
void PreinitTextFile(textFILE* txf_ptr);
186
// kDecompressChunkSize = 1 MiB currently declared in plink2_bgzf.h, may move
// somewhere closer to the base later.
// With that value, the three constants below are 11, 12 and 8 MiB
// respectively.
CONSTI32(kTextStreamBlenFast, 11 * kDecompressChunkSize);
CONSTI32(kTokenStreamBlen, 12 * kDecompressChunkSize);
CONSTI32(kMaxTokenBlen, 8 * kDecompressChunkSize);
// Compile-time sanity checks on the token-reader sizes: a maximal token plus
// one decompression chunk must fit in a kTokenStreamBlen-byte buffer.
static_assert(kMaxTokenBlen >= kDecompressChunkSize, "kMaxTokenBlen too small.");
static_assert(kMaxTokenBlen + kDecompressChunkSize <= kTokenStreamBlen, "kMaxTokenBlen + kDecompressChunkSize can't be larger than kTokenStreamBlen.");

// max_line_blen and enforced_max_line_blen lower bound.
CONSTI32(kDecompressMinBlen, kDecompressChunkSize);

// Minimum line length plus one decompression chunk, i.e.
// 2 * kDecompressChunkSize.
CONSTI32(kDecompressMinCapacity, kDecompressMinBlen + kDecompressChunkSize);
199
200 // * Can return nomem, open-fail, or read-fail.
201 // * If dst == nullptr, this mallocs a buffer of size 2 * kDecompressChunkSize,
202 // and it'll be realloced as necessary and freed by CleanupTextFile().
203 // The original value of dst_capacity doesn't matter in this case.
204 // Otherwise, the buffer is owned by the caller, assumed to have size >=
205 // dst_capacity, and never grown.
206 // * enforced_max_line_blen must be >= dst_capacity - kDecompressChunkSize.
207 // It's the point at which long-line errors instead of out-of-memory errors
208 // are reported. It isn't permitted to be less than 1 MiB.
209 PglErr TextFileOpenEx(const char* fname, uint32_t enforced_max_line_blen, uint32_t dst_capacity, char* dst, textFILE* txf_ptr);
210
TextFileOpen(const char * fname,textFILE * txf_ptr)211 HEADER_INLINE PglErr TextFileOpen(const char* fname, textFILE* txf_ptr) {
212 return TextFileOpenEx(fname, kMaxLongLine, 0, nullptr, txf_ptr);
213 }
214
// Short error-message strings, defined outside this header.
// NOTE(review): judging by the names, these are presumably what errmsg is
// set to for over-long lines and for empty lines that aren't at EOF --
// confirm.
extern const char kShortErrLongLine[];
extern const char kShortErrInteriorEmptyLine[];

// Invoked by the inline textFILE line readers below once consume_iter has
// reached consume_stop.  Any nonzero return (which includes the ordinary
// end-of-file case) is passed back to the caller unchanged; on success the
// readers continue from the updated consume_iter.
PglErr TextFileAdvance(textFILE* txf_ptr);
219
TextFileNextLine(textFILE * txf_ptr,char ** line_startp)220 HEADER_INLINE PglErr TextFileNextLine(textFILE* txf_ptr, char** line_startp) {
221 TextFileBase* basep = &GET_PRIVATE(*txf_ptr, m).base;
222 if (basep->consume_iter == basep->consume_stop) {
223 PglErr reterr = TextFileAdvance(txf_ptr);
224 // not unlikely() due to eof
225 if (reterr) {
226 return reterr;
227 }
228 }
229 *line_startp = basep->consume_iter;
230 basep->consume_iter = AdvPastDelim(*line_startp, '\n');
231 return kPglRetSuccess;
232 }
233
TextFileLineEnd(textFILE * txf_ptr)234 HEADER_INLINE char* TextFileLineEnd(textFILE* txf_ptr) {
235 return GET_PRIVATE(*txf_ptr, m).base.consume_iter;
236 }
237
TextFileNextLineLstrip(textFILE * txf_ptr,char ** line_startp)238 HEADER_INLINE PglErr TextFileNextLineLstrip(textFILE* txf_ptr, char** line_startp) {
239 TextFileBase* basep = &GET_PRIVATE(*txf_ptr, m).base;
240 if (basep->consume_iter == basep->consume_stop) {
241 PglErr reterr = TextFileAdvance(txf_ptr);
242 if (reterr) {
243 return reterr;
244 }
245 }
246 *line_startp = FirstNonTspace(basep->consume_iter);
247 basep->consume_iter = AdvPastDelim(*line_startp, '\n');
248 return kPglRetSuccess;
249 }
250
// Called by TextFileGet() after it has consumed an empty line.
// NOTE(review): presumably mirrors TextOnlyEmptyLinesLeft() for TextStream
// (kPglRetEof when only empty lines remain, kPglRetMalformedInput with errmsg
// set otherwise) -- confirm against the implementation.
PglErr TextFileOnlyEmptyLinesLeft(textFILE* txf_ptr);
252
253 // API fix (30-31 Oct 2019): While we want to tolerate *trailing* empty lines
254 // (as well as lack of a final newline), since those arise from manual
255 // text-editing every once in a while, there's usually no good reason to
256 // tolerate *interior* empty lines like we were previously doing. So the
257 // generic empty-line-skipping interface has been replaced with one that
258 // reports an "Unexpected empty line" error with a kPglRetMalformedInput error
259 // code unless the empty line(s) are at EOF.
TextFileGet(textFILE * txf_ptr)260 HEADER_INLINE char* TextFileGet(textFILE* txf_ptr) {
261 TextFileBase* basep = &GET_PRIVATE(*txf_ptr, m).base;
262 if (basep->consume_iter == basep->consume_stop) {
263 // not unlikely() due to eof
264 if (TextFileAdvance(txf_ptr)) {
265 return nullptr;
266 }
267 }
268 char* line_start = FirstNonTspace(basep->consume_iter);
269 basep->consume_iter = AdvPastDelim(line_start, '\n');
270 if (!IsEolnKns(*line_start)) {
271 return line_start;
272 }
273 TextFileOnlyEmptyLinesLeft(txf_ptr);
274 return nullptr;
275 }
276
// NOTE(review): presumably repositions the reader at the start of the file
// so it can be scanned again -- confirm against the implementation.  Returns
// nothing, so any failure must be surfaced some other way.
void TextFileRewind(textFILE* txf_ptr);
278
TextFileIsOpen(const textFILE * txf_ptr)279 HEADER_INLINE int32_t TextFileIsOpen(const textFILE* txf_ptr) {
280 return (GET_PRIVATE(*txf_ptr, m).base.ff != nullptr);
281 }
282
TextFileEof(const textFILE * txf_ptr)283 HEADER_INLINE int32_t TextFileEof(const textFILE* txf_ptr) {
284 return (GET_PRIVATE(*txf_ptr, m).base.reterr == kPglRetEof);
285 }
286
TextFileError(const textFILE * txf_ptr)287 HEADER_INLINE const char* TextFileError(const textFILE* txf_ptr) {
288 return GET_PRIVATE(*txf_ptr, m).base.errmsg;
289 }
290
TextFileErrcode(const textFILE * txf_ptr)291 HEADER_INLINE PglErr TextFileErrcode(const textFILE* txf_ptr) {
292 const PglErr reterr = GET_PRIVATE(*txf_ptr, m).base.reterr;
293 if (reterr == kPglRetEof) {
294 return kPglRetSuccess;
295 }
296 return reterr;
297 }
298
299 // Note that this does not assign to *reterrp at all in the usual eof case.
TextFileErrcode2(const textFILE * txf_ptr,PglErr * reterrp)300 HEADER_INLINE BoolErr TextFileErrcode2(const textFILE* txf_ptr, PglErr* reterrp) {
301 const PglErr reterr = GET_PRIVATE(*txf_ptr, m).base.reterr;
302 if (reterr == kPglRetEof) {
303 return 0;
304 }
305 *reterrp = reterr;
306 return 1;
307 }
308
309 // Does not convert kPglRetEof -> kPglRetSuccess.
TextFileRawErrcode(const textFILE * txf_ptr)310 HEADER_INLINE PglErr TextFileRawErrcode(const textFILE* txf_ptr) {
311 return GET_PRIVATE(*txf_ptr, m).base.reterr;
312 }
313
314 // Relevant when about to move-construct a TextStream; see plink2_glm.
TextFileLinebufLen(const textFILE * txf_ptr)315 HEADER_INLINE uint32_t TextFileLinebufLen(const textFILE* txf_ptr) {
316 return GET_PRIVATE(*txf_ptr, m).base.dst_len;
317 }
318
319 // Ok to pass reterrp == nullptr.
320 // Returns nonzero iff file-close fails, and either reterrp == nullptr or
321 // *reterrp == kPglRetSuccess; this is intended to be followed by logging of
322 // strerror(errno). In the latter case, *reterrp is set to kPglRetReadFail.
323 BoolErr CleanupTextFile(textFILE* txf_ptr, PglErr* reterrp);
324
325
// consumer -> reader message
// could add a "close current file and open another one" case
// Stored in TextStreamSync.interrupt.
// NOTE(review): presumably Retarget accompanies a TextRetarget() request and
// Shutdown tells the reader thread to exit -- confirm.
ENUM_U31_DEF_START()
  kTxsInterruptNone,
  kTxsInterruptRetarget,
  kTxsInterruptShutdown
ENUM_U31_DEF_END(TxsInterrupt);
333
// Synchronization state for a TextStream: the platform-specific locking
// primitives, the reader thread handle, and the values the consumer and
// reader exchange.
typedef struct TextStreamSyncStruct {
  // Mutex, shared state, and everything guarded by the mutex.  Allocated to
  // different cacheline(s) than consume_stop.
#ifdef _WIN32
  CRITICAL_SECTION critical_section;
  HANDLE reader_progress_event;
  HANDLE consumer_progress_event;
#else
  pthread_mutex_t sync_mutex;
  pthread_cond_t reader_progress_condvar;
  pthread_cond_t consumer_progress_condvar;
  // bugfix (7 Mar 2018): need to avoid waiting on consumer_progress_condvar if
  // this is set.  (could also check an appropriate predicate)
  uint32_t consumer_progress_state;

  uint32_t sync_init_state;
#endif

  // NOTE(review): declared outside the #ifdef above, so pthread_t must also
  // be available in _WIN32 builds (presumably through a project-level
  // definition) -- confirm.
  pthread_t read_thread;

  char* consume_tail;
  char* cur_circular_end;
  char* available_end;

  // Separate from the TextFileBase instances of these values, since we don't
  // want to force the user to worry about these values changing at any moment.
  // Instead, the TextFileBase instances are only updated during TextAdvance()
  // calls and the like.
  const char* errmsg;
  PglErr reterr;  // note that this is set to kPglRetEof once we reach eof

  uint32_t dst_reallocated;
  // consumer -> reader message; see TxsInterrupt.
  TxsInterrupt interrupt;
  const char* new_fname;
} TextStreamSync;
369
// Per-format decompression state for TextStream.  Differs from
// RawDecompressStream only in the bgzf member, which is the multithreaded
// BgzfRawMtDecompressStream here.
typedef union {
  GzRawDecompressStream gz;
  BgzfRawMtDecompressStream bgzf;
  ZstRawDecompressStream zst;
} RawMtDecompressStream;
375
// Actual contents of a TextStream: the shared reader state, the
// format-specific decompression state, the decompression thread count, and a
// pointer to the separately-stored synchronization state.
typedef struct TextStreamMainStruct {
  TextFileBase base;
  RawMtDecompressStream rds;
  uint32_t decompress_thread_ct;
  TextStreamSync* syncp;
} TextStreamMain;
382
// Decompress-ahead text reader handle.  In C++ builds the payload is private,
// and is reached through the GET_PRIVATE_m() accessors that the GET_PRIVATE()
// macro is used with throughout this header.
typedef struct TextStreamStruct {
#ifdef __cplusplus
  TextStreamMain& GET_PRIVATE_m() { return m; }
  TextStreamMain const& GET_PRIVATE_m() const { return m; }
 private:
#endif
  TextStreamMain m;
} TextStream;
391
// NOTE(review): presumably puts *txs_ptr into a known "not open" state so
// that TextIsOpen() and CleanupTextStream() are safe to call before any open
// attempt -- confirm against the implementation.
void PreinitTextStream(TextStream* txs_ptr);
393
394 // * Can return nomem, open-fail, read-fail, or thread-create-fail.
395 // * Exactly one of fname and txf_ptr must be nullptr. If txf_ptr is null,
396 // fname is opened. Otherwise, the returned stream is "move-constructed"
397 // from txf_ptr.
398 // When not move-constructing, enforced_max_line_blen, dst_capacity, and dst
399 // are interpreted the same way as TextFileOpenEx().
400 // When move-constructing, enforced_max_line_blen and dst_capacity may be
401 // smaller than what the textFILE was opened with.
402 PglErr TextStreamOpenEx(const char* fname, uint32_t enforced_max_line_blen, uint32_t dst_capacity, uint32_t decompress_thread_ct, textFILE* txf_ptr, char* dst, TextStream* txs_ptr);
403
TextStreamOpen(const char * fname,TextStream * txs_ptr)404 HEADER_INLINE PglErr TextStreamOpen(const char* fname, TextStream* txs_ptr) {
405 return TextStreamOpenEx(fname, kMaxLongLine, 0, NumCpu(nullptr), nullptr, nullptr, txs_ptr);
406 }
407
408 // We drop 'Stream' from the function names outside of open/close, to emphasize
409 // that this is the default choice.
410 // (Originally this was named 'TextRstream', but then I realized that the write
411 // case doesn't really care about whether the input is text or binary.)
TextLineEnd(TextStream * txs_ptr)412 HEADER_INLINE char* TextLineEnd(TextStream* txs_ptr) {
413 return GET_PRIVATE(*txs_ptr, m).base.consume_iter;
414 }
415
TextIsOpen(const TextStream * txs_ptr)416 HEADER_INLINE int32_t TextIsOpen(const TextStream* txs_ptr) {
417 return (GET_PRIVATE(*txs_ptr, m).base.ff != nullptr);
418 }
419
TextEof(const TextStream * txs_ptr)420 HEADER_INLINE int32_t TextEof(const TextStream* txs_ptr) {
421 return (GET_PRIVATE(*txs_ptr, m).base.reterr == kPglRetEof);
422 }
423
// NOTE(review): presumably reports how many decompression threads the stream
// is actually using, which may differ from the count requested at open time
// -- confirm against the implementation.
uint32_t TextDecompressThreadCt(const TextStream* txs_ptr);

// Invoked by the inline TextStream line readers below once the caller's
// position has reached consume_stop.  Any nonzero return (which includes the
// ordinary end-of-file case) is passed back to the caller unchanged; on
// success the readers continue from the updated consume_iter.
PglErr TextAdvance(TextStream* txs_ptr);
427
TextNextLine(TextStream * txs_ptr,char ** line_startp)428 HEADER_INLINE PglErr TextNextLine(TextStream* txs_ptr, char** line_startp) {
429 TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
430 if (basep->consume_iter == basep->consume_stop) {
431 PglErr reterr = TextAdvance(txs_ptr);
432 // not unlikely() due to eof
433 if (reterr) {
434 return reterr;
435 }
436 }
437 *line_startp = basep->consume_iter;
438 basep->consume_iter = AdvPastDelim(basep->consume_iter, '\n');
439 return kPglRetSuccess;
440 }
441
TextNextLineK(TextStream * txs_ptr,const char ** line_startp)442 HEADER_INLINE PglErr TextNextLineK(TextStream* txs_ptr, const char** line_startp) {
443 return TextNextLine(txs_ptr, K_CAST(char**, line_startp));
444 }
445
446 // plink2 functions strip leading whitespace by default (unless there's a clear
447 // reason not to, e.g. it isn't allowed in VCF files), since plink 1.x inserts
448 // it so often.
TextNextLineLstrip(TextStream * txs_ptr,char ** line_startp)449 HEADER_INLINE PglErr TextNextLineLstrip(TextStream* txs_ptr, char** line_startp) {
450 TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
451 if (basep->consume_iter == basep->consume_stop) {
452 PglErr reterr = TextAdvance(txs_ptr);
453 // not unlikely() due to eof
454 if (reterr) {
455 return reterr;
456 }
457 }
458 *line_startp = FirstNonTspace(basep->consume_iter);
459 basep->consume_iter = AdvPastDelim(*line_startp, '\n');
460 return kPglRetSuccess;
461 }
462
TextNextLineLstripK(TextStream * txs_ptr,const char ** line_startp)463 HEADER_INLINE PglErr TextNextLineLstripK(TextStream* txs_ptr, const char** line_startp) {
464 return TextNextLineLstrip(txs_ptr, K_CAST(char**, line_startp));
465 }
466
467 // returns kPglRetEof if true, otherwise kPglRetMalformedInput and sets
468 // errmsg.
469 PglErr TextOnlyEmptyLinesLeft(TextStream* txs_ptr);
470
471 // This was previously named TextNextLineLstripNoempty, and returned a PglErr,
472 // but some compilers were (i) not inlining it and (ii) spamming spurious
473 // uninitialized-variable warnings as a side effect. And it's plink2's most
474 // common use case.
475 // So we'll use a different interface here (and a shorter name). User is
476 // expected to call TextStream[Raw]Errcode when nullptr is returned.
TextGet(TextStream * txs_ptr)477 HEADER_INLINE char* TextGet(TextStream* txs_ptr) {
478 TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
479 if (basep->consume_iter == basep->consume_stop) {
480 // not unlikely() due to eof
481 if (TextAdvance(txs_ptr)) {
482 return nullptr;
483 }
484 }
485 char* line_start = FirstNonTspace(basep->consume_iter);
486 basep->consume_iter = AdvPastDelim(line_start, '\n');
487 if (!IsEolnKns(*line_start)) {
488 return line_start;
489 }
490 TextOnlyEmptyLinesLeft(txs_ptr);
491 return nullptr;
492 }
493
494 PglErr TextSkipNz(uintptr_t skip_ct, TextStream* txs_ptr);
495
TextSkip(uintptr_t skip_ct,TextStream * txs_ptr)496 HEADER_INLINE PglErr TextSkip(uintptr_t skip_ct, TextStream* txs_ptr) {
497 if (skip_ct == 0) {
498 return kPglRetSuccess;
499 }
500 return TextSkipNz(skip_ct, txs_ptr);
501 }
502
503
504 // 'Unsafe' functions require line_iter to already point to the start of the
505 // line, and don't update txsp->base.consume_iter; they primarily wrap the
506 // TextAdvance() call.
TextNextLineUnsafe(TextStream * txs_ptr,char ** line_iterp)507 HEADER_INLINE PglErr TextNextLineUnsafe(TextStream* txs_ptr, char** line_iterp) {
508 TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
509 if (*line_iterp != basep->consume_stop) {
510 return kPglRetSuccess;
511 }
512 basep->consume_iter = *line_iterp;
513 PglErr reterr = TextAdvance(txs_ptr);
514 // not unlikely() due to eof
515 if (reterr) {
516 return reterr;
517 }
518 *line_iterp = basep->consume_iter;
519 return kPglRetSuccess;
520 }
521
522
TextNextLineLstripUnsafe(TextStream * txs_ptr,char ** line_iterp)523 HEADER_INLINE PglErr TextNextLineLstripUnsafe(TextStream* txs_ptr, char** line_iterp) {
524 char* line_iter = *line_iterp;
525 TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
526 if (line_iter == basep->consume_stop) {
527 basep->consume_iter = line_iter;
528 PglErr reterr = TextAdvance(txs_ptr);
529 // not unlikely() due to eof
530 if (reterr) {
531 return reterr;
532 }
533 line_iter = basep->consume_iter;
534 }
535 *line_iterp = FirstNonTspace(line_iter);
536 return kPglRetSuccess;
537 }
538
TextNextLineLstripUnsafeK(TextStream * txs_ptr,const char ** line_iterp)539 HEADER_INLINE PglErr TextNextLineLstripUnsafeK(TextStream* txs_ptr, const char** line_iterp) {
540 return TextNextLineLstripUnsafe(txs_ptr, K_CAST(char**, line_iterp));
541 }
542
TextGetUnsafe(TextStream * txs_ptr,char ** line_iterp)543 HEADER_INLINE PglErr TextGetUnsafe(TextStream* txs_ptr, char** line_iterp) {
544 char* line_iter = *line_iterp;
545 TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
546 if (line_iter == basep->consume_stop) {
547 basep->consume_iter = line_iter;
548 PglErr reterr = TextAdvance(txs_ptr);
549 // not unlikely() due to eof
550 if (reterr) {
551 return reterr;
552 }
553 line_iter = basep->consume_iter;
554 }
555 line_iter = FirstNonTspace(line_iter);
556 if (!IsEolnKns(*line_iter)) {
557 *line_iterp = line_iter;
558 return kPglRetSuccess;
559 }
560 return TextOnlyEmptyLinesLeft(txs_ptr);
561 }
562
563 /*
564 HEADER_INLINE PglErr TextGetUnsafeK(TextStream* txs_ptr, const char** line_iterp) {
565 return TextGetUnsafe(txs_ptr, K_CAST(char**, line_iterp));
566 }
567 */
568
569 // Returns *zero* when it's time to stop iterating. Designed to be the middle
570 // argument in a for (; ; ) loop.
TextGetUnsafe2(TextStream * txs_ptr,char ** line_iterp)571 HEADER_INLINE uint32_t TextGetUnsafe2(TextStream* txs_ptr, char** line_iterp) {
572 char* line_iter = *line_iterp;
573 TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
574 if (line_iter == basep->consume_stop) {
575 basep->consume_iter = line_iter;
576 // not unlikely() due to eof
577 if (TextAdvance(txs_ptr)) {
578 return 0;
579 }
580 line_iter = basep->consume_iter;
581 }
582 line_iter = FirstNonTspace(line_iter);
583 if (!IsEolnKns(*line_iter)) {
584 *line_iterp = line_iter;
585 return 1;
586 }
587 TextOnlyEmptyLinesLeft(txs_ptr);
588 return 0;
589 }
590
TextGetUnsafe2K(TextStream * txs_ptr,const char ** line_iterp)591 HEADER_INLINE uint32_t TextGetUnsafe2K(TextStream* txs_ptr, const char** line_iterp) {
592 return TextGetUnsafe2(txs_ptr, K_CAST(char**, line_iterp));
593 }
594
TextSetPos(char * new_consume_iter,TextStream * txs_ptr)595 HEADER_INLINE void TextSetPos(char* new_consume_iter, TextStream* txs_ptr) {
596 GET_PRIVATE(*txs_ptr, m).base.consume_iter = new_consume_iter;
597 }
598
599
TextIsMt(const TextStream * txs_ptr)600 HEADER_INLINE uint32_t TextIsMt(const TextStream* txs_ptr) {
601 // Only bgzf decoder is multithreaded for now.
602 return (GET_PRIVATE(*txs_ptr, m).base.file_type == kFileBgzf);
603 }
604
605 PglErr TextRetarget(const char* new_fname, TextStream* txs_ptr);
606
TextRewind(TextStream * txs_ptr)607 HEADER_INLINE PglErr TextRewind(TextStream* txs_ptr) {
608 return TextRetarget(nullptr, txs_ptr);
609 }
610
TextStreamError(const TextStream * txs_ptr)611 HEADER_INLINE const char* TextStreamError(const TextStream* txs_ptr) {
612 return GET_PRIVATE(*txs_ptr, m).base.errmsg;
613 }
614
TextStreamErrcode(const TextStream * txs_ptr)615 HEADER_INLINE PglErr TextStreamErrcode(const TextStream* txs_ptr) {
616 const PglErr reterr = GET_PRIVATE(*txs_ptr, m).base.reterr;
617 if (reterr == kPglRetEof) {
618 return kPglRetSuccess;
619 }
620 return reterr;
621 }
622
623 // Note that this does not assign to *reterrp at all in the usual eof case.
TextStreamErrcode2(const TextStream * txs_ptr,PglErr * reterrp)624 HEADER_INLINE BoolErr TextStreamErrcode2(const TextStream* txs_ptr, PglErr* reterrp) {
625 const PglErr reterr = GET_PRIVATE(*txs_ptr, m).base.reterr;
626 if (reterr == kPglRetEof) {
627 return 0;
628 }
629 *reterrp = reterr;
630 return 1;
631 }
632
633 // Does not convert kPglRetEof -> kPglRetSuccess.
TextStreamRawErrcode(const TextStream * txs_ptr)634 HEADER_INLINE PglErr TextStreamRawErrcode(const TextStream* txs_ptr) {
635 return GET_PRIVATE(*txs_ptr, m).base.reterr;
636 }
637
638 // Ok to pass reterrp == nullptr.
639 // Returns nonzero iff file-close fails, and either reterrp == nullptr or
640 // *reterrp == kPglRetSuccess. In the latter case, *reterrp is set to
641 // kPglRetReadFail. (Note that this does *not* retrieve the existing
642 // txsp->reterr value; caller is responsible for checking TextStreamErrcode()
643 // first when they care.)
644 BoolErr CleanupTextStream(TextStream* txs_ptr, PglErr* reterrp);
645
646
// Low-level token-batch-reading interface, using the extra TextStream mode
// which cares about token rather than line endings.
// This is a thin wrapper around a single TextStream; the Token* functions
// below operate on the embedded txs member.
typedef struct TokenStreamStruct {
  NONCOPYABLE(TokenStreamStruct);
  TextStream txs;
} TokenStream;
653
PreinitTokenStream(TokenStream * tksp)654 HEADER_INLINE void PreinitTokenStream(TokenStream* tksp) {
655 PreinitTextStream(&tksp->txs);
656 }
657
// Note that shard_boundaries must have length (shard_ct + 1).
PglErr TksNext(TokenStream* tksp, uint32_t shard_ct, char** shard_boundaries);
660
TokenStreamRetarget(const char * new_fname,TokenStream * tksp)661 HEADER_INLINE PglErr TokenStreamRetarget(const char* new_fname, TokenStream* tksp) {
662 return TextRetarget(new_fname, &(tksp->txs));
663 }
664
TokenRewind(TokenStream * tksp)665 HEADER_INLINE PglErr TokenRewind(TokenStream* tksp) {
666 return TextRetarget(nullptr, &(tksp->txs));
667 }
668
TokenStreamError(const TokenStream * tksp)669 HEADER_INLINE const char* TokenStreamError(const TokenStream* tksp) {
670 return GET_PRIVATE(tksp->txs, m).base.errmsg;
671 }
672
TokenStreamErrcode(const TokenStream * tksp)673 HEADER_INLINE PglErr TokenStreamErrcode(const TokenStream* tksp) {
674 const PglErr reterr = GET_PRIVATE(tksp->txs, m).base.reterr;
675 if (reterr == kPglRetEof) {
676 return kPglRetSuccess;
677 }
678 return reterr;
679 }
680
CleanupTokenStream(TokenStream * tksp,PglErr * reterrp)681 HEADER_INLINE BoolErr CleanupTokenStream(TokenStream* tksp, PglErr* reterrp) {
682 return CleanupTextStream(&(tksp->txs), reterrp);
683 }
684
685 // Could create a slightly simpler interface for the one-token-at-a-time case,
686 // but I won't bother for now since it's kind of good for the relative cost of
687 // parallelizing token processing to be low.
688
689
690 #ifdef __cplusplus
691 } // namespace plink2
692 #endif
693
694 #endif // __PLINK2_TEXT_H__
695