1 #ifndef __PLINK2_TEXT_H__
2 #define __PLINK2_TEXT_H__
3 
4 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
5 // Christopher Chang.
6 //
7 // This library is free software: you can redistribute it and/or modify it
8 // under the terms of the GNU Lesser General Public License as published by the
9 // Free Software Foundation, either version 3 of the License, or (at your
10 // option) any later version.
11 //
12 // This library is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
15 // for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public License
18 // along with this library.  If not, see <http://www.gnu.org/licenses/>.
19 
20 
21 // Scanning one line at a time from a text file is one of the most common
22 // workflows in all of computing.
23 //
24 // Usually, text files are small; the obvious reason to choose text over binary
25 // is human-readability, after all, and humans can't read multi-gigabyte files
26 // in a reasonable amount of time.  As a consequence, the most commonly used C
27 // and C++ text-processing library functions sacrifice a substantial amount of
28 // performance in favor of ease-of-use.
29 //
30 // However, plink2 is frequently asked to load a multi-gigabyte text file and
31 // then do something very simple with it.  Often, the file is in the operating
32 // system's page cache, since the user or script is doing multiple things with
33 // the file and they're split across multiple invocations of plink2 and other
34 // programs.  In this setting, the usual "I/O cost > processing cost, it isn't
35 // worth worrying much about the latter" assumption is very, very wrong, and it
36 // is worth going to great lengths to keep baseline text-processing cost to a
37 // minimum.
38 //
39 // In addition, multi-gigabyte text files are practically guaranteed to
40 // compress well, and gzipped and bgzipped text files are widely used in
41 // bioinformatics practice.  Ordinarily, when sequentially processing a text
42 // file, there's little to gain from spawning a separate thread to issue
43 // file-read requests, since a modern operating system will recognize the
44 // access pattern and read-ahead from the disk on its own.  However, the
45 // operating system can't *decompress-ahead* for you; and when decompression
46 // has comparable latency to processing, decompress-ahead reduces runtime by up
47 // to 50%.
48 //
49 // Thus, this library provides a text reader that
50 // 1. allows the caller to treat gzipped and Zstd-compressed text files as if
51 //    they were uncompressed.  This is functionally identical to Zstd's
52 //    zlibWrapper, which was used for most of plink2's alpha testing period,
53 //    but I've decided to phase out zlibWrapper thanks to compilation headaches
54 //    and its static-linking requirement.
55 // 2. decompresses-ahead, potentially with multiple threads.
56 //    a. For now, multithreaded decompression can only kick in for bgzipped
57 //       files.  However, if a clear use case exists, it should be possible to
58 //       build a multithreaded Zstd decoder that isn't restricted to a Zstd
59 //       sub-format; see
60 //         https://github.com/facebook/zstd/issues/1702#issuecomment-515124700
61 //       If you have such a use case, I recommend responding to Yann Collet in
62 //       that GitHub issue.
63 //    b. Tabix-based seek support was considered and rejected, since the tabix
64 //       index only stores CHROM/POS, while plink2 also needs record numbers in
65 //       its most critical use case (.pvar loading).  A suitable index format
66 //       may be adopted or developed later, and if/when that's supported, there
67 //       will probably also be a way for callers which don't need record
68 //       numbers to exploit tabix indexes.  (In that event, plink2's Zstd
69 //       compressor will be modified to support seekable-Zstd output and index
70 //       generation.)
71 // 3. has line-reader functions that don't force the user to provide their own
72 //    buffer to put the line in.  Instead, they just return a (possibly const)
73 //    pointer to the beginning of the line and expose a pointer to the end of
74 //    the line.  This simultaneously saves memory and reduces overhead.
75 //    a. Since this reuses a single buffer, the string-view is invalidated when
76 //       the next line is read.
77 //    b. When the last line in the file is not terminated by '\n', this text
78 //       reader automatically appends '\n'.  Thus, while C library functions
79 //       that assume null-termination can't be used here (unless you're using a
80 //       wrapper function that replaces the terminating '\n' with '\0' after
81 //       the line-read function call, which is a totally valid thing to do),
82 //       plink2_string.h functions which either iterate to any end-of-line
83 //       character (ASCII code < 32 and unequal to 9=tab) or explicitly assume
84 //       '\n'-termination can be used, and these use essentially the same
85 //       optimizations as modern memchr implementations.
86 //    (A C++17 interface that returns std::string_view objects was considered,
87 //    but then rejected since std::string_view's design makes it much better
88 //    suited to be a function input-parameter type than a return type.  It is
89 //    easy enough to efficiently construct a string_view using the current
90 //    interface when it's time to call a function that accepts one.  The rest
91 //    of the time, there's no meaningful advantage over plain C pointers.)
92 // 4. can be used with either a single fixed-size memory buffer (this plays
93 //    well with plink2's memory allocation strategy), or dynamic resizing with
94 //    malloc()/realloc() calls.
95 //
96 // Two other readers are provided:
97 // - A decompress-ahead token reader.  This also shards the tokens, for the
98 //   common use case where the tokens don't need to be parsed in order (e.g.
99 //   --extract/--exclude).
100 // - A simpler single-threaded (no decompress-ahead) reader.
101 
102 #ifdef STATIC_ZLIB
103 #  include "../../zlib-1.2.11/zlib.h"
104 #else
105 #  include <zlib.h>
106 #  if !defined(ZLIB_VERNUM) || (ZLIB_VERNUM < 0x1240)
107 #    error "plink2_text requires zlib 1.2.4 or later."
108 #  endif
109 #endif
110 
111 #include "plink2_bgzf.h"
112 #include "plink2_zstfile.h"
113 
114 #ifdef __cplusplus
115 namespace plink2 {
116 #endif
117 
// Declaration only; the definition is not part of this header.
// NOTE(review): judging by the signature, this inspects the file named by
// fname and stores the detected compression type in *ftype_ptr -- confirm
// against the implementation, including which PglErr values it can return.
PglErr GetFileType(const char* fname, FileCompressionType* ftype_ptr);
119 
// Reader state common to textFILE and TextStream: it is the "base" member of
// both textFILEMain and TextStreamMain.
typedef struct TextFileBaseStruct {
  // Not noncopyable, since this is copied wholesale by the TextStream
  // move-constructor.

  // Positioned first so the compiler doesn't need to add an offset to access
  // it.
  // Current consume position: the inline line-readers hand out this pointer
  // (optionally after skipping leading whitespace) and then move it just past
  // the next '\n'.
  char* consume_iter;

  // When consume_iter reaches this value, the inline line-readers call the
  // corresponding *Advance() function before returning a line.
  char* consume_stop;  // should always point after the last loaded \n

  // Exposed through TextFileError() / TextStreamError().
  const char* errmsg;
  // Exposed through the *Errcode() accessors; kPglRetEof is the value that
  // TextFileEof() / TextEof() test for.
  PglErr reterr;

  FileCompressionType file_type;
  // nullptr iff TextFileIsOpen() / TextIsOpen() report "not open".
  FILE* ff;  // could use e.g. htslib for some network support later
  uint32_t dst_owned_by_consumer;
  // Presumably the enforced_max_line_blen value passed to the *OpenEx()
  // functions -- confirm against the implementation.
  uint32_t enforced_max_line_blen;
  // Note that when dst_owned_by_consumer isn't true, the reader thread can
  // alter the values below.  However, this can only happen when the consumer
  // thread is blocked waiting for the next line, or before it's attempted to
  // read the first line.  So there's no advantage to placing these in a
  // different cacheline from consume_iter (which is constantly altered by the
  // consumer).
  char* dst;
  // Exposed through TextFileLinebufLen().
  uint32_t dst_len;
  uint32_t dst_capacity;
} TextFileBase;
147 
// Decoder state for the generic-gzip member of the RawDecompressStream and
// RawMtDecompressStream unions; wraps a zlib z_stream.
typedef struct GzRawDecompressStreamStruct {
  // Copied by TextStream move-constructor.
  // NOTE(review): presumably the compressed-input staging buffer fed to ds --
  // confirm against the implementation.
  unsigned char* in;
  z_stream ds;
  // NOTE(review): presumably nonzero once ds has been set up and needs
  // teardown -- confirm against the implementation.
  uint32_t ds_initialized;
} GzRawDecompressStream;
154 
155 // BgzfRawDecompressStream declared in plink2_bgzf.h.
156 
// Decoder state for the Zstd member of the RawDecompressStream and
// RawMtDecompressStream unions: a Zstd streaming-decompression context plus
// the ZSTD_inBuffer describing its input.
typedef struct ZstRawDecompressStreamStruct {
  // Copied by TextStream move-constructor.
  ZSTD_DStream* ds;
  ZSTD_inBuffer ib;
} ZstRawDecompressStream;
162 
// Decoder state used by textFILE (the "rds" member of textFILEMain), with one
// member per supported compressed format.
// NOTE(review): which member is active is presumably determined by
// TextFileBase.file_type -- confirm against the implementation.
typedef union {
  GzRawDecompressStream gz;
  // Even in the single-threaded case, it's worth distinguishing bgzf from
  // generic .gz, since with bgzf we can use libdeflate.
  BgzfRawDecompressStream bgzf;
  ZstRawDecompressStream zst;
} RawDecompressStream;
170 
// Actual contents of a textFILE: the shared reader state followed by the
// format-specific decompressor state.
typedef struct textFILEMainStruct {
  TextFileBase base;
  RawDecompressStream rds;
} textFILEMain;
175 
// Public handle type for the textFILE reader.  It holds a single
// textFILEMain member; in C++ builds that member is private and is reached
// through the GET_PRIVATE_m() accessors, which the GET_PRIVATE() invocations
// in this header presumably resolve to.
typedef struct textFILEStruct {
#ifdef __cplusplus
  textFILEMain& GET_PRIVATE_m() { return m; }
  textFILEMain const& GET_PRIVATE_m() const { return m; }
 private:
#endif
  textFILEMain m;
} textFILE;
184 
// Declaration only; the definition is not part of this header.
// NOTE(review): by its name, this puts *txf_ptr into a known "not open" state
// before any TextFileOpen*() call -- confirm against the implementation.
void PreinitTextFile(textFILE* txf_ptr);
186 
// kDecompressChunkSize = 1 MiB currently declared in plink2_bgzf.h, may move
// somewhere closer to the base later.
// All buffer-size constants below are expressed as multiples of that chunk
// size.
CONSTI32(kTextStreamBlenFast, 11 * kDecompressChunkSize);
CONSTI32(kTokenStreamBlen, 12 * kDecompressChunkSize);
CONSTI32(kMaxTokenBlen, 8 * kDecompressChunkSize);
// A maximum-length token plus one decompression chunk must fit within
// kTokenStreamBlen.
static_assert(kMaxTokenBlen >= kDecompressChunkSize, "kMaxTokenBlen too small.");
static_assert(kMaxTokenBlen + kDecompressChunkSize <= kTokenStreamBlen, "kMaxTokenBlen + kDecompressChunkSize can't be larger than kTokenStreamBlen.");

// max_line_blen and enforced_max_line_blen lower bound.
CONSTI32(kDecompressMinBlen, kDecompressChunkSize);

// Works out to 2 * kDecompressChunkSize.
CONSTI32(kDecompressMinCapacity, kDecompressMinBlen + kDecompressChunkSize);
199 
// Opens fname for reading through *txf_ptr.
// * Can return nomem, open-fail, or read-fail.
// * If dst == nullptr, this mallocs a buffer of size 2 * kDecompressChunkSize,
//   and it'll be realloced as necessary and freed by CleanupTextFile().
//   The original value of dst_capacity doesn't matter in this case.
//   Otherwise, the buffer is owned by the caller, assumed to have size >=
//   dst_capacity, and never grown.
// * enforced_max_line_blen must be >= dst_capacity - kDecompressChunkSize.
//   It's the point at which long-line errors instead of out-of-memory errors
//   are reported.  It isn't permitted to be less than 1 MiB.
PglErr TextFileOpenEx(const char* fname, uint32_t enforced_max_line_blen, uint32_t dst_capacity, char* dst, textFILE* txf_ptr);
210 
TextFileOpen(const char * fname,textFILE * txf_ptr)211 HEADER_INLINE PglErr TextFileOpen(const char* fname, textFILE* txf_ptr) {
212   return TextFileOpenEx(fname, kMaxLongLine, 0, nullptr, txf_ptr);
213 }
214 
// Error-message strings defined outside this header.
// NOTE(review): by their names, these are the short messages for the
// "line too long" and "unexpected interior empty line" conditions -- confirm
// the exact text against the definitions.
extern const char kShortErrLongLine[];
extern const char kShortErrInteriorEmptyLine[];
217 
// Declaration only.  The inline textFILE line-readers in this header call
// this when consume_iter has reached consume_stop; on a zero return they read
// the next line starting at base.consume_iter, and any nonzero return (which
// includes ordinary end-of-file) makes them stop.
PglErr TextFileAdvance(textFILE* txf_ptr);
219 
TextFileNextLine(textFILE * txf_ptr,char ** line_startp)220 HEADER_INLINE PglErr TextFileNextLine(textFILE* txf_ptr, char** line_startp) {
221   TextFileBase* basep = &GET_PRIVATE(*txf_ptr, m).base;
222   if (basep->consume_iter == basep->consume_stop) {
223     PglErr reterr = TextFileAdvance(txf_ptr);
224     // not unlikely() due to eof
225     if (reterr) {
226       return reterr;
227     }
228   }
229   *line_startp = basep->consume_iter;
230   basep->consume_iter = AdvPastDelim(*line_startp, '\n');
231   return kPglRetSuccess;
232 }
233 
TextFileLineEnd(textFILE * txf_ptr)234 HEADER_INLINE char* TextFileLineEnd(textFILE* txf_ptr) {
235   return GET_PRIVATE(*txf_ptr, m).base.consume_iter;
236 }
237 
TextFileNextLineLstrip(textFILE * txf_ptr,char ** line_startp)238 HEADER_INLINE PglErr TextFileNextLineLstrip(textFILE* txf_ptr, char** line_startp) {
239   TextFileBase* basep = &GET_PRIVATE(*txf_ptr, m).base;
240   if (basep->consume_iter == basep->consume_stop) {
241     PglErr reterr = TextFileAdvance(txf_ptr);
242     if (reterr) {
243       return reterr;
244     }
245   }
246   *line_startp = FirstNonTspace(basep->consume_iter);
247   basep->consume_iter = AdvPastDelim(*line_startp, '\n');
248   return kPglRetSuccess;
249 }
250 
// Declaration only.  TextFileGet() calls this after encountering an empty
// line and ignores the return value.
// NOTE(review): presumably mirrors TextOnlyEmptyLinesLeft() (kPglRetEof if
// only empty lines remain, otherwise kPglRetMalformedInput with errmsg set)
// -- confirm against the implementation.
PglErr TextFileOnlyEmptyLinesLeft(textFILE* txf_ptr);
252 
253 // API fix (30-31 Oct 2019): While we want to tolerate *trailing* empty lines
254 // (as well as lack of a final newline), since those arise from manual
255 // text-editing every once in a while, there's usually no good reason to
256 // tolerate *interior* empty lines like we were previously doing.  So the
257 // generic empty-line-skipping interface has been replaced with one that
258 // reports an "Unexpected empty line" error with a kPglRetMalformedInput error
259 // code unless the empty line(s) are at EOF.
TextFileGet(textFILE * txf_ptr)260 HEADER_INLINE char* TextFileGet(textFILE* txf_ptr) {
261   TextFileBase* basep = &GET_PRIVATE(*txf_ptr, m).base;
262   if (basep->consume_iter == basep->consume_stop) {
263     // not unlikely() due to eof
264     if (TextFileAdvance(txf_ptr)) {
265       return nullptr;
266     }
267   }
268   char* line_start = FirstNonTspace(basep->consume_iter);
269   basep->consume_iter = AdvPastDelim(line_start, '\n');
270   if (!IsEolnKns(*line_start)) {
271     return line_start;
272   }
273   TextFileOnlyEmptyLinesLeft(txf_ptr);
274   return nullptr;
275 }
276 
// Declaration only; the definition is not part of this header.
// NOTE(review): by its name, this resets *txf_ptr so that reading restarts
// from the beginning of the file -- confirm against the implementation.
void TextFileRewind(textFILE* txf_ptr);
278 
TextFileIsOpen(const textFILE * txf_ptr)279 HEADER_INLINE int32_t TextFileIsOpen(const textFILE* txf_ptr) {
280   return (GET_PRIVATE(*txf_ptr, m).base.ff != nullptr);
281 }
282 
TextFileEof(const textFILE * txf_ptr)283 HEADER_INLINE int32_t TextFileEof(const textFILE* txf_ptr) {
284   return (GET_PRIVATE(*txf_ptr, m).base.reterr == kPglRetEof);
285 }
286 
TextFileError(const textFILE * txf_ptr)287 HEADER_INLINE const char* TextFileError(const textFILE* txf_ptr) {
288   return GET_PRIVATE(*txf_ptr, m).base.errmsg;
289 }
290 
TextFileErrcode(const textFILE * txf_ptr)291 HEADER_INLINE PglErr TextFileErrcode(const textFILE* txf_ptr) {
292   const PglErr reterr = GET_PRIVATE(*txf_ptr, m).base.reterr;
293   if (reterr == kPglRetEof) {
294     return kPglRetSuccess;
295   }
296   return reterr;
297 }
298 
299 // Note that this does not assign to *reterrp at all in the usual eof case.
TextFileErrcode2(const textFILE * txf_ptr,PglErr * reterrp)300 HEADER_INLINE BoolErr TextFileErrcode2(const textFILE* txf_ptr, PglErr* reterrp) {
301   const PglErr reterr = GET_PRIVATE(*txf_ptr, m).base.reterr;
302   if (reterr == kPglRetEof) {
303     return 0;
304   }
305   *reterrp = reterr;
306   return 1;
307 }
308 
309 // Does not convert kPglRetEof -> kPglRetSuccess.
TextFileRawErrcode(const textFILE * txf_ptr)310 HEADER_INLINE PglErr TextFileRawErrcode(const textFILE* txf_ptr) {
311   return GET_PRIVATE(*txf_ptr, m).base.reterr;
312 }
313 
314 // Relevant when about to move-construct a TextStream; see plink2_glm.
TextFileLinebufLen(const textFILE * txf_ptr)315 HEADER_INLINE uint32_t TextFileLinebufLen(const textFILE* txf_ptr) {
316   return GET_PRIVATE(*txf_ptr, m).base.dst_len;
317 }
318 
// Releases the resources held by *txf_ptr (per TextFileOpenEx(), this
// includes freeing the line buffer when it was allocated by the library).
// Ok to pass reterrp == nullptr.
// Returns nonzero iff file-close fails, and either reterrp == nullptr or
// *reterrp == kPglRetSuccess; this is intended to be followed by logging of
// strerror(errno).  In the latter case, *reterrp is set to kPglRetReadFail.
BoolErr CleanupTextFile(textFILE* txf_ptr, PglErr* reterrp);
324 
325 
// consumer -> reader message
// could add a "close current file and open another one" case
// Stored in TextStreamSync.interrupt.
// NOTE(review): by their names, Retarget asks the reader to switch to (or
// restart) a file and Shutdown asks it to exit -- confirm against the
// implementation.
ENUM_U31_DEF_START()
  kTxsInterruptNone,
  kTxsInterruptRetarget,
  kTxsInterruptShutdown
ENUM_U31_DEF_END(TxsInterrupt);
333 
// State shared between the consumer and the TextStream reader thread.
// TextStreamMain refers to it through a pointer (syncp).
typedef struct TextStreamSyncStruct {
  // Mutex shared state, and everything guarded by the mutex.  Allocated to
  // different cacheline(s) than consume_stop.
#ifdef _WIN32
  CRITICAL_SECTION critical_section;
  HANDLE reader_progress_event;
  HANDLE consumer_progress_event;
#else
  pthread_mutex_t sync_mutex;
  pthread_cond_t reader_progress_condvar;
  pthread_cond_t consumer_progress_condvar;
  // bugfix (7 Mar 2018): need to avoid waiting on consumer_progress_condvar if
  // this is set.  (could also check an appropriate predicate)
  uint32_t consumer_progress_state;

  // NOTE(review): presumably records how many of the sync primitives above
  // were successfully initialized, for cleanup -- confirm against the
  // implementation.
  uint32_t sync_init_state;
#endif

  pthread_t read_thread;

  // NOTE(review): the three pointers below presumably describe positions
  // within the shared line buffer -- confirm their exact invariants against
  // the implementation.
  char* consume_tail;
  char* cur_circular_end;
  char* available_end;

  // Separate from the TextFileBase instances of these values, since we don't
  // want to force the user to worry about these values changing at any moment.
  // Instead, the TextFileBase instances are only updated during TextAdvance()
  // calls and the like.
  const char* errmsg;
  PglErr reterr;  // note that this is set to kPglRetEof once we reach eof

  uint32_t dst_reallocated;
  // Pending consumer -> reader message.
  TxsInterrupt interrupt;
  // NOTE(review): presumably the filename associated with a pending retarget
  // request (see TextRetarget()) -- confirm against the implementation.
  const char* new_fname;
} TextStreamSync;
369 
// Decoder state used by TextStream (the "rds" member of TextStreamMain).
// Same layout idea as RawDecompressStream, except that the bgzf member is the
// BgzfRawMtDecompressStream type.
typedef union {
  GzRawDecompressStream gz;
  BgzfRawMtDecompressStream bgzf;
  ZstRawDecompressStream zst;
} RawMtDecompressStream;
375 
// Actual contents of a TextStream: the shared reader state, the
// format-specific decompressor state, the decompression thread count, and a
// pointer to the consumer/reader synchronization state.
typedef struct TextStreamMainStruct {
  TextFileBase base;
  RawMtDecompressStream rds;
  uint32_t decompress_thread_ct;
  TextStreamSync* syncp;
} TextStreamMain;
382 
// Public handle type for the decompress-ahead text reader.  It holds a single
// TextStreamMain member; in C++ builds that member is private and is reached
// through the GET_PRIVATE_m() accessors, which the GET_PRIVATE() invocations
// in this header presumably resolve to.
typedef struct TextStreamStruct {
#ifdef __cplusplus
  TextStreamMain& GET_PRIVATE_m() { return m; }
  TextStreamMain const& GET_PRIVATE_m() const { return m; }
 private:
#endif
  TextStreamMain m;
} TextStream;
391 
// Declaration only; the definition is not part of this header.
// NOTE(review): by its name, this puts *txs_ptr into a known "not open" state
// before any TextStreamOpen*() call -- confirm against the implementation.
void PreinitTextStream(TextStream* txs_ptr);
393 
// Opens a TextStream, either on a named file or by taking over an existing
// textFILE.
// * Can return nomem, open-fail, read-fail, or thread-create-fail.
// * Exactly one of fname and txf_ptr must be nullptr.  If txf_ptr is null,
//   fname is opened.  Otherwise, the returned stream is "move-constructed"
//   from txf_ptr.
//   When not move-constructing, enforced_max_line_blen, dst_capacity, and dst
//   are interpreted the same way as TextFileOpenEx().
//   When move-constructing, enforced_max_line_blen and dst_capacity may be
//   smaller than what the textFILE was opened with.
// * NOTE(review): decompress_thread_ct is presumably the number of
//   decompression threads to use where the format supports it -- confirm
//   against the implementation.
PglErr TextStreamOpenEx(const char* fname, uint32_t enforced_max_line_blen, uint32_t dst_capacity, uint32_t decompress_thread_ct, textFILE* txf_ptr, char* dst, TextStream* txs_ptr);
403 
TextStreamOpen(const char * fname,TextStream * txs_ptr)404 HEADER_INLINE PglErr TextStreamOpen(const char* fname, TextStream* txs_ptr) {
405   return TextStreamOpenEx(fname, kMaxLongLine, 0, NumCpu(nullptr), nullptr, nullptr, txs_ptr);
406 }
407 
408 // We drop 'Stream' from the function names outside of open/close, to emphasize
409 // that this is the default choice.
410 // (Originally this was named 'TextRstream', but then I realized that the write
411 // case doesn't really care about whether the input is text or binary.)
TextLineEnd(TextStream * txs_ptr)412 HEADER_INLINE char* TextLineEnd(TextStream* txs_ptr) {
413   return GET_PRIVATE(*txs_ptr, m).base.consume_iter;
414 }
415 
TextIsOpen(const TextStream * txs_ptr)416 HEADER_INLINE int32_t TextIsOpen(const TextStream* txs_ptr) {
417   return (GET_PRIVATE(*txs_ptr, m).base.ff != nullptr);
418 }
419 
TextEof(const TextStream * txs_ptr)420 HEADER_INLINE int32_t TextEof(const TextStream* txs_ptr) {
421   return (GET_PRIVATE(*txs_ptr, m).base.reterr == kPglRetEof);
422 }
423 
// Declaration only; the definition is not part of this header.
// NOTE(review): by its name, reports how many decompression threads the
// stream is using -- confirm against the implementation.
uint32_t TextDecompressThreadCt(const TextStream* txs_ptr);
425 
// Declaration only.  The inline TextStream line-readers in this header call
// this when the consume position has reached consume_stop; on a zero return
// they read the next line starting at base.consume_iter, and any nonzero
// return (which includes ordinary end-of-file) makes them stop.
PglErr TextAdvance(TextStream* txs_ptr);
427 
TextNextLine(TextStream * txs_ptr,char ** line_startp)428 HEADER_INLINE PglErr TextNextLine(TextStream* txs_ptr, char** line_startp) {
429   TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
430   if (basep->consume_iter == basep->consume_stop) {
431     PglErr reterr = TextAdvance(txs_ptr);
432     // not unlikely() due to eof
433     if (reterr) {
434       return reterr;
435     }
436   }
437   *line_startp = basep->consume_iter;
438   basep->consume_iter = AdvPastDelim(basep->consume_iter, '\n');
439   return kPglRetSuccess;
440 }
441 
TextNextLineK(TextStream * txs_ptr,const char ** line_startp)442 HEADER_INLINE PglErr TextNextLineK(TextStream* txs_ptr, const char** line_startp) {
443   return TextNextLine(txs_ptr, K_CAST(char**, line_startp));
444 }
445 
446 // plink2 functions strip leading whitespace by default (unless there's a clear
447 // reason not to, e.g. it isn't allowed in VCF files), since plink 1.x inserts
448 // it so often.
TextNextLineLstrip(TextStream * txs_ptr,char ** line_startp)449 HEADER_INLINE PglErr TextNextLineLstrip(TextStream* txs_ptr, char** line_startp) {
450   TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
451   if (basep->consume_iter == basep->consume_stop) {
452     PglErr reterr = TextAdvance(txs_ptr);
453     // not unlikely() due to eof
454     if (reterr) {
455       return reterr;
456     }
457   }
458   *line_startp = FirstNonTspace(basep->consume_iter);
459   basep->consume_iter = AdvPastDelim(*line_startp, '\n');
460   return kPglRetSuccess;
461 }
462 
TextNextLineLstripK(TextStream * txs_ptr,const char ** line_startp)463 HEADER_INLINE PglErr TextNextLineLstripK(TextStream* txs_ptr, const char** line_startp) {
464   return TextNextLineLstrip(txs_ptr, K_CAST(char**, line_startp));
465 }
466 
// Checks whether everything remaining in the stream is empty lines.
// returns kPglRetEof if true, otherwise kPglRetMalformedInput and sets
// errmsg.
// Called by TextGet(), TextGetUnsafe(), and TextGetUnsafe2() after they
// encounter an empty line.
PglErr TextOnlyEmptyLinesLeft(TextStream* txs_ptr);
470 
471 // This was previously named TextNextLineLstripNoempty, and returned a PglErr,
472 // but some compilers were (i) not inlining it and (ii) spamming spurious
473 // uninitialized-variable warnings as a side effect.  And it's plink2's most
474 // common use case.
475 // So we'll use a different interface here (and a shorter name).  User is
476 // expected to call TextStream[Raw]Errcode when nullptr is returned.
TextGet(TextStream * txs_ptr)477 HEADER_INLINE char* TextGet(TextStream* txs_ptr) {
478   TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
479   if (basep->consume_iter == basep->consume_stop) {
480     // not unlikely() due to eof
481     if (TextAdvance(txs_ptr)) {
482       return nullptr;
483     }
484   }
485   char* line_start = FirstNonTspace(basep->consume_iter);
486   basep->consume_iter = AdvPastDelim(line_start, '\n');
487   if (!IsEolnKns(*line_start)) {
488     return line_start;
489   }
490   TextOnlyEmptyLinesLeft(txs_ptr);
491   return nullptr;
492 }
493 
// Declaration only.  TextSkip() forwards to this whenever skip_ct is nonzero.
// NOTE(review): presumably skips skip_ct lines and requires skip_ct > 0 --
// confirm against the implementation.
PglErr TextSkipNz(uintptr_t skip_ct, TextStream* txs_ptr);
495 
TextSkip(uintptr_t skip_ct,TextStream * txs_ptr)496 HEADER_INLINE PglErr TextSkip(uintptr_t skip_ct, TextStream* txs_ptr) {
497   if (skip_ct == 0) {
498     return kPglRetSuccess;
499   }
500   return TextSkipNz(skip_ct, txs_ptr);
501 }
502 
503 
// 'Unsafe' functions require line_iter to already point to the start of the
// line, and don't advance txsp->base.consume_iter past the line they return
// (they only sync it to line_iter right before refilling); they primarily
// wrap the TextAdvance() call.
TextNextLineUnsafe(TextStream * txs_ptr,char ** line_iterp)507 HEADER_INLINE PglErr TextNextLineUnsafe(TextStream* txs_ptr, char** line_iterp) {
508   TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
509   if (*line_iterp != basep->consume_stop) {
510     return kPglRetSuccess;
511   }
512   basep->consume_iter = *line_iterp;
513   PglErr reterr = TextAdvance(txs_ptr);
514   // not unlikely() due to eof
515   if (reterr) {
516     return reterr;
517   }
518   *line_iterp = basep->consume_iter;
519   return kPglRetSuccess;
520 }
521 
522 
TextNextLineLstripUnsafe(TextStream * txs_ptr,char ** line_iterp)523 HEADER_INLINE PglErr TextNextLineLstripUnsafe(TextStream* txs_ptr, char** line_iterp) {
524   char* line_iter = *line_iterp;
525   TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
526   if (line_iter == basep->consume_stop) {
527     basep->consume_iter = line_iter;
528     PglErr reterr = TextAdvance(txs_ptr);
529     // not unlikely() due to eof
530     if (reterr) {
531       return reterr;
532     }
533     line_iter = basep->consume_iter;
534   }
535   *line_iterp = FirstNonTspace(line_iter);
536   return kPglRetSuccess;
537 }
538 
TextNextLineLstripUnsafeK(TextStream * txs_ptr,const char ** line_iterp)539 HEADER_INLINE PglErr TextNextLineLstripUnsafeK(TextStream* txs_ptr, const char** line_iterp) {
540   return TextNextLineLstripUnsafe(txs_ptr, K_CAST(char**, line_iterp));
541 }
542 
TextGetUnsafe(TextStream * txs_ptr,char ** line_iterp)543 HEADER_INLINE PglErr TextGetUnsafe(TextStream* txs_ptr, char** line_iterp) {
544   char* line_iter = *line_iterp;
545   TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
546   if (line_iter == basep->consume_stop) {
547     basep->consume_iter = line_iter;
548     PglErr reterr = TextAdvance(txs_ptr);
549     // not unlikely() due to eof
550     if (reterr) {
551       return reterr;
552     }
553     line_iter = basep->consume_iter;
554   }
555   line_iter = FirstNonTspace(line_iter);
556   if (!IsEolnKns(*line_iter)) {
557     *line_iterp = line_iter;
558     return kPglRetSuccess;
559   }
560   return TextOnlyEmptyLinesLeft(txs_ptr);
561 }
562 
563 /*
564 HEADER_INLINE PglErr TextGetUnsafeK(TextStream* txs_ptr, const char** line_iterp) {
565   return TextGetUnsafe(txs_ptr, K_CAST(char**, line_iterp));
566 }
567 */
568 
569 // Returns *zero* when it's time to stop iterating.  Designed to be the middle
570 // argument in a for (; ; ) loop.
TextGetUnsafe2(TextStream * txs_ptr,char ** line_iterp)571 HEADER_INLINE uint32_t TextGetUnsafe2(TextStream* txs_ptr, char** line_iterp) {
572   char* line_iter = *line_iterp;
573   TextFileBase* basep = &GET_PRIVATE(*txs_ptr, m).base;
574   if (line_iter == basep->consume_stop) {
575     basep->consume_iter = line_iter;
576     // not unlikely() due to eof
577     if (TextAdvance(txs_ptr)) {
578       return 0;
579     }
580     line_iter = basep->consume_iter;
581   }
582   line_iter = FirstNonTspace(line_iter);
583   if (!IsEolnKns(*line_iter)) {
584     *line_iterp = line_iter;
585     return 1;
586   }
587   TextOnlyEmptyLinesLeft(txs_ptr);
588   return 0;
589 }
590 
TextGetUnsafe2K(TextStream * txs_ptr,const char ** line_iterp)591 HEADER_INLINE uint32_t TextGetUnsafe2K(TextStream* txs_ptr, const char** line_iterp) {
592   return TextGetUnsafe2(txs_ptr, K_CAST(char**, line_iterp));
593 }
594 
TextSetPos(char * new_consume_iter,TextStream * txs_ptr)595 HEADER_INLINE void TextSetPos(char* new_consume_iter, TextStream* txs_ptr) {
596   GET_PRIVATE(*txs_ptr, m).base.consume_iter = new_consume_iter;
597 }
598 
599 
TextIsMt(const TextStream * txs_ptr)600 HEADER_INLINE uint32_t TextIsMt(const TextStream* txs_ptr) {
601   // Only bgzf decoder is multithreaded for now.
602   return (GET_PRIVATE(*txs_ptr, m).base.file_type == kFileBgzf);
603 }
604 
// Declaration only; the definition is not part of this header.
// TextRewind() and TokenRewind() call this with new_fname == nullptr.
// NOTE(review): presumably a non-null new_fname switches the stream to that
// file, while nullptr restarts the current one -- confirm against the
// implementation.
PglErr TextRetarget(const char* new_fname, TextStream* txs_ptr);
606 
TextRewind(TextStream * txs_ptr)607 HEADER_INLINE PglErr TextRewind(TextStream* txs_ptr) {
608   return TextRetarget(nullptr, txs_ptr);
609 }
610 
TextStreamError(const TextStream * txs_ptr)611 HEADER_INLINE const char* TextStreamError(const TextStream* txs_ptr) {
612   return GET_PRIVATE(*txs_ptr, m).base.errmsg;
613 }
614 
TextStreamErrcode(const TextStream * txs_ptr)615 HEADER_INLINE PglErr TextStreamErrcode(const TextStream* txs_ptr) {
616   const PglErr reterr = GET_PRIVATE(*txs_ptr, m).base.reterr;
617   if (reterr == kPglRetEof) {
618     return kPglRetSuccess;
619   }
620   return reterr;
621 }
622 
623 // Note that this does not assign to *reterrp at all in the usual eof case.
TextStreamErrcode2(const TextStream * txs_ptr,PglErr * reterrp)624 HEADER_INLINE BoolErr TextStreamErrcode2(const TextStream* txs_ptr, PglErr* reterrp) {
625   const PglErr reterr = GET_PRIVATE(*txs_ptr, m).base.reterr;
626   if (reterr == kPglRetEof) {
627     return 0;
628   }
629   *reterrp = reterr;
630   return 1;
631 }
632 
633 // Does not convert kPglRetEof -> kPglRetSuccess.
TextStreamRawErrcode(const TextStream * txs_ptr)634 HEADER_INLINE PglErr TextStreamRawErrcode(const TextStream* txs_ptr) {
635   return GET_PRIVATE(*txs_ptr, m).base.reterr;
636 }
637 
// Tears down *txs_ptr.
// Ok to pass reterrp == nullptr.
// Returns nonzero iff file-close fails, and either reterrp == nullptr or
// *reterrp == kPglRetSuccess.  In the latter case, *reterrp is set to
// kPglRetReadFail.  (Note that this does *not* retrieve the existing
// txsp->reterr value; caller is responsible for checking TextStreamErrcode()
// first when they care.)
BoolErr CleanupTextStream(TextStream* txs_ptr, PglErr* reterrp);
645 
646 
// Low-level token-batch-reading interface, using the extra TextStream mode
// which cares about token rather than line endings.
// The struct is a thin wrapper around a TextStream; the TokenStream functions
// in this header forward to the corresponding TextStream functions on the
// txs member.
typedef struct TokenStreamStruct {
  NONCOPYABLE(TokenStreamStruct);
  TextStream txs;
} TokenStream;
653 
PreinitTokenStream(TokenStream * tksp)654 HEADER_INLINE void PreinitTokenStream(TokenStream* tksp) {
655   PreinitTextStream(&tksp->txs);
656 }
657 
// Declaration only; the definition is not part of this header.
// Note that shard_boundaries must have length (shard_ct + 1).
// NOTE(review): presumably loads the next batch of tokens and fills
// shard_boundaries so that consecutive entries delimit each shard -- confirm
// against the implementation.
PglErr TksNext(TokenStream* tksp, uint32_t shard_ct, char** shard_boundaries);
660 
TokenStreamRetarget(const char * new_fname,TokenStream * tksp)661 HEADER_INLINE PglErr TokenStreamRetarget(const char* new_fname, TokenStream* tksp) {
662   return TextRetarget(new_fname, &(tksp->txs));
663 }
664 
TokenRewind(TokenStream * tksp)665 HEADER_INLINE PglErr TokenRewind(TokenStream* tksp) {
666   return TextRetarget(nullptr, &(tksp->txs));
667 }
668 
TokenStreamError(const TokenStream * tksp)669 HEADER_INLINE const char* TokenStreamError(const TokenStream* tksp) {
670   return GET_PRIVATE(tksp->txs, m).base.errmsg;
671 }
672 
TokenStreamErrcode(const TokenStream * tksp)673 HEADER_INLINE PglErr TokenStreamErrcode(const TokenStream* tksp) {
674   const PglErr reterr = GET_PRIVATE(tksp->txs, m).base.reterr;
675   if (reterr == kPglRetEof) {
676     return kPglRetSuccess;
677   }
678   return reterr;
679 }
680 
CleanupTokenStream(TokenStream * tksp,PglErr * reterrp)681 HEADER_INLINE BoolErr CleanupTokenStream(TokenStream* tksp, PglErr* reterrp) {
682   return CleanupTextStream(&(tksp->txs), reterrp);
683 }
684 
685 // Could create a slightly simpler interface for the one-token-at-a-time case,
686 // but I won't bother for now since it's kind of good for the relative cost of
687 // parallelizing token processing to be low.
688 
689 
690 #ifdef __cplusplus
691 }  // namespace plink2
692 #endif
693 
694 #endif  // __PLINK2_TEXT_H__
695