1 #include "include/pgenlib_read.h"
2 #include "include/pgenlib_write.h"
3 
4 // #define SUBSET_TEST
5 
main(int32_t argc,char ** argv)6 int32_t main(int32_t argc, char** argv) {
7 #ifdef __cplusplus
8   using namespace plink2;
9 #endif
10   PglErr reterr = kPglRetSuccess;
11   unsigned char* pgfi_alloc = nullptr;
12   unsigned char* pgr_alloc = nullptr;
13   unsigned char* spgw_alloc = nullptr;
14   uintptr_t* genovec = nullptr;
15   uintptr_t* raregeno = nullptr;
16   uintptr_t* sample_include = nullptr;
17   uint32_t* sample_include_cumulative_popcounts = nullptr;
18   uint32_t* difflist_sample_ids = nullptr;
19   FILE* outfile = nullptr;
20   PgenHeaderCtrl header_ctrl;
21   STPgenWriter spgw;
22   uint32_t write_sample_ct;
23   PgenFileInfo pgfi;
24   PgenReader pgr;
25   PreinitPgfi(&pgfi);
26   PreinitPgr(&pgr);
27   PreinitSpgw(&spgw);
28   {
29     const uint32_t use_mmap = 0;
30     if ((argc < 3) || (argc > 5)) {
31       fputs(
32 "Usage:\n"
33 "pgen_compress [input .bed or .pgen] [output filename] {sample_ct}\n"
34 "  (sample_ct is required when loading a .bed file)\n"
35 "pgen_compress -u [input .pgen] [output .bed]\n"
36             , stdout);
37       goto main_ret_INVALID_CMDLINE;
38     }
39     const uint32_t decompress = (argv[1][0] == '-') && (argv[1][1] == 'u') && (argv[1][2] == '\0');
40     uint32_t sample_ct = 0xffffffffU;
41     if (S_CAST(uint32_t, argc) == 4 + decompress) {
42       if (ScanPosintDefcap(argv[3 + decompress], &sample_ct)) {
43         goto main_ret_INVALID_CMDLINE;
44       }
45     }
46     char errstr_buf[kPglErrstrBufBlen];
47     uintptr_t cur_alloc_cacheline_ct;
48     reterr = PgfiInitPhase1(argv[1 + decompress], 0xffffffffU, sample_ct, use_mmap, &header_ctrl, &pgfi, &cur_alloc_cacheline_ct, errstr_buf);
49     if (reterr) {
50       fputs(errstr_buf, stderr);
51       goto main_ret_1;
52     }
53     sample_ct = pgfi.raw_sample_ct;
54     if (!sample_ct) {
55       fprintf(stderr, "error: sample_ct == 0\n");
56       goto main_ret_INVALID_CMDLINE;
57     }
58     const uint32_t variant_ct = pgfi.raw_variant_ct;
59     if (!variant_ct) {
60       fprintf(stderr, "error: variant_ct == 0\n");
61       goto main_ret_INVALID_CMDLINE;
62     }
63     if (cachealigned_malloc(cur_alloc_cacheline_ct * kCacheline, &pgfi_alloc)) {
64       goto main_ret_NOMEM;
65     }
66     uint32_t max_vrec_width;
67     // todo: test block-fread
68     reterr = PgfiInitPhase2(header_ctrl, 0, 0, 0, 0, variant_ct, &max_vrec_width, &pgfi, pgfi_alloc, &cur_alloc_cacheline_ct, errstr_buf);
69     if (reterr) {
70       fputs(errstr_buf, stderr);
71       goto main_ret_1;
72     }
73     if (cachealigned_malloc(cur_alloc_cacheline_ct * kCacheline, &pgr_alloc)) {
74       goto main_ret_NOMEM;
75     }
76 
77     // modify this when trying block-fread
78     reterr = PgrInit(use_mmap? nullptr : argv[1 + decompress], max_vrec_width, &pgfi, &pgr, pgr_alloc);
79     if (reterr) {
80       fprintf(stderr, "pgr_init error %u\n", S_CAST(uint32_t, reterr));
81       goto main_ret_1;
82     }
83 
84     if (S_CAST(uint32_t, argc) == 4 + decompress) {
85       printf("%u variant%s detected.\n", variant_ct, (variant_ct == 1)? "" : "s");
86     } else {
87       printf("%u variant%s and %u sample%s detected.\n", variant_ct, (variant_ct == 1)? "" : "s", sample_ct, (sample_ct == 1)? "" : "s");
88     }
89     if (cachealigned_malloc(NypCtToVecCt(sample_ct) * kBytesPerVec, &genovec)) {
90       goto main_ret_NOMEM;
91     }
92     if (decompress) {
93       outfile = fopen(argv[3], FOPEN_WB);
94       if (!outfile) {
95         goto main_ret_OPEN_FAIL;
96       }
97       PgrSampleSubsetIndex pssi;
98       PgrClearSampleSubsetIndex(&pgr, &pssi);
99       const uintptr_t final_mask = (k1LU << ((sample_ct % kBitsPerWordD2) * 2)) - k1LU;
100       const uint32_t final_widx = NypCtToWordCt(sample_ct) - 1;
101       const uint32_t variant_byte_ct = (sample_ct + 3) / 4;
102       fwrite("l\x1b\x01", 3, 1, outfile);
103       for (uint32_t vidx = 0; vidx < variant_ct; ) {
104         reterr = PgrGet(nullptr, pssi, sample_ct, vidx, &pgr, genovec);
105         if (reterr) {
106           fprintf(stderr, "\nread error %u, vidx=%u\n", S_CAST(uint32_t, reterr), vidx);
107           goto main_ret_1;
108         }
109         PgrPlink2ToPlink1InplaceUnsafe(sample_ct, genovec);
110         if (final_mask) {
111           genovec[final_widx] &= final_mask;
112         }
113         fwrite(genovec, variant_byte_ct, 1, outfile);
114         ++vidx;
115         if (!(vidx % 100000)) {
116           printf("\r%u.%um variants decompressed.", vidx / 1000000, (vidx / 100000) % 10);
117           fflush(stdout);
118         }
119       }
120       if (fclose_null(&outfile)) {
121         goto main_ret_WRITE_FAIL;
122       }
123       printf("\n");
124       goto main_ret_1;
125     }
126 #ifdef SUBSET_TEST
127     // write_sample_ct = sample_ct - 3;
128     write_sample_ct = 3;
129 #else
130     write_sample_ct = sample_ct;
131 #endif
132     uint32_t max_vrec_len;
133     reterr = SpgwInitPhase1(argv[2], nullptr, nullptr, variant_ct, write_sample_ct, kfPgenGlobal0, 2, &spgw, &cur_alloc_cacheline_ct, &max_vrec_len);
134     if (reterr) {
135       fprintf(stderr, "compression phase 1 error %u\n", S_CAST(uint32_t, reterr));
136       goto main_ret_1;
137     }
138     if (cachealigned_malloc(cur_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
139       goto main_ret_NOMEM;
140     }
141     SpgwInitPhase2(max_vrec_len, &spgw, spgw_alloc);
142 
143     const uint32_t max_simple_difflist_len = sample_ct / kBitsPerWordD2;
144     const uint32_t max_returned_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor);
145     const uint32_t max_difflist_len = 2 * (write_sample_ct / kPglMaxDifflistLenDivisor);
146     if (cachealigned_malloc(RoundUpPow2((max_returned_difflist_len + 3) / 4, kCacheline), &raregeno) ||
147         cachealigned_malloc(RoundUpPow2((sample_ct + 7) / 8, kCacheline), &sample_include) ||
148         cachealigned_malloc(RoundUpPow2((1 + (sample_ct / kBitsPerWord)) * sizeof(int32_t), kCacheline), &sample_include_cumulative_popcounts) ||
149         cachealigned_malloc(RoundUpPow2((max_returned_difflist_len + 1) * sizeof(int32_t), kCacheline), &difflist_sample_ids)) {
150       goto main_ret_NOMEM;
151     }
152 #ifdef SUBSET_TEST
153     fill_ulong_zero(BitCtToWordCt(sample_ct), sample_include);
154     SetBit(123, sample_include);
155     SetBit(127, sample_include);
156     SetBit(320, sample_include);
157     // ClearBit(123, sample_include);
158     // ClearBit(127, sample_include);
159     // ClearBit(320, sample_include);
160 #else
161     SetAllBits(sample_ct, sample_include);
162 #endif
163     FillCumulativePopcounts(sample_include, 1 + (sample_ct / kBitsPerWord), sample_include_cumulative_popcounts);
164     PgrSampleSubsetIndex pssi;
165     PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, &pgr, &pssi);
166     for (uint32_t vidx = 0; vidx < variant_ct; ) {
167       uint32_t difflist_common_geno;
168       uint32_t difflist_len;
169       reterr = PgrGetDifflistOrGenovec(sample_include, pssi, write_sample_ct, max_simple_difflist_len, vidx, &pgr, genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len);
170       if (reterr) {
171         fprintf(stderr, "\nread error %u, vidx=%u\n", S_CAST(uint32_t, reterr), vidx);
172         goto main_ret_1;
173       }
174       if (difflist_common_geno == 0xffffffffU) {
175         ZeroTrailingBits(write_sample_ct * 2, genovec);
176         reterr = SpgwAppendBiallelicGenovec(genovec, &spgw);
177       } else if (difflist_len <= max_difflist_len) {
178         ZeroTrailingBits(2 * difflist_len, raregeno);
179         difflist_sample_ids[difflist_len] = write_sample_ct;
180         reterr = SpgwAppendBiallelicDifflistLimited(raregeno, difflist_sample_ids, difflist_common_geno, difflist_len, &spgw);
181       } else {
182         PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, write_sample_ct, difflist_len, genovec);
183         ZeroTrailingBits(write_sample_ct * 2, genovec);
184         reterr = SpgwAppendBiallelicGenovec(genovec, &spgw);
185       }
186       if (reterr) {
187         fprintf(stderr, "\ncompress/write error %u, vidx=%u\n", S_CAST(uint32_t, reterr), vidx);
188         goto main_ret_1;
189       }
190       ++vidx;
191       if (!(vidx % 100000)) {
192         printf("\r%u.%um variants compressed.", vidx / 1000000, (vidx / 100000) % 10);
193         fflush(stdout);
194       }
195     }
196   }
197   printf("\n");
198 
199   SpgwFinish(&spgw);
200   while (0) {
201   main_ret_NOMEM:
202     reterr = kPglRetNomem;
203     break;
204   main_ret_OPEN_FAIL:
205     reterr = kPglRetOpenFail;
206     break;
207   main_ret_WRITE_FAIL:
208     reterr = kPglRetWriteFail;
209     break;
210   main_ret_INVALID_CMDLINE:
211     reterr = kPglRetInvalidCmdline;
212     break;
213   }
214  main_ret_1:
215   CleanupPgr(&pgr, &reterr);
216 #ifndef NO_MMAP
217   CleanupPgfi(&pgfi, &reterr);
218 #endif
219   CleanupSpgw(&spgw, &reterr);
220   if (pgfi_alloc) {
221     aligned_free(pgfi_alloc);
222   }
223   if (pgr_alloc) {
224     aligned_free(pgr_alloc);
225   }
226   if (spgw_alloc) {
227     aligned_free(spgw_alloc);
228   }
229   if (genovec) {
230     aligned_free(genovec);
231   }
232   if (raregeno) {
233     aligned_free(raregeno);
234   }
235   if (sample_include) {
236     aligned_free(sample_include);
237   }
238   if (sample_include_cumulative_popcounts) {
239     aligned_free(sample_include_cumulative_popcounts);
240   }
241   if (difflist_sample_ids) {
242     aligned_free(difflist_sample_ids);
243   }
244   if (outfile) {
245     fclose(outfile);
246   }
247   return S_CAST(int32_t, reterr);
248 }
249