1 #include "include/pgenlib_read.h"
2 #include "include/pgenlib_write.h"
3
4 // #define SUBSET_TEST
5
main(int32_t argc,char ** argv)6 int32_t main(int32_t argc, char** argv) {
7 #ifdef __cplusplus
8 using namespace plink2;
9 #endif
10 PglErr reterr = kPglRetSuccess;
11 unsigned char* pgfi_alloc = nullptr;
12 unsigned char* pgr_alloc = nullptr;
13 unsigned char* spgw_alloc = nullptr;
14 uintptr_t* genovec = nullptr;
15 uintptr_t* raregeno = nullptr;
16 uintptr_t* sample_include = nullptr;
17 uint32_t* sample_include_cumulative_popcounts = nullptr;
18 uint32_t* difflist_sample_ids = nullptr;
19 FILE* outfile = nullptr;
20 PgenHeaderCtrl header_ctrl;
21 STPgenWriter spgw;
22 uint32_t write_sample_ct;
23 PgenFileInfo pgfi;
24 PgenReader pgr;
25 PreinitPgfi(&pgfi);
26 PreinitPgr(&pgr);
27 PreinitSpgw(&spgw);
28 {
29 const uint32_t use_mmap = 0;
30 if ((argc < 3) || (argc > 5)) {
31 fputs(
32 "Usage:\n"
33 "pgen_compress [input .bed or .pgen] [output filename] {sample_ct}\n"
34 " (sample_ct is required when loading a .bed file)\n"
35 "pgen_compress -u [input .pgen] [output .bed]\n"
36 , stdout);
37 goto main_ret_INVALID_CMDLINE;
38 }
39 const uint32_t decompress = (argv[1][0] == '-') && (argv[1][1] == 'u') && (argv[1][2] == '\0');
40 uint32_t sample_ct = 0xffffffffU;
41 if (S_CAST(uint32_t, argc) == 4 + decompress) {
42 if (ScanPosintDefcap(argv[3 + decompress], &sample_ct)) {
43 goto main_ret_INVALID_CMDLINE;
44 }
45 }
46 char errstr_buf[kPglErrstrBufBlen];
47 uintptr_t cur_alloc_cacheline_ct;
48 reterr = PgfiInitPhase1(argv[1 + decompress], 0xffffffffU, sample_ct, use_mmap, &header_ctrl, &pgfi, &cur_alloc_cacheline_ct, errstr_buf);
49 if (reterr) {
50 fputs(errstr_buf, stderr);
51 goto main_ret_1;
52 }
53 sample_ct = pgfi.raw_sample_ct;
54 if (!sample_ct) {
55 fprintf(stderr, "error: sample_ct == 0\n");
56 goto main_ret_INVALID_CMDLINE;
57 }
58 const uint32_t variant_ct = pgfi.raw_variant_ct;
59 if (!variant_ct) {
60 fprintf(stderr, "error: variant_ct == 0\n");
61 goto main_ret_INVALID_CMDLINE;
62 }
63 if (cachealigned_malloc(cur_alloc_cacheline_ct * kCacheline, &pgfi_alloc)) {
64 goto main_ret_NOMEM;
65 }
66 uint32_t max_vrec_width;
67 // todo: test block-fread
68 reterr = PgfiInitPhase2(header_ctrl, 0, 0, 0, 0, variant_ct, &max_vrec_width, &pgfi, pgfi_alloc, &cur_alloc_cacheline_ct, errstr_buf);
69 if (reterr) {
70 fputs(errstr_buf, stderr);
71 goto main_ret_1;
72 }
73 if (cachealigned_malloc(cur_alloc_cacheline_ct * kCacheline, &pgr_alloc)) {
74 goto main_ret_NOMEM;
75 }
76
77 // modify this when trying block-fread
78 reterr = PgrInit(use_mmap? nullptr : argv[1 + decompress], max_vrec_width, &pgfi, &pgr, pgr_alloc);
79 if (reterr) {
80 fprintf(stderr, "pgr_init error %u\n", S_CAST(uint32_t, reterr));
81 goto main_ret_1;
82 }
83
84 if (S_CAST(uint32_t, argc) == 4 + decompress) {
85 printf("%u variant%s detected.\n", variant_ct, (variant_ct == 1)? "" : "s");
86 } else {
87 printf("%u variant%s and %u sample%s detected.\n", variant_ct, (variant_ct == 1)? "" : "s", sample_ct, (sample_ct == 1)? "" : "s");
88 }
89 if (cachealigned_malloc(NypCtToVecCt(sample_ct) * kBytesPerVec, &genovec)) {
90 goto main_ret_NOMEM;
91 }
92 if (decompress) {
93 outfile = fopen(argv[3], FOPEN_WB);
94 if (!outfile) {
95 goto main_ret_OPEN_FAIL;
96 }
97 PgrSampleSubsetIndex pssi;
98 PgrClearSampleSubsetIndex(&pgr, &pssi);
99 const uintptr_t final_mask = (k1LU << ((sample_ct % kBitsPerWordD2) * 2)) - k1LU;
100 const uint32_t final_widx = NypCtToWordCt(sample_ct) - 1;
101 const uint32_t variant_byte_ct = (sample_ct + 3) / 4;
102 fwrite("l\x1b\x01", 3, 1, outfile);
103 for (uint32_t vidx = 0; vidx < variant_ct; ) {
104 reterr = PgrGet(nullptr, pssi, sample_ct, vidx, &pgr, genovec);
105 if (reterr) {
106 fprintf(stderr, "\nread error %u, vidx=%u\n", S_CAST(uint32_t, reterr), vidx);
107 goto main_ret_1;
108 }
109 PgrPlink2ToPlink1InplaceUnsafe(sample_ct, genovec);
110 if (final_mask) {
111 genovec[final_widx] &= final_mask;
112 }
113 fwrite(genovec, variant_byte_ct, 1, outfile);
114 ++vidx;
115 if (!(vidx % 100000)) {
116 printf("\r%u.%um variants decompressed.", vidx / 1000000, (vidx / 100000) % 10);
117 fflush(stdout);
118 }
119 }
120 if (fclose_null(&outfile)) {
121 goto main_ret_WRITE_FAIL;
122 }
123 printf("\n");
124 goto main_ret_1;
125 }
126 #ifdef SUBSET_TEST
127 // write_sample_ct = sample_ct - 3;
128 write_sample_ct = 3;
129 #else
130 write_sample_ct = sample_ct;
131 #endif
132 uint32_t max_vrec_len;
133 reterr = SpgwInitPhase1(argv[2], nullptr, nullptr, variant_ct, write_sample_ct, kfPgenGlobal0, 2, &spgw, &cur_alloc_cacheline_ct, &max_vrec_len);
134 if (reterr) {
135 fprintf(stderr, "compression phase 1 error %u\n", S_CAST(uint32_t, reterr));
136 goto main_ret_1;
137 }
138 if (cachealigned_malloc(cur_alloc_cacheline_ct * kCacheline, &spgw_alloc)) {
139 goto main_ret_NOMEM;
140 }
141 SpgwInitPhase2(max_vrec_len, &spgw, spgw_alloc);
142
143 const uint32_t max_simple_difflist_len = sample_ct / kBitsPerWordD2;
144 const uint32_t max_returned_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor);
145 const uint32_t max_difflist_len = 2 * (write_sample_ct / kPglMaxDifflistLenDivisor);
146 if (cachealigned_malloc(RoundUpPow2((max_returned_difflist_len + 3) / 4, kCacheline), &raregeno) ||
147 cachealigned_malloc(RoundUpPow2((sample_ct + 7) / 8, kCacheline), &sample_include) ||
148 cachealigned_malloc(RoundUpPow2((1 + (sample_ct / kBitsPerWord)) * sizeof(int32_t), kCacheline), &sample_include_cumulative_popcounts) ||
149 cachealigned_malloc(RoundUpPow2((max_returned_difflist_len + 1) * sizeof(int32_t), kCacheline), &difflist_sample_ids)) {
150 goto main_ret_NOMEM;
151 }
152 #ifdef SUBSET_TEST
153 fill_ulong_zero(BitCtToWordCt(sample_ct), sample_include);
154 SetBit(123, sample_include);
155 SetBit(127, sample_include);
156 SetBit(320, sample_include);
157 // ClearBit(123, sample_include);
158 // ClearBit(127, sample_include);
159 // ClearBit(320, sample_include);
160 #else
161 SetAllBits(sample_ct, sample_include);
162 #endif
163 FillCumulativePopcounts(sample_include, 1 + (sample_ct / kBitsPerWord), sample_include_cumulative_popcounts);
164 PgrSampleSubsetIndex pssi;
165 PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, &pgr, &pssi);
166 for (uint32_t vidx = 0; vidx < variant_ct; ) {
167 uint32_t difflist_common_geno;
168 uint32_t difflist_len;
169 reterr = PgrGetDifflistOrGenovec(sample_include, pssi, write_sample_ct, max_simple_difflist_len, vidx, &pgr, genovec, &difflist_common_geno, raregeno, difflist_sample_ids, &difflist_len);
170 if (reterr) {
171 fprintf(stderr, "\nread error %u, vidx=%u\n", S_CAST(uint32_t, reterr), vidx);
172 goto main_ret_1;
173 }
174 if (difflist_common_geno == 0xffffffffU) {
175 ZeroTrailingBits(write_sample_ct * 2, genovec);
176 reterr = SpgwAppendBiallelicGenovec(genovec, &spgw);
177 } else if (difflist_len <= max_difflist_len) {
178 ZeroTrailingBits(2 * difflist_len, raregeno);
179 difflist_sample_ids[difflist_len] = write_sample_ct;
180 reterr = SpgwAppendBiallelicDifflistLimited(raregeno, difflist_sample_ids, difflist_common_geno, difflist_len, &spgw);
181 } else {
182 PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, write_sample_ct, difflist_len, genovec);
183 ZeroTrailingBits(write_sample_ct * 2, genovec);
184 reterr = SpgwAppendBiallelicGenovec(genovec, &spgw);
185 }
186 if (reterr) {
187 fprintf(stderr, "\ncompress/write error %u, vidx=%u\n", S_CAST(uint32_t, reterr), vidx);
188 goto main_ret_1;
189 }
190 ++vidx;
191 if (!(vidx % 100000)) {
192 printf("\r%u.%um variants compressed.", vidx / 1000000, (vidx / 100000) % 10);
193 fflush(stdout);
194 }
195 }
196 }
197 printf("\n");
198
199 SpgwFinish(&spgw);
200 while (0) {
201 main_ret_NOMEM:
202 reterr = kPglRetNomem;
203 break;
204 main_ret_OPEN_FAIL:
205 reterr = kPglRetOpenFail;
206 break;
207 main_ret_WRITE_FAIL:
208 reterr = kPglRetWriteFail;
209 break;
210 main_ret_INVALID_CMDLINE:
211 reterr = kPglRetInvalidCmdline;
212 break;
213 }
214 main_ret_1:
215 CleanupPgr(&pgr, &reterr);
216 #ifndef NO_MMAP
217 CleanupPgfi(&pgfi, &reterr);
218 #endif
219 CleanupSpgw(&spgw, &reterr);
220 if (pgfi_alloc) {
221 aligned_free(pgfi_alloc);
222 }
223 if (pgr_alloc) {
224 aligned_free(pgr_alloc);
225 }
226 if (spgw_alloc) {
227 aligned_free(spgw_alloc);
228 }
229 if (genovec) {
230 aligned_free(genovec);
231 }
232 if (raregeno) {
233 aligned_free(raregeno);
234 }
235 if (sample_include) {
236 aligned_free(sample_include);
237 }
238 if (sample_include_cumulative_popcounts) {
239 aligned_free(sample_include_cumulative_popcounts);
240 }
241 if (difflist_sample_ids) {
242 aligned_free(difflist_sample_ids);
243 }
244 if (outfile) {
245 fclose(outfile);
246 }
247 return S_CAST(int32_t, reterr);
248 }
249