1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2010-2016, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // sql_utilities.h -- template routines to read and write Stacks SQL file formats.
23 //
24 #ifndef __SQL_UTILITIES_H__
25 #define __SQL_UTILITIES_H__
26
27 #include <unordered_set>
28
29 #include "input.h"
30 #include "utils.h"
31
32 //
33 // The expected number of tab-separated fields in our SQL input files.
34 //
35 const uint num_tags_fields = 9;
36 const uint num_snps_fields = 9;
37 const uint num_alleles_fields = 5;
38 const uint num_matches_fields = 6;
39
40 void load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verbose=true);
41 int load_model_results(string sample, map<int, ModRes *> &modres);
42 int load_snp_calls(string sample, map<int, SNPRes *> &snpres);
43
// retrieve_bijective_loci()
45 // ----------
46 // Returns pairs of (sample locus ID, catalog locus ID) for which there is a bijective relation.
47 vector<pair<int, int> > retrieve_bijective_loci(const vector<CatMatch*>& matches);
48 vector<pair<int, int> > retrieve_bijective_loci(const vector<pair<int,int>>& sloc_cloc_id_pairs);
49
50 template <class LocusT>
51 int
52 load_loci(const string& sample, map<int, LocusT *> &loci, int store_reads, bool load_all_model_calls, bool &compressed, bool verbose=true)
53 {
54 using namespace std;
55
56 LocusT *c;
57 SNP *snp;
58 string f;
59 char *cmp;
60 const char *p, *q;
61 int len;
62 vector<string> parts;
63 set<int> blacklisted;
64 long int line_num;
65 ifstream fh;
66 gzFile gz_fh;
67
68 char *line = new char[max_len];
69 int size = max_len;
70 bool gzip = false;
71 bool open_fail = true;
72 int fh_status = 1;
73
74 if (open_fail) {
75 //
76 // Test for a TAGs file.
77 //
78 f = sample + ".tags.tsv";
79 fh.open(f.c_str(), ifstream::in);
80 if (fh.fail())
81 open_fail = true;
82 else
83 open_fail = false;
84 }
85
86 if (open_fail) {
87 //
88 // Test for a gzipped TAGs file.
89 //
90 f = sample + ".tags.tsv.gz";
91 gz_fh = gzopen(f.c_str(), "rb");
92 if (!gz_fh) {
93 open_fail = true;
94 } else {
95 open_fail = false;
96 #if ZLIB_VERNUM >= 0x1240
97 gzbuffer(gz_fh, libz_buffer_size);
98 #endif
99 gzip = true;
100 }
101 }
102
103 if (open_fail) {
104 cerr << " Unable to open '" << sample << "'\n";
105 return 0;
106 }
107
108 if (verbose)
109 cerr << " Parsing " << f.c_str() << "\n";
110
111 uint id;
112
113 line_num = 1;
114 while (fh_status) {
115 fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
116
117 if (!fh_status && strlen(line) == 0)
118 continue;
119
120 if (is_comment(line)) continue;
121
122 parse_tsv(line, parts);
123
124 if (parts.size() != num_tags_fields) {
125 cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
126 return 0;
127 }
128
129 id = atoi(parts[1].c_str());
130
131 if (parts[2] != "consensus") {
132 if (blacklisted.count(id)) continue;
133
134 //
135 // Make sure this locus has already been defined (consensus sequence SHOULD always
136 // be specified first in the file for a particular locus).
137 //
138 if (loci.count(id) > 0) {
139 //
140 // Read the model sequence, a series of letters specifying if the model called a
141 // homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
142 //
143 if (parts[2] == "model") {
144 loci[id]->model = new char[parts[5].length() + 1];
145 strcpy(loci[id]->model, parts[5].c_str());
146
147 } else {
148 //
149 // Otherwise, we expect a primary or secondary read, record these if specified.
150 //
151 loci[id]->depth++;
152
153 if (store_reads >= 1) {
154 if (store_reads >= 2) {
155 // Load the actual sequences (otherwise don't).
156 char *read = new char[parts[5].length() + 1];
157 strcpy(read, parts[5].c_str());
158 loci[id]->reads.push_back(read);
159 }
160
161 char *read_id = new char[parts[4].length() + 1];
162 strcpy(read_id, parts[4].c_str());
163 loci[id]->comp.push_back(read_id);
164 //
165 // Store the internal stack number for this read.
166 //
167 loci[id]->comp_cnt.push_back(atoi(parts[3].c_str()));
168
169 //
170 // Store the read type.
171 //
172 if (parts[2] == "primary")
173 loci[id]->comp_type.push_back(primary);
174 else
175 loci[id]->comp_type.push_back(secondary);
176 }
177 }
178
179 continue;
180 } else {
181 cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (stack " << id << " does not exist).\n";
182 return 0;
183 }
184 }
185
186 //
187 // Do not include blacklisted tags in the catalog. They are tags that are composed
188 // of noise and/or repetitive sequence.
189 //
190 if (parts[7] == "1") {
191 blacklisted.insert(id);
192 continue;
193 }
194
195 c = new LocusT;
196 c->sample_id = atoi(parts[0].c_str());
197 c->id = id;
198 c->add_consensus(parts[5].c_str());
199
200 //
201 // Read in the flags
202 //
203 c->deleveraged = (parts[6] == "1" ? true : false);
204 c->lumberjackstack = (parts[8] == "1" ? true : false);
205
206 //
207 // Parse the components of this stack (either the Illumina ID, or the catalog constituents)
208 //
209 q = parts[4].c_str();
210 while (*q != '\0') {
211 for (p = q; *q != ',' && *q != '\0'; q++);
212 len = q - p;
213 cmp = new char[len + 1];
214 strncpy(cmp, p, len);
215 cmp[len] = '\0';
216 c->comp.push_back(cmp);
217 if (*q != '\0') q++;
218 }
219
220 loci[c->id] = c;
221
222 line_num++;
223 }
224
225 if (gzip)
226 gzclose(gz_fh);
227 else
228 fh.close();
229
230 //
231 // Next, parse the SNP file and load model calls.
232 //
233 gzip = false;
234 fh_status = 1;
235 line_num = 1;
236
237 f = sample + ".snps.tsv";
238 fh.open(f.c_str(), ifstream::in);
239 if (fh.fail()) {
240 //
241 // Test for a gzipped file.
242 //
243 f = sample + ".snps.tsv.gz";
244 gz_fh = gzopen(f.c_str(), "rb");
245 if (!gz_fh) {
246 cerr << " Unable to open '" << sample << "'\n";
247 return 0;
248 }
249 #if ZLIB_VERNUM >= 0x1240
250 gzbuffer(gz_fh, libz_buffer_size);
251 #endif
252 gzip = true;
253 compressed = true;
254 }
255 if (verbose)
256 cerr << " Parsing " << f.c_str() << "\n";
257
258 while (fh_status) {
259 fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
260
261 if (!fh_status && strlen(line) == 0)
262 continue;
263
264 if (is_comment(line)) continue;
265
266 parse_tsv(line, parts);
267
268 if (parts.size() != num_snps_fields) {
269 cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
270 return 0;
271 }
272
273 id = atoi(parts[1].c_str());
274
275 if (blacklisted.count(id))
276 continue;
277
278 //
279 // Only load heterozygous model calls.
280 //
281 if (load_all_model_calls == false && parts[3] != "E")
282 continue;
283
284 snp = new SNP;
285 snp->col = atoi(parts[2].c_str());
286 snp->lratio = atof(parts[4].c_str());
287 snp->rank_1 = parts[5].at(0);
288 snp->rank_2 = parts[6].at(0) == '-' ? 0 : parts[6].at(0);
289
290 if (parts[3] == "E")
291 snp->type = snp_type_het;
292 else if (parts[3] == "O")
293 snp->type = snp_type_hom;
294 else
295 snp->type = snp_type_unk;
296
297 if (parts[7].length() == 0 || parts[7].at(0) == '-')
298 snp->rank_3 = 0;
299 else
300 snp->rank_3 = parts[7].at(0);
301
302 if (parts[8].length() == 0 || parts[8].at(0) == '-')
303 snp->rank_4 = 0;
304 else
305 snp->rank_4 = parts[8].at(0);
306
307 if (loci.count(id) > 0) {
308 loci[id]->snps.push_back(snp);
309 } else {
310 cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". SNP asks for nonexistent locus with ID: " << id << "\n";
311 return 0;
312 }
313
314 line_num++;
315 }
316
317 if (gzip)
318 gzclose(gz_fh);
319 else
320 fh.close();
321
322 //
323 // Finally, parse the Alleles file
324 //
325 gzip = false;
326 fh_status = 1;
327 line_num = 1;
328
329 f = sample + ".alleles.tsv";
330 fh.open(f.c_str(), ifstream::in);
331 if (fh.fail()) {
332 //
333 // Test for a gzipped file.
334 //
335 f = sample + ".alleles.tsv.gz";
336 gz_fh = gzopen(f.c_str(), "rb");
337 if (!gz_fh) {
338 cerr << " Unable to open '" << sample << "'\n";
339 return 0;
340 }
341 #if ZLIB_VERNUM >= 0x1240
342 gzbuffer(gz_fh, libz_buffer_size);
343 #endif
344 gzip = true;
345 compressed = true;
346 }
347 if (verbose)
348 cerr << " Parsing " << f.c_str() << "\n";
349
350 while (fh_status) {
351 fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);
352
353 if (!fh_status && strlen(line) == 0)
354 continue;
355
356 if (is_comment(line)) continue;
357
358 parse_tsv(line, parts);
359
360 if (parts.size() != num_alleles_fields) {
361 cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
362 return 0;
363 }
364
365 id = atoi(parts[1].c_str());
366
367 if (blacklisted.count(id))
368 continue;
369
370 if (loci.count(id) > 0) {
371 loci[id]->alleles[parts[2]] = atoi(parts[4].c_str());
372 } else {
373 cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". SNP asks for nonexistent locus with ID: " << id << "\n";
374 return 0;
375 }
376
377 line_num++;
378 }
379
380 if (gzip)
381 gzclose(gz_fh);
382 else
383 fh.close();
384
385 //
386 // Populate the strings member with the sequence for each allele for each Locus.
387 //
388 typename map<int, LocusT *>::iterator i;
389 for (i = loci.begin(); i != loci.end(); i++)
390 i->second->populate_alleles();
391
392 delete[] line;
393
394 return 1;
395 }
396
397 template <class LocusT>
dump_loci(map<int,LocusT * > & u)398 int dump_loci(map<int, LocusT *> &u) {
399 typename map<int, LocusT *>::iterator i;
400 vector<SNP *>::iterator s;
401
402 for (i = u.begin(); i != u.end(); i++) {
403
404 cerr << "Locus ID: " << i->second->id << "\n"
405 << " Consensus: " << i->second->con << "\n"
406 << " Genomic Location: " << i->second->loc.chr << "; " << i->second->loc.bp +1 << "bp\n"
407 << " SNPs:\n";
408
409 for (s = i->second->snps.begin(); s != i->second->snps.end(); s++)
410 cerr << " Col: " << (*s)->col << " rank 1: " << (*s)->rank_1 << " rank 2: " << (*s)->rank_2 << "\n";
411
412 cerr << "\n";
413 }
414
415 return 0;
416 }
417
418 #endif // __SQL_UTILITIES_H__
419