// -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
//
// Copyright 2010-2016, Julian Catchen <jcatchen@illinois.edu>
//
// This file is part of Stacks.
//
// Stacks is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Stacks is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
//

//
// sql_utilities.h -- template routines to read and write Stacks SQL file formats.
//
#ifndef __SQL_UTILITIES_H__
#define __SQL_UTILITIES_H__

#include <unordered_set>

#include "input.h"
#include "utils.h"

//
// The expected number of tab-separated fields in our SQL input files.
//
const uint num_tags_fields    = 9;
const uint num_snps_fields    = 9;
const uint num_alleles_fields = 5;
const uint num_matches_fields = 6;
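//
// For reference, load_loci() below reads tags files with the following nine
// tab-separated columns (this summary is derived from the parsing code in
// this header, not from an external format specification):
//   0  sample ID
//   1  locus (stack) ID
//   2  record type: "consensus", "model", "primary", or "secondary"
//   3  internal stack component number
//   4  read ID, or comma-separated components for a consensus record
//   5  sequence (or model string for a "model" record)
//   6  deleveraged flag ("1" = true)
//   7  blacklisted flag ("1" = true)
//   8  lumberjack-stack flag ("1" = true)
//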

void load_catalog_matches(string sample, vector<CatMatch *> &matches, bool verbose=true);
int load_model_results(string sample, map<int, ModRes *> &modres);
int load_snp_calls(string sample, map<int, SNPRes *> &snpres);

// retrieve_bijective_loci()
// ----------
// Returns pairs of (sample locus ID, catalog locus ID) for which there is a bijective relation.
vector<pair<int, int> > retrieve_bijective_loci(const vector<CatMatch*>& matches);
vector<pair<int, int> > retrieve_bijective_loci(const vector<pair<int,int>>& sloc_cloc_id_pairs);
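
//
// An illustrative usage sketch (not part of the API; `sample_prefix` is a
// hypothetical path prefix, and freeing the CatMatch objects assumes the
// caller owns them):
//
//   vector<CatMatch *> matches;
//   load_catalog_matches(sample_prefix, matches);
//
//   // Keep only (sample locus, catalog locus) pairs that match one-to-one.
//   vector<pair<int, int>> sloc_cloc = retrieve_bijective_loci(matches);
//   for (const pair<int, int> &p : sloc_cloc)
//       cerr << "sample locus " << p.first << " -> catalog locus " << p.second << "\n";
//
//   for (CatMatch *m : matches)
//       delete m;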

template <class LocusT>
int
load_loci(const string& sample, map<int, LocusT *> &loci, int store_reads, bool load_all_model_calls, bool &compressed, bool verbose=true)
{
    using namespace std;

    LocusT        *c;
    SNP           *snp;
    string         f;
    char          *cmp;
    const char    *p, *q;
    int            len;
    vector<string> parts;
    set<int>       blacklisted;
    long int       line_num;
    ifstream       fh;
    gzFile         gz_fh;

    char *line      = new char[max_len];
    int   size      = max_len;
    bool  gzip      = false;
    bool  open_fail = true;
    int   fh_status = 1;

    if (open_fail) {
        //
        // Test for a TAGs file.
        //
        f = sample + ".tags.tsv";
        fh.open(f.c_str(), ifstream::in);
        if (fh.fail())
            open_fail = true;
        else
            open_fail = false;
    }

    if (open_fail) {
        //
        // Test for a gzipped TAGs file.
        //
        f = sample + ".tags.tsv.gz";
        gz_fh = gzopen(f.c_str(), "rb");
        if (!gz_fh) {
            open_fail = true;
        } else {
            open_fail = false;
            #if ZLIB_VERNUM >= 0x1240
            gzbuffer(gz_fh, libz_buffer_size);
            #endif
            gzip       = true;
            compressed = true;
        }
    }

    if (open_fail) {
        cerr << " Unable to open '" << sample << "'\n";
        return 0;
    }

    if (verbose)
        cerr << "  Parsing " << f.c_str() << "\n";

    uint id;

    line_num = 1;
    while (fh_status) {
        fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);

        if (!fh_status && strlen(line) == 0)
            continue;

        if (is_comment(line)) continue;

        parse_tsv(line, parts);

        if (parts.size() != num_tags_fields) {
            cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
            return 0;
        }

        id = atoi(parts[1].c_str());

        if (parts[2] != "consensus") {
            if (blacklisted.count(id)) continue;

            //
            // Make sure this locus has already been defined (consensus sequence SHOULD always
            // be specified first in the file for a particular locus).
            //
            if (loci.count(id) > 0) {
                //
                // Read the model sequence, a series of letters specifying if the model called a
                // homozygous base (O), a heterozygous base (E), or if the base type was unknown (U).
                //
                if (parts[2] == "model") {
                    loci[id]->model = new char[parts[5].length() + 1];
                    strcpy(loci[id]->model, parts[5].c_str());

                } else {
                    //
                    // Otherwise, we expect a primary or secondary read, record these if specified.
                    //
                    loci[id]->depth++;

                    if (store_reads >= 1) {
                        if (store_reads >= 2) {
                            // Load the actual sequences (otherwise don't).
                            char *read = new char[parts[5].length() + 1];
                            strcpy(read, parts[5].c_str());
                            loci[id]->reads.push_back(read);
                        }

                        char *read_id = new char[parts[4].length() + 1];
                        strcpy(read_id, parts[4].c_str());
                        loci[id]->comp.push_back(read_id);
                        //
                        // Store the internal stack number for this read.
                        //
                        loci[id]->comp_cnt.push_back(atoi(parts[3].c_str()));

                        //
                        // Store the read type.
                        //
                        if (parts[2] == "primary")
                            loci[id]->comp_type.push_back(primary);
                        else
                            loci[id]->comp_type.push_back(secondary);
                    }
                }

                continue;
            } else {
                cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (stack " << id << " does not exist).\n";
                return 0;
            }
        }

        //
        // Do not include blacklisted tags in the catalog. They are tags that are composed
        // of noise and/or repetitive sequence.
        //
        if (parts[7] == "1") {
            blacklisted.insert(id);
            continue;
        }

        c = new LocusT;
        c->sample_id = atoi(parts[0].c_str());
        c->id        = id;
        c->add_consensus(parts[5].c_str());

        //
        // Read in the flags
        //
        c->deleveraged     = (parts[6] == "1" ? true : false);
        c->lumberjackstack = (parts[8] == "1" ? true : false);

        //
        // Parse the components of this stack (either the Illumina ID, or the catalog constituents)
        //
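        // parts[4] holds a comma-separated list of component identifiers; the
        // loop below copies each token into its own C string and appends it
        // to c->comp.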
        q = parts[4].c_str();
        while (*q != '\0') {
            for (p = q; *q != ',' && *q != '\0'; q++);
            len = q - p;
            cmp = new char[len + 1];
            strncpy(cmp, p, len);
            cmp[len] = '\0';
            c->comp.push_back(cmp);
            if (*q != '\0') q++;
        }

        loci[c->id] = c;

        line_num++;
    }

    if (gzip)
        gzclose(gz_fh);
    else
        fh.close();

    //
    // Next, parse the SNP file and load model calls.
    //
    gzip      = false;
    fh_status = 1;
    line_num  = 1;

    f = sample + ".snps.tsv";
    fh.open(f.c_str(), ifstream::in);
    if (fh.fail()) {
        //
        // Test for a gzipped file.
        //
        f = sample + ".snps.tsv.gz";
        gz_fh = gzopen(f.c_str(), "rb");
        if (!gz_fh) {
            cerr << " Unable to open '" << sample << "'\n";
            return 0;
        }
        #if ZLIB_VERNUM >= 0x1240
        gzbuffer(gz_fh, libz_buffer_size);
        #endif
        gzip       = true;
        compressed = true;
    }
    if (verbose)
        cerr << "  Parsing " << f.c_str() << "\n";

    while (fh_status) {
        fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);

        if (!fh_status && strlen(line) == 0)
            continue;

        if (is_comment(line)) continue;

        parse_tsv(line, parts);

        if (parts.size() != num_snps_fields) {
            cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
            return 0;
        }

        id = atoi(parts[1].c_str());

        if (blacklisted.count(id))
            continue;

        //
        // Only load heterozygous model calls, unless load_all_model_calls is set.
        //
        if (load_all_model_calls == false && parts[3] != "E")
            continue;

        snp         = new SNP;
        snp->col    = atoi(parts[2].c_str());
        snp->lratio = atof(parts[4].c_str());
        snp->rank_1 = parts[5].at(0);
        snp->rank_2 = parts[6].at(0) == '-' ? 0 : parts[6].at(0);

        if (parts[3] == "E")
            snp->type = snp_type_het;
        else if (parts[3] == "O")
            snp->type = snp_type_hom;
        else
            snp->type = snp_type_unk;

        if (parts[7].length() == 0 || parts[7].at(0) == '-')
            snp->rank_3 = 0;
        else
            snp->rank_3 = parts[7].at(0);

        if (parts[8].length() == 0 || parts[8].at(0) == '-')
            snp->rank_4 = 0;
        else
            snp->rank_4 = parts[8].at(0);

        if (loci.count(id) > 0) {
            loci[id]->snps.push_back(snp);
        } else {
            cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". SNP asks for nonexistent locus with ID: " << id << "\n";
            return 0;
        }

        line_num++;
    }

    if (gzip)
        gzclose(gz_fh);
    else
        fh.close();

    //
    // Finally, parse the Alleles file
    //
    gzip      = false;
    fh_status = 1;
    line_num  = 1;

    f = sample + ".alleles.tsv";
    fh.open(f.c_str(), ifstream::in);
    if (fh.fail()) {
        //
        // Test for a gzipped file.
        //
        f = sample + ".alleles.tsv.gz";
        gz_fh = gzopen(f.c_str(), "rb");
        if (!gz_fh) {
            cerr << " Unable to open '" << sample << "'\n";
            return 0;
        }
        #if ZLIB_VERNUM >= 0x1240
        gzbuffer(gz_fh, libz_buffer_size);
        #endif
        gzip       = true;
        compressed = true;
    }
    if (verbose)
        cerr << "  Parsing " << f.c_str() << "\n";

    while (fh_status) {
        fh_status = (gzip == true) ? read_gzip_line(gz_fh, &line, &size) : read_line(fh, &line, &size);

        if (!fh_status && strlen(line) == 0)
            continue;

        if (is_comment(line)) continue;

        parse_tsv(line, parts);

        if (parts.size() != num_alleles_fields) {
            cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". (" << parts.size() << " fields).\n";
            return 0;
        }

        id = atoi(parts[1].c_str());

        if (blacklisted.count(id))
            continue;

        if (loci.count(id) > 0) {
            loci[id]->alleles[parts[2]] = atoi(parts[4].c_str());
        } else {
            cerr << "Error parsing " << f.c_str() << " at line: " << line_num << ". Allele asks for nonexistent locus with ID: " << id << "\n";
            return 0;
        }

        line_num++;
    }

    if (gzip)
        gzclose(gz_fh);
    else
        fh.close();

    //
    // Populate the strings member with the sequence for each allele for each Locus.
    //
    typename map<int, LocusT *>::iterator i;
    for (i = loci.begin(); i != loci.end(); i++)
        i->second->populate_alleles();

    delete[] line;

    return 1;
}
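
//
// A minimal usage sketch for load_loci() (illustrative only; `Locus` stands in
// for any concrete LocusT, and "path/prefix" is a hypothetical file prefix).
// As implemented above, store_reads is interpreted as: 0 = record depth only,
// 1 = also keep read IDs and types, 2 = also keep the read sequences.
//
//   map<int, Locus *> loci;
//   bool compressed = false;
//   if (load_loci("path/prefix", loci, 1, false, compressed) != 1)
//       cerr << "Unable to load loci for 'path/prefix'.\n";
//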

template <class LocusT>
int dump_loci(map<int, LocusT *> &u) {
    typename map<int, LocusT *>::iterator i;
    vector<SNP *>::iterator      s;

    for (i = u.begin(); i != u.end(); i++) {

        cerr << "Locus ID:    " << i->second->id << "\n"
             << "  Consensus: " << i->second->con << "\n"
             << "  Genomic Location: " << i->second->loc.chr << "; " << i->second->loc.bp + 1 << "bp\n"
             << "  SNPs:\n";

        for (s = i->second->snps.begin(); s != i->second->snps.end(); s++)
            cerr << "    Col: " << (*s)->col << " rank 1: " << (*s)->rank_1 << " rank 2: " << (*s)->rank_2 << "\n";

        cerr << "\n";
    }

    return 0;
}

#endif // __SQL_UTILITIES_H__