1
2 /******************************************************************************
3 *
4 * This file is part of canu, a software program that assembles whole-genome
5 * sequencing reads into contigs.
6 *
7 * This software is based on:
8 * 'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9 * the 'kmer package' r1994 (http://kmer.sourceforge.net)
10 *
11 * Except as indicated otherwise, this is a 'United States Government Work',
12 * and is released in the public domain.
13 *
14 * File 'README.licenses' in the root directory of this distribution
15 * contains full conditions and disclaimers.
16 */
17
18 #include "runtime.H"
19 #include "ovStore.H"
20 #include "strings.H"
21
22 #include <vector>
23
24 int
main(int argc,char ** argv)25 main(int argc, char **argv) {
26 char *outName = NULL;
27 char *seqName = NULL;
28 bool partialOverlaps = false;
29 uint32 minOverlapLength = 0;
30 double erate = 0;
31
32 std::vector<char *> files;
33
34 argc = AS_configure(argc, argv, 1);
35
36 int32 arg = 1;
37 int32 err = 0;
38 while (arg < argc) {
39 if (strcmp(argv[arg], "-o") == 0) {
40 outName = argv[++arg];
41
42 } else if (strcmp(argv[arg], "-S") == 0) {
43 seqName = argv[++arg];
44
45 } else if (strcmp(argv[arg], "-partial") == 0) {
46 partialOverlaps = true;
47
48 } else if (strcmp(argv[arg], "-e") == 0) {
49 erate = atof(argv[++arg]);
50
51 } else if (strcmp(argv[arg], "-len") == 0) {
52 minOverlapLength = atoi(argv[++arg]);
53
54 } else if (fileExists(argv[arg])) {
55 files.push_back(argv[arg]);
56
57 } else {
58 fprintf(stderr, "ERROR: invalid arg '%s'\n", argv[arg]);
59 err++;
60 }
61
62 arg++;
63 }
64
65 if ((err) || (seqName == NULL) || (outName == NULL) || (files.size() == 0)) {
66 fprintf(stderr, "usage: %s [options] file.mhap[.gz]\n", argv[0]);
67 fprintf(stderr, "\n");
68 fprintf(stderr, " Converts mhap native output to ovb\n");
69 fprintf(stderr, "\n");
70 fprintf(stderr, " -o out.ovb output file\n");
71 fprintf(stderr, "\n");
72
73 if (seqName == NULL)
74 fprintf(stderr, "ERROR: no seqStore (-S) supplied\n");
75 if (files.size() == 0)
76 fprintf(stderr, "ERROR: no overlap files supplied\n");
77
78 exit(1);
79 }
80
81 char *ovStr = new char [1024*1024];
82
83 sqStore *seqStore = new sqStore(seqName);
84 ovOverlap ov;
85 ovFile *of = new ovFile(seqStore, outName, ovFileFullWrite);
86
87 for (uint32 ff=0; ff<files.size(); ff++) {
88 compressedFileReader *in = new compressedFileReader(files[ff]);
89
90 // $1 $2 $3 $4 $5 $6 $7 $8 $9 $10 $11 $12 $13
91 // 0 1 2 3 4 5 6 7 8 9 10 11 12
92 // aiid alen bgn end bori biid blen bgn end #match minimizers alnlen cm:i:errori
93 // read1 5064 0 5060 + read164 7384 138 5251 4763 5144 0 tp:A:S cm:i:1410 s1:i:4754 dv:f:0.0142
94 //
95
96 while (fgets(ovStr, 1024*1024, in->file()) != NULL) {
97 splitToWords W(ovStr);
98
99 ov.a_iid = atoi(W[0]+4);
100 ov.b_iid = atoi(W[5]+4);
101
102 if (ov.a_iid == ov.b_iid)
103 continue;
104
105 ov.dat.ovl.ahg5 = W.toint32(2);
106 ov.dat.ovl.ahg3 = W.toint32(1) - W.toint32(3);
107
108 if (W[4][0] == '+') {
109 ov.dat.ovl.bhg5 = W.toint32(7);
110 ov.dat.ovl.bhg3 = W.toint32(6) - W.toint32(8);
111 ov.flipped(false);
112 } else {
113 ov.dat.ovl.bhg3 = W.toint32(7);
114 ov.dat.ovl.bhg5 = W.toint32(6) - W.toint32(8);
115 ov.flipped(true);
116 }
117
118 ov.erate((double)atof(W[15]+5));
119
120 // Check the overlap - the hangs must be less than the read length.
121
122 uint32 alen = seqStore->sqStore_getReadLength(ov.a_iid);
123 uint32 blen = seqStore->sqStore_getReadLength(ov.b_iid);
124
125 if ((alen < ov.dat.ovl.ahg5 + ov.dat.ovl.ahg3) ||
126 (blen < ov.dat.ovl.bhg5 + ov.dat.ovl.bhg3))
127 fprintf(stderr, "INVALID OVERLAP " F_U32 " (len %6d) " F_U32 " (len %6d) hangs " F_OV " " F_OV " - " F_OV " " F_OV "%s\n",
128 ov.a_iid, alen,
129 ov.b_iid, blen,
130 ov.dat.ovl.ahg5, ov.dat.ovl.ahg3,
131 ov.dat.ovl.bhg5, ov.dat.ovl.bhg3,
132 (ov.dat.ovl.flipped) ? " flipped" : ""), exit(1);
133
134 ov.dat.ovl.forUTG = (partialOverlaps == false) && (ov.overlapIsDovetail() == true);;
135 ov.dat.ovl.forOBT = partialOverlaps;
136 ov.dat.ovl.forDUP = partialOverlaps;
137
138 // check the length is big enough
139 if (ov.a_end() - ov.a_bgn() < minOverlapLength || ov.b_end() - ov.b_bgn() < minOverlapLength) {
140 continue;
141 }
142 // check if the erate is OK
143 if (ov.erate() > erate) {
144 continue;
145 }
146 // Overlap looks good, write it!
147
148 of->writeOverlap(&ov);
149 }
150
151 arg++;
152 }
153
154 delete of;
155 delete [] ovStr;
156
157 delete seqStore;
158
159 exit(0);
160 }
161