1 
2 /******************************************************************************
3  *
4  *  This file is part of canu, a software program that assembles whole-genome
5  *  sequencing reads into contigs.
6  *
7  *  This software is based on:
8  *    'Celera Assembler' r4587 (http://wgs-assembler.sourceforge.net)
9  *    the 'kmer package' r1994 (http://kmer.sourceforge.net)
10  *
11  *  Except as indicated otherwise, this is a 'United States Government Work',
12  *  and is released in the public domain.
13  *
14  *  File 'README.licenses' in the root directory of this distribution
15  *  contains full conditions and disclaimers.
16  */
17 
18 #include "runtime.H"
19 #include "ovStore.H"
20 #include "strings.H"
21 
22 #include <vector>
23 
24 int
main(int argc,char ** argv)25 main(int argc, char **argv) {
26   char                *outName  = NULL;
27   char                *seqName  = NULL;
28   bool                 partialOverlaps = false;
29   uint32               minOverlapLength = 0;
30   double               erate = 0;
31 
32   std::vector<char *>  files;
33 
34   argc = AS_configure(argc, argv, 1);
35 
36   int32     arg = 1;
37   int32     err = 0;
38   while (arg < argc) {
39     if        (strcmp(argv[arg], "-o") == 0) {
40       outName = argv[++arg];
41 
42     } else if (strcmp(argv[arg], "-S") == 0) {
43       seqName = argv[++arg];
44 
45     } else if (strcmp(argv[arg], "-partial") == 0) {
46       partialOverlaps = true;
47 
48    } else if (strcmp(argv[arg], "-e") == 0) {
49       erate = atof(argv[++arg]);
50 
51     } else if (strcmp(argv[arg], "-len") == 0) {
52       minOverlapLength = atoi(argv[++arg]);
53 
54     } else if (fileExists(argv[arg])) {
55       files.push_back(argv[arg]);
56 
57     } else {
58       fprintf(stderr, "ERROR:  invalid arg '%s'\n", argv[arg]);
59       err++;
60     }
61 
62     arg++;
63   }
64 
65   if ((err) || (seqName == NULL) || (outName == NULL) || (files.size() == 0)) {
66     fprintf(stderr, "usage: %s [options] file.mhap[.gz]\n", argv[0]);
67     fprintf(stderr, "\n");
68     fprintf(stderr, "  Converts mhap native output to ovb\n");
69     fprintf(stderr, "\n");
70     fprintf(stderr, "  -o out.ovb     output file\n");
71     fprintf(stderr, "\n");
72 
73     if (seqName == NULL)
74       fprintf(stderr, "ERROR:  no seqStore (-S) supplied\n");
75     if (files.size() == 0)
76       fprintf(stderr, "ERROR:  no overlap files supplied\n");
77 
78     exit(1);
79   }
80 
81   char        *ovStr = new char [1024*1024];
82 
83   sqStore    *seqStore = new sqStore(seqName);
84   ovOverlap   ov;
85   ovFile      *of = new ovFile(seqStore, outName, ovFileFullWrite);
86 
87   for (uint32 ff=0; ff<files.size(); ff++) {
88     compressedFileReader  *in = new compressedFileReader(files[ff]);
89 
90     //  $1        $2     $3     $4     $5     $6         $7      $8    $9     $10      $11          $12        $13
91     //  0         1      2      3      4      5          6       7     8      9        10           11         12
92     //  aiid      alen   bgn    end    bori   biid       blen    bgn   end    #match   minimizers   alnlen     cm:i:errori
93     //  read1	5064	0	5060	+	read164	7384	138	5251	4763	5144	0	tp:A:S	cm:i:1410	s1:i:4754	dv:f:0.0142
94     //
95 
96     while (fgets(ovStr, 1024*1024, in->file()) != NULL) {
97       splitToWords  W(ovStr);
98 
99       ov.a_iid = atoi(W[0]+4);
100       ov.b_iid = atoi(W[5]+4);
101 
102       if (ov.a_iid == ov.b_iid)
103         continue;
104 
105       ov.dat.ovl.ahg5 = W.toint32(2);
106       ov.dat.ovl.ahg3 = W.toint32(1) - W.toint32(3);
107 
108       if (W[4][0] == '+') {
109         ov.dat.ovl.bhg5 = W.toint32(7);
110         ov.dat.ovl.bhg3 = W.toint32(6) - W.toint32(8);
111         ov.flipped(false);
112       } else {
113         ov.dat.ovl.bhg3 = W.toint32(7);
114         ov.dat.ovl.bhg5 = W.toint32(6) - W.toint32(8);
115         ov.flipped(true);
116       }
117 
118       ov.erate((double)atof(W[15]+5));
119 
120       //  Check the overlap - the hangs must be less than the read length.
121 
122       uint32  alen = seqStore->sqStore_getReadLength(ov.a_iid);
123       uint32  blen = seqStore->sqStore_getReadLength(ov.b_iid);
124 
125       if ((alen < ov.dat.ovl.ahg5 + ov.dat.ovl.ahg3) ||
126           (blen < ov.dat.ovl.bhg5 + ov.dat.ovl.bhg3))
127         fprintf(stderr, "INVALID OVERLAP " F_U32 " (len %6d) " F_U32 " (len %6d) hangs " F_OV " " F_OV " - " F_OV " " F_OV "%s\n",
128                 ov.a_iid, alen,
129                 ov.b_iid, blen,
130                 ov.dat.ovl.ahg5, ov.dat.ovl.ahg3,
131                 ov.dat.ovl.bhg5, ov.dat.ovl.bhg3,
132                 (ov.dat.ovl.flipped) ? " flipped" : ""), exit(1);
133 
134       ov.dat.ovl.forUTG = (partialOverlaps == false) && (ov.overlapIsDovetail() == true);;
135       ov.dat.ovl.forOBT = partialOverlaps;
136       ov.dat.ovl.forDUP = partialOverlaps;
137 
138       // check the length is big enough
139       if (ov.a_end() - ov.a_bgn() < minOverlapLength || ov.b_end() - ov.b_bgn() < minOverlapLength) {
140          continue;
141       }
142       // check if the erate is OK
143       if (ov.erate() > erate) {
144          continue;
145       }
146       //  Overlap looks good, write it!
147 
148       of->writeOverlap(&ov);
149     }
150 
151     arg++;
152   }
153 
154   delete    of;
155   delete [] ovStr;
156 
157   delete seqStore;
158 
159   exit(0);
160 }
161