1#!/usr/bin/perl -w 2 3# Contact: lh3 4# Version: 0.1.3 5 6#Modified by Zayed Albertyn(zayed.albertyn@gmail.com) & Colin Hercus(colin@novocraft.com) 7 8#use strict; 9#use warnings; 10use Data::Dumper; 11use Getopt::Std; 12 13&novo2sam; 14exit; 15 16sub mating { 17 my ($s1, $s2) = @_; 18 my $isize = 0; 19 if ($s1->[2] ne '*' && $s1->[2] eq $s2->[2]) { # then calculate $isize 20 my $x1 = ($s1->[1] & 0x10)? $s1->[3] + length($s1->[9]) : $s1->[3]; 21 my $x2 = ($s2->[1] & 0x10)? $s2->[3] + length($s2->[9]) : $s2->[3]; 22 $isize = $x2 - $x1; 23 } 24 # update mate coordinate 25 if ($s2->[2] ne '*') { 26 @$s1[6..8] = (($s2->[2] eq $s1->[2])? "=" : $s2->[2], $s2->[3], $isize); 27 $s1->[1] |= 0x20 if ($s2->[1] & 0x10); 28 } else { 29 $s1->[1] |= 0x8; 30 } 31 if ($s1->[2] ne '*') { 32 @$s2[6..8] = (($s1->[2] eq $s2->[2])? "=" : $s1->[2], $s1->[3], -$isize); 33 $s2->[1] |= 0x20 if ($s1->[1] & 0x10); 34 } else { 35 $s2->[1] |= 0x8; 36 } 37} 38 39sub novo2sam { 40 my %opts = (); 41 getopts("p", \%opts); 42 die("Usage: novo2sam.pl [-p] <aln.novo>\n") if (@ARGV == 0); 43 my $is_paired = defined($opts{p}); 44 # core loop 45 my @s1 = (); 46 my @s2 = (); 47 my ($s_last, $s_curr) = (\@s1, \@s2); 48 while (<>) { 49 next if (/^#/); 50 next if (/(QC|NM)\s*$/ || /(R\s+\d+)\s*$/); 51 &novo2sam_aux($_, $s_curr, $is_paired); 52 if (@$s_last != 0 && $s_last->[0] eq $s_curr->[0]) { 53 &mating($s_last, $s_curr); 54 print join("\t", @$s_last), "\n"; 55 print join("\t", @$s_curr), "\n"; 56 @$s_last = (); @$s_curr = (); 57 } else { 58 print join("\t", @$s_last), "\n" if (@$s_last != 0); 59 my $s = $s_last; $s_last = $s_curr; $s_curr = $s; 60 } 61 } 62 print join("\t", @$s_last), "\n" if (@$s_last != 0); 63} 64 65sub novo2sam_aux { 66 my ($line, $s, $is_paired) = @_; 67 68 chomp($line); 69 my @t = split(/\s+/, $line); 70 my @variations = @t[13 .. $#t]; 71 @$s = (); 72 return if ($t[4] ne 'U'); 73 my $len = length($t[2]); 74 # read name 75 $s->[0] = substr($t[0], 1); 76 $s->[0] =~ s/\/[12]$//g; 77 # initial flag (will be updated later) 78 $s->[1] = 0; 79 $s->[1] |= 1 | 1<<($t[1] eq 'L'? 6 : 7); 80 $s->[1] |= 2 if ($t[10] eq '.'); 81 # read & quality 82 if ($t[9] eq 'R') { 83 $s->[9] = reverse($t[2]); 84 $s->[10] = reverse($t[3]); 85 $s->[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/; 86 } else { 87 $s->[9] = $t[2]; $s->[10] = $t[3]; 88 } 89 # cigar 90 my $cigarstring =""; 91 if (scalar @variations ==0 ) { 92 $s->[5] = $len . "M"; # IMPORTANT: this cigar is not correct for gapped alignment 93 } else { 94 #convert to correct CIGAR 95 my $tmpstr = join" ",@variations ; 96 if ( $tmpstr=~ /\+|\-/ ) { 97 $cigarstring = cigar_method($line,\@variations,$len); 98 $s->[5]=$cigarstring; 99 } else { 100 $s->[5]=$len. "M"; 101 } 102} 103 104# coor 105 $s->[2] = substr($t[7], 1); $s->[3] = $t[8]; 106 $s->[1] |= 0x10 if ($t[9] eq 'R'); 107 # mapQ 108 $s->[4] = $t[5] > $t[6]? $t[5] : $t[6]; 109 # mate coordinate 110 $s->[6] = '*'; $s->[7] = $s->[8] = 0; 111 # aux 112 push(@$s, "NM:i:".(@t-13)); 113 my $md = ''; 114 $md = mdtag($md,$line,\@variations,$len); 115 push(@$s, "MD:Z:$md"); 116 117} 118 119sub mdtag { 120 my $oldmd = shift; 121 my $line = shift; 122 my $ref =shift; 123 my $rdlen = shift; 124 my @variations = @$ref; 125 my $string=""; 126 my $mdtag=""; 127 my $t=1; 128 my $q=1; 129 my $deleteflag=0; 130 my $len =0; 131 foreach $string (@variations) { 132 my ($indeltype,$insert) = indeltype($string); 133 if ($indeltype eq "+") { 134 $len = length ($insert); 135 $q+=$len; 136 next; 137 } 138 my $pos = $1 if $string =~ /^(\d+)/; 139 $len = $pos - $t; 140 if ($len !=0 || ($deleteflag eq 1 && $indeltype eq ">")) { 141 $mdtag.=$len; 142 } 143 $t+=$len; 144 $q+=$len; 145 if ($indeltype eq ">") { 146 $mdtag.=$insert; 147 $deleteflag=0; 148 $t+=1; 149 $q+=1; 150 } 151 if ($indeltype eq "-") { 152 my $deletedbase = $2 if $string =~ /(\d+)\-([A-Za-z]+)/; 153 if ($deleteflag == 0 ) { 154 $mdtag.="^"; 155 } 156 $mdtag.=$deletedbase; 157 $deleteflag=1; 158 $t+=1; 159 } 160 } 161 $len = $rdlen - $q + 1; 162 if ($len > 0) { 163 $mdtag.="$len"; 164 } 165# print "In:$line\n"; 166# print "MD: OLD => NEW\nMD: $oldmd => $mdtag\n\n"; 167 168 return $mdtag; 169} 170 171sub indeltype { 172 my $string = shift; 173 my $insert=""; 174 my $indeltype; 175 if ($string =~ /([A-Za-z]+)\>/) { 176 $indeltype=">"; 177 $insert=$1; 178 } elsif ($string =~ /\-/) { 179 $indeltype="-"; 180 } elsif ($string =~ /\+([A-Za-z]+)/) { 181 $indeltype="+"; 182 $insert=$1; 183 } 184 return ($indeltype,$insert); 185 186} 187 188 189sub cigar_method { 190 my $line = shift; 191 my $ref =shift; 192 my $rdlen = shift; 193 my @variations = @$ref; 194 my $string=""; 195 my $type=""; 196 my $t =1; 197 my $q=1; 198 my $indeltype=""; 199 my $cigar= ""; 200 my $insert = ""; 201 my $len=0; 202 my @cig=(); 203 foreach $string (@variations) { 204 next if $string =~ />/; 205 my $pos = $1 if $string =~ /^(\d+)/; 206 207 if ($string =~ /\+([A-Za-z]+)/) { 208 $indeltype="+"; 209 $insert = $1; 210 }elsif ($string =~ /\-([A-Za-z]+)/) { 211 $indeltype="-"; 212 $insert = $1; 213 } 214#print "$pos $indeltype $insert $t $q\n"; 215 $len = $pos - $t; 216 if ( $len > 0) { 217 $cigar.=$len."M"; 218 push(@cig,$len."M"); 219 } 220 $t+=$len; 221 $q+=$len; 222 223 if ($indeltype eq "-") { 224 $cigar.="D"; 225 push(@cig,"D"); 226 $t++; 227 } 228 if ($indeltype eq "+") { 229 $len = length ($insert); 230 if ($len == 1) { 231 $cigar.="I"; 232 push(@cig,"I"); 233 } 234 if ($len > 1) { 235 $cigar.=$len."I"; 236 push(@cig,$len."I") 237 } 238 $q+=$len; 239 } 240 $insert=""; 241 } 242 $len= $rdlen - $q + 1; 243 if ($len > 0) { 244 $cigar.=$len."M"; 245 push(@cig,$len."M"); 246 } 247 248 $cigar = newcigar($cigar,'D'); 249 $cigar = newcigar($cigar,'I'); 250 251 #print "$line\n"; 252 #print "c CIGAR:\t$cigar\n\n"; 253 return $cigar; 254 255} 256 257 258 259sub newcigar { 260 my $cigar = shift; 261 my $char = shift; 262 my $new = ""; 263 my $copy = $cigar; 264#print "$cigar\n"; 265 $copy =~ s/^($char+)/$1;/g; 266#print "$copy\n"; 267 $copy =~ s/([^0-9$char])($char+)/$1;$2;/g; 268#print "$copy\n"; 269 my @parts = split(/;/,$copy); 270 my $el=""; 271 foreach $el (@parts) { 272#print "$el\n"; 273 if ($el =~ /^$char+$/) { 274 $new.=length($el).$char; 275 }else { 276 $new.=$el; 277 } 278 279 } 280 return $new; 281} 282