1############################################################################ 2# demotic_fasta package 3# Parses fasta or ssearch output, stores extracted information in convenient vars. 4# SRE, Wed Jun 25 13:41:41 2003 5# 6############################################################################ 7 8package demotic_fasta; 9 10# parse(\*STDIN) would parse ssearch output 11# coming in via stdin. 12# 13sub parse (*) { 14 my $fh = shift; 15 my $parsing_header = 1; 16 my $parsing_hitlist = 0; 17 my $parsing_alilist = 0; 18 my $target; 19 my $alilinecount = 0; 20 my $prvaliline_isquery = 0; 21 my $ali_qline; 22 my $ali_tline; 23 my $ali_qasq; 24 my $ali_tasq; 25 my $margin; 26 27 # Initialize everything... so we can call the parser 28 # repeatedly, one call per ssearch output. 29 # 30 # This section also documents what's available in 31 # variables within the package. 32 # 33 # Arrays are indexed [0..nhits-1] or [0..nali-1] 34 # 35 $queryname = ""; # Name of the query sequence 36 $querydesc = ""; # Description line for the query (or "") 37 $querylen = 0; # Length of the query in residues 38 $db = ""; # Name of the target database file 39 $db_nseq = 0; # Number of sequences in the target database 40 $db_nletters = ""; # Number of residues in the target database 41 # (stored as a string so we can have large #'s) 42 43 # The top hit list (still in rank order) 44 $nhits = 0; # Number of entries in the hit list 45 @hit_target = (); # Target sequence name (by rank) 46 %target_desc = (); # Target sequence description (by targ name) 47 %target_len = (); # Length of target sequence 48 @hit_score = (); # Raw score (by rank) 49 @hit_bitscore = (); # Bit score (by rank) 50 @hit_Eval = (); # E-value (by rank) 51 52 # The alignment output (in order it came in) 53 # all indexed by # of alignment [0..nali-1] 54 $nali = 0; # Number of alignments 55 @ali_target = (); # Target sequence name 56 @ali_score = (); # Smith/Waterman raw score of alignment 57 @ali_bitscore = (); # bit score 58 @ali_evalue = (); # E-value 59 @ali_nident = (); # Number of identical residues 60 @ali_alen = (); # Length of alignment (overlap) 61 @ali_identity = (); # Percent identity 62# @ali_npos = (); # Number of positives (similar positions) 63# @ali_positive = (); # Percent of positions matched or similar 64 @ali_qstart = (); # Start position on query 65 @ali_qend = (); # End position on query 66 @ali_tstart = (); # Start position on target 67 @ali_tend = (); # End position on target 68 @ali_qali = (); # Aligned string from query 69 @ali_tali = (); # Aligned string from target (subject) 70 @ali_qmask = (); # line in between the two aligned strings 71 @ali_tmask = (); # line in between the two aligned strings 72 @ali_hitidx = (); # index of hit 73 $hitidx = -1; 74 75 if (defined($save_querycount) && $save_querycount > 1) { # on subsequent queries, we had to use the >>> start line to detect 76 # the end of the prev query; we socked the necessary info away in tmp vars. 77 $querycount = $save_querycount; 78 $queryname = $save_queryname; 79 $querydesc = $save_querydesc; 80 $querylen = $save_querylen; 81 } 82 83 # Now, loop over the lines of our input, and parse 'em. 84 # 85 my $line; 86 my $prvline; 87 while (<$fh>) { 88 $line = $_; 89 if ($parsing_header) { 90 if (/^The best scores are:/) { # start of hit list 91 $parsing_header = 0; 92 $parsing_hitlist = 1; 93 next; 94 } elsif (/^!! No sequences/) { # no hit list: no hits; just return 95 return 1; # return "success" 96 } elsif (/^\s+(\d+)>>>\s*(\S*)\s*(.*)\s*-\s*(\d+) \S\S$/) { # allows blank query. \S\S is "nt" or "aa" 97 $querycount = $1; 98 $queryname = $2; 99 $querydesc = $3; 100 $querylen = $4; 101 if ($queryname eq "") { 102 $queryname = "unnamed_query"; 103 } 104 } elsif (/^\s+(\d+)\s+residues in\s+(\d+)\s+sequences\s*$/) { 105 $db_nletters = $1; 106 $db_nseq = $2; 107 } 108 } 109 elsif ($parsing_hitlist) { 110 if (/^\s*$/) { # blank line marks end of hit list, start of alignments 111 $parsing_hitlist = 0; 112 $parsing_alilist = 1; 113 next; 114 } elsif (/^(\S+)\s*(.*\S?)\s*\(\s*(\d+)\)\s+(\d+)\s+(\S+)\s+(\S+)\s*$/) { 115 $hit_target[$nhits] = $1; 116 $target_desc{$1} = $2; 117 $target_len{$1} = $3; 118 $hit_score[$nhits] = $4; 119 $hit_bitscore[$nhits] = $5; 120 $hit_Eval[$nhits] = $6; 121 $nhits++; 122 } 123 } 124 elsif ($parsing_alilist) { 125 if (/^>>(\S+)\s*(.*)\s+\((\d+) \S\S\)\s*$/) { # the \S\S is either nt or aa 126 $target = $1; 127 $hitidx ++; 128 $target_desc{$target} = $2; 129 if ($3 != $target_len{$target}) { die "can't happen.", "1)", $3, "2)", $target_len{$target}; } 130 } 131 elsif (/^ s-w opt:\s+(\d+)\s+Z-score:\s*(\S+)\s+bits:\s+(\S+)\s+E\(\d*\):\s+(\S+)\s*$/) { # SSEARCH 132 $nali++; 133 $ali_target[$nali-1] = $target; 134 $ali_score[$nali-1] = $1; 135 $ali_bitscore[$nali-1] = $3; 136 $ali_evalue[$nali-1] = $4; 137 $ali_hitidx[$nali-1] = $hitidx; 138 } 139 elsif (/^ initn:\s*\d+\s*init1:\s*\d+\s*opt:\s*(\d+)\s*Z-score:\s*(\S+)\s*bits:\s*(\S+)\s*E\(\d*\):\s*(\S+)\s*$/) { # FASTA 140 $nali++; 141 $ali_target[$nali-1] = $target; 142 $ali_score[$nali-1] = $1; 143 $ali_bitscore[$nali-1] = $3; 144 $ali_evalue[$nali-1] = $4; 145 $ali_hitidx[$nali-1] = $hitidx; 146 } 147 elsif (/^Smith-Waterman score:\s+(\d+);\s+(\S+)% identity .* in (\d+) \S\S overlap \((\d+)-(\d+):(\d+)-(\d+)\)\s*/) { 148 $ali_identity[$nali-1] = $2; 149 $ali_alen[$nali-1] = $3; 150 $ali_qstart[$nali-1] = $4; 151 $ali_qend[$nali-1] = $5; 152 $ali_tstart[$nali-1] = $6; 153 $ali_tend[$nali-1] = $7; 154# $ali_nident[$nali-1] = $1; 155# $ali_npos[$nali-1] = $4; 156# $ali_positive[$nali-1] = $5; 157 $alilinecount = 0; 158 $ali_qali[$nali-1] = ""; 159 $ali_tali[$nali-1] = ""; 160 $ali_qmask[$nali-1] = ""; 161 $ali_tmask[$nali-1] = ""; 162 } 163 elsif (/^(\S+)\s+(\S+)\s*$/) { # only ali lines are right-flushed 164 # the usual alignment display is 165 # ali_query_line 166 # mask 167 # ali_target_line 168 # 169 # (1) ends are not flushed, and they can have "extra stuff" 170 # function calculate_flushedmask() corrects that 171 # 172 # (2) alingments do not need to be complete. (particularly using option -a ) 173 # Meaning at the end AND at the beginning as well, you can end up with one single query line 174 # or one single target line. 175 # 176 # ali_query_line 177 # (no mask) 178 # (no ali_target_line) 179 # 180 # or 181 # 182 # (no ali_query_line) 183 # (no mask) 184 # ali_target_line 185 # 186 # or even 187 # 188 # aliq_query_line 189 # (no mask) 190 # ali_target_Line 191 # 192 # why? oh! why? check for that and fix it. 193 194 my $name = $1; 195 my $asq = $2; 196 197 #carefull, the alignment part, truncates the names of the query and targets 198 # this is a problem specially if the query and target names start similarly. 199 # it appears that querynames have been truncated to 5 characters and targetnames to 6 200 # also check for a prvline with numbers, but if len < 10 those do not show up either 201 if ($queryname =~ /^$name/ && (length($name) <= 5 || $prvline =~ /\s+(\d+)\s*/)) { 202 $prvaliline_isquery = 1; 203 $ali_qline = $_; $ali_qline =~ s/\n//; 204 $ali_qasq = $asq; 205 $mask = ""; 206 } 207 elsif ($ali_target[$nali-1] =~ /^$name/) { 208 $talilinecount ++; 209 $ali_tline = $_; $ali_tline =~ s/\n//; 210 $ali_tasq = $asq; 211 if ($prvaliline_isquery) { 212 $ali_qali[$nali-1] .= $ali_qasq; 213 $ali_tali[$nali-1] .= $ali_tasq; 214 $ali_qmask[$nali-1] .= calculate_flushedmask($ali_qline, $mask); 215 $ali_tmask[$nali-1] .= calculate_flushedmask($ali_tline, $mask); 216 } 217 $prvaliline_isquery = 0; 218 } 219 $alilinecount++; 220 } 221 elsif (/^(\s*[\.\:][\s\.\:]*)$/) { 222 $mask .= $1; 223 } 224 225 elsif (/^\s+(\d+)>>>\s*(\S*)\s*(.*)\s*-\s*(\d+) \S\S$/) { # next query is starting. \S\S is "nt" or "aa" 226 $save_querycount = $1; 227 $save_queryname = $2; 228 $save_querydesc = $3; 229 $save_querylen = $4; 230 if ($save_queryname eq "") { $save_queryname = "unnamed_query"; } 231 return 1; # normal return. We've finished output for a query, and stored some info about the next one. 232 } 233 } 234 $prvline = $line; 235 } # this closes the loop over lines in the input stream: at EOF. 236 237 if ($parsing_alilist) { 238 for (my $ali = 0; $ali < $nali; $ali ++) { 239 # the ali lines come along with more residues that are not part of the alignment. Why? oh! why?. REMOVE 240 # you cannot remove using ali_{q,t}start and $ali_{q,t}end because those are 241 # relative to the full sequence. Need to do it by parsing also the line in between (what I call the "mask") 242 # and finding the first and last ":" or "." 243 $ali_qali[$ali] = ali_removeflanking($ali_qali[$ali], $ali_qmask[$ali], $ali_qstart[$ali], $ali_qend[$ali]); 244 $ali_tali[$ali] = ali_removeflanking($ali_tali[$ali], $ali_tmask[$ali], $ali_tstart[$ali], $ali_tend[$ali]); 245 } 246 return 1; 247 } 248 else { return 0; } # at EOF: normal return if we were in the alignment section. 249} 250 251 252 253sub exblxout { 254 my $ofh = shift; 255 my $i; 256 257 for ($i = 0; $i <= $nali-1; $i++) { 258 printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\n", 259 $ali_evalue[$i], 260 $ali_identity[$i], 261 $ali_tstart[$i], 262 $ali_tend[$i], 263 $ali_target[$i], 264 $ali_qstart[$i], 265 $ali_qend[$i], 266 $queryname; 267 } 268} 269 270sub tblout { 271 my $ofh = shift; 272 my $i; 273 274 for ($i = 0; $i <= $nali-1; $i++) { 275 printf $ofh "%s\t%d\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", 276 $ali_evalue[$i], 277 $ali_identity[$i], 278 $ali_qstart[$i], 279 $ali_qend[$i], 280 $queryname, 281 $ali_tstart[$i], 282 $ali_tend[$i], 283 $ali_target[$i], 284 $target_desc{$ali_target[$i]}; 285 } 286} 287 288 289sub gffout { 290 my $ofh = shift; 291 my $source = shift; 292 my $feature = shift; 293 my $i; 294 my $strand; 295 296 for ($i = 0; $i <= $nali-1; $i++) { 297 if ($ali_qstart[$i] > $ali_qend[$i]) { $strand = "-"; } 298 else { $strand = "+"; } 299 300 printf $ofh "%s\t%s\t%s\t%d\t%d\t%.1f\t%s\t.\tgene \"%s\"\n", 301 $ali_target[$i], 302 $source, 303 $feature, 304 $ali_tstart[$i], 305 $ali_tend[$i], 306 $ali_bitscore[$i], 307 $strand, 308 $queryname; 309 } 310} 311 312sub profmarkout { 313 my $ofh = shift; 314 my $i; 315 316 for ($i = 0; $i < $nhits; $i++) { 317 printf $ofh "%g\t%.1f\t%s\t%s\n", $hit_Eval[$i], $hit_bitscore[$i], $hit_target[$i], $queryname; 318 } 319} 320 321sub calculate_flushedmask { 322 my ($asq, $mask) = @_; 323 324 my $lremove = -1; 325 my $flushedasq; 326 327 if ($asq =~ /^(\S+\s+)(\S+)\s*$/) { 328 $lremove = length($1); 329 $flushedasq = $2; 330 } 331 332 my $flushedmask = $mask; 333 $flushedmask =~ s/^(.{$lremove})//; 334 $flushedmask =~ s/\n//; 335 336 if (length($flushedmask) > length($flushedasq)) { 337 while (length($flushedmask) > length($flushedasq)) { 338 if ($flushedmask =~ /(\s)$/) { $flushedmask =~ s/(\s)$//; } 339 } 340 } 341 if (length($flushedmask) < length($flushedasq)) { 342 while (length($flushedmask) < length($flushedasq)) { $flushedmask .= " "; } 343 } 344 345 #printf("\naseq|$asq|\nmask|$mask|\n"); 346 #printf("^^aseq|$flushedasq|\n^^mask|$flushedmask|\n"); 347 348 return $flushedmask; 349} 350 351 352sub ali_removeflanking { 353 my ($aseq, $mask) = @_; 354 355 my $taseq = ""; 356 357 my $alen = length($aseq); 358 359 my $apos_start = 0; 360 while ($apos_start < $alen) { 361 my $mval = substr($mask, $apos_start, 1); 362 if ($mval =~ /[\.\:]/) { last; } 363 $apos_start ++; 364 } 365 366 my $apos_end = $alen-1; 367 while ($apos_end >= 0) { 368 my $mval = substr($mask, $apos_end, 1); 369 if ($mval =~ /[\.\:]/) { last; } 370 $apos_end --; 371 } 372 373 for (my $apos = $apos_start; $apos <= $apos_end; $apos++) { 374 $taseq .= substr($aseq, $apos, 1); 375 } 376 #print "B:$aseq\nM:$mask\nA:$taseq\n"; 377 378 return $taseq; 379} 380 381 3821; 383__END__ 384