1#!/usr/local/bin/perl 2 3# reformat.pl 4# Reformat a multiple alignment file 5# 6# HHsuite version 3.0.0 (15-03-2015) 7# 8# Reference: 9# Remmert M., Biegert A., Hauser A., and Soding J. 10# HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. 11# Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). 12 13# (C) Johannes Soeding, 2012 14 15# This program is free software: you can redistribute it and/or modify 16# it under the terms of the GNU General Public License as published by 17# the Free Software Foundation, either version 3 of the License, or 18# (at your option) any later version. 19 20# This program is distributed in the hope that it will be useful, 21# but WITHOUT ANY WARRANTY; without even the implied warranty of 22# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23# GNU General Public License for more details. 24 25# You should have received a copy of the GNU General Public License 26# along with this program. If not, see <http://www.gnu.org/licenses/>. 27 28# We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de 29 30use strict; 31use warnings; 32my $numres=100; # number of residues per line 33my $desclen=1000; # maximum number of characters in nameline 34my $ARGC=scalar(@ARGV); 35if ($ARGC<2) { 36 die(" 37reformat.pl from HHsuite3 38Read a multiple alignment in one format and write it in another format 39Usage: reformat.pl [informat] [outformat] infile outfile [options] 40 or reformat.pl [informat] [outformat] 'fileglob' .ext [options] 41 42Available input formats: 43 fas: aligned fasta; lower and upper case equivalent, '.' and '-' equivalent 44 a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', 45 gaps aligned to inserts: '.' 46 a3m: like a2m, but gaps aligned to inserts MAY be omitted 47 sto: Stockholm format; sequences in several blocks with sequence name at 48 beginning of line (hmmer output) 49 psi: format as read by PSI-BLAST using the -B option (like sto with -M first -r) 50 clu: Clustal format; sequences in several blocks with sequence name at beginning 51 of line 52Available output formats: 53 fas: aligned fasta; all gaps '-' 54 a2m: aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps 55 aligned to inserts: '.' 56 a3m: like a2m, but gaps aligned to inserts are omitted 57 sto: Stockholm format; sequences in just one block, one line per sequence 58 psi: format as read by PSI-BLAST using the -B option 59 clu: Clustal format 60If no input or output format is given the file extension is interpreted as format 61specification ('aln' as 'clu') 62 63Options: 64 -v int verbose mode (0:off, 1:on) 65 -num add number prefix to sequence names: 'name', '1:name' '2:name' etc 66 -noss remove secondary structure sequences (beginning with >ss_) 67 -sa do not remove solvent accessibility sequences (beginning with >sa_) 68 -M first make all columns with residue in first sequence match columns 69 (default for output format a2m or a3m) 70 -M int make all columns with less than X% gaps match columns 71 (for output format a2m or a3m) 72 -r remove all lower case residues (insert states) 73 (AFTER -M option has been processed) 74 -r int remove all lower case columns with more than X% gaps 75 -g '' suppress all gaps 76 -g '-' write all gaps as '-' 77 -uc write all residues in upper case (AFTER all other options have been processed) 78 -lc write all residues in lower case (AFTER all other options have been processed) 79 -l number of residues per line (for Clustal, FASTA, A2M, A3M formats) 80 (default=$numres) 81 -d maximum number of characers in nameline (default=$desclen) 82 83Examples: reformat.pl 1hjra.a3m 1hjra.a2m 84 (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m) 85 reformat.pl test.a3m test.fas -num -r 90 86 reformat.pl fas sto '*.fasta' .stockholm 87\n"); 88# clu: clustal format (hmmer output) 89# sel: Selex format 90# phy: Phylip format 91} 92 93my $informat=""; 94my $outformat=""; 95my $infile=""; 96my $outfile=""; 97my $num=0; # don't use sequence number as prefix: '>n|name' 98my $noss=0; # don't remove secondary structure 99my $nosa=1; # remove solvent accessibility sequences 100my $line; 101my $options=""; 102my @names; # names of sequences read in 103my @seqs; # residues of sequences read in 104my $n; # number of sequences read in 105my $k; # counts sequences for output 106my $remove_inserts=0; 107my $remove_gapped=0; 108my $matchmode=""; # do not change capitalization 109my $match_gaprule=0; # columns with less than this percentage of gaps will be match columns 110my $v=2; 111my $update=0; 112my $nss=-1; # index of secondary structure sequence 113my $lname; # length of sequence name in clustal, stockholm, psi format 114my $titleline; # first line beginning with "#" in A3M, A2M, or FASTA files 115 116my @informats= ("fas","a2m","a3m","sto","psi","clu"); 117my @outformats= ("fas","a2m","a3m","sto","psi","clu","ufas"); 118my $found; 119my $element; 120my $gap="default"; 121my $case="default"; 122 123# Process optionsz 124for (my $i=0; $i<$ARGC; $i++) {$options.=" $ARGV[$i] ";} 125if ($options=~s/ -i\s+(\S+) / /) {$infile=$1;} 126if ($options=~s/ -o\s+(\S+) / /) {$outfile=$1;} 127if ($options=~s/ -num / /) {$num=1; $desclen=505;} 128if ($options=~s/ -noss / /) {$noss=1;} 129if ($options=~s/ -sa / /) {$nosa=0;} 130if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;} 131if ($options=~s/ -r\s+(\d+) / /) {$remove_gapped=$1;} 132if ($options=~s/ -r / /) {$remove_inserts=1;} 133if ($options=~s/ -lc / /) {$case="lc";} 134if ($options=~s/ -uc / /) {$case="uc";} 135if ($options=~s/ -v\s*(\d+) / /) {$v=$1;} 136if ($options=~s/ -v / /) {$v=2;} 137if ($options=~s/ -M\s+(\d+) / /) {$matchmode="gaprule"; $match_gaprule=$1;} 138if ($options=~s/ -M\s+first / /) {$matchmode="first"; $match_gaprule=$1;} 139if ($options=~s/ -u / /) {$update=1;} 140if ($options=~s/ -l\s+(\S+) / /) {$numres=$1;} 141if ($options=~s/ -lname\s+(\S+) / /) {$lname=$1;} 142if ($options=~s/ -d\s+(\S+) / /) {$desclen=$1;} 143 144# Assign informat, outformat, infile, and outfile 145if ($outfile eq "") { 146 if ($options=~s/(\S+)\s*$//) { 147 $outfile=$1; 148 } else { 149 die("Error: no output file given: '$options'\n"); 150 } 151} 152if ($infile eq "") { 153 if ($options=~s/(\S+)\s*$//) { 154 $infile=$1; 155 } else { 156 die("Error: no input file given: '$options'\n"); 157 } 158} 159if ($options=~s/(\S+)\s*$//) { 160 $outformat=$1; 161} else { 162 if ($outfile=~/\S*\.(\S+?)$/) { 163 $outformat=lc($1); 164 if ($outformat eq "aln") {$outformat="clu";} 165 elsif ($outformat eq "fa") {$outformat="fas";} 166 elsif ($outformat eq "fasta") {$outformat="fas";} 167 elsif ($outformat eq "afa") {$outformat="fas";} 168 elsif ($outformat eq "afas") {$outformat="fas";} 169 elsif ($outformat eq "afasta") {$outformat="fas";} 170 } else { 171 print ("Using FASTA output format: '$options'\n"); $outformat="fas"; 172 } 173} 174if ($options=~s/(\S+)\s*$//) { 175 $informat=$1; 176} else { 177 if ($infile=~/\S*\.(\S+?)$/) { 178 $informat=lc($1); 179 if ($informat eq "aln") {$informat="clu";} 180 elsif ($informat eq "fa") {$informat="fas";} 181 elsif ($informat eq "fasta") {$informat="fas";} 182 } else { 183 print ("Using FASTA input format: '$options'\n"); $informat="fas"; 184 } 185} 186 187 188# Warn if unknown options found 189if ($options!~/^\s*$/) { 190 $options=~s/^\s*(.*?)\s*$/$1/g; 191 print("\nWARNING: unknown options '$options'\n"); 192} 193 194# Check if input and output formats are valid 195$found=0; 196foreach $element (@informats) {if ($informat eq $element) {$found=1; last;}} 197if(!$found) {die("\nError: $informat is not a valid input format option\n");} 198$found=0; 199foreach $element (@outformats) {if ($outformat eq $element) {$found=1; last;}} 200if(!$found) {die("\nError: $outformat is not a valid output format option\n");} 201 202#if($outformat eq "psi") { 203# $remove_inserts=1; 204#} 205if($outformat eq "ufas") {$gap="";} 206 207 208if ($infile=~/\*/ || $outfile=~/^\./) # if infile contains '*' or outfile is just an extension 209{ 210 $outfile=~/.*\.(\S*)$/; 211 my $outext=$1; 212 my @infiles=glob($infile); 213 printf("%i files to reformat\n",scalar(@infiles)); 214 foreach $infile (@infiles) 215 { 216 if ($infile!~/(\S+)\.\S+/) {$infile=~/(\S+)/} 217 $outfile="$1.$outext"; 218 if ($update && -e $outfile) {next;} 219 if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");} 220 &reformat($infile,$outfile); 221 } 222 exit 0; 223} 224else 225{ 226 if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");} 227 &reformat($infile,$outfile); 228 exit 0; 229} 230 231 232################################################################################################ 233# Reformat a single file 234################################################################################################ 235sub reformat() 236{ 237 $infile=$_[0]; 238 $nss=-1; 239 $titleline=""; 240 241################################################################################################ 242# Input part 243################################################################################################ 244 245 my $skip=0; 246 open (INFILE,"<$infile") or die ("ERROR: cannot open $infile: $!\n"); 247 248 # Read a2m, a3m, fas format 249 if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m") 250 { 251 $/=">"; # set input field seperator 252 $n=0; 253 my $seq=<INFILE>; 254 if ($seq=~s/^(\#.*)//) {$titleline=$1;} # remove commentary lines at beginning of file 255 $seq=~s/(\n\#.*)*\n//; # remove commentary lines at beginning of file 256 257 # If first line has no sequence name, use >root_name 258 if ($seq ne ">") { 259 $infile="/$infile."; # determine root name 1 260 $infile=~/^.*\/(.*?)\..*/; # determine root name 2 261 $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//; 262 $seq=~tr/\n //d; # remove newlines and blanks 263 $seqs[$n]=$seq; 264 $n++; 265 } 266 267 while ($seq=<INFILE>) { 268 $seq=~s/\n\#.*//g; # remove commentary lines 269 while ($seq=~s/(.)>/$1/) {$seq.=<INFILE>;} # in the case that nameline contains a '>'; '.' matches anything except '\n' 270 if ($seq=~/^aa_/) {next;} 271 if ($seq=~/^sa_/ && $nosa) {next;} 272 if ($seq=~/^ss_/) { 273 if ($noss) {next;} # do not read in >ss_ and >sa_ sequences 274# if ($seq=~/^ss_conf/) {next;} # comment out to read sequence with confidence values 275# if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines 276 $nss=$n; 277 } 278 $seq=~s/^\s*(.*)//; # divide into nameline and residues; '.' matches anything except '\n' 279 if (defined $1 && $1 ne "") { 280 $names[$n]=$1; # don't move this line away from previous line $seq=~s/([^\n]*)//; 281 } else { 282 $names[$n]=$n; 283 } 284 $seqs[$n]=$seq; 285 $n++; 286 } 287 288 $/="\n"; # reset input field seperator 289 } 290 291 # Read Stockholm format 292 elsif ($informat eq "sto") 293 { 294 my %seqhash; 295 my $name; 296 my $first_block=1; 297 298 $n=0; 299 while ($line = <INFILE>) 300 { 301 $line=~tr/\r//d; 302 $line=~s/\s+/ /g; 303 if ($line=~s/^\#=GC SS_cons/ss_dssp/) {} # skip commentary and blank line 304 if ($line=~/^\#/) {next;} # skip commentary and blank line 305 if ($line=~/^\/\//) {last;} # reached end of file 306 if ($line=~/^\s*$/){$first_block=0; next;} # new sequence block starts 307 if ($line!~/^\s*(\S+)\s+(\S+)/) { 308 die ("\nERROR found in stockholm format: $!"); 309 } 310 if (!(exists $seqhash{$1})) 311 { 312 if ($line=~/^aa_/) {next;} # do not read in >aa_ sequences 313 if ($line=~/^sa_/ && $nosa) {next;} # do not read in >sa_ sequences 314 if ($line=~/^ss_/) { 315 if ($noss) {next;} # do not read in >ss_ and >aa_ sequences 316# if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values 317# if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines 318 $nss=$n; 319 } 320 $line=~/^\s*(\S+)\s+(\S+)/; 321 $names[$n]=$1; 322 $seqs[$n]=$2; 323 $seqhash{$1}=$n++; 324 $first_block=1; 325 } 326 else 327 { 328 if ($first_block) {die ("\nERROR: sequence $1 appears more than once per block\n");} 329 $seqs[$seqhash{$1}].=$2; 330 } 331# printf("%3i %s\n",$n-1,$seqs[$n-1]); 332 } 333 } 334 335 elsif ($informat eq "clu") 336 { 337 my $residues_per_line=50; # default number of characters for namefield residues 338 # (only needed if no gap between name and residues in first sequence -> SMART) 339 my $block=1; # number of block currently read 340 my $name; 341 my $residues; 342 $n=0; # sequence number of first block 343 $k=0; # sequence index to zero for first block 344 345 while ($line = <INFILE>) 346 { 347# print($namefieldlen.$line); 348 $line=~tr/\r//d; 349 if ($line=~/CLUSTAL/i) {next;} 350 if ($line=~/^\#/) {next;} # skip commentary and blank line 351 if ($line=~/^\/\//) {last;} # reached end of file 352 if ($line=~/^\s*$/){ # new sequence block starts 353 if ($k) { 354 if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} 355 $block++; 356 $n=$k; 357 $k=0; # set sequence index to zero for next block to read 358 } 359 next; 360 } 361 if ($line!~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/) { 362 if ($line=~/^[*.: ]*$/) {next;} 363 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences 364 chomp($line); 365 if ($line!~/^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(\s+\d+)?$/) { 366 die ("\nError found in Clustal format in $infile, line $.: '$line'\n"); 367 } 368 $name=$1; 369 $residues=$2; 370 print("WARNING: Found no space between name and residues in $infile, line $.: '$line'\n"); 371 } else { 372 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences 373 if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences 374 if ($line=~/^ss_/) { 375# if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values 376# if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines 377 $nss=$n; 378 } 379 $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/; 380 $name=$1; 381 $residues=$2; 382 $residues=~tr/ //d; 383 $residues_per_line=length($residues); 384 } 385 if ($block==1) { 386 $names[$k]=$name; 387 $seqs[$k]=$residues; 388 } else { 389 $seqs[$k].=$residues; 390 if ($names[$k] ne $name) { 391 print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n"); 392 } 393 } 394# printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]); 395 $k++; 396 } 397 if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} 398 if (!$n) {$n=$k;} 399 } 400 401 # Read psi format 402 elsif ($informat eq "psi") 403 { 404 my $block=1; # number of block currently read 405 my $name; 406 my $residues; 407 $n=0; # sequence number of first block 408 $k=0; # sequence index to zero for first block 409 410 while ($line = <INFILE>) 411 { 412# print($namefieldlen.$line); 413 $line=~tr/\r//d; 414 if ($line=~/^\s*$/){ # new sequence block starts 415 if ($k) { 416 if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} 417 $block++; 418 $n=$k; 419 $k=0; # set sequence index to zero for next block to read 420 } 421 next; 422 } 423 424 if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences 425 if ($line=~/^aa_/ || $line=~/^sa_/) {next;} # do not read in >ss_ and >aa_ sequences 426 if ($line=~/^ss_/) { 427# if ($line=~/^ss_conf/) {next;} # comment out to read sequence with confidence values 428# if ($nss>=0) {next;} # comment out: allow for two or more >ss_dssp and >ss_pred lines 429 $nss=$n; 430 } 431 $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/; 432 $name=$1; 433 $residues=$2; 434 $residues=~tr/ //d; 435 436 if ($block==1) { 437 $names[$k]=$name; 438 $seqs[$k]=$residues; 439 } else { 440 $seqs[$k].=$residues; 441 if ($names[$k] ne $name) { 442 print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n"); 443 } 444 } 445# printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]); 446 $k++; 447 } 448 if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");} 449 if (!$n) {$n=$k;} 450 } 451 452 close INFILE; 453 454 455 # Empty input file? 456 if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");} 457 458 459 460################################################################################################ 461# Transforming to upper-case 462################################################################################################ 463 if ($informat ne "a3m" && $informat ne "a2m") { # Transform to upper case if input format is not A3M or A2M 464 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;} 465 } 466 467################################################################################################ 468# Removing non-alphanumeric symbols 469################################################################################################ 470 for ($k=0; $k<$n; $k++) { 471 $seqs[$k]=~tr/A-Za-z0-9.~-//cd; 472 $seqs[$k]=~tr/~/-/; 473 } 474 475 476################################################################################################ 477# Filling up with gaps '.' or deleting gaps 478################################################################################################ 479 480 # Insertion of '.' only needed for a3m if: NOT -r option OR -M first OR -M <int> option 481 if ($informat eq "a3m" && (!$remove_inserts || $matchmode)) 482 { 483 print("inserting gaps...\n"); 484 my @len_ins; # $len_ins[$j] will count the maximum number of inserted residues after match state $i. 485 my $j; # counts match states 486 my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $i after the $j'th match state 487 my $insert; 488 489 # Determine length of longest insert 490 for ($k=0; $k<$n; $k++) 491 { 492 # split into list of single match states and variable-length inserts 493 # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements 494 # The '#' symbol is prepended to get rid of a perl bug in split 495 @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#"); 496 $j=0; 497# printf("Sequence $k: $seqs[$k]\n"); 498# printf("Sequence $k: @inserts\n"); 499 500 # Does sequence $k contain insert after match state j that is longer than $ngap[$j]? 501 foreach $insert (@inserts) 502 { 503 if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {$len_ins[$j]=length($insert);} 504 $j++; 505# printf("$insert|"); 506 } 507# printf("\n"); 508 } 509 my $ngap; 510 511 # Fill up inserts with gaps '.' up to length $len_ins[$j] 512 for ($k=0; $k<$n; $k++) 513 { 514 # split into list of single match states and variable-length inserts 515 @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#"); 516 $j=0; 517 518 # append the missing number of gaps after each match state 519 foreach $insert (@inserts) 520 { 521 for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";} 522 $j++; 523 } 524 $seqs[$k] = join("",@inserts); 525 $seqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end 526 } 527 } 528 529 530################################################################################################ 531# Match state assignment 532################################################################################################ 533 534 # Default match state rule for a3m is -M first 535 if ($matchmode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$matchmode="first";} 536 537 # Use gap rule for match state assignment? 538 if ($matchmode eq "gaprule") { 539 540 my @gaps=(); 541 my $residues; 542 my @residues; 543 544 # Determine number of gaps per column 545 for ($k=0; $k<$n; $k++) { 546 @residues=unpack("C*",$seqs[$k]); 547 for (my $l=0; $l<@residues; $l++) { 548 if ($residues[$l]==46 || $residues[$l]==45) { 549 if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;} 550 } 551 } 552 } 553 554 # Set columns with more gaps than $match_gaprule to lower case, 555 for ($k=0; $k<$n; $k++) { 556 @residues=unpack("C*",$seqs[$k]); 557 $residues=""; 558 for (my $l=0; $l<@residues; $l++) { 559 if (!defined $gaps[$l] || $gaps[$l]<0.01*$match_gaprule*$n) { 560 if ($residues[$l]==46) { 561 $residues .= "-"; 562 } else { 563 $residues .= uc(chr($residues[$l])); 564 } 565 } else { 566 if ($residues[$l]==45) { 567 $residues .= "."; 568 } else { 569 $residues .= lc(chr($residues[$l])); 570 } 571 } 572 $seqs[$k]=$residues; 573 } 574 } 575 } 576 577 # Use first sequence for match state assignment? 578 if ($matchmode eq "first") { 579 580 my @match=(); 581 my $residues; 582 my @residues; 583 584 585 # Determine which columns have a gap in first sequence 586 for ($k=0; $k<scalar(@names); $k++) { #find seed sequence 587 if ($names[$k]!~/^(ss_|aa_|sa_)/) {last;} 588 } 589 @residues=unpack("C*",$seqs[$k]); 590 for (my $l=0; $l<@residues; $l++) { 591 if ($residues[$l]==46 || $residues[$l]==45) {$match[$l]=0;} else {$match[$l]=1;} 592 } 593 594 # Set columns without residue in first sequence to upper case, 595 for ($k=0; $k<$n; $k++) { 596 @residues=unpack("C*",$seqs[$k]); 597 $residues=""; 598 for (my $l=0; $l<@residues; $l++) { 599 if ($match[$l]) { 600 if ($residues[$l]==46) { 601 $residues .= "-"; 602 } else { 603 $residues .= uc(chr($residues[$l])); 604 } 605 } else { 606 if ($residues[$l]==45) { 607 $residues .= "."; 608 } else { 609 $residues .= lc(chr($residues[$l])); 610 } 611 } 612 $seqs[$k]=$residues; 613 } 614 } 615 } 616 617 618################################################################################################ 619# Remove gaps etc. 620################################################################################################ 621 622 # Remove columns with too many gaps? 623 if ($remove_gapped) { 624 625 my @gaps=(); 626 my $residues; 627 my @residues; 628 629 # Determine number of gaps '.' or '-' per column 630 for ($k=0; $k<$n; $k++) { 631 @residues=unpack("C*",$seqs[$k]); 632 for (my $l=0; $l<@residues; $l++) { 633 if ($residues[$l]==45 || $residues[$l]==46) { 634 if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;} 635 } 636 } 637 } 638 639 # Remove columns with too many gaps 640 for ($k=0; $k<$n; $k++) { 641 @residues=unpack("C*",$seqs[$k]); 642 $residues=""; 643 for (my $l=0; $l<@residues; $l++) { 644 if (!defined $gaps[$l] || $gaps[$l]<0.01*$remove_gapped*$n) { 645 $residues .= chr($residues[$l]) 646 } 647 $seqs[$k]=$residues; 648 } 649 } 650 } 651 652 # Remove/transliterate gaps? 653 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/ //d;} 654 if ($remove_inserts) { 655 for ($k=0; $k<$n; $k++) { 656 $seqs[$k]=~tr/a-z.//d; 657# printf("%s\n",$seqs[$k]); 658 } 659 } 660 661 662 # Remove sequences which contain only gaps 663 my $nin=$n; 664 for ($k=0; $k<$n; $k++) { 665 if (($seqs[$k]=~tr/a-zA-Z0-9/a-zA-Z0-9/==0)) { 666 if ($v>=2) {print("Sequence contains only gaps and is removed: $names[$k]\n");} 667 splice(@seqs,$k,1); 668 splice(@names,$k,1); 669 $k--; $n--; 670 } 671 } 672 673 674 # Cut down sequence names to $desclen characters maximum 675 for ($k=0; $k<$n; $k++) {$names[$k]=substr($names[$k],0,$desclen);} 676 677 if ($outformat eq "a3m") { 678 # Remove all '.' gaps 679 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/.//d;} 680 } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) { 681 # Substitute all '.' by '-' 682 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/./-/;} 683 } 684 if ($gap ne "default") { 685 for ($k=0; $k<$n; $k++) {$seqs[$k]=~s/\./$gap/g; $seqs[$k]=~s/-/$gap/g;} 686 } 687 if ($case eq "uc") { 688 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;} 689 } elsif ($case eq "lc") { 690 for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/A-Z/a-z/;} 691 } 692 693 694 695 #################################################### 696 # Check that sequences have same length 697 #################################################### 698 if ($outformat ne "a3m" && $outformat ne "ufas") { 699 my $len=length($seqs[0]); 700 for($k=1; $k<$n; $k++) { 701 if (length($seqs[$k])!=$len) { 702 printf("\nError: Sequences in $infile do not all have same length, e.g. >%-.20s (len=%i) and >%-.20s (len=%i)\n", 703 $names[0],$len,$names[$k],length($seqs[$k])); 704 if ($v>=3) { 705 printf("%.20s %s\n%.20s %s\n\n",$names[0],$seqs[0],$names[$k],$seqs[$k]); 706 } 707 exit 1; 708 } 709 } 710 } 711 712 713 #################################################### 714 # Remove html tags 715 #################################################### 716 for($k=0; $k<$n; $k++) { 717 $names[$k]=~s/<[A-Za-z\/].*?>//g; # remove html tags 718 } 719 720 721 722################################################################################################ 723# Output part 724################################################################################################ 725 726 my $ndssp=-1; 727 my $nsa=-1; 728 my $npred=-1; 729 my $nconf=-1; 730 my $nquery=-1; 731 for ($k=0; $k<$n; $k++) { 732 if ($names[$k]=~/^ss_dssp/){$ndssp=$k; } 733 elsif ($names[$k]=~/^sa_dssp/){$nsa=$k; } 734 elsif ($names[$k]=~/^ss_pred/){$npred=$k; } 735 elsif ($names[$k]=~/^ss_conf/){$nconf=$k; } 736 elsif ($nquery==-1 && $names[$k]!~/^aa_/) {$nquery=$k;} 737 } 738 739 #Write sequences into output file 740 open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n"); 741 if ($outformat eq "sto" || $outformat eq "psi") { 742 my $refline; 743 if ($outformat eq "sto") { 744 print(OUTFILE "# STOCKHOLM 1.0\n\n"); 745 746 # Write reference annotation line for match states (-M first) 747 if (!$lname) {$lname=32;} 748 $names[$nquery]=~/^\S+\s+(.*)/; 749 printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GF DE",$1); 750 $refline=$seqs[$nquery]; 751 $refline=~s/[a-z]/-/g; 752 printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GC RF",$refline); 753 if ($ndssp>=0) { 754 printf(OUTFILE "%-32.32s %s\n","#=GC SS_cons",$seqs[$ndssp]); 755 } 756 } 757 if ($num) { 758 my $num=2; 759 for ($k=0; $k<$n; $k++) { 760 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;} 761 $names[$k]=~s/^(\S+)\#\d+/$1/; 762 $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/; 763 $num++; 764 } 765 } 766 for ($k=0; $k<$n; $k++) { 767 if ($k==$ndssp || $k==$npred || $k==$nconf) {next;} 768 $names[$k] =~ /\s*(\S+)/; 769 if (!$lname) {$lname=32;} 770 printf(OUTFILE "%-$lname.$lname"."s %s\n",$1,$seqs[$k]); 771 } 772 if ($outformat eq "sto") {print(OUTFILE "//\n");} 773 } elsif ($outformat eq "clu") { 774 printf(OUTFILE "CLUSTAL\n\n\n"); 775 if ($num) { 776 my $num=2; 777 for ($k=0; $k<$n; $k++) { 778 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;} 779 $names[$k]=~s/^(\S+)\#\d+/$1/; 780 $names[$k]=~s/^(\S{1,10})\S*/$1\#$num/; 781 $num++; 782 } 783 } 784 while($seqs[0] ne "") { # While there are still residues left, write new block of sequences 785 for ($k=0; $k<scalar(@names); $k++) { # Go through all sequences 786 $names[$k] =~ s/\s*(\S+).*/$1/; 787 $seqs[$k]=~s/(\S{1,$numres})//; # remove the leftmost (up to) 60 residues from sequence $nseq 788 if (!$lname) {$lname=18;} 789 printf(OUTFILE "%-$lname.$lname"."s %s\n",$names[$k],$1); # and print them after the sequence name 790 } 791 print(OUTFILE "\n"); # leave a blank line after each block of 60 residues 792 } 793 } else { 794 if ($num) { 795 my $num=2; 796 for ($k=0; $k<$n; $k++) { 797 if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;} 798 $names[$k]=~s/^(\S+)\#\d+/$1/; 799 $names[$k]=~s/^(\S{1,25})\S+/$1\#$num/; 800# $names[$k]=~s/^(\S+)/$1\#$num/; 801 $num++; 802 } 803 } 804 if ($titleline ne "" && $outformat eq "a3m") { 805 printf(OUTFILE "%s\n",$titleline); 806 } 807 for ($k=0; $k<$n; $k++) { 808 $seqs[$k]=~s/(\S{$numres})/$1\n/g; 809 printf(OUTFILE ">%s\n%s\n",$names[$k],$seqs[$k]); 810 } 811 } 812 813 close OUTFILE; 814 if ($v>=2) { 815 if ($nin==1) {print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");} 816 else { 817 if (!$nin==$n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); } 818 print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n"); 819 } 820 } 821 822 return; 823} 824