1#!/usr/local/bin/perl
2
3# reformat.pl
4# Reformat a multiple alignment file
5#
6#     HHsuite version 3.0.0 (15-03-2015)
7#
8#     Reference:
9#     Remmert M., Biegert A., Hauser A., and Soding J.
10#     HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
11#     Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
12
13#     (C) Johannes Soeding, 2012
14
15#     This program is free software: you can redistribute it and/or modify
16#     it under the terms of the GNU General Public License as published by
17#     the Free Software Foundation, either version 3 of the License, or
18#     (at your option) any later version.
19
20#     This program is distributed in the hope that it will be useful,
21#     but WITHOUT ANY WARRANTY; without even the implied warranty of
22#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23#     GNU General Public License for more details.
24
25#     You should have received a copy of the GNU General Public License
26#     along with this program.  If not, see <http://www.gnu.org/licenses/>.
27
28#     We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de
29
30use strict;
31use warnings;
32my $numres=100;             # number of residues per line
33my $desclen=1000;           # maximum number of characters in nameline
34my $ARGC=scalar(@ARGV);
35if ($ARGC<2) {
36    die("
37reformat.pl from HHsuite3
38Read a multiple alignment in one format and write it in another format
39Usage: reformat.pl [informat] [outformat] infile outfile [options]
40  or   reformat.pl [informat] [outformat] 'fileglob' .ext [options]
41
42Available input formats:
43   fas:     aligned fasta; lower and upper case equivalent, '.' and '-' equivalent
44   a2m:     aligned fasta; inserts: lower case, matches: upper case, deletes: '-',
45            gaps aligned to inserts: '.'
46   a3m:     like a2m, but gaps aligned to inserts MAY be omitted
47   sto:     Stockholm format; sequences in several blocks with sequence name at
48            beginning of line (hmmer output)
49   psi:     format as read by PSI-BLAST using the -B option (like sto with -M first -r)
50   clu:     Clustal format; sequences in several blocks with sequence name at beginning
51            of line
52Available output formats:
53   fas:     aligned fasta; all gaps '-'
54   a2m:     aligned fasta; inserts: lower case, matches: upper case, deletes: '-', gaps
55            aligned to inserts: '.'
56   a3m:     like a2m, but gaps aligned to inserts are omitted
57   sto:     Stockholm format; sequences in just one block, one line per sequence
58   psi:     format as read by PSI-BLAST using the -B option
59   clu:     Clustal format
60If no input or output format is given the file extension is interpreted as format
61specification ('aln' as 'clu')
62
63Options:
64  -v int    verbose mode (0:off, 1:on)
65  -num      add number prefix to sequence names: 'name', '1:name' '2:name' etc
66  -noss     remove secondary structure sequences (beginning with >ss_)
67  -sa       do not remove solvent accessibility sequences (beginning with >sa_)
68  -M first  make all columns with residue in first sequence match columns
69            (default for output format a2m or a3m)
70  -M int    make all columns with less than X% gaps match columns
71            (for output format a2m or a3m)
72  -r        remove all lower case residues (insert states)
73            (AFTER -M option has been processed)
74  -r int    remove all lower case columns with more than X% gaps
75  -g ''     suppress all gaps
76  -g '-'    write all gaps as '-'
77  -uc       write all residues in upper case (AFTER all other options have been processed)
78  -lc       write all residues in lower case (AFTER all other options have been processed)
79  -l        number of residues per line (for Clustal, FASTA, A2M, A3M formats)
80            (default=$numres)
81  -d        maximum number of characers in nameline (default=$desclen)
82
83Examples: reformat.pl 1hjra.a3m 1hjra.a2m
84          (same as reformat.pl a3m a2m 1hjra.a3m 1hjra.a2m)
85          reformat.pl test.a3m test.fas -num -r 90
86          reformat.pl fas sto '*.fasta' .stockholm
87\n");
88#  clu:  clustal format (hmmer output)
89#  sel:  Selex format
90#  phy:  Phylip format
91}
92
93my $informat="";
94my $outformat="";
95my $infile="";
96my $outfile="";
97my $num=0;                    # don't use sequence number as prefix: '>n|name'
98my $noss=0;                   # don't remove secondary structure
99my $nosa=1;                   # remove solvent accessibility sequences
100my $line;
101my $options="";
102my @names;   # names of sequences read in
103my @seqs;    # residues of sequences read in
104my $n;       # number of sequences read in
105my $k;       # counts sequences for output
106my $remove_inserts=0;
107my $remove_gapped=0;
108my $matchmode=""; # do not change capitalization
109my $match_gaprule=0;   # columns with less than this percentage of gaps will be match columns
110my $v=2;
111my $update=0;
112my $nss=-1;  # index of secondary structure sequence
113my $lname;   # length of sequence name in clustal, stockholm, psi format
114my $titleline; # first line beginning with "#" in A3M, A2M, or FASTA files
115
116my @informats=  ("fas","a2m","a3m","sto","psi","clu");
117my @outformats= ("fas","a2m","a3m","sto","psi","clu","ufas");
118my $found;
119my $element;
120my $gap="default";
121my $case="default";
122
123# Process optionsz
124for (my $i=0; $i<$ARGC; $i++) {$options.=" $ARGV[$i] ";}
125if ($options=~s/ -i\s+(\S+) / /) {$infile=$1;}
126if ($options=~s/ -o\s+(\S+) / /) {$outfile=$1;}
127if ($options=~s/ -num / /)    {$num=1; $desclen=505;}
128if ($options=~s/ -noss / /)   {$noss=1;}
129if ($options=~s/ -sa / /)     {$nosa=0;}
130if ($options=~s/ -g\s+\'?(\S*)\'? / /) {$gap=$1;}
131if ($options=~s/ -r\s+(\d+) / /) {$remove_gapped=$1;}
132if ($options=~s/ -r / /)     {$remove_inserts=1;}
133if ($options=~s/ -lc / /)    {$case="lc";}
134if ($options=~s/ -uc / /)    {$case="uc";}
135if ($options=~s/ -v\s*(\d+) / /) {$v=$1;}
136if ($options=~s/ -v / /) {$v=2;}
137if ($options=~s/ -M\s+(\d+) / /) {$matchmode="gaprule"; $match_gaprule=$1;}
138if ($options=~s/ -M\s+first / /) {$matchmode="first";   $match_gaprule=$1;}
139if ($options=~s/ -u / /) {$update=1;}
140if ($options=~s/ -l\s+(\S+) / /) {$numres=$1;}
141if ($options=~s/ -lname\s+(\S+) / /) {$lname=$1;}
142if ($options=~s/ -d\s+(\S+) / /) {$desclen=$1;}
143
144# Assign informat, outformat, infile, and outfile
145if ($outfile eq "") {
146    if ($options=~s/(\S+)\s*$//) {
147	$outfile=$1;
148    } else {
149	die("Error: no output file given: '$options'\n");
150    }
151}
152if ($infile eq "") {
153    if ($options=~s/(\S+)\s*$//) {
154	$infile=$1;
155    } else {
156	die("Error: no input file given: '$options'\n");
157    }
158}
159if ($options=~s/(\S+)\s*$//) {
160    $outformat=$1;
161} else {
162    if ($outfile=~/\S*\.(\S+?)$/) {
163	$outformat=lc($1);
164	if    ($outformat eq "aln")   {$outformat="clu";}
165	elsif ($outformat eq "fa")    {$outformat="fas";}
166	elsif ($outformat eq "fasta") {$outformat="fas";}
167	elsif ($outformat eq "afa")   {$outformat="fas";}
168	elsif ($outformat eq "afas")  {$outformat="fas";}
169	elsif ($outformat eq "afasta") {$outformat="fas";}
170    } else {
171	print ("Using FASTA output format: '$options'\n"); $outformat="fas";
172    }
173}
174if ($options=~s/(\S+)\s*$//) {
175    $informat=$1;
176} else {
177    if ($infile=~/\S*\.(\S+?)$/) {
178	$informat=lc($1);
179	if    ($informat eq "aln")   {$informat="clu";}
180	elsif ($informat eq "fa")    {$informat="fas";}
181	elsif ($informat eq "fasta") {$informat="fas";}
182    } else {
183	print ("Using FASTA input format: '$options'\n"); $informat="fas";
184    }
185}
186
187
188# Warn if unknown options found
189if ($options!~/^\s*$/) {
190    $options=~s/^\s*(.*?)\s*$/$1/g;
191    print("\nWARNING: unknown options '$options'\n");
192}
193
194# Check if input and output formats are valid
195$found=0;
196foreach $element (@informats) {if ($informat eq $element) {$found=1; last;}}
197if(!$found) {die("\nError: $informat is not a valid input format option\n");}
198$found=0;
199foreach $element (@outformats) {if ($outformat eq $element) {$found=1; last;}}
200if(!$found) {die("\nError: $outformat is not a valid output format option\n");}
201
202#if($outformat eq "psi") {
203#   $remove_inserts=1;
204#}
205if($outformat eq "ufas") {$gap="";}
206
207
208if ($infile=~/\*/ || $outfile=~/^\./) # if infile contains '*' or outfile is just an extension
209{
210    $outfile=~/.*\.(\S*)$/;
211    my $outext=$1;
212    my @infiles=glob($infile);
213    printf("%i files to reformat\n",scalar(@infiles));
214    foreach $infile (@infiles)
215    {
216	if ($infile!~/(\S+)\.\S+/) {$infile=~/(\S+)/}
217	$outfile="$1.$outext";
218	if ($update && -e $outfile) {next;}
219	if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
220	&reformat($infile,$outfile);
221    }
222    exit 0;
223}
224else
225{
226    if ($v>=3) {print("Reformatting $infile from $informat to $outformat ...\n");}
227    &reformat($infile,$outfile);
228    exit 0;
229}
230
231
232################################################################################################
233# Reformat a single file
234################################################################################################
235sub reformat()
236{
237    $infile=$_[0];
238    $nss=-1;
239    $titleline="";
240
241################################################################################################
242# Input part
243################################################################################################
244
245    my $skip=0;
246    open (INFILE,"<$infile") or die ("ERROR: cannot open $infile: $!\n");
247
248    # Read a2m, a3m, fas format
249    if ($informat eq "fas" || $informat eq "a2m" || $informat eq "a3m")
250    {
251	$/=">"; # set input field seperator
252	$n=0;
253	my $seq=<INFILE>;
254	if ($seq=~s/^(\#.*)//) {$titleline=$1;}          # remove commentary lines at beginning of file
255	$seq=~s/(\n\#.*)*\n//;    # remove commentary lines at beginning of file
256
257	# If first line has no sequence name, use >root_name
258	if ($seq ne ">") {
259	    $infile="/$infile.";        # determine root name 1
260	    $infile=~/^.*\/(.*?)\..*/;  # determine root name 2
261	    $names[$n]=$1;        # don't move this line away from previous line $seq=~s/([^\n]*)//;
262	    $seq=~tr/\n //d;      # remove newlines and blanks
263	    $seqs[$n]=$seq;
264	    $n++;
265	}
266
267	while ($seq=<INFILE>) {
268	    $seq=~s/\n\#.*//g;    # remove commentary lines
269	    while ($seq=~s/(.)>/$1/) {$seq.=<INFILE>;}  # in the case that nameline contains a '>'; '.' matches anything except '\n'
270	    if ($seq=~/^aa_/) {next;}
271	    if ($seq=~/^sa_/ && $nosa) {next;}
272	    if ($seq=~/^ss_/) {
273		if ($noss) {next;}             # do not read in >ss_ and >sa_ sequences
274#		if ($seq=~/^ss_conf/)  {next;} # comment out to read sequence with confidence values
275#		if ($nss>=0) {next;}           # comment out: allow for two or more >ss_dssp and >ss_pred lines
276 		$nss=$n;
277	    }
278	    $seq=~s/^\s*(.*)//;        # divide into nameline and residues; '.' matches anything except '\n'
279	    if (defined $1 && $1 ne "") {
280		$names[$n]=$1;        # don't move this line away from previous line $seq=~s/([^\n]*)//;
281	    } else {
282		$names[$n]=$n;
283	    }
284	    $seqs[$n]=$seq;
285	    $n++;
286	}
287
288	$/="\n"; # reset input field seperator
289    }
290
291    # Read Stockholm format
292    elsif ($informat eq "sto")
293    {
294	my %seqhash;
295	my $name;
296	my $first_block=1;
297
298	$n=0;
299	while ($line = <INFILE>)
300	{
301	    $line=~tr/\r//d;
302	    $line=~s/\s+/ /g;
303	    if ($line=~s/^\#=GC SS_cons/ss_dssp/) {}        # skip commentary and blank line
304	    if ($line=~/^\#/) {next;}                       # skip commentary and blank line
305	    if ($line=~/^\/\//) {last;}                     # reached end of file
306	    if ($line=~/^\s*$/){$first_block=0; next;}      # new sequence block starts
307	    if ($line!~/^\s*(\S+)\s+(\S+)/) {
308		die ("\nERROR found in stockholm format: $!");
309	    }
310	    if (!(exists $seqhash{$1}))
311	    {
312		if ($line=~/^aa_/) {next;}          # do not read in >aa_ sequences
313		if ($line=~/^sa_/ && $nosa) {next;} # do not read in >sa_ sequences
314		if ($line=~/^ss_/) {
315		    if ($noss) {next;} # do not read in >ss_ and >aa_ sequences
316#	  	    if ($line=~/^ss_conf/)  {next;} # comment out to read sequence with confidence values
317#		    if ($nss>=0) {next;}  # comment out: allow for two or more >ss_dssp and >ss_pred lines
318		    $nss=$n;
319		}
320		$line=~/^\s*(\S+)\s+(\S+)/;
321		$names[$n]=$1;
322		$seqs[$n]=$2;
323		$seqhash{$1}=$n++;
324		$first_block=1;
325	    }
326	    else
327	    {
328		if ($first_block) {die ("\nERROR: sequence $1 appears more than once per block\n");}
329		$seqs[$seqhash{$1}].=$2;
330	    }
331#	    printf("%3i %s\n",$n-1,$seqs[$n-1]);
332	}
333    }
334
335    elsif ($informat eq "clu")
336    {
337	my $residues_per_line=50; # default number of characters for namefield residues
338                             # (only needed if no gap between name and residues in first sequence -> SMART)
339	my $block=1;         # number of block currently read
340	my $name;
341	my $residues;
342	$n=0;                # sequence number of first block
343	$k=0;                # sequence index to zero for first block
344
345	while ($line = <INFILE>)
346	{
347#	    print($namefieldlen.$line);
348	    $line=~tr/\r//d;
349	    if ($line=~/CLUSTAL/i) {next;}
350	    if ($line=~/^\#/) {next;}                     # skip commentary and blank line
351	    if ($line=~/^\/\//) {last;}                   # reached end of file
352	    if ($line=~/^\s*$/){                          # new sequence block starts
353		if ($k) {
354		    if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
355		    $block++;
356		    $n=$k;
357		    $k=0;  # set sequence index to zero for next block to read
358		}
359		next;
360	    }
361	    if ($line!~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/) {
362		if ($line=~/^[*.: ]*$/) {next;}
363		if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
364		chomp($line);
365		if ($line!~/^(\S{1,20})([a-zA-Z0-9.-]{$residues_per_line})(\s+\d+)?$/) {
366		    die ("\nError found in Clustal format in $infile, line $.: '$line'\n");
367		}
368		$name=$1;
369		$residues=$2;
370		print("WARNING: Found no space between name and residues in $infile, line $.: '$line'\n");
371	    } else {
372		if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
373		if ($line=~/^aa_/ || $line=~/^sa_/) {next;}     # do not read in >ss_ and >aa_ sequences
374		if ($line=~/^ss_/) {
375#		    if ($line=~/^ss_conf/)  {next;} # comment out to read sequence with confidence values
376#		    if ($nss>=0) {next;}  # comment out: allow for two or more >ss_dssp and >ss_pred lines
377		    $nss=$n;
378		}
379		$line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
380		$name=$1;
381		$residues=$2;
382		$residues=~tr/ //d;
383		$residues_per_line=length($residues);
384	    }
385	    if ($block==1) {
386		$names[$k]=$name;
387		$seqs[$k]=$residues;
388	    } else {
389		$seqs[$k].=$residues;
390		if ($names[$k] ne $name) {
391		    print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
392		}
393	    }
394#	    printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
395	    $k++;
396	}
397	if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
398	if (!$n) {$n=$k;}
399    }
400
401   # Read psi format
402    elsif ($informat eq "psi")
403    {
404	my $block=1;         # number of block currently read
405	my $name;
406	my $residues;
407	$n=0;                # sequence number of first block
408	$k=0;                # sequence index to zero for first block
409
410	while ($line = <INFILE>)
411	{
412#	    print($namefieldlen.$line);
413	    $line=~tr/\r//d;
414	    if ($line=~/^\s*$/){                          # new sequence block starts
415		if ($k) {
416		    if ($n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
417		    $block++;
418		    $n=$k;
419		    $k=0;  # set sequence index to zero for next block to read
420		}
421		next;
422	    }
423
424	    if ($noss && ($line=~/^aa_/ || $line=~/^ss_/ || $line=~/^sa_/)) {next;} # do not read in >ss_ and >aa_ sequences
425	    if ($line=~/^aa_/ || $line=~/^sa_/) {next;}     # do not read in >ss_ and >aa_ sequences
426	    if ($line=~/^ss_/) {
427#		    if ($line=~/^ss_conf/)  {next;} # comment out to read sequence with confidence values
428#		    if ($nss>=0) {next;}  # comment out: allow for two or more >ss_dssp and >ss_pred lines
429		$nss=$n;
430	    }
431	    $line=~/^(\S+)\s+([ a-zA-Z0-9.-]+?)(\s+\d+)?$/;
432	    $name=$1;
433	    $residues=$2;
434	    $residues=~tr/ //d;
435
436	    if ($block==1) {
437		$names[$k]=$name;
438		$seqs[$k]=$residues;
439	    } else {
440		$seqs[$k].=$residues;
441		if ($names[$k] ne $name) {
442		    print("WARNING: name of sequence $k in block 1 ($names[$k]) is not the same as in block $block ($name) in $infile\n");
443		}
444	    }
445#	    printf("%3i %3i %-16.16s %s\n",$block,$k,$names[$k],$seqs[$k]);
446	    $k++;
447	}
448	if ($k && $n && $n!=$k) {die("\nError: different number of sequences in blocks 1 and $block of $infile\n");}
449	if (!$n) {$n=$k;}
450    }
451
452    close INFILE;
453
454
455    # Empty input file?
456    if ($n==0) {die("\nERROR: input file $infile contains no sequences\n");}
457
458
459
460################################################################################################
461# Transforming to upper-case
462################################################################################################
463    if ($informat ne "a3m" && $informat ne "a2m") {	# Transform to upper case if input format is not A3M or A2M
464	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
465    }
466
467################################################################################################
468# Removing non-alphanumeric symbols
469################################################################################################
470    for ($k=0; $k<$n; $k++) {
471	$seqs[$k]=~tr/A-Za-z0-9.~-//cd;
472	$seqs[$k]=~tr/~/-/;
473    }
474
475
476################################################################################################
477# Filling up with gaps '.' or deleting gaps
478################################################################################################
479
480    # Insertion of '.' only needed for a3m if: NOT -r option  OR  -M first  OR  -M <int> option
481    if ($informat eq "a3m" && (!$remove_inserts || $matchmode))
482    {
483	print("inserting gaps...\n");
484	my @len_ins;   # $len_ins[$j] will count the maximum number of inserted residues after match state $i.
485	my $j;       # counts match states
486	my @inserts; # $inserts[$j] contains the insert (in small case) of sequence $i after the $j'th match state
487	my $insert;
488
489	# Determine length of longest insert
490	for ($k=0; $k<$n; $k++)
491	{
492	    # split into list of single match states and variable-length inserts
493	    # ([A-Z]|-) is the split pattern. The parenthesis indicate that split patterns are to be included as list elements
494	    # The '#' symbol is prepended to get rid of a perl bug in split
495	    @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
496	    $j=0;
497#	printf("Sequence $k: $seqs[$k]\n");
498#	printf("Sequence $k: @inserts\n");
499
500	    # Does sequence $k contain insert after match state j that is longer than $ngap[$j]?
501	    foreach $insert (@inserts)
502	    {
503		if( !defined $len_ins[$j] || length($insert)>$len_ins[$j]) {$len_ins[$j]=length($insert);}
504		$j++;
505#	    printf("$insert|");
506	    }
507#	printf("\n");
508	}
509	my $ngap;
510
511	# Fill up inserts with gaps '.' up to length $len_ins[$j]
512	for ($k=0; $k<$n; $k++)
513	{
514	    # split into list of single match states and variable-length inserts
515	    @inserts = split(/([A-Z]|-|~|[0-9])/,"#".$seqs[$k]."#");
516	    $j=0;
517
518	    # append the missing number of gaps after each match state
519	    foreach $insert (@inserts)
520	    {
521		for (my $l=length($insert); $l<$len_ins[$j]; $l++) {$insert.=".";}
522		$j++;
523	    }
524	    $seqs[$k] = join("",@inserts);
525	    $seqs[$k] =~ tr/\#//d; # remove the '#' symbols inserted at the beginning and end
526	}
527    }
528
529
530################################################################################################
531# Match state assignment
532################################################################################################
533
534    # Default match state rule for a3m is -M first
535    if ($matchmode eq "" && ($outformat eq "a3m" || $outformat eq "a2m")) {$matchmode="first";}
536
537    # Use gap rule for match state assignment?
538    if ($matchmode eq "gaprule") {
539
540	my @gaps=();
541	my $residues;
542	my @residues;
543
544	# Determine number of gaps per column
545	for ($k=0; $k<$n; $k++) {
546	    @residues=unpack("C*",$seqs[$k]);
547	    for (my $l=0; $l<@residues; $l++) {
548		if ($residues[$l]==46 || $residues[$l]==45) {
549		    if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
550		}
551	    }
552	}
553
554	# Set columns with more gaps than $match_gaprule to lower case,
555	for ($k=0; $k<$n; $k++) {
556	    @residues=unpack("C*",$seqs[$k]);
557	    $residues="";
558	    for (my $l=0; $l<@residues; $l++) {
559		if (!defined $gaps[$l] || $gaps[$l]<0.01*$match_gaprule*$n) {
560		    if ($residues[$l]==46) {
561			$residues .= "-";
562		    } else {
563			$residues .= uc(chr($residues[$l]));
564		    }
565		} else {
566		    if ($residues[$l]==45) {
567			$residues .= ".";
568		    } else {
569			$residues .= lc(chr($residues[$l]));
570		    }
571		}
572		$seqs[$k]=$residues;
573	    }
574	}
575    }
576
577    # Use first sequence for match state assignment?
578    if ($matchmode eq "first") {
579
580	my @match=();
581	my $residues;
582	my @residues;
583
584
585	# Determine which columns have a gap in first sequence
586	for ($k=0; $k<scalar(@names); $k++) {  #find seed sequence
587	    if ($names[$k]!~/^(ss_|aa_|sa_)/) {last;}
588	}
589	@residues=unpack("C*",$seqs[$k]);
590	for (my $l=0; $l<@residues; $l++) {
591	    if ($residues[$l]==46 || $residues[$l]==45) {$match[$l]=0;} else {$match[$l]=1;}
592	}
593
594	# Set columns without residue in first sequence to upper case,
595	for ($k=0; $k<$n; $k++) {
596	    @residues=unpack("C*",$seqs[$k]);
597	    $residues="";
598	    for (my $l=0; $l<@residues; $l++) {
599		if ($match[$l]) {
600		    if ($residues[$l]==46) {
601			$residues .= "-";
602		    } else {
603			$residues .= uc(chr($residues[$l]));
604		    }
605		} else {
606		    if ($residues[$l]==45) {
607			$residues .= ".";
608		    } else {
609			$residues .= lc(chr($residues[$l]));
610		    }
611		}
612		$seqs[$k]=$residues;
613	    }
614	}
615    }
616
617
618################################################################################################
619# Remove gaps etc.
620################################################################################################
621
622    # Remove columns with too many gaps?
623    if ($remove_gapped) {
624
625	my @gaps=();
626	my $residues;
627	my @residues;
628
629	# Determine number of gaps '.' or '-' per column
630	for ($k=0; $k<$n; $k++) {
631	    @residues=unpack("C*",$seqs[$k]);
632	    for (my $l=0; $l<@residues; $l++) {
633		if ($residues[$l]==45 || $residues[$l]==46) {
634		    if (defined $gaps[$l]) {$gaps[$l]++;} else {$gaps[$l]=1;}
635		}
636	    }
637	}
638
639	# Remove columns with too many gaps
640	for ($k=0; $k<$n; $k++) {
641	    @residues=unpack("C*",$seqs[$k]);
642	    $residues="";
643	    for (my $l=0; $l<@residues; $l++) {
644		if (!defined $gaps[$l] || $gaps[$l]<0.01*$remove_gapped*$n) {
645		    $residues .= chr($residues[$l])
646		}
647		$seqs[$k]=$residues;
648	    }
649	}
650    }
651
652    # Remove/transliterate gaps?
653    for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/ //d;}
654    if ($remove_inserts) {
655	for ($k=0; $k<$n; $k++) {
656	    $seqs[$k]=~tr/a-z.//d;
657#	    printf("%s\n",$seqs[$k]);
658	}
659    }
660
661
662    # Remove sequences which contain only gaps
663    my $nin=$n;
664    for ($k=0; $k<$n; $k++) {
665	if (($seqs[$k]=~tr/a-zA-Z0-9/a-zA-Z0-9/==0)) {
666	    if ($v>=2) {print("Sequence contains only gaps and is removed: $names[$k]\n");}
667	    splice(@seqs,$k,1);
668	    splice(@names,$k,1);
669	    $k--; $n--;
670	}
671    }
672
673
674    # Cut down sequence names to $desclen characters maximum
675    for ($k=0; $k<$n; $k++) {$names[$k]=substr($names[$k],0,$desclen);}
676
677    if ($outformat eq "a3m") {
678	# Remove all '.' gaps
679	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/.//d;}
680    } elsif ($outformat eq "fas" || $outformat eq "clu" || $outformat eq "sto" || $outformat eq "psi" ) {
681	# Substitute all '.' by '-'
682	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/./-/;}
683    }
684    if ($gap ne "default") {
685	for ($k=0; $k<$n; $k++) {$seqs[$k]=~s/\./$gap/g; $seqs[$k]=~s/-/$gap/g;}
686    }
687    if ($case eq "uc") {
688	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/a-z/A-Z/;}
689    } elsif ($case eq "lc") {
690	for ($k=0; $k<$n; $k++) {$seqs[$k]=~tr/A-Z/a-z/;}
691    }
692
693
694
695    ####################################################
696    # Check that sequences have same length
697    ####################################################
698    if ($outformat ne "a3m" && $outformat ne "ufas") {
699	my $len=length($seqs[0]);
700	for($k=1; $k<$n; $k++) {
701  	    if (length($seqs[$k])!=$len) {
702		printf("\nError: Sequences in $infile do not all have same length, e.g. >%-.20s  (len=%i)  and  >%-.20s  (len=%i)\n",
703		       $names[0],$len,$names[$k],length($seqs[$k]));
704		if ($v>=3) {
705		    printf("%.20s %s\n%.20s %s\n\n",$names[0],$seqs[0],$names[$k],$seqs[$k]);
706		}
707		exit 1;
708	    }
709	}
710    }
711
712
713    ####################################################
714    # Remove html tags
715    ####################################################
716    for($k=0; $k<$n; $k++) {
717	$names[$k]=~s/<[A-Za-z\/].*?>//g; # remove html tags
718    }
719
720
721
722################################################################################################
723# Output part
724################################################################################################
725
726    my $ndssp=-1;
727    my $nsa=-1;
728    my $npred=-1;
729    my $nconf=-1;
730    my $nquery=-1;
731    for ($k=0; $k<$n; $k++) {
732	if ($names[$k]=~/^ss_dssp/){$ndssp=$k; }
733	elsif ($names[$k]=~/^sa_dssp/){$nsa=$k; }
734	elsif ($names[$k]=~/^ss_pred/){$npred=$k; }
735	elsif ($names[$k]=~/^ss_conf/){$nconf=$k; }
736	elsif ($nquery==-1 && $names[$k]!~/^aa_/) {$nquery=$k;}
737    }
738
739    #Write sequences into output file
740    open (OUTFILE, ">$outfile") or die ("cannot open $outfile:$!\n");
741    if ($outformat eq "sto" || $outformat eq "psi") {
742	my $refline;
743	if ($outformat eq "sto") {
744	    print(OUTFILE "# STOCKHOLM 1.0\n\n");
745
746	    # Write reference annotation line for match states (-M first)
747	    if (!$lname) {$lname=32;}
748	    $names[$nquery]=~/^\S+\s+(.*)/;
749	    printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GF DE",$1);
750	    $refline=$seqs[$nquery];
751	    $refline=~s/[a-z]/-/g;
752	    printf(OUTFILE "%-$lname.$lname"."s %s\n","#=GC RF",$refline);
753	    if ($ndssp>=0) {
754		printf(OUTFILE "%-32.32s %s\n","#=GC SS_cons",$seqs[$ndssp]);
755	    }
756	}
757	if ($num) {
758	    my $num=2;
759	    for ($k=0; $k<$n; $k++) {
760		if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
761		$names[$k]=~s/^(\S+)\#\d+/$1/;
762		$names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
763		$num++;
764	    }
765	}
766	for ($k=0; $k<$n; $k++) {
767	    if ($k==$ndssp || $k==$npred || $k==$nconf) {next;}
768	    $names[$k] =~ /\s*(\S+)/;
769	    if (!$lname) {$lname=32;}
770	    printf(OUTFILE "%-$lname.$lname"."s %s\n",$1,$seqs[$k]);
771	}
772	if ($outformat eq "sto") {print(OUTFILE "//\n");}
773    } elsif ($outformat eq "clu") {
774	printf(OUTFILE "CLUSTAL\n\n\n");
775	if ($num) {
776	    my $num=2;
777	    for ($k=0; $k<$n; $k++) {
778		if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
779		$names[$k]=~s/^(\S+)\#\d+/$1/;
780		$names[$k]=~s/^(\S{1,10})\S*/$1\#$num/;
781		$num++;
782	    }
783	}
784	while($seqs[0] ne "") {                         # While there are still residues left, write new block of sequences
785	    for ($k=0; $k<scalar(@names); $k++) {       # Go through all sequences
786		$names[$k] =~ s/\s*(\S+).*/$1/;
787		$seqs[$k]=~s/(\S{1,$numres})//;           # remove the leftmost (up to) 60 residues from sequence $nseq
788		if (!$lname) {$lname=18;}
789		printf(OUTFILE "%-$lname.$lname"."s %s\n",$names[$k],$1); # and print them after the sequence name
790	    }
791	    print(OUTFILE "\n");                            # leave a blank line after each block of 60 residues
792	}
793    } else {
794	if ($num) {
795	    my $num=2;
796	    for ($k=0; $k<$n; $k++) {
797		if ($k==$ndssp || $k==$npred || $k==$nconf || $k==$nquery) {next;}
798		$names[$k]=~s/^(\S+)\#\d+/$1/;
799		$names[$k]=~s/^(\S{1,25})\S+/$1\#$num/;
800#		$names[$k]=~s/^(\S+)/$1\#$num/;
801		$num++;
802	    }
803	}
804	if ($titleline ne "" && $outformat eq "a3m") {
805	    printf(OUTFILE "%s\n",$titleline);
806	}
807	for ($k=0; $k<$n; $k++) {
808	    $seqs[$k]=~s/(\S{$numres})/$1\n/g;
809	    printf(OUTFILE ">%s\n%s\n",$names[$k],$seqs[$k]);
810	}
811    }
812
813    close OUTFILE;
814    if ($v>=2) {
815	if ($nin==1) {print("Reformatted $infile with 1 sequence from $informat to $outformat and written to file $outfile\n");}
816	else {
817	    if (!$nin==$n) {printf("Removed %i sequences which contained no residues\n",$nin-$n); }
818	    print("Reformatted $infile with $n sequences from $informat to $outformat and written to file $outfile\n");
819	}
820    }
821
822    return;
823}
824