1#!/usr/bin/perl -w
2
3use Getopt::Long;
4use Pod::Usage;
5#use POSIX;
6#use GD;
7use strict;
8
9my $ssFile='alirna.ps';
10my $columnWidth=120;
11my $formatT=0;
12my $man=0;
13my $ruler=0;
14my $resnum=0;
15
16GetOptions ('-s:s' => \$ssFile,
17	    '-c:i' => \$columnWidth,
18	    '-t' => \$formatT,
19	    '-r' => \$ruler,
20	    '-n' => \$resnum,
21	    "help"=> \$man,
22	    "man" => \$man,
23	    "h"=> sub {pod2usage(-verbose => 1)},
24	   ) || pod2usage(-verbose => 0);
25
26pod2usage(-verbose => 2) if $man;
27
28if (!-e $ssFile) {
29    pod2usage(-msg => "RNAalifold secondary structure file $ssFile not found (use option -s)",
30	      -verbose => 0);
31  exit(1);
32}
33
34my $aln=readClustal();
35
36my @ss=();
37for my $i (1..length($aln->[0]->{seq})) {
38  push @ss,'.';
39}
40
41my $consStruc='';
42
43open(ALIRNA,"<$ssFile");
44my $pairsFlag=0;
45my $sequenceFlag=0;
46while (<ALIRNA>) {
47  $pairsFlag=1 if (/\/pairs/);
48  if ($pairsFlag and /\[(\d+) (\d+)\]/) {
49    $ss[$1-1]='(';
50    $ss[$2-1]=')';
51  }
52  $pairsFlag=0 if ($pairsFlag and /def/);
53}
54
55$consStruc=join('',@ss);
56
57for my $row (@$aln) {
58  $row->{seq}=uc($row->{seq});
59  if ($formatT) {
60    $row->{seq}=~s/U/T/g;
61  } else {
62    $row->{seq}=~s/T/U/g;
63  }
64}
65
66
67my $consSeq=consensusSeq($aln);
68
69#print "$consSeq\n$consStruc\n";
70
71plotAln($aln,$consSeq,$consStruc);
72
73
74
75######################################################################
76#
77# consensusSeq(\@aln alnref)
78#
79# Returns consensus sequence of alignment
80#
81######################################################################
82
83sub consensusSeq{
84
85  my @aln=@{$_[0]};
86  my $out='';
87
88  for my $i (0..length($aln[0]->{seq})-1) {
89
90    my %countHash=('A'=>0,'C'=>0,'G'=>0,'T'=>0,'U'=>0,'-'=>0);
91    #print %countHash,"\n";
92    for my $j (0..$#aln) {
93      my $c=substr($aln[$j]->{seq},$i,1);
94      $countHash{$c}++;
95    }
96    #print %countHash,"\n";
97    my $maxCount=0;
98    my $maxChar='';
99
100    for my $c ('A','C','G','T','U','-') {
101      if ($countHash{$c}>=$maxCount) {
102	$maxChar=$c;
103	$maxCount=$countHash{$c};
104      }
105    }
106    $out.=$maxChar;
107  }
108  return $out;
109}
110
111
112######################################################################
113#
114# readClustal(filehandle)
115#
116# Reads Clustal W formatted alignment file and returns it in list of
117# hash references with keys "name" and "seq" for the name and the sequence,
118# resp.
119#
120######################################################################
121
122sub readClustal{
123  #  my $fh=shift;
124  my @out=();
125  my (%order, $order, %alignments);
126  while (<>) {
127    next if ( /^\s+$/ );
128    my ($seqname, $aln_line) = ('', '');
129    if ( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) {
130      # clustal 1.4 format
131      ($seqname,$aln_line) = ("$1/$2-$3",$4);
132    } elsif ( /^(\S+)\s+([A-Z\-]+)\s*\d*$/ ) {
133	  ($seqname,$aln_line) = ($1,$2);
134	} else {
135	  next;
136  }
137	  if ( !exists $order{$seqname} ) {
138	    $order{$seqname} = $order++;
139	  }
140    $alignments{$seqname} .= $aln_line;
141  }
142
143  foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) {
144    if ( $name =~ /(\S+):(\d+)-(\d+)/ ) {
145      (my $sname,my $start, my $end) = ($1,$2,$3);
146    } else {
147      (my $sname, my $start) = ($name,1);
148      my $str  = $alignments{$name};
149      $str =~ s/[^A-Za-z]//g;
150      my $end = length($str);
151    }
152    my $seq=$alignments{$name};
153    push @out, {name=>$name,seq=>$seq};
154  }
155  return [@out];
156}
157
158######################################################################
159#
160# getPairs(\@aln alnref, $ss string)
161#
162# Evalutates the pairing of an alignment according to a given
163# consensus secondary structure
164#
165# Returns list of all base pairs which is a hash with the following
166# keys:
167#
168#  open ... column in the alignment of the first base in the pair "("
169#  close ... column in the alignment of the second base in the pair ")"
170#  all ... list of all basepairs in the different sequences in the alignment
171#  pairing ... list of all different pairing basepairs
172#  nonpairing ... list of all incompatible basepairs
173#
174######################################################################
175
176sub getPairs{
177
178  my @inputAln=@{$_[0]};
179  my $ss=$_[1];
180
181  # return nothing if there are no pairs
182  if (!($ss=~tr/(/(/)) {
183    return ();
184  }
185
186  my @aln=();
187  foreach my $row (@inputAln) {
188    my $seq=$row->{seq};
189    $seq=uc($seq);
190    $seq=~s/T/U/g;
191    my @tmp=split(//,$seq);
192    push @aln,\@tmp;
193  }
194  my @ss=split(//,$ss);
195
196  my @pairs=();
197  my @stack=();
198
199  foreach my $column (0..$#ss) {
200
201    my $currChar=$ss[$column];
202
203    if ($currChar eq '(') {
204      push @stack,$column;
205    }
206
207    if ($currChar eq ')') {
208      my $openedCol=pop @stack;
209      push @pairs,{open=>$openedCol,close=>$column};
210    }
211  }
212
213  @pairs=sort {$a->{open} <=> $b->{open}} @pairs;
214
215  foreach my $i (0..$#pairs) {
216    #print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n";
217
218    my @all=();
219    my @pairing=();
220    my @nonpairing=();
221
222    for my $j (0..$#aln) {
223      my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}];
224      push @all,$currPair;
225    }
226
227    for my $pair (@all) {
228      if (($pair eq 'AU') or
229	  ($pair eq 'UA') or
230	  ($pair eq 'GC') or
231	  ($pair eq 'CG') or
232	  ($pair eq 'UG') or
233	  ($pair eq 'GU')) {
234
235	push @pairing, $pair;
236      } elsif ($pair eq "--") {
237	# do nothing
238      } else {
239	push @nonpairing,$pair;
240      }
241    }
242
243    undef my %saw;
244    my @uniquePairing = grep(!$saw{$_}++, @pairing);
245
246    $pairs[$i]->{all}=[@all];
247    $pairs[$i]->{pairing}=[@uniquePairing];
248    $pairs[$i]->{nonpairing}=[@nonpairing];
249  }
250
251  return @pairs;
252
253}
254
255
256######################################################################
257#
258# plotAln(\@aln ref-to-alignment, $consensus string, $ss string)
259#
260# Creates a image of the alignment with various annotations
261#
262# \@aln ... alignment in list of hash format
263# $consensus ... consensus sequence
264# $ss ... consensus secondary structure in dot bracket notation
265#
266# Returns png as string.
267#
268######################################################################
269
270sub plotAln{
271
272  my @aln=@{$_[0]};
273  my $consensus=$_[1];
274  my $ss=$_[2];
275
276  my $ps='';
277
278  # Get important measures
279  (my $fontWidth, my $fontHeight) = (6,6.5);
280
281  my $length=length($aln[0]->{seq});
282  my $maxName=0;
283  foreach my $row (@aln) {
284    $maxName=length($row->{name}) if (length($row->{name})>$maxName);
285  }
286  $maxName = 5 if $maxName<5 && $ruler;
287
288  # Custom Sizes
289  my $lineStep=$fontHeight+2;	# distance between lines
290  my $blockStep=3.5*$fontHeight; # distance between blocks
291  my $consStep=$fontHeight*0.5;	# distance between alignment and conservation curve
292  my $ssStep=2;
293  my $nameStep=3*$fontWidth;	# distance between names and sequences
294  my $maxConsBar=2.5*$fontHeight; # Height of conservation curve
295  my $startY=2;			# "y origin"
296  my $namesX=$fontWidth;	# "x origin"
297  my $seqsX=$namesX+$maxName*$fontWidth+$nameStep; # x-coord. where sequences start
298
299  # Calculate width and height
300
301  my $tmpColumns=$columnWidth;
302
303  $tmpColumns=$length if ($length<$columnWidth);
304
305  my $imageWidth = int(0.9+$namesX+($maxName+$tmpColumns)*$fontWidth+$nameStep+$fontWidth);
306  $imageWidth += int($fontWidth + length("$length")*$fontWidth) if $resnum;
307  my $numBlock = int(1+($length-1)/$columnWidth);
308  my $imageHeight = int($startY+$numBlock*((@aln+1+$ruler)*$lineStep+$blockStep+$consStep+$ssStep));
309
310
311  my $white="1 1 1";
312  my $black="0 0 0" ;
313  my $grey="1 1 1";
314
315  my $red="0.0 1";
316  my $ocre="0.16 1";
317  my $green="0.32 1";
318  my $turq="0.48 1";
319  my $blue="0.65 1";
320  my $violet="0.81 1";
321
322
323  my $red1="0.0 0.6";
324  my $ocre1="0.16 0.6";
325  my $green1="0.32 0.6";
326  my $turq1="0.48 0.6";
327  my $blue1="0.65 0.6";
328  my $violet1="0.81 0.6";
329
330  my $red2="0.0 0.2";
331  my $ocre2="0.16 0.2";
332  my $green2="0.32 0.2";
333  my $turq2="0.48 0.2";
334  my $blue2="0.65 0.2";
335  my $violet2="0.81 0.2";
336
337
338  my @colorMatrix=([$red,$red1,$red2],
339		   [$ocre,$ocre1,$ocre2],
340		   [$green,$green1,$green2],
341		   [$turq,$turq1,$turq2],
342		   [$blue,$blue1,$blue2],
343		   [$violet,$violet1,$violet2]);
344
345  my @pairs=getPairs(\@aln,$ss);
346
347  foreach my $pair (@pairs) {
348
349    foreach my $column ($pair->{open},$pair->{close}) {
350      my $block = int(0.999+($column+1)/$columnWidth);
351      my $effCol=$column-($block-1)*$columnWidth;
352      my $x=$seqsX+$effCol*$fontWidth;
353
354      foreach my $row (0..$#aln) {
355
356	my $pairing=@{$pair->{pairing}};
357	my $nonpairing=@{$pair->{nonpairing}};
358
359	my $color;
360
361	if ($nonpairing <=2) {
362	  $color=$colorMatrix[$pairing-1][$nonpairing];
363
364	  if (($pair->{all}->[$row] eq 'AU') or
365	      ($pair->{all}->[$row] eq 'UA') or
366	      ($pair->{all}->[$row] eq 'GC') or
367	      ($pair->{all}->[$row] eq 'CG') or
368	      ($pair->{all}->[$row] eq 'GU') or
369	      ($pair->{all}->[$row] eq 'UG')) {
370
371	    my $y=$startY+($block-1)*($lineStep*(@aln+1+$ruler)+$blockStep+$consStep)+$ssStep*($block)+($row+1)*$lineStep;
372
373	    my $xtmp=$x+$fontWidth;
374	    my $ytmp1=$y-1;
375	    my $ytmp2=$y+$fontHeight+1;
376	    $ps.="$x $ytmp1 $xtmp $ytmp2 $color box\n";
377	  }
378	}
379      }
380    }
381  }
382  # Calculate conservation scores as (M+1)/(N+1), where M is the
383  # number of matches to the consensus and N is the number of
384  # sequences in the alignment
385
386  my @conservation=();		# each column one score
387
388  for my $column (0..$length-1) {
389
390    my $consChar=substr($consensus,$column,1);
391
392    # if consensus is gap, score=0
393    if ($consChar eq "-" or $consChar eq "_") {
394      push @conservation, 0;
395      next;
396    }
397
398    my $match=0;
399    for my $row (0..$#aln) {
400      my $currChar=substr($aln[$row]->{seq},$column,1);
401      $match++ if ($currChar eq $consChar);
402    }
403
404    my $score=($match-1)/(@aln-1);
405    push @conservation,$score;
406  }
407
408  # Draw the alignments in chunks
409  my $currY=$startY;
410  my $currPos=0;
411
412  while ($currPos<$length) {
413
414    # secondary structure in first line
415    #$out->string($font,$seqsX,$currY,substr($ss,$currPos,$columnWidth),$black);
416    my $tmpSeq=substr($ss,$currPos,$columnWidth);
417    $tmpSeq=~s/\(/\\\(/g;
418    $tmpSeq=~s/\)/\\\)/g;
419    $ps .= "0 setgray\n";
420    $ps.="($tmpSeq) $seqsX $currY string\n";
421    $currY+=$lineStep+$ssStep;
422
423    # sequences labeled only with the organism-specifier
424    foreach my $row (@aln) {
425      #$out->string($font,$namesX,$currY,$row->{name},$black);
426      $ps.="($row->{name}) $namesX $currY string\n";
427      #$out->string($font,$seqsX,$currY,substr($row->{seq},$currPos,$columnWidth),$black);
428      my $tmpSeq=substr($row->{seq},$currPos,$columnWidth);
429      $ps.="($tmpSeq) $seqsX $currY string\n";
430      if ($resnum) {
431	my $numX=$seqsX+(length($tmpSeq)+1)*$fontWidth; # x-coord. where residue nums start
432	$tmpSeq = substr($row->{seq},0, $currPos+$columnWidth);
433	my $p = length($tmpSeq) - $tmpSeq =~ tr/_-//; # remove gaps
434	$ps .= sprintf("(%" . length($length) . "d) $numX $currY string\n", $p);
435      }
436      $currY+=$lineStep;
437    }
438    if ($ruler) {
439      $ps .= "(ruler) $namesX $currY string\n";
440      my $p = 10 * (int(1.5+$currPos/10));
441      my $r = "." x ($p-$currPos-length("$p"));
442      $r .= $p; $p+=10;
443      my $maxp = $currPos+$columnWidth;
444      $maxp = $length if $maxp>$length;
445      for (;$p<=$maxp; $p+=10) {
446	$r .= sprintf("%10d", $p);
447      }
448      $r =~ tr/ /./;
449      $r .= "." x ($maxp-$p+10) if $p>$maxp;
450      $ps .= "($r) $seqsX $currY string\n";
451      $currY+=$lineStep;
452    }
453
454    # conservation curve
455    $currY+=$consStep;
456    $ps .= "0.6 setgray\n";
457    for my $col ($currPos..$currPos+$columnWidth-1) {
458      my $score=shift @conservation;
459      last if (!defined $score);
460      my $barHeight=$maxConsBar*$score;
461      $barHeight=1 if ($barHeight==0);
462      my $x=$seqsX+($col-($columnWidth*int($currPos/$columnWidth)))*$fontWidth;
463
464      my $ytmp1=$currY+$maxConsBar-$barHeight;
465      my $xtmp2=$x+$fontWidth;
466      my $ytmp2=$currY+$maxConsBar;
467      $ps .= "$x $ytmp1 $xtmp2 $ytmp2 box2\n";
468    }
469    $currY+=$blockStep;
470    $currPos+=$columnWidth;
471  }
472
473  my $BB="2 2 $imageWidth $imageHeight";
474
475  while (<DATA>) {
476    s/!BB!/$BB/;
477    s/!HEIGHT!/$imageHeight/;
478    print;
479  }
480  print $ps;
481  print "showpage\n";
482}
483
484=head1 NAME
485
486coloraln.pl - colorize an alignment with consensus structure
487
488=head1 SYNOPSIS
489
490  coloraln.pl [-r] [-n] [-c columns] [-s structure.ps] file.aln
491
492=head1 DESCRIPTION
493
494colorrna.pl reads an alignment in CLUSTAL format, and a consensus
495secondary structure (which it extracts from a postscript secondary
496structure plot). It produces a postscript figure of the alignment, in
497which compensatory mutation supporting the consensus structure are
498marked by color. The color scheme is the same employed by RNAalifold
499and alidot: Red marks pairs with no sequence variation; ochre, green,
500turquoise, blue, and violet mark pairs with 2,3,4,5,6 different tpyes
501of pairs, respectively.
502
503=head1 OPTIONS
504
505=over 4
506
507=item B<-s> I<file>
508
509read I<file> to extract the consensus structure (default: C<alirna.ps>)
510
511=item B<-c> I<width>
512
513break alignments into blocks of at most I<width> columns, (default: 120)
514
515=item B<-t>
516
517suppress conversion of C<T> to C<U>, i.e. do not convert DNA to
518RNA, (default: convert to C<U>)
519
520=item B<-r>
521
522add a "ruler" below the alignment showing sequence positions
523
524=item B<-n>
525
526write sequence position at the end of each sequence line
527
528=back
529
530=cut
531
532
533__DATA__
534%!PS-Adobe-3.0 EPSF-3.0
535%%BoundingBox: !BB!
536%%EndComments
537
538% draws ViennaRNA like colored boxes
539/box { % x1 y1 x2 y2 hue saturation
540  gsave
541  dup 0.3 mul 1 exch sub sethsbcolor
542  exch 3 index sub exch 2 index sub rectfill
543  grestore
544} def
545
546% draws a box in current color
547/box2 { % x1 y1 x2 y2
548  exch 3 index sub exch 2 index sub rectfill
549} def
550
551/string { % (Text) x y
552  6 add
553  moveto
554  show
555} def
556
5570 !HEIGHT! translate
5581 -1 scale
559/Courier findfont
560[10 0 0 -10 0 0] makefont setfont
561
562%100 100 106 110 0.5 0.5 1 box
563%(A) 100 100 string
564%(This is a string) 200 200 string
565