1#!/usr/bin/perl -w 2 3use Getopt::Long; 4use Pod::Usage; 5#use POSIX; 6#use GD; 7use strict; 8 9my $ssFile='alirna.ps'; 10my $columnWidth=120; 11my $formatT=0; 12my $man=0; 13my $ruler=0; 14my $resnum=0; 15 16GetOptions ('-s:s' => \$ssFile, 17 '-c:i' => \$columnWidth, 18 '-t' => \$formatT, 19 '-r' => \$ruler, 20 '-n' => \$resnum, 21 "help"=> \$man, 22 "man" => \$man, 23 "h"=> sub {pod2usage(-verbose => 1)}, 24 ) || pod2usage(-verbose => 0); 25 26pod2usage(-verbose => 2) if $man; 27 28if (!-e $ssFile) { 29 pod2usage(-msg => "RNAalifold secondary structure file $ssFile not found (use option -s)", 30 -verbose => 0); 31 exit(1); 32} 33 34my $aln=readClustal(); 35 36my @ss=(); 37for my $i (1..length($aln->[0]->{seq})) { 38 push @ss,'.'; 39} 40 41my $consStruc=''; 42 43open(ALIRNA,"<$ssFile"); 44my $pairsFlag=0; 45my $sequenceFlag=0; 46while (<ALIRNA>) { 47 $pairsFlag=1 if (/\/pairs/); 48 if ($pairsFlag and /\[(\d+) (\d+)\]/) { 49 $ss[$1-1]='('; 50 $ss[$2-1]=')'; 51 } 52 $pairsFlag=0 if ($pairsFlag and /def/); 53} 54 55$consStruc=join('',@ss); 56 57for my $row (@$aln) { 58 $row->{seq}=uc($row->{seq}); 59 if ($formatT) { 60 $row->{seq}=~s/U/T/g; 61 } else { 62 $row->{seq}=~s/T/U/g; 63 } 64} 65 66 67my $consSeq=consensusSeq($aln); 68 69#print "$consSeq\n$consStruc\n"; 70 71plotAln($aln,$consSeq,$consStruc); 72 73 74 75###################################################################### 76# 77# consensusSeq(\@aln alnref) 78# 79# Returns consensus sequence of alignment 80# 81###################################################################### 82 83sub consensusSeq{ 84 85 my @aln=@{$_[0]}; 86 my $out=''; 87 88 for my $i (0..length($aln[0]->{seq})-1) { 89 90 my %countHash=('A'=>0,'C'=>0,'G'=>0,'T'=>0,'U'=>0,'-'=>0); 91 #print %countHash,"\n"; 92 for my $j (0..$#aln) { 93 my $c=substr($aln[$j]->{seq},$i,1); 94 $countHash{$c}++; 95 } 96 #print %countHash,"\n"; 97 my $maxCount=0; 98 my $maxChar=''; 99 100 for my $c ('A','C','G','T','U','-') { 101 if ($countHash{$c}>=$maxCount) { 102 $maxChar=$c; 103 $maxCount=$countHash{$c}; 104 } 105 } 106 $out.=$maxChar; 107 } 108 return $out; 109} 110 111 112###################################################################### 113# 114# readClustal(filehandle) 115# 116# Reads Clustal W formatted alignment file and returns it in list of 117# hash references with keys "name" and "seq" for the name and the sequence, 118# resp. 119# 120###################################################################### 121 122sub readClustal{ 123 # my $fh=shift; 124 my @out=(); 125 my (%order, $order, %alignments); 126 while (<>) { 127 next if ( /^\s+$/ ); 128 my ($seqname, $aln_line) = ('', ''); 129 if ( /^\s*(\S+)\s*\/\s*(\d+)-(\d+)\s+(\S+)\s*$/ ) { 130 # clustal 1.4 format 131 ($seqname,$aln_line) = ("$1/$2-$3",$4); 132 } elsif ( /^(\S+)\s+([A-Z\-]+)\s*\d*$/ ) { 133 ($seqname,$aln_line) = ($1,$2); 134 } else { 135 next; 136 } 137 if ( !exists $order{$seqname} ) { 138 $order{$seqname} = $order++; 139 } 140 $alignments{$seqname} .= $aln_line; 141 } 142 143 foreach my $name ( sort { $order{$a} <=> $order{$b} } keys %alignments ) { 144 if ( $name =~ /(\S+):(\d+)-(\d+)/ ) { 145 (my $sname,my $start, my $end) = ($1,$2,$3); 146 } else { 147 (my $sname, my $start) = ($name,1); 148 my $str = $alignments{$name}; 149 $str =~ s/[^A-Za-z]//g; 150 my $end = length($str); 151 } 152 my $seq=$alignments{$name}; 153 push @out, {name=>$name,seq=>$seq}; 154 } 155 return [@out]; 156} 157 158###################################################################### 159# 160# getPairs(\@aln alnref, $ss string) 161# 162# Evalutates the pairing of an alignment according to a given 163# consensus secondary structure 164# 165# Returns list of all base pairs which is a hash with the following 166# keys: 167# 168# open ... column in the alignment of the first base in the pair "(" 169# close ... column in the alignment of the second base in the pair ")" 170# all ... list of all basepairs in the different sequences in the alignment 171# pairing ... list of all different pairing basepairs 172# nonpairing ... list of all incompatible basepairs 173# 174###################################################################### 175 176sub getPairs{ 177 178 my @inputAln=@{$_[0]}; 179 my $ss=$_[1]; 180 181 # return nothing if there are no pairs 182 if (!($ss=~tr/(/(/)) { 183 return (); 184 } 185 186 my @aln=(); 187 foreach my $row (@inputAln) { 188 my $seq=$row->{seq}; 189 $seq=uc($seq); 190 $seq=~s/T/U/g; 191 my @tmp=split(//,$seq); 192 push @aln,\@tmp; 193 } 194 my @ss=split(//,$ss); 195 196 my @pairs=(); 197 my @stack=(); 198 199 foreach my $column (0..$#ss) { 200 201 my $currChar=$ss[$column]; 202 203 if ($currChar eq '(') { 204 push @stack,$column; 205 } 206 207 if ($currChar eq ')') { 208 my $openedCol=pop @stack; 209 push @pairs,{open=>$openedCol,close=>$column}; 210 } 211 } 212 213 @pairs=sort {$a->{open} <=> $b->{open}} @pairs; 214 215 foreach my $i (0..$#pairs) { 216 #print "$i: $pairs[$i]->{open} - $pairs[$i]->{close}\n"; 217 218 my @all=(); 219 my @pairing=(); 220 my @nonpairing=(); 221 222 for my $j (0..$#aln) { 223 my $currPair=$aln[$j][$pairs[$i]->{open}].$aln[$j][$pairs[$i]->{close}]; 224 push @all,$currPair; 225 } 226 227 for my $pair (@all) { 228 if (($pair eq 'AU') or 229 ($pair eq 'UA') or 230 ($pair eq 'GC') or 231 ($pair eq 'CG') or 232 ($pair eq 'UG') or 233 ($pair eq 'GU')) { 234 235 push @pairing, $pair; 236 } elsif ($pair eq "--") { 237 # do nothing 238 } else { 239 push @nonpairing,$pair; 240 } 241 } 242 243 undef my %saw; 244 my @uniquePairing = grep(!$saw{$_}++, @pairing); 245 246 $pairs[$i]->{all}=[@all]; 247 $pairs[$i]->{pairing}=[@uniquePairing]; 248 $pairs[$i]->{nonpairing}=[@nonpairing]; 249 } 250 251 return @pairs; 252 253} 254 255 256###################################################################### 257# 258# plotAln(\@aln ref-to-alignment, $consensus string, $ss string) 259# 260# Creates a image of the alignment with various annotations 261# 262# \@aln ... alignment in list of hash format 263# $consensus ... consensus sequence 264# $ss ... consensus secondary structure in dot bracket notation 265# 266# Returns png as string. 267# 268###################################################################### 269 270sub plotAln{ 271 272 my @aln=@{$_[0]}; 273 my $consensus=$_[1]; 274 my $ss=$_[2]; 275 276 my $ps=''; 277 278 # Get important measures 279 (my $fontWidth, my $fontHeight) = (6,6.5); 280 281 my $length=length($aln[0]->{seq}); 282 my $maxName=0; 283 foreach my $row (@aln) { 284 $maxName=length($row->{name}) if (length($row->{name})>$maxName); 285 } 286 $maxName = 5 if $maxName<5 && $ruler; 287 288 # Custom Sizes 289 my $lineStep=$fontHeight+2; # distance between lines 290 my $blockStep=3.5*$fontHeight; # distance between blocks 291 my $consStep=$fontHeight*0.5; # distance between alignment and conservation curve 292 my $ssStep=2; 293 my $nameStep=3*$fontWidth; # distance between names and sequences 294 my $maxConsBar=2.5*$fontHeight; # Height of conservation curve 295 my $startY=2; # "y origin" 296 my $namesX=$fontWidth; # "x origin" 297 my $seqsX=$namesX+$maxName*$fontWidth+$nameStep; # x-coord. where sequences start 298 299 # Calculate width and height 300 301 my $tmpColumns=$columnWidth; 302 303 $tmpColumns=$length if ($length<$columnWidth); 304 305 my $imageWidth = int(0.9+$namesX+($maxName+$tmpColumns)*$fontWidth+$nameStep+$fontWidth); 306 $imageWidth += int($fontWidth + length("$length")*$fontWidth) if $resnum; 307 my $numBlock = int(1+($length-1)/$columnWidth); 308 my $imageHeight = int($startY+$numBlock*((@aln+1+$ruler)*$lineStep+$blockStep+$consStep+$ssStep)); 309 310 311 my $white="1 1 1"; 312 my $black="0 0 0" ; 313 my $grey="1 1 1"; 314 315 my $red="0.0 1"; 316 my $ocre="0.16 1"; 317 my $green="0.32 1"; 318 my $turq="0.48 1"; 319 my $blue="0.65 1"; 320 my $violet="0.81 1"; 321 322 323 my $red1="0.0 0.6"; 324 my $ocre1="0.16 0.6"; 325 my $green1="0.32 0.6"; 326 my $turq1="0.48 0.6"; 327 my $blue1="0.65 0.6"; 328 my $violet1="0.81 0.6"; 329 330 my $red2="0.0 0.2"; 331 my $ocre2="0.16 0.2"; 332 my $green2="0.32 0.2"; 333 my $turq2="0.48 0.2"; 334 my $blue2="0.65 0.2"; 335 my $violet2="0.81 0.2"; 336 337 338 my @colorMatrix=([$red,$red1,$red2], 339 [$ocre,$ocre1,$ocre2], 340 [$green,$green1,$green2], 341 [$turq,$turq1,$turq2], 342 [$blue,$blue1,$blue2], 343 [$violet,$violet1,$violet2]); 344 345 my @pairs=getPairs(\@aln,$ss); 346 347 foreach my $pair (@pairs) { 348 349 foreach my $column ($pair->{open},$pair->{close}) { 350 my $block = int(0.999+($column+1)/$columnWidth); 351 my $effCol=$column-($block-1)*$columnWidth; 352 my $x=$seqsX+$effCol*$fontWidth; 353 354 foreach my $row (0..$#aln) { 355 356 my $pairing=@{$pair->{pairing}}; 357 my $nonpairing=@{$pair->{nonpairing}}; 358 359 my $color; 360 361 if ($nonpairing <=2) { 362 $color=$colorMatrix[$pairing-1][$nonpairing]; 363 364 if (($pair->{all}->[$row] eq 'AU') or 365 ($pair->{all}->[$row] eq 'UA') or 366 ($pair->{all}->[$row] eq 'GC') or 367 ($pair->{all}->[$row] eq 'CG') or 368 ($pair->{all}->[$row] eq 'GU') or 369 ($pair->{all}->[$row] eq 'UG')) { 370 371 my $y=$startY+($block-1)*($lineStep*(@aln+1+$ruler)+$blockStep+$consStep)+$ssStep*($block)+($row+1)*$lineStep; 372 373 my $xtmp=$x+$fontWidth; 374 my $ytmp1=$y-1; 375 my $ytmp2=$y+$fontHeight+1; 376 $ps.="$x $ytmp1 $xtmp $ytmp2 $color box\n"; 377 } 378 } 379 } 380 } 381 } 382 # Calculate conservation scores as (M+1)/(N+1), where M is the 383 # number of matches to the consensus and N is the number of 384 # sequences in the alignment 385 386 my @conservation=(); # each column one score 387 388 for my $column (0..$length-1) { 389 390 my $consChar=substr($consensus,$column,1); 391 392 # if consensus is gap, score=0 393 if ($consChar eq "-" or $consChar eq "_") { 394 push @conservation, 0; 395 next; 396 } 397 398 my $match=0; 399 for my $row (0..$#aln) { 400 my $currChar=substr($aln[$row]->{seq},$column,1); 401 $match++ if ($currChar eq $consChar); 402 } 403 404 my $score=($match-1)/(@aln-1); 405 push @conservation,$score; 406 } 407 408 # Draw the alignments in chunks 409 my $currY=$startY; 410 my $currPos=0; 411 412 while ($currPos<$length) { 413 414 # secondary structure in first line 415 #$out->string($font,$seqsX,$currY,substr($ss,$currPos,$columnWidth),$black); 416 my $tmpSeq=substr($ss,$currPos,$columnWidth); 417 $tmpSeq=~s/\(/\\\(/g; 418 $tmpSeq=~s/\)/\\\)/g; 419 $ps .= "0 setgray\n"; 420 $ps.="($tmpSeq) $seqsX $currY string\n"; 421 $currY+=$lineStep+$ssStep; 422 423 # sequences labeled only with the organism-specifier 424 foreach my $row (@aln) { 425 #$out->string($font,$namesX,$currY,$row->{name},$black); 426 $ps.="($row->{name}) $namesX $currY string\n"; 427 #$out->string($font,$seqsX,$currY,substr($row->{seq},$currPos,$columnWidth),$black); 428 my $tmpSeq=substr($row->{seq},$currPos,$columnWidth); 429 $ps.="($tmpSeq) $seqsX $currY string\n"; 430 if ($resnum) { 431 my $numX=$seqsX+(length($tmpSeq)+1)*$fontWidth; # x-coord. where residue nums start 432 $tmpSeq = substr($row->{seq},0, $currPos+$columnWidth); 433 my $p = length($tmpSeq) - $tmpSeq =~ tr/_-//; # remove gaps 434 $ps .= sprintf("(%" . length($length) . "d) $numX $currY string\n", $p); 435 } 436 $currY+=$lineStep; 437 } 438 if ($ruler) { 439 $ps .= "(ruler) $namesX $currY string\n"; 440 my $p = 10 * (int(1.5+$currPos/10)); 441 my $r = "." x ($p-$currPos-length("$p")); 442 $r .= $p; $p+=10; 443 my $maxp = $currPos+$columnWidth; 444 $maxp = $length if $maxp>$length; 445 for (;$p<=$maxp; $p+=10) { 446 $r .= sprintf("%10d", $p); 447 } 448 $r =~ tr/ /./; 449 $r .= "." x ($maxp-$p+10) if $p>$maxp; 450 $ps .= "($r) $seqsX $currY string\n"; 451 $currY+=$lineStep; 452 } 453 454 # conservation curve 455 $currY+=$consStep; 456 $ps .= "0.6 setgray\n"; 457 for my $col ($currPos..$currPos+$columnWidth-1) { 458 my $score=shift @conservation; 459 last if (!defined $score); 460 my $barHeight=$maxConsBar*$score; 461 $barHeight=1 if ($barHeight==0); 462 my $x=$seqsX+($col-($columnWidth*int($currPos/$columnWidth)))*$fontWidth; 463 464 my $ytmp1=$currY+$maxConsBar-$barHeight; 465 my $xtmp2=$x+$fontWidth; 466 my $ytmp2=$currY+$maxConsBar; 467 $ps .= "$x $ytmp1 $xtmp2 $ytmp2 box2\n"; 468 } 469 $currY+=$blockStep; 470 $currPos+=$columnWidth; 471 } 472 473 my $BB="2 2 $imageWidth $imageHeight"; 474 475 while (<DATA>) { 476 s/!BB!/$BB/; 477 s/!HEIGHT!/$imageHeight/; 478 print; 479 } 480 print $ps; 481 print "showpage\n"; 482} 483 484=head1 NAME 485 486coloraln.pl - colorize an alignment with consensus structure 487 488=head1 SYNOPSIS 489 490 coloraln.pl [-r] [-n] [-c columns] [-s structure.ps] file.aln 491 492=head1 DESCRIPTION 493 494colorrna.pl reads an alignment in CLUSTAL format, and a consensus 495secondary structure (which it extracts from a postscript secondary 496structure plot). It produces a postscript figure of the alignment, in 497which compensatory mutation supporting the consensus structure are 498marked by color. The color scheme is the same employed by RNAalifold 499and alidot: Red marks pairs with no sequence variation; ochre, green, 500turquoise, blue, and violet mark pairs with 2,3,4,5,6 different tpyes 501of pairs, respectively. 502 503=head1 OPTIONS 504 505=over 4 506 507=item B<-s> I<file> 508 509read I<file> to extract the consensus structure (default: C<alirna.ps>) 510 511=item B<-c> I<width> 512 513break alignments into blocks of at most I<width> columns, (default: 120) 514 515=item B<-t> 516 517suppress conversion of C<T> to C<U>, i.e. do not convert DNA to 518RNA, (default: convert to C<U>) 519 520=item B<-r> 521 522add a "ruler" below the alignment showing sequence positions 523 524=item B<-n> 525 526write sequence position at the end of each sequence line 527 528=back 529 530=cut 531 532 533__DATA__ 534%!PS-Adobe-3.0 EPSF-3.0 535%%BoundingBox: !BB! 536%%EndComments 537 538% draws ViennaRNA like colored boxes 539/box { % x1 y1 x2 y2 hue saturation 540 gsave 541 dup 0.3 mul 1 exch sub sethsbcolor 542 exch 3 index sub exch 2 index sub rectfill 543 grestore 544} def 545 546% draws a box in current color 547/box2 { % x1 y1 x2 y2 548 exch 3 index sub exch 2 index sub rectfill 549} def 550 551/string { % (Text) x y 552 6 add 553 moveto 554 show 555} def 556 5570 !HEIGHT! translate 5581 -1 scale 559/Courier findfont 560[10 0 0 -10 0 0] makefont setfont 561 562%100 100 106 110 0.5 0.5 1 box 563%(A) 100 100 string 564%(This is a string) 200 200 string 565