1% talk.tex
2% > dvips -P pdf -ta4 talk -o talk.temp.ps
3% > psnup -1 -m1cm -W128mm -H96mm -pa4 talk.temp.ps talk.handout.ps
4% > psnup -2 -m1cm -b1cm -W128mm -H96mm -pa4 talk.temp.ps talk.handout.ps
5%
6% -*-latex-*-
7\NeedsTeXFormat{LaTeX2e}
8\documentclass[a4paper]{article}
9%\usepackage{beamerarticle}
10%\documentclass[compress,ignorenonframetext]{beamer}
11\usepackage[english]{babel}
12\usepackage{url}
13\usepackage{color}
14\usepackage[labelformat=empty]{caption}
15\usepackage{amsmath}
16\usepackage{amssymb}
17\usepackage{upquote}% necessary to get correct ` in verbatim enviroment and thereby a copy paste behaviour (http://tex.stackexchange.com/questions/63353/how-to-properly-display-backticks-in-verbatim-environment)
18\usepackage{verbatim}
19\usepackage{keystroke}
20\usepackage{fancyvrb}
21\usepackage{hyperref}
22%% presentation mode
23%% handout mode
24%\mode<article>{
25  \usepackage{graphics}
26  \usepackage{pgf}
27  \usepackage{xcolor}
28  \renewcommand{\theenumi}{\alph{enumi}}
29  \renewcommand{\labelenumi}{(\theenumi)}
30  \renewcommand{\theenumii}{\Roman{enumii}}
31  \renewcommand{\labelenumii}{\theenumii.}
32  \newcommand{\frametitle}[1]{\subsubsection{#1}}
33% }
34
35%% \mode<presentation>{
36%%   \beamertemplatenavigationsymbolsempty
37%%   \useinnertheme{circles}
38%%   \setbeamertemplate{frametitle}[default][center]
39%%   \setbeamercovered{transparent}
40%%   \setbeamertemplate{frametitle}[default][center]
41%% }
42
43
44% globals
45
46\title{A short Tutorial on RNA Bioinformatics\newline
47{\small The ViennaRNA Package and related Programs}}
48\author{Ivo Hofacker, Ronny Lorenz, Dominik Steininger, and Sven Findei{\ss}}
49\vspace{2cm}
50%\institute[TBI]{Institute for Theoretical Chemistry\\ University Vienna}
51%\titlegraphic{\centerline{\pgfuseimage{logo}}}
52\date{\url{http://www.tbi.univie.ac.at/RNA/}\\[1ex]\today}
53
54\newcommand{\TODO}[1]{{\textcolor{red}{* #1 *}}}
55
56
57
58% pictures
59\pgfdeclareimage[width=.5cm]{logo}{Figures/tbilogo}
60
61% colors
62\colorlet{darkgreen}{green!80!black}
63
64% never indent paragraphs!
65\setlength\parindent{0pt}
66
67\DefineVerbatimEnvironment%
68  {VerbatimExample}{Verbatim}
69  {frame=single,label=Example,framerule=0.2mm,rulecolor=\color{blue}}
70
71\DefineVerbatimEnvironment%
72  {VerbatimTask}{Verbatim}
73  {frame=single,label=Task,framerule=0.2mm,rulecolor=\color{red}}
74
75%===
76\begin{document}
77%===
78
79%===
80%\frame{\titlepage}
81
82\maketitle
83\newpage
84
85\tableofcontents
86\newpage
87
88%===
89%%\begin{frame}
90%%  \frametitle{Outline}
91%%\tableofcontents
92%%\setcounter{page}{1}
93%%\end{frame}
94
95%===
96\section{RNA Web Services}
97This tutorial aims to give a basic introduction to using the command line
98programs in the ViennaRNA Package in a UNIX-like (LINUX) environment.
99Of course, some of you may ask ``Why are there no friendly graphical user
100interfaces?''. Well, there are some, especially in the form of web
101services.
102
103If a few simple structure predictions is all you want to do, there are
104several useful sites for doing RNA structure analysis available
105on the web. Indeed many of the tasks described below can be performed using
106various web servers.
107%\begin{frame}
108
109%\frametitle{Useful Web Services}
110\subsection{Useful Web Services}
111
112\begin{itemize}
113\item Michael Zuker's \texttt{mfold} server computes (sub)optimal structures
114  and hybridization for DNA and RNA sequences with many options.\newline
115  \href{http://mfold.rna.albany.edu/?q=mfold}{Mfold Website}
116\item BiBiServ, several small services e.g.\ pseudo-knot prediction
117  \texttt{pknotsRG}, bi-stable structures \texttt{paRNAss}, alignment
118  \texttt{RNAforester}, visualization \texttt{RNAmovies}, suboptimal
119  structures \texttt{RNAshapes}\newline
120  \href{https://bibiserv.cebitec.uni-bielefeld.de/rna}{Bielefeld Bioinformatics Service}
121\item The ViennaRNA Server offers web access to many tools of the ViennaRNA
122Package, e.g. \texttt{RNAfold}, \texttt{RNAalifold}, \texttt{RNAinverse} and \texttt{RNAz}\newline
123  \url{http://rna.tbi.univie.ac.at/}
124\item several specialized servers such as \newline
125\begin{itemize}
126  \item \texttt{pfold} consensus structure prediction\newline
127  \href{http://www.daimi.au.dk/~compbio/rnafold/}{ pfold RNA fold server}\newline
128  \item \texttt{s-fold} stochastic suboptimals and siRNA design \newline
129  \href{http://sfold.wadsworth.org/}{Sfold Webservices}\newline
130  \item \texttt{StrAl} progressiv ncRNA alignment tool\newline
131  \href{http://www.biophys.uni-duesseldorf.de/stral/}{StrAl Webservice}
132\end{itemize}
133\end{itemize}
134%\end{frame}
135
136Web servers are also a good starting point for novice users since they
137provide a more intuitive interface. Moreover, the ViennaRNA Server will
138return the equivalent command line invocation for each request, making the
139transition from web services to locally installed software easier.
140
141On the other hand, web servers are not ideal for analyzing many or very long
142sequences and usually they offer only few often-used tasks.
143Much the same is true for point-and-click graphical interfaces.
144Command line tools, on the other hand, are ideally suited for automating
145repetitive tasks. They can even be combined in pipes to process the
146results of one program with another or they can be used in parallel, running
147tens or hundreds of tasks simultaneously on a cluster of PCs.
148
149You can try some of these web services in parallel to the exercises below.
150
151\section{Get started}
152\subsection{Typographical Conventions}
153\begin{itemize}
154  \item \texttt{Constant width font} is used for program names, variable
155    names and other literal text like input and output in the terminal
156    window.
157
158  \item Lines starting with a \texttt{\$} within a literal text block
159    are commands. You should type the text following the \texttt{\$} into
160    your terminal window finishing by hitting the \Enter-key. (The
161    \texttt{\$} signifies the command line prompt, which may look
162    different on your system).
163
164  \item All other lines within a literal text block are the output from
165    the command you just typed.
166\end{itemize}
167
168\subsection{Data Files}
169Data files containing the sequences used in the examples below are
170shipped with this tutorial.
171
172\subsection{Terminal, Command line and Editor}
173%===
174%\begin{frame}[fragile]
175%\frametitle{Terminal and Command}
176
177\begin{itemize}
178  \item You can get a \textbf{terminal} by moving your
179    mouse-pointer to an empty spot of your desktop, clicking the right
180    mouse-button and choose ``Open Terminal'' from the pull-down menu.
181  \item You can \textbf{run commands} in the terminal by typing them next
182    to the command line prompt (usually something like \$) followed by
183    hitting the \Enter-key.
184
185\begin{VerbatimExample}
186$ date
187Tue Jul  7 14:30:25 CEST 2015
188\end{VerbatimExample}
189  \item To get more information about a command type \texttt{man} followed
190    by the \textit{command-name} and hitting the \Enter-key.
191Leave the man pages by pressing the \keystroke{q}-key.
192\begin{VerbatimExample}
193$ man date
194\end{VerbatimExample}
195  \item Redirect a command's input and output using the following special
196    characters:\\
197  `$|$' ties \textit{stdout} to \textit{stdin}\\
198  `$<$' redirects \textit{stdout} to \textit{stdin}\\
199  `$>$' redirects \textit{stdout} to a file\\
200  Here, \textit{stdout} stands for \emph{standard output}, which you can
201  normaly see in the terminal. \textit{stdin} is it's counterpart, the
202  \emph{standard input}. The character `$|$' allows you to \emph{pipe}
203  the standard output of one program directly as standard input into
204  another program, hence, the programs are chained together.
205\end{itemize}
206
207  Below you'll find a list of some useful core commands available in all Linux
208  terminal.
209
210\begin{tabular}{ll}
211  Command & Description\\\hline
212  \texttt{pwd} & displays the path to the current working directory\\
213  \texttt{cd} & changes the working directory (initially your ``HOME'')\\
214  \texttt{ls} & lists files and directorys in the current (or a specified) directory\\
215  \texttt{mkdir} & creates a directory\\
216  \texttt{rm} & removes a file (add option \texttt{-r} for deleting a folder)\\
217  \texttt{less} & shows file(s) one page at a time\\
218  \texttt{echo} & prints string(s) to standard output\\
219  \texttt{wc} & command prints the number of newlines, words and bytes in a specified file
220\end{tabular}
221
222\noindent
223For more information regarding these commands prepend \texttt{--help} to the program call,
224like this:
225\begin{VerbatimExample}
226$ rm --help
227\end{VerbatimExample}
228
229Try a few commands on your own, e.g.
230\begin{VerbatimTask}
231$ ls > file_list
232$ less file_list
233$ rm file_list
234$ ls | less
235\end{VerbatimTask}
236
237Here the \textit{stdout} from the \texttt{ls} command was written to a file called \texttt{file\_list}.
238The next command shows the content of \texttt{file\_list}. We quit \texttt{less} by pressing the
239\keystroke{q}-key and removing the file. \texttt{ls $|$ less} pipes the output in the \texttt{less} program without writing it
240to a file.
241
242\noindent
243Now we create our working directory including subfolders and our first sequence file using
244the commands we just learned. Have in mind that you create a good structure so you can
245find your data easily.
246
247First find out in which directory your are in by typing
248\begin{VerbatimTask}
249$ pwd
250\end{VerbatimTask}
251It should look similar to
252\begin{VerbatimExample}
253$ /home/YOURUSER
254\end{VerbatimExample}
255To insure yourself, that you are in the correct directory type (\textasciitilde is the shortcut for the \texttt{home}-directory)
256\begin{VerbatimExample}
257$ cd ~
258\end{VerbatimExample}
259Now create a new folder in your home directory
260\begin{VerbatimTask}
261$ mkdir -p ~/Tutorial/Data
262$ cd ~/Tutorial/Data
263$ echo ATGAAGATGA > BAZ.seq
264\end{VerbatimTask}
265\noindent
266Here we created two new folders in our \texttt{HOME}, \texttt{Tutorial} and a subfolder called \texttt{Data}, then
267we jumped to the \texttt{Data}-folder and wrote a short DNA sequence to the \texttt{BAZ.seq} file.
268
269\noindent
270For further processing we need a RNA sequence instead of an DNA sequence, so we need to replace the T
271by an U by executing following command using \texttt{sed} (the stream editor).
272\begin{verbatim}
273$ sed -i 's/T/U/g' BAZ.seq
274\end{verbatim}
275The program is called via \texttt{sed}, \texttt{-i} tells \texttt{sed} to replace the existing file (in this case \texttt{BAZ.seq}).
276\texttt{s} stands for substitute T by U and \texttt{g} tells \texttt{sed} to replace all occuring T's in the file globaly).
277
278When we look at our file using \texttt{less} we should see our new sequence ``AUGAAGAUGA''
279\begin{verbatim}
280$ less BAZ.seq
281\end{verbatim}
282%\end{frame}
283%===
284
285\subsection{Installing Software from Source}
286\label{sect:install}
287
288Many bioinformatics programs are available only as source code that has to
289be compiled and installed. We'll demonstrate the standard way to install
290programs from source using the \texttt{ViennaRNA Package}.
291
292%===
293%\begin{frame}[fragile]
294 \frametitle{Get the \texttt{ViennaRNA Package}}
295You can either get the required package, depending on which operating system you run
296(precompiled package is availaible for distinct distributions like Fedora, Arch Linux, Debian, Ubuntu,
297Windows) or you compile the source code yourself. Here we are compiling the programs ourself.
298Have a look at the file \texttt{INSTALL} distributed with the \texttt{ViennaRNA Package}
299for more detail or read the documentation on the url. \\
300
301Subsequently the instructions for building the source code are:
302
303  \begin{enumerate}
304  \item Go to your \texttt{Tutorials} folder and create a directory
305\begin{verbatim}
306$ cd ..
307$ mkdir downloads
308$ cd downloads
309\end{verbatim}
310  \item Download the \texttt{ViennaRNA Package} from
311    \url{http://www.tbi.univie.ac.at/RNA/index.html} and save it in to the newly created
312    directory.
313  \item Unpack the gzipped tar archive by running: (Replace 2.4.11 with the latest version number)
314    \begin{verbatim}
315       $ tar -zxf ViennaRNA-2.4.11.tar.gz
316    \end{verbatim}
317  \item list the content of the directory
318    \begin{verbatim}
319       $ ls -F
320         ViennaRNA-2.4.11/  ViennaRNA-2.4.11.tar.gz
321    \end{verbatim}
322  \end{enumerate}
323%\end{frame}
324
325%===
326\subsection{Build the \texttt{ViennaRNA Package}}
327%===
328%\begin{frame}[fragile]
329\frametitle{Build the \texttt{ViennaRNA Package}}
330The installation location can be controlled through options to the
331\texttt{configure} script. E.g. to change the default installation
332location to the directory \texttt{VRP} in your \texttt{\$HOME/Tutorial}
333directory use the \texttt{--prefix} tag so the compiler knows that the
334target directory is changed.
335
336\begin{enumerate}
337\item To configure and build the package just run the following commands.
338\begin{verbatim}
339$ cd ViennaRNA-2.4.11
340$ mkdir -p ~/Tutorial/Progs/VRP
341$ ./configure --prefix=$HOME/Tutorial/Progs/VRP
342$ make
343$ make install
344\end{verbatim}
345You already know the \texttt{cd} and the \texttt{mkdir} command, \texttt{./configure} checks
346whether all dependencies are fulfilled and exits the script if some major requirements are missing.
347If all is ok it creates the \texttt{Makefile} which then is used to start the buildingprocess via \texttt{make install}.
348\item To install the \texttt{ViennaRNA package} system wide (only
349for people with superuser privileges, which we are NOT!) run
350\begin{verbatim}
351$ ./configure
352$ make
353$ make install
354\end{verbatim}
355\end{enumerate}
356%\end{frame}
357\noindent
358You find the installed files in
359\begin{enumerate}
360\item \texttt{\$HOME/Tutorial/Progs/VRP/bin} (programs)
361\item \texttt{\$HOME/Tutorial/Progs/VRP/share/ViennaRNA/bin} (perl scripts)
362\end{enumerate}
363Wherever you installed the main programs of the \texttt{ViennaRNA Package},
364make sure the path to the executables shows up in your \texttt{PATH}
365environment variable. To check the contents of the \texttt{PATH} environment
366variable simply run
367\begin{verbatim}
368$ echo $PATH
369\end{verbatim}
370For easier handling we now create a folder containing all our binaries as well as perl scripts and
371copy them into a common folder.
372\begin{verbatim}
373$ cd ~/Tutorial/Progs/
374$ cp VRP/share/ViennaRNA/bin/* .
375\end{verbatim}
376Now you can show the contents of the folder using the command \texttt{ls}.
377
378
379Also copy the binaries from the \texttt{VRP/bin} folder. In the next step we add the path
380of the directory to the PATH environment variable (e.g. use \texttt{pwd}) so we don't need to
381write the hole path every time we call it.
382\begin{verbatim}
383$ export PATH=${HOME}/Tutorial/Progs:${PATH}
384\end{verbatim}
385%
386Note that this is only a temporary solution. If you want the path to be
387permanently added you need to add the line above to the config file of your
388shell environment. Typically \texttt{bash} is the standard. You need to add the export line
389above to the \texttt{.bashrc} in your homedirectory. To reload the contents of \texttt{.bashrc} type
390\begin{verbatim}
391$ source ~/.bashrc
392\end{verbatim}
393or close the current terminal and open it again. (Remember, this works only for the \texttt{bash} shell.)
394%
395To check if everything worked out find which source you use.
396%
397\begin{verbatim}
398  $ which RNAfold
399\end{verbatim}
400The shown path should point to \texttt{\$HOME/Tutorial/Progs/}. Finally try to
401get a brief description of a program e.g.
402%
403\begin{verbatim}
404$ RNAfold --help
405\end{verbatim}
406%
407If this doesn't work re-read the steps described above more carefully.
408%===
409
410\subsection{What's in the \texttt{ViennaRNA Package}}
411The core of the \texttt{ViennaRNA Package} is formed by a collection
412of routines for the prediction and comparison of RNA secondary
413structures. These routines can be accessed through stand-alone
414programs, such as \texttt{RNAfold}, \texttt{RNAdistance}, etc., which
415should be sufficient for most users. For those who wish to develop
416their own programs a library which can be linked to your own code is
417provided.
418
419%===
420%\begin{frame}[fragile]
421  \frametitle{The base directory}
422
423  \begin{itemize}
424  \item make a directory listing of \texttt{downloads/ViennaRNA-2.4.11/}
425\begin{verbatim}
426  $ ls -F ~/Tutorial/downloads/ViennaRNA-2.4.11/
427\end{verbatim}
428  \end{itemize}
429  \footnotesize
430  \begin{center}
431    \begin{verbatim}
432        aclocal.m4     config.sub*   INSTALL      man/           RNA-Tutorial/
433        AUTHORS        configure*    install-sh*  misc/          src/
434        CHANGELOG.md   configure.ac  interfaces/  missing*       tests/
435        compile*       COPYING       license.txt  NEWS           THANKS
436        config/        depcomp*      m4/          packaging/     ylwrap*
437        config.guess*  doc/          Makefile.am  README.md
438        config.h.in    examples/     Makefile.in  RNAlib2.pc.in
439    \end{verbatim}%$
440  \end{center}
441You now see the contents of the \texttt{ViennaRNA-2.4.11} folder. Directorys are marked
442by a "/" and the "*" indicates executable files. The \texttt{Makefile} contains the rules to
443compile the code, source code is located within the \texttt{src/} directory. \texttt{configure}
444handles distinct options for installation and creation of the \texttt{Makefile}. \texttt{INSTALL}
445covers installation instructions and the \texttt{README} file contains information about the
446\texttt{ViennaRNA Package}.
447%\end{frame}
448
449
450%===
451%\begin{frame}
452  \frametitle{Which programs are available?}
453
454  {\small
455  \begin{tabular}{ll}
456	RNA2Dfold & Compute coarse grained energy landscape of representative sample structures\\
457	RNAaliduplex & Predict conserved RNA-RNA interactions between two alignments\\
458	RNAalifold & Calculate secondary structures for a set of aligned RNA sequences\\
459	RNAcofold & Calculate secondary structures of two RNAs with dimerization\\
460	RNAdistance & Calculate distances between RNA secondary structures\\
461	RNAduplex & Compute the structure upon hybridization of two RNA strands\\
462	RNAeval & Evaluate free energy of RNA sequences with given secondary structure\\
463	RNAfold & Calculate minimum free energy secondary structures and partition function of RNAs\\
464	RNAheat & Calculate the specific heat (melting curve) of an RNA sequence\\
465	RNAinverse & Find RNA sequences with given secondary structure (sequence design)\\
466	RNALalifold & Calculate locally stable secondary structures for a set of aligned RNAs\\
467	RNALfold & Calculate locally stable secondary structures of long RNAs\\
468	RNApaln & RNA alignment based on sequence base pairing propensities\\
469	RNApdist & Calculate distances between thermodynamic RNA secondary structures ensembles\\
470	RNAparconv & Convert energy parameter files from ViennaRNA 1.8 to 2 format\\
471	RNAPKplex & Predict RNA secondary structures including pseudoknots\\
472	RNAplex & Find targets of a query RNA\\
473	RNAplfold & Calculate average pair probabilities for locally stable secondary structures\\
474	RNAplot & Draw and markup RNA secondary structures in PostScript, SVG, or GML\\
475	RNApvmin & Find a vector of perturbation energies which may further be used to constrain folding\\
476	RNAsnoop & Find targets of a query H/ACA snoRNA\\
477	RNAsubopt & Calculate suboptimal secondary structures of RNAs\\
478	RNAup & Calculate the thermodynamics of RNA-RNA interactions\\
479	Kinfold & simulates  the stochastic folding kinetics of RNA sequences into secondary structures\\
480	RNAforester \footnotemark & compare RNA secondary structures via forest alignment\\
481  \end{tabular}}
482	\footnotetext{RNAforester is not developed by the TBI Vienna.}
483
484%\end{frame}
485
486%===
487%\begin{frame}[fragile]
488  \frametitle{Which Utilities are available?}
489
490  {\small
491  \begin{tabular}{ll}
492	b2ct	& converts dot-bracket notation to Zukers mfold '.ct' file format \\
493	b2mt.pl	& converts dot-bracket notation to x y values \\
494	cmount.pl	& generates colored mountain plot\\
495	coloraln.pl	& colorize an alirna.ps file\\
496	colorrna.pl	& colorize a secondary structure with reliability annotation\\
497	ct2db	& converts Zukers mfold '.ct' file format to dot-bracket notation\\
498	dpzoom.pl	& extract a portion of a dot plot \\
499	mountain.pl	& generates mountain plot \\
500	popt	& extract Zuker's p-optimal folds from subopt output\\
501	refold.pl	& refold using consensus structure as constraint\\
502	relplot.pl	& add reliability information to a RNA secondary structure plot\\
503	rotate\_ss.pl	& rotate the coordinates of an RNA secondary structure plot\\
504	switch.pl	& describes RNA sequences that exhibit two almost equally stable structures\\
505  \end{tabular}}
506\noindent
507%\end{frame}
508
509\vspace*{2ex}\noindent
510All programs that are shipped with the \texttt{ViennaRNA Package} provide
511some documentation in the form of ``man pages''. In UNIX like environments, these
512manual pages can be viewed using the \texttt{man} command after successfully
513installing the \texttt{ViennaRNA Package}:
514\begin{verbatim}
515$ man RNAalifold
516\end{verbatim}%$
517Alternatively, an online version of the manual pages is available at\\
518\url{https://www.tbi.univie.ac.at/RNA/documentation.html#programs}.
519Note, that the \texttt{MANPATH} environment variable requires to be updated
520if the \texttt{ViennaRNA Package} has been installed in a non-standard path.
521
522There also is a helpful documentation in the folder of the \texttt{ViennaRNA Package}:\\
523\texttt{~/Tutorial/downloads/ViennaRNA-2.4.11/doc/RNAlib-2.4.11.pdf}\\
524
525Most Perl scripts carry embedded documentation that is displayed by typing
526\begin{verbatim}
527$ perldoc coloraln.pl
528\end{verbatim}%$
529in the folder where the script is located.
530All scripts and programs give short usage instructions when called with
531the \texttt{-h} command line option (e.g. \texttt{RNAalifold -h}).
532%===
533
534\subsection{The Input File Format}
535RNA sequences come in a variety of formats. The sequence format used
536throughout the ViennaRNA Package is very simple. A sequence
537file contains one or more sequences. Each sequence either spans a single
538line without any additional whitespaces, or the file is \texttt{FASTA}
539formatted. In the latter case, the sequence is preceded by a special header
540line that starts with the `\texttt{>}' character followed by a sequence identifier.
541This identifier, usually a unique name assigned to the sequence, will then be
542used by the programs in the \texttt{ViennaRNA Package} as basename for any output
543files. Please note some of the programs do not support the \texttt{FASTA} format
544yet. Furthermore, programs that require multiple input sequences, e.g. for
545interaction prediction, may require them as separate lines, or in a concatenated
546form on a single line with the delimiting character \texttt{\&}. Please read
547the corresponding manpages and \texttt{--help} output to find out the actual
548input format requirements.
549
550%===
551%\input{single_seq.tex}
552\section{Structure Prediction on single Sequences}
553\subsection{The Program \texttt{RNAfold}}
554%===
555Our first task will be to do a structure prediction using
556\texttt{RNAfold}. This should get you familiar with the input and output
557format as well as the graphical output produced.
558
559\texttt{RNAfold} reads single RNA sequences, computes their minimum free energy
560(\texttt{MFE}) structures, and prints the result together with the corresponding
561\texttt{MFE} structure in dot-bracket notation. This is the default mode if no
562further command line parameters are provided. Please note, that the \texttt{RNAfold}
563program can either be used in \textit{interactive mode}, where the program expects
564the input from \textit{stdin}, or in \textit{batch processing mode} where
565you provide the input sequences as text files.
566
567To activate computation of the partition function for each sequence, the
568\texttt{-p} option must be set. From the partition function
569$$Q = \sum_{s \in \Omega} exp(-E(s) / RT)$$
570
571over the ensemble of all possible structures $\Omega$, with temperature $T$ and gas
572constant $R$, \texttt{RNAfold} then computes the ensemble free energy $G = -RT \cdot ln(Q)$,
573and frequency of the \texttt{MFE} structure $s_{mfe}$ within the ensemble
574$$p = exp(-E(s_{mfe}) / RT) / Q$$
575
576Furthermore, by default, the \texttt{-p} option also activates the computation
577of base pairing probabilities $p_{ij}$. From this data, \texttt{RNAfold} then
578determines the ensemble diversity
579$$\langle d \rangle = \sum_{ij} p_{ij} \cdot (1 - p_{ij}),$$
580i.e. the expected distance between any two secondary structure, as well as the
581\texttt{centroid} structure, i.e. the structure $s_c$ with the least Boltzmann weighted
582distance
583$$d_\Omega(s_c) = \sum_{\substack{s \in \Omega}} p(s) d(s_c, s).$$ to all other
584structures $s \in \Omega$.
585
586Another useful structure representative one can determine from base pairing probabilities
587$p_{ij}$ is the structure that exhibits the \textit{maximum expected accuracy (MEA)}. By
588assuming the base pair probability is a good measure of correctnes of a pair $(i,j)$, the
589expected accuracy of a structure $s$ is
590$$\text{EA}(s) = \sum_{\substack{(i,j) \in s}} 2\gamma p_{ij} + \sum_{\substack{i \\ \nexists (i,j) \in s}} q_i$$
591with $q_i = 1 - \sum_j p_{ij}$ and weighting factor $\gamma$ that allows us to
592weight paired against unpaired positions. \texttt{RNAfold} uses a dynamic programming
593scheme similar to the \textit{Maximum Matching algorithm} of Ruth Nussinov to find the
594structure $s$ that minimizes the above equation.
595
596The \texttt{RNAfold} program provides a large amount of additional
597computation modes that will be partly covered below. To get a full list of all
598computation modes available, please consult the \texttt{RNAfold} man page or
599the outputs of \texttt{RNAfold -h} and \texttt{RNAfold --detailed-help}.
600
601\subsubsection{MFE structure of a single sequence}
602
603\begin{enumerate}
604\item Use a text editor (emacs, vi, nano, gedit) to prepare an input file by pasting the text
605below and save it under the name \texttt{test.seq} in your \texttt{Data} folder.
606\begin{verbatim}
607> test
608CUACGGCGCGGCGCCCUUGGCGA
609\end{verbatim}
610\item Compute the best (MFE) structure for this sequence using \textit{batch processing mode}
611\begin{verbatim}
612  $ RNAfold test.seq
613  CUACGGCGCGGCGCCCUUGGCGA
614  ...........((((...)))). ( -5.00)
615 \end{verbatim}%$
616\item or use the \textit{interactive mode} and redirect the content of \texttt{test.seq}
617to \textit{stdin}
618\begin{verbatim}
619  $ RNAfold < text.seq
620  CUACGGCGCGGCGCCCUUGGCGA
621  ...........((((...)))). ( -5.00)
622\end{verbatim}
623\item alternatively, you could use the \textit{interactive mode} and manually enter the sequence
624  as soon as \texttt{RNAfold} prompts for input
625\begin{verbatim}
626  $ RNAfold
627  Input string (upper or lower case); @ to quit
628  ....,....1....,....2....,....3....,....4....,....5....,....6....,....7....,....8
629  CUACGGCGCGGCGCCCUUGGCGA
630  length = 23
631
632  CUACGGCGCGGCGCCCUUGGCGA
633  ...........((((...)))).
634   minimum free energy =  -5.00 kcal/mol
635 \end{verbatim}%$
636%\end{frame}
637\end{enumerate}
638
639All the above variants to compute the MFE and the corresponding structure result in identical
640output, except for slight variations in the formatting when true \textit{interactive mode} is used.
641The last line(s) of the text output contains the predicted MFE structure in \textit{dot-bracket notation}
642and its free energy in \texttt{kcal/mol}. A dot in the dot-bracket notation represents an unpaired
643position, while a base pair (i, j) is represented by a pair of matching parentheses at position
644i and j.
645
646If the input was \texttt{FASTA} formatted, i.e. the sequence was preceded by a header line
647with sequence identifier, \texttt{RNAfold} creates a structure layout file named \texttt{test\_ss.ps},
648where \texttt{test} is the sequence identifier as provided through the \texttt{FASTA} header.
649In case the header was omitted the output file name simply is \texttt{rna.ps}.\\
650Let's take a look at the output file with your favorite \texttt{PostScript} viewer, e.g. \texttt{gv}.\footnote{In contrast to
651  bitmap based image files (such as GIF or JPEG) PostScript files contain resolution
652  independent vector graphics, suitable for publication. They can be
653  viewed on-screen using a postscript viewer such as \texttt{gv} or
654  \texttt{evince}}. Note the \& at the end of the following command line that simply detaches
655the program call and immediately starts the program in the background.
656\begin{verbatim}
657$ gv test_ss.ps &
658\end{verbatim}
659\noindent
660Compare the dot-bracket notation to the PostScript
661drawing shown in the file \texttt{test\_ss.eps}.
662
663You can use the \texttt{-t} option to change the layout algorithm \texttt{RNAfold} uses
664to produce the plot. The most simply layout is the \textit{radial} layout that can be chosen
665with \texttt{-t 0}. Here, each nucleotide in a loop is equally spaced on its enclosing circle.
666The more sophisticated \texttt{Naview} layout algorithm is used by default but may be explicitly
667chosen through \texttt{-t 1}. A hidden feature can be found with \texttt{-t 2}, where \texttt{RNAfold}
668creates a most simple circular plot.
669
670The calculation above does not tell us whether we can actually trust the predicted structure.
671In fact, there may be many more possible structures that might be equally probable. To find
672out about that, let's have a look at the equilibrium ensemble instead.
673
674\subsubsection{Predicting equilibrium properties of the structure ensemble}
675
676\begin{enumerate}
677\item Run \texttt{RNAfold -p --MEA} to compute the partition function,
678pair probabilities, centroid structure, and the maximum expected accuracy
679(MEA) structure.
680\item Have a look at the generated PostScript files \texttt{test\_ss.ps} and
681\texttt{test\_dp.ps}
682\begin{verbatim}
683  $ RNAfold -p --MEA test.seq
684  CUACGGCGCGGCGCCCUUGGCGA
685  ...........((((...)))). ( -5.00)
686  ....{,{{...||||...)}}}. [ -5.72]
687  ....................... {  0.00 d=4.66}
688  ......((...))((...))... {  2.90 MEA=14.79}
689   frequency of mfe structure in ensemble 0.311796; ensemble diversity 6.36
690
691\end{verbatim}
692%\end{frame}
693  \end{enumerate}
694\noindent
695Here the last four lines are new compared to the text output without the \texttt{-p --MEA}
696options. The partition function is already a rough measure for the well-definedness of the \texttt{MFE}
697structure. The third line shows a condensed representation of the pair probabilities of each
698nucleotide, similar to the dot-bracket notation, followed by the ensemble free energy
699($G = -kT \cdot ln(Z)$) in \texttt{kcal/mol}. Here, the dot-bracket like notation consists
700of additional characters that denote the pairing propensity for each nucleotide.
701"." denotes bases that are essentially unpaired, "," weakly paired,
702"$|$"strongly paired without preference, "\{\},()" weakly ($>$33\%) upstream (downstream)
703paired or strongly ($>$66\%) up-/downstream paired bases, respectively.\\
704
705The next two lines represent (i) the centroid structure
706with its free energy and distance to the ensemble, and (ii) the MEA structure, it's free
707energy and the actual accuracy. The very last line shows the frequency of the MFE structure in
708the ensemble of secondary structures and the diversity of the ensemble as discussed above.
709
710Note that the MFE structure is adopted only with 31\% probability, also the
711diversity is very high for such a short sequence.\\
712
713\subsubsection{Rotate the structure plot}
714
715\includegraphics[width=.50\textwidth]{Figures/test_ss.eps}\\
716
717To rotate the secondary structure plot that is generated by \texttt{RNAfold}
718the \texttt{ViennaRNA Package} provides the perl script utility \texttt{rotate\_ss.pl}.
719Just read the \texttt{perldoc} for this tool to know how to handle the rotation and use
720the information to get your secondary structure in a vertical position.
721\begin{verbatim}
722$ perldoc rotate_ss.pl
723\end{verbatim}%$
724
725
726\subsubsection{The base pair probability dot plot}
727
728\includegraphics[width=.50\textwidth]{Figures/test_dp.eps}\\
729%\end{frame}
730
731The ``dot plot'' (\texttt{test\_dp.ps}) shows the pair probabilities within
732the equilibrium ensemble as $n\times n$ matrix, and is an excellent way to
733visualize structural alternatives. A square at row $i$ and column $j$
734indicates a base pair. The area of a square in the upper right half of the
735matrix is proportional to the probability of the base pair $(i,j)$ within the
736equilibrium ensemble. The lower left half shows all pairs belonging to
737the \texttt{MFE} structure. While the MFE consists of a single helix, several
738different helices are visualized in the pair probabilities.
739
740While a base pair probability dot-plot is quite handy to interpret for short
741sequences, it quickly becomes confusing the longer the RNA sequence is. Still,
742this is (currently) the only output of base pair probabilities for the \texttt{RNAfold}
743program. Nevertheless, since the dot plot is a true \texttt{PostScript} file,
744one can retrieve the individual base pair probabilities by parsing its textual
745content.
746
747\begin{enumerate}
748\item Open the dot plot with your favorite text editor
749\item Locate the lines that that follow the scheme
750\begin{verbatim}
751  i j v ubox
752\end{verbatim}
753where $i$ and $j$ are integer values and $v$ is a floating point decimal
754with values between $0$ and $1$. These are the data for the boxes drawn in
755the upper triangle. The integer values $i$ and $j$ denote the nucleotide positions
756while the value $v$ is the square-root of the probability of base pair $(i,j)$.
757Thus, the actual base pair probability $p(i,j) = v * v$.
758
759\end{enumerate}
760
761\subsubsection{Mountain and Reliability plot}
762Next, let's use the \texttt{relplot.pl} utility to annotate which parts of a
763predicted MFE structure are well-defined and thus more reliable. Also let's use a real
764example for a change and produce yet another representation of the predicted
765structure, the \emph{mountain plot}.
766
767\noindent
768Fold the 5S rRNA sequence and visualize the structure. (The \texttt{5S.seq} is shipped with the tutorial)
769\begin{verbatim}
770  $ RNAfold -p 5S.seq
771  $ mountain.pl 5S_dp.ps | xmgrace -pipe
772  $ relplot.pl 5S_ss.ps 5S_dp.ps > 5S_rss.ps
773\end{verbatim}
774
775  \includegraphics[width=.45\textwidth]{Figures/5S_mt.eps}\hfill
776  \includegraphics[trim=0cm 1.5cm 0cm 0cm, width=.50\textwidth]{Figures/5S_rot.eps}
777
778A mountain plot is especially useful for long sequences where conventional
779structure drawings become terribly cluttered.  It is a xy-diagram plotting
780the number of base pairs enclosing a sequence position \textit{versus} the
781position. The  \texttt{Perl} script \texttt{mountain.pl} transforms a dot
782plot into the mountain plot coordinates which can be visualized with any
783xy-plotting program, e.g. \texttt{xmgrace}.
784
785The resulting plot shows three curves, two mountain plots derived from
786the \texttt{MFE} structure (red) and the pairing probabilities (black) and
787a positional entropy curve (green). Well-defined regions are identified by low
788entropy. By superimposing several mountain plots structures can easily
789be compared.
790
791The perl script \texttt{relplot.pl} adds reliability
792information to a RNA secondary structure plot in the form of color
793annotation. The script computes a well-definedness measure we call
794``positional entropy''
795$$S(i) = -\sum p_{ij}\log(p_{ij})$$
796and encodes it as color hue, ranging from red
797(low entropy, well-defined) via green to blue and violet (high
798entropy, ill-defined). In the example above two helices of the 5S RNA are
799well-defined (red) and indeed predicted correctly, the left arm is not quite
800correct and disordered.
801
802For the figure above we had to rotate and mirror the structure plot, e.g.
803\begin{verbatim}
804$ rotate_ss.pl -a 180 -m 5S_rss.ps > 5S_rot.ps
805\end{verbatim}%$
806
807\subsubsection{Batch job processing}
808In most cases, one doesn't only want to predict the structure and equilibrium
809probabilities for a single RNA sequence but a set of sequences. \texttt{RNAfold}
810is perfectly suited for this task since it provides several different mechanisms
811to support batch job processing. First, in \textit{interactive} mode, it only
812stops processing input from \textit{stdin} if it is requested to do so. This means
813that after processing one sequence, it will prompt for the input of the next
814sequence. Entering the \texttt{@} character will forcefully abort processing.
815In situations where the input is provided through input stream redirection,
816it will end processing as soon stream is closed.
817
818In constrat to that, the \textit{batch processing mode} where one simply specifies
819input files as so-called unnamed command line parameters, the number of input
820sequences is more or less unlimited. You can specify as many input files as
821your terminal emulator allows, and each input file may consist of arbitrarily
822many sequences. However, please note that mixing \texttt{FASTA} and non-fasta
823input is not allowed and will most likely produce bogus output.
824
825Assume you have four input files \texttt{file\_0.fa}, \texttt{file\_1.fa},
826\texttt{file\_2.fa}, and \texttt{file\_3.fa}. Each file contains a set of RNA
827sequences in \texttt{FASTA} format. Predicting secondary structures for all
828sequences in all files with a single call to \texttt{RNAfold} and redirecting
829the output to a file \texttt{all\_sequences\_output.fold} can be achieved
830like this:
831\begin{verbatim}
832  $ RNAfold file_0.fa file_1.fa file_2.fa file_3.fa > all_sequences_output.fold
833\end{verbatim}
834
835The above call to \texttt{RNAfold} will open each of the files and process the
836sequences sequentially. This, however, might take a long time and the sequential
837processing will most likely bore out your multi-core workstation or laptop computer,
838since only a single core is used for the computations while the others are idle.
839If you happen to have more than a single CPU core and want to take advantage of
840the available parallel processing power, you can use the \texttt{-j} option of\
841\texttt{RNAfold} to split the input into concurrent jobs.
842\begin{verbatim}
843  $ RNAfold -j file_*.fa > all_sequences_output.fold
844\end{verbatim}
845This command will uses as many CPU cores as available and, therefore, process
846you input much faster. If you want to limit the number of concurrent jobs to
847a particular number, say $2$, to leave the remaining cores available for other
848tasks, you can append the number of jobs directly to the \texttt{-j} option:
849\begin{verbatim}
850  $ RNAfold -j2 file_*.fa > all_sequences_output.fold
851\end{verbatim}
852Note here, that there must not be any space between the \texttt{j} and the number
853of jobs.
854
855Now imagine what happens if you have a larger set of
856sequences that are not stored in \texttt{FASTA} format. If you would serve such
857an input to \texttt{RNAfold}, it would happily process each of the sequences
858but always over-write the structure layout and dot-plot files, since the default
859names for these files are \texttt{rna.ps} and \texttt{dot.ps} for any sequence.
860This is usually an undesired behavior, where \texttt{RNAfold} and the \texttt{--auto-id}
861option becomes handy. This option flag forces \texttt{RNAfold} to automatically
862create a sequence identifier for each input, thus using different file names for
863each single output. The identifier that is created follows the form
864\begin{verbatim}
865  sequence_XXXX
866\end{verbatim}
867where \texttt{sequence} is a prefix, followed by the delimiting character \texttt{\_},
868and an increasing 4-digit number \texttt{XXXX} starting at 0000. This feature is
869even useful if the input is in \texttt{FASTA} format, but one wants to enforce
870a novel naming scheme for the sequences. As soon as the \texttt{--auto-id} option
871is set, \texttt{RNAfold} will ignore any id taken from existing \texttt{FASTA}
872headers in the input files.
873
874See also the man page of \texttt{RNAfold} to find out how to modify the prefix,
875delimiting character, start number and number of digits.
876
877\begin{enumerate}
878\item Create an input file with many RNA sequences, each on a separate line, e.g.
879\begin{verbatim}
880  $ randseq -n 127 > many_files.seq
881\end{verbatim}
882\item Compute the MFE structure for each of the sequences and generate output
883ids with numbers between $100$ and $226$ and prefix \texttt{test\_seq}
884\begin{verbatim}
885  $ RNAfold --auto-id --id-start=100 --id-prefix="test_seq" many_files.seq
886\end{verbatim}
887\end{enumerate}
888
889\subsubsection{Add constraints to the structure prediction}
890For some scientific questions one requires additional constraints that must be
891enforced when predicting secondary structures. For instance, one might have resolved
892parts of the structure already and is simply interested in the optimal conformation
893of the remaining part of the molecule. Another example would be that one already
894knows that particular nucleotides can not participate in any base pair, since they
895are physically hindered to do so. These types of constraints are termed \textit{hard}
896constraints and they can enforce or prohibit particular conformations, thus include
897or omit structures with these feature from the set candidate ensemble.
898
899Another type of constraints are so-called \textit{soft} constraints, that enable one
900to adjust the free energy contributions of particular conformations. For instance,
901one could add a bonus energy if a particular (stretch of) nucleotides is left unpaired
902to emulate the binding free energy of a single strand binding protein. The same can
903be applied to base pairs, for instance one could add a penalizing energy term if a
904particular base pair is formed to make it less likely.
905
906The \texttt{RNAfold} programs comes with a comprehensive hard and soft constraints
907support and provides several convenience command line parameters to ease constraint
908application.
909
910The most simple hard constraint that can be applied is the maximum base pair span, i.e.
911the maximum number of nucleotides a particular base pair may span. This constraint can
912be applied with the \texttt{--maxBPspan} option followed by an integer number.
913\begin{enumerate}
914  \item Compute the secondary structure for the \texttt{5S.seq} input file
915  \item Now limit the maximum base pair span to $50$ and compare both results
916\begin{verbatim}
917  $ RNAfold --maxBPspan 50 5S.seq
918\end{verbatim}
919\end{enumerate}
920
921Now assume you already know parts of the structure and want to \textit{fill-in}
922an optimal remaining part. You can do that by using the \texttt{-C} option
923and adding an additional line in dot-bracket notation to the input (after the sequence)
924that corresponds to the known structure:
925\begin{enumerate}
926\item Prepare the input file \texttt{hard\_const\_example.fa}
927\begin{verbatim}
928  >my_constrained_sequence
929  GCCCUUGUCGAGAGGAACUCGAGACACCCACUACCCACUGAGGACUUUCG
930  ..((((.....))))
931\end{verbatim}
932  Note here, that we left out the remainder of the input structure constraint that will
933  eventually be used to enforce a helix of 4 base pairs at the beginning of the sequence.
934  You may also fill the remainder of the constraint with dots to silence any warnings issued
935  by \texttt{RNAfold}.
936\item Compute the MFE structure for the input
937\begin{verbatim}
938  $ RNAfold hard_const_example.fa
939  >my_constrained_sequence
940  GCCCUUGUCGAGAGGAACUCGAGACACCCACUACCCACUGAGGACUUUCG
941  ........((((((...((((.................))))..)))))) ( -8.00)
942\end{verbatim}
943\item Now compute the MFE structure under the provided constraint
944\begin{verbatim}
945  $ RNAfold -C hard_const_example.fa
946  >my_constrained_sequence
947  GCCCUUGUCGAGAGGAACUCGAGACACCCACUACCCACUGAGGACUUUCG
948  ..((((.....))))....(((((..((((........)).))..))))) ( -7.90)
949\end{verbatim}
950\item Due to historic reasons, the \texttt{-C} option alone only forbids any base pairs
951that are incompatible with the constraint, rather than enforcing the constraint. Thus,
952if you compute equilibrium probabilities, structures that are missing the small helix in
953the beginning are still part of the ensemble. If you want to compute the pairing probabilities
954upon forcing the small helix at the beginning, you can add the \texttt{--enforceConstraint} option:
955\begin{verbatim}
956  $ RNAfold -p -C --enforceConstraint hard_const_example.fa
957  >my_constrained_sequence
958  GCCCUUGUCGAGAGGAACUCGAGACACCCACUACCCACUGAGGACUUUCG
959  ..((((.....))))....(((((..((((........)).))..))))) ( -7.90)
960\end{verbatim}
961  Have a look at the differences in ensemble free energy and base pair probabilities between
962  the results obtained with and without the \texttt{--enforceConstraint} option.
963\end{enumerate}
964
965A more thorough alternative to provide constraints is to use the \texttt{--commands} option
966and a corresponding \textit{commands file}. This allows one to specify constraints on nucleotide
967or base pair level and even to restrict a constraint to particular loop types. A commands file
968is a simple multi column text file with one constraint on each line. A line starts with a one- or
969two-letter command, followed by multiple values that specify the addressed nucleotides, the loop
970context restriction, and, for soft constraints, the strength of the constraint in $kcal/mol$.
971The syntax is as follows:
972
973{\footnotesize
974\begin{verbatim}
975F i 0 k   [TYPE] [ORIENTATION] # Force nucleotides i...i+k-1 to be paired
976F i j k   [TYPE] # Force helix of size k starting with (i,j) to be formed
977P i 0 k   [TYPE] # Prohibit nucleotides i...i+k-1 to be paired
978P i j k   [TYPE] # Prohibit pairs (i,j),...,(i+k-1,j-k+1)
979P i-j k-l [TYPE] # Prohibit pairing between two ranges
980C i 0 k   [TYPE] # Nucleotides i,...,i+k-1 must appear in context TYPE
981C i j k          # Remove pairs conflicting with (i,j),...,(i+k-1,j-k+1)
982E i 0 k e        # Add pseudo-energy e to nucleotides i...i+k-1
983E i j k e        # Add pseudo-energy e to pairs (i,j),...,(i+k-1,j-k+1)
984\end{verbatim}
985}
986with
987{\footnotesize
988\begin{verbatim}
989[TYPE]        = { E, H, I, i, M, m, A }
990[ORIENTATION] = { U, D }
991\end{verbatim}
992}
993
994\begin{enumerate}
995\item Prepare a commands file \texttt{test.constraints} that forces the first 5 nucleotides to pair and the
996following 3 nucleotides to stay unpaired as part of a multi-branch loop:
997\begin{verbatim}
998F 1 0 5
999C 6 0 3 M
1000\end{verbatim}
1001\item Use the \texttt{randseq} program to generate multiple sequences and compute the MFE structure
1002for each under the constraints prepared earlier.
1003\begin{verbatim}
1004  $ randseq -n 20 | RNAfold --commands test.constraints
1005\end{verbatim}
1006Inspect the output to assure yourself that hte commands have been applied
1007\end{enumerate}
1008
1009A couple of much more sophisticated constraints will be discussed below.
1010
1011\subsubsection{SHAPE directed RNA folding}
1012
1013In order to further improve the quality of secondary structure predictions, mapping experiments like
1014SHAPE (selective 2'-hydroxyl acylation analyzed by primer extension) can be used to exerimentally determine
1015the pairing status for each nucleotide.
1016In addition to thermodynamic based secondary structure predictions, RNAfold supports the incorporation of this additional
1017experimental data as soft constraints.
1018
1019If you want to use SHAPE data to guide the folding process, please make sure that your experimental data is present in a text file,
1020where each line stores three white space separated columns containing the position, the abbreviation and the normalized SHAPE reactivity for
1021a certain nucleotide.
1022
1023\begin{verbatim}
1024     1 G 0.134
1025     2 C 0.044
1026     3 C 0.057
1027     4 G 0.114
1028     5 U 0.094
1029        ...
1030        ...
1031        ...
1032     71 C 0.035
1033     72 G 0.909
1034     73 C 0.224
1035     74 C 0.529
1036     75 A 1.475
1037\end{verbatim}%$
1038
1039The second column, which holds the nucleotide abbreviation, is optional.
1040If it is present, the data will be used to perform a cross check against the provided input sequence.
1041Missing SHAPE reactivities for certain positions can be indicated by omitting the reactivity column or the whole line.
1042Negative reactivities will be treated as missing.
1043Once the SHAPE file is ready, it can be used to constrain folding:
1044
1045\begin{verbatim}
1046$ RNAfold --shape=rna.shape --shapeMethod=D < rna.seq
1047\end{verbatim}%$
1048
1049A small compilation of reference data taken from Hajdin et al. 2013 is available online
1050\url{https://weeks.chem.unc.edu/data-files/ShapeKnots_DATA.zip}. However, the included
1051reference structures are only available in connect (.ct) format and require conversion into
1052dot-bracket notation to compare them against predicted structures with \texttt{RNAfold}.
1053Furthermore, the normalized \texttt{SHAPE} data is available as Excel spreadsheet and
1054also requires some pre-processing to make it available for \texttt{RNAfold}.
1055
1056
1057\subsubsection{Adding ligand interactions}
1058RNA molecules are known to interact with other molecules, such as additional RNAs, proteins,
1059or other small ligand molecules. Some interactions with small ligands that take place in
1060loops of an RNA structure can be modeled in terms of soft constraints. However, to stay
1061compatible with the recursive decomposition scheme for secondary structures they are
1062limited to the unpaired nucleotides of hairpins and internal loops.
1063
1064The \texttt{RNAlib} library of the \texttt{ViennaRNA Package} implements a most general
1065form of constraints capability. However, the available programs do not allow for a full
1066access to the implemented features. Nevertheless, \texttt{RNAfold} provides a convenience
1067option that allows to easily include ligand binding to hairpin- or interior-loop like aptamer
1068motifs. For that purpose, a user needs only to provide motif and a binding free energy.
1069
1070Consider the following example file \texttt{theo.fa} for a theophylline triggered
1071riboswitch with the sequence
1072\begin{verbatim}
1073  >theo-switch
1074  GGUGAUACCAGAUUUCGCGAAAAAUCCCUUGGCAGCACCUCGCACAUCUUGUUGUC
1075  UGAUUAUUGAUUUUUCGCGAAACCAUUUGAUCAUAUGACAAGAUUGAG
1076\end{verbatim}
1077
1078The theopylline aptamer structure has been actively researched during the last two decades.
1079\begin{center}
1080\includegraphics[width=.75\textwidth]{Figures/theo_aptamer.eps}\\
1081\end{center}
1082Although the actual aptamer part (marked in blue) is not a simple interior loop, it
1083can still be modeled as such. It consists of two delimiting base pairs (G,C) at the
10845' site, and another (G,C) at its 3' end. That is already enough to satisfy the requirements
1085for the \texttt{--motif} option of \texttt{RNAfold}. Together with the aptamer sequence
1086motif, the entire aptamer can be written down in dot-bracket form as
1087
1088\begin{verbatim}
1089GAUACCAG&CCCUUGGCAGC
1090(...((((&)...)))...)
1091\end{verbatim}
1092
1093Note here, that we separated the 5' and 3' part from each other using the \texttt{\&}
1094character. This enables us to omit the variable hairpin end of the aptamer from the
1095specification in our model.
1096
1097The only ingredient that is still missing is the actual stabilizing energy contribution
1098induced by the ligand binding into the aptamer pocket. But several experimental and computational
1099studies have already determined dissociation constants for this system. Jenison et al. 1994,
1100for instance, determined a dissociation constant of $K_d = 0.32\mu M$ which, for standard
1101reference concentration $c = 1 mol/L$, can be translated into a binding free energy
1102
1103$$\Delta G = RT \cdot \ln \frac{K_d}{c} \approx -9.22~kcal/mol$$
1104
1105Finally, we can compute the MFE structure for our example sequence
1106
1107\begin{verbatim}
1108  $ RNAfold -v --motif "GAUACCAG&CCCUUGGCAGC,(...((((&)...)))...),-9.22" theo.fa
1109\end{verbatim}
1110
1111Compare the predicted MFE structure with and without modeling the ligand interaction.
1112You may also enable partition function computation to compute base pair probabilities,
1113the centroid structure and MEA structure to investigate the effect of ligand binding
1114on ensemble diversity.
1115
1116\subsubsection{G-quadruplexes}
1117G-Quadruplexes are a common conformation found in G-rich sequences where four runs of
1118consecutive G's are separated by three short sequence stretches.
1119
1120\begin{center}
1121\includegraphics[width=.5\textwidth]{Figures/gquad_pattern.eps}\\
1122\end{center}
1123
1124They form local
1125self-enclosed stacks of G-quartets bound together through 8 Hogsteen-Watson Crick bonds
1126and further stabilized by a metal ion (usually potassium).
1127
1128\begin{center}
1129\includegraphics[width=.85\textwidth]{Figures/gquad.eps}\\
1130\end{center}
1131
1132To
1133acknowledge the competition of regular secondary structure and G-quadruplex formation,
1134the \texttt{ViennaRNA Package} implements an extension to the default recursion scheme.
1135For that purpose, G-quadruplexes are simply considered a different type of substructure
1136that may be incorporated like any other substructure. The free energy of a particular
1137G-quadruplex at temperature $T$ is determined by a simple energy model
1138
1139$$E(L, l_{tot}, T) = a(t) \cdot (L - 1) + b(T) \cdot ln(l_{tot} - 2)$$
1140
1141that only considers the number of stacked layers $L$ and the total size of the three
1142linker sequences $l_{tot} = l_1 + l_2 + l_3$ connecting the G runs. Linker sequence
1143and assymetry effects as well as relative strand orientations (parallel, anti-parallel
1144or mixed) are entirely neglected in this model. The free energy parameters
1145$$a(T) = H_a + TS_a$$ and
1146$$b(T) = H_b + TS_b$$ have been determined from experimental UV-melting data taken
1147from Zhang et al. 2011, Biochemistry.
1148
1149\texttt{RNAfold} allows one to activate the G-quadruplex implementation by simply
1150providing the \texttt{-g} switch. G-quadruplexes are then taken into account for
1151MFE and equilibrium probability computations.
1152
1153\begin{verbatim}
1154  $ echo "GGCUGGUGAUUGGAAGGGAGGGAGGUGGCCAGCC" | RNAfold -g -p
1155  GGCUGGUGAUUGGAAGGGAGGGAGGUGGCCAGCC
1156  ((((((..........++.++..++.++)))))) (-21.39)
1157  ((((((..........(..........))))))) [-21.83]
1158  ((((((..........++.++..++.++)))))) {-21.39 d=0.04}
1159   frequency of mfe structure in ensemble 0.491118; ensemble diversity 0.08
1160\end{verbatim}
1161
1162The resulting structure layout and dot plot \texttt{PostScript} files depict the
1163prediced G-quadruplexes as hairpin-like loops with additional bonds between the
1164interacting G's, and green triangles where the color intensity encodes the G-quadruplex
1165probability, respectively. Have a closer look at the actual G-quadruplex probabilities
1166by opening the dot plot \textit{PostScript} file with a text browser again.
1167
1168\begin{center}
1169\includegraphics[width=.75\textwidth]{Figures/gquad_menon.eps}\\
1170\end{center}
1171
1172A better drawing of the predicted G-quadruplex might look as follows
1173
1174\begin{center}
1175\includegraphics[width=.5\textwidth]{Figures/gquad_menon_nice.eps}\\
1176\end{center}
1177
1178Repeat the above analysis for other RNA sequences that might contain and form
1179a G-quadruplex, e.g. the human telomerase RNA component hTERC
1180\begin{verbatim}
1181  >hTERC
1182  AGAGAGUGACUCUCACGAGAGCCGCGAGAGUCAGCUUGGCCAAUCCGUGCGGUCGG
1183  CGGCCGCUCCCUUUAUAAGCCGACUCGCCCGGCAGCGCACCGGGUUGCGGAGGGUG
1184  GGCCUGGGAGGGGUGGUGGCCAUUUUUUGUCUAACCCUAACUGAGAAGGGCGUAGG
1185  CGCCGUGCUUUUGCUCCCCGCGCGCUGUUUUUCUCGCUGACUUUCAGCGGGCGGAA
1186  AAGCCUCGGCCUGCCGCCUUCCACCGUUCAUUCUAGAGCAAACAAAAAAUGUCAGC
1187  UGCUGGCCCGUUCGCCCCUCCCGGGGACCUGCGGCGGGUCGCCUGCCCAGCCCCCG
1188  AACCCCGCCUGGAGGCCGCGGUCGGCCCGGGGCUUCUCCGGAGGCACCCACUGCCA
1189  CCGCGAAGAGUUGGGCUCUGUCAGCCGCGGGUCUCUCGGGGGCGAGGGCGAGGUUC
1190  AGGCCUUUCAGGCCGCAGGAAGAGGAACGGAGCGAGUCCCCGCGCGCGGCGCGAUU
1191  CCCUGAGCUGUGGGACGUGCACCCAGGACUCGGCUCACACAUGC
1192\end{verbatim}
1193
1194\subsubsection{Single strand binding (SSB) protein interaction}
1195Similar to the ligand interactions discussed above, a single strand binding
1196(SSB) protein might bind to consecutively unpaired sequence motifs. To model
1197such interactions the \texttt{ViennaRNA Package} implements yet another
1198extension to the folding grammar to cover all cases a protein may bind to,
1199termed \textit{unstructured domains}. This is in contrast to the ligand binding
1200example above that uses the soft constraints implementation, and is, therefore,
1201restricted to unpaired hairpin- and interior-loops.
1202
1203To make use of this implementation in \texttt{RNAfold} one has to resort
1204to \textit{command files} again. Here, an unstructured domain (UD) can be easily
1205added using the following syntax
1206\begin{verbatim}
1207UD m e [LOOP]
1208\end{verbatim}
1209where \texttt{m} is the sequence motif the protein binds to in IUPAC format,
1210\texttt{e} is the binding free energy in $kcal/mol$, and the optional \texttt{LOOP}
1211specifier allows for restricting the binding to particular loop types, e.g.
1212\texttt{M} for multibranch loops, or \texttt{E} for the exterior loop. See the
1213syntax for command files above for an overview of all loop types available.
1214
1215As an example, consider the protein binding experiment taken from Forties and Bundschuh 2010,
1216Bioinformatics (\url{https://dx.doi.org/10.1093/bioinformatics/btp627}). Here, the authors
1217investigate a hypothetical unspecific RNA binding protein with a footprint of $6~nt$ and
1218a binding energy of $\Delta G = -10~kcal/mol$ at $1~M$. With $T = 37^\circ C$ and
1219$$\Delta G = RT \cdot \ln \frac{K_d}{c}$$
1220this translates into a dissociation constant of
1221$$K_d = exp(\Delta G / RT) = 8.983267433 \cdot 10^{-8}.$$
1222Hence, the binding energies at $50~nM$, $100~nM$, $400~nM$, and $1~\mu M$ are $0.36~kcal/mol$,
1223$-0.07~kcal/mol$, $-0.92~kcal/mol$, and $-1.49~kcal/mol$, respectively.\
1224
1225The RNA sequence file \texttt{forties\_bundschuh.fa} for this experiment is
1226\begin{verbatim}
1227>forties_bundschuh
1228CGCUAUAAACCCCAAAAAAAAAAAAGGGGAAAAUAGCG
1229\end{verbatim}
1230which yields the following MFE structure
1231\begin{center}
1232\includegraphics[width=.5\textwidth]{Figures/forties_ss.eps}\\
1233\end{center}
1234
1235To model the protein binding for this example with \texttt{RNAfold} we require
1236a commands file for each of the concentrations in question. Thus, one simply creates
1237text files with a single line content
1238\begin{verbatim}
1239UD NNNNNN e
1240\end{verbatim}
1241where \texttt{e} is the binding free energy at this specific protein concentration
1242as computed above. Note here, that we use \texttt{NNNNNN} as sequence motif that is
1243bound by the protein to acknowledge the unspecific interaction between protein and
1244RNA. Finally, \texttt{RNAfold} is executed to compute equilibrium base pairing and
1245per-nucleotide protein binding probabilities
1246\begin{verbatim}
1247  $ RNAfold -p --commands forties_50nM.txt forties_bundschuh.fa
1248\end{verbatim}
1249and the produced probability dot plot can be inspected.
1250\begin{center}
1251\includegraphics[width=.5\textwidth]{Figures/forties_50nM_dp.eps}\\
1252\end{center}
1253As you can see, the dot plot is augmented with an additional linear array of blue squares
1254along each side that depicts the probability that the respective nucleotide is bound
1255by the protein. Now, repeat the computations for different protein concentrations and
1256compare the probabilities computed with the unstructured domain feature of the
1257\texttt{ViennaRNA Package} with those in Fig. 3(a) of the publication.
1258
1259Note, that \texttt{RNAfold} allows for an unlimited number of different proteins
1260specified in the commands file. This easily allows one to model RNA-protein binding
1261interaction within a relatively complex solution of different competing proteins.
1262
1263\subsubsection{Change other model settings}
1264\texttt{RNAfold} also allows for many other changes of the implemented Nearest Neighbor
1265model. For instance, you can explicitly prohibit $(G,U)$ pairs, change the temperature
1266that is used for evaluation of the free energy of particular loops, select a different
1267dangling-end energy model or load a different set of free energy parameters, e.g. for
1268DNA or parameters derived from computational optimizations.
1269
1270See the man pages of \texttt{RNAfold} for a complete overview of all available options
1271and command line switches. Additional energy parameter collections are distributed together
1272with the \texttt{ViennaRNA Package} as part of the contents of the \texttt{misc/} directory,
1273and are typically installed in
1274\begin{verbatim}
1275  prefix/share/ViennaRNA
1276\end{verbatim}
1277where \texttt{prefix} is the path that was used as installation prefix, e.g.
1278\texttt{\$HOME/Tutorial/Progs/VRP} (used in this tutorial) or \texttt{/usr} when installed
1279globally using a package manager.
1280
1281
1282\subsection{The Program \texttt{RNAplot}}
1283You can manually add additional annotation to structure drawings using the \texttt{RNAplot}
1284program (for information see its \texttt{man} page). Here's a somewhat complicated example:
1285
1286\begin{verbatim}
1287  $ RNAfold 5S.seq > 5S.fold
1288  $ RNAplot --pre "76 107 82 102 GREEN BFmark 44 49 0.8 0.8 0.8 Fomark \
1289    1 15 8 RED omark 80 cmark 80 -0.23 -1.2 (pos80) Label 90 95 BLUE Fomark" < 5S.fold
1290  $ gv 5S_ss.ps
1291\end{verbatim}%$
1292\begin{center}
1293\includegraphics[width=.75\textwidth]{Figures/5S_ss.eps}\\
1294\end{center}
1295
1296\texttt{RNAplot} is a very useful tool to color structure layout plots. The \texttt{--pre} tag adds
1297PostScript code required to color distinct regions of your molecule. There are some predefined
1298statements with different options for annotations listed below:
1299
1300\begin{tabular}{ll}
1301	\texttt{i cmark} & draws circle around base i\\
1302	\texttt{i j c gmark} & draw basepair i,j with c counter examples in grey\\
1303	\texttt{i j lw rgb omark} & stroke segment i...j with linewidth lw and color (rgb)\\
1304	\texttt{i j rgb Fomark} & fill segment i...j with color (rgb)\\
1305	\texttt{i j k l rgb BFmark} & fill block between pairs i,j and k,l with color (rgb)\\
1306	\texttt{i dx dy (text) Label} & adds a textlabel with an offset dx and dy relative to base i\\
1307\end{tabular}
1308
1309Predefined color options are \texttt{BLACK, RED, GREEN, BLUE, WHITE} but you can also
1310replace the value to some standard RGB code (e.g. 0 5 8 for lightblue).\\
1311
1312To simply add the annotation macros to the \texttt{PostScript} file without
1313any actual annotation you can use the following program call
1314\begin{verbatim}
1315  $ RNAplot --pre "" < 5S.fold
1316\end{verbatim}
1317
1318If you now open the structure layout file \texttt{5S\_ss.ps} with a text editor
1319you'll see the additional macros for \texttt{cmark}, \texttt{omark}, etc.
1320along with some show synopsis on how to use them. Actual annotations can then be added
1321between the lines
1322\begin{verbatim}
1323\% Start Annotations
1324\end{verbatim}
1325and
1326\begin{verbatim}
1327\% End Annotations
1328\end{verbatim}
1329Here, you simply need to add the same string of commands you would provide through the
1330\texttt{--pre} option of \texttt{RNAplot}.
1331
1332
1333To see what exactly the alternative structures of our sequence are, we need
1334to predict \emph{suboptimal} structures.
1335
1336
1337\pagebreak[3]
1338\subsection{The Program \texttt{RNApvmin}}
1339
1340The program \texttt{RNApvmin} reads a RNA sequence from \textit{stdin} and uses an iterative minimization
1341process to calculate a perturbation vector that minimizes the discripancies
1342between predicted pairing probabilites and observed pairing probabilities
1343(deduced from given shape reactivities).
1344The experimental SHAPE data has to be present in the file format described above.
1345The application will write the calculated vector of perturbation energies to \textit{stdout},
1346while the progress of the minimization process is written to \textit{stderr}.
1347The resulting perturbation vector can be interpreted directly and gives usefull insights into the
1348discrepancies between thermodynamic prediction and experimentally determined pairing status.
1349In addition the perturbation energies can be used to constrain folding with \texttt{RNAfold}:
1350
1351\begin{verbatim}
1352$ RNApvmin rna.shape < rna.seq >vector.csv
1353$ RNAfold --shape=vector.csv --shapeMethod=W < rna.seq
1354\end{verbatim}%$
1355
1356The perturbation vector file uses the same file format as the SHAPE data file.
1357Instead of SHAPE reactivities the raw perturbation energies will be storred in the last column.
1358Since the energy model is only adjusted when necessary, the calculated perturbation energies may be used
1359for the interpretation of the secondary structure prediction, since they indicate
1360which positions require major energy model adjustments in order to yield a prediction
1361result close to the experimental data. High perturbation energies for just
1362a few nucleotides may indicate the occurrence of features, which are not explicitly
1363handled by the energy model, such as posttranscriptional modifications and
1364intermolecular interactions.
1365
1366%===
1367\pagebreak[3]
1368\subsection{The Program \texttt{RNAsubopt}}
1369\texttt{RNAsubopt} calculates all suboptimal secondary structures within a
1370given energy range above the \texttt{MFE} structure. Be careful, the number
1371of structures returned grows exponentially with both sequence length and
1372energy range.
1373
1374%\begin{frame}[fragile]
1375\frametitle{Suboptimal folding}
1376\begin{itemize}
1377\item Generate all suboptimal structures within a certain energy
1378range from the \texttt{MFE} specified by the \texttt{-e} option.
1379\begin{verbatim}
1380$ RNAsubopt -e 1 -s < test.seq
1381CUACGGCGCGGCGCCCUUGGCGA   -500    100
1382...........((((...)))).  -5.00
1383....((((...))))........  -4.80
1384(((.((((...))))..)))...  -4.20
1385...((.((.((...)).)).)).  -4.10
1386\end{verbatim}%$
1387%\end{frame}
1388\end{itemize}
1389\noindent
1390The text output shows an energy sorted list (option \texttt{-s}) of all
1391secondary structures within 1~kcal/mol of the \texttt{MFE}
1392structure. Our sequence actually has a ground state structure (-5.70) and three
1393structures within 1~kcal/mol range.
1394
1395\texttt{MFE} folding alone gives no
1396indication that there are actually a number of plausible structures.
1397Remember that \texttt{RNAsubopt} cannot automatically plot structures, therefore
1398you can use the tool \texttt{RNAplot}. Note that you CANNOT simply pipe the
1399output of \texttt{RNAsubopt} to \texttt{RNAplot} using
1400\begin{verbatim}
1401$ RNAsubopt < test.seq | RNAplot
1402\end{verbatim}
1403You need to manually create a file for each structure you want to plot. Here, for example we created a new file named suboptstructure.txt:
1404\begin{verbatim}
1405> suboptstructure-4.20
1406CUACGGCGCGGCGCCCUUGGCGA
1407(((.((((...))))..)))...
1408\end{verbatim}
1409The fasta header is optional, but useful (without it the outputfile will be named rna.ps).
1410The next two lines contain the sequence and the suboptimal structure you want to plot;
1411in this case we plotted the structure with the folding energy of -4.20.
1412Then plot it with
1413\begin{verbatim}
1414$ RNAplot < suboptstructure.txt
1415\end{verbatim}
1416
1417Note that the number of suboptimal structures grows exponentially with
1418sequence length and therefore this approach is only tractable for
1419sequences with less than 100 nt. To keep the number of suboptimal
1420structures manageable the option \texttt{--noLP} can be used, forcing
1421\texttt{RNAsubopt} to produce only structures without isolated base
1422pairs. While \texttt{RNAsubopt} produces \emph{all} structures within an
1423energy range, \texttt{mfold} produces only a few, hopefully representative,
1424structures. Try folding the sequence on the mfold
1425server at \\
1426\url{http://mfold.rna.albany.edu/?q=mfold}.\\
1427
1428Sometimes you want to get information about unusual properties of the
1429Boltzmann ensemble (the sum of all RNA structures possible) for which no
1430specialized program exists. For example you want to know all fractions
1431of a bacterial mRNA in the Boltzmann ensemble where the Shine-Dalgarno (SD)
1432 sequence is unpaired. If the SD sequence is concealed
1433by secondary structure the translation efficiency is reduced.
1434
1435In such cases you can resort to drawing a representative sample of
1436structures from the Boltzmann ensemble by using the option
1437\texttt{-p}. Now you can simply count how many structures in the sample
1438possess the feature you are looking for. This number divided by the
1439size of your sample gives you the desired fraction.\\
1440
1441\noindent
1442The following example calculates the fraction of structures in the
1443ensemble that have bases 6 to 8 unpaired.
1444
1445%===
1446%\begin{frame}[fragile]
1447\frametitle{Sampling the Boltzmann Ensemble}
1448\begin{enumerate}
1449\item Draw a sample of size 10,000 from the Boltzmann ensemble
1450\item Calculate the desired property by using a perl script
1451\end{enumerate}
1452\begin{verbatim}
1453$ RNAsubopt -p 10000 < test.seq > tt
1454$ perl -nle '$h++ if substr($_,5,3) eq "...";
1455      END {print $h/$.}' tt
1456      0.391960803919608
1457\end{verbatim}
1458%\end{frame}
1459%$
1460
1461\noindent
1462A far better way to calculate this property is to use \texttt{RNAfold -p}
1463to get the ensemble free energy, which is related to the
1464partition function via $F = -RT\ln(Q)$, for the unconstrained ($F_u$)
1465and the constrained case ($F_c$), where the three bases are not
1466allowed to form base pairs (use option \texttt{-C}), and evaluate $p_c
1467= \exp((F_u - F_c)/RT)$ to get the desired probability.\\
1468
1469So let's do the calculation using \texttt{RNAfold}.
1470\begin{verbatim}
1471$RNAfold -p
1472
1473Input string (upper or lower case); @ to quit
1474....,....1....,....2....,....3....,....4....,....5....,....6....,....7....,....8
1475CUACGGCGCGGCGCCCUUGGCGA
1476length = 23
1477CUACGGCGCGGCGCCCUUGGCGA
1478...........((((...)))).
1479 minimum free energy =  -5.00 kcal/mol
1480....{,{{...||||...)}}}.
1481 free energy of ensemble =  -5.72 kcal/mol
1482....................... {  0.00 d=4.66}
1483 frequency of mfe structure in ensemble 0.311796; ensemble diversity 6.36
1484\end{verbatim}
1485\noindent
1486Now we have calculated the free ensemble energy of the ensemble over all structures (F\_u),
1487in the next step we have to calculate it for the structures using a constraint(F\_c).\\
1488
1489\noindent Following notation has to be used for defining the constraint:
1490\begin{enumerate}
1491\item $|$ : paired with another base
1492\item . : no constraint at all
1493\item x : base must not pair
1494\item $<$ : base i is paired with a base j<i
1495\item $>$ : base i is paired with a base j>i
1496\item matching brackets ( ): base i pairs base j\\
1497\end{enumerate}
1498
1499\noindent So our constraint should look like this:
1500\begin{verbatim}
1501  .....xxx...............
1502\end{verbatim}
1503Next call the application with following command and provide the
1504sequence and constraint we just created.
1505\begin{verbatim}
1506$ RNAfold -p -C
1507\end{verbatim}
1508The output should look like this
1509\begin{verbatim}
1510length = 23
1511CUACGGCGCGGCGCCCUUGGCGA
1512...........((((...)))).
1513 minimum free energy =  -5.00 kcal/mol
1514...........((((...)))).
1515 free energy of ensemble =  -5.14 kcal/mol
1516...........((((...)))). { -5.00 d=0.42}
1517 frequency of mfe structure in ensemble 0.792925; ensemble diversity 0.79
1518\end{verbatim}
1519\noindent
1520Afterwards evaluate the desired probability according to the formula given before
1521e.g. with a simple perl script.
1522\begin{verbatim}
1523$ perl -e 'print exp(-(5.72-5.14)/(0.00198*310.15))."\n"'
1524\end{verbatim}
1525
1526You can see that there is a slight difference between the \texttt{RNAsubopt} run with 10,000
1527samples and the \texttt{RNAfold} run including all structures.
1528
1529\pagebreak[3]
1530\section{RNA folding kinetics}
1531RNA folding kinetics describes the dynamical process of how a RNA molecule
1532approaches to its unique folded biological active conformation (often
1533referred to as the native state) starting from an initial ensemble of
1534disordered conformations e.g. the unfolded open chain. The key for
1535resolving the dynamical behavior of a folding RNA chain lies in the
1536understanding of the ways in which the molecule explores its astronomically
1537large free energy landscape, a rugged and complex hyper-surface established
1538by all the feasible base pairing patterns a RNA sequence can form. The
1539challenge is to understand how the interplay of formation and break up of
1540base pairing interactions along the RNA chain can lead to an efficient
1541search in the energy landscape which reaches the native state of the
1542molecule on a biologically meaningful time scale.
1543
1544\subsection{RNA2Dfold}
1545RNA2Dfold is a tool for computing the MFE structure, partition function and
1546 representative sample structures of $\kappa$, $\lambda$ neighborhoods and projects an high
1547dimensional energy landscape of RNA into two dimensions. Therefore a sequence
1548and two user-defined reference structures are expected by the program.
1549For each of the resulting distance class, the MFE representative, the
1550Boltzmann probabilities and the Gibbs free energy is computed. Additionally,
1551representative suboptimal secondary structures from each partition can be calculated.
1552
1553\begin{verbatim}
1554$ RNA2Dfold -p < 2dfold.inp > 2dfold.out
1555\end{verbatim}
1556
1557The outputfile \texttt{2dfold.out} should look like below, check it out using \texttt{less}.
1558\begin{tiny}
1559\begin{verbatim}
1560CGUCAGCUGGGAUGCCAGCCUGCCCCGAAAGGGGCUUGGCGUUUUGGUUGUUGAUUCAACGAUCAC
1561((((((((((....)))))..(((((....))))).)))))...(((((((((...))))))))). (-30.40)
1562((((((((((....)))))..(((((....))))).)))))...(((((((((...))))))))). (-30.40) <ref 1>
1563.................................................................. (  0.00) <ref 2>
1564free energy of ensemble = -31.15 kcal/mol
1565k       l       P(neighborhood) P(MFE in neighborhood)  P(MFE in ensemble)      MFE     E_gibbs MFE-structure
15660       24      0.29435909      1.00000000      0.29435892      -30.40  -30.40  ((((((((((....)))))..(((((....))))).)))))...(((((((((...))))))))).
15671       23      0.17076902      0.47069889      0.08038083      -29.60  -30.06  ((((((((((....)))))..(((((....))))).)))))....((((((((...))))))))..
15682       22      0.03575448      0.37731068      0.01349056      -28.50  -29.10  ((((.(((((....)))))..(((((....)))))..))))....((((((((...))))))))..
15692       24      0.00531223      0.42621709      0.00226416      -27.40  -27.93  ((((((((((....))))...(((((....)))))))))))...(((((((((...))))))))).
15703       21      0.00398349      0.29701636      0.00118316      -27.00  -27.75  .(((.(((((....)))))..(((((....)))))..))).....((((((((...))))))))..
15713       23      0.00233909      0.26432372      0.00061828      -26.60  -27.42  ((((((((((....))))...(((((....)))))))))))....((((((((...))))))))..
1572[...]
1573\end{verbatim}
1574\end{tiny}
1575
1576For visualizing the output the ViennaRNA Package includes two scripts
1577\texttt{2Dlandscape\_pf.gri, 2Dlandscape\_mfe.gri} located in \texttt{VRP/share/ViennaRNA/}.
1578gri (a language for scientific graphics programing) is needed to create a colored
1579postscript plot. We use the partition function script to show the free energies of
1580the distance classes (graph below, left):
1581
1582\begin{verbatim}
1583$ gri ../Progs/VRP/share/ViennaRNA/2Dlandscape_pf.gri 2dfold.out
1584\end{verbatim}
1585
1586Compare the output file with the colored plot and determine the MFE minima with
1587corresponding distance classes. For easier comparision the outputfile of \texttt{RNA2Dfold} can be
1588sorted by a simple sort command. For further information regarding sort use the \texttt{--help} option.
1589\begin{verbatim}
1590$ sort -k6 -n 2dfold.out > sort.out
1591\end{verbatim}
1592Now we choose the structure with the lowest energy besides our startstructure,
1593replace the open chain structure from our old input with that structure and repeat the steps above
1594with our new values
1595
1596\begin{itemize}
1597\item run \texttt{RNA2Dfold}
1598\item plot it using \texttt{2Dlandscape\_pf.gri}\\
1599\end{itemize}
1600
1601The new projection (right graph) shows the two major local minima which are separated by 39 bp (red dots in figure below)
1602and both are likely to be populated with high probability. The landscape gives an estimate of
1603the energy barrier separating the two minima (about -20 kcal/mol).\\
1604\noindent
1605The red dots mark the distance from open chain to the MFE structure respectively the
1606distance from the 2nd best structure to the MFE. Note that the red dots were manually added to the image afterwards so don't panic if you don't see them in your gri output.
1607\begin{center}
1608	\includegraphics[width=.45\textwidth]{Figures/2dfold_out_m.eps}\hfill
1609	\includegraphics[width=.45\textwidth]{Figures/2dfold_2_out_m.eps}
1610\end{center}
1611
1612\subsection{barriers \& treekin}
1613The following assumes you have the barriers and treekin programs
1614installed. If not, the current release can be found at
1615\url{http://www.tbi.univie.ac.at/RNA/Barriers/}. Installation proceeds
1616as shown for the ViennaRNA Package in section \ref{sect:install}. One
1617problem that often occurs during treekin installation is the
1618dependency on \texttt{blas} and \texttt{lapack} packages which is not
1619carefully checked. For further information according to the barriers
1620and treekin program also see the website.
1621
1622\frametitle{A short recall on howto install/compile a program}
1623\begin{enumerate}
1624	\item Get the barriers source from \url{http://www.tbi.univie.ac.at/RNA/Barriers/}
1625	\item extract the archive and go to the directory
1626\begin{verbatim}
1627$ tar -xzf Barriers-1.5.2.tar.gz
1628$ cd Barriers-1.5.2
1629\end{verbatim}
1630	\item use the \texttt{--prefix} option to install in your \texttt{Progs} directory
1631\begin{verbatim}
1632$ ./configure --prefix=$HOME/Tutorial/Progs/barriers-1.5.2
1633\end{verbatim}
1634	\item make install
1635\begin{verbatim}
1636$ make
1637$ make install
1638\end{verbatim}
1639\end{enumerate}
1640
1641Now barriers is ready to use. Apply the same steps to install treekin.
1642\texttt{Note:} Copy the barriers and treekin binaries to your \texttt{bin}
1643folder or add the path to your \texttt{PATH} variable.
1644
1645%\begin{frame}[fragile]
1646\frametitle{Calculate the Barrier Tree}
1647\begin{verbatim}
1648$ echo UCCACGGCUGUUAGUGGAUAACGGC | RNAsubopt --noLP -s -e 10 > barseq.sub
1649$ barriers -G RNA-noLP --bsize --rates < barseq.sub > barseq.bar
1650\end{verbatim}%$
1651You can restrict the number of local minima using the \texttt{barriers}
1652command-line option \texttt{--max} followed by a number. The option \texttt{-G RNA-noLP}
1653instructs barriers that the input consists of RNA secondary structures without isolated
1654 basepairs. \texttt{--bsize} adds size of the gradient basins and \texttt{--rates} tells
1655barriers to compute rates between macro states/basins for use with treekin. Another useful
1656options is \texttt{--minh} to print only minima with a barrier $> dE$. Look at the
1657output file \texttt{less -S barseq.bar}.  Use the arrow keys to navigate.
1658\begin{small}
1659\begin{verbatim}
1660  UCCACGGCUGUUAGUGGAUAACGGC
16611 (((((........))))).......  -6.90    0  10.00    115     0  -7.354207     23  -7.012023
16622 ......(((((((.....)))))))  -6.80    1   9.30     32    58  -6.828221     38  -6.828218
16633 (((...(((...)))))).......  -0.80    1   0.90      1    10  -0.800000      9  -1.075516
16644 ....((..((((....)))).))..  -0.80    1   2.70      5    37  -0.973593     11  -0.996226
16655 .........................   0.00    1   0.40      1    14  -0.000000     26  -0.612908
16666 ......(((....((.....)))))   0.60    2   0.40      1    22   0.600000      3   0.573278
16677 ......((((((....)))...)))   1.00    1   1.50      1    95   1.000000      2   0.948187
16688 .((....((......)).....)).   1.40    1   0.30      1    30   1.400000      2   1.228342
1669\end{verbatim}
1670
1671The first row holds the input sequence, the successive list the local
1672minima ascending in energy. The meaning of the first 5 columns is as follows
1673\begin{enumerate}
1674\item label (number) of the local minima (1=MFE)
1675\item structure of the minimum
1676\item free energy of the minimum
1677\item label of deeper local minimum the current minimum merges with (note that the
1678  \texttt{MFE} has no deeper local minimum to merge with)
1679\item height of the energy barrier to the local minimum to merge with
1680\item numbers of structures in the basin we merge with
1681\item number of basin which we merge to
1682\item free energy of the basin
1683\item number of structures in this basin using gradient walk
1684\item gradient basin (consisting of all structures where gradientwalk ends in the minimum)
1685\end{enumerate}
1686\end{small}
1687%\end{frame}
1688\frametitle{Calculate The Barrier Tree}
1689\begin{center}
1690  \includegraphics[width=.5\textwidth]{Figures/tree.eps}
1691\end{center}
1692\texttt{barriers} produced two additional files, the \texttt{PostScript}
1693file \texttt{tree.eps} which represents the basic information of the
1694\texttt{barseq.bar} file visually (look at the file e.g. \texttt{gv tree.eps})
1695and a text file \texttt{rates.out} which holds the matrix of transition
1696probabilities between the local minima.
1697%\end{frame}
1698
1699%\begin{frame}
1700\frametitle{Simulating the Folding Kinetics}
1701\noindent
1702The program \texttt{treekin} is used to simulate the evolution over time of the
1703population densities of local minima starting from an initial population
1704density distribution $p0$ (given on the command-line) and the
1705transition rate matrix in the file \texttt{rates.out}.
1706\begin{verbatim}
1707$ treekin -m I --p0 5=1 < barseq.bar | xmgrace -log x -nxy -
1708\end{verbatim}%$
1709\begin{center}
1710  \includegraphics[width=.5\textwidth]{Figures/FOO.eps}\hfill
1711  \includegraphics[width=.40\textwidth]{Figures/FOO_dp.eps}
1712\end{center}
1713The simulation starts with all the population density in the open chain
1714(local minimum 5, see \texttt{barseq.bar}). Over time the population density of this state decays
1715(yellow curve) and other local minima get populated. The simulation ends
1716with the population densities of the thermodynamic equilibrium in which the
1717MFE (black curve) and local minimum 2 (red curve) are the only ones
1718populated. (Look at the dot plot of the sequence created with \texttt{RNAsubopt}
1719and \texttt{RNAfold}!)
1720%\end{frame}
1721
1722%===
1723\pagebreak[4]
1724\section{Sequence Design}
1725\subsection{The Program \texttt{RNAinverse}}
1726\texttt{RNAinverse} searches for sequences folding into a predefined
1727structure, thereby inverting the folding algorithm. Input consists of the
1728target structures (in dot-bracket notation) and a starting sequence,
1729which is optional.\\
1730
1731Lower case characters in the start sequence indicate fixed positions,
1732i.e. they can be used to add sequence constraints. '\texttt{N}'s in the
1733starting sequence will be replaced by a random nucleotide.
1734For each search the best sequence found and its Hamming distance to the
1735start sequence are printed to \textit{stdout}. If the the search was
1736unsuccessful a structure distance to the target is appended.\\
1737
1738By default the program stops as soon as it finds a sequence that has the
1739target as MFE structure. The option \texttt{-Fp} switches
1740\texttt{RNAinverse} to the partition function mode where the probability of
1741the target structure $\exp(-E(S)/RT)/Q$ is maximized.  This tends to produce
1742sequences with a more well-defined structure.
1743This probability is written in dot-brackets after the found sequence and Hamming
1744distance. With the option \texttt{-R} you can specify how often the search
1745should be repeated.
1746
1747%===
1748%\begin{frame}[fragile]
1749  \frametitle{Sequence Design}
1750
1751\begin{enumerate}
1752\item Prepare an input file \texttt{inv.in} containing the target
1753structure and sequence constraints
1754\begin{verbatim}
1755  (((.(((....))).)))
1756  NNNgNNNNNNNNNNaNNN
1757\end{verbatim}
1758\item Design sequences using RNAinverse
1759\begin{verbatim}
1760$ RNAinverse < inv.in
1761        GGUgUUGGAUCCGAaACC    5
1762
1763$ RNAinverse -R5 -Fp < inv.in
1764        GGUgUGAACCCUCGaACC    5
1765        GGCgCCCUUUUGGGaGCC   12  (0.967418)
1766        CUCgAUCUCACGAUaGGG    6
1767        GGCgCCCGAAAGGGaGCC   13  (0.967548)
1768        GUUgAGCCCAUGCUaAGC    6
1769        GGCgCCCUUAUGGGaGCC   10  (0.967418)
1770        CGGgUGUUGUGACAaCCG    5
1771        GCGgGUCGAAAGGCaCGC   12  (0.925482)
1772        GCCgUAUCCGGGUGaGGC    6
1773        GGCgCCCUUUUGGGaGCC   13  (0.967418)
1774
1775\end{verbatim}
1776\end{enumerate}
1777\noindent
1778The output consists of the calculated sequence and the number of mutations
1779needed to get the MFE-structure from the start sequence (start sequence not shown).
1780Additionaly, with the partition function folding (\texttt{-Fp}) set, the second
1781output is another refinement so that the ensemble preferes the MFE and folds
1782into your given structure with a distinct probability, shown in brackets.\\
1783%\end{frame}
1784
1785Another useful program for inverse folding is \texttt{RNA designer}, see
1786\url{http://www.rnasoft.ca/}. RNA Designer takes a secondary structure
1787description as input and returns an RNA strand that is likely to fold in the
1788given secondary structure.
1789
1790The \texttt{sequence design application} of the \texttt{ViennaRNA Design Webservices},
1791see \url{http://nibiru.tbi.univie.ac.at/rnadesign/index.html} uses a different approach,
1792allowing for more than one secondary structure as input. For more detail read the online
1793Documentation and the next section of this tutorial.
1794
1795%\TODO{install package with RNA.pm}
1796\subsection{switch.pl}
1797The \texttt{switch.pl} script can be used to design bi-stable structures,
1798i.e. structures with two almost equally good foldings. For two given structures
1799there are always a lot of sequences compatible with both structures. If both
1800structures are reasonably stable you can find sequences where both target
1801structures have almost equal energy and all other structures have much higher energies.
1802Combined with RNAsubopt, barriers and treekin, this is a very useful tool for
1803designing RNAswitches. \\
1804
1805The input requires two structures in dot-bracket notation
1806and additionally you can add a sequence. It is also possible to calculate the
1807switching function at two different temperatures with option \texttt{-T} and \texttt{-T2}.
1808
1809\frametitle{Designing a Switch}
1810\noindent Now we try to create an RNA switch using \texttt{switch.pl}.
1811First we create our inputfile, then invoke the program using ten optimization runs
1812(\texttt{-n 10}) and do not allow lonely pairs. Write it out to \texttt{switch.out}
1813\begin{verbatim}
1814switch.in
1815     ((((((((......))))))))....((((((((.......))))))))
1816     ((((((((((((((((((........)))))))))))))))))).....
1817
1818$ switch.pl -n 10 --noLP < switch.in > switch.out
1819\end{verbatim}
1820
1821\texttt{switch.out} should look similar like this, the first block represents our
1822bi-stable structures in random order, the second block shows the resulting sequences ordered by
1823their score.
1824\begin{verbatim}
1825$ less switch.out
1826
1827GGGUGGACGUUUCGGUCCAUCCUUACGGACUGGGGCGUUUACCUAGUCC   0.9656
1828CAUUUGGCUUGUGUGUCGAAUGGCCCCGGUACGUAGGCUAAAUGUACCG   1.2319
1829GGGGGGUGCGUUCACACCCCUCAUUUGGUGUGGAUGUGCUUUCUACACU   1.1554
1830[...]
1831the resulting sequences are:
1832CAUUUGGCUUGUGUGUCGAAUGGCCCCGGUACGUAGGCUAAAUGUACCG   1.2319
1833GGGGGGUGCGUUCACACCCCUCAUUUGGUGUGGAUGUGCUUUCUACACU   1.1554
1834CGGGUUGUAACUGGAUAGCCUGGAAACUGUUUGGUUGUAAUCCGAACAG   1.0956
1835[...]
1836\end{verbatim}
1837
1838Given all 10 suggestions in our \texttt{switch.out}, we select the one with the best score
1839with some command line tools to use it as an \texttt{RNAsubopt} input file and build up the barriers tree.
1840\begin{verbatim}
1841$ tail -10 switch.out | awk '{print($1)}'  | head -n 1 > subopt.in
1842$ RNAsubopt --noLP -s -e 25 < subopt.in > subopt.out
1843$ barriers -G RNA-noLP --bsize --rates --minh 2 --max 30 < subopt.out > barriers.out
1844\end{verbatim}
1845
1846\texttt{tail -10} cuts the last 10 lines from the \texttt{switch.out} file and pipes them into
1847an \texttt{awk} script. The function \texttt{print(\$1)} echoes only the first column and this
1848is piped into the \texttt{head} program where the first line, which equals the best scored
1849sequence, is taken and written into \texttt{subopt.in}. Then \texttt{RNAsubopt} is called
1850to process our sequence and write the output to another file which is the input for the
1851barriers calculation.\\
1852
1853Below you find an example of the barriertree calculation above done with the right settings (connected root)
1854on the left side and the wrong \texttt{RNAsubobt -e} value on the right. Keep in mind that
1855\texttt{switch.pl} performs an stochastic search and the output sequences are different every time
1856because there are a lot of sequences which fit the structure and switch calculates a new one
1857everytime. Simply try to make sure.\\
1858
1859\includegraphics[width=.27\textheight]{Figures/switch_barriertree.eps}\hfill
1860\includegraphics[trim=-1.5cm 0cm 0cm 0cm,width=.27\textheight]{Figures/switch_barriertree_e13.eps}
1861\begin{tiny}
1862left: Barriers tree as it should look like, all branches connected to the main root
1863right: disconnected tree due to a too low \texttt{energy range (-e)} parameter set in \texttt{RNAsubopt}.
1864\end{tiny}\\
1865
1866Be careful to set the range -e high enough, otherwise we get a problem when calculation
1867the kinetics using treekin. Every branch should be somehow connected to the main root of the tree.
1868Try \texttt{-e 20} and \texttt{-e 30} to see the
1869difference in the trees and choose the optimal value. By using \texttt{--max 30} we
1870shorten our tree to focus only on the lowest minima.
1871We then select a branch preferably outside of the two main branches, here branch 30 (may differ from your own
1872calculation). Look at the barrier tree to find the best branch to start and replace \texttt{30} by the
1873branch you would choose. Now use treekin to plot concentration kinetics and think about the graph you
1874just created.
1875\begin{verbatim}
1876$ treekin -m I --p0 30=1  < barriers.out > treekin.out
1877$ xmgrace -log x -nxy treekin.out
1878\end{verbatim}
1879The graph could look like the one below, remember everytime you use \texttt{switch.pl} it can give you different
1880sequences so the output varies too. Here the one from the example.
1881\begin{center}
1882\includegraphics[trim=0cm 1.5cm 0cm -2.5cm, width=.45\textheight]{Figures/switch_treekin.eps}\\
1883\end{center}
1884%===
1885\pagebreak[3]
1886\section{RNA-RNA Interactions}
1887A common problem is the prediction of binding sites between two RNAs, as in
1888the case of miRNA-mRNA interactions. Following tools of the \texttt{ViennaRNA Package}
1889can be used to calculate base pairing probabilities.
1890
1891\subsection{The Program \texttt{RNAcofold}}
1892\texttt{RNAcofold} works much like \texttt{RNAfold} but uses
1893two RNA sequences as input which are then allowed to form a dimer
1894structure. In the input the two RNA sequences should be concatenated using
1895the `\texttt{\&}' character as separator.
1896As in \texttt{RNAfold} the \texttt{-p} option can be used to compute
1897partition function and base pairing probabilities.\\
1898
1899Since dimer formation is concentration dependent, \texttt{RNAcofold}
1900can be used to compute equilibrium concentrations for all five monomer
1901and (homo/hetero)-dimer species, given input concentrations for the
1902monomers (see the \texttt{man} page for details).
1903%===
1904%\begin{frame}[fragile]
1905  \frametitle{Two Sequences one Structure}
1906
1907  \begin{enumerate}
1908  \item Prepare a sequence file (\texttt{t.seq}) for input that looks like this
1909\begin{verbatim}
1910>t
1911GCGCUUCGCCGCGCGCC&GCGCUUCGCCGCGCGCA
1912\end{verbatim}
1913  \item Compute the \texttt{MFE} and the ensemble properties
1914  \item Look at the generated PostScript files \texttt{t\_ss.ps}
1915  and \texttt{t\_dp.ps}
1916  \end{enumerate}
1917\begin{verbatim}
1918$ RNAcofold -p < t.seq
1919>t
1920GCGCUUCGCCGCGCGCC&GCGCUUCGCCGCGCGCA
1921((((..((..((((...&))))..))..))))... (-17.70)
1922((((..{(,.((((,,.&))))..}),.)))),,. [-18.26]
1923frequency of mfe structure in ensemble 0.401754 , delta G binding= -3.95
1924\end{verbatim}%$
1925%\end{frame}
1926
1927
1928%===
1929%\begin{frame}
1930  \frametitle{Secondary Structure Plot and Dot Plot}
1931  \includegraphics[width=.25\textheight]{Figures/t_ss.eps}\hfill
1932  \includegraphics[width=.5\textwidth]{Figures/t_dp.eps}\\
1933%\end{frame}
1934In the dot plot a cross marks the chain break between the two concatenated sequences.
1935
1936\subsection{Concentration Dependency}
1937 Cofolding is an intermolecular process, therefore whether
1938 duplex formation will actually occur is concentration dependent.
1939 Trivially, if one of the molecules is not present, no dimers are going to
1940 be formed. The partition functions of the molecules give us the
1941 equilibrium constants:
1942 \begin{equation*}
1943   K_{AB} = \frac{[AB]}{[A][B]} = \frac{Z_{AB}}{Z_AZ_B}
1944 \end{equation*}
1945 with these and mass conservation, the equilibrium concentration of
1946 homodimers, heterodimers and monomers can be computed in dependence
1947 of the start concentrations of the two molecules.
1948 This is most easily done by creating a file with the
1949 initial concentrations of molecules $A$ and $B$ in two columns:\\
1950 \begin{eqnarray*}
1951   [a_1]([mol/l])  & [b_1]([mol/l])\cr
1952   [a_2]([mol/l])  & [b_2]([mol/l])\cr
1953       \vdots & \cr
1954   [a_n]([mol/l])  & [b_n]([mol/l])
1955 \end{eqnarray*}
1956
1957%\begin{frame}
1958  \frametitle{Concentration Dependency}
1959  \begin{enumerate}
1960  \item Prepare a concentration file for input with this little perl script
1961\begin{verbatim}
1962$ perl -e '$c=1e-07; do {print "$c\t$c\n"; $c*=1.71;} while $c<0.2' > concfile
1963\end{verbatim}
1964\noindent
1965This script creates a file displaying values from 1e-07 to just below 0.2, with 1.71-fold steps
1966in between. For convenience, concentration of molecule A is the same as concentration of
1967molecule B in each row. This will facilitate visualization of the results.
1968  \item Compute the \texttt{MFE}, the ensemble properties
1969    and the concentration dependency of hybridization.
1970\begin{verbatim}
1971$ RNAcofold -f concfile < t.seq > cofold.out
1972\end{verbatim}
1973  \item Look at the generated output with
1974\begin{verbatim}
1975$ less cofold.out
1976\end{verbatim}
1977  \end{enumerate}
1978\begin{scriptsize}
1979\begin{verbatim}
1980[...]
1981Free Energies:
1982AB              AA              BB              A               B
1983-18.261023      -17.562553      -18.274376      -7.017902       -7.290237
1984Initial concentrations          relative Equilibrium concentrations
1985A                B               AB              AA              BB              A               B
19861e-07           1e-07           0.00003         0.00002         0.00002         0.49994         0.49993
1987[...]
1988\end{verbatim}
1989\end{scriptsize}
1990% \end{frame}
1991
1992\noindent
1993The five different free energies were printed out first, followed
1994by a list of all the equilibrium concentrations, where the first two columns denote the initial
1995(absolute) concentrations of molecules $A$ and $B$, respectively. The next
1996five columns denote the equilibrium concentrations of dimers and monomers,
1997relative to the total particle number. (Hence, the concentrations don't add
1998up to one, except in the case where no dimers are built -- if you want to
1999know the fraction of particles in a dimer, you have to take the relative
2000dimer concentrations times 2).\\
2001Since relative concentrations of species depend on two independent values -
2002initial concentration of A as well as initial concentration of B - it is not
2003trivial to visualize the results. For this reason we used the same concentration
2004for A and for B. Another possibility would be to keep the initial concentration of
2005one molecule constant.
2006As an example we show the following plot of
2007$t.seq$.
2008%\begin{frame}[fragile]
2009Now we use some commandline tools to render our plot. We use \texttt{tail -n +11} to
2010show all lines starting with line 11 (1-10 are cut) and pipe it into an \texttt{awk} command, which
2011prints every column but the first from our input.
2012This is then piped to \texttt{xmgrace}.
2013With \texttt{-log x -nxy -} we tell it to plot the x axis in logarithmic scale and to read data file in X Y1 Y2 ... format.
2014
2015\begin{verbatim}
2016$ tail -n +11 cofold.out | awk '{print $2, $3, $4, $5, $6, $7}' | xmgrace -log x -nxy -
2017\end{verbatim}
2018
2019\frametitle{Concentration Dependency plot}
2020\begin{center}
2021\begin{figure}[h]
2022\includegraphics[trim= 0cm 0cm 0cm -2.2cm,width=.70\textwidth]{Figures/tconcdep.eps}\hfill
2023\end{figure}
2024\end{center}
2025  $\Delta G_{\text{binding}}=-5.01$ kcal/mol
2026\begin{verbatim}
2027  sequences:GCGCUUCGCCGCGCGCG&GCGCUUCGCCGCGCGCG
2028\end{verbatim}
2029%\end{frame}
2030Since the two sequences are almost identical, the monomer and homo-dimer
2031concentrations behave very similarly.
2032In this example, at a concentration of about 1 mmol 50\% of the molecule
2033is still in monomer form.
2034%===
2035
2036\pagebreak[2]
2037\subsection{Finding potential binding sites with \tt RNAduplex}
2038
2039If the sequences are very long (many kb)\texttt{RNAcofold} is too slow to be useful.
2040The \texttt{RNAduplex} program is a fast alternative, that works by
2041predicting \emph{only} intermolecular base pairs. It's almost as fast
2042as simple sequence alignment, but much more accurate than a \texttt{BLAST}
2043search.
2044
2045The example below searches the 3' UTR of an mRNA for a miRNA binding site.
2046
2047%\begin{frame}[fragile]
2048  \frametitle{Binding site prediction with \tt RNAduplex}
2049  The file \texttt{duplex.seq} contains the 3'UTR of NM\_024615 and the
2050  microRNA mir-145.
2051  \begin{small}
2052\begin{verbatim}
2053$ RNAduplex < duplex.seq
2054>NM_024615
2055>hsa-miR-145
2056.(((((.(((...((((((((((.&)))))))))))))))))).  34,57  :   1,19  (-21.90)
2057\end{verbatim}%$
2058  \end{small}
2059  Most favorable binding has an interaction energy of -21.90 kcal/mol and pairs up on positions
2060  34-57 of the UTR with positions 1-22 of the miRNA.\\
2061%\end{frame}
2062
2063\texttt{RNAduplex} can also produce alternative binding sites, e.g. running
2064\texttt{RNAduplex -e 10} would list all binding sites within 10 kcal/mol of
2065the best one.
2066
2067Since \texttt{RNAduplex} forms only intermolecular pairs, it neglects the
2068competition between intramolecular folding and hybridization. Thus, it is
2069recommended to use \texttt{RNAduplex} as a pre-filter and analyse good
2070\texttt{RNAduplex} hits additionally with \texttt{RNAcofold} or
2071\texttt{RNAup}. Using the example above, running \texttt{RNAup} will yield:
2072
2073%\begin{frame}[fragile]
2074%  \frametitle{Binding site prediction with \tt RNAduplex}
2075  \begin{small}
2076\begin{verbatim}
2077$ RNAup -b < duplex.seq
2078
2079>NM_024615
2080>hsa-miR-145
2081(((((((&)))))))  50,56  :   1,7   (-8.41 = -9.50 + 0.69 + 0.40)
2082GCUGGAU&GUCCAGU
2083RNAup output in file: hsa-miR-145_NM_024615_w25_u1.out
2084\end{verbatim}%$
2085  \end{small}
2086  The free energy of the duplex is -9.50 kcal/mol and shows a discrepancy to the
2087  structure and energy value computed by \texttt{RNAduplex} (differences may arise from
2088  the fact that RNAup computes partition functions rather than optimal structures).
2089  However, the total free energy of binding is less favorable (-8.41 kcal/mol), since it
2090  includes the energetic penalty for opening the binding site on the mRNA
2091  (0.69 kcal/mol) and miRNA (0.40 kcal/mol). The \texttt{-b} option includes the
2092  probability of unpaired regions in both RNAs.
2093
2094%\end{frame}
2095
2096You can also run \texttt{RNAcofold} on the example to see the complete
2097structure after hybridization (neither \texttt{RNAduplex} nor
2098\texttt{RNAup} produce structure drawings). Note however, that the input
2099format for \texttt{RNAcofold} is different. An input file suitable for
2100\texttt{RNAcofold} has to be created from the \texttt{duplex.seq} file first (use any text editor).\\
2101
2102
2103As a more difficult example, let's look at the interaction of the bacterial smallRNA RybB and its
2104target mRNA ompN. First we'll try predicting the binding site using RNAduplex:
2105\begin{small}
2106\begin{verbatim}
2107$ RNAduplex < RybB.seq
2108>RybB
2109>ompN
2110.((((..((((((.(((....((((((((..(((((.((..((.((....((((..(((((((((((..((((((&
2111.))))))..))))))).)))).....))))....)).)).)).))).))..))))........))))..))).)))))).)))).
2112 5,79  :  80,164 (-34.60)
2113\end{verbatim}%$
2114\end{small}
2115
2116Note, that the predicted structure spans almost the full length of the RybB small RNA.
2117Compare the predicted interaction to the structures predicted for RybB and ompN alone, and ask
2118yourself whether the predicted interaction is indeed plausible.
2119
2120Below the structure of ompN on the left and RybB on the right side. The respective binding regions
2121predicted by RNAduplex are marked in red.
2122
2123\begin{center}
2124  \includegraphics[width=0.65\linewidth]{Figures/ompN_ss.eps}
2125  \includegraphics[width=0.50\linewidth]{Figures/RybB_ss.eps}
2126\end{center}
2127\begin{tiny}
2128\begin{verbatim}
2129GCCAC-----TGCTTTTCTTTGATGTCCCCATTTT-GTGGA-------GC-CCATCAACCCCGCCATTTCGGTT---CAAG-GTTGGTGGGTTTTTT
2130 |||      ||||  |||||| |||    ||||| ||||        || ||| || ||  ||    ||||     |||| ||  |||  |||||| -40.30
2131AGGTCAAACAACGGC-AGAAACAATATT--TAAAGTCGCCGCACACGACGCGGTCGTCGGT-CGTCTCGGCCCTACTGTTCACGGTTATGAAAAGAAACC-3'
2132\end{verbatim}
2133\end{tiny}
2134\noindent
2135Compare the \texttt{RNAduplex} prediction with the interaction predicted by \texttt{RNAcofold}, \texttt{RNAup} and the handcrafted prediction you see above.
2136
2137\begin{center}
2138  \includegraphics[width=0.7\linewidth]{Figures/OmpN_cofold.eps}
2139\end{center}
2140
2141\pagebreak[3]
2142%===
2143%\input{alifold}
2144\section{Consensus Structure Prediction}
2145Sequence co-variations are a direct consequence of RNA base pairing
2146rules and can be deduced to alignments. RNA helices normally contain
2147only 6 out of the 16 possible
2148combinations: the Watson-Crick pairs \textsf{GC}, \textsf{CG},
2149\textsf{AU}, \textsf{UA}, and the somewhat weaker wobble pairs
2150\textsf{GU} and \textsf{UG}. Mutations in helical regions therefore
2151have to be correlated. In particular we often find ``compensatory
2152mutations'' where a mutation on one side of the helix is compensated
2153by a second mutation on the other side, e.g.\ a
2154\textsf{C}$\cdot$\textsf{G} pair changes into a
2155\textsf{U}$\cdot$\textsf{A} pair. Mutations where only one pairing
2156partner changes (such as \textsf{C}$\cdot$\textsf{G} to
2157\textsf{U}$\cdot$\textsf{G}) are termed ``consistent mutations''.
2158
2159\subsection{The Program \texttt{RNAalifold}}
2160\texttt{RNAalifold} generalizes the folding algorithm for sequence
2161alignments, treating the entire alignment as a single ``generalized
2162sequence''.  To assign an energy to a structure on such a generalized
2163sequence, the energy is simply averaged over all sequences in the
2164alignment. This average energy is augmented by a covariance term, that
2165assigns a bonus or penalty to every possible base pair $(i,j)$ based
2166on the sequence variation in columns $i$ and $j$ of the alignment.\\
2167
2168Compensatory mutations are a strong indication of structural
2169conservation, while consistent mutations provide a weaker signal. The
2170covariance term used by \texttt{RNAalifold} therefore assigns a bonus
2171of 1 kcal/mol to each consistent and 2 kcal/mol for each compensatory
2172mutation. Sequences that cannot form a standard base pair incur a
2173penalty of $-1$ kcal/mol. Thus, for every possible consensus pair
2174between two columns $i$ and $j$ of the alignment a covariance score
2175$C_{ij}$ is computed by counting the fraction of sequence pairs
2176exhibiting consistent and compensatory mutations, as well as the
2177fraction of sequences that are inconsistent with the pair. The weight
2178of the covariance term relative to the normal energy function, as well
2179as the penalty for inconsistent mutations can be changed via command
2180line parameters.\\
2181
2182Apart from the covariance term, the folding algorithm in
2183\texttt{RNAalifold} is essentially the same as for single sequence
2184folding. In particular, folding an alignment containing just one
2185sequence will give the same result as single sequence folding using
2186\texttt{RNAfold}. For $N$ sequences of length $n$ the required CPU
2187time scales as $\mathcal{O}(N\cdot n^2 + n^3)$ while memory
2188requirements grow as the square of the sequence length. Thus
2189\texttt{RNAalifold} is in general faster than folding each sequence
2190individually. The main advantage, however, is that the accuracy of
2191consensus structure predictions is generally much higher than for
2192single sequence folding, where typically only between 40\% and 70\% of
2193the base pairs are predicted correctly.\\
2194
2195Apart from prediction of \texttt{MFE} structures \texttt{RNAalifold}
2196also implements an algorithm to compute the partition function over
2197all possible (consensus) structures and the thermodynamic equilibrium
2198probability for each possible pair. These base pairing probabilities
2199are useful to see structural alternatives, and to distinguish well
2200defined regions, where the predicted structure is most likely correct,
2201from ambiguous regions.\\
2202
2203As a first example we'll produce a consensus structure prediction for
2204the following four tRNA sequences.
2205\begin{verbatim}
2206$ cat four.seq
2207\end{verbatim}
2208\vspace*{-3ex}
2209\begin{scriptsize}
2210\begin{verbatim}
2211>M10740 Yeast-PHE
2212GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUUUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCA
2213>K00349 Drosophila-PHE
2214GCCGAAAUAGCUCAGUUGGGAGAGCGUUAGACUGAAGAUCUAAAGGUCCCCGGUUCAAUCCCGGGUUUCGGCA
2215>K00283 Halobacterium volcanii Lys-tRNA-1
2216GGGCCGGUAGCUCAUUUAGGCAGAGCGUCUGACUCUUAAUCAGACGGUCGCGUGUUCGAAUCGCGUCCGGCCCA
2217>AF346993
2218CAGAGUGUAGCUUAACACAAAGCACCCAACUUACACUUAGGAGAUUUCAACUUAACUUGACCGCUCUGA
2219\end{verbatim}%$
2220\end{scriptsize}
2221
2222\vspace*{3ex}\noindent
2223\texttt{RNAalifold} uses aligned sequences as input. Thus, our first
2224step will be to align the sequences. We use \texttt{clustalw2} in this
2225example, since it's one of the most widely used alignment programs and
2226has been shown to work well on structural RNAs. Other alignment
2227programs can be used (including programs that attempt to do structural
2228alignment of RNAs), but the resulting multiple sequence alignment must
2229be in \texttt{Clustal} format. Get \texttt{clustalw2} and install it as you have done it with the other packages: \url{http://www.clustal.org/clustal2}
2230
2231%===
2232%\begin{frame}[fragile]
2233 \frametitle{Consensus Structure from related Sequences}
2234
2235  \begin{enumerate}
2236  \item Prepare a sequence file (use file \texttt{four.seq} and copy it to your working directory)
2237  \item Align the sequences
2238  \item Compute the consensus structure from the alignment
2239  \item Inspect the output files \texttt{alifold.out}, \texttt{alirna.ps},
2240    \texttt{alidot.ps}
2241  \item For comparison fold the sequences individually using \texttt{RNAfold}
2242  \end{enumerate}
2243\begin{verbatim}
2244$ clustalw2 four.seq > four.out
2245\end{verbatim}
2246\vspace*{-5ex}
2247\texttt{Clustalw2} creates two more output files, \texttt{four.aln} and \texttt{four.dnd}. For \texttt{RNAalifold} you need the \texttt{.aln} file.
2248\begin{verbatim}
2249$ RNAalifold -p four.aln
2250$ RNAfold -p < four.seq
2251\end{verbatim}%$
2252\vspace*{-3ex}
2253\texttt{RNAalifold} output:
2254\begin{scriptsize}
2255\begin{verbatim}
2256__GCCGAUGUAGCUCAGUUGGG_AGAGCGCCAGACUGAAAAUCAGAAGGUCCCGUGUUCAAUCCACGGAUCCGGCA__
2257..(((((((..((((.........)))).(((((.......))))).....(((((.......))))))))))))...
2258 minimum free energy = -15.12 kcal/mol (-13.70 +  -1.43)
2259..(((((({..((((.........)))).(((((.......))))).....(((((.......)))))}))))))...
2260 free energy of ensemble = -15.75 kcal/mol
2261 frequency of mfe structure in ensemble 0.361603
2262..(((((((..((((.........)))).(((((.......))))).....(((((.......))))))))))))... -15.20 {-13.70 +  -1.50}
2263\end{verbatim}
2264\end{scriptsize}
2265\texttt{RNAfold} output:
2266\begin{scriptsize}
2267\begin{verbatim}
2268>M10740 Yeast-PHE
2269GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUUUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCA
2270((((((((........((((.((((((..((((...........))))..))))))..))))..)))))))). (-21.60)
2271((((((({...,,.{,((((.((((((..((((...........))))..))))))..))))),)))))))). [-23.20]
2272((((((((.........(((.((((((..((((...........))))..))))))..)))...)))))))). {-20.00 d=9.63}
2273 frequency of mfe structure in ensemble 0.0744065; ensemble diversity 15.35
2274>K00349 Drosophila-PHE
2275[...]
2276\end{verbatim}
2277\end{scriptsize}
2278%\end{frame}
2279
2280\noindent
2281The output contains a consensus sequence and the consensus structure
2282in dot-bracket notation. The consensus structure has an energy of
2283$-15.12$~kcal/mol, which in turn consists of the average free energy of the
2284structure $-13.70$~kcal/mol and the covariance term $-1.43$~kcal/mol. The
2285strongly negative covariance term shows that there must be a fair number of
2286consistent and compensatory mutations, but in contrast to the average free
2287energy it's not meaningful in the biophysical sense.
2288
2289Compare the predicted consensus structure with the structures predicted for
2290the individual sequences using \texttt{RNAfold}. How often is the correct
2291``clover-leaf'' shape predicted?
2292
2293For better visualization, a structure annotated alignment or color annotated structure drawing can be
2294generated by using the \texttt{--aln} and \texttt{--color} options of \texttt{RNAalifold}.
2295\begin{verbatim}
2296$ RNAalifold --color --aln four.aln
2297$ gv aln.ps &
2298$ gv alirna.ps &
2299\end{verbatim}%$
2300%===
2301%\begin{frame}
2302  \frametitle{\texttt{RNAalifold} Output Files}
2303  \begin{center}
2304\begin{scriptsize}
2305\begin{verbatim}
23064 sequence; length of alignment 78
2307alifold output
2308    6    72  0  99.8%   0.007 GC:2    GU:1    AU:1
2309   33    43  0  98.9%   0.033 GC:2    GU:1    AU:1
2310   31    45  0  99.0%   0.030 CG:3    UA:1
2311   15    25  0  98.9%   0.045 CG:3    UA:1
2312    5    73  1  99.7%   0.008 CG:2    GC:1
2313   13    27  0  99.1%   0.042 CG:4
2314   14    26  0  99.1%   0.042 UA:4
2315    4    74  1  99.5%   0.015 CG:3
2316[...]
2317\end{verbatim}
2318\end{scriptsize}
2319\end{center}
2320
2321%  \centerline{\includegraphics[width=.6\textwidth]{Figures/aout.eps}}
2322
2323\vspace*{1ex}
2324  \includegraphics[width=.48\textwidth]{Figures/alirna.eps}
2325  \includegraphics[width=.50\textwidth]{Figures/alidot.eps}
2326%\end{frame}
2327
2328\noindent
2329The last output file produced by \texttt{RNAalifold -p}, named
2330\texttt{alifold.out}, is a plain text file with detailed information
2331on all plausible base pairs sorted by the likelihood of the pair.  In
2332the example above we see that the pair $(6,72)$ has no inconsistent
2333sequences, is predicted almost with probability 1, and occurs as a
2334\textsf{GC} pair in two sequences, a \textsf{GU} pair in one, and a
2335\textsf{AU} pair in another.
2336
2337\noindent
2338\texttt{RNAalifold} automatically produces a drawing of the consensus
2339structure in Postscript format and writes it to the file
2340``\texttt{alirna.ps}''.  In the structure graph consistent and compensatory
2341mutations are marked by a circle around the variable base(s),
2342i.e. pairs where one pairing partner is encircled exhibit consistent
2343mutations, whereas pairs supported by compensatory mutations have both
2344bases marked. Pairs that cannot be formed by some of the sequences are
2345shown gray instead of black.
2346In the example given, many pairs show such inconsistencies. This is because
2347one of the sequences (AF346993) is not aligned well by \texttt{clustalw}.
2348
2349Note, that subsequent calls to \texttt{RNAalifold} will overwrite any
2350existing output \texttt{alirna.ps} (\texttt{alidot.ps},
2351\texttt{alifold.out}) files in the current directory. Be sure to
2352rename any files you want to keep.
2353
2354\subsubsection{Structure predictions for the individual sequences}
2355
2356The consensus structure computed by \texttt{RNAalifold} will contain only
2357pairs that can be formed by most of the sequences. The structures of the
2358individual sequences will typically have additional base pairs that are not
2359part of the consensus structure. Moreover, ncRNA may exhibit a highly
2360conserved core structure while other regions are more variable. It may
2361therefore be desirable to produce structure predictions for one particular
2362sequence, while still using covariance information from other sequences.
2363
2364This can be accomplished by first computing the consensus structure for all
2365sequences using \texttt{RNAalifold}, then folding individual sequences using
2366\texttt{RNAfold -C\,} with the consensus structure as a constraint. In
2367constraint folding mode \texttt{RNAfold -C\,} allows only base pairs to form
2368which are compatible with the constraint structure. This resulting
2369structure typically contains most of the constraint (the consensus
2370structure)  plus some additional pairs that are specific for this sequence.
2371
2372%\begin{frame}
2373\frametitle{Refolding Individual Sequences}
2374The \texttt{refold.pl} (find it in the Progs folder) script removes gaps and maps the consensus structure to
2375each individual sequence.
2376\begin{scriptsize}
2377\begin{small}
2378\begin{verbatim}
2379$ RNAalifold  RNaseP.aln > RNaseP.alifold
2380$ gv alirna.ps
2381$ refold.pl RNaseP.aln RNaseP.alifold | head -3 > RNaseP.cfold
2382$ RNAfold -C --noLP < RNaseP.cfold > RNaseP.refold
2383$ gv E-coli_ss.ps
2384\end{verbatim} %$
2385\end{small}
2386\end{scriptsize}
2387%\end{frame}
2388
2389If you compare the refolded structure (\verb+E-coli_ss.ps+) with the
2390structure you get by simply folding the E.coli sequence in the
2391RNaseP.seq file (\verb+RNAfold --noLP+) you find a clear
2392rearrangement.
2393
2394In cases where constrained folding results in a structure that is very
2395different from the consensus, or if the energy from constrained
2396folding is much worse than from unconstrained folding, this may
2397indicate that the sequence in question does not really share a common
2398structure with the rest of the alignment or is misaligned. One should
2399then either remove or re-align that sequence and recompute the
2400consensus structure.
2401
2402Note that since RNase P forms sizable pseudo-knots, a perfect
2403prediction is impossible in this case.
2404
2405%%% This will come back when Teresa fixed Milli's script to highlight
2406%%% similarities/differences of n given structures.
2407
2408%%\begin{frame}
2409%\frametitle{Refolded structure of E.~coli RNase P}
2410%
2411%\begin{figure}[h]
2412%  \includegraphics[width=0.6\textwidth,clip=true,trim=0cm 2cm 0cm 2cm]{Figures/ec_RNaseP.ps}
2413%  \caption{Correct base pairs pairing in all tree structures are
2414%    indicated in yellow, correct pairs added by refolding are colored
2415%    orange. Incorrect base pairs which are only paired in the
2416%    alifolded and refolded structures but not in the native structure
2417%    are painted blue.}
2418%\end{figure}
2419%%%\TODO{colors created with sequenzvergleich.pl}
2420%
2421%
2422%%\end{frame}
2423%
2424%Comparing the structures with the \emph{E.~coli} reference
2425%structure we find that 55 basepairs are predicted correctly in all three structures.
2426%After running \texttt{refold} three additional basepairs were predicted right.
2427
2428
2429
2430
2431%===
2432
2433\section{Structural Alignments}
2434
2435\subsection{Manually correcting Alignments}
2436As the tRNA example above demonstrates, sequence alignments are often
2437unsuitable as a basis for determining consensus structures. As a last
2438resort, one may always try manually correcting an alignment. Sequence
2439editors that are structure-aware may help in this task.
2440In particular the SARSE \url{http://sarse.kvl.dk/} editor, and the
2441\texttt{ralee-mode} for emacs
2442\url{http://personalpages.manchester.ac.uk/staff/sam.griffiths-jones/software/ralee/}
2443are useful.
2444
2445After downloading the \texttt{ralee}-files extract them and put them
2446in a folder called \texttt{\textasciitilde/Tutorial/Progs/ralee}. Now
2447read the \texttt{00README} file and follow the instructions. If you
2448don't find an ``.emacs'' file in your home directory execute the
2449following command to copy it from the Data directory.\\
2450
2451\begin{verbatim}
2452$ cp Data/dot.emacs ~/
2453\end{verbatim}
2454
2455Next try correcting the \texttt{ClustalW} generated alignment \texttt{(four.aln)} from the
2456example above. For this we first have to convert it to the Stockholm format.
2457Fortunately the formats are similar. Make a copy of the file add the
2458correct header line and the consensus structure from RNAalifold:
2459\begin{verbatim}
2460$ cp four.aln four.stk
2461$ emacs four.stk
2462  .....
2463$ cat four.stk
2464\end{verbatim}
2465The final alignment should look like:
2466\begin{tiny}
2467\begin{verbatim}
2468# STOCKHOLM 1.0
2469
2470K00349          --GCCGAAAUAGCUCAGUUGGG-AGAGCGUUAGACUGAAGAUCUAAAGGUCCCCGGUUCAAUCCCGGGUUUCGGCA--
2471K00283          GGGCCG--GUAGCUCAUUUAGGCAGAGCGUCUGACUCUUAAUCAGACGGUCGCGUGUUCGAAUC--GCGUCCGGCCCA
2472M10740          --GCGGAUUUAGCUCAGUUGGG-AGAGCGCCAGACUGAAGAUUUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCA--
2473AF346993        --CAGAGUGUAGCUUAAC---ACAAAGCACCCAACUUACACUUAGGAGAUUUCAACUUAACUUGACCGCUCUGA----
2474#=GC SS_cons    ..(((((((..((((.........)))).(((((.......))))).....(((((.......))))))))))))...
2475//
2476\end{verbatim}%$
2477\end{tiny}
2478Now use the functions under the edit menu to improve the alignment, the
2479coloring by structure should help to highlight misaligned positions.
2480
2481\subsection{Automatic structural alignments}
2482
2483Next, we'll compute alignments using two structural alignment programs:
2484\texttt{LocARNA} and \texttt{T-Coffee}. \texttt{LocARNA} is an implementation
2485of the Sankoff algorithm for simultaneous folding and alignment (i.e. it
2486will generate both alignment and consensus structure). \texttt{T-Coffee} uses a
2487progressive alignment algorithm.
2488
2489Download \texttt{LocARNA} from \url{http://www.bioinf.uni-freiburg.de/Software/LocARNA/},
2490extract and install it in your \texttt{Progs} folder and eventually add it to your
2491path variable or copy it into the corresponding directory.
2492
2493Both programs can read the fasta file \texttt{four.seq}.
2494\begin{verbatim}
2495$ mlocarna --alifold-consensus-dp four.seq
2496[...]
2497M10740             GCGGAUUUAGCUCAGUUGGG-AGAGCGCCAGACUGAAGAUUUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCA
2498K00349             GCCGAAAUAGCUCAGUUGGG-AGAGCGUUAGACUGAAGAUCUAAAGGUCCCCGGUUCAAUCCCGGGUUUCGGCA
2499K00283             GGGCCGGUAGCUCAUUUAGGCAGAGCGUCUGACUCUUAAUCAGACGGUCGCGUGUUCGAAUCGCGUCCGGCCCA
2500AF346993           CAGAGUGUAGCUUAAC---ACAAAGCACCCAACUUACACUUAGGAGAUU-UCAACUUAA-CUUGACCGCUCUGA
2501alifold            (((((((..((((.........)))).(((((.......))))).....(((((.......)))))))))))).
2502        (-52.53 = -21.58 + -30.95)
2503\end{verbatim}%$
2504
2505\frametitle{Install T-Coffee}
2506
2507Get \texttt{T-Coffee} from the github page
2508\url{https://github.com/cbcrg/tcoffee}. There is a detailed
2509information how you should download and install the software in the
2510given README.md.
2511
2512Go to the \texttt{downloads} directory and use the provided installer
2513by typing
2514\begin{verbatim}
2515$ cd Tutorial/downloads
2516$ git clone git@github.com:cbcrg/tcoffee.git tcoffee
2517$ cd tcoffee/compile/
2518$ make t_coffee
2519$ cp t_coffee ~/Tutorial/Progs/
2520\end{verbatim}
2521
2522Afterwards align the \texttt{four.seq} using \texttt{t\_coffee} and compare the output with
2523the one given by LocARNA.
2524\begin{verbatim}
2525$ t_coffee four.seq > t_coffee.out
2526
2527 [t_coffee.out]
2528 CLUSTAL FORMAT for T-COFFEE 20150925_14:18 [http://www.tcoffee.org] [MODE:  ],
2529 CPU=0.00 sec, SCORE=739, Nseq=4, Len=74
2530
2531 M10740          GCGGAUUUAGCUCAGUU-GGGAGAGCGCCAGACUGAAGAUUUGGAGGUCC
2532 K00349          GCCGAAAUAGCUCAGUU-GGGAGAGCGUUAGACUGAAGAUCUAAAGGUCC
2533 K00283          GGGCCGGUAGCUCAUUUAGGCAGAGCGUCUGACUCUUAAUCAGACGGUCG
2534 AF346993        CAGAGUGUAGCUUAAC---ACAAAGCACCCAACUUACACUUAGGAGAUUU
2535                        ***** *       * ***     ***     *     * *
2536
2537 M10740          UGUGUUCGAUCCACAGAAUUCGCA
2538 K00349          CCGGUUCAAUCCCGGGUUUCGGCA
2539 K00283          CGUGUUCGAAUCGCGUCCGGCCCA
2540 AF346993        CAACUUAACUUGACCG--CUCUGA
2541                     **                 *
2542\end{verbatim}
2543
2544Use RNAalifold to predict structures for all your alignments
2545(ClustalW, handcrafted, T-Coffee, and LocARNA) and compare them. The
2546handcrafted and LocARNA alignments should be essentially perfect.
2547
2548Other interesting approaches to structural alignment include
2549\texttt{CMfinder}, \texttt{dynalign}, and \texttt{stemloc}.
2550
2551
2552
2553
2554
2555
2556
2557
2558%% % talk.tex
2559%% % > dvips -P pdf -ta4 talk -o talk.temp.ps
2560%% % > psnup -1 -m1cm -W128mm -H96mm -pa4 talk.temp.ps talk.handout.ps
2561%% % > psnup -2 -m1cm -b1cm -W128mm -H96mm -pa4 talk.temp.ps talk.handout.ps
2562%% %
2563%% % -*-latex-*-
2564%% \NeedsTeXFormat{LaTeX2e}
2565%% \documentclass[a4paper]{article}
2566%% \usepackage{beamerarticle}
2567%% %\documentclass[compress,ignorenonframetext]{beamer}
2568%% \usepackage[english]{babel}
2569%% \usepackage{url}
2570%% %\usepackage{amsmath,amssymbols}
2571
2572%% % presentation mode
2573%% % handout mode
2574%% \mode<article>{
2575%%   \usepackage{graphics}
2576%%   \usepackage{pgf}
2577%%   \usepackage{xcolor}
2578%%   \renewcommand{\theenumi}{\alph{enumi}}
2579%%   \renewcommand{\labelenumi}{(\theenumi)}
2580%%   \renewcommand{\theenumii}{\Roman{enumii}}
2581%%   \renewcommand{\labelenumii}{\theenumii.}
2582%%   \renewcommand{\labelenumii}{\theenumii.}
2583%%  }
2584
2585%% \mode<presentation>{
2586%%   \beamertemplatenavigationsymbolsempty
2587%%   \useinnertheme{circles}
2588%%   \setbeamertemplate{frametitle}[default][center]
2589%%   \setbeamercovered{transparent}
2590%%   \setbeamertemplate{frametitle}[default][center]
2591%% }
2592
2593
2594%% % globals
2595%% \def\EPS{./Figures}
2596%% \title{RNA gene finding tutorial}
2597%% \author{Stefan Washietl, Ivo Hofacker, Stephan Bernhart, Andreas Gruber}
2598%% \institute[TBI]{Institute for Theoretical Chemistry\\ University Vienna}
2599%% \titlegraphic{\centerline{\pgfuseimage{logo}}}
2600%% \date{\tiny{\url{http://www.tbi.univie.ac.at/}}\\[1ex]\today}
2601
2602%% % pictures
2603%% \pgfdeclareimage[width=.5cm]{logo}{\EPS/tbilogo}
2604
2605%% % colors
2606%% \colorlet{darkgreen}{green!80!black}
2607
2608%% \begin{document}
2609
2610%\frame{\titlepage}
2611\section{Noncoding RNA gene prediction}
2612
2613Prediction of ncRNAs is still a challenging problem in bioinformatics.
2614Unlike protein coding genes, ncRNAs do not have any statistically
2615significant features in primary sequences that could be used for reliable
2616prediction. A large class of ncRNAs, however, depend on a defined secondary
2617structure for their function. As a consequence, evolutionarily conserved
2618secondary structures can be used as characteristic signal to detect ncRNAs.
2619All currently available programs for \emph{de novo} prediction make use of
2620this principle and are therefore, by construction, limited to structured
2621RNAs.
2622
2623%\begin{frame}
2624  \frametitle{Programs to predict structural RNAs}
2625  \begin{itemize}
2626    \item \texttt{\textbf{QRNA}} (Eddy \& Rivas, 2001)
2627    \item \texttt{ddbRNA} (di Bernardo, Down \& Hubbard, 2003)
2628    \item \texttt{MSARi} (Coventry, Kleitman \& Berger, 2004)
2629    \item \texttt{\textbf{AlifoldZ}} (Washietl \& Hofacker, 2004)
2630    \item \texttt{\textbf{RNAz}} (Washietl, Hofacker \& Stadler, 2005)
2631    \item \texttt{EvoFold} (Pedersen et al, 2006)
2632  \end{itemize}
2633%\end{frame}
2634
2635\subsection{QRNA}
2636\texttt{QRNA} analyzes pairwise alignments for characteristic patterns of
2637evolution. An alignment is scored by three probabilistic models: (i)
2638Position independent, (ii) coding, (iii) RNA. The independent and the
2639coding model is a pair hidden Markov model. The RNA model is a pair
2640stochastic context-free grammar.  First, it calculates the \emph{prior
2641probability} that, given a model, the alignment is observed. Second, it
2642calculates the \emph{posterior probability} that, given an alignment, it
2643has been generated by one of the three models. The posterior probabilities
2644are compared to the position independent background model and a ``winner''
2645is found.
2646
2647\texttt{QRNA} reads pairwise alignments in MFASTA format (i.e. FASTA format
2648  with gaps)
2649
2650%\begin{frame}
2651\frametitle{Three competing models in QRNA}
2652
2653\centerline{\includegraphics[width=0.5\textwidth,clip=]{Figures/qrna.eps}}
2654%\end{frame}
2655
2656%\begin{frame}[fragile]
2657\frametitle{Installing and basic usage of QRNA}
2658
2659\begin{itemize}
2660%% \item get the tarball from \url{ftp://ftp.genetics.wustl.edu/pub/eddy/software} and untar it
2661\item Use the files in \texttt{qrna-2.0.3d.tar.gz} located in the
2662  \texttt{Data/programs}-folder shipped with the tutorial
2663\item don't forget to set the \texttt{QRNADB} environment variable\\
2664  (e.g. \verb+export QRNADB=$HOME/Tutorial/Data/programs/qrna-2.0.3d/lib/+)
2665  and add it to your \texttt{.bashrc}
2666\item follow the instructions in the \texttt{INSTALL} document and make the binaries
2667\item create the directory
2668  \texttt{\textasciitilde/Tutorial/Progs/qrna} and move the binaries
2669  located in the \texttt{src/} sub-directory into this folder and add it to your
2670  \texttt{.bashrc}\\
2671  (e.g. \verb+export PATH=${HOME}/Tutorial/Progs:${PATH}:${HOME}/Tutorial/Progs/qrna+)
2672\item first read the help text (option \texttt{-h}).
2673\item for advanced use of \texttt{QRNA} read the
2674\texttt{userguide.pdf} shipped with the package (in the \texttt{documentation} folder
2675\item \texttt{-a} tells \texttt{QRNA} to print the alignment
2676\end{itemize}
2677\small
2678\begin{verbatim}
2679$ eqrna -h
2680$ eqrna -a Data/qrna/tRNA.fa
2681$ eqrna -a Data/qrna/coding.fa
2682\end{verbatim}
2683
2684%\end{frame}
2685\begin{verbatim}
2686[...]
2687Divergence time (variable): 0.214132 0.208107 0.203995
2688[alignment ID = 72.37 MUT = 23.68 GAP = 3.95]
2689
2690length alignment: 76 (id=72.37) (mut=23.68) (gap=3.95)
2691posX: 0-75 [0-72](73) -- (0.18 0.30 0.36 0.16)
2692posY: 0-75 [0-75](76) -- (0.14 0.34 0.37 0.14)
2693
2694
2695	         DA0780 GGGCTCGTAGCTCAGCT.GGAAGAGCGCGGCGTTTGCAACGCCGAGGCCT
2696	         DA0940 GGGCCGGTAGCTCAGCCTGGGAGAGCGTCGGCTTTGCAAGCCGAAGGCCC
2697
2698	         DA0780 GGGGTTCAAATCCCCACGGGTCCA..
2699	         DA0940 CGGGTTCGAATCCCGGCCGGTCCACC
2700[..]
2701\end{verbatim}
2702\normalsize
2703
2704\subsection{AlifoldZ}
2705\texttt{AlifoldZ} is based on an old hypothesis: functional RNAs are
2706thermodynamically more stable than expected by chance. This hypothesis can
2707be statistically tested by calculating $z$-scores: Calculate the MFE $m$ of
2708the native RNA and the mean $\mu$ and standard deviation $\sigma$ of the
2709background distribution of a large number of random (shuffled) RNAs.  The
2710normalized $z$-score $z=(m-\mu)/\sigma$ expresses how many standard
2711deviations the native RNA is more stable than random sequences.
2712
2713Unfortunately, most ncRNAs are not significantly more stable than the
2714background. See for example the distribution of $z$-scores of some tRNAs,
2715where the overlap of real (green bars) and shuffled (dashed line) tRNAs
2716is relatively high.
2717
2718%\begin{frame}
2719\frametitle{z-score distribution of tRNAs}
2720\begin{figure}
2721\centering
2722\includegraphics[width=0.6\textwidth,clip=]{Figures/trna-histo.eps}\\
2723The overlap of real (bars) and shuffled (dashed line) tRNAs is relatively high.
2724\end{figure}
2725
2726%\end{frame}
2727\noindent
2728\texttt{AlifoldZ} calculates $z$-scores for consensus structures folded by
2729\texttt{RNAalifold}. This significantly improves the detection performance compared
2730to single sequence folding.
2731
2732%\begin{frame}
2733\frametitle{z-score distribution of tRNA consensus folds}
2734\begin{figure}
2735\centering
2736\includegraphics[width=0.8\textwidth,clip=]{Figures/histos.eps}\\
2737The separation of real and shuffled tRNAs gets evident with more sequences in the alignment.
2738\end{figure}
2739%\end{frame}
2740
2741%\begin{frame}[fragile]
2742\frametitle{Installation and basic usage of AlifoldZ}
2743
2744\begin{itemize}
2745%%\item Available at: \small{\url{http://www.tbi.univie.ac.at/papers/SUPPLEMENTS/Alifoldz/}}
2746\item Use the tarball \texttt{alifoldz\_adopted.tar.gz} located in the
2747  \texttt{Data/programs/}-folder shipped with the tutorial
2748\item Copy the files into your \texttt{Progs} directory (It's just one
2749  single Perl script which needs \texttt{RNAfold} and
2750  \texttt{RNAalifold} and an important perl module located in the Math
2751  subdirectory)\\
2752  \verb+ $ cp -r alifoldz.pl Math/ ~/Tutorial/Progs/+
2753\item add the perl module to your \texttt{PERL5LIB} variable in the
2754  \texttt{.bashrc}\\
2755  \verb+ $ export PERL5LIB=$HOME/Tutorial/Progs/:$PERL5LIB+
2756\item test the tool
2757\end{itemize}
2758\begin{verbatim}
2759$ alifoldz.pl -h
2760$ alifoldz.pl < Data/alifoldz/miRNA.aln
2761$ alifoldz.pl -w 120 -x 100 < Data/alifoldz/unknown.aln
2762\end{verbatim}
2763
2764\normalsize
2765
2766%\end{frame}
2767
2768\subsection{RNAz}
2769\TODO{New version by Someone who loves RNAz. This part of the tutorial
2770  is based on the RNAz 1.0 version which is obsolete quite a while
2771  already!!!}\\
2772
2773\texttt{AlifoldZ} has some shortcomings that limits its usefulness in
2774practice: The $z$-scores are not deterministic, i.e. you get a
2775different score each time you run \texttt{AlifoldZ}. To get stable
2776$z$-scores you need to sample a large number of random alignments
2777which is computationally expensive. Moreover, \texttt{AlifoldZ} is
2778extremely sensitive to alignment errors.
2779
2780The program \texttt{RNAz} overcomes these problems by using a different
2781approach to asses a multiple sequence alignment for significant RNA
2782structures. It is based on two key innovations: (i) The structure
2783conservation index (SCI) to measure structural conservation in an alignment
2784and (ii) $z$-scores that are calculated by regression without sampling.
2785Both measures are combined to an overall score that is used to classify an
2786alignment as ``structured RNA'' or ``other''.
2787
2788%\begin{frame}
2789\frametitle{The structure conservation index}
2790
2791\centerline{\includegraphics[width=0.8\textwidth,clip=]{Figures/sci.eps}}
2792
2793\begin{itemize}
2794
2795\item The structure conservation index is an easy way to normalize an
2796  \texttt{RNAalifold} consensus MFE.
2797
2798\end{itemize}
2799
2800%\end{frame}
2801
2802%\begin{frame}
2803
2804  \frametitle{z-score regression}
2805
2806  \begin{itemize}
2807  \item The mean $\mu$ and standard deviation $\sigma$ of random samples of
2808    a given sequence are functions of the length and the base composition:
2809    $$\mu,\sigma(length,\frac{GC}{AT},\frac{G}{C},\frac{A}{T})$$
2810  \item It is therefore be possible to \textit{calculate} $z$-scores by
2811    solving this 5 dimensional regression problem.
2812  \end{itemize}
2813%\end{frame}
2814
2815%\begin{frame}
2816  \frametitle{SVM Classification}
2817
2818  \centerline{\includegraphics[width=0.5\textwidth,angle=-90,clip=]{Figures/contour.eps}}
2819
2820  \begin{itemize}
2821  \item A support vector machine learning algorithm is used to classify an
2822    alignment based on $z$-score and structure conservation index.
2823  \end{itemize}
2824
2825%\end{frame}
2826
2827%\begin{frame}[fragile]
2828
2829\frametitle{Installation of RNAz}
2830
2831Installation is done according to the instructions used by the \texttt{ViennaRNA Package}.
2832Just use the \texttt{--prefix} option as mentioned earlier and add the PATH to \texttt{.bashrc}
2833\begin{itemize}
2834\item RNAz is available at: \url{http://www.tbi.univie.ac.at/~wash/RNAz}
2835\item Package includes the core program \texttt{RNAz} in ISO \texttt{C},
2836a set of helper programs in Perl, and an extensive manual.
2837\end{itemize}
2838
2839%\end{frame}
2840
2841%\begin{frame}[fragile]
2842
2843  \frametitle{Basic usage of RNAz}
2844\TODO{where to get examples from (RNAz install package) - commands work with v2 but txt needs to be adopted}
2845  \begin{itemize}
2846  \item \texttt{RNAz} reads one or more multiple sequence alignments in \texttt{clustalw} or
2847    MAF format.
2848  \end{itemize}
2849
2850\small
2851\begin{verbatim}
2852$ RNAz --help
2853$ RNAz tRNA.aln
2854$ RNAz --both-strands --predict-strand tRNA.maf
2855\end{verbatim}
2856\normalsize
2857
2858%\end{frame}
2859
2860%\begin{frame}[fragile]
2861
2862  \frametitle{Advanced usage of RNAz}
2863
2864  \begin{itemize}
2865  \item \texttt{RNAz} is limited to a maximum alignment length of 400 columns and a
2866    maximum number of 6 sequences. To process larger alignments a set of
2867    Perl helper scripts are used.
2868  \item Selecting one or more subsets of sequences from an alignment with
2869    more than 6 sequences:
2870  \end{itemize}
2871
2872  \small
2873\begin{verbatim}
2874$ rnazSelectSeqs.pl miRNA.maf |RNAz
2875$ rnazSelectSeqs.pl --num-seqs=4 --num-samples=3 miRNA.maf |RNAz
2876\end{verbatim}
2877  \normalsize
2878  \begin{itemize}
2879    \item Scoring long alignments in overlapping windows:
2880  \end{itemize}
2881  \small
2882\begin{verbatim}
2883$ rnazWindow.pl --window=120 --slide=40 unknown.aln \
2884                | RNAz --both-strands
2885\end{verbatim}
2886  \normalsize
2887
2888%\end{frame}
2889
2890\subsection{Large scale screens}
2891
2892The \texttt{RNAz} package provides a set of Perl scripts that implement a
2893complete analysis pipeline suitable for medium to large scale screens of
2894genomic data.
2895
2896%\begin{frame}
2897
2898  \frametitle{General procedure}
2899
2900  \begin{enumerate}
2901    \item Obtain or create multiple sequence alignments in MAF format
2902    \item Run through the \texttt{RNAz} pipeline:
2903  \end{enumerate}
2904
2905  \centerline{\includegraphics[width=0.6\textwidth,clip=]{Figures/flowchart.eps}}
2906
2907%\end{frame}
2908
2909%\begin{frame}
2910\frametitle{Examples in this tutorial}
2911
2912\begin{enumerate}
2913
2914\item Align Epstein Barr Virus genome (Acc.no: NC\_007605) to two related
2915  primate viruses (Acc.nos: NC\_004367, NC\_006146) using \texttt{multiz}
2916  and run it through the \texttt{RNAz} pipeline.
2917\TODO{where are this data comeing from? file from NCBI differs from those hidden in genefinding/rnaz/herpes}
2918
2919\item Analyze snoRNA cluster in the human genome for conserved RNA
2920  structures: download pre-computed alignments from the UCSC genome browser
2921  and run it through the \texttt{RNAz} pipeline
2922
2923\end{enumerate}
2924
2925
2926%\end{frame}
2927%\begin{frame}[fragile]
2928
2929  \frametitle{Example a: Preparation of data}
2930  \begin{itemize}
2931  \item \texttt{multiz} and \texttt{blastz} are available here:
2932    \url{http://www.bx.psu.edu/miller_lab/}
2933  \item Download the viral genomes in FASTA format and reformat the header
2934    strictly according to the rules given in the \texttt{multiz} documentation
2935    (\url{http://www.bx.psu.edu/miller_lab/dist/tba_howto.pdf}), e.g.:
2936  \item You have to edit the ''multiz '' \texttt{Makefile} and replace
2937\begin{verbatim}
2938CFLAGS = -Wall -Wextra -Werror
2939\end{verbatim}
2940with
2941\begin{verbatim}
2942CFLAGS = -Wall -Wextra #-Werror
2943\end{verbatim}
2944and then simply use the \texttt{make} command to compile both programs.
2945  \end{itemize}
2946\TODO{dont understand what to do}
2947\small
2948\begin{verbatim}
2949>NC_007605:genome:1:+:149696
2950AGAATTCGTCTTGCTCTATTCACCCTTACTTTTCTTCTTGCCCGTTCTCTTTCTTAGTAT
2951GAATCCAGTATGCCTGCCTGTAATTGTTGCGCCCTACCTCTTTTGGCTGGCGGCTATTGC
2952CGCCTCGTGTTTCACGGCCTCAGTTAGTACCGTTGTGACCGCCACCGGCTTGGCCCTCTC
2953ACTTCTACTCTTGGCAGCAGTGGCCAGCTCATATGCCGCTGCACAAAGGAAACTGCTGAC
2954ACCGGTGACAGTGCTTACTGCGGTTGTCACTTGTGAGTACACACGCACCATTTACAATGC
2955ATGATGTTCGTGAGATTGATCTGTCTCTAACAGTTCACTTCCTCTGCTTTTCTCCTCAGT
2956CTTTGCAATTTGCCTAACATGGAGGATTGAGGACCCACCTTTTAATTCTCTTCTGTTTGC
2957[...]
2958\end{verbatim}
2959\normalsize
2960%\end{frame}
2961%\begin{frame}[fragile]
2962  \frametitle{Example a: Aligning viral genomes}
2963
2964  \begin{itemize}
2965  \item To get a multiple alignment a phylogenetic tree and the following
2966    three steps are necessary:
2967    \begin{enumerate}
2968      \item Run \texttt{blastz} each vs. each
2969      \item Combine blastz results to multiple sequence alignments
2970      \item Project raw alignments to a reference sequence.
2971    \end{enumerate}
2972  \item The corresponding commands:
2973  \end{itemize}
2974\small
2975\begin{verbatim}
2976all_bz - "((NC_007605 NC_006146) NC_004367)" | bash
2977tba "((NC_007605 NC_006146) NC_004367)" \
2978*.sing.maf raw-tba.maf
2979maf_project raw-tba.maf NC_007605 > final.maf
2980\end{verbatim}
2981\normalsize
2982\begin{itemize}
2983\item Note: The tree is given in NEWICK like format with blanks instead
2984  of commas. The sequence data files must be named exactly like the names
2985  in this tree and in the FASTA headers.
2986\end{itemize}
2987
2988
2989%\end{frame}
2990
2991%\begin{frame}[fragile]
2992
2993  \frametitle{Example a: Running the pipeline I}
2994
2995  \begin{itemize}
2996  \item First the alignments are filtered and sliced in overlapping windows:
2997  \end{itemize}
2998\small
2999\begin{verbatim}
3000$ rnazWindow.pl < final.maf > windows.maf
3001\end{verbatim}
3002\normalsize
3003  \begin{itemize}
3004  \item \texttt{RNAz} is run on these windows:
3005  \end{itemize}
3006\small
3007\begin{verbatim}
3008$ RNAz --both-strands --show-gaps --cutoff=0.5 windows.maf \
3009        > rnaz.out
3010\end{verbatim}
3011\normalsize
3012\begin{itemize}
3013  \item Overlapping hits are combined to ``loci'' and visualized on a web-site:
3014  \end{itemize}
3015\small
3016\begin{verbatim}
3017$ rnazCluster.pl --html rnaz.out > results.dat
3018\end{verbatim}
3019\normalsize
3020
3021%\end{frame}
3022
3023%\begin{frame}[fragile]
3024
3025 \frametitle{Example a: Running the pipeline II}
3026
3027\begin{itemize}
3028  \item The predicted hits are compared with available annotation of the genome:
3029  \end{itemize}
3030\small
3031\begin{verbatim}
3032$ rnazAnnotate.pl --bed annotation.bed results.dat \
3033          > results_annotated.dat
3034\end{verbatim}
3035\normalsize
3036\begin{itemize}
3037  \item The results file is formatted in a HTML overview page:
3038  \end{itemize}
3039\small
3040\begin{verbatim}
3041$ rnazIndex.pl --html results_annotated.dat \
3042          > results/index.html
3043\end{verbatim}
3044\normalsize
3045%\end{frame}
3046
3047
3048%\begin{frame}[fragile]
3049
3050  \frametitle{Example a: Statistics on the results}
3051
3052\begin{itemize}
3053\item \texttt{rnazIndex.pl} can be used to generate a BED formatted
3054  annotation file which can be analyzed using \texttt{rnazBEDstats.pl}
3055  (after sorting, for the case the input alignments were unsorted)''
3056\end{itemize}
3057
3058\small
3059\begin{verbatim}
3060$ rnazIndex.pl --bed results.dat | \
3061          rnazBEDsort.pl | rnazBEDstats.pl
3062\end{verbatim}
3063\normalsize
3064
3065\begin{itemize}
3066\item RNAzfilter.pl can be used to filter the results by different
3067  criteria. In this case it gives us all loci with P$>$0.9'':
3068\end{itemize}
3069
3070\small
3071\begin{verbatim}
3072$ rnazFilter.pl "P>0.9" results.dat | \
3073          rnazIndex.pl --bed | \
3074          rnazBEDsort.pl | rnazBEDstats.pl
3075\end{verbatim}
3076\normalsize
3077
3078\begin{itemize}
3079\item To get an estimate on the (statistical) false positives one can
3080  repeat the complete screen with randomized alignments:
3081\end{itemize}
3082
3083\small
3084\begin{verbatim}
3085$ rnazRandomizeAln final.maf > random.maf
3086\end{verbatim}
3087\normalsize
3088
3089%\end{frame}
3090
3091
3092%\begin{frame}[fragile]
3093
3094  \frametitle{Example b: Obtaining pre-computed alignments from UCSC}
3095
3096  \begin{itemize}
3097  \item Go to the UCSC genome browser (\url{http://genome.ucsc.edu}) and go to
3098    ``Tables''. Download ``multiz17'' alignments in MAF format for the
3099    region: chr11:93103000-93108000
3100  \end{itemize}
3101
3102  \centerline{\includegraphics[width=\textwidth,clip=]{Figures/table-browser.eps}}
3103
3104%\end{frame}
3105
3106%\begin{frame}[fragile]
3107
3108  \frametitle{Example b: Running the pipeline}
3109
3110  \begin{itemize}
3111  \item The Perl scripts are run in the same order as in Example 1:
3112  \end{itemize}
3113\small
3114\begin{verbatim}
3115$ rnazWindow.pl --min-seqs=4 region.maf > windows.maf
3116$ RNAz --both-strands --show-gaps --cutoff=0.5 windows.maf \
3117          > rnaz.out
3118$ rnazCluster.pl --html rnaz.out > results.dat
3119$ rnazAnnotate.pl --bed annotation.bed results.dat \
3120          > results_annotated.dat
3121$ rnazIndex.pl --html results_annotated.dat \
3122          > results/index.html
3123\end{verbatim}
3124\normalsize
3125  \begin{itemize}
3126  \item The results can be exported as UCSC BED file which can be displayed
3127    in the genome browser:
3128  \end{itemize}
3129\small
3130\begin{verbatim}
3131$ rnazIndex.pl --bed --ucsc results.dat > prediction.bed
3132\end{verbatim}
3133\normalsize
3134
3135%\end{frame}
3136
3137%\begin{frame}[fragile]
3138
3139  \frametitle{Example b: Visualizing the results on the genome browser}
3140
3141  \begin{itemize}
3142  \item Upload the BED file as ``Custom Track''\dots
3143  \end{itemize}
3144
3145  \centerline{\includegraphics[width=0.8\textwidth,clip=]{Figures/table-browser.eps}}
3146
3147  \begin{itemize}
3148  \item \dots and have a look at the results:
3149  \end{itemize}
3150
3151  \centerline{\includegraphics[width=\textwidth,clip=]{Figures/snos.eps}}
3152
3153
3154%\end{frame}
3155
3156%\end{document}
3157
3158%\renewcommand{\refname}{\Large Literature}
3159%\nocite{*}
3160%\bibliographystyle{unsrt}
3161%\bibliography{./talk}
3162
3163%===
3164\end{document}
3165%===
3166
3167% LocalWords:  ATGAAGATGA gzipped RNAalifold RNAs dimerization RNAdistance pdf
3168% LocalWords:  RNAduplex RNAeval RNAfold RNAheat RNAinverse RNALfold RNApaln ta
3169% LocalWords:  RNApdist RNAplfold RNAplot RNAsubopt stdout stdin wc cd pwd txt
3170% LocalWords:  MANPATH online RNAcofold Zukers mfold cmount coloraln alirna ps
3171% LocalWords:  colorrna dpzoom databanks readseq popt Zuker's subopt relplot ss
3172% LocalWords:  MFE mol PostScript dp definedness xy xmgrace ij nt noLP tRNAs gv
3173% LocalWords:  alifold alidot gsview RNASubopt mRNA Dalgarno AUGC Fp hetero kb
3174% LocalWords:  GC UA UG tRNA clustalw Clustal kcal miRNA microRNA mir pos intra
3175% LocalWords:  pre executables LaTeX basename fasta GIF JPEG inv rRNA Cofolding
3176% LocalWords:  mmol minima treekin ncRNA coli RNase pknotsRG paRNAss RNAmovies
3177% LocalWords:  RNAforester RNAshapes pfold suboptimals siRNA foldings SARSE aln
3178% LocalWords:  ralee emacs QRNA ddbRNA di MSARi Kleitman AlifoldZ EvoFold al un
3179% LocalWords:  Pedersen Untar MFASTA SVM ClustalW MAF Acc multiz snoRNA UCSC de
3180% LocalWords:  blastz NEWICK rnazIndex rnazBEDstats RNAzfilter chr RybB ompN et
3181% LocalWords:  RNAup stral LocARNA StrAl CMfinder dynalign stemloc Noncoding
3182% LocalWords:  novo
3183