1 /*==========================================================================
2 SeqAn - The Library for Sequence Analysis
3 http://www.seqan.de
4 ============================================================================
5 Copyright (C) 2007
6
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16 ==========================================================================*/
17
18 #include <seqan/basic.h>
19 #include <seqan/graph_align.h>
20 #include <seqan/modifier.h>
21 #include <seqan/misc/misc_cmdparser.h>
22
23 #include <iostream>
24 #include <fstream>
25
26
27 using namespace seqan;
28
29 //////////////////////////////////////////////////////////////////////////////////
30
31 inline void
_addVersion(CommandLineParser & parser)32 _addVersion(CommandLineParser& parser) {
33 ::std::string rev = "$Revision: 4566 $";
34 addVersionLine(parser, "Version 1.0 (15. July 2009) Revision: " + rev.substr(11, 4) + "");
35 }
36
37 //////////////////////////////////////////////////////////////////////////////////
38
39 template <typename TSeqSet, typename TNameSet>
_loadSequences(TSeqSet & sequences,TNameSet & fastaIDs,const char * fileName)40 bool _loadSequences(TSeqSet& sequences,
41 TNameSet& fastaIDs,
42 const char *fileName)
43 {
44 MultiFasta multiFasta;
45 if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) return false;
46 AutoSeqFormat format;
47 guessFormat(multiFasta.concat, format);
48 split(multiFasta, format);
49 unsigned seqCount = length(multiFasta);
50 resize(sequences, seqCount, Exact());
51 resize(fastaIDs, seqCount, Exact());
52 for(unsigned i = 0; i < seqCount; ++i)
53 {
54 assignSeqId(fastaIDs[i], multiFasta[i], format);
55 assignSeq(sequences[i], multiFasta[i], format);
56 }
57 return (seqCount > 0);
58 }
59
60
61
62 //////////////////////////////////////////////////////////////////////////////////
63
64 template<typename TAlphabet, typename TAlignConfig, typename TScore, typename TSeqFile, typename TMethod, typename TDiag, typename TOutputFormat, typename TOutfile>
65 inline void
pairwise_align(TScore const & sc,TSeqFile & seqfile,TMethod method,TDiag low,TDiag high,bool banded,TOutputFormat outputFormat,TOutfile & outfile)66 pairwise_align(TScore const& sc,
67 TSeqFile& seqfile,
68 TMethod method,
69 TDiag low,
70 TDiag high,
71 bool banded,
72 TOutputFormat outputFormat,
73 TOutfile& outfile)
74 {
75 // Load the 2 sequences
76 typedef String<TAlphabet> TSequence;
77 StringSet<TSequence, Owner<> > sequenceSet;
78 StringSet<String<char> > sequenceNames;
79 _loadSequences(sequenceSet, sequenceNames, seqfile.c_str());
80
81 // Fix low and high diagonal.
82 low = _max(low, -1 * (int) length(sequenceSet[1]));
83 high = _min(high, (int) length(sequenceSet[0]));
84
85 // Align the sequences
86 Graph<Alignment<StringSet<TSequence, Dependent<> >, void, WithoutEdgeId> > gAlign(sequenceSet);
87
88 int aliScore = 0;
89 // Banded alignment?
90 if (!banded) {
91 if (method == 0) aliScore = globalAlignment(gAlign, sc, TAlignConfig(), NeedlemanWunsch());
92 else if (method == 1) aliScore = globalAlignment(gAlign, sc, TAlignConfig(), Gotoh());
93 else if (method == 2) aliScore = localAlignment(gAlign, sc, SmithWaterman());
94 else if (method == 3) aliScore = globalAlignment(gAlign, Lcs());
95 } else {
96 if (method == 0) aliScore = globalAlignment(gAlign, sc, TAlignConfig(), low, high, BandedNeedlemanWunsch());
97 else if (method == 1) aliScore = globalAlignment(gAlign, sc, TAlignConfig(), low, high, BandedGotoh());
98 }
99
100 // Alignment output
101 std::cout << "Alignment score: " << aliScore << std::endl;
102 if (outputFormat == 0) {
103 FILE* strmWrite = fopen(outfile.c_str(), "w");
104 write(strmWrite, gAlign, sequenceNames, FastaFormat());
105 fclose(strmWrite);
106 } else if (outputFormat == 1) {
107 FILE* strmWrite = fopen(outfile.c_str(), "w");
108 write(strmWrite, gAlign, sequenceNames, MsfFormat());
109 fclose(strmWrite);
110 }
111 }
112
113 //////////////////////////////////////////////////////////////////////////////////
114
115 template<typename TScore, typename TSc>
116 inline void
_setMatchScore(TScore &,TSc)117 _setMatchScore(TScore&, TSc) {
118 // No operation
119 }
120
121 //////////////////////////////////////////////////////////////////////////////////
122
123 template<typename TScore, typename TSc>
124 inline void
_setMismatchScore(TScore &,TSc)125 _setMismatchScore(TScore&, TSc) {
126 // No operation
127 }
128
129 //////////////////////////////////////////////////////////////////////////////////
130
131 template<typename TSc>
132 inline void
_setMatchScore(Score<int,Simple> & sc,TSc msc)133 _setMatchScore(Score<int, Simple>& sc, TSc msc) {
134 sc.data_match = msc;
135 }
136
137 //////////////////////////////////////////////////////////////////////////////////
138
139 template<typename TSc>
140 inline void
_setMismatchScore(Score<int,Simple> & sc,TSc mmsc)141 _setMismatchScore(Score<int, Simple>& sc, TSc mmsc) {
142 sc.data_mismatch = mmsc;
143 }
144
145
146 //////////////////////////////////////////////////////////////////////////////////
147
148 template<typename TAlphabet, typename TScore>
149 inline void
_initAlignParams(CommandLineParser & parser,TScore & sc)150 _initAlignParams(CommandLineParser& parser, TScore& sc) {
151 // Set options
152 getOptionValueLong(parser, "gop", sc.data_gap_open);
153 getOptionValueLong(parser, "gex", sc.data_gap_extend);
154 int msc = 0;
155 getOptionValueLong(parser, "msc", msc);
156 _setMatchScore(sc, msc);
157 int mmsc = 0;
158 getOptionValueLong(parser, "mmsc", mmsc);
159 _setMismatchScore(sc, mmsc);
160 ::std::string seqfile;
161 getOptionValueLong(parser, "seq", seqfile);
162 ::std::string outfile = "out.fasta";
163 getOptionValueLong(parser, "outfile", outfile);
164 unsigned int method = 0;
165 String<char> meth;
166 getOptionValueLong(parser, "method", meth);
167 if (meth == "nw") method = 0;
168 else if (meth == "gotoh") method = 1;
169 else if (meth == "sw") method = 2;
170 else if (meth == "lcs") method = 3;
171 unsigned int outputFormat = 0;
172 String<char> format;
173 getOptionValueLong(parser, "format", format);
174 if (format == "fasta") outputFormat = 0;
175 else if (format == "msf") outputFormat = 1;
176 int low = 0;
177 int high = 0;
178 bool banded = false;
179 if (isSetLong(parser, "low")) {
180 getOptionValueLong(parser, "low", low);
181 banded = true;
182 }
183 if (isSetLong(parser, "high")) {
184 getOptionValueLong(parser, "high", high);
185 banded = true;
186 }
187
188 // Check options
189 if (!isSetLong(parser, "seq")) { help(parser); exit(1); }
190 if (low > high) banded = false;
191
192 // Do pairwise alignment
193 if (isSetLong(parser, "config")) {
194 String<char> config;
195 getOptionValueLong(parser, "config", config);
196 if (config == "tttt") pairwise_align<TAlphabet, AlignConfig<true, true, true, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
197 else if (config == "tttf") pairwise_align<TAlphabet, AlignConfig<true, true, true, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
198 else if (config == "ttft") pairwise_align<TAlphabet, AlignConfig<true, true, false, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
199 else if (config == "ttff") pairwise_align<TAlphabet, AlignConfig<true, true, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
200 else if (config == "tftt") pairwise_align<TAlphabet, AlignConfig<true, false, true, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
201 else if (config == "tftf") pairwise_align<TAlphabet, AlignConfig<true, false, true, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
202 else if (config == "tfft") pairwise_align<TAlphabet, AlignConfig<true, false, false, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
203 else if (config == "tfff") pairwise_align<TAlphabet, AlignConfig<true, false, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
204 else if (config == "fttt") pairwise_align<TAlphabet, AlignConfig<false, true, true, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
205 else if (config == "fttf") pairwise_align<TAlphabet, AlignConfig<false, true, true, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
206 else if (config == "ftft") pairwise_align<TAlphabet, AlignConfig<false, true, false, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
207 else if (config == "ftff") pairwise_align<TAlphabet, AlignConfig<false, true, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
208 else if (config == "fftt") pairwise_align<TAlphabet, AlignConfig<false, false, true, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
209 else if (config == "fftf") pairwise_align<TAlphabet, AlignConfig<false, false, true, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
210 else if (config == "ffft") pairwise_align<TAlphabet, AlignConfig<false, false, false, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
211 else if (config == "ffff") pairwise_align<TAlphabet, AlignConfig<false, false, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
212 } else pairwise_align<TAlphabet, AlignConfig<false, false, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
213 }
214
215 //////////////////////////////////////////////////////////////////////////////////
216
217 inline void
_initScoreMatrix(CommandLineParser & parser,Dna5 const)218 _initScoreMatrix(CommandLineParser& parser, Dna5 const) {
219 String<char> matrix;
220 getOptionValueLong(parser, "matrix", matrix);
221 if (isSetLong(parser, "matrix")) {
222 Score<int, ScoreMatrix<> > sc;
223 loadScoreMatrix(sc, matrix);
224 _initAlignParams<Dna5>(parser, sc);
225 } else {
226 Score<int> sc;
227 _initAlignParams<Dna5>(parser, sc);
228 }
229 }
230
231 //////////////////////////////////////////////////////////////////////////////////
232
233 inline void
_initScoreMatrix(CommandLineParser & parser,Rna5 const)234 _initScoreMatrix(CommandLineParser& parser, Rna5 const) {
235 String<char> matrix;
236 getOptionValueLong(parser, "matrix", matrix);
237 if (isSetLong(parser, "matrix")) {
238 Score<int, ScoreMatrix<> > sc;
239 loadScoreMatrix(sc, matrix);
240 _initAlignParams<Rna5>(parser, sc);
241 } else {
242 Score<int> sc;
243 _initAlignParams<Rna5>(parser, sc);
244 }
245 }
246
247 //////////////////////////////////////////////////////////////////////////////////
248
249 inline void
_initScoreMatrix(CommandLineParser & parser,AminoAcid const)250 _initScoreMatrix(CommandLineParser& parser, AminoAcid const) {
251 String<char> matrix;
252 getOptionValueLong(parser, "matrix", matrix);
253 if (isSetLong(parser, "matrix")) {
254 Score<int, ScoreMatrix<> > sc;
255 loadScoreMatrix(sc, matrix);
256 _initAlignParams<AminoAcid>(parser, sc);
257 } else {
258 Blosum62 sc;
259 _initAlignParams<AminoAcid>(parser, sc);
260 }
261 }
262
263 //////////////////////////////////////////////////////////////////////////////////
264
main(int argc,const char * argv[])265 int main(int argc, const char *argv[]) {
266
267 // Command line parsing
268 CommandLineParser parser;
269 _addVersion(parser);
270
271 addTitleLine(parser, "***************************************");
272 addTitleLine(parser, "* Pairwise alignment - PairAlign *");
273 addTitleLine(parser, "* (c) Copyright 2009 by Tobias Rausch *");
274 addTitleLine(parser, "***************************************");
275
276 addUsageLine(parser, "-s <FASTA sequence file> [Options]");
277
278 addSection(parser, "Main Options:");
279 addOption(parser, addArgumentText(CommandLineOption("s", "seq", "file with 2 sequences", OptionType::String), "<FASTA Sequence File>"));
280 addOption(parser, addArgumentText(CommandLineOption("a", "alphabet", "sequence alphabet", (int)OptionType::String, "protein"), "[protein | dna | rna]"));
281 addOption(parser, addArgumentText(CommandLineOption("m", "method", "alignment method", (int)OptionType::String, "gotoh"), "[nw, gotoh, sw, lcs]"));
282 addHelpLine(parser, "nw = Needleman-Wunsch");
283 addHelpLine(parser, "gotoh = Gotoh");
284 addHelpLine(parser, "sw = Smith-Waterman");
285 addHelpLine(parser, "lcs = Longest common subsequence");
286 addOption(parser, addArgumentText(CommandLineOption("o", "outfile", "output filename", (int)OptionType::String, "out.fasta"), "<Filename>"));
287 addOption(parser, addArgumentText(CommandLineOption("f", "format", "output format", (int)OptionType::String, "fasta"), "[fasta | msf]"));
288
289 addSection(parser, "Scoring Options:");
290 addOption(parser, addArgumentText(CommandLineOption("g", "gop", "gap open penalty", (int)OptionType::Int, -11), "<Int>"));
291 addOption(parser, addArgumentText(CommandLineOption("e", "gex", "gap extension penalty", (int)OptionType::Int, -1), "<Int>"));
292 addOption(parser, addArgumentText(CommandLineOption("ma", "matrix", "score matrix", (int)OptionType::String, "Blosum62"), "<Matrix file>"));
293 addOption(parser, addArgumentText(CommandLineOption("ms", "msc", "match score", (int)OptionType::Int, 5), "<Int>"));
294 addOption(parser, addArgumentText(CommandLineOption("mm", "mmsc", "mismatch penalty", (int)OptionType::Int, -4), "<Int>"));
295
296 addSection(parser, "Banded Alignment Options:");
297 addOption(parser, addArgumentText(CommandLineOption("lo", "low", "lower diagonal", OptionType::Int), "<Int>"));
298 addOption(parser, addArgumentText(CommandLineOption("hi", "high", "upper diagonal", OptionType::Int), "<Int>"));
299
300 addSection(parser, "DP Matrix Configuration Options:");
301 addOption(parser, addArgumentText(CommandLineOption("c", "config", "alignment configuration", (int)OptionType::String, "ffff"), "[ffff | ... | tttt]"));
302 addHelpLine(parser, "tfff = First row with 0's");
303 addHelpLine(parser, "ftff = First column with 0's");
304 addHelpLine(parser, "fftf = Search last column for max");
305 addHelpLine(parser, "ffft = Search last row for max");
306 addHelpLine(parser, "All combinations are allowed.");
307
308 if (argc == 1)
309 {
310 shortHelp(parser, std::cerr); // print short help and exit
311 return 0;
312 }
313
314 if (!parse(parser, argc, argv, ::std::cerr)) return 1;
315 if (isSetLong(parser, "help") || isSetLong(parser, "version")) return 0; // print help or version and exit
316
317 // Basic command line options
318 String<char> alphabet;
319 getOptionValueLong(parser, "alphabet", alphabet);
320
321 // Initialize scoring matrices
322 if (alphabet == "dna") _initScoreMatrix(parser, Dna5());
323 else if (alphabet == "rna") _initScoreMatrix(parser, Rna5());
324 else _initScoreMatrix(parser, AminoAcid());
325
326 return 0;
327 }
328