1 /*==========================================================================
2                SeqAn - The Library for Sequence Analysis
3                          http://www.seqan.de
4 ============================================================================
5 Copyright (C) 2007
6 
7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 3 of the License, or (at your option) any later version.
11 
12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16 ==========================================================================*/
17 
18 #include <seqan/basic.h>
19 #include <seqan/graph_align.h>
20 #include <seqan/modifier.h>
21 #include <seqan/misc/misc_cmdparser.h>
22 
23 #include <iostream>
24 #include <fstream>
25 
26 
27 using namespace seqan;
28 
29 //////////////////////////////////////////////////////////////////////////////////
30 
31 inline void
_addVersion(CommandLineParser & parser)32 _addVersion(CommandLineParser& parser) {
33 	::std::string rev = "$Revision: 4566 $";
34 	addVersionLine(parser, "Version 1.0 (15. July 2009) Revision: " + rev.substr(11, 4) + "");
35 }
36 
37 //////////////////////////////////////////////////////////////////////////////////
38 
39 template <typename TSeqSet, typename TNameSet>
_loadSequences(TSeqSet & sequences,TNameSet & fastaIDs,const char * fileName)40 bool _loadSequences(TSeqSet& sequences,
41 					TNameSet& fastaIDs,
42 					const char *fileName)
43 {
44 	MultiFasta multiFasta;
45 	if (!open(multiFasta.concat, fileName, OPEN_RDONLY)) return false;
46 	AutoSeqFormat format;
47 	guessFormat(multiFasta.concat, format);
48 	split(multiFasta, format);
49 	unsigned seqCount = length(multiFasta);
50 	resize(sequences, seqCount, Exact());
51 	resize(fastaIDs, seqCount, Exact());
52 	for(unsigned i = 0; i < seqCount; ++i)
53 	{
54 		assignSeqId(fastaIDs[i], multiFasta[i], format);
55 		assignSeq(sequences[i], multiFasta[i], format);
56 	}
57 	return (seqCount > 0);
58 }
59 
60 
61 
62 //////////////////////////////////////////////////////////////////////////////////
63 
64 template<typename TAlphabet, typename TAlignConfig, typename TScore, typename TSeqFile, typename TMethod, typename TDiag, typename TOutputFormat, typename TOutfile>
65 inline void
pairwise_align(TScore const & sc,TSeqFile & seqfile,TMethod method,TDiag low,TDiag high,bool banded,TOutputFormat outputFormat,TOutfile & outfile)66 pairwise_align(TScore const& sc,
67 			   TSeqFile& seqfile,
68 			   TMethod method,
69 			   TDiag low,
70 			   TDiag high,
71 			   bool banded,
72 			   TOutputFormat outputFormat,
73 			   TOutfile& outfile)
74 {
75 	// Load the 2 sequences
76 	typedef String<TAlphabet> TSequence;
77 	StringSet<TSequence, Owner<> > sequenceSet;
78 	StringSet<String<char> > sequenceNames;
79 	_loadSequences(sequenceSet, sequenceNames, seqfile.c_str());
80 
81 	// Fix low and high diagonal.
82 	low = _max(low, -1 * (int) length(sequenceSet[1]));
83 	high = _min(high, (int) length(sequenceSet[0]));
84 
85 	// Align the sequences
86 	Graph<Alignment<StringSet<TSequence, Dependent<> >, void, WithoutEdgeId> > gAlign(sequenceSet);
87 
88 	int aliScore = 0;
89 	// Banded alignment?
90 	if (!banded) {
91 		if (method == 0) aliScore = globalAlignment(gAlign, sc, TAlignConfig(), NeedlemanWunsch());
92 		else if (method == 1) aliScore = globalAlignment(gAlign, sc, TAlignConfig(), Gotoh());
93 		else if (method == 2) aliScore = localAlignment(gAlign, sc, SmithWaterman());
94 		else if (method == 3) aliScore = globalAlignment(gAlign, Lcs());
95 	} else {
96 		if (method == 0) aliScore = globalAlignment(gAlign, sc, TAlignConfig(), low, high, BandedNeedlemanWunsch());
97 		else if (method == 1) aliScore = globalAlignment(gAlign, sc, TAlignConfig(), low, high, BandedGotoh());
98 	}
99 
100 	// Alignment output
101 	std::cout << "Alignment score: " << aliScore << std::endl;
102 	if (outputFormat == 0) {
103 		FILE* strmWrite = fopen(outfile.c_str(), "w");
104 		write(strmWrite, gAlign, sequenceNames, FastaFormat());
105 		fclose(strmWrite);
106 	} else if (outputFormat == 1) {
107 		FILE* strmWrite = fopen(outfile.c_str(), "w");
108 		write(strmWrite, gAlign, sequenceNames, MsfFormat());
109 		fclose(strmWrite);
110 	}
111 }
112 
113 //////////////////////////////////////////////////////////////////////////////////
114 
115 template<typename TScore, typename TSc>
116 inline void
_setMatchScore(TScore &,TSc)117 _setMatchScore(TScore&, TSc) {
118 	// No operation
119 }
120 
121 //////////////////////////////////////////////////////////////////////////////////
122 
123 template<typename TScore, typename TSc>
124 inline void
_setMismatchScore(TScore &,TSc)125 _setMismatchScore(TScore&, TSc) {
126 	// No operation
127 }
128 
129 //////////////////////////////////////////////////////////////////////////////////
130 
131 template<typename TSc>
132 inline void
_setMatchScore(Score<int,Simple> & sc,TSc msc)133 _setMatchScore(Score<int, Simple>& sc, TSc msc) {
134 	sc.data_match = msc;
135 }
136 
137 //////////////////////////////////////////////////////////////////////////////////
138 
139 template<typename TSc>
140 inline void
_setMismatchScore(Score<int,Simple> & sc,TSc mmsc)141 _setMismatchScore(Score<int, Simple>& sc, TSc mmsc) {
142 	sc.data_mismatch = mmsc;
143 }
144 
145 
146 //////////////////////////////////////////////////////////////////////////////////
147 
148 template<typename TAlphabet, typename TScore>
149 inline void
_initAlignParams(CommandLineParser & parser,TScore & sc)150 _initAlignParams(CommandLineParser& parser, TScore& sc) {
151 	// Set options
152 	getOptionValueLong(parser, "gop", sc.data_gap_open);
153 	getOptionValueLong(parser, "gex", sc.data_gap_extend);
154 	int msc = 0;
155 	getOptionValueLong(parser, "msc", msc);
156 	_setMatchScore(sc, msc);
157 	int mmsc = 0;
158 	getOptionValueLong(parser, "mmsc", mmsc);
159 	_setMismatchScore(sc, mmsc);
160 	::std::string seqfile;
161 	getOptionValueLong(parser, "seq", seqfile);
162 	::std::string outfile = "out.fasta";
163 	getOptionValueLong(parser, "outfile", outfile);
164 	unsigned int method = 0;
165 	String<char> meth;
166 	getOptionValueLong(parser, "method", meth);
167 	if (meth == "nw") method = 0;
168 	else if (meth == "gotoh") method = 1;
169 	else if (meth == "sw") method = 2;
170 	else if (meth == "lcs") method = 3;
171 	unsigned int outputFormat = 0;
172 	String<char> format;
173 	getOptionValueLong(parser, "format", format);
174 	if (format == "fasta") outputFormat = 0;
175 	else if (format == "msf") outputFormat = 1;
176 	int low = 0;
177 	int high = 0;
178 	bool banded = false;
179 	if (isSetLong(parser, "low")) {
180 		getOptionValueLong(parser, "low", low);
181 		banded = true;
182 	}
183 	if (isSetLong(parser, "high")) {
184 		getOptionValueLong(parser, "high", high);
185 		banded = true;
186 	}
187 
188 	// Check options
189 	if (!isSetLong(parser, "seq")) { help(parser); exit(1); }
190 	if (low > high) banded = false;
191 
192 	// Do pairwise alignment
193 	if (isSetLong(parser, "config")) {
194 		String<char> config;
195 		getOptionValueLong(parser, "config", config);
196 		if (config == "tttt") pairwise_align<TAlphabet, AlignConfig<true, true, true, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
197 		else if (config == "tttf") pairwise_align<TAlphabet, AlignConfig<true, true, true, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
198 		else if (config == "ttft") pairwise_align<TAlphabet, AlignConfig<true, true, false, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
199 		else if (config == "ttff") pairwise_align<TAlphabet, AlignConfig<true, true, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
200 		else if (config == "tftt") pairwise_align<TAlphabet, AlignConfig<true, false, true, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
201 		else if (config == "tftf") pairwise_align<TAlphabet, AlignConfig<true, false, true, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
202 		else if (config == "tfft") pairwise_align<TAlphabet, AlignConfig<true, false, false, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
203 		else if (config == "tfff") pairwise_align<TAlphabet, AlignConfig<true, false, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
204 		else if (config == "fttt") pairwise_align<TAlphabet, AlignConfig<false, true, true, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
205 		else if (config == "fttf") pairwise_align<TAlphabet, AlignConfig<false, true, true, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
206 		else if (config == "ftft") pairwise_align<TAlphabet, AlignConfig<false, true, false, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
207 		else if (config == "ftff") pairwise_align<TAlphabet, AlignConfig<false, true, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
208 		else if (config == "fftt") pairwise_align<TAlphabet, AlignConfig<false, false, true, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
209 		else if (config == "fftf") pairwise_align<TAlphabet, AlignConfig<false, false, true, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
210 		else if (config == "ffft") pairwise_align<TAlphabet, AlignConfig<false, false, false, true> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
211 		else if (config == "ffff") pairwise_align<TAlphabet, AlignConfig<false, false, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
212 	} else pairwise_align<TAlphabet, AlignConfig<false, false, false, false> >(sc, seqfile, method, low, high, banded, outputFormat, outfile);
213 }
214 
215 //////////////////////////////////////////////////////////////////////////////////
216 
217 inline void
_initScoreMatrix(CommandLineParser & parser,Dna5 const)218 _initScoreMatrix(CommandLineParser& parser, Dna5 const) {
219 	String<char> matrix;
220 	getOptionValueLong(parser, "matrix", matrix);
221 	if (isSetLong(parser, "matrix")) {
222 		Score<int, ScoreMatrix<> > sc;
223 		loadScoreMatrix(sc, matrix);
224 		_initAlignParams<Dna5>(parser, sc);
225 	} else {
226 		Score<int> sc;
227 		_initAlignParams<Dna5>(parser, sc);
228 	}
229 }
230 
231 //////////////////////////////////////////////////////////////////////////////////
232 
233 inline void
_initScoreMatrix(CommandLineParser & parser,Rna5 const)234 _initScoreMatrix(CommandLineParser& parser, Rna5 const) {
235 	String<char> matrix;
236 	getOptionValueLong(parser, "matrix", matrix);
237 	if (isSetLong(parser, "matrix")) {
238 		Score<int, ScoreMatrix<> > sc;
239 		loadScoreMatrix(sc, matrix);
240 		_initAlignParams<Rna5>(parser, sc);
241 	} else {
242 		Score<int> sc;
243 		_initAlignParams<Rna5>(parser, sc);
244 	}
245 }
246 
247 //////////////////////////////////////////////////////////////////////////////////
248 
249 inline void
_initScoreMatrix(CommandLineParser & parser,AminoAcid const)250 _initScoreMatrix(CommandLineParser& parser, AminoAcid const) {
251 	String<char> matrix;
252 	getOptionValueLong(parser, "matrix", matrix);
253 	if (isSetLong(parser, "matrix")) {
254 		Score<int, ScoreMatrix<> > sc;
255 		loadScoreMatrix(sc, matrix);
256 		_initAlignParams<AminoAcid>(parser, sc);
257 	} else {
258 		Blosum62 sc;
259 		_initAlignParams<AminoAcid>(parser, sc);
260 	}
261 }
262 
263 //////////////////////////////////////////////////////////////////////////////////
264 
main(int argc,const char * argv[])265 int main(int argc, const char *argv[]) {
266 
267 	// Command line parsing
268 	CommandLineParser parser;
269 	_addVersion(parser);
270 
271 	addTitleLine(parser, "***************************************");
272 	addTitleLine(parser, "* Pairwise alignment - PairAlign      *");
273 	addTitleLine(parser, "* (c) Copyright 2009 by Tobias Rausch *");
274 	addTitleLine(parser, "***************************************");
275 
276 	addUsageLine(parser, "-s <FASTA sequence file> [Options]");
277 
278 	addSection(parser, "Main Options:");
279 	addOption(parser, addArgumentText(CommandLineOption("s", "seq", "file with 2 sequences", OptionType::String), "<FASTA Sequence File>"));
280 	addOption(parser, addArgumentText(CommandLineOption("a", "alphabet", "sequence alphabet", (int)OptionType::String, "protein"), "[protein | dna | rna]"));
281 	addOption(parser, addArgumentText(CommandLineOption("m", "method", "alignment method", (int)OptionType::String, "gotoh"), "[nw, gotoh, sw, lcs]"));
282 	addHelpLine(parser, "nw = Needleman-Wunsch");
283 	addHelpLine(parser, "gotoh = Gotoh");
284 	addHelpLine(parser, "sw = Smith-Waterman");
285 	addHelpLine(parser, "lcs = Longest common subsequence");
286 	addOption(parser, addArgumentText(CommandLineOption("o", "outfile", "output filename", (int)OptionType::String, "out.fasta"), "<Filename>"));
287 	addOption(parser, addArgumentText(CommandLineOption("f", "format", "output format", (int)OptionType::String, "fasta"), "[fasta | msf]"));
288 
289 	addSection(parser, "Scoring Options:");
290 	addOption(parser, addArgumentText(CommandLineOption("g", "gop", "gap open penalty", (int)OptionType::Int, -11), "<Int>"));
291 	addOption(parser, addArgumentText(CommandLineOption("e", "gex", "gap extension penalty", (int)OptionType::Int, -1), "<Int>"));
292 	addOption(parser, addArgumentText(CommandLineOption("ma", "matrix", "score matrix", (int)OptionType::String, "Blosum62"), "<Matrix file>"));
293 	addOption(parser, addArgumentText(CommandLineOption("ms", "msc", "match score", (int)OptionType::Int, 5), "<Int>"));
294 	addOption(parser, addArgumentText(CommandLineOption("mm", "mmsc", "mismatch penalty", (int)OptionType::Int, -4), "<Int>"));
295 
296 	addSection(parser, "Banded Alignment Options:");
297 	addOption(parser, addArgumentText(CommandLineOption("lo", "low", "lower diagonal", OptionType::Int), "<Int>"));
298 	addOption(parser, addArgumentText(CommandLineOption("hi", "high", "upper diagonal", OptionType::Int), "<Int>"));
299 
300 	addSection(parser, "DP Matrix Configuration Options:");
301 	addOption(parser, addArgumentText(CommandLineOption("c", "config", "alignment configuration", (int)OptionType::String, "ffff"), "[ffff | ... | tttt]"));
302 	addHelpLine(parser, "tfff = First row with 0's");
303 	addHelpLine(parser, "ftff = First column with 0's");
304 	addHelpLine(parser, "fftf = Search last column for max");
305 	addHelpLine(parser, "ffft = Search last row for max");
306 	addHelpLine(parser, "All combinations are allowed.");
307 
308 	if (argc == 1)
309 	{
310 		shortHelp(parser, std::cerr);	// print short help and exit
311 		return 0;
312 	}
313 
314 	if (!parse(parser, argc, argv, ::std::cerr)) return 1;
315 	if (isSetLong(parser, "help") || isSetLong(parser, "version")) return 0;	// print help or version and exit
316 
317 	// Basic command line options
318 	String<char> alphabet;
319 	getOptionValueLong(parser, "alphabet", alphabet);
320 
321 	// Initialize scoring matrices
322 	if (alphabet == "dna") _initScoreMatrix(parser, Dna5());
323 	else if (alphabet == "rna") _initScoreMatrix(parser, Rna5());
324 	else _initScoreMatrix(parser, AminoAcid());
325 
326 	return 0;
327 }
328