1 /* 2 Copyright 2009-2012 Andreas Biegert, Christof Angermueller 3 4 This file is part of the CS-BLAST package. 5 6 The CS-BLAST package is free software: you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation, either version 3 of the License, or 9 (at your option) any later version. 10 11 The CS-BLAST package is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "cs.h" 21 #include "alignment-inl.h" 22 #include "application.h" 23 #include "blosum_matrix.h" 24 #include "context_library.h" 25 #include "crf_pseudocounts-inl.h" 26 #include "crf-inl.h" 27 #include "count_profile-inl.h" 28 #include "getopt_pp.h" 29 #include "library_pseudocounts-inl.h" 30 #include "pssm.h" 31 #include "sequence-inl.h" 32 #include "a3m_compress.h" 33 #include "ffindexdatabase.h" 34 35 #include "ext/fmemopen.h" 36 #include "context_data.lib.h" 37 //include "context_data.crf.h" 38 #include "cs219.lib.h" 39 40 #ifdef OPENMP 41 #include <omp.h> 42 #endif 43 44 #include <sstream> 45 46 using namespace GetOpt; 47 using std::string; 48 using std::vector; 49 50 namespace cs { 51 struct CSTranslateAppOptions { 52 static const int kAssignMatchColsByQuery = -1; 53 CSTranslateAppOptionsCSTranslateAppOptions54 CSTranslateAppOptions() { Init(); } 55 ~CSTranslateAppOptionsCSTranslateAppOptions56 virtual ~CSTranslateAppOptions() { } 57 58 // Set csbuild default parameters InitCSTranslateAppOptions59 void Init() { 60 informat = "auto"; 61 outformat = "seq"; 62 modelfile = "internal"; 63 alphabetfile = "internal"; 64 pc_admix = 0.90; 65 pc_ali = 12.0; 66 match_assign = kAssignMatchColsByQuery; 67 weight_center = 1.6; 68 weight_decay = 0.85; 69 weight_as = 1000.0; 70 ffindex = false; 71 verbose = true; 72 } 73 74 // Validates the parameter settings and throws exception if needed. ValidateCSTranslateAppOptions75 void Validate() { 76 if (infile.empty()) throw Exception("No input file provided!"); 77 if (alphabetfile.empty()) throw Exception("No abstract states provided!"); 78 } 79 80 // The input alignment file with training data. 81 string infile; 82 // The output file. 83 string outfile; 84 // The file to which the output should be appended. 85 string appendfile; 86 // Input file with context profile library or HMM for generating pseudocounts 87 string modelfile; 88 // Input file with profile library to be used as abstract state alphabet 89 string alphabetfile; 90 // Input file format 91 string informat; 92 // Output file format (abstract state sequence or abstract state profile) 93 string outformat; 94 // Overall pseudocount admixture 95 double pc_admix; 96 // Constant in pseudocount calculation for alignments 97 double pc_ali; 98 // Match column assignment for FASTA alignments 99 int match_assign; 100 // Weight of central column in multinomial emission 101 double weight_center; 102 // Exponential decay of window weights 103 double weight_decay; 104 // Weight in emission calculation of abstract states 105 double weight_as; 106 // verbose output 107 bool verbose; 108 // ffindex 109 bool ffindex; 110 }; // CSTranslateAppOptions 111 GetMatchSymbol(double pp)112 char GetMatchSymbol(double pp) { 113 char rv = '='; 114 if (pp > 0.8) rv = '|'; 115 else if (pp > 0.6) rv = '+'; 116 else if (pp > 0.4) rv = '.'; 117 else if (pp > 0.2) rv = '-'; 118 return rv; 119 } 120 GetConfidence(double pp)121 inline int GetConfidence(double pp) { 122 return static_cast<int>(floor((pp - DBL_EPSILON) * 10)); 123 } 124 125 126 template<class Abc> 127 class CSTranslateApp : public Application { 128 public: 129 // Runs the csbuild application. Run()130 virtual int Run() { 131 SetupEmissions(); 132 SetupPseudocountEngine(); 133 SetupAbstractStateEngine(); 134 135 if (!opts_.ffindex) { 136 FILE *fin; 137 if (strcmp(opts_.infile.c_str(), "stdin") == 0) 138 fin = stdin; 139 else 140 fin = fopen(opts_.infile.c_str(), "r"); 141 142 if (!fin) 143 throw Exception("Unable to read input file '%s'!", opts_.infile.c_str()); 144 145 string header; 146 CountProfile<Abc> profile; // input profile we want to translate 147 ReadProfile(fin, header, profile); 148 149 size_t profile_counts_lenght = profile.counts.length(); 150 151 // Prepare abstract sequence in AS219 format 152 Sequence<AS219> as_seq(profile_counts_lenght); 153 as_seq.set_header(header); 154 155 // Translate count profile into abstract state count profile (Neff is one) 156 if (opts_.verbose) 157 fputs("Translating count profile to abstract state alphabet AS219 ...\n", out_); 158 159 CountProfile<AS219> as_profile(profile_counts_lenght); // output profile 160 Translate(profile, as_profile); 161 162 BuildSequence(as_profile, profile_counts_lenght, as_seq); 163 164 // Build pseudo-alignment for output 165 const size_t nseqs = 5; 166 const size_t header_width = 4; 167 const size_t width = 100; 168 vector<string> ali(nseqs, ""); 169 vector<string> labels(nseqs, ""); 170 labels[0] = "Pos"; 171 labels[1] = "Cons"; 172 labels[3] = "AS219"; 173 labels[4] = "Conf"; 174 175 BlosumMatrix sm; 176 ali[1] = ConservationSequence(profile, sm); 177 178 for (size_t i = 0; i < profile.counts.length(); ++i) { 179 ali[0].append(strprintf("%d", (i + 1) % 10)); 180 ali[2].push_back(GetMatchSymbol(as_profile.counts[i][as_seq[i]])); 181 ali[3].push_back(as_seq.chr(i)); 182 ali[4].append(strprintf("%d", GetConfidence(as_profile.counts[i][as_seq[i]]))); 183 } 184 185 // Print pseudo alignment in blocks if verbose 186 if (opts_.verbose) { 187 fputc('\n', out_); // blank line before alignment 188 while (!ali.front().empty()) { 189 for (size_t k = 0; k < nseqs; ++k) { 190 string label = labels[k]; 191 label += string(header_width - label.length() + 1, ' '); 192 fputs(label.c_str(), out_); 193 fputc(' ', out_); // separator between header and sequence 194 195 size_t len = MIN(width, ali[k].length()); 196 fputs(ali[k].substr(0, len).c_str(), out_); 197 fputc('\n', out_); 198 ali[k].erase(0, len); 199 } 200 fputc('\n', out_); // blank line after each block 201 } 202 } 203 204 // Write abstract-state sequence or profile to outfile 205 if (opts_.outformat == "seq") { 206 if (!opts_.outfile.empty()) { 207 WriteStateSequence(as_seq, opts_.outfile, false); 208 } 209 if (!opts_.appendfile.empty()) { 210 WriteStateSequence(as_seq, opts_.appendfile, true); 211 } 212 } else { 213 if (!opts_.outfile.empty()) { 214 WriteStateProfile(as_profile, opts_.outfile, false); 215 } 216 if (!opts_.appendfile.empty()) { 217 WriteStateProfile(as_profile, opts_.appendfile, true); 218 } 219 } 220 } 221 else { 222 const bool isCa3m = this->opts_.informat == "ca3m"; 223 224 std::string input_data_file = opts_.infile + ".ffdata"; 225 std::string input_index_file = opts_.infile + ".ffindex"; 226 227 FFindexDatabase *header_db = NULL; 228 FFindexDatabase *sequence_db = NULL; 229 230 if (isCa3m) { 231 // infile has to be the ffindex basepath with no suffices 232 input_data_file = this->opts_.infile + "_ca3m.ffdata"; 233 input_index_file = this->opts_.infile + "_ca3m.ffindex"; 234 235 std::string input_header_data_file = this->opts_.infile + "_header.ffdata"; 236 std::string input_header_index_file = this->opts_.infile + "_header.ffindex"; 237 header_db = new FFindexDatabase(const_cast<char *>(input_header_data_file.c_str()), 238 const_cast<char *>(input_header_index_file.c_str()), false); 239 240 241 std::string input_sequence_data_file = this->opts_.infile + "_sequence.ffdata"; 242 std::string input_sequence_index_file = this->opts_.infile + "_sequence.ffindex"; 243 sequence_db = new FFindexDatabase(const_cast<char *>(input_sequence_data_file.c_str()), 244 const_cast<char *>(input_sequence_index_file.c_str()), false); 245 246 } 247 248 FFindexDatabase input(const_cast<char *>(input_data_file.c_str()), 249 const_cast<char *>(input_index_file.c_str()), isCa3m); 250 input.ensureLinearAccess(); 251 252 //prepare output ffindex cs219 database 253 std::string output_data_file = opts_.outfile + ".ffdata"; 254 std::string output_index_file = opts_.outfile + ".ffindex"; 255 256 FILE *output_data_fh = fopen(output_data_file.c_str(), "w"); 257 FILE *output_index_fh = fopen(output_index_file.c_str(), "w"); 258 259 if (output_data_fh == NULL) { 260 LOG(ERROR) << "Could not open ffindex output data file! (" << output_data_file << ")!" << std::endl; 261 exit(1); 262 } 263 264 if (output_index_fh == NULL) { 265 LOG(ERROR) << "Could not open ffindex output index file! (" << output_index_file << ")!" << std::endl; 266 exit(1); 267 } 268 269 size_t output_offset = 0; 270 271 size_t input_range_start = 0; 272 size_t input_range_end = input.db_index->n_entries; 273 274 // Foreach entry 275 #pragma omp parallel for shared(input, sequence_db, header_db) 276 for (size_t entry_index = input_range_start; entry_index < input_range_end; entry_index++) { 277 ffindex_entry_t *entry = ffindex_get_entry_by_index(input.db_index, entry_index); 278 279 if (entry == NULL) { 280 LOG(WARNING) << "Could not open entry " << entry_index << " from input ffindex!" << std::endl; 281 continue; 282 } 283 284 if (opts_.verbose) { 285 fprintf(out_, "Processing entry: %s\n", entry->name); 286 } 287 288 std::ostringstream a3m_buffer; 289 std::string a3m_string; 290 FILE *inf; 291 if (isCa3m) { 292 char *data = ffindex_get_data_by_entry(input.db_data, entry); 293 294 compressed_a3m::extract_a3m(data, entry->length, 295 sequence_db->db_index, sequence_db->db_data, 296 header_db->db_index, header_db->db_data, 297 &a3m_buffer); 298 299 a3m_string = a3m_buffer.str(); 300 301 inf = fmemopen(static_cast<void *>(const_cast<char *>(a3m_string.c_str())), a3m_string.length(), "r"); 302 } else { 303 inf = ffindex_fopen_by_entry(input.db_data, entry); 304 } 305 306 if (inf == NULL) { 307 LOG(WARNING) << "Could not open input entry (" << entry->name << ")!" << std::endl; 308 continue; 309 } 310 311 string header; 312 CountProfile<Abc> profile; // input profile we want to translate 313 try { 314 ReadProfile(inf, header, profile); 315 } catch (const Exception &e) { 316 fprintf(out_, "Could not read entry: %s, Message: %s\n", entry->name, e.what()); 317 continue; 318 } 319 320 size_t profile_counts_length = profile.counts.length(); 321 322 CountProfile<AS219> as_profile(profile_counts_length); // output profile 323 Translate(profile, as_profile); 324 325 // Prepare abstract sequence in AS219 format 326 Sequence<AS219> as_seq(profile_counts_length); 327 as_seq.set_header(header); 328 BuildSequence(as_profile, profile_counts_length, as_seq); 329 330 std::stringstream out_buffer; 331 332 if (opts_.outformat == "seq") { 333 WriteStateSequence(as_seq, out_buffer); 334 } else { 335 WriteStateProfile(as_profile, out_buffer); 336 } 337 338 std::string out_string = out_buffer.str(); 339 #pragma omp critical 340 { 341 ffindex_insert_memory(output_data_fh, output_index_fh, &output_offset, 342 const_cast<char *>(out_string.c_str()), 343 out_string.size(), entry->name); 344 } 345 346 // FIXME: we are leaking inf, but if we fclose we get weird crashes 347 //fclose(inf); 348 349 } 350 351 fclose(output_index_fh); 352 fclose(output_data_fh); 353 354 355 ffsort_index(output_index_file.c_str()); 356 357 if (isCa3m) { 358 delete sequence_db; 359 delete header_db; 360 } 361 } 362 363 return 0; 364 }; 365 366 // Parses command line options. ParseOptions(GetOpt_pp & ops)367 virtual void ParseOptions(GetOpt_pp &ops) { 368 ops >> Option('i', "infile", opts_.infile, opts_.infile); 369 ops >> Option('o', "outfile", opts_.outfile, opts_.outfile); 370 ops >> Option('a', "appendfile", opts_.appendfile, opts_.appendfile); 371 ops >> Option('I', "informat", opts_.informat, opts_.informat); 372 ops >> Option('O', "outformat", opts_.outformat, opts_.outformat); 373 ops >> Option('M', "match-assign", opts_.match_assign, opts_.match_assign); 374 ops >> Option('x', "pc-admix", opts_.pc_admix, opts_.pc_admix); 375 ops >> Option('c', "pc-ali", opts_.pc_ali, opts_.pc_ali); 376 ops >> Option('A', "alphabet", opts_.alphabetfile, opts_.alphabetfile); 377 ops >> Option('D', "context-data", opts_.modelfile, opts_.modelfile); 378 ops >> Option('w', "weight", opts_.weight_as, opts_.weight_as); 379 ops >> OptionPresent('f', "ffindex", opts_.ffindex); 380 ops >> Option('v', "verbose", opts_.verbose, opts_.verbose); 381 382 opts_.Validate(); 383 384 if (opts_.outfile.compare("stdout") == 0) 385 opts_.verbose = false; 386 387 if (opts_.outfile.empty() && opts_.appendfile.empty()) 388 opts_.outfile = GetBasename(opts_.infile, false) + ".as"; 389 if (opts_.informat == "auto") 390 opts_.informat = GetFileExt(opts_.infile); 391 }; 392 393 // Prints options summary to stream. PrintOptions()394 virtual void PrintOptions() const { 395 fprintf(out_, " %-30s %s\n", "-i, --infile <file>", 396 "Input file with alignment or sequence"); 397 fprintf(out_, " %-30s %s\n", "-o, --outfile <file>", 398 "Output file for generated abstract state sequence (def: <infile>.as)"); 399 fprintf(out_, " %-30s %s\n", "-a, --append <file>", "Append generated abstract state sequence to this file"); 400 fprintf(out_, " %-30s %s (def=%s)\n", "-I, --informat prf|seq|fas|...", 401 "Input format: prf, seq, fas, a2m, a3m or ca3m", opts_.informat.c_str()); 402 fprintf(out_, " %-30s %s (def=%s)\n", "-O, --outformat seq|prf", "Outformat: abstract state sequence or profile", 403 opts_.outformat.c_str()); 404 fprintf(out_, " %-30s %s\n", "-M, --match-assign [0:100]", 405 "Make all FASTA columns with less than X% gaps match columns"); 406 fprintf(out_, " %-30s %s\n", "", "(def: make columns with residue in first sequence match columns)"); 407 fprintf(out_, " %-30s %s (def=internal)\n", "-A, --alphabet <file>", 408 "Abstract state alphabet consisting of exactly 219 states"); 409 fprintf(out_, " %-30s %s (def=internal)\n", "-D, --context-data <file>", 410 "Add context-specific pseudocounts using given context-data"); 411 fprintf(out_, " %-30s %s (def=%-.2f)\n", "-x, --pc-admix [0,1]", 412 "Pseudocount admix for context-specific pseudocounts", opts_.pc_admix); 413 fprintf(out_, " %-30s %s (def=%-.1f)\n", "-c, --pc-ali [0,inf[", 414 "Constant in pseudocount calculation for alignments", opts_.pc_ali); 415 fprintf(out_, " %-30s %s (def=%-.2f)\n", "-w, --weight [0,inf[", 416 "Weight of abstract state column in emission calculation", opts_.weight_as); 417 fprintf(out_, " %-30s %s (def=off)\n", "-f, --ffindex", 418 "Read from -i <ffindex>, write to -o <ffindex> (do not include _ca3m suffix for ca3m informat); enables openmp if possible"); 419 }; 420 421 // Prints short application description. PrintBanner()422 virtual void PrintBanner() const { 423 fputs("Translate a sequence/alignment into an abstract state alphabet.\n", out_); 424 }; 425 426 // Prints usage banner to stream. PrintUsage()427 virtual void PrintUsage() const { 428 fputs("Usage: cstranslate -i <infile> [options]\n", out_); 429 }; 430 431 // Setup pseudocount engine SetupPseudocountEngine()432 void SetupPseudocountEngine() { 433 if (opts_.modelfile.empty()) 434 return; 435 436 std::string engine; 437 FILE *fin; 438 if (opts_.modelfile == "internal") { 439 // fin = fmemopen((void*)context_data_crf, context_data_crf_len, "r"); 440 // engine = "crf"; 441 fin = fmemopen((void*)context_data_lib, context_data_lib_len, "r"); 442 engine = "lib"; 443 } else { 444 fin = fopen(opts_.modelfile.c_str(), "r"); 445 engine = GetFileExt(opts_.modelfile); 446 } 447 if (!fin) { 448 throw Exception("Unable to read file '" + opts_.modelfile + "'!\n"); 449 } 450 451 if (engine == "lib") { 452 // Older generative approach by Biegert & Soeding PNAS 2009 453 if (opts_.verbose) { 454 fprintf(out_, "Reading context library for pseudocounts from %s ...\n", GetBasename(opts_.modelfile).c_str()); 455 } 456 pc_lib_.reset(new ContextLibrary<Abc>(fin)); 457 TransformToLog(*pc_lib_); 458 pc_.reset(new LibraryPseudocounts<Abc>(*pc_lib_, opts_.weight_center,opts_.weight_decay)); 459 } else if (engine == "crf") { 460 // Newer discriminative approach by Angermueller & Soeding Bioinformatics 2012 461 if (opts_.verbose) { 462 fprintf(out_, "Reading CRF for pseudocounts from %s ...\n", GetBasename(opts_.modelfile).c_str()); 463 } 464 pc_crf_.reset(new Crf<Abc>(fin)); 465 pc_.reset(new CrfPseudocounts<Abc>(*pc_crf_)); 466 } else { 467 throw Exception("Invalid pseudocount engine: " + engine); 468 } 469 fclose(fin); 470 }; 471 472 // Setup abstract state engine SetupAbstractStateEngine()473 void SetupAbstractStateEngine() { 474 FILE *fin; 475 if (opts_.alphabetfile == "internal") { 476 fin = fmemopen((void*)cs219_lib, cs219_lib_len, "r"); 477 } else { 478 fin = fopen(opts_.alphabetfile.c_str(), "r"); 479 } 480 if (!fin) { 481 throw Exception("Unable to read file '" + opts_.alphabetfile + "'!\n"); 482 } 483 if (opts_.verbose) { 484 fprintf(out_, "Reading abstract state alphabet from %s ...\n", GetBasename(opts_.alphabetfile).c_str()); 485 } 486 487 as_lib_.reset(new ContextLibrary<Abc>(fin)); 488 489 if (as_lib_->size() != AS219::kSize) { 490 fprintf(out_, "Abstract state alphabet should have %zu states but actually has %zu states!\n", AS219::kSize, as_lib_->size()); 491 exit(1); 492 } 493 494 if (static_cast<int>(as_lib_->wlen()) != 1) { 495 fprintf(out_, "Abstract state alphabet should have a window length of %d but actually has %zu!\n", 1, as_lib_->wlen()); 496 exit(1); 497 } 498 499 TransformToLog(*as_lib_); 500 fclose(fin); 501 }; 502 SetupEmissions()503 void SetupEmissions() { 504 emission_.reset(new Emission<Abc>(1, this->opts_.weight_as, 1.0)); 505 } 506 507 // Writes abstract state sequence to outfile 508 void WriteStateSequence(const Sequence<AS219> &seq, string outfile, bool append = false) const { 509 FILE *fout; 510 if (outfile.compare("stdout") == 0) 511 fout = stdout; 512 else 513 fout = fopen(outfile.c_str(), append ? "a" : "w"); 514 if (!fout) throw Exception("Can't %s to file '%s'!", append ? "append" : "write", outfile.c_str()); 515 for (size_t i = 0; i < seq.length(); ++i) { 516 fputc((char) seq[i], fout); 517 } 518 fclose(fout); 519 if (opts_.verbose) 520 fprintf(out_, "%s abstract state sequence to %s\n", append ? "Appended" : "Wrote", outfile.c_str()); 521 }; 522 WriteStateSequence(const Sequence<AS219> & seq,std::stringstream & ss)523 void WriteStateSequence(const Sequence<AS219> &seq, std::stringstream &ss) { 524 for (size_t i = 0; i < seq.length(); ++i) { 525 ss.put((char) seq[i]); 526 } 527 }; 528 529 // Writes abstract state profile to outfile 530 void WriteStateProfile(const CountProfile<AS219> &prof, string outfile, bool append = false) const { 531 FILE *fout; 532 if (outfile.compare("stdout") == 0) 533 fout = stdout; 534 else 535 fout = fopen(outfile.c_str(), append ? "a" : "w"); 536 if (!fout) throw Exception("Can't %s to output file '%s'!", append ? "append" : "write", outfile.c_str()); 537 prof.Write(fout); 538 fclose(fout); 539 if (opts_.verbose) 540 fprintf(out_, "%s abstract state count profile to %s\n", append ? "Appended" : "Wrote", outfile.c_str()); 541 }; 542 WriteStateProfile(const CountProfile<AS219> & prof,std::stringstream & ss)543 void WriteStateProfile(const CountProfile<AS219> &prof, std::stringstream &ss) { 544 prof.Write(ss); 545 }; 546 ReadProfile(FILE * fin,string & header,CountProfile<Abc> & profile)547 void ReadProfile(FILE *fin, string &header, CountProfile<Abc> &profile) { 548 if (opts_.informat == "prf") { // read count profile from infile 549 profile = CountProfile<Abc>(fin);; 550 if (profile.name.empty()) header = GetBasename(opts_.infile, false); 551 else header = profile.name; 552 553 if (pc_) { 554 if (opts_.verbose) 555 fprintf(out_, "Adding cs-pseudocounts (admix=%.2f) ...\n", opts_.pc_admix); 556 CSBlastAdmix admix(opts_.pc_admix, opts_.pc_ali); 557 profile.counts = pc_->AddTo(profile, admix); 558 Normalize(profile.counts, profile.neff); 559 } 560 561 } else if (opts_.informat == "seq") { // build profile from sequence 562 Sequence<Abc> seq(fin); 563 header = seq.header(); 564 profile = CountProfile<Abc>(seq); 565 566 if (pc_) { 567 if (opts_.verbose) 568 fprintf(out_, "Adding cs-pseudocounts (admix=%.2f) ...\n", opts_.pc_admix); 569 ConstantAdmix admix(opts_.pc_admix); 570 profile.counts = pc_->AddTo(seq, admix); 571 } 572 573 } else { // build profile from alignment 574 AlignmentFormat f = AlignmentFormatFromString(opts_.informat); 575 Alignment<Abc> ali(fin, f); 576 header = ali.name(); 577 578 if (f == FASTA_ALIGNMENT) { 579 if (opts_.match_assign == CSTranslateAppOptions::kAssignMatchColsByQuery) 580 ali.AssignMatchColumnsBySequence(0); 581 else 582 ali.AssignMatchColumnsByGapRule(opts_.match_assign); 583 } 584 profile = CountProfile<Abc>(ali); 585 586 if (pc_) { 587 if (opts_.verbose) 588 fprintf(out_, "Adding cs-pseudocounts (admix=%.2f) ...\n", opts_.pc_admix); 589 CSBlastAdmix admix(opts_.pc_admix, opts_.pc_ali); 590 profile.counts = pc_->AddTo(profile, admix); 591 Normalize(profile.counts, profile.neff); 592 } 593 } 594 fclose(fin); // close input file 595 }; 596 Translate(CountProfile<Abc> & profile,CountProfile<cs::AS219> & as_profile)597 void Translate(CountProfile<Abc> &profile, CountProfile<cs::AS219> &as_profile) { 598 for (size_t i = 0; i < as_profile.length(); ++i) { 599 CalculatePosteriorProbs(*as_lib_, *emission_, profile, i, as_profile.counts[i]); 600 } 601 as_profile.name = GetBasename(opts_.infile, false); 602 as_profile.name = as_profile.name.substr(0, as_profile.name.length() - 1); 603 }; 604 BuildSequence(CountProfile<AS219> & as_profile,size_t profile_counts_length,Sequence<cs::AS219> & as_seq)605 void BuildSequence(CountProfile<AS219> &as_profile, size_t profile_counts_length, Sequence<cs::AS219> &as_seq) { 606 // We also build an abstract state sequence, either just for pretty printing or 607 // even because this is the actual output that the user wants 608 for (size_t i = 0; i < profile_counts_length; ++i) { 609 // Find state with maximal posterior prob and assign it to as_seq[i] 610 size_t k_max = 0; 611 double p_max = as_profile.counts[i][0]; 612 for (size_t k = 1; k < AS219::kSize; ++k) { 613 if (as_profile.counts[i][k] > p_max) { 614 k_max = k; 615 p_max = as_profile.counts[i][k]; 616 } 617 } 618 as_seq[i] = k_max; 619 } 620 }; 621 622 // Parameter wrapper 623 CSTranslateAppOptions opts_; 624 // Profile library with abstract states 625 scoped_ptr<ContextLibrary<Abc> > as_lib_; 626 // Profile library for context pseudocounts 627 scoped_ptr<ContextLibrary<Abc> > pc_lib_; 628 // CRF for CRF context pseudocounts 629 scoped_ptr<Crf<Abc> > pc_crf_; 630 // Pseudocount engine 631 scoped_ptr<Pseudocounts<Abc> > pc_; 632 // Emissions 633 scoped_ptr<Emission<Abc> > emission_; 634 }; // class CSTranslateApp 635 636 } // namespace cs 637