1 /* massXpert - the true massist's program. 2 -------------------------------------- 3 Copyright(C) 2006,2007 Filippo Rusconi 4 5 http://www.massxpert.org/massXpert 6 7 This file is part of the massXpert project. 8 9 The massxpert project is the successor to the "GNU polyxmass" 10 project that is an official GNU project package(see 11 www.gnu.org). The massXpert project is not endorsed by the GNU 12 project, although it is released ---in its entirety--- under the 13 GNU General Public License. A huge part of the code in massXpert 14 is actually a C++ rewrite of code in GNU polyxmass. As such 15 massXpert was started at the Centre National de la Recherche 16 Scientifique(FRANCE), that granted me the formal authorization to 17 publish it under this Free Software License. 18 19 This software is free software; you can redistribute it and/or 20 modify it under the terms of the GNU General Public 21 License version 3, as published by the Free Software Foundation. 22 23 24 This software is distributed in the hope that it will be useful, 25 but WITHOUT ANY WARRANTY; without even the implied warranty of 26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 27 General Public License for more details. 28 29 You should have received a copy of the GNU General Public License 30 along with this software; if not, write to the 31 32 Free Software Foundation, Inc., 33 34 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 35 */ 36 37 38 /////////////////////// Local includes 39 #include "sequence.hpp" 40 #include "polChemDef.hpp" 41 42 43 namespace massXpert 44 { 45 46 //! Constructs a sequence 47 /*! The sequence is in the form of a string of concatenated monomer 48 codes. No quality check is performed. 49 50 \param text sequence in the form of concatenated monomer codes. 51 */ Sequence(const QString & text)52 Sequence::Sequence(const QString &text) 53 : m_monomerText(text) 54 { 55 } 56 57 58 //! Destroys the sequence. ~Sequence()59 Sequence::~Sequence() 60 { 61 while(!m_monomerList.isEmpty()) 62 delete m_monomerList.takeFirst(); 63 } 64 65 66 //! Sets the sequence text. 67 /*! \param text Monomer sequence as a string of monomer codes. 68 */ 69 void setMonomerText(const QString & text)70 Sequence::setMonomerText(const QString &text) 71 { 72 m_monomerText = text; 73 } 74 75 76 //! Appends text to the sequence text. 77 /*! \param text Monomer sequence as a string of monomer codes. 78 */ 79 void appendMonomerText(const QString & text)80 Sequence::appendMonomerText(const QString &text) 81 { 82 if (text.isEmpty()) 83 return; 84 85 m_monomerText += text; 86 } 87 88 89 //! Returns the sequence as a string of monomer codes. 90 /*! \return The sequence as a string. 91 */ 92 const QString * monomerText()93 Sequence::monomerText() 94 { 95 return &m_monomerText; 96 } 97 98 99 //! Returns the sequence as a list of monomers. 100 /*! \return The list of monomers. 101 */ 102 const QList<const Monomer *> & monomerList() const103 Sequence::monomerList() const 104 { 105 return m_monomerList; 106 } 107 108 109 //! Returns the sequence as a list of monomers. 110 /*! \return The list of monomers. 111 */ 112 QList<const Monomer *> * monomerListPtr()113 Sequence::monomerListPtr() 114 { 115 return &m_monomerList; 116 } 117 118 119 //! Returns the size of the sequence. 120 /*! Returns the size of the sequence as the size of the list of 121 monomers. 122 123 \return The number of items in the list of monomers. 124 */ 125 int size() const126 Sequence::size() const 127 { 128 return m_monomerList.size(); 129 } 130 131 132 //! Removes all spaces, carriage returns and linefeeds. 133 void unspacifyMonomerText()134 Sequence::unspacifyMonomerText() 135 { 136 // Removal of all spaces, carriage returns and linefeeds: 137 138 for (int iter = m_monomerText.length() -1; iter >= 0 ; --iter) 139 { 140 QChar curChar = m_monomerText.at(iter); 141 142 QChar::Category category = curChar.category(); 143 144 if(category == QChar::Separator_Space) 145 m_monomerText.remove(iter, 1); 146 147 else if (curChar == '\n') 148 m_monomerText.remove(iter, 1); 149 else if (curChar == '\r') 150 m_monomerText.remove(iter, 1); 151 } 152 } 153 154 155 //! Creates the string representation of the sequence. 156 /*! The string representation of the sequence is created by iterating 157 in the list of monomers and concatenating into one single string the 158 monomer code of each iterated monomer. The generated string is 159 stored in the member datum. 160 161 \return The number of codes concatenated into the string. 162 163 \sa makeMonomerList() 164 */ 165 int makeMonomerText()166 Sequence::makeMonomerText() 167 { 168 int iter = 0; 169 170 m_monomerText.clear(); 171 172 for (iter = 0; iter < m_monomerList.size(); ++iter) 173 m_monomerText.append(m_monomerList.at(iter)->code()); 174 175 return iter; 176 } 177 178 179 QString * monomerText(int start,int end,bool withModif) const180 Sequence::monomerText(int start, int end, bool withModif) const 181 { 182 int localStart = 0; 183 int localEnd = 0; 184 185 QString *p_text = new QString(); 186 187 if (size() == 0) 188 return p_text; 189 190 if (start > end) 191 { 192 localStart = end; 193 localEnd = start; 194 } 195 else 196 { 197 localStart = start; 198 localEnd = end; 199 } 200 201 if (localStart < 0) 202 localStart = 0; 203 204 if (localEnd < 0 || localEnd >= size()) 205 localEnd = size() - 1; 206 207 QString text; 208 209 for (int iter = localStart ; iter < localEnd + 1 ; ++iter) 210 { 211 const Monomer *monomer = m_monomerList.at(iter); 212 213 if(withModif) 214 { 215 if (monomer->isModified()) 216 { 217 for(int iter = 0; iter < monomer->modifList()->size();++iter) 218 { 219 text = QString("%1<%2>") 220 .arg(monomer->code()) 221 .arg(monomer->modifList()->at(iter)->name()); 222 } 223 } 224 else 225 text = monomer->code(); 226 } 227 else 228 text = monomer->code(); 229 230 p_text->append(text); 231 } 232 233 return p_text; 234 } 235 236 237 QString * monomerText(const CoordinateList & coordinateList,bool withModif,bool delimitedRegions) const238 Sequence::monomerText(const CoordinateList &coordinateList, 239 bool withModif, bool delimitedRegions) const 240 { 241 QString *p_text = new QString(); 242 243 for (int iter = 0; iter < coordinateList.size(); ++iter) 244 { 245 // New coordinates instance we are iterating into. 246 Coordinates *coordinates = coordinateList.at(iter); 247 248 QString *tempString = monomerText(coordinates->start(), 249 coordinates->end(), 250 withModif); 251 252 if(delimitedRegions) 253 *p_text += QString("Region %1: %2\n") 254 .arg(coordinates->positionsAsText()) 255 .arg(*tempString); 256 else 257 *p_text += *tempString; 258 259 delete(tempString); 260 } 261 262 *p_text += QString("\n"); 263 264 return p_text; 265 } 266 267 268 //! Creates a list of monomers from the string sequence. 269 /*! The creation of the list of monomers is performed by iterating in 270 the sequence text form and for each monomer code parsed a monomer is 271 created by looking into a list of reference monomers(belonging to 272 the polymer chemistry definition used at construction). 273 274 \param reset If true, the list of monomers is first cleared. 275 276 \param polChemDef Polymer chemistry definition to be used to craft 277 the fully qualified monomers using their code in the text 278 representation of the sequence. 279 280 \param errorList list of int where to store the indices where errors 281 are encountered. Defaults to 0, in which case no storing of the 282 indices occurs. 283 284 \return The size of the monomer list or -1 if an error occurred. 285 */ 286 int makeMonomerList(const PolChemDef * polChemDef,bool reset,QList<int> * errorList)287 Sequence::makeMonomerList(const PolChemDef *polChemDef, bool reset, 288 QList<int> *errorList) 289 { 290 if (!polChemDef) 291 return -1; 292 293 int index = 0; 294 int ret = -1; 295 QString err; 296 QString code; 297 298 // If error indices are to be stored, the list MUST be empty. 299 if (errorList) 300 Q_ASSERT(errorList->size() == 0); 301 302 if (reset) 303 { 304 while(!m_monomerList.isEmpty()) 305 delete m_monomerList.takeFirst(); 306 } 307 308 unspacifyMonomerText(); 309 310 // qDebug() << __FILE__ << __LINE__ 311 // << "Sequence:" << m_monomerText; 312 313 ret = nextCode(&code, &index, &err, polChemDef->codeLength()); 314 315 const QList<Monomer*> &refList = polChemDef->monomerList(); 316 317 while(1) 318 { 319 if(ret < 0) 320 { 321 // There was an error in the parsed code. Store the index. 322 if (errorList) 323 { 324 errorList->append(index); 325 ++index; 326 ret = nextCode(&code, &index, &err, polChemDef->codeLength()); 327 continue; 328 } 329 else 330 { 331 break; 332 } 333 } 334 335 if(ret == 0) 336 break; 337 338 Monomer *monomer = new Monomer(polChemDef, "NOT_SET"); 339 340 if(Monomer::isCodeInList(code, refList, monomer) == -1) 341 { 342 delete monomer; 343 344 if (errorList) 345 { 346 errorList->append(index); 347 ++index; 348 ret = nextCode(&code, &index, &err, polChemDef->codeLength()); 349 continue; 350 } 351 else 352 { 353 return -1; 354 } 355 } 356 357 m_monomerList.append(monomer); 358 359 // qDebug() << __FILE__ << __LINE__ 360 // << "New monomer:" << monomer->name(); 361 362 ++index; 363 364 // qDebug() << __FILE__ << __LINE__ << "index:" << index; 365 366 ret = nextCode(&code, &index, &err, polChemDef->codeLength()); 367 } 368 // End of 369 // while(1) 370 371 if (errorList) 372 { 373 if(errorList->size()) 374 return -1; 375 } 376 377 if (ret == -1) 378 return -1; 379 380 return m_monomerList.size(); 381 } 382 383 384 //! Returns the next code occurring in the sequence. 385 /*! Returns the code occurring in the sequence starting at index \p 386 index. 387 388 \param code Location where to store the code to return to caller. 389 390 \param index Index at which parsing for a new code in the sequence has 391 to start. 392 393 \param err Location where to store the erroneous characters that 394 might be encountered during parsing of the sequence. 395 396 \param codeLength Number of authorized characters to qualify a 397 monomer code. 398 399 \return the length(in characters) of the returned code or -1 if an 400 error occurred. 401 */ 402 int nextCode(QString * code,int * index,QString * err,int codeLength)403 Sequence::nextCode(QString *code, int *index, QString *err, int codeLength) 404 { 405 QString newCode; 406 int iter = 0; 407 408 // We get a sequence of monomer codes(like "LysArgGlu" for example) 409 // and we have to return the next code starting from *index. Note 410 // that the sequence must not contain invalid characters. The 411 // invalid characters might be placed in err for further scrutiny by 412 // the caller. 413 414 // Returns the count of actually parsed characters in the string 415 // newCode(copied to 'code' param). If an error occurs -1 is 416 // returned and the faulty character is copied in 'err'. 'index' is 417 // updated with the index of the last valid character parsed for 418 // current code. 419 420 Q_ASSERT(code); 421 Q_ASSERT(index); 422 Q_ASSERT(err); 423 424 code->clear(); 425 err->clear(); 426 427 int length = m_monomerText.length(); 428 429 while(1) 430 { 431 if(iter >= codeLength) 432 { 433 // Because we have progressed farther than authorized by 434 // the number of characters allowed in the monomer codes 435 // of this polymer chemistry definition, we decrement iter 436 // and break the loop... Later in this function, we'll set 437 // the proper index in the sequence where next parsing run 438 // should occurs (the calling function will increment 439 // *index by one). 440 441 --iter; 442 break; 443 } 444 445 if(iter + *index >= length) 446 break; 447 448 QChar curChar = m_monomerText.at(iter + *index); 449 450 if(!curChar.isLetter()) 451 { 452 // qDebug() << __FILE__ << __LINE__ 453 // << "The character is not a letter:" 454 // << curChar; 455 456 *err = curChar; 457 458 // The non-Letter character might be '/', which would be 459 // perfectly fine, as we use it to symbolize the actual 460 // cleavage site. Which means that we will continue 461 // parsing the rest of the string : we have to give the 462 // current position back to the caller in the *index 463 // variable for the next call to this function to start at 464 // next character (not falling back to '/', which would 465 // make us enter in an infinite loop). 466 467 *index = *index + iter; 468 469 return -1; 470 } 471 472 bool isLower =(curChar.category() == QChar::Letter_Lowercase); 473 474 if(iter == 0) 475 { 476 if (isLower) 477 { 478 qDebug() << __FILE__ << __LINE__ 479 << "First character of monomer code might not be" 480 << "lower case; sequence is" 481 << m_monomerText; 482 483 *err = curChar; 484 485 return -1; 486 } 487 else 488 { 489 // Good, first char is uppercase. 490 newCode += curChar; 491 } 492 } 493 else //(iter != 0) 494 { 495 // We are not in our first iteration. So either the current 496 // character is lowercase and we are just continuing to 497 // iterate into a multi-char monomer code, or the current 498 // character is uppercase, in which case we are starting to 499 // iterate in a new monomer code. 500 501 if (isLower) 502 newCode += curChar; 503 else 504 { 505 // Decrement iter, because this round was for nothing: 506 // we had "invaded" the next monomer code in sequence, 507 // which we must not do. 508 509 --iter; 510 break; 511 } 512 } 513 514 ++iter; 515 } 516 517 // We finished parsing at most codeLength characters out of 518 // 'm_monomerText', so we have a valid code in the 'code' variable. We 519 // can also compute a new index position in the sequence and return 520 // the number of characters that we effectively parsed. Note that 521 // the caller will be responsible for incrementing the 'index' value 522 // by one character unit so as not to reparse the last characters of 523 // the sent 'code' object. 524 525 *index = *index + iter; 526 *code = newCode; 527 err->clear(); 528 529 return code->length(); 530 } 531 532 533 534 // Returns -1 if an error was encountered, 0 if no match could be 535 // found, 1 if a match was found. 536 bool findForwardMotif(Sequence * motif,const PolChemDef * polChemDef,int * index)537 Sequence::findForwardMotif(Sequence *motif, 538 const PolChemDef *polChemDef, 539 int *index) 540 { 541 Q_ASSERT(motif); 542 Q_ASSERT(polChemDef); 543 Q_ASSERT(index); 544 545 if (*index < 0) 546 return -1; 547 if (*index >= size()) 548 return -1; 549 550 int motifSize = motif->size(); 551 552 // If motif's length is 0, then nothing to search for, return 553 // unmodified 'index'. 554 if (!motifSize) 555 return 0; 556 557 // Simple optimization, if index + size of motif is greater then 558 // size of sequence, return right away. 559 if (*index + motifSize >= size()) 560 return 0; 561 562 // First, make a monomerList. 563 if (motif->makeMonomerList(polChemDef) == -1) 564 return -1; 565 566 // Compare *this sequence with the one in 'motif', starting at index 567 // 'index' in *this sequence and 0 in 'motif'. 568 569 bool matched = false; 570 int matchIndex = 0; 571 572 for (int iter = *index; iter < size(); ++iter) 573 { 574 matched = false; 575 int jter = 0; 576 577 const Monomer *monomer = at(iter); 578 const Monomer *motifMonomer = motif->at(jter); 579 580 // We do not compare with operator == because that comparison 581 // would involve the comparison of modifications inside the 582 // monomers, which would not work here. 583 if(monomer->code() != motifMonomer->code()) 584 continue; 585 586 // An easy check is to see if the number of remaining monomers 587 // in the polymer sequence is compatible with the number of 588 // monomers still to be matched in the find array. Imagine the 589 // sequence of the polymer ends like this: ==========JTOUTVU and 590 // the sequence to be searched for is : TVUL What we see is that 591 // the T of the TVU of the sequence matches; however we can stop 592 // the search right away because there is a 'L' in the search 593 // pattern that is not present in the end part of the 594 // sequence. This is exactly what is checked below. Note that 595 // this check makes SURE that at the end of the second inner 596 // loop, when we get out of it, the sole reason we may not 597 // consider that the match did not occur is because actually two 598 // monomers differred and not because anybody came out of the 599 // borders of the sequence in neither the array of the sequence 600 // to be searched, nor the array of the polymer sequence. This 601 // makes it very easy to assess if a match occurred or not. 602 603 if(size() - iter < motif->size() - jter) 604 { 605 // Note that if it were ==, then it would have been possible 606 // that the sequence "just-in-time" match prior to ending of 607 // the polymer sequence array. Do not forget that we are in 608 // forward mode, thus we can break immediately, because we 609 // are certain that we won't have any chance to find the 610 // sequence downstream of current index. 611 612 matched = false; 613 break; 614 } 615 616 matchIndex = iter; 617 618 // We have to set the matched boolean to true, because if the 619 // motif to find is one monomer-long, then the loop below will 620 // not be entered, and we'll fail to know that the match 621 // occurred later on. 622 matched = true; 623 624 // Now that we have our anchoring point in the *this sequence, 625 // let's iterate in the motif, and check if the identity in 626 // sequence goes along. 627 628 for(int kter = jter + 1 ; kter < motif->size() ; ++kter) 629 { 630 // At first run in this loop, we are in the second cell of 631 // the find list, which means that we should have jter == 632 // 1. And we should compare its contents with those of the 633 // cell in the sequence list at index(iter + jter). 634 635 monomer = at(iter + kter); 636 motifMonomer = motif->at(kter); 637 638 // We do not compare with operator == because that 639 // comparison would involve the comparison of modifications 640 // inside the monomers, which would not work here. 641 if (monomer->code() == motifMonomer->code()) 642 { 643 // The monomers still match. 644 matched = true; 645 continue; 646 } 647 else 648 { 649 matched = false; 650 break; 651 } 652 } 653 // End of 654 // for (int kter = jter + 1 ; kter < motif->size() ; ++kter) 655 656 // At this point, we either have normally extinguished the run 657 // in the inner loop, or we have gone out of it before its 658 // normal termination. In either case, we have to test if the 659 // match occurred or not. 660 661 // Check if the match did NOT occur: 662 663 if(!matched) 664 { 665 // We just continue with the outer loop, that is we continue 666 // searching in the polymer sequence for a match with the 667 // first monomer in the motif. 668 669 continue; 670 } 671 else 672 { 673 // The match indeed occurred. 674 675 *index = matchIndex; 676 return 1; 677 } 678 } 679 // End of 680 // for (int iter = *index; iter < size(); ++iter) 681 682 683 // No match could be achieved, we have to let the caller function 684 // know this in a durable manner : returning 0. 685 686 return 0; 687 } 688 689 690 691 //! Returns the monomer at index \p index. 692 /*! 693 694 \param index index of the monomer to return in the list of 695 monomers. Must comply with the boundaries of the monomer list(that is 696 be >= 0 and < list.size()). 697 698 \return a pointer to the monomer. 699 */ 700 const Monomer * at(int index) const701 Sequence::at(int index) const 702 { 703 // qDebug() << __FILE__ << __LINE__ << "In call at() with value:" 704 // << index ; 705 706 if (index < 0) 707 qFatal("%s@%d -- Index cannot be less than 0.", 708 __FILE__, __LINE__); 709 710 if (index > m_monomerList.size()) 711 qFatal("%s@%d -- Index cannot be greater than polymer size.", 712 __FILE__, __LINE__); 713 714 return m_monomerList.at(index); 715 } 716 717 718 int monomerIndex(const Monomer * monomer)719 Sequence::monomerIndex(const Monomer *monomer) 720 { 721 for (int iter = 0; iter < m_monomerList.size(); ++iter) 722 { 723 if(m_monomerList.at(iter) == monomer) 724 return iter; 725 } 726 727 return -1; 728 } 729 730 731 //! Inserts the monomer at index \p index. 732 /*! Assertions ensure that \p index is not less than 0 and not greater 733 than sequence size as reported by size(). 734 735 This means that a monomer can only be inserted from a sequence if 736 the sequence is at least in the form of a list of monomers. 737 738 \param monomer dynamically allocated monomer. Assertion insures that 739 this pointer is non-0. 740 741 \param index Index of monomer to insert. 742 743 \return Always true. 744 */ 745 bool insertMonomerAt(const Monomer * monomer,int index)746 Sequence::insertMonomerAt(const Monomer *monomer, int index) 747 { 748 Q_ASSERT(monomer); 749 Q_ASSERT(index > -1 && index <= size()); 750 751 m_monomerList.insert(index, monomer); 752 753 return true; 754 } 755 756 757 bool prepareMonomerRemoval(const Monomer * monomer)758 Sequence::prepareMonomerRemoval(const Monomer *monomer) 759 { 760 return true; 761 } 762 763 764 //! Removes the monomer at index \p index. 765 /*! Assertions ensure that \p index is not less than 0 and not equal 766 or greater than sequence size as reported by size(). 767 768 This means that a monomer can only be removed from a sequence if the 769 sequence is at least in the form of a list of monomers. 770 771 \param index Index of monomer to remove. 772 773 \return Always true. 774 */ 775 bool removeMonomerAt(int index)776 Sequence::removeMonomerAt(int index) 777 { 778 Q_ASSERT(index > -1); 779 Q_ASSERT(index < size()); 780 781 const Monomer *monomer = at(index); 782 783 if (!prepareMonomerRemoval(monomer)) 784 return false; 785 786 m_monomerList.removeAt(index); 787 788 delete monomer; 789 790 return true; 791 } 792 793 794 bool validate(const PolChemDef * polChemDef)795 Sequence::validate(const PolChemDef *polChemDef) 796 { 797 Q_ASSERT(polChemDef); 798 799 if (makeMonomerList(polChemDef) > - 1) 800 return true; 801 802 return false; 803 } 804 805 quint16 checksum(int startIdx,int endIdx,bool withModifs) const806 Sequence::checksum(int startIdx, int endIdx, bool withModifs) const 807 { 808 if (!size()) 809 return 0; 810 811 QString *text = monomerText(startIdx, endIdx, withModifs); 812 813 QByteArray bytes = text->toUtf8(); 814 815 quint16 checksum = qChecksum(bytes.data(), bytes.size()); 816 817 // qDebug() << __FILE__ << __LINE__ 818 // << "checksum:" << checksum; 819 820 return checksum; 821 } 822 823 824 } // namespace massXpert 825