1 /* massXpert - the true massist's program.
2    --------------------------------------
3    Copyright(C) 2006,2007 Filippo Rusconi
4 
5    http://www.massxpert.org/massXpert
6 
7    This file is part of the massXpert project.
8 
9    The massxpert project is the successor to the "GNU polyxmass"
10    project that is an official GNU project package(see
11    www.gnu.org). The massXpert project is not endorsed by the GNU
12    project, although it is released ---in its entirety--- under the
13    GNU General Public License. A huge part of the code in massXpert
14    is actually a C++ rewrite of code in GNU polyxmass. As such
15    massXpert was started at the Centre National de la Recherche
16    Scientifique(FRANCE), that granted me the formal authorization to
17    publish it under this Free Software License.
18 
19    This software is free software; you can redistribute it and/or
20    modify it under the terms of the GNU  General Public
21    License version 3, as published by the Free Software Foundation.
22 
23 
24    This software is distributed in the hope that it will be useful,
25    but WITHOUT ANY WARRANTY; without even the implied warranty of
26    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
27    General Public License for more details.
28 
29    You should have received a copy of the GNU General Public License
30    along with this software; if not, write to the
31 
32    Free Software Foundation, Inc.,
33 
34    51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
35 */
36 
37 
38 /////////////////////// Local includes
39 #include "sequence.hpp"
40 #include "polChemDef.hpp"
41 
42 
43 namespace massXpert
44 {
45 
46   //! Constructs a sequence
47   /*! The sequence is in the form of a string of concatenated monomer
48       codes. No quality check is performed.
49 
50     \param text sequence in the form of concatenated monomer codes.
51   */
Sequence(const QString & text)52   Sequence::Sequence(const QString &text)
53     : m_monomerText(text)
54   {
55   }
56 
57 
58   //! Destroys the sequence.
~Sequence()59   Sequence::~Sequence()
60   {
61     while(!m_monomerList.isEmpty())
62       delete m_monomerList.takeFirst();
63   }
64 
65 
66   //! Sets the sequence text.
67   /*!  \param text Monomer sequence as a string of monomer codes.
68    */
69   void
setMonomerText(const QString & text)70   Sequence::setMonomerText(const QString &text)
71   {
72     m_monomerText = text;
73   }
74 
75 
76   //! Appends text to the sequence text.
77   /*!  \param text Monomer sequence as a string of monomer codes.
78    */
79   void
appendMonomerText(const QString & text)80   Sequence::appendMonomerText(const QString &text)
81   {
82     if (text.isEmpty())
83       return;
84 
85     m_monomerText += text;
86   }
87 
88 
89   //! Returns the sequence as a string of monomer codes.
90   /*!  \return The sequence as a string.
91    */
92   const QString *
monomerText()93   Sequence::monomerText()
94   {
95     return &m_monomerText;
96   }
97 
98 
99   //! Returns the sequence as a list of monomers.
100   /*!  \return The list of monomers.
101    */
102   const QList<const Monomer *> &
monomerList() const103   Sequence::monomerList() const
104   {
105     return m_monomerList;
106   }
107 
108 
109   //! Returns the sequence as a list of monomers.
110   /*!  \return The list of monomers.
111    */
112   QList<const Monomer *> *
monomerListPtr()113   Sequence::monomerListPtr()
114   {
115     return &m_monomerList;
116   }
117 
118 
119   //! Returns the size of the sequence.
120   /*! Returns the size of the sequence as the size of the list of
121     monomers.
122 
123     \return The number of items in the list of monomers.
124   */
125   int
size() const126   Sequence::size() const
127   {
128     return m_monomerList.size();
129   }
130 
131 
132   //! Removes all spaces, carriage returns and linefeeds.
133   void
unspacifyMonomerText()134   Sequence::unspacifyMonomerText()
135   {
136     // Removal of all spaces, carriage returns and linefeeds:
137 
138     for (int iter = m_monomerText.length() -1; iter >= 0 ; --iter)
139       {
140 	QChar curChar = m_monomerText.at(iter);
141 
142 	QChar::Category category = curChar.category();
143 
144 	if(category == QChar::Separator_Space)
145 	  m_monomerText.remove(iter, 1);
146 
147 	else if (curChar == '\n')
148 	  m_monomerText.remove(iter, 1);
149 	else if (curChar == '\r')
150 	  m_monomerText.remove(iter, 1);
151       }
152   }
153 
154 
155   //! Creates the string representation of the sequence.
156   /*! The string representation of the sequence is created by iterating
157     in the list of monomers and concatenating into one single string the
158     monomer code of each iterated monomer. The generated string is
159     stored in the member datum.
160 
161     \return The number of codes concatenated into the string.
162 
163     \sa makeMonomerList()
164   */
165   int
makeMonomerText()166   Sequence::makeMonomerText()
167   {
168     int iter = 0;
169 
170     m_monomerText.clear();
171 
172     for (iter = 0; iter < m_monomerList.size(); ++iter)
173       m_monomerText.append(m_monomerList.at(iter)->code());
174 
175     return iter;
176   }
177 
178 
179   QString *
monomerText(int start,int end,bool withModif) const180   Sequence::monomerText(int start, int end, bool withModif) const
181   {
182     int localStart = 0;
183     int localEnd = 0;
184 
185     QString *p_text = new QString();
186 
187     if (size() == 0)
188       return p_text;
189 
190     if (start > end)
191       {
192 	localStart = end;
193 	localEnd = start;
194       }
195     else
196       {
197 	localStart = start;
198 	localEnd = end;
199       }
200 
201     if (localStart < 0)
202       localStart = 0;
203 
204     if (localEnd < 0 || localEnd >= size())
205       localEnd = size() - 1;
206 
207     QString text;
208 
209     for (int iter = localStart ; iter < localEnd + 1 ; ++iter)
210       {
211 	const Monomer *monomer = m_monomerList.at(iter);
212 
213 	if(withModif)
214 	  {
215 	    if (monomer->isModified())
216 	      {
217 		for(int iter = 0; iter < monomer->modifList()->size();++iter)
218 		  {
219 		    text = QString("%1<%2>")
220 		      .arg(monomer->code())
221 		      .arg(monomer->modifList()->at(iter)->name());
222 		  }
223 	      }
224 	    else
225 	      text = monomer->code();
226 	  }
227 	else
228 	  text = monomer->code();
229 
230 	p_text->append(text);
231       }
232 
233     return p_text;
234   }
235 
236 
237   QString *
monomerText(const CoordinateList & coordinateList,bool withModif,bool delimitedRegions) const238   Sequence::monomerText(const CoordinateList &coordinateList,
239 			     bool withModif, bool delimitedRegions) const
240   {
241     QString *p_text = new QString();
242 
243     for (int iter = 0; iter < coordinateList.size(); ++iter)
244       {
245 	// New coordinates instance we are iterating into.
246 	Coordinates *coordinates = coordinateList.at(iter);
247 
248 	QString *tempString = monomerText(coordinates->start(),
249 					   coordinates->end(),
250 					   withModif);
251 
252 	if(delimitedRegions)
253 	  *p_text += QString("Region %1: %2\n")
254 	    .arg(coordinates->positionsAsText())
255 	    .arg(*tempString);
256 	else
257 	  *p_text += *tempString;
258 
259 	delete(tempString);
260       }
261 
262     *p_text += QString("\n");
263 
264     return p_text;
265   }
266 
267 
268   //! Creates a list of monomers from the string sequence.
269   /*! The creation of the list of monomers is performed by iterating in
270     the sequence text form and for each monomer code parsed a monomer is
271     created by looking into a list of reference monomers(belonging to
272     the polymer chemistry definition used at construction).
273 
274     \param reset If true, the list of monomers is first cleared.
275 
276     \param polChemDef Polymer chemistry definition to be used to craft
277     the fully qualified monomers using their code in the text
278     representation of the sequence.
279 
280     \param errorList list of int where to store the indices where errors
281     are encountered. Defaults to 0, in which case no storing of the
282     indices occurs.
283 
284     \return The size of the monomer list or -1 if an error occurred.
285   */
286   int
makeMonomerList(const PolChemDef * polChemDef,bool reset,QList<int> * errorList)287   Sequence::makeMonomerList(const PolChemDef *polChemDef, bool reset,
288 			     QList<int> *errorList)
289   {
290     if (!polChemDef)
291       return -1;
292 
293     int index = 0;
294     int ret = -1;
295     QString err;
296     QString code;
297 
298     // If error indices are to be stored, the list MUST be empty.
299     if (errorList)
300       Q_ASSERT(errorList->size() == 0);
301 
302     if (reset)
303       {
304 	while(!m_monomerList.isEmpty())
305 	  delete m_monomerList.takeFirst();
306       }
307 
308     unspacifyMonomerText();
309 
310 //     qDebug() << __FILE__ << __LINE__
311 // 	     << "Sequence:" << m_monomerText;
312 
313     ret = nextCode(&code, &index, &err, polChemDef->codeLength());
314 
315     const QList<Monomer*> &refList = polChemDef->monomerList();
316 
317     while(1)
318       {
319 	if(ret < 0)
320 	  {
321 	    // There was an error in the parsed code. Store the index.
322 	    if (errorList)
323 	      {
324 		errorList->append(index);
325 		++index;
326 		ret = nextCode(&code, &index, &err, polChemDef->codeLength());
327 		continue;
328 	      }
329 	    else
330 	      {
331 		break;
332 	      }
333 	  }
334 
335 	if(ret == 0)
336 	  break;
337 
338 	Monomer *monomer = new Monomer(polChemDef, "NOT_SET");
339 
340 	if(Monomer::isCodeInList(code, refList, monomer) == -1)
341 	  {
342 	    delete monomer;
343 
344 	    if (errorList)
345 	      {
346 		errorList->append(index);
347 		++index;
348 		ret = nextCode(&code, &index, &err, polChemDef->codeLength());
349 		continue;
350 	      }
351 	    else
352 	      {
353 		return -1;
354 	      }
355 	  }
356 
357 	m_monomerList.append(monomer);
358 
359 // 	qDebug() << __FILE__ << __LINE__
360 // 		 << "New monomer:" << monomer->name();
361 
362 	++index;
363 
364 // 	qDebug() <<  __FILE__ << __LINE__ << "index:" << index;
365 
366 	ret = nextCode(&code, &index, &err, polChemDef->codeLength());
367       }
368     // End of
369     // while(1)
370 
371     if (errorList)
372       {
373 	if(errorList->size())
374 	  return -1;
375       }
376 
377     if (ret == -1)
378       return -1;
379 
380     return m_monomerList.size();
381   }
382 
383 
384   //! Returns the next code occurring in the sequence.
385   /*! Returns the code occurring in the sequence starting at index \p
386     index.
387 
388     \param code Location where to store the code to return to caller.
389 
390     \param index Index at which parsing for a new code in the sequence has
391     to start.
392 
393     \param err Location where to store the erroneous characters that
394     might be encountered during parsing of the sequence.
395 
396     \param codeLength Number of authorized characters to qualify a
397     monomer code.
398 
399     \return the length(in characters) of the returned code or -1 if an
400     error occurred.
401   */
402   int
nextCode(QString * code,int * index,QString * err,int codeLength)403   Sequence::nextCode(QString *code, int *index, QString *err, int codeLength)
404   {
405     QString newCode;
406     int iter = 0;
407 
408     // We get a sequence of monomer codes(like "LysArgGlu" for example)
409     // and we have to return the next code starting from *index. Note
410     // that the sequence must not contain invalid characters. The
411     // invalid characters might be placed in err for further scrutiny by
412     // the caller.
413 
414     // Returns the count of actually parsed characters in the string
415     // newCode(copied to 'code' param). If an error occurs -1 is
416     // returned and the faulty character is copied in 'err'. 'index' is
417     // updated with the index of the last valid character parsed for
418     // current code.
419 
420     Q_ASSERT(code);
421     Q_ASSERT(index);
422     Q_ASSERT(err);
423 
424     code->clear();
425     err->clear();
426 
427     int length = m_monomerText.length();
428 
429     while(1)
430       {
431 	if(iter >= codeLength)
432 	  {
433 	    // Because we have progressed farther than authorized by
434 	    // the number of characters allowed in the monomer codes
435 	    // of this polymer chemistry definition, we decrement iter
436 	    // and break the loop... Later in this function, we'll set
437 	    // the proper index in the sequence where next parsing run
438 	    // should occurs (the calling function will increment
439 	    // *index by one).
440 
441 	    --iter;
442 	    break;
443 	  }
444 
445 	if(iter + *index >= length)
446 	  break;
447 
448 	QChar curChar = m_monomerText.at(iter + *index);
449 
450 	if(!curChar.isLetter())
451 	  {
452 // 	    qDebug() << __FILE__ << __LINE__
453 // 		     << "The character is not a letter:"
454 // 		     << curChar;
455 
456 	    *err = curChar;
457 
458 	    // The non-Letter character might be '/', which would be
459 	    // perfectly fine, as we use it to symbolize the actual
460 	    // cleavage site. Which means that we will continue
461 	    // parsing the rest of the string : we have to give the
462 	    // current position back to the caller in the *index
463 	    // variable for the next call to this function to start at
464 	    // next character (not falling back to '/', which would
465 	    // make us enter in an infinite loop).
466 
467 	    *index = *index + iter;
468 
469 	    return -1;
470 	  }
471 
472 	bool isLower =(curChar.category() == QChar::Letter_Lowercase);
473 
474 	if(iter == 0)
475 	  {
476 	    if (isLower)
477 	      {
478 		qDebug() << __FILE__ << __LINE__
479 			 << "First character of monomer code might not be"
480 			 << "lower case; sequence is"
481 			 << m_monomerText;
482 
483 		*err = curChar;
484 
485 		return -1;
486 	      }
487 	    else
488 	      {
489 		// Good, first char is uppercase.
490 		newCode += curChar;
491 	      }
492 	  }
493 	else //(iter != 0)
494 	  {
495 	    // We are not in our first iteration. So either the current
496 	    // character is lowercase and we are just continuing to
497 	    // iterate into a multi-char monomer code, or the current
498 	    // character is uppercase, in which case we are starting to
499 	    // iterate in a new monomer code.
500 
501 	    if (isLower)
502 	      newCode += curChar;
503 	    else
504 	      {
505 		// Decrement iter, because this round was for nothing:
506 		// we had "invaded" the next monomer code in sequence,
507 		// which we must not do.
508 
509 		--iter;
510 		break;
511 	      }
512 	  }
513 
514 	++iter;
515       }
516 
517     // We finished parsing at most codeLength characters out of
518     // 'm_monomerText', so we have a valid code in the 'code' variable. We
519     // can also compute a new index position in the sequence and return
520     // the number of characters that we effectively parsed. Note that
521     // the caller will be responsible for incrementing the 'index' value
522     // by one character unit so as not to reparse the last characters of
523     // the sent 'code' object.
524 
525     *index = *index + iter;
526     *code = newCode;
527     err->clear();
528 
529     return code->length();
530   }
531 
532 
533 
534   // Returns -1 if an error was encountered, 0 if no match could be
535   // found, 1 if a match was found.
536   bool
findForwardMotif(Sequence * motif,const PolChemDef * polChemDef,int * index)537   Sequence::findForwardMotif(Sequence *motif,
538 			      const PolChemDef *polChemDef,
539 			      int *index)
540   {
541     Q_ASSERT(motif);
542     Q_ASSERT(polChemDef);
543     Q_ASSERT(index);
544 
545     if (*index < 0)
546       return -1;
547     if (*index >= size())
548       return -1;
549 
550     int motifSize = motif->size();
551 
552     // If motif's length is 0, then nothing to search for, return
553     // unmodified 'index'.
554     if (!motifSize)
555       return 0;
556 
557     // Simple optimization, if index + size of motif is greater then
558     // size of sequence, return right away.
559     if (*index + motifSize >= size())
560       return 0;
561 
562     // First, make a monomerList.
563     if (motif->makeMonomerList(polChemDef) == -1)
564       return -1;
565 
566     // Compare *this sequence with the one in 'motif', starting at index
567     // 'index' in *this sequence and 0 in 'motif'.
568 
569     bool matched = false;
570     int matchIndex = 0;
571 
572     for (int iter = *index; iter < size(); ++iter)
573       {
574 	matched = false;
575 	int jter = 0;
576 
577 	const Monomer *monomer = at(iter);
578 	const Monomer *motifMonomer = motif->at(jter);
579 
580 	// We do not compare with operator == because that comparison
581 	// would involve the comparison of modifications inside the
582 	// monomers, which would not work here.
583 	if(monomer->code() != motifMonomer->code())
584 	  continue;
585 
586 	// An easy check is to see if the number of remaining monomers
587 	// in the polymer sequence is compatible with the number of
588 	// monomers still to be matched in the find array.  Imagine the
589 	// sequence of the polymer ends like this: ==========JTOUTVU and
590 	// the sequence to be searched for is : TVUL What we see is that
591 	// the T of the TVU of the sequence matches; however we can stop
592 	// the search right away because there is a 'L' in the search
593 	// pattern that is not present in the end part of the
594 	// sequence. This is exactly what is checked below.  Note that
595 	// this check makes SURE that at the end of the second inner
596 	// loop, when we get out of it, the sole reason we may not
597 	// consider that the match did not occur is because actually two
598 	// monomers differred and not because anybody came out of the
599 	// borders of the sequence in neither the array of the sequence
600 	// to be searched, nor the array of the polymer sequence. This
601 	// makes it very easy to assess if a match occurred or not.
602 
603 	if(size() - iter < motif->size() - jter)
604 	  {
605 	    // Note that if it were ==, then it would have been possible
606 	    // that the sequence "just-in-time" match prior to ending of
607 	    // the polymer sequence array. Do not forget that we are in
608 	    // forward mode, thus we can break immediately, because we
609 	    // are certain that we won't have any chance to find the
610 	    // sequence downstream of current index.
611 
612 	    matched = false;
613 	    break;
614 	  }
615 
616 	matchIndex = iter;
617 
618 	// We have to set the matched boolean to true, because if the
619 	// motif to find is one monomer-long, then the loop below will
620 	// not be entered, and we'll fail to know that the match
621 	// occurred later on.
622 	matched = true;
623 
624 	// Now that we have our anchoring point in the *this sequence,
625 	// let's iterate in the motif, and check if the identity in
626 	// sequence goes along.
627 
628 	for(int kter = jter + 1 ; kter < motif->size() ; ++kter)
629 	  {
630 	    // At first run in this loop, we are in the second cell of
631 	    // the find list, which means that we should have jter ==
632 	    // 1. And we should compare its contents with those of the
633 	    // cell in the sequence list at index(iter + jter).
634 
635 	    monomer = at(iter + kter);
636 	    motifMonomer = motif->at(kter);
637 
638 	    // We do not compare with operator == because that
639 	    // comparison would involve the comparison of modifications
640 	    // inside the monomers, which would not work here.
641 	    if (monomer->code() == motifMonomer->code())
642 	      {
643 		// The monomers still match.
644 		matched = true;
645 		continue;
646 	      }
647 	    else
648 	      {
649 		matched = false;
650 		break;
651 	      }
652 	  }
653 	// End of
654 	// for (int kter = jter + 1 ; kter < motif->size() ; ++kter)
655 
656 	// At this point, we either have normally extinguished the run
657 	// in the inner loop, or we have gone out of it before its
658 	// normal termination. In either case, we have to test if the
659 	// match occurred or not.
660 
661 	// Check if the match did NOT occur:
662 
663 	if(!matched)
664 	  {
665 	    // We just continue with the outer loop, that is we continue
666 	    // searching in the polymer sequence for a match with the
667 	    // first monomer in the motif.
668 
669 	    continue;
670 	  }
671 	else
672 	  {
673 	    // The match indeed occurred.
674 
675 	    *index = matchIndex;
676 	    return 1;
677 	  }
678       }
679     // End of
680     // for (int iter = *index; iter < size(); ++iter)
681 
682 
683     // No match could be achieved, we have to let the caller function
684     // know this in a durable manner : returning 0.
685 
686     return 0;
687   }
688 
689 
690 
691   //! Returns the monomer at index \p index.
692   /*!
693 
694     \param index index of the monomer to return in the list of
695     monomers. Must comply with the boundaries of the monomer list(that is
696     be >= 0 and < list.size()).
697 
698     \return a pointer to the monomer.
699   */
700   const Monomer *
at(int index) const701   Sequence::at(int index) const
702   {
703 //     qDebug() << __FILE__ << __LINE__ << "In call at() with value:"
704 // 	      << index ;
705 
706     if (index < 0)
707       qFatal("%s@%d -- Index cannot be less than 0.",
708 	     __FILE__, __LINE__);
709 
710     if (index > m_monomerList.size())
711       qFatal("%s@%d -- Index cannot be greater than polymer size.",
712 	     __FILE__, __LINE__);
713 
714     return m_monomerList.at(index);
715   }
716 
717 
718   int
monomerIndex(const Monomer * monomer)719   Sequence::monomerIndex(const Monomer *monomer)
720   {
721     for (int iter = 0; iter < m_monomerList.size(); ++iter)
722       {
723 	if(m_monomerList.at(iter) == monomer)
724 	  return iter;
725       }
726 
727     return -1;
728   }
729 
730 
731   //! Inserts the monomer at index \p index.
732   /*! Assertions ensure that \p index is not less than 0 and not greater
733     than sequence size as reported by size().
734 
735     This means that a monomer can only be inserted from a sequence if
736     the sequence is at least in the form of a list of monomers.
737 
738     \param monomer dynamically allocated monomer. Assertion insures that
739     this pointer is non-0.
740 
741     \param index Index of monomer to insert.
742 
743     \return Always true.
744   */
745   bool
insertMonomerAt(const Monomer * monomer,int index)746   Sequence::insertMonomerAt(const Monomer *monomer, int index)
747   {
748     Q_ASSERT(monomer);
749     Q_ASSERT(index > -1 && index <= size());
750 
751     m_monomerList.insert(index, monomer);
752 
753     return true;
754   }
755 
756 
757   bool
prepareMonomerRemoval(const Monomer * monomer)758   Sequence::prepareMonomerRemoval(const Monomer *monomer)
759   {
760     return true;
761   }
762 
763 
764   //! Removes the monomer at index \p index.
765   /*! Assertions ensure that \p index is not less than 0 and not equal
766     or greater than sequence size as reported by size().
767 
768     This means that a monomer can only be removed from a sequence if the
769     sequence is at least in the form of a list of monomers.
770 
771     \param index Index of monomer to remove.
772 
773     \return Always true.
774   */
775   bool
removeMonomerAt(int index)776   Sequence::removeMonomerAt(int index)
777   {
778     Q_ASSERT(index > -1);
779     Q_ASSERT(index < size());
780 
781     const Monomer *monomer = at(index);
782 
783     if (!prepareMonomerRemoval(monomer))
784       return false;
785 
786     m_monomerList.removeAt(index);
787 
788     delete monomer;
789 
790     return true;
791   }
792 
793 
794   bool
validate(const PolChemDef * polChemDef)795   Sequence::validate(const PolChemDef *polChemDef)
796   {
797     Q_ASSERT(polChemDef);
798 
799     if (makeMonomerList(polChemDef) > - 1)
800       return true;
801 
802     return false;
803   }
804 
805   quint16
checksum(int startIdx,int endIdx,bool withModifs) const806   Sequence::checksum(int startIdx, int endIdx, bool withModifs) const
807   {
808     if (!size())
809       return 0;
810 
811     QString *text = monomerText(startIdx, endIdx, withModifs);
812 
813     QByteArray bytes = text->toUtf8();
814 
815     quint16 checksum = qChecksum(bytes.data(),  bytes.size());
816 
817 //     qDebug() << __FILE__ << __LINE__
818 // 	     << "checksum:" << checksum;
819 
820     return checksum;
821   }
822 
823 
824 } // namespace massXpert
825