1 // -*- C++ -*-
2
3 /*
4 * Gnome Chemistry Utils
5 * formula.cc
6 *
7 * Copyright (C) 2005-2008 Jean Bréfort <jean.brefort@normalesup.org>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 3 of the
12 * License, or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 * USA
23 */
24
25 #include "config.h"
26 #include "formula.h"
27 #include "element.h"
28 #include "residue.h"
29 #include "document.h"
30 #include "molecule.h"
31 #include <glib/gi18n.h>
32 #include <cctype>
33 #include <cmath>
34 #include <cstdlib>
35 #include <cstring>
36 #include <sstream>
37
38 using namespace std;
39
40 namespace gcu
41 {
42
parse_error(const string & __arg,int start,int length)43 parse_error::parse_error(const string& __arg, int start, int length)
44 : exception(), m_msg(__arg)
45 {
46 m_start = start;
47 m_length = length;
48 }
49
~parse_error()50 parse_error::~parse_error() throw() { }
51
52 const char*
what() const53 parse_error::what() const throw()
54 { return m_msg.c_str(); }
55
56 const char*
what(int & start,int & length) const57 parse_error::what(int &start, int &length) const throw()
58 {
59 start = m_start;
60 length = m_length;
61 return m_msg.c_str();
62 }
63
FormulaElt()64 FormulaElt::FormulaElt ()
65 {
66 stoich = 1;
67 }
68
~FormulaElt()69 FormulaElt::~FormulaElt ()
70 {
71 }
72
Markup()73 string FormulaElt::Markup ()
74 {
75 ostringstream oss;
76 if (stoich > 1)
77 oss << "<sub>" << stoich << "</sub>";
78 return oss.str ();
79 }
80
Text()81 string FormulaElt::Text ()
82 {
83 ostringstream oss;
84 if (stoich > 1)
85 oss << stoich;
86 return oss.str ();
87 }
88
FormulaAtom(int Z)89 FormulaAtom::FormulaAtom (int Z): FormulaElt()
90 {
91 elt = Z;
92 }
93
~FormulaAtom()94 FormulaAtom::~FormulaAtom ()
95 {
96 }
97
Markup()98 string FormulaAtom::Markup ()
99 {
100 string s = Element::Symbol (elt);
101 s += FormulaElt::Markup ();
102 return s;
103 }
104
Text()105 string FormulaAtom::Text ()
106 {
107 string s = Element::Symbol (elt);
108 s += FormulaElt::Text ();
109 return s;
110 }
111
GetValence()112 int FormulaAtom::GetValence ()
113 {
114 return Element::GetElement (elt)->GetDefaultValence ();
115 }
116
BuildRawFormula(map<int,int> & raw)117 void FormulaAtom::BuildRawFormula (map<int, int> &raw)
118 {
119 raw[elt] += stoich;
120 }
121
FormulaBlock()122 FormulaBlock::FormulaBlock (): FormulaElt()
123 {
124 }
125
~FormulaBlock()126 FormulaBlock::~FormulaBlock ()
127 {
128 list<FormulaElt *>::iterator i, end = children.end();
129 for (i = children.begin (); i != end; i++)
130 delete *i;
131 }
132
Markup()133 string FormulaBlock::Markup ()
134 {
135 ostringstream oss;
136 switch (parenthesis) {
137 case 0:
138 oss << "(";
139 break;
140 case 1:
141 oss << "[";
142 break;
143 case 2:
144 oss << "{";
145 break;
146 }
147 list<FormulaElt *>::iterator i, end = children.end();
148 for (i = children.begin (); i != end; i++) {
149 oss << (*i)->Markup ();
150 }
151 switch (parenthesis) {
152 case 0:
153 oss << ")";
154 break;
155 case 1:
156 oss << "]";
157 break;
158 case 2:
159 oss << "}";
160 break;
161 }
162 oss << FormulaElt::Markup ();
163 return oss.str ();
164 }
165
Text()166 string FormulaBlock::Text ()
167 {
168 ostringstream oss;
169 switch (parenthesis) {
170 case 0:
171 oss << "(";
172 break;
173 case 1:
174 oss << "[";
175 break;
176 case 2:
177 oss << "{";
178 break;
179 }
180 list<FormulaElt *>::iterator i, end = children.end();
181 for (i = children.begin (); i != end; i++) {
182 oss << (*i)->Text ();
183 }
184 switch (parenthesis) {
185 case 0:
186 oss << ")";
187 break;
188 case 1:
189 oss << "]";
190 break;
191 case 2:
192 oss << "}";
193 break;
194 }
195 oss << FormulaElt::Text ();
196 return oss.str ();
197 }
198
BuildRawFormula(map<int,int> & raw)199 void FormulaBlock::BuildRawFormula (map<int, int> &raw)
200 {
201 map<int, int> local;
202 list<FormulaElt *>::iterator i, iend = children.end();
203 for (i = children.begin (); i != iend; i++)
204 (*i)->BuildRawFormula (local);
205 map<int, int>::iterator j, jend = local.end();
206 for (j = local.begin (); j != jend; j++){
207 raw[(*j).first] += stoich * (*j).second;}
208 }
209
GetValence()210 int FormulaBlock::GetValence ()
211 {
212 return -1; // FIXME !!!
213 }
214
FormulaResidue(Residue const * res,char const * symbol,int Z)215 FormulaResidue::FormulaResidue (Residue const *res, char const *symbol, int Z): FormulaElt()
216 {
217 residue = res;
218 Symbol = symbol;
219 m_Z = Z;
220 }
221
~FormulaResidue()222 FormulaResidue::~FormulaResidue ()
223 {
224 }
225
Markup()226 string FormulaResidue::Markup ()
227 {
228 size_t n = Symbol.find ('-');
229 string s = (n != string::npos)? string ("<i>") + string (Symbol, 0, n) + "</i>" + string (Symbol, n): Symbol;
230 s += FormulaElt::Text ();
231 return s;
232 }
233
Text()234 string FormulaResidue::Text ()
235 {
236 string s = Symbol;
237 s += FormulaElt::Text ();
238 return s;
239 }
240
BuildRawFormula(map<int,int> & raw)241 void FormulaResidue::BuildRawFormula (map<int, int> &raw)
242 {
243 std::map<int,int> const &m = residue->GetRawFormula ();
244 map<int, int>::const_iterator j, jend = m.end();
245 for (j = m.begin (); j != jend; j++){
246 raw[(*j).first] += stoich * (*j).second;}
247 }
248
GetValence()249 int FormulaResidue::GetValence ()
250 {
251 return 1; // residues with other valences are not currently supported
252 }
253
AnalString(char * sz,list<FormulaElt * > & result,bool & ambiguous,int offset)254 bool Formula::AnalString (char *sz, list<FormulaElt *> &result, bool &ambiguous, int offset)
255 {
256 if (*sz == 0)
257 return true;
258 unsigned i = 0;
259 char sy[Residue::MaxSymbolLength + 1];
260 Residue const *r = NULL;
261 bool amb = ambiguous, local_amb;
262 if (*sz) {
263 // search for any abbreviation starting sz
264 strncpy (sy, sz, Residue::MaxSymbolLength);
265 i = strlen (sz);
266 if (i > Residue::MaxSymbolLength)
267 i = Residue::MaxSymbolLength;
268 while (i > 0) {
269 sy[i] = 0;
270 r = Residue::GetResidue (sy, &local_amb);
271 if (r)
272 break;
273 // don't search for a shorter residue symbol if the string represents an atom
274 if (Element::Z (sy) > 0)
275 break;
276 i--;
277 }
278 if (r) {
279 FormulaResidue *fr = new FormulaResidue (r, sy, (local_amb? Element::Z (sy): 0));
280 fr->start = offset;
281 fr->end = offset + i - 1;
282 result.push_back (fr);
283 ambiguous |= local_amb;
284 if (AnalString (sz + i, result, ambiguous, offset + i))
285 return true;
286 ambiguous = amb; // restore ambiguity state
287 delete result.back ();
288 result.pop_back ();
289 }
290 if (islower (*sz)) {
291 /* we might have some abbreviation around there */
292 }
293 if (!(m_ParseMode & GCU_FORMULA_PARSE_NO_CASE))
294 *sz = toupper (*sz);
295 if (strlen (sz) == 1) {
296 i = Element::Z (sz);
297 if (i > 0) {
298 FormulaAtom *fa = new FormulaAtom (i);
299 fa->start = offset;
300 fa->end = offset + 1;
301 result.push_back (fa);
302 return true;
303 } else
304 return false;
305 }
306 if (isupper (sz[1])) {
307 sy [0] = *sz;
308 sy [1] = 0;
309 i = Element::Z (sy);
310 if (i > 0) {
311 FormulaAtom *fa = new FormulaAtom (i);
312 fa->start = offset;
313 fa->end = offset + 1;
314 result.push_back (fa);
315 if (AnalString (sz + 1, result, ambiguous, offset + 1))
316 return true;
317 delete result.back ();
318 result.pop_back ();
319 }
320 if (!(m_ParseMode & GCU_FORMULA_PARSE_NO_CASE))
321 sy[1] = tolower (sz[1]);
322 sy[2] = 0;
323 i = Element::Z (sy);
324 if (i > 0) {
325 FormulaAtom *fa = new FormulaAtom (i);
326 fa->start = offset;
327 fa->end = offset + 2;
328 result.push_back (fa);
329 if (AnalString (sz + 2, result, ambiguous, offset + 2))
330 return true;
331 delete result.back ();
332 result.pop_back ();
333 }
334 if (*sz != 'U')
335 return false;
336 if (!(m_ParseMode & GCU_FORMULA_PARSE_NO_CASE))
337 sy[2] = tolower (sz[2]);
338 sy[3] = 0;
339 i = Element::Z (sy);
340 if (i > 0) {
341 FormulaAtom *fa = new FormulaAtom (i);
342 fa->start = offset;
343 fa->end = offset + 3;
344 result.push_back (fa);
345 if (AnalString (sz + 3, result, ambiguous, offset + 3))
346 return true;
347 }
348 return false;
349 } else {
350 sy[0] = sz[0];
351 sy[1] = sz[1];
352 if (*sz == 'U') {
353 // No 2 chars symbols begining with U exist, so try 3 chars symbols
354 if (!(m_ParseMode & GCU_FORMULA_PARSE_NO_CASE))
355 sy[2] = tolower (sz[2]);
356 sy[3] = 0;
357 i = Element::Z (sy);
358 if (i > 0) {
359 FormulaAtom *fa = new FormulaAtom (i);
360 fa->start = offset;
361 fa->end = offset + 3;
362 result.push_back (fa);
363 if (AnalString (sz + 3, result, ambiguous, offset + 3))
364 return true;
365 delete result.back ();
366 result.pop_back ();
367 }
368 }
369 sy[2] = 0;
370 i = Element::Z (sy);
371 if (i > 0) {
372 FormulaAtom *fa = new FormulaAtom (i);
373 fa->start = offset;
374 fa->end = offset + 2;
375 result.push_back (fa);
376 if (AnalString (sz + 2, result, ambiguous, offset + 2))
377 return true;
378 delete result.back ();
379 result.pop_back ();
380 }
381 sy[1] = 0;
382 i = Element::Z (sy);
383 if (i > 0) {
384 FormulaAtom *fa = new FormulaAtom (i);
385 fa->start = offset;
386 fa->end = offset + 1;
387 result.push_back (fa);
388 if (AnalString (sz + 1, result, ambiguous, offset + 1))
389 return true;
390 }
391 }
392 }
393 return false;
394 }
395
Formula(string entry,FormulaParseMode mode)396 Formula::Formula (string entry, FormulaParseMode mode) throw (parse_error)
397 {
398 m_ParseMode = mode;
399 SetFormula (entry);
400 }
401
~Formula()402 Formula::~Formula ()
403 {
404 Clear ();
405 }
406
GetMarkup()407 char const *Formula::GetMarkup ()
408 {
409 return Markup.c_str ();
410 }
411
GetRawFormula()412 map<int,int> &Formula::GetRawFormula ()
413 {
414 return Raw;
415 }
416
GetRawMarkup()417 char const *Formula::GetRawMarkup ()
418 {
419 return RawMarkup.c_str ();
420 }
421
SetFormula(string entry)422 void Formula::SetFormula (string entry) throw (parse_error)
423 {
424 Entry = entry;
425 Clear ();
426 Parse (Entry, Details);
427 list<FormulaElt *>::iterator i, iend = Details.end();
428 // now update markups and raw formula
429 for (i = Details.begin (); i != iend; i++) {
430 Markup += (*i)->Markup ();
431 (*i)->BuildRawFormula (Raw);
432 }
433 ostringstream oss;
434 map<string, int> elts;
435 int nC = 0, nH = 0;
436 map<int, int>::iterator j, jend = Raw.end();
437 for (j = Raw.begin (); j != jend; j++) {
438 switch ((*j).first) {
439 case 1:
440 nH = (*j).second;
441 break;
442 case 6:
443 nC = (*j).second;
444 break;
445 default:
446 elts[Element::Symbol((*j).first)] = (*j).second;
447 break;
448 }
449 }
450 if (nC > 0) {
451 oss << "C";
452 if (nC > 1)
453 oss << "<sub>" << nC << "</sub>";
454 }
455 if (nH > 0) {
456 oss << "H";
457 if (nH > 1)
458 oss << "<sub>" << nH << "</sub>";
459 }
460 map<string, int>::iterator k, kend = elts.end ();
461 for (k = elts.begin (); k != kend; k++) {
462 nC = (*k).second;
463 oss << (*k).first;
464 if (nC > 1)
465 oss << "<sub>" << nC << "</sub>";
466 }
467 RawMarkup = oss.str ();
468 }
469
Clear()470 void Formula::Clear ()
471 {
472 list<FormulaElt *>::iterator i, end = Details.end();
473 for (i = Details.begin (); i != end; i++)
474 delete *i;
475 Details.clear ();
476 Markup = "";
477 Raw.clear ();
478 RawMarkup = "";
479 m_ConnectivityCached = m_WeightCached = false;
480 }
481
Parse(string & formula,list<FormulaElt * > & result)482 void Formula::Parse (string &formula, list<FormulaElt *> &result) throw (parse_error)
483 {
484 int i = 0, npo, size = formula.size (), j, k = 0; // parsing index, number of open parenthesis, string size
485 char c = 0, *sz, *end;
486 bool ambiguous = false;
487 while (i < size) {
488 if (formula[i] == '(' || formula[i] == '[' || formula[i] == '{') {
489 switch (formula[i]) {
490 case '(':
491 c = ')';
492 k = 0;
493 break;
494 case '[':
495 c = ']';
496 k = 1;
497 break;
498 case '{':
499 c = '}';
500 k = 2;
501 break;
502 }
503 npo = 1;
504 j = i + 1;
505 while (j < size && npo > 0) {
506 if (formula[j] == '(' || formula[j] == '[' || formula[j] == '{')
507 npo++;
508 else if (formula[j] == ')' || formula[j] == ']' || formula[j] == '}')
509 npo--;
510 j++;
511 }
512 if (npo || formula[j - 1] != c)
513 throw parse_error (_("Unmatched parenthesis"), i, 1);
514 string str (formula, i + 1, j - i - 2);
515 FormulaBlock *block = new FormulaBlock ();
516 block->parenthesis = k;
517 block->start = i;
518 block->end = j;
519 result.push_back (block);
520 try {
521 Parse (str, block->children);
522 }
523 catch (parse_error &error) {
524 error.add_offset (i + 1);
525 throw error;
526 }
527 block->stoich = strtol (formula.c_str () + j, &end, 10);
528 i = end - formula.c_str ();
529 if (i == j)
530 block->stoich = 1;
531 } else if (isalpha (formula[i]) || formula[i] == '-') {
532 j = i + 1;
533 while (isalpha (formula[j]) || formula[j] == '-')
534 j++;
535 k = j - i;
536 sz = new char[k + 1];
537 strncpy (sz, formula.c_str () + i, k);
538 sz[k] = 0;
539 if (!AnalString (sz, result, ambiguous, i)) {
540 delete [] sz;
541 throw parse_error (_("Could not interpret the symbol list"), i, k);
542 }
543 delete [] sz;
544 i = j;
545 FormulaElt *elt = result.back ();
546 if (!elt)
547 throw runtime_error (_("Parser failed, please fill a bug report."));
548 elt->stoich = strtol (formula.c_str () + j, &end, 10);
549 i = end - formula.c_str ();
550 if (i == j)
551 elt->stoich = 1;
552 } else if (formula[i] == ')' || formula[i] == ']' || formula[i] == '}') {
553 throw parse_error (_("Unmatched parenthesis"), i, 1);
554 } else
555 throw parse_error (_("Invalid character"), i, 1);
556 }
557 if (ambiguous) {
558 switch (m_ParseMode & 7) {
559 case GCU_FORMULA_PARSE_GUESS: {
560 // if it fails, nothing is replaced
561 if (!TryReplace (result, result.begin ()))
562 g_warning ("ambiguous formula");
563 break;
564 }
565 case GCU_FORMULA_PARSE_ATOM: {
566 // replace all ambiguous residues
567 list<FormulaElt *>::iterator it = result.begin ();
568 FormulaResidue *res;
569 while (it != result.end ()) {
570 res = dynamic_cast <FormulaResidue *> (*it);
571 if (res && res->GetZ ()) {
572 FormulaAtom *elt = new FormulaAtom (res->GetZ());
573 elt->stoich = res->stoich;
574 it = result.erase (it);
575 delete res;
576 it = result.insert (it, elt);
577 } else
578 it++;
579 }
580 break;
581 }
582 case GCU_FORMULA_PARSE_RESIDUE:
583 // don't do anything
584 break;
585 case GCU_FORMULA_PARSE_ASK: {
586 // FIXME: really ask
587 break;
588 }
589 }
590 }
591 }
592
GetMolecularWeight(bool & artificial)593 DimensionalValue Formula::GetMolecularWeight (bool &artificial)
594 {
595 if (Raw.size () == 0) {
596 return m_Weight;
597 }
598 if (!m_WeightCached) {
599 DimensionalValue atom_weight;
600 m_Artificial = false; // most formula don't have artificial elements
601 map<int,int>::iterator i, end = Raw.end (), begin = Raw.begin ();
602 for (i = begin; i != end; i++) {
603 atom_weight = *Element::GetElement ((*i).first)->GetWeight ();
604 if (atom_weight.GetValue ().prec == 0)
605 m_Artificial = true;
606 m_Weight = (i == begin)? atom_weight * (*i).second: m_Weight + atom_weight * (*i).second;
607 }
608 }
609 m_WeightCached = true;
610 artificial = m_Artificial;
611 return m_Weight;
612 }
613
CalculateIsotopicPattern(IsotopicPattern & pattern)614 void Formula::CalculateIsotopicPattern (IsotopicPattern &pattern)
615 {
616 map<int,int>::iterator i, end = Raw.end ();
617 i = Raw.begin ();
618 if (i == end) // empty formula
619 return;
620 IsotopicPattern *pat = NULL, *pat0;
621 while (!pat && (i != end)) {
622 pat = Element::GetElement ((*i).first)->GetIsotopicPattern ((*i).second);
623 i++;
624 }
625 if (!pat)
626 return;
627 pattern.Copy (*pat);
628 pat->Unref ();
629 for (; i != end; i++) {
630 pat = Element::GetElement ((*i).first)->GetIsotopicPattern ((*i).second);
631 if (!pat) {
632 // no stable isotope known for the element
633 pattern.Clear ();
634 return;
635 }
636 pat0 = pattern.Multiply (*pat);
637 pat->Unref ();
638 pat = pat0->Simplify ();
639 pattern.Copy (*pat);
640 pat0->Unref ();
641 pat->Unref ();
642 }
643 }
644
BuildConnectivity()645 bool Formula::BuildConnectivity ()
646 {
647 Document *Doc = new Document (NULL);
648 Molecule *mol = Molecule::MoleculeFromFormula (Doc, *this, false);
649 bool result = mol;
650 delete Doc;
651 return result;
652 }
653
TryReplace(list<FormulaElt * > & result,list<FormulaElt * >::iterator it)654 bool Formula::TryReplace (list<FormulaElt *> &result, list<FormulaElt *>::iterator it)
655 {
656 if (BuildConnectivity ())
657 return true;
658 FormulaResidue *res;
659 while (it != result.end ()) {
660 res = dynamic_cast <FormulaResidue *> (*it);
661 if (res && res->GetZ ())
662 break;
663 it++;
664 }
665 if (it == result.end ())
666 return false;
667 list<FormulaElt *>::iterator next = it;
668 next++;
669 if (TryReplace (result, next))
670 return true;
671 FormulaAtom *elt = new FormulaAtom (res->GetZ());
672 elt->stoich = res->stoich;
673 it = result.erase (it);
674 result.insert (it, elt);
675 next = it;
676 next++;
677 bool ret = TryReplace (result, next);
678 if (!ret) {
679 it--;
680 delete (*it);
681 it = result.erase (it);
682 result.insert (it, res);
683 } else
684 delete res;
685 return ret;
686 }
687
688 } // namespace gcu
689