1 // -*- C++ -*-
2 
3 /*
4  * Gnome Chemistry Utils
5  * formula.cc
6  *
7  * Copyright (C) 2005-2008 Jean Bréfort <jean.brefort@normalesup.org>
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License as
11  * published by the Free Software Foundation; either version 3 of the
12  * License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
22  * USA
23  */
24 
25 #include "config.h"
26 #include "formula.h"
27 #include "element.h"
28 #include "residue.h"
29 #include "document.h"
30 #include "molecule.h"
31 #include <glib/gi18n.h>
32 #include <cctype>
33 #include <cmath>
34 #include <cstdlib>
35 #include <cstring>
36 #include <sstream>
37 
38 using namespace std;
39 
40 namespace gcu
41 {
42 
parse_error(const string & __arg,int start,int length)43 parse_error::parse_error(const string& __arg, int start, int length)
44   : exception(), m_msg(__arg)
45 {
46 	m_start = start;
47 	m_length = length;
48 }
49 
~parse_error()50 parse_error::~parse_error() throw() { }
51 
52 const char*
what() const53 parse_error::what() const throw()
54 { return m_msg.c_str(); }
55 
56 const char*
what(int & start,int & length) const57 parse_error::what(int &start, int &length) const throw()
58 {
59 	start = m_start;
60 	length = m_length;
61 	return m_msg.c_str();
62 }
63 
FormulaElt()64 FormulaElt::FormulaElt ()
65 {
66 	stoich = 1;
67 }
68 
~FormulaElt()69 FormulaElt::~FormulaElt ()
70 {
71 }
72 
Markup()73 string FormulaElt::Markup ()
74 {
75 	ostringstream oss;
76 	if (stoich > 1)
77 		oss << "<sub>" << stoich << "</sub>";
78 	return oss.str ();
79 }
80 
Text()81 string FormulaElt::Text ()
82 {
83 	ostringstream oss;
84 	if (stoich > 1)
85 		oss << stoich;
86 	return oss.str ();
87 }
88 
FormulaAtom(int Z)89 FormulaAtom::FormulaAtom (int Z): FormulaElt()
90 {
91 	elt = Z;
92 }
93 
~FormulaAtom()94 FormulaAtom::~FormulaAtom ()
95 {
96 }
97 
Markup()98 string FormulaAtom::Markup ()
99 {
100 	string s = Element::Symbol (elt);
101 	s += FormulaElt::Markup ();
102 	return s;
103 }
104 
Text()105 string FormulaAtom::Text ()
106 {
107 	string s = Element::Symbol (elt);
108 	s += FormulaElt::Text ();
109 	return s;
110 }
111 
GetValence()112 int FormulaAtom::GetValence ()
113 {
114 	return Element::GetElement (elt)->GetDefaultValence ();
115 }
116 
BuildRawFormula(map<int,int> & raw)117 void FormulaAtom::BuildRawFormula (map<int, int> &raw)
118 {
119 	raw[elt] += stoich;
120 }
121 
FormulaBlock()122 FormulaBlock::FormulaBlock (): FormulaElt()
123 {
124 }
125 
~FormulaBlock()126 FormulaBlock::~FormulaBlock ()
127 {
128 	list<FormulaElt *>::iterator i, end = children.end();
129 	for (i = children.begin (); i != end; i++)
130 		delete *i;
131 }
132 
Markup()133 string FormulaBlock::Markup ()
134 {
135 	ostringstream oss;
136 	switch (parenthesis) {
137 		case 0:
138 			oss << "(";
139 			break;
140 		case 1:
141 			oss << "[";
142 			break;
143 		case 2:
144 			oss << "{";
145 			break;
146 	}
147 	list<FormulaElt *>::iterator i, end = children.end();
148 	for (i = children.begin (); i != end; i++) {
149 		oss << (*i)->Markup ();
150 	}
151 	switch (parenthesis) {
152 		case 0:
153 			oss << ")";
154 			break;
155 		case 1:
156 			oss << "]";
157 			break;
158 		case 2:
159 			oss << "}";
160 			break;
161 	}
162 	oss << FormulaElt::Markup ();
163 	return oss.str ();
164 }
165 
Text()166 string FormulaBlock::Text ()
167 {
168 	ostringstream oss;
169 	switch (parenthesis) {
170 		case 0:
171 			oss << "(";
172 			break;
173 		case 1:
174 			oss << "[";
175 			break;
176 		case 2:
177 			oss << "{";
178 			break;
179 	}
180 	list<FormulaElt *>::iterator i, end = children.end();
181 	for (i = children.begin (); i != end; i++) {
182 		oss << (*i)->Text ();
183 	}
184 	switch (parenthesis) {
185 		case 0:
186 			oss << ")";
187 			break;
188 		case 1:
189 			oss << "]";
190 			break;
191 		case 2:
192 			oss << "}";
193 			break;
194 	}
195 	oss << FormulaElt::Text ();
196 	return oss.str ();
197 }
198 
BuildRawFormula(map<int,int> & raw)199 void FormulaBlock::BuildRawFormula (map<int, int> &raw)
200 {
201 	map<int, int> local;
202 	list<FormulaElt *>::iterator i, iend = children.end();
203 	for (i = children.begin (); i != iend; i++)
204 		(*i)->BuildRawFormula (local);
205 	map<int, int>::iterator j, jend = local.end();
206 	for (j = local.begin (); j != jend; j++){
207 		raw[(*j).first] += stoich * (*j).second;}
208 }
209 
GetValence()210 int FormulaBlock::GetValence ()
211 {
212 	return -1; // FIXME !!!
213 }
214 
FormulaResidue(Residue const * res,char const * symbol,int Z)215 FormulaResidue::FormulaResidue (Residue const *res, char const *symbol, int Z): FormulaElt()
216 {
217 	residue = res;
218 	Symbol = symbol;
219 	m_Z = Z;
220 }
221 
~FormulaResidue()222 FormulaResidue::~FormulaResidue ()
223 {
224 }
225 
Markup()226 string FormulaResidue::Markup ()
227 {
228 	size_t n = Symbol.find ('-');
229 	string s = (n != string::npos)? string ("<i>") + string (Symbol, 0, n) + "</i>" + string (Symbol, n): Symbol;
230 	s += FormulaElt::Text ();
231 	return s;
232 }
233 
Text()234 string FormulaResidue::Text ()
235 {
236 	string s = Symbol;
237 	s += FormulaElt::Text ();
238 	return s;
239 }
240 
BuildRawFormula(map<int,int> & raw)241 void FormulaResidue::BuildRawFormula (map<int, int> &raw)
242 {
243 	std::map<int,int> const &m = residue->GetRawFormula ();
244 	map<int, int>::const_iterator j, jend = m.end();
245 	for (j = m.begin (); j != jend; j++){
246 		raw[(*j).first] += stoich * (*j).second;}
247 }
248 
GetValence()249 int FormulaResidue::GetValence ()
250 {
251 	return 1; // residues with other valences are not currently supported
252 }
253 
AnalString(char * sz,list<FormulaElt * > & result,bool & ambiguous,int offset)254 bool Formula::AnalString (char *sz, list<FormulaElt *> &result, bool &ambiguous, int offset)
255 {
256 	if (*sz == 0)
257 		return true;
258 	unsigned i = 0;
259 	char sy[Residue::MaxSymbolLength + 1];
260 	Residue const *r = NULL;
261 	bool amb = ambiguous, local_amb;
262 	if (*sz) {
263 		// search for any abbreviation starting sz
264 		strncpy (sy, sz, Residue::MaxSymbolLength);
265 		i = strlen (sz);
266 		if (i > Residue::MaxSymbolLength)
267 			i = Residue::MaxSymbolLength;
268 		while (i > 0) {
269 			sy[i] = 0;
270 			r = Residue::GetResidue (sy, &local_amb);
271 			if (r)
272 				break;
273 			// don't search for a shorter residue symbol if the string represents an atom
274 			if (Element::Z (sy) > 0)
275 				break;
276 			i--;
277 		}
278 		if (r) {
279 			FormulaResidue *fr = new FormulaResidue (r, sy, (local_amb? Element::Z (sy): 0));
280 			fr->start = offset;
281 			fr->end = offset + i - 1;
282 			result.push_back (fr);
283 			ambiguous |= local_amb;
284 			if (AnalString (sz + i, result, ambiguous, offset + i))
285 				return true;
286 			ambiguous = amb; // restore ambiguity state
287 			delete result.back ();
288 			result.pop_back ();
289 		}
290 		if (islower (*sz)) {
291 			/* we might have some abbreviation around there */
292 		}
293 		if (!(m_ParseMode & GCU_FORMULA_PARSE_NO_CASE))
294 			*sz = toupper (*sz);
295 		if (strlen (sz) == 1) {
296 			i = Element::Z (sz);
297 			if (i > 0) {
298 				FormulaAtom *fa = new FormulaAtom (i);
299 				fa->start = offset;
300 				fa->end = offset + 1;
301 				result.push_back (fa);
302 				return true;
303 			} else
304 				return false;
305 		}
306 		if (isupper (sz[1])) {
307 			sy [0] = *sz;
308 			sy [1] = 0;
309 			i = Element::Z (sy);
310 			if (i > 0) {
311 				FormulaAtom *fa = new FormulaAtom (i);
312 				fa->start = offset;
313 				fa->end = offset + 1;
314 				result.push_back (fa);
315 				if (AnalString (sz + 1, result, ambiguous, offset + 1))
316 					return true;
317 				delete result.back ();
318 				result.pop_back ();
319 			}
320 			if (!(m_ParseMode & GCU_FORMULA_PARSE_NO_CASE))
321 				sy[1] = tolower (sz[1]);
322 			sy[2] = 0;
323 			i = Element::Z (sy);
324 			if (i > 0) {
325 				FormulaAtom *fa = new FormulaAtom (i);
326 				fa->start = offset;
327 				fa->end = offset + 2;
328 				result.push_back (fa);
329 				if (AnalString (sz + 2, result, ambiguous, offset + 2))
330 					return true;
331 				delete result.back ();
332 				result.pop_back ();
333 			}
334 			if (*sz != 'U')
335 				return false;
336 			if (!(m_ParseMode & GCU_FORMULA_PARSE_NO_CASE))
337 				sy[2] = tolower (sz[2]);
338 			sy[3] = 0;
339 			i = Element::Z (sy);
340 			if (i > 0) {
341 				FormulaAtom *fa = new FormulaAtom (i);
342 				fa->start = offset;
343 				fa->end = offset + 3;
344 				result.push_back (fa);
345 				if (AnalString (sz + 3, result, ambiguous, offset + 3))
346 					return true;
347 			}
348 			return false;
349 		} else {
350 			sy[0] = sz[0];
351 			sy[1] = sz[1];
352 			if (*sz == 'U') {
353 				// No 2 chars symbols begining with U exist, so try 3 chars symbols
354 				if (!(m_ParseMode & GCU_FORMULA_PARSE_NO_CASE))
355 					sy[2] = tolower (sz[2]);
356 				sy[3] = 0;
357 				i = Element::Z (sy);
358 				if (i > 0) {
359 					FormulaAtom *fa = new FormulaAtom (i);
360 					fa->start = offset;
361 					fa->end = offset + 3;
362 					result.push_back (fa);
363 					if (AnalString (sz + 3, result, ambiguous, offset + 3))
364 						return true;
365 					delete result.back ();
366 					result.pop_back ();
367 				}
368 			}
369 			sy[2] = 0;
370 			i = Element::Z (sy);
371 			if (i > 0) {
372 				FormulaAtom *fa = new FormulaAtom (i);
373 				fa->start = offset;
374 				fa->end = offset + 2;
375 				result.push_back (fa);
376 				if (AnalString (sz + 2, result, ambiguous, offset + 2))
377 					return true;
378 				delete result.back ();
379 				result.pop_back ();
380 			}
381 			sy[1] = 0;
382 			i = Element::Z (sy);
383 			if (i > 0) {
384 				FormulaAtom *fa = new FormulaAtom (i);
385 				fa->start = offset;
386 				fa->end = offset + 1;
387 				result.push_back (fa);
388 				if (AnalString (sz + 1, result, ambiguous, offset + 1))
389 					return true;
390 			}
391 		}
392 	}
393 	return false;
394 }
395 
Formula(string entry,FormulaParseMode mode)396 Formula::Formula (string entry, FormulaParseMode mode) throw (parse_error)
397 {
398 	m_ParseMode = mode;
399 	SetFormula (entry);
400 }
401 
~Formula()402 Formula::~Formula ()
403 {
404 	Clear ();
405 }
406 
GetMarkup()407 char const *Formula::GetMarkup ()
408 {
409 	return Markup.c_str ();
410 }
411 
GetRawFormula()412 map<int,int> &Formula::GetRawFormula ()
413 {
414 	return Raw;
415 }
416 
GetRawMarkup()417 char const *Formula::GetRawMarkup ()
418 {
419 	return RawMarkup.c_str ();
420 }
421 
SetFormula(string entry)422 void Formula::SetFormula (string entry) throw (parse_error)
423 {
424 	Entry = entry;
425 	Clear ();
426 	Parse (Entry, Details);
427 	list<FormulaElt *>::iterator i, iend = Details.end();
428 	// now update markups and raw formula
429 	for (i = Details.begin (); i != iend; i++) {
430 		Markup += (*i)->Markup ();
431 		(*i)->BuildRawFormula (Raw);
432 	}
433 	ostringstream oss;
434 	map<string, int> elts;
435 	int nC = 0, nH = 0;
436 	map<int, int>::iterator j, jend = Raw.end();
437 	for (j = Raw.begin (); j != jend; j++) {
438 		switch ((*j).first) {
439 		case 1:
440 			nH = (*j).second;
441 			break;
442 		case 6:
443 			nC = (*j).second;
444 			break;
445 		default:
446 			elts[Element::Symbol((*j).first)] = (*j).second;
447 			break;
448 		}
449 	}
450 	if (nC > 0) {
451 		oss << "C";
452 		if (nC > 1)
453 			oss << "<sub>" << nC << "</sub>";
454 	}
455 	if (nH > 0) {
456 		oss << "H";
457 		if (nH > 1)
458 			oss << "<sub>" << nH << "</sub>";
459 	}
460 	map<string, int>::iterator k, kend = elts.end ();
461 	for (k = elts.begin (); k != kend; k++) {
462 		nC = (*k).second;
463 		oss << (*k).first;
464 		if (nC > 1)
465 			oss << "<sub>" << nC << "</sub>";
466 	}
467 	RawMarkup = oss.str ();
468 }
469 
Clear()470 void Formula::Clear ()
471 {
472 	list<FormulaElt *>::iterator i, end = Details.end();
473 	for (i = Details.begin (); i != end; i++)
474 		delete *i;
475 	Details.clear ();
476 	Markup = "";
477 	Raw.clear ();
478 	RawMarkup = "";
479 	m_ConnectivityCached = m_WeightCached = false;
480 }
481 
Parse(string & formula,list<FormulaElt * > & result)482 void Formula::Parse (string &formula, list<FormulaElt *> &result) throw (parse_error)
483 {
484 	int i = 0, npo, size = formula.size (), j, k = 0; // parsing index, number of open parenthesis, string size
485 	char c = 0, *sz, *end;
486 	bool ambiguous = false;
487 	while (i < size) {
488 		if (formula[i] == '(' || formula[i] == '[' || formula[i] == '{') {
489 			switch (formula[i]) {
490 				case '(':
491 					c = ')';
492 					k = 0;
493 					break;
494 				case '[':
495 					c = ']';
496 					k = 1;
497 					break;
498 				case '{':
499 					c = '}';
500 					k = 2;
501 					break;
502 			}
503 			npo = 1;
504 			j = i + 1;
505 			while (j < size && npo > 0) {
506 				if (formula[j] == '(' || formula[j] == '[' || formula[j] == '{')
507 					npo++;
508 				else if (formula[j] == ')' || formula[j] == ']' || formula[j] == '}')
509 					npo--;
510 				j++;
511 			}
512 			if (npo || formula[j - 1] != c)
513 				throw parse_error (_("Unmatched parenthesis"), i, 1);
514 			string str (formula, i + 1, j - i - 2);
515 			FormulaBlock *block = new FormulaBlock ();
516 			block->parenthesis = k;
517 			block->start = i;
518 			block->end = j;
519 			result.push_back (block);
520 			try {
521 				Parse (str, block->children);
522 			}
523 			catch (parse_error &error) {
524 				error.add_offset (i + 1);
525 				throw error;
526 			}
527 			block->stoich = strtol (formula.c_str () + j, &end, 10);
528 			i = end - formula.c_str ();
529 			if (i == j)
530 				block->stoich = 1;
531 		} else if (isalpha (formula[i]) || formula[i] == '-') {
532 			j = i + 1;
533 			while (isalpha (formula[j]) || formula[j] == '-')
534 				j++;
535 			k = j - i;
536 			sz = new char[k + 1];
537 			strncpy (sz, formula.c_str () + i, k);
538 			sz[k] = 0;
539 			if (!AnalString (sz, result, ambiguous, i)) {
540 				delete [] sz;
541 				throw parse_error (_("Could not interpret the symbol list"), i, k);
542 			}
543 			delete [] sz;
544 			i = j;
545 			FormulaElt *elt = result.back ();
546 			if (!elt)
547 				throw runtime_error (_("Parser failed, please fill a bug report."));
548 			elt->stoich = strtol (formula.c_str () + j, &end, 10);
549 			i = end - formula.c_str ();
550 			if (i == j)
551 				elt->stoich = 1;
552 		} else if (formula[i] == ')' || formula[i] == ']' || formula[i] == '}') {
553 			throw parse_error (_("Unmatched parenthesis"), i, 1);
554 		} else
555 			throw parse_error (_("Invalid character"), i, 1);
556 	}
557 	if (ambiguous) {
558 		switch (m_ParseMode & 7) {
559 		case GCU_FORMULA_PARSE_GUESS: {
560 			// if it fails, nothing is replaced
561 			if (!TryReplace (result, result.begin ()))
562 				g_warning ("ambiguous formula");
563 			break;
564 		}
565 		case GCU_FORMULA_PARSE_ATOM: {
566 			// replace all ambiguous residues
567 			list<FormulaElt *>::iterator it = result.begin ();
568 			FormulaResidue *res;
569 			while (it != result.end ()) {
570 				res = dynamic_cast <FormulaResidue *> (*it);
571 				if (res && res->GetZ ()) {
572 					FormulaAtom *elt = new FormulaAtom (res->GetZ());
573 					elt->stoich =  res->stoich;
574 					it = result.erase (it);
575 					delete res;
576 					it = result.insert (it, elt);
577 				} else
578 					it++;
579 			}
580 			break;
581 		}
582 		case GCU_FORMULA_PARSE_RESIDUE:
583 			// don't do anything
584 			break;
585 		case GCU_FORMULA_PARSE_ASK: {
586 			// FIXME: really ask
587 			break;
588 		}
589 		}
590 	}
591 }
592 
GetMolecularWeight(bool & artificial)593 DimensionalValue Formula::GetMolecularWeight (bool &artificial)
594 {
595 	if (Raw.size () == 0) {
596 		return m_Weight;
597 	}
598 	if (!m_WeightCached) {
599 		DimensionalValue atom_weight;
600 		m_Artificial = false; // most formula don't have artificial elements
601 		map<int,int>::iterator i, end = Raw.end (), begin = Raw.begin ();
602 		for (i = begin; i != end; i++) {
603 			atom_weight = *Element::GetElement ((*i).first)->GetWeight ();
604 			if (atom_weight.GetValue ().prec == 0)
605 				m_Artificial = true;
606 			m_Weight = (i == begin)? atom_weight * (*i).second: m_Weight + atom_weight * (*i).second;
607 		}
608 	}
609 	m_WeightCached = true;
610 	artificial = m_Artificial;
611 	return m_Weight;
612 }
613 
CalculateIsotopicPattern(IsotopicPattern & pattern)614 void Formula::CalculateIsotopicPattern (IsotopicPattern &pattern)
615 {
616 	map<int,int>::iterator i, end = Raw.end ();
617 	i = Raw.begin ();
618 	if (i == end) // empty formula
619 		return;
620 	IsotopicPattern *pat = NULL, *pat0;
621 	while (!pat && (i != end)) {
622 		pat = Element::GetElement ((*i).first)->GetIsotopicPattern ((*i).second);
623 		i++;
624 	}
625 	if (!pat)
626 		return;
627 	pattern.Copy (*pat);
628 	pat->Unref ();
629 	for (; i != end; i++) {
630 		pat = Element::GetElement ((*i).first)->GetIsotopicPattern ((*i).second);
631 		if (!pat) {
632 			// no stable isotope known for the element
633 			pattern.Clear ();
634 			return;
635 		}
636 		pat0 = pattern.Multiply (*pat);
637 		pat->Unref ();
638 		pat = pat0->Simplify ();
639 		pattern.Copy (*pat);
640 		pat0->Unref ();
641 		pat->Unref ();
642 	}
643 }
644 
BuildConnectivity()645 bool Formula::BuildConnectivity ()
646 {
647 	Document *Doc = new Document (NULL);
648 	Molecule *mol = Molecule::MoleculeFromFormula (Doc, *this, false);
649 	bool result = mol;
650 	delete Doc;
651 	return result;
652 }
653 
TryReplace(list<FormulaElt * > & result,list<FormulaElt * >::iterator it)654 bool Formula::TryReplace (list<FormulaElt *> &result, list<FormulaElt *>::iterator it)
655 {
656 	if (BuildConnectivity ())
657 		return true;
658 	FormulaResidue *res;
659 	while (it != result.end ()) {
660 		res = dynamic_cast <FormulaResidue *> (*it);
661 		if (res && res->GetZ ())
662 			break;
663 		it++;
664 	}
665 	if (it == result.end ())
666 		return false;
667 	list<FormulaElt *>::iterator next = it;
668 	next++;
669 	if (TryReplace (result, next))
670 		return true;
671 	FormulaAtom *elt = new FormulaAtom (res->GetZ());
672 	elt->stoich =  res->stoich;
673 	it = result.erase (it);
674 	result.insert (it, elt);
675 	next = it;
676 	next++;
677 	bool ret = TryReplace (result, next);
678 	if (!ret) {
679 		it--;
680 		delete (*it);
681 		it = result.erase (it);
682 		result.insert (it, res);
683 	} else
684 		delete res;
685 	return ret;
686 }
687 
688 }	//	namespace gcu
689