1# coding: utf-8
2# Copyright (c) Pymatgen Development Team.
3# Distributed under the terms of the MIT License.
4
5"""
6This module implements a Composition class to represent compositions,
7and a ChemicalPotential class to represent potentials.
8"""
9
10import collections
11import numbers
12import os
13import re
14import string
15from functools import total_ordering
16from itertools import combinations_with_replacement, product
17from typing import List, Tuple, Union, Dict
18
19from monty.fractions import gcd, gcd_float
20from monty.json import MSONable
21from monty.serialization import loadfn
22
23from pymatgen.core.periodic_table import DummySpecies, Element, Species, get_el_sp
24from pymatgen.core.units import Mass
25from pymatgen.util.string import formula_double_format, Stringify
26
27
28SpeciesLike = Union[str, Element, Species, DummySpecies]
29
30
31@total_ordering
32class Composition(collections.abc.Hashable, collections.abc.Mapping, MSONable, Stringify):
33    """
34    Represents a Composition, which is essentially a {element:amount} mapping
35    type. Composition is written to be immutable and hashable,
36    unlike a standard Python dict.
37
38    Note that the key can be either an Element or a Species. Elements and Species
39    are treated differently. i.e., a Fe2+ is not the same as a Fe3+ Species and
40    would be put in separate keys. This differentiation is deliberate to
41    support using Composition to determine the fraction of a particular Species.
42
43    Works almost completely like a standard python dictionary, except that
44    __getitem__ is overridden to return 0 when an element is not found.
45    (somewhat like a defaultdict, except it is immutable).
46
47    Also adds more convenience methods relevant to compositions, e.g.,
48    get_fraction.
49
50    It should also be noted that many Composition related functionality takes
51    in a standard string as a convenient input. For example,
52    even though the internal representation of a Fe2O3 composition is
53    {Element("Fe"): 2, Element("O"): 3}, you can obtain the amount of Fe
54    simply by comp["Fe"] instead of the more verbose comp[Element("Fe")].
55
56    >>> comp = Composition("LiFePO4")
57    >>> comp.get_atomic_fraction(Element("Li"))
58    0.14285714285714285
59    >>> comp.num_atoms
60    7.0
61    >>> comp.reduced_formula
62    'LiFePO4'
63    >>> comp.formula
64    'Li1 Fe1 P1 O4'
65    >>> comp.get_wt_fraction(Element("Li"))
66    0.04399794666951898
67    >>> comp.num_atoms
68    7.0
69    """
70
71    # Tolerance in distinguishing different composition amounts.
72    # 1e-8 is fairly tight, but should cut out most floating point arithmetic
73    # errors.
74    amount_tolerance = 1e-8
75
76    # Special formula handling for peroxides and certain elements. This is so
77    # that formula output does not write LiO instead of Li2O2 for example.
78    special_formulas = {
79        "LiO": "Li2O2",
80        "NaO": "Na2O2",
81        "KO": "K2O2",
82        "HO": "H2O2",
83        "CsO": "Cs2O2",
84        "RbO": "Rb2O2",
85        "O": "O2",
86        "N": "N2",
87        "F": "F2",
88        "Cl": "Cl2",
89        "H": "H2",
90    }
91
92    oxi_prob = None  # prior probability of oxidation used by oxi_state_guesses
93
94    def __init__(self, *args, strict: bool = False, **kwargs):
95        r"""
96        Very flexible Composition construction, similar to the built-in Python
97        dict(). Also extended to allow simple string init.
98
99        Args:
100            Any form supported by the Python built-in {} function.
101
102            1. A dict of either {Element/Species: amount},
103
104               {string symbol:amount}, or {atomic number:amount} or any mixture
105               of these. E.g., {Element("Li"):2 ,Element("O"):1},
106               {"Li":2, "O":1}, {3:2, 8:1} all result in a Li2O composition.
107            2. Keyword arg initialization, similar to a dict, e.g.,
108
109               Composition(Li = 2, O = 1)
110
111            In addition, the Composition constructor also allows a single
112            string as an input formula. E.g., Composition("Li2O").
113
114            strict: Only allow valid Elements and Species in the Composition.
115
116            allow_negative: Whether to allow negative compositions. This
117                argument must be popped from the **kwargs due to *args
118                ambiguity.
119        """
120        self.allow_negative = kwargs.pop("allow_negative", False)
121        # it's much faster to recognize a composition and use the elmap than
122        # to pass the composition to {}
123        if len(args) == 1 and isinstance(args[0], Composition):
124            elmap = args[0]
125        elif len(args) == 1 and isinstance(args[0], str):
126            elmap = self._parse_formula(args[0])
127        else:
128            elmap = dict(*args, **kwargs)  # type: ignore
129        elamt = {}
130        self._natoms = 0
131        for k, v in elmap.items():
132            if v < -Composition.amount_tolerance and not self.allow_negative:
133                raise ValueError("Amounts in Composition cannot be " "negative!")
134            if abs(v) >= Composition.amount_tolerance:
135                elamt[get_el_sp(k)] = v
136                self._natoms += abs(v)
137        self._data = elamt
138        if strict and not self.valid:
139            raise ValueError("Composition is not valid, contains: {}".format(", ".join(map(str, self.elements))))
140
141    def __getitem__(self, item: SpeciesLike):
142        try:
143            sp = get_el_sp(item)
144            return self._data.get(sp, 0)
145        except ValueError as ex:
146            raise TypeError(
147                "Invalid key {}, {} for Composition\n" "ValueError exception:\n{}".format(item, type(item), ex)
148            )
149
150    def __len__(self):
151        return len(self._data)
152
153    def __iter__(self):
154        return self._data.keys().__iter__()
155
156    def __contains__(self, item):
157        try:
158            sp = get_el_sp(item)
159            return sp in self._data
160        except ValueError as ex:
161            raise TypeError(
162                "Invalid key {}, {} for Composition\n" "ValueError exception:\n{}".format(item, type(item), ex)
163            )
164
165    def __eq__(self, other):
166        #  elements with amounts < Composition.amount_tolerance don't show up
167        #  in the elmap, so checking len enables us to only check one
168        #  compositions elements
169        if len(self) != len(other):
170            return False
171
172        return all(abs(v - other[el]) <= Composition.amount_tolerance for el, v in self.items())
173
174    def __ge__(self, other):
175        """
176        Defines >= for Compositions. Should ONLY be used for defining a sort
177        order (the behavior is probably not what you'd expect)
178        """
179        for el in sorted(set(self.elements + other.elements)):
180            if other[el] - self[el] >= Composition.amount_tolerance:
181                return False
182            if self[el] - other[el] >= Composition.amount_tolerance:
183                return True
184        return True
185
186    def __ne__(self, other):
187        return not self.__eq__(other)
188
189    def __add__(self, other):
190        """
191        Adds two compositions. For example, an Fe2O3 composition + an FeO
192        composition gives a Fe3O4 composition.
193        """
194        new_el_map = collections.defaultdict(float)
195        new_el_map.update(self)
196        for k, v in other.items():
197            new_el_map[get_el_sp(k)] += v
198        return Composition(new_el_map, allow_negative=self.allow_negative)
199
200    def __sub__(self, other):
201        """
202        Subtracts two compositions. For example, an Fe2O3 composition - an FeO
203        composition gives an FeO2 composition.
204
205        Raises:
206            ValueError if the subtracted composition is greater than the
207            original composition in any of its elements, unless allow_negative
208            is True
209        """
210        new_el_map = collections.defaultdict(float)
211        new_el_map.update(self)
212        for k, v in other.items():
213            new_el_map[get_el_sp(k)] -= v
214        return Composition(new_el_map, allow_negative=self.allow_negative)
215
216    def __mul__(self, other):
217        """
218        Multiply a Composition by an integer or a float.
219        Fe2O3 * 4 -> Fe8O12
220        """
221        if not isinstance(other, numbers.Number):
222            return NotImplemented
223        return Composition({el: self[el] * other for el in self}, allow_negative=self.allow_negative)
224
225    __rmul__ = __mul__
226
227    def __truediv__(self, other):
228        if not isinstance(other, numbers.Number):
229            return NotImplemented
230        return Composition({el: self[el] / other for el in self}, allow_negative=self.allow_negative)
231
232    __div__ = __truediv__
233
234    def __hash__(self):
235        """
236        hash based on the chemical system
237        """
238        return hash(frozenset(self._data.keys()))
239
240    @property
241    def average_electroneg(self) -> float:
242        """
243        :return: Average electronegativity of the composition.
244        """
245        return sum((el.X * abs(amt) for el, amt in self.items())) / self.num_atoms
246
247    @property
248    def total_electrons(self) -> float:
249        """
250        :return: Total number of electrons in composition.
251        """
252        return sum((el.Z * abs(amt) for el, amt in self.items()))
253
254    def almost_equals(self, other: "Composition", rtol: float = 0.1, atol: float = 1e-8) -> bool:
255        """
256        Returns true if compositions are equal within a tolerance.
257
258        Args:
259            other (Composition): Other composition to check
260            rtol (float): Relative tolerance
261            atol (float): Absolute tolerance
262        """
263        sps = set(self.elements + other.elements)
264        for sp in sps:
265            a = self[sp]
266            b = other[sp]
267            tol = atol + rtol * (abs(a) + abs(b)) / 2
268            if abs(b - a) > tol:
269                return False
270        return True
271
272    @property
273    def is_element(self) -> bool:
274        """
275        True if composition is for an element.
276        """
277        return len(self) == 1
278
279    def copy(self) -> "Composition":
280        """
281        :return: A copy of the composition.
282        """
283        return Composition(self, allow_negative=self.allow_negative)
284
285    @property
286    def formula(self) -> str:
287        """
288        Returns a formula string, with elements sorted by electronegativity,
289        e.g., Li4 Fe4 P4 O16.
290        """
291        sym_amt = self.get_el_amt_dict()
292        syms = sorted(sym_amt.keys(), key=lambda sym: get_el_sp(sym).X)
293        formula = [s + formula_double_format(sym_amt[s], False) for s in syms]
294        return " ".join(formula)
295
296    @property
297    def alphabetical_formula(self) -> str:
298        """
299        Returns a formula string, with elements sorted by alphabetically
300        e.g., Fe4 Li4 O16 P4.
301        """
302        sym_amt = self.get_el_amt_dict()
303        syms = sorted(sym_amt.keys())
304        formula = [s + formula_double_format(sym_amt[s], False) for s in syms]
305        return " ".join(formula)
306
307    @property
308    def iupac_formula(self) -> str:
309        """
310        Returns a formula string, with elements sorted by the iupac
311        electronegativity ordering defined in Table VI of "Nomenclature of
312        Inorganic Chemistry (IUPAC Recommendations 2005)". This ordering
313        effectively follows the groups and rows of the periodic table, except
314        the Lanthanides, Actanides and hydrogen. Polyanions are still determined
315        based on the true electronegativity of the elements.
316        e.g. CH2(SO4)2
317        """
318        sym_amt = self.get_el_amt_dict()
319        syms = sorted(sym_amt.keys(), key=lambda s: get_el_sp(s).iupac_ordering)
320        formula = [s + formula_double_format(sym_amt[s], False) for s in syms]
321        return " ".join(formula)
322
323    @property
324    def element_composition(self) -> "Composition":
325        """
326        Returns the composition replacing any species by the corresponding
327        element.
328        """
329        return Composition(self.get_el_amt_dict(), allow_negative=self.allow_negative)
330
331    @property
332    def fractional_composition(self) -> "Composition":
333        """
334        Returns the normalized composition which the number of species sum to
335        1.
336
337        Returns:
338            Normalized composition which the number of species sum to 1.
339        """
340        return self / self._natoms
341
342    @property
343    def reduced_composition(self) -> "Composition":
344        """
345        Returns the reduced composition,i.e. amounts normalized by greatest
346        common denominator. e.g., Composition("FePO4") for
347        Composition("Fe4P4O16").
348        """
349        return self.get_reduced_composition_and_factor()[0]
350
351    def get_reduced_composition_and_factor(self) -> Tuple["Composition", float]:
352        """
353        Calculates a reduced composition and factor.
354
355        Returns:
356            A normalized composition and a multiplicative factor, i.e.,
357            Li4Fe4P4O16 returns (Composition("LiFePO4"), 4).
358        """
359        factor = self.get_reduced_formula_and_factor()[1]
360        return self / factor, factor
361
362    def get_reduced_formula_and_factor(self, iupac_ordering: bool = False) -> Tuple[str, float]:
363        """
364        Calculates a reduced formula and factor.
365
366        Args:
367            iupac_ordering (bool, optional): Whether to order the
368                formula by the iupac "electronegativity" series, defined in
369                Table VI of "Nomenclature of Inorganic Chemistry (IUPAC
370                Recommendations 2005)". This ordering effectively follows
371                the groups and rows of the periodic table, except the
372                Lanthanides, Actanides and hydrogen. Note that polyanions
373                will still be determined based on the true electronegativity of
374                the elements.
375
376        Returns:
377            A pretty normalized formula and a multiplicative factor, i.e.,
378            Li4Fe4P4O16 returns (LiFePO4, 4).
379        """
380        all_int = all(abs(x - round(x)) < Composition.amount_tolerance for x in self.values())
381        if not all_int:
382            return self.formula.replace(" ", ""), 1
383        d = {k: int(round(v)) for k, v in self.get_el_amt_dict().items()}
384        (formula, factor) = reduce_formula(d, iupac_ordering=iupac_ordering)
385
386        if formula in Composition.special_formulas:
387            formula = Composition.special_formulas[formula]
388            factor /= 2
389
390        return formula, factor
391
392    def get_integer_formula_and_factor(
393        self, max_denominator: int = 10000, iupac_ordering: bool = False
394    ) -> Tuple[str, float]:
395        """
396        Calculates an integer formula and factor.
397
398        Args:
399            max_denominator (int): all amounts in the el:amt dict are
400                first converted to a Fraction with this maximum denominator
401            iupac_ordering (bool, optional): Whether to order the
402                formula by the iupac "electronegativity" series, defined in
403                Table VI of "Nomenclature of Inorganic Chemistry (IUPAC
404                Recommendations 2005)". This ordering effectively follows
405                the groups and rows of the periodic table, except the
406                Lanthanides, Actanides and hydrogen. Note that polyanions
407                will still be determined based on the true electronegativity of
408                the elements.
409
410        Returns:
411            A pretty normalized formula and a multiplicative factor, i.e.,
412            Li0.5O0.25 returns (Li2O, 0.25). O0.25 returns (O2, 0.125)
413        """
414        el_amt = self.get_el_amt_dict()
415        g = gcd_float(list(el_amt.values()), 1 / max_denominator)
416
417        d = {k: round(v / g) for k, v in el_amt.items()}
418        (formula, factor) = reduce_formula(d, iupac_ordering=iupac_ordering)
419        if formula in Composition.special_formulas:
420            formula = Composition.special_formulas[formula]
421            factor /= 2
422        return formula, factor * g
423
424    @property
425    def reduced_formula(self) -> str:
426        """
427        Returns a pretty normalized formula, i.e., LiFePO4 instead of
428        Li4Fe4P4O16.
429        """
430        return self.get_reduced_formula_and_factor()[0]
431
432    @property
433    def hill_formula(self) -> str:
434        """
435        :return: Hill formula. The Hill system (or Hill notation) is a system
436        of writing empirical chemical formulas, molecular chemical formulas and
437        components of a condensed formula such that the number of carbon atoms
438        in a molecule is indicated first, the number of hydrogen atoms next,
439        and then the number of all other chemical elements subsequently, in
440        alphabetical order of the chemical symbols. When the formula contains
441        no carbon, all the elements, including hydrogen, are listed
442        alphabetically.
443        """
444        c = self.element_composition
445        elements = sorted([el.symbol for el in c.keys()])
446        if "C" in elements:
447            elements = ["C"] + [el for el in elements if el != "C"]
448
449        formula = ["%s%s" % (el, formula_double_format(c[el]) if c[el] != 1 else "") for el in elements]
450        return " ".join(formula)
451
452    @property
453    def elements(self) -> List[Union[Element, Species, DummySpecies]]:
454        """
455        Returns view of elements in Composition.
456        """
457        return list(self.keys())
458
459    def __str__(self):
460        return " ".join(
461            ["{}{}".format(k, formula_double_format(v, ignore_ones=False)) for k, v in self.as_dict().items()]
462        )
463
464    def to_pretty_string(self) -> str:
465        """
466        :return: Same as __str__ but without spaces.
467        """
468        return re.sub(r"\s+", "", self.__str__())
469
470    @property
471    def num_atoms(self) -> float:
472        """
473        Total number of atoms in Composition. For negative amounts, sum
474        of absolute values
475        """
476        return self._natoms
477
478    @property
479    def weight(self) -> float:
480        """
481        Total molecular weight of Composition
482        """
483        return Mass(sum([amount * el.atomic_mass for el, amount in self.items()]), "amu")
484
485    def get_atomic_fraction(self, el: SpeciesLike) -> float:
486        """
487        Calculate atomic fraction of an Element or Species.
488
489        Args:
490            el (Element/Species): Element or Species to get fraction for.
491
492        Returns:
493            Atomic fraction for element el in Composition
494        """
495        return abs(self[el]) / self._natoms
496
497    def get_wt_fraction(self, el: SpeciesLike):
498        """
499        Calculate weight fraction of an Element or Species.
500
501        Args:
502            el (Element/Species): Element or Species to get fraction for.
503
504        Returns:
505            Weight fraction for element el in Composition
506        """
507        return get_el_sp(el).atomic_mass * abs(self[el]) / self.weight
508
509    def contains_element_type(
510        self,
511        category: str,
512    ):
513        """
514        Check if Composition contains any elements matching a given category.
515
516        Args:
517            category (str): one of "noble_gas", "transition_metal",
518                "post_transition_metal", "rare_earth_metal", "metal", "metalloid",
519                "alkali", "alkaline", "halogen", "chalcogen", "lanthanoid",
520                "actinoid", "quadrupolar", "s-block", "p-block", "d-block", "f-block"
521
522        Returns:
523            True if any elements in Composition match category, otherwise False
524        """
525
526        allowed_categories = (
527            "noble_gas",
528            "transition_metal",
529            "post_transition_metal",
530            "rare_earth_metal",
531            "metal",
532            "metalloid",
533            "alkali",
534            "alkaline",
535            "halogen",
536            "chalcogen",
537            "lanthanoid",
538            "actinoid",
539            "quadrupolar",
540            "s-block",
541            "p-block",
542            "d-block",
543            "f-block",
544        )
545
546        if category not in allowed_categories:
547            raise ValueError("Please pick a category from: {}".format(", ".join(allowed_categories)))
548
549        if "block" in category:
550            return any(category[0] in el.block for el in self.elements)
551        return any(getattr(el, "is_{}".format(category)) for el in self.elements)
552
553    def _parse_formula(self, formula):
554        """
555        Args:
556            formula (str): A string formula, e.g. Fe2O3, Li3Fe2(PO4)3
557
558        Returns:
559            Composition with that formula.
560
561        Notes:
562            In the case of Metallofullerene formula (e.g. Y3N@C80),
563            the @ mark will be dropped and passed to parser.
564        """
565        # for Metallofullerene like "Y3N@C80"
566        formula = formula.replace("@", "")
567
568        def get_sym_dict(f, factor):
569            sym_dict = collections.defaultdict(float)
570            for m in re.finditer(r"([A-Z][a-z]*)\s*([-*\.e\d]*)", f):
571                el = m.group(1)
572                amt = 1
573                if m.group(2).strip() != "":
574                    amt = float(m.group(2))
575                sym_dict[el] += amt * factor
576                f = f.replace(m.group(), "", 1)
577            if f.strip():
578                raise ValueError("{} is an invalid formula!".format(f))
579            return sym_dict
580
581        m = re.search(r"\(([^\(\)]+)\)\s*([\.e\d]*)", formula)
582        if m:
583            factor = 1
584            if m.group(2) != "":
585                factor = float(m.group(2))
586            unit_sym_dict = get_sym_dict(m.group(1), factor)
587            expanded_sym = "".join(["{}{}".format(el, amt) for el, amt in unit_sym_dict.items()])
588            expanded_formula = formula.replace(m.group(), expanded_sym)
589            return self._parse_formula(expanded_formula)
590        return get_sym_dict(formula, 1)
591
592    @property
593    def anonymized_formula(self) -> str:
594        """
595        An anonymized formula. Unique species are arranged in ordering of
596        increasing amounts and assigned ascending alphabets. Useful for
597        prototyping formulas. For example, all stoichiometric perovskites have
598        anonymized_formula ABC3.
599        """
600        reduced = self.element_composition
601        if all(x == int(x) for x in self.values()):
602            reduced /= gcd(*(int(i) for i in self.values()))
603
604        anon = ""
605        for e, amt in zip(string.ascii_uppercase, sorted(reduced.values())):
606            if amt == 1:
607                amt_str = ""
608            elif abs(amt % 1) < 1e-8:
609                amt_str = str(int(amt))
610            else:
611                amt_str = str(amt)
612            anon += "{}{}".format(e, amt_str)
613        return anon
614
615    @property
616    def chemical_system(self) -> str:
617        """
618        Get the chemical system of a Composition, for example "O-Si" for
619        SiO2. Chemical system is a string of a list of elements
620        sorted alphabetically and joined by dashes, by convention for use
621        in database keys.
622        """
623        return "-".join(sorted([el.symbol for el in self.elements]))
624
625    @property
626    def valid(self) -> bool:
627        """
628        Returns True if Composition contains valid elements or species and
629        False if the Composition contains any dummy species.
630        """
631        return not any(isinstance(el, DummySpecies) for el in self.elements)
632
633    def __repr__(self):
634        return "Comp: " + self.formula
635
636    @classmethod
637    def from_dict(cls, d) -> "Composition":
638        """
639        Creates a composition from a dict generated by as_dict(). Strictly not
640        necessary given that the standard constructor already takes in such an
641        input, but this method preserves the standard pymatgen API of having
642        from_dict methods to reconstitute objects generated by as_dict(). Allows
643        for easier introspection.
644
645        Args:
646            d (dict): {symbol: amount} dict.
647        """
648        return cls(d)
649
650    def get_el_amt_dict(self) -> Dict[str, float]:
651        """
652        Returns:
653            Dict with element symbol and (unreduced) amount e.g.,
654            {"Fe": 4.0, "O":6.0} or {"Fe3+": 4.0, "O2-":6.0}
655        """
656        d: Dict[str, float] = collections.defaultdict(float)
657        for e, a in self.items():
658            d[e.symbol] += a
659        return d
660
661    def as_dict(self) -> dict:
662        """
663        Returns:
664            dict with species symbol and (unreduced) amount e.g.,
665            {"Fe": 4.0, "O":6.0} or {"Fe3+": 4.0, "O2-":6.0}
666        """
667        d: Dict[str, float] = collections.defaultdict(float)
668        for e, a in self.items():
669            d[str(e)] += a
670        return d
671
672    @property
673    def to_reduced_dict(self) -> dict:
674        """
675        Returns:
676            Dict with element symbol and reduced amount e.g.,
677            {"Fe": 2.0, "O":3.0}
678        """
679        return self.get_reduced_composition_and_factor()[0].as_dict()
680
681    @property
682    def to_data_dict(self) -> dict:
683        """
684        Returns:
685            A dict with many keys and values relating to Composition/Formula,
686            including reduced_cell_composition, unit_cell_composition,
687            reduced_cell_formula, elements and nelements.
688        """
689        return {
690            "reduced_cell_composition": self.get_reduced_composition_and_factor()[0],
691            "unit_cell_composition": self.as_dict(),
692            "reduced_cell_formula": self.reduced_formula,
693            "elements": list(self.as_dict().keys()),
694            "nelements": len(self.as_dict().keys()),
695        }
696
697    def oxi_state_guesses(
698        self,
699        oxi_states_override: dict = None,
700        target_charge: float = 0,
701        all_oxi_states: bool = False,
702        max_sites: int = None,
703    ) -> List[Dict[str, float]]:
704        """
705        Checks if the composition is charge-balanced and returns back all
706        charge-balanced oxidation state combinations. Composition must have
707        integer values. Note that more num_atoms in the composition gives
708        more degrees of freedom. e.g., if possible oxidation states of
709        element X are [2,4] and Y are [-3], then XY is not charge balanced
710        but X2Y2 is. Results are returned from most to least probable based
711        on ICSD statistics. Use max_sites to improve performance if needed.
712
713        Args:
714            oxi_states_override (dict): dict of str->list to override an
715                element's common oxidation states, e.g. {"V": [2,3,4,5]}
716            target_charge (int): the desired total charge on the structure.
717                Default is 0 signifying charge balance.
718            all_oxi_states (bool): if True, an element defaults to
719                all oxidation states in pymatgen Element.icsd_oxidation_states.
720                Otherwise, default is Element.common_oxidation_states. Note
721                that the full oxidation state list is *very* inclusive and
722                can produce nonsensical results.
723            max_sites (int): if possible, will reduce Compositions to at most
724                this many sites to speed up oxidation state guesses. If the
725                composition cannot be reduced to this many sites a ValueError
726                will be raised. Set to -1 to just reduce fully. If set to a
727                number less than -1, the formula will be fully reduced but a
728                ValueError will be thrown if the number of atoms in the reduced
729                formula is greater than abs(max_sites).
730
731        Returns:
732            A list of dicts - each dict reports an element symbol and average
733                oxidation state across all sites in that composition. If the
734                composition is not charge balanced, an empty list is returned.
735        """
736
737        return self._get_oxid_state_guesses(all_oxi_states, max_sites, oxi_states_override, target_charge)[0]
738
739    def add_charges_from_oxi_state_guesses(
740        self,
741        oxi_states_override: dict = None,
742        target_charge: float = 0,
743        all_oxi_states: bool = False,
744        max_sites: int = None,
745    ) -> "Composition":
746        """
747        Assign oxidation states basedon guessed oxidation states.
748
749        See `oxi_state_guesses` for an explanation of how oxidation states are
750        guessed. This operation uses the set of oxidation states for each site
751        that were determined to be most likley from the oxidation state guessing
752        routine.
753
754        Args:
755            oxi_states_override (dict): dict of str->list to override an
756                element's common oxidation states, e.g. {"V": [2,3,4,5]}
757            target_charge (int): the desired total charge on the structure.
758                Default is 0 signifying charge balance.
759            all_oxi_states (bool): if True, an element defaults to
760                all oxidation states in pymatgen Element.icsd_oxidation_states.
761                Otherwise, default is Element.common_oxidation_states. Note
762                that the full oxidation state list is *very* inclusive and
763                can produce nonsensical results.
764            max_sites (int): if possible, will reduce Compositions to at most
765                this many sites to speed up oxidation state guesses. If the
766                composition cannot be reduced to this many sites a ValueError
767                will be raised. Set to -1 to just reduce fully. If set to a
768                number less than -1, the formula will be fully reduced but a
769                ValueError will be thrown if the number of atoms in the reduced
770                formula is greater than abs(max_sites).
771
772        Returns:
773            Composition, where the elements are assigned oxidation states based
774            on the results form guessing oxidation states. If no oxidation state
775            is possible, returns a Composition where all oxidation states are 0.
776        """
777
778        _, oxidation_states = self._get_oxid_state_guesses(
779            all_oxi_states, max_sites, oxi_states_override, target_charge
780        )
781
782        # Special case: No charged compound is possible
783        if not oxidation_states:
784            return Composition(dict((Species(e, 0), f) for e, f in self.items()))
785
786        # Generate the species
787        species = []
788        for el, charges in oxidation_states[0].items():
789            species.extend([Species(el, c) for c in charges])
790
791        # Return the new object
792        return Composition(collections.Counter(species))
793
794    def remove_charges(self) -> "Composition":
795        """
796        Removes the charges from any species in a Composition object.
797
798        Returns:
799            Composition object without charge decoration, for example
800            {"Fe3+": 2.0, "O2-":3.0} becomes {"Fe": 2.0, "O":3.0}
801        """
802        d: Dict[Element, float] = collections.defaultdict(float)
803        for e, a in self.items():
804            d[Element(e.symbol)] += a
805        return Composition(d)
806
807    def _get_oxid_state_guesses(self, all_oxi_states, max_sites, oxi_states_override, target_charge):
808        """
809        Utility operation for guessing oxidation states.
810
811        See `oxi_state_guesses` for full details. This operation does the
812        calculation of the most likely oxidation states
813
814        Args:
815            oxi_states_override (dict): dict of str->list to override an
816                element's common oxidation states, e.g. {"V": [2,3,4,5]}
817            target_charge (int): the desired total charge on the structure.
818                Default is 0 signifying charge balance.
819            all_oxi_states (bool): if True, an element defaults to
820                all oxidation states in pymatgen Element.icsd_oxidation_states.
821                Otherwise, default is Element.common_oxidation_states. Note
822                that the full oxidation state list is *very* inclusive and
823                can produce nonsensical results.
824            max_sites (int): if possible, will reduce Compositions to at most
825                this many sites to speed up oxidation state guesses. If the
826                composition cannot be reduced to this many sites a ValueError
827                will be raised. Set to -1 to just reduce fully. If set to a
828                number less than -1, the formula will be fully reduced but a
829                ValueError will be thrown if the number of atoms in the reduced
830                formula is greater than abs(max_sites).
831
832        Returns:
833            A list of dicts - each dict reports an element symbol and average
834                oxidation state across all sites in that composition. If the
835                composition is not charge balanced, an empty list is returned.
836            A list of dicts - each dict maps the element symbol to a list of
837                oxidation states for each site of that element. For example, Fe3O4 could
838                return a list of [2,2,2,3,3,3] for the oxidation states of If the composition
839                is
840
841        """
842        comp = self.copy()
843        # reduce Composition if necessary
844        if max_sites and max_sites < 0:
845            comp = self.reduced_composition
846
847            if max_sites < -1 and comp.num_atoms > abs(max_sites):
848                raise ValueError("Composition {} cannot accommodate max_sites " "setting!".format(comp))
849
850        elif max_sites and comp.num_atoms > max_sites:
851            reduced_comp, reduced_factor = self.get_reduced_composition_and_factor()
852            if reduced_factor > 1:
853                reduced_comp *= max(1, int(max_sites / reduced_comp.num_atoms))
854                comp = reduced_comp  # as close to max_sites as possible
855            if comp.num_atoms > max_sites:
856                raise ValueError("Composition {} cannot accommodate max_sites " "setting!".format(comp))
857
858        # Load prior probabilities of oxidation states, used to rank solutions
859        if not Composition.oxi_prob:
860            module_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)))
861            all_data = loadfn(os.path.join(module_dir, "..", "analysis", "icsd_bv.yaml"))
862            Composition.oxi_prob = {Species.from_string(sp): data for sp, data in all_data["occurrence"].items()}
863        oxi_states_override = oxi_states_override or {}
864        # assert: Composition only has integer amounts
865        if not all(amt == int(amt) for amt in comp.values()):
866            raise ValueError("Charge balance analysis requires integer " "values in Composition!")
867
868        # for each element, determine all possible sum of oxidations
869        # (taking into account nsites for that particular element)
870        el_amt = comp.get_el_amt_dict()
871        els = el_amt.keys()
872        el_sums = []  # matrix: dim1= el_idx, dim2=possible sums
873        el_sum_scores = collections.defaultdict(set)  # dict of el_idx, sum -> score
874        el_best_oxid_combo = {}  # dict of el_idx, sum -> oxid combo with best score
875        for idx, el in enumerate(els):
876            el_sum_scores[idx] = {}
877            el_best_oxid_combo[idx] = {}
878            el_sums.append([])
879            if oxi_states_override.get(el):
880                oxids = oxi_states_override[el]
881            elif all_oxi_states:
882                oxids = Element(el).oxidation_states
883            else:
884                oxids = Element(el).icsd_oxidation_states or Element(el).oxidation_states
885
886            # get all possible combinations of oxidation states
887            # and sum each combination
888            for oxid_combo in combinations_with_replacement(oxids, int(el_amt[el])):
889
890                # List this sum as a possible option
891                oxid_sum = sum(oxid_combo)
892                if oxid_sum not in el_sums[idx]:
893                    el_sums[idx].append(oxid_sum)
894
895                # Determine how probable is this combo?
896                score = sum([Composition.oxi_prob.get(Species(el, o), 0) for o in oxid_combo])
897
898                # If it is the most probable combo for a certain sum,
899                #   store the combination
900                if oxid_sum not in el_sum_scores[idx] or score > el_sum_scores[idx].get(oxid_sum, 0):
901                    el_sum_scores[idx][oxid_sum] = score
902                    el_best_oxid_combo[idx][oxid_sum] = oxid_combo
903
904        # Determine which combination of oxidation states for each element
905        #    is the most probable
906        all_sols = []  # will contain all solutions
907        all_oxid_combo = []  # will contain the best combination of oxidation states for each site
908        all_scores = []  # will contain a score for each solution
909        for x in product(*el_sums):
910            # each x is a trial of one possible oxidation sum for each element
911            if sum(x) == target_charge:  # charge balance condition
912                el_sum_sol = dict(zip(els, x))  # element->oxid_sum
913                # normalize oxid_sum by amount to get avg oxid state
914                sol = {el: v / el_amt[el] for el, v in el_sum_sol.items()}
915                # add the solution to the list of solutions
916                all_sols.append(sol)
917
918                # determine the score for this solution
919                score = 0
920                for idx, v in enumerate(x):
921                    score += el_sum_scores[idx][v]
922                all_scores.append(score)
923
924                # collect the combination of oxidation states for each site
925                all_oxid_combo.append(dict((e, el_best_oxid_combo[idx][v]) for idx, (e, v) in enumerate(zip(els, x))))
926
927        # sort the solutions by highest to lowest score
928        if all_scores:
929            all_sols, all_oxid_combo = zip(
930                *[
931                    (y, x)
932                    for (z, y, x) in sorted(
933                        zip(all_scores, all_sols, all_oxid_combo),
934                        key=lambda pair: pair[0],
935                        reverse=True,
936                    )
937                ]
938            )
939        return all_sols, all_oxid_combo
940
941    @staticmethod
942    def ranked_compositions_from_indeterminate_formula(fuzzy_formula, lock_if_strict=True):
943        """
944        Takes in a formula where capitilization might not be correctly entered,
945        and suggests a ranked list of potential Composition matches.
946        Author: Anubhav Jain
947
948        Args:
949            fuzzy_formula (str): A formula string, such as "co2o3" or "MN",
950                that may or may not have multiple interpretations
951            lock_if_strict (bool): If true, a properly entered formula will
952                only return the one correct interpretation. For example,
953                "Co1" will only return "Co1" if true, but will return both
954                "Co1" and "C1 O1" if false.
955
956        Returns:
957            A ranked list of potential Composition matches
958        """
959
960        # if we have an exact match and the user specifies lock_if_strict, just
961        # return the exact match!
962        if lock_if_strict:
963            # the strict composition parsing might throw an error, we can ignore
964            # it and just get on with fuzzy matching
965            try:
966                comp = Composition(fuzzy_formula)
967                return [comp]
968            except ValueError:
969                pass
970
971        all_matches = Composition._comps_from_fuzzy_formula(fuzzy_formula)
972        # remove duplicates
973        all_matches = list(set(all_matches))
974        # sort matches by rank descending
975        all_matches = sorted(all_matches, key=lambda match: (match[1], match[0]), reverse=True)
976        all_matches = [m[0] for m in all_matches]
977        return all_matches
978
979    @staticmethod
980    def _comps_from_fuzzy_formula(fuzzy_formula, m_dict=None, m_points=0, factor=1):
981        """
982        A recursive helper method for formula parsing that helps in
983        interpreting and ranking indeterminate formulas.
984        Author: Anubhav Jain
985
986        Args:
987            fuzzy_formula (str): A formula string, such as "co2o3" or "MN",
988                that may or may not have multiple interpretations.
989            m_dict (dict): A symbol:amt dictionary from the previously parsed
990                formula.
991            m_points: Number of points gained from the previously parsed
992                formula.
993            factor: Coefficient for this parse, e.g. (PO4)2 will feed in PO4
994                as the fuzzy_formula with a coefficient of 2.
995
996        Returns:
997            A list of tuples, with the first element being a Composition and
998            the second element being the number of points awarded that
999            Composition intepretation.
1000        """
1001        m_dict = m_dict or {}
1002
1003        def _parse_chomp_and_rank(m, f, m_dict, m_points):
1004            """
1005            A helper method for formula parsing that helps in interpreting and
1006            ranking indeterminate formulas
1007            Author: Anubhav Jain
1008
1009            Args:
1010                m: A regex match, with the first group being the element and
1011                    the second group being the amount
1012                f: The formula part containing the match
1013                m_dict: A symbol:amt dictionary from the previously parsed
1014                    formula
1015                m_points: Number of points gained from the previously parsed
1016                    formula
1017
1018            Returns:
1019                A tuple of (f, m_dict, points) where m_dict now contains data
1020                from the match and the match has been removed (chomped) from
1021                the formula f. The "goodness" of the match determines the
1022                number of points returned for chomping. Returns
1023                (None, None, None) if no element could be found...
1024            """
1025
1026            points = 0
1027            # Points awarded if the first element of the element is correctly
1028            # specified as a capital
1029            points_first_capital = 100
1030            # Points awarded if the second letter of the element is correctly
1031            # specified as lowercase
1032            points_second_lowercase = 100
1033
1034            # get element and amount from regex match
1035            el = m.group(1)
1036            if len(el) > 2 or len(el) < 1:
1037                raise ValueError("Invalid element symbol entered!")
1038            amt = float(m.group(2)) if m.group(2).strip() != "" else 1
1039
1040            # convert the element string to proper [uppercase,lowercase] format
1041            # and award points if it is already in that format
1042            char1 = el[0]
1043            char2 = el[1] if len(el) > 1 else ""
1044
1045            if char1 == char1.upper():
1046                points += points_first_capital
1047            if char2 and char2 == char2.lower():
1048                points += points_second_lowercase
1049
1050            el = char1.upper() + char2.lower()
1051
1052            # if it's a valid element, chomp and add to the points
1053            if Element.is_valid_symbol(el):
1054                if el in m_dict:
1055                    m_dict[el] += amt * factor
1056                else:
1057                    m_dict[el] = amt * factor
1058                return f.replace(m.group(), "", 1), m_dict, m_points + points
1059
1060            # else return None
1061            return None, None, None
1062
1063        fuzzy_formula = fuzzy_formula.strip()
1064
1065        if len(fuzzy_formula) == 0:
1066            # The entire formula has been parsed into m_dict. Return the
1067            # corresponding Composition and number of points
1068            if m_dict:
1069                yield (Composition.from_dict(m_dict), m_points)
1070        else:
1071            # if there is a parenthesis, remove it and match the remaining stuff
1072            # with the appropriate factor
1073            for mp in re.finditer(r"\(([^\(\)]+)\)([\.\d]*)", fuzzy_formula):
1074                mp_points = m_points
1075                mp_form = fuzzy_formula.replace(mp.group(), " ", 1)
1076                mp_dict = dict(m_dict)
1077                mp_factor = 1 if mp.group(2) == "" else float(mp.group(2))
1078                # Match the stuff inside the parenthesis with the appropriate
1079                # factor
1080                for match in Composition._comps_from_fuzzy_formula(mp.group(1), mp_dict, mp_points, factor=mp_factor):
1081                    only_me = True
1082                    # Match the stuff outside the parentheses and return the
1083                    # sum.
1084
1085                    for match2 in Composition._comps_from_fuzzy_formula(mp_form, mp_dict, mp_points, factor=1):
1086                        only_me = False
1087                        yield (match[0] + match2[0], match[1] + match2[1])
1088                    # if the stuff inside the parenthesis is nothing, then just
1089                    # return the stuff inside the parentheses
1090                    if only_me:
1091                        yield match
1092                return
1093
1094            # try to match the single-letter elements
1095            m1 = re.match(r"([A-z])([\.\d]*)", fuzzy_formula)
1096            if m1:
1097                m_points1 = m_points
1098                m_form1 = fuzzy_formula
1099                m_dict1 = dict(m_dict)
1100                (m_form1, m_dict1, m_points1) = _parse_chomp_and_rank(m1, m_form1, m_dict1, m_points1)
1101                if m_dict1:
1102                    # there was a real match
1103                    for match in Composition._comps_from_fuzzy_formula(m_form1, m_dict1, m_points1, factor):
1104                        yield match
1105
1106            # try to match two-letter elements
1107            m2 = re.match(r"([A-z]{2})([\.\d]*)", fuzzy_formula)
1108            if m2:
1109                m_points2 = m_points
1110                m_form2 = fuzzy_formula
1111                m_dict2 = dict(m_dict)
1112                (m_form2, m_dict2, m_points2) = _parse_chomp_and_rank(m2, m_form2, m_dict2, m_points2)
1113                if m_dict2:
1114                    # there was a real match
1115                    for match in Composition._comps_from_fuzzy_formula(m_form2, m_dict2, m_points2, factor):
1116                        yield match
1117
1118
1119def reduce_formula(sym_amt, iupac_ordering: bool = False) -> Tuple[str, float]:
1120    """
1121    Helper method to reduce a sym_amt dict to a reduced formula and factor.
1122
1123    Args:
1124        sym_amt (dict): {symbol: amount}.
1125        iupac_ordering (bool, optional): Whether to order the
1126            formula by the iupac "electronegativity" series, defined in
1127            Table VI of "Nomenclature of Inorganic Chemistry (IUPAC
1128            Recommendations 2005)". This ordering effectively follows
1129            the groups and rows of the periodic table, except the
1130            Lanthanides, Actanides and hydrogen. Note that polyanions
1131            will still be determined based on the true electronegativity of
1132            the elements.
1133
1134    Returns:
1135        (reduced_formula, factor).
1136    """
1137    syms = sorted(sym_amt.keys(), key=lambda x: [get_el_sp(x).X, x])
1138
1139    syms = list(filter(lambda x: abs(sym_amt[x]) > Composition.amount_tolerance, syms))
1140
1141    factor = 1
1142    # Enforce integers for doing gcd.
1143    if all((int(i) == i for i in sym_amt.values())):
1144        factor = abs(gcd(*(int(i) for i in sym_amt.values())))
1145
1146    polyanion = []
1147    # if the composition contains a poly anion
1148    if len(syms) >= 3 and get_el_sp(syms[-1]).X - get_el_sp(syms[-2]).X < 1.65:
1149        poly_sym_amt = {syms[i]: sym_amt[syms[i]] / factor for i in [-2, -1]}
1150        (poly_form, poly_factor) = reduce_formula(poly_sym_amt, iupac_ordering=iupac_ordering)
1151
1152        if poly_factor != 1:
1153            polyanion.append("({}){}".format(poly_form, int(poly_factor)))
1154
1155    syms = syms[: len(syms) - 2 if polyanion else len(syms)]
1156
1157    if iupac_ordering:
1158        syms = sorted(syms, key=lambda x: [get_el_sp(x).iupac_ordering, x])
1159
1160    reduced_form = []
1161    for s in syms:
1162        normamt = sym_amt[s] * 1.0 / factor
1163        reduced_form.append(s)
1164        reduced_form.append(formula_double_format(normamt))
1165
1166    reduced_form = "".join(reduced_form + polyanion)  # type: ignore
1167    return reduced_form, factor  # type: ignore
1168
1169
1170class ChemicalPotential(dict, MSONable):
1171    """
1172    Class to represent set of chemical potentials. Can be:
1173    multiplied/divided by a Number
1174    multiplied by a Composition (returns an energy)
1175    added/subtracted with other ChemicalPotentials.
1176    """
1177
1178    def __init__(self, *args, **kwargs):
1179        """
1180        Args:
1181            *args, **kwargs: any valid dict init arguments
1182        """
1183        d = dict(*args, **kwargs)
1184        super().__init__((get_el_sp(k), v) for k, v in d.items())
1185        if len(d) != len(self):
1186            raise ValueError("Duplicate potential specified")
1187
1188    def __mul__(self, other):
1189        if isinstance(other, numbers.Number):
1190            return ChemicalPotential({k: v * other for k, v in self.items()})
1191        raise NotImplementedError()
1192
1193    __rmul__ = __mul__
1194
1195    def __truediv__(self, other):
1196        if isinstance(other, numbers.Number):
1197            return ChemicalPotential({k: v / other for k, v in self.items()})
1198        raise NotImplementedError()
1199
1200    __div__ = __truediv__
1201
1202    def __sub__(self, other):
1203        if isinstance(other, ChemicalPotential):
1204            els = set(self.keys()).union(other.keys())
1205            return ChemicalPotential({e: self.get(e, 0) - other.get(e, 0) for e in els})
1206        raise NotImplementedError()
1207
1208    def __add__(self, other):
1209        if isinstance(other, ChemicalPotential):
1210            els = set(self.keys()).union(other.keys())
1211            return ChemicalPotential({e: self.get(e, 0) + other.get(e, 0) for e in els})
1212        raise NotImplementedError()
1213
1214    def get_energy(self, composition: Composition, strict: bool = True) -> float:
1215        """
1216        Calculates the energy of a composition.
1217
1218        Args:
1219            composition (Composition): input composition
1220            strict (bool): Whether all potentials must be specified
1221        """
1222        if strict and set(composition.keys()) > set(self.keys()):
1223            s = set(composition.keys()) - set(self.keys())
1224            raise ValueError("Potentials not specified for {}".format(s))
1225        return sum(self.get(k, 0) * v for k, v in composition.items())
1226
1227    def __repr__(self):
1228        return "ChemPots: " + super().__repr__()
1229
1230
1231class CompositionError(Exception):
1232    """Exception class for composition errors"""
1233
1234
1235if __name__ == "__main__":
1236    import doctest
1237
1238    doctest.testmod()
1239