# -*- coding: utf-8 -*-
#
# Copyright (c) 2020, the cclib development team
#
# This file is part of cclib (http://cclib.github.io) and is distributed under
# the terms of the BSD 3-Clause License.
"""Generic output file parser and related tools"""


import bz2
import fileinput
import gzip
import inspect
import io
import logging
import os
import random
import sys
import zipfile
from abc import ABC, abstractmethod

import numpy

from cclib.parser import utils
from cclib.parser.data import ccData
from cclib.parser.data import ccData_optdone_bool


# This seems to avoid a problem with Avogadro.
logging.logMultiprocessing = 0


class myBZ2File(bz2.BZ2File):
    """Return string instead of bytes"""
    def __next__(self):
        line = super(bz2.BZ2File, self).__next__()
        return line.decode("ascii", "replace")

    def next(self):
        line = self.__next__()
        return line


class myGzipFile(gzip.GzipFile):
    """Return string instead of bytes"""
    def __next__(self):
        super_ob = super(gzip.GzipFile, self)
        # seemingly different versions of gzip can have either next or __next__
        if hasattr(super_ob, 'next'):
            line = super_ob.next()
        else:
            line = super_ob.__next__()
        return line.decode("ascii", "replace")

    def next(self):
        line = self.__next__()
        return line


class FileWrapper:
    """Wrap a file-like object or stream with some custom tweaks"""

    def __init__(self, source, pos=0):

        self.src = source

        # Most file-like objects have seek and tell methods, but streams returned
        # by urllib.urlopen in Python 2 do not, which raises an AttributeError
        # in this code. In Python 3 these methods do exist, since urllib uses the
        # stream classes in the io library, but they raise a different error,
        # namely io.UnsupportedOperation. That is why the except block here
        # cannot be more specific.
        try:
            self.src.seek(0, 2)
            self.size = self.src.tell()
            self.src.seek(pos, 0)

        except (AttributeError, IOError, io.UnsupportedOperation):
            # Stream returned by urllib should have size information.
            if hasattr(self.src, 'headers') and 'content-length' in self.src.headers:
                self.size = int(self.src.headers['content-length'])
            else:
                self.size = pos

        # Assume the position is what was passed to the constructor.
        self.pos = pos

        self.last_line = None

    def next(self):
        line = next(self.src)
        self.pos += len(line)
        self.last_line = line
        return line

    def __next__(self):
        return self.next()

    def __iter__(self):
        return self

    def close(self):
        self.src.close()

    def seek(self, pos, ref):

        # If we are seeking to the end, we can usually emulate that by reading
        # the rest of the stream. As explained above, we cannot be too specific
        # with the except clause due to differences between Python 2 and 3.
        # Yet another reason to drop Python 2 soon!
        try:
            self.src.seek(pos, ref)
        except Exception:
            if ref == 2:
                self.src.read()
            else:
                raise

        if ref == 0:
            self.pos = pos
        if ref == 1:
            self.pos += pos
        if ref == 2 and hasattr(self, 'size'):
            self.pos = self.size

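
# Illustrative use of FileWrapper (a sketch, not exercised in this module):
# wrapping an in-memory stream gives it the .pos and .size bookkeeping that
# progress reporting in Logfile.parse() relies on, e.g.
#
#     wrapped = FileWrapper(io.StringIO(text))
#     for line in wrapped:
#         fraction_done = wrapped.pos / wrapped.size
#
# where `text` is a hypothetical string holding the logfile contents.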

def openlogfile(filename, object=None):
    """Return a file-like object for a log file, decompressing it if needed.

    Given the filename (or the filename plus an in-memory file object) of a
    log file, possibly gzipped, zipped, or bzipped, this function returns a
    file-like object that yields decoded lines.

    Given a list of filenames, this function returns a FileInput object,
    which can be used for seamless iteration without concatenation.
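
    Example (illustrative; "calc.log.gz" is a hypothetical filename):
        inputfile = openlogfile("calc.log.gz")
        for line in inputfile:
            ...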
    """

    # If there is a single string argument given.
    if isinstance(filename, str):

        extension = os.path.splitext(filename)[1]

        if extension == ".gz":
            fileobject = myGzipFile(filename, "r", fileobj=object)

        elif extension == ".zip":
            zip = zipfile.ZipFile(object, "r") if object else zipfile.ZipFile(filename, "r")
            assert len(zip.namelist()) == 1, "ERROR: Zip file contains more than 1 file"
            fileobject = io.StringIO(zip.read(zip.namelist()[0]).decode("ascii", "ignore"))

        elif extension in ['.bz', '.bz2']:
            # Module 'bz2' is not always importable.
            assert bz2 is not None, "ERROR: module bz2 cannot be imported"
            fileobject = myBZ2File(object, "r") if object else myBZ2File(filename, "r")

        else:
            # Assume the object, if given, is a text file encoded in utf-8.
            fileobject = io.StringIO(object.decode('utf-8')) if object \
                    else FileWrapper(io.open(filename, "r", errors='ignore'))

        return fileobject

    elif hasattr(filename, "__iter__"):

        # This is needed, because fileinput will assume stdin when filename is empty.
        if len(filename) == 0:
            return None

        return fileinput.input(filename, openhook=fileinput.hook_compressed)


class Logfile(ABC):
    """Abstract class for logfile objects.

    Subclasses defined by cclib:
        ADF, DALTON, GAMESS, GAMESSUK, Gaussian, Jaguar, Molpro, MOPAC,
        NWChem, ORCA, Psi, Q-Chem
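
    Example (illustrative; assumes a Gaussian output file named "water.log"):
        from cclib.parser import Gaussian
        data = Gaussian("water.log").parse()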
    """

    def __init__(self, source, loglevel=logging.ERROR, logname="Log",
                 logstream=sys.stderr, datatype=ccData_optdone_bool, **kwds):
        """Initialise the Logfile object.

        This should be called by a subclass in its own __init__ method.

        Inputs:
            source - a logfile, list of logfiles, or stream with at least a read method
            loglevel - integer corresponding to a log level from the logging module
            logname - name used for the logger, normally the name of the parsing package
            logstream - where to output the logging information
            datatype - class to use for gathering data attributes
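
        Example (illustrative sketch of a subclass's __init__; the parser
        name "MyParser" is hypothetical):
            super().__init__(source, logname="MyParser", **kwds)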
        """

        # Set the filename to source if it is a string or a list of strings, which are
        # assumed to be filenames. Otherwise, assume the source is a file-like object
        # if it has a read method, and we will try to use it like a stream.
        self.isfileinput = False
        if isinstance(source, str):
            self.filename = source
            self.isstream = False
        elif isinstance(source, list) and all([isinstance(s, str) for s in source]):
            self.filename = source
            self.isstream = False
        elif isinstance(source, fileinput.FileInput):
            self.filename = source
            self.isstream = False
            self.isfileinput = True
        elif hasattr(source, "read"):
            self.filename = "stream %s" % str(type(source))
            self.isstream = True
            self.stream = source
        else:
            raise ValueError("Unexpected source type.")

        # Set up the logger.
        # Note that calling logging.getLogger() with one name always returns the same instance.
        # Presently in cclib, all parser instances of the same class use the same logger,
        #   which means that care needs to be taken not to duplicate handlers.
        self.loglevel = loglevel
        self.logname = logname
        self.logger = logging.getLogger('%s %s' % (self.logname, self.filename))
        self.logger.setLevel(self.loglevel)
        if len(self.logger.handlers) == 0:
            handler = logging.StreamHandler(logstream)
            handler.setFormatter(logging.Formatter("[%(name)s %(levelname)s] %(message)s"))
            self.logger.addHandler(handler)

        # Set up the metadata.
        if not hasattr(self, "metadata"):
            self.metadata = {}
            self.metadata["package"] = self.logname
            self.metadata["methods"] = []
            # Indicate whether the computation has completed successfully.
            self.metadata['success'] = False

        # Periodic table of elements.
        self.table = utils.PeriodicTable()

        # This is the class that will be used in the data object returned by parse(), and should
        # normally be ccData or a subclass of it.
        self.datatype = datatype

        # Change the class used if we want optdone to be a list or if the 'future' option
        # is used, which might have more consequences in the future.
        optdone_as_list = kwds.get("optdone_as_list", False) or kwds.get("future", False)
        optdone_as_list = optdone_as_list if isinstance(optdone_as_list, bool) else False
        if optdone_as_list:
            self.datatype = ccData
        # Parse natural orbitals and natural spin orbitals into one attribute.
        self.unified_no_nso = kwds.get("future", False)

    def __setattr__(self, name, value):

        # Send info to logger if the attribute is in the list of attributes.
        if name in ccData._attrlist and hasattr(self, "logger"):

            # Call logger.info() only if the attribute is new.
            if not hasattr(self, name):
                if type(value) in [numpy.ndarray, list]:
                    self.logger.info("Creating attribute %s[]" % name)
                else:
                    self.logger.info("Creating attribute %s: %s" % (name, str(value)))

        # Set the attribute.
        object.__setattr__(self, name, value)

    def parse(self, progress=None, fupdate=0.05, cupdate=0.002):
        """Parse the logfile, using the assumed extract method of the child."""

        # Check that the sub-class has an extract attribute,
        #  that is callable with the proper number of arguments.
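        # The expected signature is extract(self, inputfile, line), i.e. three
        # positional arguments including self.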
        if not hasattr(self, "extract"):
            raise AttributeError("Class %s has no extract() method." % self.__class__.__name__)
        if not callable(self.extract):
            raise AttributeError("Method %s.extract not callable." % self.__class__.__name__)
        if len(inspect.getfullargspec(self.extract)[0]) != 3:
            raise AttributeError("Method %s.extract takes wrong number of arguments." % self.__class__.__name__)

        # Save the current list of attributes to keep after parsing.
        # The dict of self should be the same after parsing.
        _nodelete = list(set(self.__dict__.keys()))

        # Initiate the FileInput object for the input files.
        # Remember that self.filename can be a list of files.
        if not self.isstream:
            if not self.isfileinput:
                inputfile = openlogfile(self.filename)
            else:
                inputfile = self.filename
        else:
            inputfile = FileWrapper(self.stream)

        # Initialize self.progress.
        is_compressed = isinstance(inputfile, myGzipFile) or isinstance(inputfile, myBZ2File)
        if progress and not is_compressed:
            self.progress = progress
            self.progress.initialize(inputfile.size)
            self.progress.step = 0
        self.fupdate = fupdate
        self.cupdate = cupdate

        # Maybe the sub-class has something to do before parsing.
        self.before_parsing()

        # Loop over lines in the file object and call extract().
        # This is where the actual parsing is done.
        for line in inputfile:
            self.updateprogress(inputfile, "Unsupported information", cupdate)

            # This call should check if the line begins a section of extracted data.
            # If it does, it parses some lines and sets the relevant attributes (to self).
            # Any attributes can be freely set and used across calls, however only those
            #   in data._attrlist will be moved to the final data object that is returned.
            try:
                self.extract(inputfile, line)
            except StopIteration:
                self.logger.error("Unexpectedly encountered end of logfile.")
                break
            except Exception:
                self.logger.error("Encountered error when parsing.")
                self.logger.error("Last line read: %s" % inputfile.last_line)
                raise

        # Close input file object.
        if not self.isstream:
            inputfile.close()

        # Maybe the sub-class has something to do after parsing.
        self.after_parsing()

        # If atomcoords were not parsed, fall back to any input coordinates ("inputcoords").
        # This is originally from the Gaussian parser, a regression fix.
        if not hasattr(self, "atomcoords") and hasattr(self, "inputcoords"):
            self.atomcoords = numpy.array(self.inputcoords, 'd')

        # Set nmo if not set already - to nbasis.
        if not hasattr(self, "nmo") and hasattr(self, "nbasis"):
            self.nmo = self.nbasis

        # Create a default coreelectrons array, unless it's impossible
        # to determine.
        if not hasattr(self, "coreelectrons") and hasattr(self, "natom"):
            self.coreelectrons = numpy.zeros(self.natom, "i")
        if hasattr(self, "incorrect_coreelectrons"):
            self.__delattr__("coreelectrons")

        # Create the data object we want to return. This is normally ccData, but can be changed
        # by passing the datatype argument to the constructor. All supported cclib attributes
        # are copied to this object, but beware that in order to be moved an attribute must be
        # included in the data._attrlist of ccData (or whatever else).
        # There is the possibility of passing additional arguments via self.data_args, but
        # we use this sparingly in cases where we want to limit the API with options, etc.
        data = self.datatype(attributes=self.__dict__)

        # Now make sure that the cclib attributes in the data object are all the correct type,
        # including arrays and lists of arrays.
        data.arrayify()

        # Delete all temporary attributes (including cclib attributes).
        # All attributes should have been moved to a data object, which will be returned.
        for attr in list(self.__dict__.keys()):
            if attr not in _nodelete:
                self.__delattr__(attr)

        # Perform final checks on values of attributes.
        data.check_values(logger=self.logger)

        # Update self.progress as done.
        if hasattr(self, "progress"):
            self.progress.update(inputfile.size, "Done")

        return data

    def before_parsing(self):
        """Set parser-specific variables and do other initial things here."""
        pass

    def after_parsing(self):
        """Correct data or do parser-specific validation after parsing is finished."""
        pass

    def updateprogress(self, inputfile, msg, xupdate=0.05):
        """Update progress."""

        if hasattr(self, "progress") and random.random() < xupdate:
            newstep = inputfile.pos
            if newstep != self.progress.step:
                self.progress.update(newstep, msg)
                self.progress.step = newstep

    @abstractmethod
    def normalisesym(self, symlabel):
        """Standardise the symmetry labels between parsers."""
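        # For example, a subclass might map a package-specific label such as
        # "A1G" or "a.1g" onto a common form like "A1g" (illustrative only;
        # the exact mapping is defined by each parser).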

    def new_internal_job(self):
        """Delete attributes that can be problematic in multistep jobs.

        TODO: instead of this hack, parse each job in a multistep computation
        as a different ccData object (this is for 2.x).

        Some computations are actually sequences of several jobs, and some
        attributes won't work well if parsed across jobs. These include:
            mpenergies: if different jobs go to different orders then
                        these won't be consistent and can't be converted
                        to an array easily
        """
        for name in ("mpenergies",):
            if hasattr(self, name):
                delattr(self, name)

    def set_attribute(self, name, value, check_change=True):
        """Set an attribute and perform an optional check when it already exists.

        Note that this can be used for scalars and lists alike, whenever we want
        to set a value for an attribute.

        Parameters
        ----------
        name: str
            The name of the attribute.
        value: object
            The value for the attribute.
        check_change: bool
            By default we want to check that the value does not change
            if the attribute already exists.
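
        Example (illustrative; the value is hypothetical):
            self.set_attribute("natom", 3)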
        """
        if check_change and hasattr(self, name):
            try:
                numpy.testing.assert_equal(getattr(self, name), value)
            except AssertionError:
                self.logger.warning("Attribute %s changed value (%s -> %s)" % (name, getattr(self, name), value))

        setattr(self, name, value)

    def append_attribute(self, name, value):
        """Append a value to an attribute, creating it first if needed."""

        if not hasattr(self, name):
            self.set_attribute(name, [])
        getattr(self, name).append(value)

    def extend_attribute(self, name, values):
        """Extend an attribute with an iterable of values, creating it first if needed."""

        if not hasattr(self, name):
            self.set_attribute(name, [])
        getattr(self, name).extend(values)
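
    # Illustrative use of the two helpers above (the values are hypothetical):
    #   self.append_attribute("scfenergies", -2078.5)
    #   self.extend_attribute("atomnos", [8, 1, 1])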

    def _assign_coreelectrons_to_element(self, element, ncore,
                                         ncore_is_total_count=False):
        """Assign core electrons to all instances of the element.

        It's usually reasonable to do this for all atoms of a given element,
        because mixed usage isn't normally allowed within elements.

        Parameters
        ----------
        element: str
          the chemical element to set coreelectrons for
        ncore: int
          the number of core electrons
        ncore_is_total_count: bool
          whether the ncore argument is the total count, in which case it is
          divided by the number of atoms of this element
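
        Example (illustrative; the element and core count are hypothetical,
        and atomnos must already have been parsed):
            self._assign_coreelectrons_to_element('Pt', 60)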
        """
        atomsymbols = [self.table.element[atomno] for atomno in self.atomnos]
        indices = [i for i, el in enumerate(atomsymbols) if el == element]
        if ncore_is_total_count:
            ncore = ncore // len(indices)

        if not hasattr(self, 'coreelectrons'):
            self.coreelectrons = numpy.zeros(self.natom, 'i')
        self.coreelectrons[indices] = ncore

    def skip_lines(self, inputfile, sequence):
        """Read trivial line types and check they are what they are supposed to be.

        This function will read len(sequence) lines and do certain checks on them,
        when the elements of sequence have the appropriate values. Currently the
        following elements trigger checks:
            'blank' or 'b'      - the line should be blank
            'dashes' or 'd'     - the line should contain only dashes (or spaces)
            'equals' or 'e'     - the line should contain only equal signs (or spaces)
            'stars' or 's'      - the line should contain only stars (or spaces)
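
        Example (illustrative):
            self.skip_lines(inputfile, ['b', 'd', 'b'])
        skips a blank line, a line of dashes and another blank line, and
        returns all three lines in a list.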
        """

        expected_characters = {
            '-': ['dashes', 'd'],
            '=': ['equals', 'e'],
            '*': ['stars', 's'],
        }

        lines = []
        for expected in sequence:

            # Read the line we want to skip.
            line = next(inputfile)

            # Blank lines are perhaps the most common thing we want to check for.
            if expected in ["blank", "b"]:
                try:
                    assert line.strip() == ""
                except AssertionError:
                    frame, fname, lno, funcname, funcline, index = inspect.getouterframes(inspect.currentframe())[1]
                    parser = os.path.basename(fname)
                    msg = "In %s, line %i, line not blank as expected: %s" % (parser, lno, line.strip())
                    self.logger.warning(msg)

            # All cases of homogeneous lines (only dashes, equal signs or stars)
            # can be dealt with by the same code.
            for character, keys in expected_characters.items():
                if expected in keys:
                    try:
                        assert utils.str_contains_only(line.strip(), [character, ' '])
                    except AssertionError:
                        frame, fname, lno, funcname, funcline, index = inspect.getouterframes(inspect.currentframe())[1]
                        parser = os.path.basename(fname)
                        msg = "In %s, line %i, line not all %s as expected: %s" % (parser, lno, keys[0], line.strip())
                        self.logger.warning(msg)
                        continue

            # Save the skipped line, and we will return the whole list.
            lines.append(line)

        return lines

    def skip_line(self, inputfile, expected):
        return self.skip_lines(inputfile, [expected])