1# -*- coding: utf-8 -*-
2
3"""
4A set of functions useful for customizing bibtex fields.
5You can find inspiration from these functions to design yours.
Most of them take a record and return the modified record.
7"""
8
9import re
10import logging
11
12from builtins import str
13
14from bibtexparser.latexenc import latex_to_unicode, string_to_latex, protect_uppercase
15
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Names exported by a star-import of this module.
__all__ = ['splitname', 'getnames', 'author', 'editor', 'journal', 'keyword',
           'link', 'page_double_hyphen', 'doi', 'type', 'convert_to_unicode',
           'homogenize_latex_encoding', 'add_plaintext_fields']
21
22
class InvalidName(ValueError):
    """Signal that a name string could not be parsed.

    Raised by :py:func:`customization.splitname` when it is given an invalid
    name while running in strict mode.
    """
28
29
def splitname(name, strict_mode=True):
    """
    Break a name into its constituent parts: First, von, Last, and Jr.

    :param string name: a string containing a single name
    :param Boolean strict_mode: whether to use strict mode
    :returns: dictionary of constituent parts
    :raises `customization.InvalidName`: If an invalid name is given and
                                         ``strict_mode = True``.

    In BibTeX, a name can be represented in any of three forms:
        * First von Last
        * von Last, First
        * von Last, Jr, First

    This function attempts to split a given name into its four parts. The
    returned dictionary has keys of ``first``, ``last``, ``von`` and ``jr``.
    Each value is a list of the words making up that part; this may be an empty
    list.  If the input has no non-whitespace characters, a blank dictionary is
    returned.

    It is capable of detecting some errors with the input name. If the
    ``strict_mode`` parameter is ``True``, which is the default, this results in
    a :class:`customization.InvalidName` exception being raised. If it is
    ``False``, the function continues, working around the error as best it can.
    The errors that can be detected are listed below along with the handling
    for non-strict mode:

        * Name finishes with a trailing comma: delete the comma
        * Too many parts (e.g., von Last, Jr, First, Error): merge extra parts
          into First
        * Unterminated opening brace: add closing brace to end of input
        * Unmatched closing brace: add opening brace at start of word

    A lone backslash at the very end of the input (an escape with nothing
    after it) is not treated as an error in either mode: it is kept literally
    as the last character of the final word.

    """
    # Useful references:
    # http://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html#names
    # http://tug.ctan.org/info/bibtex/tamethebeast/ttb_en.pdf

    # Whitespace characters that can separate words.
    whitespace = set(' ~\r\n\t')

    # We'll iterate over the input once, dividing it into a list of words for
    # each comma-separated section. We'll also calculate the case of each word
    # as we work.
    sections = [[]]      # Sections of the name.
    cases = [[]]         # 1 = uppercase, 0 = lowercase, -1 = caseless.
    word = []            # Current word.
    case = -1            # Case of the current word.
    level = 0            # Current brace level.
    bracestart = False   # Will the next character be the first within a brace?
    controlseq = False   # Are we currently processing a control sequence?
    specialchar = False  # Are we currently processing a special character?

    # Using an iterator allows us to deal with escapes in a simple manner.
    nameiter = iter(name)
    for char in nameiter:
        # An escape.
        if char == '\\':
            # A default is supplied so that a backslash ending the input does
            # not leak a StopIteration out of this function.
            escaped = next(nameiter, None)

            # Nothing follows the backslash: keep it literally and stop, as
            # the input is exhausted.
            if escaped is None:
                word.append(char)
                break

            # BibTeX doesn't allow whitespace escaping. Copy the slash and fall
            # through to the normal case to handle the whitespace.
            if escaped in whitespace:
                word.append(char)
                char = escaped

            else:
                # Is this the first character in a brace?
                if bracestart:
                    bracestart = False
                    controlseq = escaped.isalpha()
                    specialchar = True

                # Can we use it to determine the case?
                elif (case == -1) and escaped.isalpha():
                    if escaped.isupper():
                        case = 1
                    else:
                        case = 0

                # Copy the escape to the current word and go to the next
                # character in the input.
                word.append(char)
                word.append(escaped)
                continue

        # Start of a braced expression.
        if char == '{':
            level += 1
            word.append(char)
            bracestart = True
            controlseq = False
            specialchar = False
            continue

        # All the below cases imply this (and don't test its previous value).
        bracestart = False

        # End of a braced expression.
        if char == '}':
            # Check and reduce the level.
            if level:
                level -= 1
            else:
                if strict_mode:
                    raise InvalidName("Unmatched closing brace in name {{{0}}}.".format(name))
                word.insert(0, '{')

            # Update the state, append the character, and move on.
            controlseq = False
            specialchar = False
            word.append(char)
            continue

        # Inside a braced expression.
        if level:
            # Is this the end of a control sequence?
            if controlseq:
                if not char.isalpha():
                    controlseq = False

            # If it's a special character, can we use it for a case?
            elif specialchar:
                if (case == -1) and char.isalpha():
                    if char.isupper():
                        case = 1
                    else:
                        case = 0

            # Append the character and move on.
            word.append(char)
            continue

        # End of a word.
        # NB. we know we're not in a brace here due to the previous case.
        if char == ',' or char in whitespace:
            # Don't add empty words due to repeated whitespace.
            if word:
                sections[-1].append(''.join(word))
                word = []
                cases[-1].append(case)
                case = -1
                controlseq = False
                specialchar = False

            # End of a section.
            if char == ',':
                if len(sections) < 3:
                    sections.append([])
                    cases.append([])
                elif strict_mode:
                    raise InvalidName("Too many commas in the name {{{0}}}.".format(name))
                # In non-strict mode an extra comma just acts as a word
                # separator, so the extra parts end up merged into First.
            continue

        # Regular character.
        word.append(char)
        if (case == -1) and char.isalpha():
            if char.isupper():
                case = 1
            else:
                case = 0

    # Unterminated brace?
    if level:
        if strict_mode:
            raise InvalidName("Unterminated opening brace in the name {{{0}}}.".format(name))
        while level:
            word.append('}')
            level -= 1

    # Handle the final word.
    if word:
        sections[-1].append(''.join(word))
        cases[-1].append(case)

    # Get rid of trailing sections.
    if not sections[-1]:
        # Trailing comma?
        if (len(sections) > 1) and strict_mode:
            raise InvalidName("Trailing comma at end of name {{{0}}}.".format(name))
        sections.pop(-1)
        cases.pop(-1)

    # No non-whitespace input.
    if not sections or not any(bool(section) for section in sections):
        return {}

    # Initialise the output dictionary.
    parts = {'first': [], 'last': [], 'von': [], 'jr': []}

    # Form 1: "First von Last"
    if len(sections) == 1:
        p0 = sections[0]

        # One word only: last cannot be empty.
        if len(p0) == 1:
            parts['last'] = p0

        # Two words: must be first and last.
        elif len(p0) == 2:
            parts['first'] = p0[:1]
            parts['last'] = p0[1:]

        # Need to use the cases to figure it out.
        else:
            wcases = cases[0]

            # First is the longest sequence of words starting with uppercase
            # that is not the whole string. von is then the longest sequence
            # whose last word starts with lowercase that is not the whole
            # string. Last is the rest. NB., this means last cannot be empty.

            # At least one lowercase letter.
            if 0 in wcases:
                # Index from end of list of first and last lowercase word.
                firstl = wcases.index(0) - len(wcases)
                lastl = -wcases[::-1].index(0) - 1
                if lastl == -1:
                    lastl -= 1      # Cannot consume the rest of the string.

                # Pull the parts out.
                parts['first'] = p0[:firstl]
                parts['von'] = p0[firstl:lastl+1]
                parts['last'] = p0[lastl+1:]

            # No lowercase: last is the last word, first is everything else.
            else:
                parts['first'] = p0[:-1]
                parts['last'] = p0[-1:]

    # Form 2 ("von Last, First") or 3 ("von Last, jr, First")
    else:
        # As long as there is content in the first name partition, use it as-is.
        first = sections[-1]
        if first and first[0]:
            parts['first'] = first

        # And again with the jr part.
        if len(sections) == 3:
            jr = sections[-2]
            if jr and jr[0]:
                parts['jr'] = jr

        # Last name cannot be empty; if there is only one word in the first
        # partition, we have to use it for the last name.
        last = sections[0]
        if len(last) == 1:
            parts['last'] = last

        # Have to look at the cases to figure it out.
        else:
            lcases = cases[0]

            # At least one lowercase: von is the longest sequence of whitespace
            # separated words whose last word does not start with an uppercase
            # word, and last is the rest.
            if 0 in lcases:
                split = len(lcases) - lcases[::-1].index(0)
                if split == len(lcases):
                    split = 0            # Last cannot be empty.
                parts['von'] = sections[0][:split]
                parts['last'] = sections[0][split:]

            # All uppercase => all last.
            else:
                parts['last'] = sections[0]

    # Done.
    return parts
300
301
def getnames(names):
    """Convert people names as surname, firstnames
    or surname, initials.

    Each non-blank input is rewritten as ``"Last, First ..."``. A name
    containing a comma is split at its first comma into last name and first
    names; otherwise the final whitespace-separated word is the last name and
    a space is inserted after every dot of the remaining words. If the last
    name is one of ``jnr``/``jr``/``junior`` it is discarded and the preceding
    word is used instead (when there is one). Any run of lowercase particles
    (``ben``, ``van``, ``der``, ``de``, ``la``, ``le``) at the end of the first
    names is moved, in order, to the front of the last name.

    :param names: a list of names
    :type names: list
    :returns: list -- Correctly formated names; blank inputs are skipped

    .. Note::
        This function is known to be too simple to handle properly
        the complex rules. We would like to enhance this in forthcoming
        releases.
    """
    suffixes = ('jnr', 'jr', 'junior')
    particles = ('ben', 'van', 'der', 'de', 'la', 'le')

    tidynames = []
    for namestring in names:
        namestring = namestring.strip()
        if not namestring:
            continue
        if ',' in namestring:
            last, _, rest = namestring.partition(',')
            last = last.strip()
            firsts = rest.split()
        else:
            namesplit = namestring.split()
            last = namesplit.pop()
            firsts = [i.replace('.', '. ').strip() for i in namesplit]
        # Only replace a junior suffix when there is a word to replace it
        # with; a lone "jr" is kept as the last name instead of crashing.
        if last in suffixes and firsts:
            last = firsts.pop()
        # Consume particles from the end one at a time. The list is never
        # modified while being iterated, so multi-particle surnames such as
        # "de la Tour" or "van der Waals" are assembled completely.
        while firsts and firsts[-1] in particles:
            last = firsts.pop() + ' ' + last
        tidynames.append(last + ", " + ' '.join(firsts))
    return tidynames
335
336
def author(record):
    """
    Replace the author field by a list of formatted author names.

    The field is split on ``" and "`` (newlines count as spaces) and the
    pieces are passed through :py:func:`getnames`. An empty author field is
    removed from the record.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "author" not in record:
        return record

    # An empty field carries no information: drop it.
    if not record["author"]:
        del record["author"]
        return record

    pieces = record["author"].replace('\n', ' ').split(" and ")
    record["author"] = getnames([piece.strip() for piece in pieces])
    return record
352
353
def editor(record):
    """
    Replace the editor field by a list of dicts, one per editor.

    Each dict holds the formatted editor name under ``name`` and, under
    ``ID``, the same name with commas, blanks and dots removed. An empty
    editor field is removed from the record.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "editor" not in record:
        return record

    # An empty field carries no information: drop it.
    if not record["editor"]:
        del record["editor"]
        return record

    pieces = record["editor"].replace('\n', ' ').split(" and ")
    entries = []
    for fullname in getnames([piece.strip() for piece in pieces]):
        ident = fullname.replace(',', '').replace(' ', '').replace('.', '')
        entries.append({"name": fullname, "ID": ident})
    record["editor"] = entries
    return record
372
373
def page_double_hyphen(record):
    """
    Normalise the page separator to a double hyphen (--).

    Each known dash character is tried in turn; whenever it occurs in the
    current value of the pages field, the field is rewritten as the first and
    the last dash-delimited piece joined by ``--``.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "pages" not in record:
        return record

    # hyphen, non-breaking hyphen, en dash, em dash, hyphen-minus, minus sign
    for dash in (u'‐', u'‑', u'–', u'—', u'-', u'−'):
        pages = record["pages"]
        if dash not in pages:
            continue
        pieces = [piece.strip().strip(dash) for piece in pages.split(dash)]
        record["pages"] = pieces[0] + '--' + pieces[-1]
    return record
391
392
def type(record):
    """
    Lower-case the type field of the record, if it has one.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "type" not in record:
        return record
    lowered = record["type"].lower()
    record["type"] = lowered
    return record
405
406
def journal(record):
    """
    Replace a non-empty journal field by a dict holding the original journal
    name (``name``) and an identifier (``ID``) which is the name with commas,
    blanks and dots removed. An empty journal field is left untouched.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "journal" in record and record["journal"]:
        title = record["journal"]
        ident = title
        for unwanted in (',', ' ', '.'):
            ident = ident.replace(unwanted, '')
        record["journal"] = {"name": title, "ID": ident}
    return record
423
424
def keyword(record, sep=',|;'):
    """
    Split keyword field into a list.

    A line break inside the field (together with the whitespace around it) is
    treated as a single space, so a keyword wrapped over two lines keeps its
    words separated. Each resulting keyword is stripped of surrounding
    whitespace.

    :param record: the record.
    :type record: dict
    :param sep: pattern used for the splitting regexp.
    :type sep: string, optional
    :returns: dict -- the modified record.

    """
    if "keyword" in record:
        # Replace line breaks by one space instead of deleting them, which
        # would glue the last word of a line to the first word of the next.
        flattened = re.sub(r'\s*\n\s*', ' ', record["keyword"])
        record["keyword"] = [i.strip() for i in re.split(sep, flattened)]

    return record
440
441
def link(record):
    """
    Turn the link field into a list of link dicts.

    The field is read line by line. Each non-blank line is split on
    whitespace (any amount of it): the first word becomes ``url``, the
    second, if present, ``anchor`` and the third, if present, ``format``.
    Further words are ignored and blank lines are skipped.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "link" in record:
        entries = []
        for line in record["link"].split('\n'):
            # Splitting on arbitrary whitespace keeps runs of several blanks
            # from producing empty anchor/format values.
            parts = line.split()
            if not parts:
                continue
            linkobj = {"url": parts[0]}
            if len(parts) > 1:
                linkobj["anchor"] = parts[1]
            if len(parts) > 2:
                linkobj["format"] = parts[2]
            entries.append(linkobj)
        record["link"] = entries

    return record
464
465
def doi(record):
    """
    Add a link entry for the DOI of the record, unless one is already there.

    If the record has a ``doi`` field, a dict ``{"url": ..., "anchor": "doi"}``
    is appended to the ``link`` list (created when missing). A DOI starting
    with ``10`` is prefixed with ``https://doi.org/``; any other value is used
    as the url unchanged. Nothing is appended when the list already holds a
    DOI entry, so applying this function repeatedly does not add duplicates.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if 'doi' not in record:
        return record
    if 'link' not in record:
        record['link'] = []

    def _is_doi_link(item):
        # Historical check, kept for backward compatibility.
        if 'doi' in item:
            return True
        # Entries appended by this function are marked through their anchor,
        # not through a 'doi' key.
        return isinstance(item, dict) and item.get('anchor') == 'doi'

    if not any(_is_doi_link(item) for item in record['link']):
        url = record['doi']
        if url.startswith('10'):
            url = 'https://doi.org/' + url
        record['link'].append({"url": url, "anchor": "doi"})
    return record
487
488
def convert_to_unicode(record):
    """
    Apply ``latex_to_unicode`` to every field of the record.

    List fields are converted item by item and dict fields value by value
    (keys are kept); any other field value is converted directly.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.
    """
    for field in record:
        value = record[field]
        if isinstance(value, list):
            converted = [latex_to_unicode(item) for item in value]
        elif isinstance(value, dict):
            converted = {
                subkey: latex_to_unicode(subvalue)
                for subkey, subvalue in value.items()
            }
        else:
            converted = latex_to_unicode(value)
        record[field] = converted
    return record
509
510
def homogenize_latex_encoding(record):
    """
    Homogenize the latex encoding style for bibtex.

    Every field is first converted to unicode; then every field except
    ``ID`` is passed through ``string_to_latex``, and the ``title`` field is
    additionally passed through ``protect_uppercase``.

    This function is experimental.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.
    """
    # First, we convert everything to unicode
    record = convert_to_unicode(record)
    # And then, we fall back
    for field in record:
        if field in ('ID',):
            continue
        logger.debug('Apply string_to_latex to: %s', field)
        record[field] = string_to_latex(record[field])
        if field != 'title':
            continue
        logger.debug('Protect uppercase in title')
        logger.debug('Before: %s', record[field])
        record[field] = protect_uppercase(record[field])
        logger.debug('After: %s', record[field])
    return record
534
535
def add_plaintext_fields(record):
    """
    For each field in the record, add a `plain_` field containing the
    plaintext, stripped from braces and similar. See
    https://github.com/sciunto-org/python-bibtexparser/issues/116.

    Strings have their ``{`` and ``}`` characters removed. Lists and dicts
    are copied with the same treatment applied recursively to their items
    (dict keys are kept as they are), so nested values such as a list of
    dicts are supported. Values of any other type are copied unchanged. The
    original fields are not modified.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.
    """
    def _strip_braces(value):
        # Recursively remove braces from strings held in lists and dicts.
        if isinstance(value, str):
            for stripped in ['{', '}']:
                value = value.replace(stripped, "")
            return value
        if isinstance(value, dict):
            return {
                subkey: _strip_braces(subvalue)
                for subkey, subvalue in value.items()
            }
        if isinstance(value, list):
            return [_strip_braces(item) for item in value]
        return value

    # Iterate over a snapshot of the keys: the record grows inside the loop.
    for key in list(record.keys()):
        record["plain_{}".format(key)] = _strip_braces(record[key])

    return record
569