# -*- coding: utf-8 -*-

"""
A set of functions useful for customizing bibtex fields.
You can find inspiration from these functions to design yours.
Each of them takes a record and returns the modified record.
"""

import re
import logging

from builtins import str

from bibtexparser.latexenc import latex_to_unicode, string_to_latex, protect_uppercase

logger = logging.getLogger(__name__)

__all__ = ['splitname', 'getnames', 'author', 'editor', 'journal', 'keyword',
           'link', 'page_double_hyphen', 'doi', 'type', 'convert_to_unicode',
           'homogenize_latex_encoding', 'add_plaintext_fields']


class InvalidName(ValueError):
    """Exception raised by :py:func:`customization.splitname` when an invalid name is input.

    """
    pass


def splitname(name, strict_mode=True):
    """
    Break a name into its constituent parts: First, von, Last, and Jr.

    :param string name: a string containing a single name
    :param Boolean strict_mode: whether to use strict mode
    :returns: dictionary of constituent parts
    :raises `customization.InvalidName`: If an invalid name is given and
                                         ``strict_mode = True``.

    In BibTeX, a name can be represented in any of three forms:
        * First von Last
        * von Last, First
        * von Last, Jr, First

    This function attempts to split a given name into its four parts. The
    returned dictionary has keys of ``first``, ``last``, ``von`` and ``jr``.
    Each value is a list of the words making up that part; this may be an empty
    list.  If the input has no non-whitespace characters, a blank dictionary is
    returned.

    It is capable of detecting some errors with the input name. If the
    ``strict_mode`` parameter is ``True``, which is the default, this results in
    a :class:`customization.InvalidName` exception being raised. If it is
    ``False``, the function continues, working around the error as best it can.
    The errors that can be detected are listed below along with the handling
    for non-strict mode:

        * Name finishes with a trailing comma: delete the comma
        * Too many parts (e.g., von Last, Jr, First, Error): merge extra parts
          into First
        * Unterminated opening brace: add closing brace to end of input
        * Unmatched closing brace: add opening brace at start of word
        * Name finishes with a dangling backslash: keep the backslash as a
          literal character

    """
    # Useful references:
    # http://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html#names
    # http://tug.ctan.org/info/bibtex/tamethebeast/ttb_en.pdf

    # Whitespace characters that can separate words.
    whitespace = set(' ~\r\n\t')

    # We'll iterate over the input once, dividing it into a list of words for
    # each comma-separated section. We'll also calculate the case of each word
    # as we work.
    sections = [[]]      # Sections of the name.
    cases = [[]]         # 1 = uppercase, 0 = lowercase, -1 = caseless.
    word = []            # Current word.
    case = -1            # Case of the current word.
    level = 0            # Current brace level.
    bracestart = False   # Will the next character be the first within a brace?
    controlseq = True    # Are we currently processing a control sequence?
    specialchar = None   # Are we currently processing a special character?

    # Using an iterator allows us to deal with escapes in a simple manner.
    nameiter = iter(name)
    for char in nameiter:
        # An escape.
        if char == '\\':
            # A default is supplied so that a backslash at the very end of the
            # input does not leak a bare StopIteration to the caller.
            escaped = next(nameiter, None)
            if escaped is None:
                if strict_mode:
                    raise InvalidName("Trailing backslash at end of name {{{0}}}.".format(name))
                # Nothing follows the backslash: keep it literally and stop.
                word.append(char)
                break

            # BibTeX doesn't allow whitespace escaping. Copy the slash and fall
            # through to the normal case to handle the whitespace.
            if escaped in whitespace:
                word.append(char)
                char = escaped

            else:
                # Is this the first character in a brace?
                if bracestart:
                    bracestart = False
                    controlseq = escaped.isalpha()
                    specialchar = True

                # Can we use it to determine the case?
                elif (case == -1) and escaped.isalpha():
                    if escaped.isupper():
                        case = 1
                    else:
                        case = 0

                # Copy the escape to the current word and go to the next
                # character in the input.
                word.append(char)
                word.append(escaped)
                continue

        # Start of a braced expression.
        if char == '{':
            level += 1
            word.append(char)
            bracestart = True
            controlseq = False
            specialchar = False
            continue

        # All the below cases imply this (and don't test its previous value).
        bracestart = False

        # End of a braced expression.
        if char == '}':
            # Check and reduce the level.
            if level:
                level -= 1
            else:
                if strict_mode:
                    raise InvalidName("Unmatched closing brace in name {{{0}}}.".format(name))
                word.insert(0, '{')

            # Update the state, append the character, and move on.
            controlseq = False
            specialchar = False
            word.append(char)
            continue

        # Inside a braced expression.
        if level:
            # Is this the end of a control sequence?
            if controlseq:
                if not char.isalpha():
                    controlseq = False

            # If it's a special character, can we use it for a case?
            elif specialchar:
                if (case == -1) and char.isalpha():
                    if char.isupper():
                        case = 1
                    else:
                        case = 0

            # Append the character and move on.
            word.append(char)
            continue

        # End of a word.
        # NB. we know we're not in a brace here due to the previous case.
        if char == ',' or char in whitespace:
            # Don't add empty words due to repeated whitespace.
            if word:
                sections[-1].append(''.join(word))
                word = []
                cases[-1].append(case)
                case = -1
                controlseq = False
                specialchar = False

            # End of a section.
            if char == ',':
                if len(sections) < 3:
                    sections.append([])
                    cases.append([])
                elif strict_mode:
                    raise InvalidName("Too many commas in the name {{{0}}}.".format(name))
                # In non-strict mode no new section is opened, so any further
                # words are merged into the last (First) section.
            continue

        # Regular character.
        word.append(char)
        if (case == -1) and char.isalpha():
            if char.isupper():
                case = 1
            else:
                case = 0

    # Unterminated brace?
    if level:
        if strict_mode:
            raise InvalidName("Unterminated opening brace in the name {{{0}}}.".format(name))
        while level:
            word.append('}')
            level -= 1

    # Handle the final word.
    if word:
        sections[-1].append(''.join(word))
        cases[-1].append(case)

    # Get rid of trailing sections.
    if not sections[-1]:
        # Trailing comma?
        if (len(sections) > 1) and strict_mode:
            raise InvalidName("Trailing comma at end of name {{{0}}}.".format(name))
        sections.pop(-1)
        cases.pop(-1)

    # No non-whitespace input.
    if not sections or not any(bool(section) for section in sections):
        return {}

    # Initialise the output dictionary.
    parts = {'first': [], 'last': [], 'von': [], 'jr': []}

    # Form 1: "First von Last"
    if len(sections) == 1:
        p0 = sections[0]

        # One word only: last cannot be empty.
        if len(p0) == 1:
            parts['last'] = p0

        # Two words: must be first and last.
        elif len(p0) == 2:
            parts['first'] = p0[:1]
            parts['last'] = p0[1:]

        # Need to use the cases to figure it out.
        else:
            cases = cases[0]

            # First is the longest sequence of words starting with uppercase
            # that is not the whole string. von is then the longest sequence
            # whose last word starts with lowercase that is not the whole
            # string. Last is the rest. NB., this means last cannot be empty.

            # At least one lowercase letter.
            if 0 in cases:
                # Index from end of list of first and last lowercase word.
                firstl = cases.index(0) - len(cases)
                lastl = -cases[::-1].index(0) - 1
                if lastl == -1:
                    lastl -= 1      # Cannot consume the rest of the string.

                # Pull the parts out.
                parts['first'] = p0[:firstl]
                parts['von'] = p0[firstl:lastl+1]
                parts['last'] = p0[lastl+1:]

            # No lowercase: last is the last word, first is everything else.
            else:
                parts['first'] = p0[:-1]
                parts['last'] = p0[-1:]

    # Form 2 ("von Last, First") or 3 ("von Last, jr, First")
    else:
        # As long as there is content in the first name partition, use it as-is.
        first = sections[-1]
        if first and first[0]:
            parts['first'] = first

        # And again with the jr part.
        if len(sections) == 3:
            jr = sections[-2]
            if jr and jr[0]:
                parts['jr'] = jr

        # Last name cannot be empty; if there is only one word in the first
        # partition, we have to use it for the last name.
        last = sections[0]
        if len(last) == 1:
            parts['last'] = last

        # Have to look at the cases to figure it out.
        else:
            lcases = cases[0]

            # At least one lowercase: von is the longest sequence of whitespace
            # separated words whose last word does not start with an uppercase
            # word, and last is the rest.
            if 0 in lcases:
                split = len(lcases) - lcases[::-1].index(0)
                if split == len(lcases):
                    split = 0            # Last cannot be empty.
                parts['von'] = sections[0][:split]
                parts['last'] = sections[0][split:]

            # All uppercase => all last.
            else:
                parts['last'] = sections[0]

    # Done.
    return parts


def getnames(names):
    """Convert people names as surname, firstnames
    or surname, initials.

    Blank entries are skipped.  A final ``jnr``/``jr``/``junior`` word is
    dropped, and a trailing run of lowercase particles (``ben``, ``van``,
    ``der``, ``de``, ``la``, ``le``) sitting just before the surname is moved
    in front of it, so ``'Jean de la Tour'`` becomes ``'de la Tour, Jean'``.

    :param names: a list of names
    :type names: list
    :returns: list -- Correctly formatted names

    .. Note::
        This function is known to be too simple to handle properly
        the complex rules. We would like to enhance this in forthcoming
        releases.
    """
    suffixes = ('jnr', 'jr', 'junior')
    particles = ('ben', 'van', 'der', 'de', 'la', 'le')

    tidynames = []
    for namestring in names:
        namestring = namestring.strip()
        if not namestring:
            continue
        if ',' in namestring:
            # "Last, First ..." form: everything before the first comma is
            # the surname.
            namesplit = namestring.split(',', 1)
            last = namesplit[0].strip()
            firsts = namesplit[1].split()
        else:
            # "First ... Last" form: the final word is the surname.
            namesplit = namestring.split()
            last = namesplit.pop()
            # Separate glued initials ("F.B." -> "F. B.").
            firsts = [i.replace('.', '. ').strip() for i in namesplit]
        # Skip a generational suffix, but only when another word is available
        # to serve as the surname.
        if last in suffixes and firsts:
            last = firsts.pop()
        # Move every particle directly preceding the surname in front of it.
        # A while loop is used instead of iterating over ``firsts`` so the
        # list is never mutated during iteration.
        while firsts and firsts[-1] in particles:
            last = firsts.pop() + ' ' + last
        tidynames.append(last + ", " + ' '.join(firsts))
    return tidynames


def _split_name_field(value):
    """Split a raw ``author``/``editor`` string on ``" and "`` and tidy each name."""
    return getnames([i.strip() for i in value.replace('\n', ' ').split(" and ")])


def author(record):
    """
    Split author field into a list of "Name, Surname".

    An ``author`` field that is present but empty is removed from the record.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "author" in record:
        if record["author"]:
            record["author"] = _split_name_field(record["author"])
        else:
            del record["author"]
    return record


def editor(record):
    """
    Turn the editor field into a list of dicts, each composed of the editor
    name and an editor ID (the name without comma, blank or period).

    An ``editor`` field that is present but empty is removed from the record.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "editor" in record:
        if record["editor"]:
            record["editor"] = _split_name_field(record["editor"])
            # convert editor to object
            record["editor"] = [{"name": i, "ID": i.replace(',', '').replace(' ', '').replace('.', '')} for i in record["editor"]]
        else:
            del record["editor"]
    return record


def page_double_hyphen(record):
    """
    Separate pages by a double hyphen (--).

    For every known separator found in the ``pages`` field, the field is
    rewritten as the first and last separated pieces joined by ``--``.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "pages" in record:
        # hyphen, non-breaking hyphen, en dash, em dash, hyphen-minus, minus sign
        separators = [u'‐', u'‑', u'–', u'—', u'-', u'−']
        for separator in separators:
            if separator in record["pages"]:
                p = [i.strip().strip(separator) for i in record["pages"].split(separator)]
                record["pages"] = p[0] + '--' + p[-1]
    return record


# NOTE: this function shadows the ``type`` builtin within this module; the
# name is part of the public API (see ``__all__``) and is therefore kept.
def type(record):
    """
    Put the type into lower case.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "type" in record:
        record["type"] = record["type"].lower()
    return record


def journal(record):
    """
    Turn the journal field into a dict composed of the original journal name
    and a journal ID (the name without comma, blank or period).

    An empty ``journal`` field is left untouched.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "journal" in record:
        # switch journal to object
        if record["journal"]:
            record["journal"] = {"name": record["journal"], "ID": record["journal"].replace(',', '').replace(' ', '').replace('.', '')}

    return record


def keyword(record, sep=',|;'):
    """
    Split keyword field into a list.

    Newlines are removed before splitting and each keyword is stripped of
    surrounding whitespace.

    :param record: the record.
    :type record: dict
    :param sep: pattern used for the splitting regexp.
    :type sep: string, optional
    :returns: dict -- the modified record.

    """
    if "keyword" in record:
        record["keyword"] = [i.strip() for i in re.split(sep, record["keyword"].replace('\n', ''))]

    return record


def link(record):
    """
    Turn the link field into a list of link dicts.

    The field is read one line at a time.  Each non-empty line is split on
    runs of spaces into up to three parts stored under the keys ``url``,
    ``anchor`` and ``format``; anything after the third part is ignored.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if "link" in record:
        lines = [i.strip() for i in record["link"].split('\n')]
        record['link'] = []
        for line in lines:
            # Split on runs of spaces so that repeated spaces between the
            # parts do not yield empty anchor/format values.
            parts = re.split(r' +', line)
            linkobj = {"url": parts[0]}
            if len(parts) > 1:
                linkobj["anchor"] = parts[1]
            if len(parts) > 2:
                linkobj["format"] = parts[2]
            if len(linkobj["url"]) > 0:
                record["link"].append(linkobj)

    return record


def _is_doi_link(item):
    """Tell whether an entry of a record's ``link`` list already points to a DOI."""
    if isinstance(item, dict):
        return ('doi' in item                      # explicit "doi" key
                or item.get('anchor') == 'doi'     # entry added by doi()
                or 'doi.org' in str(item.get('url', '')))
    # Non-dict entries: plain containment test.
    return 'doi' in item


def doi(record):
    """
    Add a link to the DOI resolver built from the ``doi`` field.

    Nothing is added when the record's ``link`` list already holds a DOI
    entry, so applying this function repeatedly does not create duplicates.
    A ``doi`` value starting with ``10`` is prefixed with
    ``https://doi.org/``; any other value is used as the URL unchanged.

    NOTE(review): an existing ``link`` field is assumed to be a list (e.g. as
    produced by :py:func:`link`); a raw string would not support ``append``.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.

    """
    if 'doi' in record:
        if 'link' not in record:
            record['link'] = []
        if not any(_is_doi_link(item) for item in record['link']):
            url = record['doi']
            if url.startswith('10'):
                url = 'https://doi.org/' + url
            record['link'].append({"url": url, "anchor": "doi"})
    return record


def convert_to_unicode(record):
    """
    Convert accent from latex to unicode style.

    List and dict values are converted element by element (one level deep);
    every other value is passed to ``latex_to_unicode`` directly.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.
    """
    for val in record:
        if isinstance(record[val], list):
            record[val] = [
                latex_to_unicode(x) for x in record[val]
            ]
        elif isinstance(record[val], dict):
            record[val] = {
                k: latex_to_unicode(v) for k, v in record[val].items()
            }
        else:
            record[val] = latex_to_unicode(record[val])
    return record


def homogenize_latex_encoding(record):
    """
    Homogenize the latex encoding style for bibtex

    This function is experimental.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.
    """
    # First, we convert everything to unicode
    record = convert_to_unicode(record)
    # And then, we fall back
    for val in record:
        # The entry ID is left as-is.
        if val not in ('ID',):
            logger.debug('Apply string_to_latex to: %s', val)
            record[val] = string_to_latex(record[val])
            if val == 'title':
                logger.debug('Protect uppercase in title')
                logger.debug('Before: %s', record[val])
                record[val] = protect_uppercase(record[val])
                logger.debug('After: %s', record[val])
    return record


def _strip_braces(value):
    """Return *value* with ``{`` and ``}`` removed from every string it contains.

    Dicts and lists are rebuilt recursively; any other non-string value is
    returned unchanged.
    """
    if isinstance(value, str):
        for stripped in ['{', '}']:
            value = value.replace(stripped, "")
        return value
    if isinstance(value, dict):
        return {subkey: _strip_braces(item) for subkey, item in value.items()}
    if isinstance(value, list):
        return [_strip_braces(item) for item in value]
    return value


def add_plaintext_fields(record):
    """
    For each field in the record, add a `plain_` field containing the
    plaintext, stripped from braces and similar. See
    https://github.com/sciunto-org/python-bibtexparser/issues/116.

    Nested values (e.g. the list of dicts produced by :py:func:`editor`) are
    handled recursively.

    :param record: the record.
    :type record: dict
    :returns: dict -- the modified record.
    """
    # Iterate over a snapshot of the keys: the record grows inside the loop.
    for key in list(record.keys()):
        plain_key = "plain_{}".format(key)
        record[plain_key] = _strip_braces(record[key])

    return record