1 /*
2  * BibTeX Converter
3  * Copyright (C) 2010-2021 by Thomas Dreibholz
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9 
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Contact: dreibh@iem.uni-due.de
19  */
20 
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <set>
25 
26 #include "unification.h"
27 #include "stringhandling.h"
28 
29 
30 // ###### Extract initials from given name(s) ###############################
// Builds the abbreviated form of a given-name string: the first character of
// every word followed by '.', with consecutive initials joined by '~'
// (e.g. "Thomas Michael" -> "T.~M.").
//
// givenNameFull:     given name(s); words are separated by ' ' or '~'.
// givenNameInitials: output; any previous content is discarded.
//
// When built with USE_UTF8, a 2-, 3- or 4-byte UTF-8 sequence at the start of
// a word is copied as a whole. A byte that does not start a complete sequence
// contributes no character, but the '.' is still written.
static void extractAuthorInitials(const std::string& givenNameFull,
                                  std::string&       givenNameInitials)
{
   const size_t total       = givenNameFull.size();
   bool         atWordStart = true;
   size_t       index       = 0;

   givenNameInitials.clear();
   while(index < total) {
      const unsigned char lead = static_cast<unsigned char>(givenNameFull[index]);

      // ====== Separator: the next non-separator byte starts a new word ====
      if( (lead == ' ') || (lead == '~') ) {
         atWordStart = true;
         index++;
         continue;
      }

      // ====== Inside a word whose initial has already been taken ==========
      if(!atWordStart) {
         index++;
         continue;
      }

      // ====== First character of a word: append its initial ===============
      if(!givenNameInitials.empty()) {
         givenNameInitials += '~';
      }

      size_t bytes = 1;   // Number of bytes that make up the initial
#ifdef USE_UTF8
      if( ((lead & 0xE0) == 0xC0) && (index + 1 < total) ) {
         bytes = 2;   // Two-byte UTF-8 character
      }
      else if( ((lead & 0xF0) == 0xE0) && (index + 2 < total) ) {
         bytes = 3;   // Three-byte UTF-8 character
      }
      else if( ((lead & 0xF8) == 0xF0) && (index + 3 < total) ) {
         bytes = 4;   // Four-byte UTF-8 character
      }
      else if((lead & 0x80) != 0) {
         bytes = 0;   // Invalid: nothing is copied
      }
#endif
      givenNameInitials.append(givenNameFull, index, bytes);
      givenNameInitials += '.';
      atWordStart = false;

      // An invalid byte is still consumed, so the scan always moves forward.
      index += (bytes > 0) ? bytes : 1;
   }
}
84 
85 
86 // ###### Split author name into its parts ##################################
splitAuthor(std::string & author,std::string & givenNameFull,std::string & givenNameInitials,std::string & familyName)87 static void splitAuthor(std::string& author,
88                         std::string& givenNameFull,
89                         std::string& givenNameInitials,
90                         std::string& familyName)
91 {
92    size_t pos;
93 
94    // Clean up author string first.
95    trim(author);
96    while( (pos = author.find("~")) != std::string::npos ) {
97       author.replace(pos, 1, " ");
98    }
99 
100    if( author[0] == '{') {   // Special name in brackets, e.g. "{R Development Core Team}".
101       familyName    = author;
102       givenNameFull = givenNameInitials = "";
103    }
104    else if( (pos = author.find(",")) != std::string::npos ) {   // Name, Given Name(s)
105       givenNameFull = author.substr(pos + 1, author.size() - pos - 1);
106       familyName    = author.substr(0, pos);
107       extractAuthorInitials(givenNameFull, givenNameInitials);
108    }
109    else {   // Given Name(s) + Family Name
110       pos = author.rfind(" ");
111       if(pos == std::string::npos) {
112          pos = author.rfind("~");
113       }
114       if(pos == std::string::npos) {   // Family Name only
115          familyName     = author;
116          givenNameFull = givenNameInitials = "";
117       }
118       else {   // Given Name(s) + Family Name
119          familyName    = author.substr(pos + 1, author.size() - pos - 1);
120          givenNameFull = author.substr(0, pos);
121          extractAuthorInitials(givenNameFull, givenNameInitials);
122       }
123       if(givenNameFull.rfind(".") != std::string::npos) {   // Given first name + initial
124          // Replace spaces by non-breakable spaces.
125          while( (pos = givenNameFull.find(" ")) != std::string::npos ) {
126             givenNameFull.replace(pos, 1, "~");
127          }
128       }
129    }
130    trim(givenNameFull);
131    trim(familyName);
132 
133 /*
134    printf("\t-> %s:\tA=<%s>\t->\tI=<%s> G=<%s> F=<%s>\n", author.c_str(),
135           author.c_str(),
136           givenNameInitials.c_str(), givenNameFull.c_str(), familyName.c_str());
137 */
138 
139    if(givenNameFull != "") {
140       if(givenNameFull == givenNameInitials) {   // Given name == initials
141          author = givenNameInitials + "~" + familyName;
142       }
143       else {
144          author = givenNameFull + " " + familyName;
145       }
146       // printf("\t\t=> A=<%s>\n", author.c_str());
147    }
148    else {
149       author = familyName;
150    }
151 }
152 
153 
154 // ###### Unify "author" section ############################################
unifyAuthor(Node * publication,Node * author)155 void unifyAuthor(Node* publication, Node* author)
156 {
157    std::string currentAuthor;
158    std::string givenNameFull;
159    std::string givenNameInitials;
160    std::string familyName;
161 
162    size_t argumentsIndex = 0;
163    author->arguments.clear();
164 
165    // ====== Iterator from author 1 to author n-1 (for n authors) ===========
166    std::string allAuthors = author->value;
167    bool        empty      = true;
168    size_t      pos;
169    author->value = "";
170    while( (pos = allAuthors.find(" and ")) != std::string::npos ) {
171       currentAuthor = allAuthors.substr(0, pos);
172 
173       // ====== Extract current author ======================================
174       splitAuthor(currentAuthor, givenNameFull, givenNameInitials, familyName);
175       author->value += ((!empty) ? " and " : "") + currentAuthor;
176       empty = false;
177 
178       // ====== Store extracted name strings into Node's arguments vector ===
179       author->arguments.resize(argumentsIndex + 3);
180       author->arguments[argumentsIndex++] = familyName;
181       author->arguments[argumentsIndex++] = givenNameFull;
182       author->arguments[argumentsIndex++] = givenNameInitials;
183 
184       pos += 5;
185       allAuthors = allAuthors.substr(pos, allAuthors.size() - pos);
186    }
187 
188    // ====== Extract last author ============================================
189    splitAuthor(allAuthors, givenNameFull, givenNameInitials, familyName);
190    author->value += ((!empty) ? " and " : "") + allAuthors;
191 
192    // ====== Store extracted name strings into Node's arguments vector ======
193    author->arguments.resize(argumentsIndex + 3);
194    author->arguments[argumentsIndex++] = familyName;
195    author->arguments[argumentsIndex++] = givenNameFull;
196    author->arguments[argumentsIndex++] = givenNameInitials;
197 }
198 
199 
200 // ###### Unify "booktitle" section #########################################
unifyBookTitle(Node * publication,Node * booktitle)201 void unifyBookTitle(Node* publication, Node* booktitle)
202 {
203    size_t pos;
204    while( (pos = booktitle->value.find(" (")) != std::string::npos ) {
205       booktitle->value.replace(pos, 1, "~");
206    }
207 }
208 
209 
210 // ###### Unify "isbn" section ##############################################
unifyISBN(Node * publication,Node * isbn)211 void unifyISBN(Node* publication, Node* isbn)
212 {
213    // ====== Get pure number ================================================
214    std::string number = "";
215    size_t      length = isbn->value.size();
216    for(size_t i = 0; i < length; i++) {
217 #ifdef USE_UTF8
218       if((isbn->value[i] < 0) && (i + 1 < length)) {
219          i++;
220       }
221       else
222 #endif
223       if( ((isbn->value[i] >= '0') &&
224            (isbn->value[i] <= '9')) ||
225            ((isbn->value[i] == 'X') && (i == length - 1)) ) {
226          number += isbn->value[i];
227       }
228       else if(isbn->value[i] == '-') {
229 
230       }
231       else {
232          fprintf(stderr, "WARNING: Entry %s has invalid characters in \"isbn\" section (isbn=%s)!\n" ,
233                  publication->keyword.c_str(), isbn->value.c_str());
234          return;
235       }
236    }
237 
238    // ====== Validate =======================================================
239    if(number.size() == 10) {
240       unsigned int checksum = 0;
241       for(size_t i = 0; i < 9; i++) {
242          checksum += (10 - i) * ((number[i] == 'X') ? 10 : (number[i] - '0'));
243       }
244       checksum = 11 - checksum % 11;
245       if(checksum == 11) {
246          checksum = 0;
247       }
248       char value = ((checksum < 10) ? ((char)checksum + '0') : 'X');
249 
250       if(value != number[9]) {
251          fprintf(stderr, "WARNING: Entry %s has invalid ISBN-10 in \"isbn\" section (isbn=%s; checksum=%c)\n" ,
252                  publication->keyword.c_str(), isbn->value.c_str(), value);
253       }
254    }
255    else if(number.size() == 13) {
256       unsigned int checksum = 10 - (
257          (number[0] - '0') +
258          3 * (number[1] - '0') +
259          (number[2] - '0') +
260          3 * (number[3] - '0') +
261          (number[4] - '0') +
262          3 * (number[5] - '0') +
263          (number[6] - '0') +
264          3 * (number[7] - '0') +
265          (number[8] - '0') +
266          3 * (number[9] - '0') +
267          (number[10] - '0') +
268          3 * (number[11] - '0')) % 10;
269       if(checksum == 10) {
270          checksum = 0;
271       }
272       char value = (char)checksum + '0';
273 
274       if(value != number[12]) {
275          fprintf(stderr, "WARNING: Entry %s has invalid ISBN-13 in \"isbn\" section (isbn=%s; checksum=%c)\n" ,
276                  publication->keyword.c_str(), isbn->value.c_str(), value);
277       }
278    }
279    else {
280       fprintf(stderr, "WARNING: Entry %s has no ISBN-10 or ISBN-13 in \"isbn\" section (isbn=%s -> %s)\n" ,
281               publication->keyword.c_str(), isbn->value.c_str(), number.c_str());
282       return;
283    }
284 }
285 
286 
287 // ###### Unify "issn" section ##############################################
unifyISSN(Node * publication,Node * issn)288 void unifyISSN(Node* publication, Node* issn)
289 {
290    // ====== Get pure number ================================================
291    std::string number = "";
292    size_t      length = issn->value.size();
293    for(size_t i = 0; i < length; i++) {
294 #ifdef USE_UTF8
295       if((issn->value[i] < 0) && (i + 1 < length)) {
296          i++;
297       }
298       else
299 #endif
300       if( ((issn->value[i] >= '0') &&
301            (issn->value[i] <= '9')) ||
302            ((issn->value[i] == 'X') && (i == issn->value.size() - 1)) ) {
303          number += issn->value[i];
304       }
305       else if(issn->value[i] == '-') {
306 
307       }
308       else {
309          fprintf(stderr, "WARNING: Entry %s has invalid characters in \"issn\" section (issn=%s)!\n" ,
310                  publication->keyword.c_str(), issn->value.c_str());
311          return;
312       }
313    }
314 
315    // ====== Validate =======================================================
316    if(number.size() == 8) {
317       unsigned int checksum = 0;
318       for(size_t i = 0; i < 7; i++) {
319          checksum += (8 - i) * ((number[i] == 'X') ? 10 : (number[i] - '0'));
320 
321       }
322       checksum = 11 - checksum % 11;
323       if(checksum == 11) {
324          checksum = 0;
325       }
326       char value = ((checksum < 10) ? ((char)checksum + '0') : 'X');
327 
328       if(value != number[7]) {
329          fprintf(stderr, "WARNING: Entry %s has invalid ISSN-10 in \"issn\" section (issn=%s; checksum=%c)\n" ,
330                  publication->keyword.c_str(), issn->value.c_str(), value);
331       }
332    }
333    else {
334       fprintf(stderr, "WARNING: Entry %s has no ISSN in \"issn\" section (issn=%s -> %s)\n" ,
335               publication->keyword.c_str(), issn->value.c_str(), number.c_str());
336       return;
337    }
338 }
339 
340 
341 // ###### Unify "year"/"month"/"day" sections ###############################
unifyDate(Node * publication,Node * year,Node * month,Node * day)342 void unifyDate(Node* publication, Node* year, Node* month, Node* day)
343 {
344    int yearNumber = 1;
345    if(year != NULL) {
346       yearNumber = atol(year->value.c_str());
347       if((yearNumber < 1700) || (yearNumber > 2030)) {
348          fprintf(stderr, "WARNING: Entry %s has probably invalid \"year\" section (year=%d?)!\n" ,
349                  publication->keyword.c_str(), yearNumber);
350       }
351       year->number = yearNumber;
352       year->value  = format("%04d", yearNumber);
353    }
354    else {
355       fprintf(stderr, "WARNING: Entry %s has no \"year\" section, but \"month\" or \"day\"!\n" ,
356               publication->keyword.c_str());
357    }
358 
359    int monthNumber = 0;
360    int maxDays     = 0;
361    if(month != NULL) {
362       if(month->value == "jan") {
363          monthNumber = 1;   maxDays = 31;
364       }
365       else if(month->value == "feb") {
366          monthNumber = 2;
367          if( ((yearNumber % 4) == 0) &&
368              ( ((yearNumber % 100) != 0) ||
369                ((yearNumber % 400) == 0) ) ) {
370             maxDays = 29;
371          }
372          else {
373             maxDays = 28;
374          }
375       }
376       else if(month->value == "mar") {
377          monthNumber = 3;   maxDays = 31;
378       }
379       else if(month->value == "apr") {
380          monthNumber = 4;   maxDays = 30;
381       }
382       else if(month->value == "may") {
383          monthNumber = 5;   maxDays = 31;
384       }
385       else if(month->value == "jun") {
386          monthNumber = 6;   maxDays = 30;
387       }
388       else if(month->value == "jul") {
389          monthNumber = 7;   maxDays = 30;
390       }
391       else if(month->value == "aug") {
392          monthNumber = 8;   maxDays = 31;
393       }
394       else if(month->value == "sep") {
395          monthNumber = 9;   maxDays = 30;
396       }
397       else if(month->value == "oct") {
398          monthNumber = 10;   maxDays = 31;
399       }
400       else if(month->value == "nov") {
401          monthNumber = 11;   maxDays = 30;
402       }
403       else if(month->value == "dec") {
404          monthNumber = 12;   maxDays = 31;
405       }
406       else {
407          fprintf(stderr, "WARNING: Entry %s has probably invalid \"month\" section (month=%s?)!\n" ,
408                  publication->keyword.c_str(), month->value.c_str());
409       }
410       month->number = monthNumber;
411       month->value  = format("%02d", monthNumber);
412    }
413 
414    if(day != NULL) {
415       day->number = atol(day->value.c_str());
416       if(month == NULL) {
417          fprintf(stderr, "WARNING: Entry %s has no \"month\" section, but \"day\"!\n" ,
418                  publication->keyword.c_str());
419       }
420       else {
421          if((day->number < 1) || (day->number > maxDays)) {
422             fprintf(stderr, "WARNING: Entry %s has invalid \"day\" or \"month\" section (year=%d month=%d day=%d)!\n" ,
423                     publication->keyword.c_str(), yearNumber, monthNumber, day->number);
424          }
425       }
426       day->value = format("%04d", day->number);
427    }
428 }
429 
430 
431 // ###### Unify "url" section ###############################################
unifyURL(Node * publication,Node * url)432 void unifyURL(Node* publication, Node* url)
433 {
434    // ====== Remove deprecated \url{...} ====================================
435    if( (url->value.substr(0, 5) == "\\url{") &&
436        (url->value.substr(url->value.size() - 1) == "}") ) {
437       url->value = url->value.substr(5, url->value.size() - 6);
438    }
439 
440    // ====== Fix IEEExplore URLs ============================================
441    if( (url->value.substr(0, 27) == "http://ieeexplore.ieee.org/") ||
442        (url->value.substr(0, 28) == "https://ieeexplore.ieee.org/") ) {
443       const size_t is = url->value.find("&isnumber=");
444       if(is == std::string::npos) {   // URL would otherwise not point to PDF download!
445          url->value += "&isnumber=";
446       }
447    }
448 
449    url->value = laTeXtoURL(url->value);
450 }
451 
452 
453 // ###### Unify "pages" section #############################################
unifyPages(Node * publication,Node * pages)454 void unifyPages(Node* publication, Node* pages)
455 {
456    // ====== Get pure numbers ===============================================
457    std::string numbers = "";
458    size_t      length  = pages->value.size();
459    for(size_t i = 0; i < length; i++) {
460 #ifdef USE_UTF8
461       if((pages->value[i] < 0) && (i + 1 < length)) {
462          i++;
463       }
464       else
465 #endif
466       if( (pages->value[i] >= '0') &&
467           (pages->value[i] <= '9') ) {
468          numbers += pages->value[i];
469       }
470       else if(pages->value[i] == '-') {
471          numbers += ' ';
472       }
473       else {
474          fprintf(stderr, "WARNING: Entry %s has invalid characters in \"pages\" section (pages=%s)!\n" ,
475                  publication->keyword.c_str(), pages->value.c_str());
476          return;
477       }
478    }
479 
480    unsigned int a;
481    unsigned int b;
482    if(sscanf(numbers.c_str(), "%u %u", &a, &b) != 2) {
483       if(sscanf(numbers.c_str(), "%u", &a) != 1) {
484          a = b = 0;
485       }
486       else {
487          b = a;
488       }
489    }
490    if((a != 0) && (a <= b)) {
491       char pagesString[64];
492       if(a != b) {
493          snprintf((char*)&pagesString, sizeof(pagesString), "%u--%u", a, b);
494       }
495       else {
496          snprintf((char*)&pagesString, sizeof(pagesString), "%u", a);
497       }
498       pages->value = pagesString;
499 
500       Node* numpages = findChildNode(publication, "numpages");
501       if(numpages) {
502          unsigned int n = atol(numpages->value.c_str());
503          if(n != 1 + (b - a)) {
504             fprintf(stderr, "WARNING: Entry %s has inconsistent invalid page numbers and number of pages (pages=%s; numpages=%s)!\n" ,
505                     publication->keyword.c_str(), pages->value.c_str(), numpages->value.c_str());
506          }
507       }
508       addOrUpdateChildNode(publication, "numpages", format("%u", 1 + (b - a)).c_str());
509    }
510    else {
511       fprintf(stderr, "WARNING: Entry %s has possibly invalid page numbers in \"pages\" section (pages=%s)!\n" ,
512               publication->keyword.c_str(), pages->value.c_str());
513    }
514 }
515 
516 
517 // ###### Unify "numpages" section #############################################
unifyNumPages(Node * publication,Node * numpages)518 void unifyNumPages(Node* publication, Node* numpages)
519 {
520    const unsigned int numberOfPages = atol(numpages->value.c_str());
521    if( (numberOfPages < 1) || (numberOfPages >= 999999) ) {
522       fprintf(stderr, "WARNING: Entry %s has invalid page of numbers in \"numpages\" section (numpages=%s)!\n" ,
523               publication->keyword.c_str(), numpages->value.c_str());
524    }
525    numpages->value = format("%u", numberOfPages);
526 }
527