1 /*
2 * BibTeX Converter
3 * Copyright (C) 2010-2021 by Thomas Dreibholz
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
17 *
18 * Contact: dreibh@iem.uni-due.de
19 */
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <set>
25
26 #include "unification.h"
27 #include "stringhandling.h"
28
29
30 // ###### Extract initials from given name(s) ###############################
// Builds the initials of a given-name string.
//
// givenNameFull:     given name(s); name parts are separated by ' ' or '~'.
// givenNameInitials: output, overwritten. For every name part, its first
//                    character followed by '.' is appended; the parts are
//                    joined with '~' (e.g. "Thomas Robert" -> "T.~R.").
//
// With USE_UTF8 defined, the first character of a name part may be a UTF-8
// sequence of 1 to 4 bytes, which is copied as a whole. If the lead byte is
// not a valid sequence start, or the sequence would run past the end of the
// string, no character is copied and only the '.' is appended.
static void extractAuthorInitials(const std::string& givenNameFull,
                                  std::string&       givenNameInitials)
{
   givenNameInitials = "";

   const size_t total       = givenNameFull.size();
   bool         atWordStart = true;   // The next non-separator byte starts a name part.

   for(size_t index = 0; index < total; index++) {
      const char current = givenNameFull[index];

      // ====== Separators only arm the extraction of the next initial =======
      if( (current == ' ') || (current == '~') ) {
         atWordStart = true;
         continue;
      }
      // ====== Remaining bytes of an already handled name part ==============
      if(!atWordStart) {
         continue;
      }

      // ====== First character of a name part: emit "<char>." ===============
      if(!givenNameInitials.empty()) {
         givenNameInitials += '~';
      }
#ifdef USE_UTF8
      const unsigned char lead           = static_cast<unsigned char>(current);
      size_t              sequenceLength = 0;   // 0 means: not a valid lead byte
      if((lead & 0x80) == 0x00) {
         sequenceLength = 1;   // 0xxxxxxx: regular 1-byte character
      }
      else if((lead & 0xE0) == 0xC0) {
         sequenceLength = 2;   // 110xxxxx
      }
      else if((lead & 0xF0) == 0xE0) {
         sequenceLength = 3;   // 1110xxxx
      }
      else if((lead & 0xF8) == 0xF0) {
         sequenceLength = 4;   // 11110xxx
      }
      if( (sequenceLength > 0) && (index + sequenceLength <= total) ) {
         givenNameInitials.append(givenNameFull, index, sequenceLength);
         index += sequenceLength - 1;   // The loop increment skips the lead byte.
      }
#else
      givenNameInitials += current;
#endif
      givenNameInitials += '.';
      atWordStart = false;
   }
}
84
85
86 // ###### Split author name into its parts ##################################
splitAuthor(std::string & author,std::string & givenNameFull,std::string & givenNameInitials,std::string & familyName)87 static void splitAuthor(std::string& author,
88 std::string& givenNameFull,
89 std::string& givenNameInitials,
90 std::string& familyName)
91 {
92 size_t pos;
93
94 // Clean up author string first.
95 trim(author);
96 while( (pos = author.find("~")) != std::string::npos ) {
97 author.replace(pos, 1, " ");
98 }
99
100 if( author[0] == '{') { // Special name in brackets, e.g. "{R Development Core Team}".
101 familyName = author;
102 givenNameFull = givenNameInitials = "";
103 }
104 else if( (pos = author.find(",")) != std::string::npos ) { // Name, Given Name(s)
105 givenNameFull = author.substr(pos + 1, author.size() - pos - 1);
106 familyName = author.substr(0, pos);
107 extractAuthorInitials(givenNameFull, givenNameInitials);
108 }
109 else { // Given Name(s) + Family Name
110 pos = author.rfind(" ");
111 if(pos == std::string::npos) {
112 pos = author.rfind("~");
113 }
114 if(pos == std::string::npos) { // Family Name only
115 familyName = author;
116 givenNameFull = givenNameInitials = "";
117 }
118 else { // Given Name(s) + Family Name
119 familyName = author.substr(pos + 1, author.size() - pos - 1);
120 givenNameFull = author.substr(0, pos);
121 extractAuthorInitials(givenNameFull, givenNameInitials);
122 }
123 if(givenNameFull.rfind(".") != std::string::npos) { // Given first name + initial
124 // Replace spaces by non-breakable spaces.
125 while( (pos = givenNameFull.find(" ")) != std::string::npos ) {
126 givenNameFull.replace(pos, 1, "~");
127 }
128 }
129 }
130 trim(givenNameFull);
131 trim(familyName);
132
133 /*
134 printf("\t-> %s:\tA=<%s>\t->\tI=<%s> G=<%s> F=<%s>\n", author.c_str(),
135 author.c_str(),
136 givenNameInitials.c_str(), givenNameFull.c_str(), familyName.c_str());
137 */
138
139 if(givenNameFull != "") {
140 if(givenNameFull == givenNameInitials) { // Given name == initials
141 author = givenNameInitials + "~" + familyName;
142 }
143 else {
144 author = givenNameFull + " " + familyName;
145 }
146 // printf("\t\t=> A=<%s>\n", author.c_str());
147 }
148 else {
149 author = familyName;
150 }
151 }
152
153
154 // ###### Unify "author" section ############################################
unifyAuthor(Node * publication,Node * author)155 void unifyAuthor(Node* publication, Node* author)
156 {
157 std::string currentAuthor;
158 std::string givenNameFull;
159 std::string givenNameInitials;
160 std::string familyName;
161
162 size_t argumentsIndex = 0;
163 author->arguments.clear();
164
165 // ====== Iterator from author 1 to author n-1 (for n authors) ===========
166 std::string allAuthors = author->value;
167 bool empty = true;
168 size_t pos;
169 author->value = "";
170 while( (pos = allAuthors.find(" and ")) != std::string::npos ) {
171 currentAuthor = allAuthors.substr(0, pos);
172
173 // ====== Extract current author ======================================
174 splitAuthor(currentAuthor, givenNameFull, givenNameInitials, familyName);
175 author->value += ((!empty) ? " and " : "") + currentAuthor;
176 empty = false;
177
178 // ====== Store extracted name strings into Node's arguments vector ===
179 author->arguments.resize(argumentsIndex + 3);
180 author->arguments[argumentsIndex++] = familyName;
181 author->arguments[argumentsIndex++] = givenNameFull;
182 author->arguments[argumentsIndex++] = givenNameInitials;
183
184 pos += 5;
185 allAuthors = allAuthors.substr(pos, allAuthors.size() - pos);
186 }
187
188 // ====== Extract last author ============================================
189 splitAuthor(allAuthors, givenNameFull, givenNameInitials, familyName);
190 author->value += ((!empty) ? " and " : "") + allAuthors;
191
192 // ====== Store extracted name strings into Node's arguments vector ======
193 author->arguments.resize(argumentsIndex + 3);
194 author->arguments[argumentsIndex++] = familyName;
195 author->arguments[argumentsIndex++] = givenNameFull;
196 author->arguments[argumentsIndex++] = givenNameInitials;
197 }
198
199
200 // ###### Unify "booktitle" section #########################################
unifyBookTitle(Node * publication,Node * booktitle)201 void unifyBookTitle(Node* publication, Node* booktitle)
202 {
203 size_t pos;
204 while( (pos = booktitle->value.find(" (")) != std::string::npos ) {
205 booktitle->value.replace(pos, 1, "~");
206 }
207 }
208
209
210 // ###### Unify "isbn" section ##############################################
unifyISBN(Node * publication,Node * isbn)211 void unifyISBN(Node* publication, Node* isbn)
212 {
213 // ====== Get pure number ================================================
214 std::string number = "";
215 size_t length = isbn->value.size();
216 for(size_t i = 0; i < length; i++) {
217 #ifdef USE_UTF8
218 if((isbn->value[i] < 0) && (i + 1 < length)) {
219 i++;
220 }
221 else
222 #endif
223 if( ((isbn->value[i] >= '0') &&
224 (isbn->value[i] <= '9')) ||
225 ((isbn->value[i] == 'X') && (i == length - 1)) ) {
226 number += isbn->value[i];
227 }
228 else if(isbn->value[i] == '-') {
229
230 }
231 else {
232 fprintf(stderr, "WARNING: Entry %s has invalid characters in \"isbn\" section (isbn=%s)!\n" ,
233 publication->keyword.c_str(), isbn->value.c_str());
234 return;
235 }
236 }
237
238 // ====== Validate =======================================================
239 if(number.size() == 10) {
240 unsigned int checksum = 0;
241 for(size_t i = 0; i < 9; i++) {
242 checksum += (10 - i) * ((number[i] == 'X') ? 10 : (number[i] - '0'));
243 }
244 checksum = 11 - checksum % 11;
245 if(checksum == 11) {
246 checksum = 0;
247 }
248 char value = ((checksum < 10) ? ((char)checksum + '0') : 'X');
249
250 if(value != number[9]) {
251 fprintf(stderr, "WARNING: Entry %s has invalid ISBN-10 in \"isbn\" section (isbn=%s; checksum=%c)\n" ,
252 publication->keyword.c_str(), isbn->value.c_str(), value);
253 }
254 }
255 else if(number.size() == 13) {
256 unsigned int checksum = 10 - (
257 (number[0] - '0') +
258 3 * (number[1] - '0') +
259 (number[2] - '0') +
260 3 * (number[3] - '0') +
261 (number[4] - '0') +
262 3 * (number[5] - '0') +
263 (number[6] - '0') +
264 3 * (number[7] - '0') +
265 (number[8] - '0') +
266 3 * (number[9] - '0') +
267 (number[10] - '0') +
268 3 * (number[11] - '0')) % 10;
269 if(checksum == 10) {
270 checksum = 0;
271 }
272 char value = (char)checksum + '0';
273
274 if(value != number[12]) {
275 fprintf(stderr, "WARNING: Entry %s has invalid ISBN-13 in \"isbn\" section (isbn=%s; checksum=%c)\n" ,
276 publication->keyword.c_str(), isbn->value.c_str(), value);
277 }
278 }
279 else {
280 fprintf(stderr, "WARNING: Entry %s has no ISBN-10 or ISBN-13 in \"isbn\" section (isbn=%s -> %s)\n" ,
281 publication->keyword.c_str(), isbn->value.c_str(), number.c_str());
282 return;
283 }
284 }
285
286
287 // ###### Unify "issn" section ##############################################
unifyISSN(Node * publication,Node * issn)288 void unifyISSN(Node* publication, Node* issn)
289 {
290 // ====== Get pure number ================================================
291 std::string number = "";
292 size_t length = issn->value.size();
293 for(size_t i = 0; i < length; i++) {
294 #ifdef USE_UTF8
295 if((issn->value[i] < 0) && (i + 1 < length)) {
296 i++;
297 }
298 else
299 #endif
300 if( ((issn->value[i] >= '0') &&
301 (issn->value[i] <= '9')) ||
302 ((issn->value[i] == 'X') && (i == issn->value.size() - 1)) ) {
303 number += issn->value[i];
304 }
305 else if(issn->value[i] == '-') {
306
307 }
308 else {
309 fprintf(stderr, "WARNING: Entry %s has invalid characters in \"issn\" section (issn=%s)!\n" ,
310 publication->keyword.c_str(), issn->value.c_str());
311 return;
312 }
313 }
314
315 // ====== Validate =======================================================
316 if(number.size() == 8) {
317 unsigned int checksum = 0;
318 for(size_t i = 0; i < 7; i++) {
319 checksum += (8 - i) * ((number[i] == 'X') ? 10 : (number[i] - '0'));
320
321 }
322 checksum = 11 - checksum % 11;
323 if(checksum == 11) {
324 checksum = 0;
325 }
326 char value = ((checksum < 10) ? ((char)checksum + '0') : 'X');
327
328 if(value != number[7]) {
329 fprintf(stderr, "WARNING: Entry %s has invalid ISSN-10 in \"issn\" section (issn=%s; checksum=%c)\n" ,
330 publication->keyword.c_str(), issn->value.c_str(), value);
331 }
332 }
333 else {
334 fprintf(stderr, "WARNING: Entry %s has no ISSN in \"issn\" section (issn=%s -> %s)\n" ,
335 publication->keyword.c_str(), issn->value.c_str(), number.c_str());
336 return;
337 }
338 }
339
340
341 // ###### Unify "year"/"month"/"day" sections ###############################
unifyDate(Node * publication,Node * year,Node * month,Node * day)342 void unifyDate(Node* publication, Node* year, Node* month, Node* day)
343 {
344 int yearNumber = 1;
345 if(year != NULL) {
346 yearNumber = atol(year->value.c_str());
347 if((yearNumber < 1700) || (yearNumber > 2030)) {
348 fprintf(stderr, "WARNING: Entry %s has probably invalid \"year\" section (year=%d?)!\n" ,
349 publication->keyword.c_str(), yearNumber);
350 }
351 year->number = yearNumber;
352 year->value = format("%04d", yearNumber);
353 }
354 else {
355 fprintf(stderr, "WARNING: Entry %s has no \"year\" section, but \"month\" or \"day\"!\n" ,
356 publication->keyword.c_str());
357 }
358
359 int monthNumber = 0;
360 int maxDays = 0;
361 if(month != NULL) {
362 if(month->value == "jan") {
363 monthNumber = 1; maxDays = 31;
364 }
365 else if(month->value == "feb") {
366 monthNumber = 2;
367 if( ((yearNumber % 4) == 0) &&
368 ( ((yearNumber % 100) != 0) ||
369 ((yearNumber % 400) == 0) ) ) {
370 maxDays = 29;
371 }
372 else {
373 maxDays = 28;
374 }
375 }
376 else if(month->value == "mar") {
377 monthNumber = 3; maxDays = 31;
378 }
379 else if(month->value == "apr") {
380 monthNumber = 4; maxDays = 30;
381 }
382 else if(month->value == "may") {
383 monthNumber = 5; maxDays = 31;
384 }
385 else if(month->value == "jun") {
386 monthNumber = 6; maxDays = 30;
387 }
388 else if(month->value == "jul") {
389 monthNumber = 7; maxDays = 30;
390 }
391 else if(month->value == "aug") {
392 monthNumber = 8; maxDays = 31;
393 }
394 else if(month->value == "sep") {
395 monthNumber = 9; maxDays = 30;
396 }
397 else if(month->value == "oct") {
398 monthNumber = 10; maxDays = 31;
399 }
400 else if(month->value == "nov") {
401 monthNumber = 11; maxDays = 30;
402 }
403 else if(month->value == "dec") {
404 monthNumber = 12; maxDays = 31;
405 }
406 else {
407 fprintf(stderr, "WARNING: Entry %s has probably invalid \"month\" section (month=%s?)!\n" ,
408 publication->keyword.c_str(), month->value.c_str());
409 }
410 month->number = monthNumber;
411 month->value = format("%02d", monthNumber);
412 }
413
414 if(day != NULL) {
415 day->number = atol(day->value.c_str());
416 if(month == NULL) {
417 fprintf(stderr, "WARNING: Entry %s has no \"month\" section, but \"day\"!\n" ,
418 publication->keyword.c_str());
419 }
420 else {
421 if((day->number < 1) || (day->number > maxDays)) {
422 fprintf(stderr, "WARNING: Entry %s has invalid \"day\" or \"month\" section (year=%d month=%d day=%d)!\n" ,
423 publication->keyword.c_str(), yearNumber, monthNumber, day->number);
424 }
425 }
426 day->value = format("%04d", day->number);
427 }
428 }
429
430
431 // ###### Unify "url" section ###############################################
unifyURL(Node * publication,Node * url)432 void unifyURL(Node* publication, Node* url)
433 {
434 // ====== Remove deprecated \url{...} ====================================
435 if( (url->value.substr(0, 5) == "\\url{") &&
436 (url->value.substr(url->value.size() - 1) == "}") ) {
437 url->value = url->value.substr(5, url->value.size() - 6);
438 }
439
440 // ====== Fix IEEExplore URLs ============================================
441 if( (url->value.substr(0, 27) == "http://ieeexplore.ieee.org/") ||
442 (url->value.substr(0, 28) == "https://ieeexplore.ieee.org/") ) {
443 const size_t is = url->value.find("&isnumber=");
444 if(is == std::string::npos) { // URL would otherwise not point to PDF download!
445 url->value += "&isnumber=";
446 }
447 }
448
449 url->value = laTeXtoURL(url->value);
450 }
451
452
453 // ###### Unify "pages" section #############################################
unifyPages(Node * publication,Node * pages)454 void unifyPages(Node* publication, Node* pages)
455 {
456 // ====== Get pure numbers ===============================================
457 std::string numbers = "";
458 size_t length = pages->value.size();
459 for(size_t i = 0; i < length; i++) {
460 #ifdef USE_UTF8
461 if((pages->value[i] < 0) && (i + 1 < length)) {
462 i++;
463 }
464 else
465 #endif
466 if( (pages->value[i] >= '0') &&
467 (pages->value[i] <= '9') ) {
468 numbers += pages->value[i];
469 }
470 else if(pages->value[i] == '-') {
471 numbers += ' ';
472 }
473 else {
474 fprintf(stderr, "WARNING: Entry %s has invalid characters in \"pages\" section (pages=%s)!\n" ,
475 publication->keyword.c_str(), pages->value.c_str());
476 return;
477 }
478 }
479
480 unsigned int a;
481 unsigned int b;
482 if(sscanf(numbers.c_str(), "%u %u", &a, &b) != 2) {
483 if(sscanf(numbers.c_str(), "%u", &a) != 1) {
484 a = b = 0;
485 }
486 else {
487 b = a;
488 }
489 }
490 if((a != 0) && (a <= b)) {
491 char pagesString[64];
492 if(a != b) {
493 snprintf((char*)&pagesString, sizeof(pagesString), "%u--%u", a, b);
494 }
495 else {
496 snprintf((char*)&pagesString, sizeof(pagesString), "%u", a);
497 }
498 pages->value = pagesString;
499
500 Node* numpages = findChildNode(publication, "numpages");
501 if(numpages) {
502 unsigned int n = atol(numpages->value.c_str());
503 if(n != 1 + (b - a)) {
504 fprintf(stderr, "WARNING: Entry %s has inconsistent invalid page numbers and number of pages (pages=%s; numpages=%s)!\n" ,
505 publication->keyword.c_str(), pages->value.c_str(), numpages->value.c_str());
506 }
507 }
508 addOrUpdateChildNode(publication, "numpages", format("%u", 1 + (b - a)).c_str());
509 }
510 else {
511 fprintf(stderr, "WARNING: Entry %s has possibly invalid page numbers in \"pages\" section (pages=%s)!\n" ,
512 publication->keyword.c_str(), pages->value.c_str());
513 }
514 }
515
516
517 // ###### Unify "numpages" section #############################################
unifyNumPages(Node * publication,Node * numpages)518 void unifyNumPages(Node* publication, Node* numpages)
519 {
520 const unsigned int numberOfPages = atol(numpages->value.c_str());
521 if( (numberOfPages < 1) || (numberOfPages >= 999999) ) {
522 fprintf(stderr, "WARNING: Entry %s has invalid page of numbers in \"numpages\" section (numpages=%s)!\n" ,
523 publication->keyword.c_str(), numpages->value.c_str());
524 }
525 numpages->value = format("%u", numberOfPages);
526 }
527