1 /* utilref.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: utilref.c
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 * Utility routines for parsing reference block of flatfile.
34 *
35 */
36 #include <ncbi_pch.hpp>
37
38 #include "ftacpp.hpp"
39
40 #include <objects/general/Person_id.hpp>
41 #include <objects/biblio/Auth_list.hpp>
42 #include <objects/biblio/Cit_gen.hpp>
43
44
45 #include "index.h"
46
47 #include <objtools/flatfile/flatdefn.h>
48
49 #include "ftaerr.hpp"
50 #include "asci_blk.h"
51 #include "utilref.h"
52 #include "add.h"
53 #include "utilfun.h"
54
55 #ifdef THIS_FILE
56 # undef THIS_FILE
57 #endif
58 #define THIS_FILE "utilref.cpp"
59
/* Threshold used by valid_pages_range(): for an all-numeric page range a
 * LargePageRange warning is posted when (last page - first page) >= MAX_PAGE.
 */
#define MAX_PAGE 50
/* Not referenced anywhere in this file.
 * NOTE(review): the meaning of the value 255 is not shown here - confirm
 * against other users of this macro before relying on it.
 */
#define OTHER_MEDIUM 255
62
63 BEGIN_NCBI_SCOPE
64
65 /**********************************************************/
get_tokens(char * pt,const Char * delimeter)66 ValNodePtr get_tokens(char* pt, const Char *delimeter)
67 {
68 ValNodePtr token;
69 ValNodePtr vnp;
70
71 bool more;
72
73 if(pt == NULL || *pt == '\0')
74 return(NULL);
75
76 token = ValNodeNew(NULL);
77 vnp = token;
78 for(; *pt != '\0'; pt++)
79 {
80 for(; *pt != '\0'; pt++)
81 {
82 if(StringChr(" \n\t\f~,", *pt) == NULL)
83 break;
84 *pt = '\0';
85 }
86 if(*pt == '\0')
87 break;
88
89 vnp->next = ValNodeNew(NULL);
90 vnp = vnp->next;
91 vnp->data.ptrvalue = pt;
92 more = false;
93 for(; *pt != '\0'; pt++)
94 {
95 if(StringNCmp(pt, delimeter, StringLen(delimeter)) != 0 &&
96 StringNCmp(pt, ",\n", 2) != 0 && StringNCmp(pt, ",~", 2) != 0 &&
97 StringNCmp(pt, " and ", 5) != 0)
98 continue;
99
100 *pt = '\0';
101
102 if(StringNCmp(pt + 1, "and ", 4) == 0)
103 pt += 4;
104
105 more = true;
106 break;
107 }
108
109 if(!more)
110 break;
111 } /* for, completed parsing author list */
112
113 vnp = token->next;
114 MemFree(token);
115 return(vnp);
116 }
117
118 /**********************************************************/
AllUpperCase(char * p)119 static bool AllUpperCase(char* p)
120 {
121 if (p == NULL)
122 return false;
123 while (*p != '\0')
124 {
125 if (!IS_UPPER(*p))
126 return false;
127 p++;
128 }
129 return true;
130 }
131
132 /**********************************************************/
SplitMlAuthorName(const Char * name,char * last,char * initials,char * suffix)133 static void SplitMlAuthorName(const Char* name, char* last, char* initials,
134 char* suffix)
135 {
136 char* p;
137 char* p2;
138 Char sbuf[20];
139 Char ibuf[20];
140 Int2 i;
141
142 /* Clear the ibuf field and transfer the entire name to 'last',
143 * excluding leading and trailing spaces
144 */
145 if (name == NULL)
146 return;
147
148 ibuf[0] = '\0';
149 sbuf[0] = '\0';
150 last[0] = '\0';
151 initials[0] = '\0';
152 suffix[0] = '\0';
153 while (*name <= ' ')
154 {
155 name++;
156 if (*name == '\0')
157 return;
158 }
159 StringCpy(last, name);
160
161 for (i = static_cast<Int2>(StringLen(last)) - 1; i >= 0 && last[i] <= ' '; i--)
162 last[i] = '\0';
163
164 /* Strip off the last token (initials or name suffix (Jr, Sr, suffix.)
165 */
166 p = StringRChr(last, ' ');
167 if (p != NULL) /* more than just last name */
168 {
169 /* Separate the token from the last name
170 */
171 p2 = p + 1;
172 while (p > last && *p == ' ')
173 {
174 *p = '\0';
175 p--;
176 }
177
178 /* If the last token is not all upper case, and there are more than
179 * two tokens, see if the next to the last are initials (upper case)
180 */
181 if (!AllUpperCase(p2) && (p = StringRChr(last, (int) ' ')) != NULL)
182 {
183 /* We have at least three tokens, is the next to last initials?
184 */
185 if (AllUpperCase(p + 1))
186 {
187 /* Yes - concatenate the last two tokens as initials
188 */
189 StringCpy(ibuf, p + 1);
190 StringCpy(sbuf, p2);
191 while (p > last && *p == ' ')
192 {
193 *p = '\0';
194 p--;
195 }
196 }
197 }
198
199 if (ibuf[0] == '\0') /* Only the last token goes in ibuf */
200 StringCpy(ibuf, p2);
201 }
202
203 /* now add periods to ibuf and convert suffix
204 */
205 for (p = initials, p2 = ibuf; *p2 != '\0'; p2++, p++)
206 {
207 *p = *p2;
208 if (!IS_LOWER(*(p2 + 1))) /* watch out for foreign names */
209 {
210 p++;
211 *p = '.';
212 }
213 }
214 *p = '\0';
215
216 if (sbuf[0])
217 {
218 if (StringCmp(sbuf, "1d") == 0)
219 p = StringMove(suffix, "I.");
220 else if (StringCmp(sbuf, "2d") == 0)
221 p = StringMove(suffix, "II.");
222 else if (StringCmp(sbuf, "3d") == 0)
223 p = StringMove(suffix, "III.");
224 else if (StringCmp(sbuf, "4th") == 0)
225 p = StringMove(suffix, "IV.");
226 else if (StringCmp(sbuf, "5th") == 0)
227 p = StringMove(suffix, "V.");
228 else if (StringCmp(sbuf, "6th") == 0)
229 p = StringMove(suffix, "VI.");
230 else if (StringCmp(sbuf, "Sr") == 0)
231 p = StringMove(suffix, "Sr.");
232 else if (StringCmp(sbuf, "Jr") == 0)
233 p = StringMove(suffix, "Jr.");
234 else
235 p = StringMove(suffix, sbuf);
236 }
237 }
238
239 /**********************************************************/
GetNameStdFromMl(objects::CName_std & namestd,const Char * token)240 void GetNameStdFromMl(objects::CName_std& namestd, const Char* token)
241 {
242 Char last[80];
243 Char initials[20];
244 Char suffix[20];
245
246 if (token == NULL)
247 return;
248
249 SplitMlAuthorName(token, last, initials, suffix);
250 namestd.SetLast(last);
251 if (initials[0] != '\0')
252 namestd.SetInitials(initials);
253 if (suffix[0] != '\0')
254 namestd.SetSuffix(suffix);
255 }
256
257 /**********************************************************/
/* Deletes every comma, tab and space character from "str" in place;
 * all other characters keep their relative order.
 */
static void RemoveSpacesAndCommas(std::string& str)
{
    std::string::size_type kept = 0;

    for (std::string::size_type i = 0; i < str.size(); ++i)
    {
        const char ch = str[i];
        if (ch == ',' || ch == '\t' || ch == ' ')
            continue;

        /* Compact the survivors towards the front of the string */
        str[kept++] = ch;
    }

    str.erase(kept);
}
267
268 /**********************************************************/
get_auth_from_toks(ValNodePtr token,Uint1 format,CRef<objects::CAuth_list> & auths)269 void get_auth_from_toks(ValNodePtr token, Uint1 format, CRef<objects::CAuth_list>& auths)
270 {
271 ValNodePtr vnp;
272 char* p;
273
274 if (token == NULL)
275 return;
276
277 for (vnp = token; vnp != NULL; vnp = vnp->next)
278 {
279 p = (char*)vnp->data.ptrvalue;
280 if (StringNCmp(p, "and ", 4) == 0)
281 p += 4;
282
283 CRef<objects::CAuthor> author = get_std_auth(p, format);
284
285 if (author.Empty())
286 {
287 ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalAuthorName, "%s", p);
288 continue;
289 }
290
291 if (author->GetName().GetName().IsSetInitials())
292 {
293 std::string& initials = author->SetName().SetName().SetInitials();
294 RemoveSpacesAndCommas(initials);
295 }
296
297 if (author->GetName().GetName().IsSetSuffix())
298 {
299 std::string& suffix = author->SetName().SetName().SetSuffix();
300 RemoveSpacesAndCommas(suffix);
301 }
302
303 if (auths.Empty())
304 auths.Reset(new objects::CAuth_list);
305 auths->SetNames().SetStd().push_back(author);
306 }
307 }
308
309 /**********************************************************/
get_std_auth(const Char * token,Uint1 format)310 CRef<objects::CAuthor> get_std_auth(const Char* token, Uint1 format)
311 {
312 const Char* auth;
313 const Char* eptr;
314
315 CRef<objects::CAuthor> author;
316
317 if (token == NULL || *token == '\0')
318 return author;
319
320 author = new objects::CAuthor;
321 objects::CPerson_id& person_id = author->SetName();
322 objects::CName_std& namestd = person_id.SetName();
323
324 for (eptr = token + StringLen(token) - 1; eptr > token && *eptr == ' ';)
325 eptr--;
326
327 if (format == PIR_REF || format == GB_REF)
328 {
329 for (auth = token; *auth != ',' && *auth != '\0';)
330 auth++;
331 if (*auth == ',')
332 {
333 if (auth[1] != '\0')
334 namestd.SetInitials(auth + 1);
335 }
336
337 namestd.SetLast(std::string(token, auth));
338 }
339 else if (format == PDB_REF)
340 {
341 for (auth = eptr; auth > token && *auth != '.';)
342 auth--;
343 if (*auth == '.')
344 {
345 if (auth[1] != '\0' && auth[1] != '.')
346 namestd.SetLast(auth + 1);
347 namestd.SetInitials(std::string(token, auth + 1));
348 }
349 else
350 namestd.SetLast(token);
351 }
352 else if (format == EMBL_REF || format == SP_REF)
353 {
354 for (auth = eptr; *auth != ' ' && auth > token;)
355 auth--;
356 if (*auth == ' ')
357 {
358 if (*(auth - 1) == '.')
359 for (auth--; *auth != ' ' && auth > token;)
360 auth--;
361 if (*auth == ' ')
362 {
363 if (auth[1] != '\0')
364 namestd.SetInitials(auth + 1);
365 }
366 }
367 else
368 auth = eptr + 1;
369
370 namestd.SetLast(std::string(token, auth));
371 }
372 else if (format == ML_REF)
373 GetNameStdFromMl(namestd, token);
374
375 if (!namestd.IsSetLast())
376 {
377 author.Reset();
378 return author;
379 }
380
381 return author;
382 }
383
384 /**********************************************************
385 *
386 * AuthListPtr get_auth(pt, format, jour):
387 *
388 * Get AuthListPtr for the authors. Delimiter between
389 * the authors is ', ' for GenBank and EMBL. Delimiter
390 * between the authors is ';' for PIR. Delimiter between
391 * last name and initials is ',' for GenBank and PIR,
392 * ' ' for EMBL.
393 * Modified from ParseAuthorList (utilref.c).
394 *
395 * 12-4-93
396 *
397 **********************************************************/
get_auth(char * pt,Uint1 format,char * jour,CRef<objects::CAuth_list> & auths)398 void get_auth(char* pt, Uint1 format, char* jour, CRef<objects::CAuth_list>& auths)
399 {
400 static const char *delimiter;
401 static char* eptr;
402 ValNodePtr token;
403
404 switch(format)
405 {
406 case GB_REF:
407 case EMBL_REF:
408 case SP_REF:
409 delimiter = ", ";
410 break;
411 case PIR_REF:
412 case PDB_REF:
413 delimiter = "; ";
414 break;
415 default:
416 break;
417 }
418 if(pt == NULL || *pt == '\0' || *pt == ';')
419 return;
420
421 size_t len = StringLen(pt);
422 for(eptr = pt + len - 1; IS_ALPHANUM(*eptr) == 0; eptr--)
423 len--;
424
425 if(len > 4 && StringNCmp(eptr - 4, "et al", 5) == 0)
426 {
427 if(jour == NULL)
428 ErrPostEx(SEV_WARNING, ERR_REFERENCE_EtAlInAuthors, "%s", pt);
429 else
430 ErrPostEx(SEV_WARNING, ERR_REFERENCE_EtAlInAuthors, "%s : %s",
431 pt, jour);
432 }
433
434 token = get_tokens(pt, delimiter);
435 get_auth_from_toks(token, format, auths);
436 ValNodeFree(token);
437 }
438
439 /**********************************************************/
get_auth_consortium(char * cons,CRef<objects::CAuth_list> & auths)440 void get_auth_consortium(char* cons, CRef<objects::CAuth_list>& auths)
441 {
442 char* p;
443 char* q;
444
445 if(cons == NULL || *cons == '\0')
446 return;
447
448 for (q = cons;; q = p)
449 {
450 p = StringChr(q, ';');
451 if(p != NULL)
452 *p = '\0';
453
454 CRef<objects::CAuthor> author(new objects::CAuthor);
455 author->SetName().SetConsortium(q);
456
457 if (auths.Empty())
458 auths.Reset(new objects::CAuth_list);
459 auths->SetNames().SetStd().push_front(author);
460
461 if(p == NULL)
462 break;
463
464 for(*p++ = ';'; *p == ';' || *p == ' ';)
465 p++;
466
467 if (NStr::EqualNocase(p, 0, 4, "and "))
468 {
469 for (p += 4; *p == ' ';)
470 p++;
471 }
472 }
473 }
474
475 /**********************************************************/
check_mix_pages_range(char * pages)476 static Int4 check_mix_pages_range(char* pages)
477 {
478 char* page1;
479 char* page2;
480 char* dash;
481 char* p;
482 char* q;
483 Char ch1;
484 Char ch2;
485 Int4 i;
486
487 dash = StringChr(pages, '-');
488 if(dash == NULL)
489 return(0);
490
491 *dash = '\0';
492 page1 = pages;
493 page2 = dash + 1;
494
495 if((*page1 >= 'a' && *page1 <= 'z') || (*page1 >= 'A' && *page1 <= 'Z'))
496 {
497 for(p = page1; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z');)
498 p++;
499
500 if((*page2 < 'a' || *page2 > 'z') && (*page2 < 'A' || *page2 > 'Z'))
501 {
502 *dash = '-';
503 return(-1);
504 }
505
506 for(q = page2; (*q >= 'a' && *q <= 'z') || (*q >= 'A' && *q <= 'Z');)
507 q++;
508 ch1 = *p;
509 *p = '\0';
510 ch2 = *q;
511 *q = '\0';
512 i = StringCmp(page1, page2);
513 *p = ch1;
514 *q = ch2;
515 if(i != 0)
516 {
517 *dash = '-';
518 return(-1);
519 }
520 for(page1 = p; *p >= '0' && *p <= '9';)
521 p++;
522 for(page2 = q; *q >= '0' && *q <= '9';)
523 q++;
524
525 i = atoi(page1) - atoi(page2);
526
527 if(*p != '\0' || *q != '\0')
528 {
529 *dash = '-';
530 return(-1);
531 }
532 *dash = '-';
533 if(i > 0)
534 return(1);
535 return(0);
536 }
537
538 if(*page1 < '0' || *page1 > '9' || *page2 < '0' || *page2 > '9')
539 {
540 *dash = '-';
541 return(-1);
542 }
543
544 for(p = page1; *p >= '0' && *p <= '9';)
545 p++;
546 for(q = page2; *q >= '0' && *q <= '9';)
547 q++;
548 ch1 = *p;
549 *p = '\0';
550 ch2 = *q;
551 *q = '\0';
552 i = atoi(page2) - atoi(page1);
553 *p = ch1;
554 *q = ch2;
555
556 for(page1 = p; (*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z');)
557 p++;
558 for(page2 = q; (*q >= 'a' && *q <= 'z') || (*q >= 'A' && *q <= 'Z');)
559 q++;
560 if(*p != '\0' || *q != '\0' || StringCmp(page1, page2) != 0)
561 {
562 *dash = '-';
563 return(-1);
564 }
565
566 *dash = '-';
567 if(i < 0)
568 return(1);
569 return(0);
570 }
571
572 /**********************************************************/
valid_pages_range(char * pages,const Char * title,Int4 er,bool inpress)573 Int4 valid_pages_range(char* pages, const Char* title, Int4 er, bool inpress)
574 {
575 char* p;
576 char* q;
577 char* s;
578 Int4 fps;
579 Int4 lps;
580 Int4 i;
581
582 if(pages == NULL || *pages == '\0')
583 return(-1);
584
585 if(title == NULL)
586 title = (char*) "";
587 while(*pages == ' ' || *pages == ';' || *pages == '\t' || *pages == ',')
588 pages++;
589 if(*pages == '\0')
590 return(-1);
591
592 for(s = pages; *s != '\0';)
593 s++;
594 for(s--; *s == ' ' || *s == ';' || *s == ',' || *s == '\t';)
595 s--;
596 *++s = '\0';
597
598 p = StringChr(pages, '-');
599 if(p == NULL)
600 {
601 for(q = pages; (*q >= 'a' && *q <= 'z') || (*q >= 'A' && *q <= 'Z') ||
602 (*q >= '0' && *q <= '9');)
603 q++;
604 if(*q == '\0')
605 return(0);
606 if((er & 01) == 01)
607 return(0);
608 else if(er > 0)
609 return(-1);
610 return(1);
611 }
612
613 if(p == pages || p[1] == '\0')
614 {
615 if(er == 0)
616 ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegPageRange,
617 "Incorrect pages range provided: \"%s\".", pages);
618 return(-1);
619 }
620
621 if(inpress && (*(p - 1) == ' ' || *(p - 1) == '\t' ||
622 p[1] == ' ' || p[1] == '\t'))
623 return(1);
624
625 for(q = p + 1; *q >= '0' && *q <= '9';)
626 q++;
627 for(p = pages; *p >= '0' && *p <= '9';)
628 p++;
629 if(*p == '-' && *q == '\0')
630 {
631 *p = '\0';
632 fps = atoi(pages);
633 *p = '-';
634 lps = atoi(p + 1);
635
636 if(lps - fps >= MAX_PAGE)
637 {
638 ErrPostEx(SEV_WARNING, ERR_REFERENCE_LargePageRange,
639 "Total pages exceed %d: %s: %s",
640 MAX_PAGE, pages, title);
641 }
642 else if(fps > lps)
643 {
644 ErrPostEx(SEV_WARNING, ERR_REFERENCE_InvertPageRange,
645 "Page numbers may be inverted, %s: %s", pages, title);
646 }
647 }
648 else
649 {
650 i = check_mix_pages_range(pages);
651 if(i == -1)
652 {
653 if(er > 0 && (er & 01) != 01)
654 return(-1);
655 ErrPostEx(SEV_WARNING, ERR_REFERENCE_UnusualPageNumber,
656 "Pages numbers are not digits, letter+digits, or digits_letter: \"%s\": \"%s\".",
657 pages, title);
658 }
659 else if(i == 1)
660 {
661 ErrPostEx(SEV_WARNING, ERR_REFERENCE_InvertPageRange,
662 "Page numbers may be inverted, %s: %s", pages, title);
663 }
664 }
665 return(0);
666 }
667
668 /**********************************************************
669 *
670 * NCBI_DatePtr get_date(year):
671 *
672 * Gets only year and return NCBI_DatePtr.
673 *
674 **********************************************************/
get_date(const Char * year)675 CRef<objects::CDate> get_date(const Char* year)
676 {
677 CRef<objects::CDate> ret;
678
679 if(year == NULL || *year == '\0')
680 {
681 ErrPostEx(SEV_ERROR, ERR_REFERENCE_IllegalDate,
682 "No year in reference.");
683 return ret;
684 }
685
686 if(year[0] < '0' || year[0] > '9' || year[1] < '0' || year[1] > '9' ||
687 year[2] < '0' || year[2] > '9' || year[3] < '0' || year[3] > '9')
688 {
689 ErrPostEx(SEV_ERROR, ERR_REFERENCE_IllegalDate,
690 "Illegal year: \"%s\".", year);
691 return ret;
692 }
693
694 std::string year_str(year, year + 4);
695 time_t now = 0;
696 time(&now);
697 struct tm *tm = localtime(&now);
698
699 Int4 i = NStr::StringToInt(year_str, NStr::fAllowTrailingSymbols);
700
701 if (i < 1900)
702 {
703 ErrPostEx(SEV_ERROR, ERR_REFERENCE_YearPrecedes1900,
704 "Reference's year is extremely far in past: \"%s\".", year_str.c_str());
705 return ret;
706 }
707 else if (i < 1950)
708 {
709 ErrPostEx(SEV_WARNING, ERR_REFERENCE_YearPrecedes1950,
710 "Reference's year is too far in past: \"%s\".", year_str.c_str());
711 }
712 else if (i > tm->tm_year + 1900 + 2)
713 {
714 ErrPostEx(SEV_WARNING, ERR_REFERENCE_ImpendingYear,
715 "Reference's year is too far in future: \"%s\"", year_str.c_str());
716 }
717
718 ret.Reset(new objects::CDate);
719 ret->SetStd().SetYear(i);
720
721 return ret;
722 }
723
724 /**********************************************************/
get_error(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title)725 CRef<objects::CCit_gen> get_error(char* bptr, CRef<objects::CAuth_list>& auth_list, CRef<objects::CTitle::C_E>& title)
726 {
727 CRef<objects::CCit_gen> cit_gen(new objects::CCit_gen);
728
729 char* s;
730 bool zero_year = false;
731 char* end_tit;
732 char* eptr;
733
734 size_t len = StringLen(bptr);
735 eptr = bptr + len - 1;
736 while(*eptr == ' ' || *eptr == '\t' || *eptr == '.')
737 *eptr-- = '\0';
738
739 if(*eptr == ')')
740 {
741 for(s = eptr - 1; s >= bptr && *s != '(';)
742 s--;
743 if(*s == '(' && s[1] == '0')
744 {
745 zero_year = true;
746 for(end_tit = bptr; isdigit((int) *end_tit) == 0;)
747 end_tit++;
748 *end_tit = '\0';
749 }
750 }
751
752 if(zero_year)
753 {
754 CRef<objects::CTitle::C_E> journal_title(new objects::CTitle::C_E);
755 if(StringNCmp(bptr, "(re)", 4) == 0)
756 journal_title->SetName(NStr::Sanitize(bptr));
757 else
758 journal_title->SetIso_jta(NStr::Sanitize(bptr));
759
760 cit_gen->SetJournal().Set().push_back(journal_title);
761 cit_gen->SetCit("In press");
762 }
763 else if(bptr != NULL)
764 {
765 cit_gen->SetCit(NStr::Sanitize(bptr));
766 }
767
768 if (auth_list.NotEmpty())
769 cit_gen->SetAuthors(*auth_list);
770
771 if (title.NotEmpty())
772 cit_gen->SetTitle(title->GetName());
773
774 return cit_gen;
775 }
776
777 END_NCBI_SCOPE
778