1 /* ref.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: ref.c
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 *
34 */
35 #include <ncbi_pch.hpp>
36
37 #include "ftacpp.hpp"
38
39 #include <objects/biblio/Id_pat.hpp>
40 #include <objects/biblio/Id_pat_.hpp>
41 #include <objects/biblio/Auth_list.hpp>
42 #include <objects/biblio/Affil.hpp>
43 #include <objects/seq/Pubdesc.hpp>
44 #include <objects/pub/Pub_equiv.hpp>
45 #include <objects/pub/Pub.hpp>
46 #include <objects/biblio/Cit_gen.hpp>
47 #include <objects/biblio/PubMedId.hpp>
48 #include <objects/biblio/Cit_book.hpp>
49 #include <objects/biblio/Imprint.hpp>
50 #include <objects/biblio/Cit_let.hpp>
51 #include <objects/biblio/Cit_sub.hpp>
52 #include <objects/biblio/Cit_jour.hpp>
53 #include <objects/biblio/Cit_pat.hpp>
54 #include <objects/biblio/Cit_art.hpp>
55 #include <objects/biblio/ArticleIdSet.hpp>
56 #include <objects/biblio/ArticleId.hpp>
57 #include <objects/general/Dbtag.hpp>
58 #include <objects/general/Object_id.hpp>
59 #include <objects/general/Person_id.hpp>
60 #include <objects/medline/Medline_entry.hpp>
61 #include <objects/biblio/Cit_proc.hpp>
62
63 #include "index.h"
64 #include "genbank.h"
65 #include "embl.h"
66
67 #include <objtools/flatfile/flatdefn.h>
68 #include "ftanet.h"
69
70 #include "ftaerr.hpp"
71 #include "indx_blk.h"
72 #include "utilref.h"
73 #include "asci_blk.h"
74 #include "add.h"
75 #include "utilfun.h"
76 #include "ind.hpp"
77 #include "ref.h"
78 #include "xgbfeat.h"
79 #include "xutils.h"
80 #include "fta_xml.h"
81
82 #ifdef THIS_FILE
83 # undef THIS_FILE
84 #endif
85 #define THIS_FILE "ref.cpp"
86
87 #define MAXKW 38
88
89
90 BEGIN_NCBI_SCOPE
91
/* NULL-terminated table of "to the ... databases" / "to the INSDC" phrases,
 * one entry per ordering of the three database names.
 * NOTE(review): judging by the name these are substrings to be stripped
 * from reference text; the consumer of this table is outside this view.
 */
static const char *strip_sub_str[] = {
    "to the EMBL/GenBank/DDBJ databases",
    "to the EMBL/DDBJ/GenBank databases",
    "to the DDBJ/GenBank/EMBL databases",
    "to the DDBJ/EMBL/GenBank databases",
    "to the GenBank/DDBJ/EMBL databases",
    "to the GenBank/EMBL/DDBJ databases",
    "to the INSDC",
    NULL
};
102
/* NULL-terminated table of "Publication Status" remark spellings.
 * Each status appears six times: the separator between "Publication"
 * and "Status" is a space, an underscore or a dash, and the colon is
 * written with or without a preceding space.
 *    1 -  6 : "Online-Only"
 *    7 - 12 : "Available-Online"
 *   13 - 18 : "Available-Online prior to print"
 * NOTE(review): the numbers in the trailing comments are 1-based
 * positions; presumably the matching code derives the "er" value passed
 * to get_art() from them - confirm against the caller.
 */
static const char *ERRemarks[] = {
    "Publication Status: Online-Only",                          /*  1 */
    "Publication Status : Online-Only",                         /*  2 */
    "Publication_Status: Online-Only",                          /*  3 */
    "Publication_Status : Online-Only",                         /*  4 */
    "Publication-Status: Online-Only",                          /*  5 */
    "Publication-Status : Online-Only",                         /*  6 */
    "Publication Status: Available-Online",                     /*  7 */
    "Publication Status : Available-Online",                    /*  8 */
    "Publication_Status: Available-Online",                     /*  9 */
    "Publication_Status : Available-Online",                    /* 10 */
    "Publication-Status: Available-Online",                     /* 11 */
    "Publication-Status : Available-Online",                    /* 12 */
    "Publication Status: Available-Online prior to print",      /* 13 */
    "Publication Status : Available-Online prior to print",     /* 14 */
    "Publication_Status: Available-Online prior to print",      /* 15 */
    "Publication_Status : Available-Online prior to print",     /* 16 */
    "Publication-Status: Available-Online prior to print",      /* 17 */
    "Publication-Status : Available-Online prior to print",     /* 18 */
    NULL
};
124
/**********************************************************
 *
 *   static void normalize_comment(comment):
 *
 *      After every "; " separator removes the run of
 *   spaces and semicolons that immediately follows it,
 *   so "a; ;; b" becomes "a; b". The string is edited
 *   in place and its length shrinks accordingly.
 *
 *      The search resumes right after the separator that
 *   was just processed, so each "; " is visited once.
 *
 **********************************************************/
static void normalize_comment(std::string& comment)
{
    std::string::size_type pos = 0;

    while((pos = comment.find("; ", pos)) != std::string::npos)
    {
        pos += 2;                       /* step over the "; " itself */

        /* Find the end of the redundant ' ' / ';' run. Erasing through
         * std::string keeps size() consistent with the contents, which
         * editing the c_str() buffer directly does not.
         */
        std::string::size_type last = comment.find_first_not_of(" ;", pos);
        if(last == std::string::npos)
            last = comment.size();

        comment.erase(pos, last - pos);
    }
}
144
145 /**********************************************************
146 *
147 * static DatePtr get_lanl_date(s):
148 *
149 * Get year, month, day and return NCBI_DatePtr.
150 * Temporary used for lanl form of date that
151 * is (JUL 21 1993).
152 *
153 * 01-4-94
154 *
155 **********************************************************/
get_lanl_date(char * s)156 static CRef<objects::CDate> get_lanl_date(char* s)
157 {
158 int day = 0;
159 int month = 0;
160 int year;
161 int cal;
162
163 const char *months[12] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun",
164 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"};
165
166 CRef<objects::CDate> date(new objects::CDate);
167 for(cal = 0; cal < 12; cal++)
168 {
169 if(StringNICmp(s + 1, months[cal], 3) == 0)
170 {
171 month = cal + 1;
172 break;
173 }
174 }
175 day = atoi(s + 5);
176 year = atoi(s + 8);
177 if(year < 1900 || year > 1994)
178 {
179 ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalDate,
180 "Illegal year: %d", year);
181 }
182
183 date->SetStd().SetYear(year);
184 date->SetStd().SetMonth(month);
185 date->SetStd().SetDay(day);
186
187 if (XDateCheck(date->GetStd()) != 0)
188 {
189 ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalDate,
190 "Illegal date: %s", s);
191 date.Reset();
192 }
193
194 return(date);
195 }
196
/**********************************************************
 *
 *   static char* clean_up(str):
 *
 *      Returns a newly allocated (strdup) copy of "str"
 *   with all leading and trailing double or single
 *   quotes removed. Returns NULL if "str" is NULL or
 *   the copy cannot be allocated.
 *
 *      Side effect: a single trailing ';' of "str" is
 *   overwritten with '\0' in the caller's buffer before
 *   the copy is made.
 *
 **********************************************************/
static char* clean_up(char* str)
{
    if(str == NULL)
        return(NULL);

    /* Only look at the last character when there is one: for an empty
     * string "len - 1" would address the byte in front of the buffer.
     */
    size_t len = strlen(str);
    if(len > 0 && str[len - 1] == ';')
        str[len - 1] = '\0';

    while(*str == '\"' || *str == '\'')
        str++;

    char* newp = strdup(str);
    if(newp == NULL)
        return(NULL);

    size_t size = strlen(newp);
    while(size > 0 && (newp[size - 1] == '\"' || newp[size - 1] == '\''))
    {
        size--;
        newp[size] = '\0';
    }

    return(newp);
}
230
231 /**********************************************************
232 *
233 * static ValNodePtr get_num(str):
234 *
235 * Get gb serial number and put it to PUB_Gen.
236 *
237 * 12-4-93
238 *
239 **********************************************************/
get_num(char * str)240 static CRef<objects::CPub> get_num(char* str)
241 {
242 int serial_num = NStr::StringToInt(str, NStr::fAllowTrailingSymbols);
243
244 CRef<objects::CPub> ret(new objects::CPub);
245 ret->SetGen().SetSerial_number(serial_num);
246
247 return ret;
248 }
249
250 /**********************************************************
251 *
252 * static ValNodePtr get_muid(str, format):
253 *
254 * Get gb MUID and put it to PUB_Gen.
255 *
256 * 12-4-93
257 *
258 **********************************************************/
get_muid(char * str,Parser::EFormat format)259 static CRef<objects::CPub> get_muid(char* str, Parser::EFormat format)
260 {
261 char* p;
262 Int4 i;
263
264 CRef<objects::CPub> muid;
265
266 if(str == NULL)
267 return muid;
268
269 if(format == Parser::EFormat::GenBank || format == Parser::EFormat::XML)
270 p = str;
271 else if(format == Parser::EFormat::EMBL)
272 {
273 p = StringIStr(str, "MEDLINE;");
274 if(p == NULL)
275 return muid;
276 for(p += 8; *p == ' ';)
277 p++;
278 }
279 else
280 return muid;
281
282 i = NStr::StringToInt(p, NStr::fAllowTrailingSymbols);
283 if(i < 1)
284 return muid;
285
286 muid.Reset(new objects::CPub);
287 muid->SetMuid(ENTREZ_ID_FROM(int, i));
288 return muid;
289 }
290
291 /**********************************************************/
get_embl_str_pub_id(char * str,const Char * tag)292 static char* get_embl_str_pub_id(char* str, const Char *tag)
293 {
294 char* p;
295 char* q;
296 char* ret;
297 Char ch;
298
299 if(str == NULL || tag == NULL)
300 return(NULL);
301
302 p = StringIStr(str, tag);
303 if(p == NULL)
304 return(NULL);
305 for(p += StringLen(tag); *p == ' ';)
306 p++;
307
308 ret = NULL;
309 for(q = p; *q != ' ' && *q != '\0';)
310 q++;
311 q--;
312 if(*q != '.')
313 q++;
314 ch = *q;
315 *q = '\0';
316 ret = StringSave(p);
317 *q = ch;
318 return(ret);
319 }
320
321 /**********************************************************/
get_embl_pmid(char * str)322 static Int4 get_embl_pmid(char* str)
323 {
324 char* p;
325 Int4 i;
326
327 if(str == NULL)
328 return(0);
329
330 p = StringIStr(str, "PUBMED;");
331 if(p == NULL)
332 return(0);
333 for(p += 7; *p == ' ';)
334 p++;
335 i = (Int4) atol(p);
336 if(i < 1)
337 return(0);
338 return(i);
339 }
340
341 /**********************************************************
342 *
343 * static char* check_book_tit(title):
344 *
345 * Get volume from book title.
346 *
347 * 12-4-93
348 *
349 **********************************************************/
check_book_tit(char * title)350 static char* check_book_tit(char* title)
351 {
352 char* p;
353 char* q;
354 char* r;
355
356 p = StringRStr(title, "Vol");
357 if(p == NULL)
358 return(NULL);
359
360 if(p[3] == '.')
361 q = p + 4;
362 else if(StringNCmp(p + 3, "ume", 3) == 0)
363 q = p + 6;
364 else
365 return(NULL);
366
367 while(*q == ' ' || *q == '\t')
368 q++;
369 for(r = q; *r >= '0' && *r <= '9';)
370 r++;
371
372 if(r == q || *r != '\0')
373 return(NULL);
374
375 if(p > title)
376 {
377 p--;
378 if(*p != ' ' && *p != '\t' && *p != ',' && *p != ';' && *p != '.')
379 return(NULL);
380
381 while(*p == ' ' || *p == '\t' || *p == ',' || *p == ';' || *p == '.')
382 {
383 if(p == title)
384 break;
385 p--;
386 }
387 if(*p != ' ' && *p != '\t' && *p != ',' && *p != ';' && *p != '.')
388 p++;
389 }
390 *p = '\0';
391
392 return(q);
393 }
394
/**********************************************************
 *
 *   static CRef<objects::CCit_pat> get_pat(pp, bptr, auth_list,
 *                                          title, eptr):
 *
 *      Returns a Cit-pat for a patent reference in NCBI,
 *   EMBL or DDBJ records, or an empty CRef if the line
 *   cannot be parsed.
 *      A leading "I" or "AR" for NCBI, "A" for EMBL or
 *   "E" for DDBJ is required in the accession number.
 *
 *   JOURNAL   Patent: US 4446235-A 6 01-MAY-1984;
 *   or
 *   RL   Patent number US4446235-A/6, 01-MAY-1984.
 *
 *      pp        - parser state; format, source and the
 *                  current entry are read from it;
 *      bptr      - reference text; it is tokenized in
 *                  place (NUL bytes are written into it);
 *      auth_list - authors, may be empty; its affiliation
 *                  is set here when it is not empty;
 *      title     - patent title, may be empty;
 *      eptr      - upper bound used when picking up the
 *                  affiliation text.
 *
 *      For a patent entry without a Patent-seq-id yet,
 *   ibp->psip is filled in from the parsed values.
 *
 *                                              11-14-93
 *
 **********************************************************/
static CRef<objects::CCit_pat> get_pat(ParserPtr pp, char* bptr, CRef<objects::CAuth_list>& auth_list, CRef<objects::CTitle::C_E>& title, char* eptr)
{
    IndexblkPtr ibp;

    CRef<objects::CCit_pat> cit_pat;

    char* country;
    char* number;
    char* type;
    char* app;
    char* s;
    char* p;
    char* q;
    char* temp;

    ErrSev sev;
    Char ch;

    ibp = pp->entrylist[pp->curindx];

    /* "temp" is an untouched copy used only in error messages */
    temp = StringSave(bptr);

    /* The patent line ends at '.' in EMBL and at ';' otherwise; cut both
     * the copy and the working buffer there. "p" remembers the position
     * in bptr so the character can be put back once parsing succeeded.
     */
    ch = (pp->format == Parser::EFormat::EMBL) ? '.' : ';';
    p = StringChr(temp, ch);
    if(p != NULL)
        *p = '\0';

    p = StringChr(bptr, ch);
    if(p != NULL)
        *p = '\0';

    /* Only reported here; parsing goes on, and the psip that is already
     * set is left alone at the end of the function.
     */
    if(ibp->is_pat && ibp->psip.NotEmpty())
    {
        ErrPostStr(SEV_ERROR, ERR_FORMAT_MultiplePatRefs,
                   "Too many patent references for patent sequence; ignoring all but the first.");
    }

    if(pp->source == Parser::ESource::USPTO)
        s = bptr;
    else
    {
        /* The line must start with the format's patent keyword
         * (compared case-insensitively)
         */
        q = (pp->format == Parser::EFormat::EMBL) ? (char *) "Patent number" :
                                                    (char *) "Patent:";
        size_t len = StringLen(q);
        if(StringNICmp(q, bptr, len) != 0)
        {
            ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                      "Illegal format: \"%s\"", temp);
            MemFree(temp);
            return cit_pat;
        }

        for(s = bptr + len; *s == ' ';)
            s++;
    }

    /* Country: a run of letters and spaces; "q" tracks its last
     * non-space character.
     * NOTE(review): "country == q" is also true for a one-letter
     * country, which is then reported as missing - confirm intended.
     */
    for(country = s, q = s; isalpha((int) *s) != 0 || *s == ' '; s++)
        if(*s != ' ')
            q = s;
    if(country == q)
    {
        ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                  "No Patent Document Country: \"%s\"", temp);
        MemFree(temp);
        return cit_pat;
    }
    s = q + 1;

    /* Outside EMBL and XML the character after the country is
     * overwritten to terminate it. For EMBL/XML nothing is written
     * here; see "*number = '\0'" further down.
     */
    if(pp->format != Parser::EFormat::EMBL &&
       pp->format != Parser::EFormat::XML)
        *s++ = '\0';
    while(*s == ' ')
        s++;

    /* Number: digits and commas; the commas are squeezed out in place */
    for(number = s, q = s; isdigit((int) *s) != 0 || *s == ','; s++)
        if(*s != ',')
            *q++ = *s;

    if(number == s)
    {
        ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                  "No Patent Document Number: \"%s\"", temp);
        MemFree(temp);
        return cit_pat;
    }

    /* Terminate the compacted number if commas were removed */
    if(q != s)
        *q = '\0';

    /* Document type: the text after '-' up to a space, '/' or the end */
    if(*s == '-')
    {
        *s++ = '\0';
        for(type = s; *s != ' ' && *s != '/' && *s != '\0';)
            s++;
        if(type == s)
            type = NULL;
    }
    else
        type = NULL;
    if(*s != '\0')
        *s++ = '\0';

    /* A missing type is an error only for patent entries */
    if(type == NULL)
    {
        sev = (ibp->is_pat ? SEV_ERROR : SEV_WARNING);
        ErrPostEx(sev, ERR_REFERENCE_Fail_to_parse,
                  "No Patent Document Type: \"%s\"", temp);
    }

    /* Sequence number within the patent: digits that must be followed
     * by one of the accepted delimiters; otherwise "s" is rewound to
     * where the digits started and "app" is dropped.
     */
    for(app = s, q = s; *s >= '0' && *s <= '9';)
        s++;
    if(*s != '\0' && *s != ',' && *s != '.' && *s != ' ' && *s != ';' &&
       *s != '\n')
    {
        sev = (ibp->is_pat ? SEV_ERROR : SEV_WARNING);
        ErrPostEx(sev, ERR_REFERENCE_Fail_to_parse,
                  "No number of sequence in patent: \"%s\"", temp);
        app = NULL;
        s = q;
    }
    else if(*s != '\0')
        for(*s++ = '\0'; *s == ' ';)
            s++;

    /* Whatever is left must be the date */
    CRef<objects::CDate_std> std_date;
    if(*s != '\0')
    {
        std_date = get_full_date(s, true, pp->source);
    }

    if (std_date.Empty())
    {
        ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                  "Illegal format: \"%s\"", temp);
        MemFree(temp);
        return cit_pat;
    }

    /* Put back the terminator removed from bptr at the top (this is
     * not done on the error returns above)
     */
    if(p != NULL)
        *p = ch;

    /* Save the number first: for EMBL format or USPTO source its first
     * byte is then overwritten.
     * NOTE(review): presumably this terminates "country", which abuts
     * the number in those layouts ("US4446235") - confirm.
     */
    std::string msg = NStr::Sanitize(number);
    if(pp->format == Parser::EFormat::EMBL ||
       pp->source == Parser::ESource::USPTO)
        *number = '\0';

    cit_pat.Reset(new objects::CCit_pat);

    cit_pat->SetCountry(country);
    cit_pat->SetNumber(msg);

    cit_pat->SetDoc_type(type == NULL ? "" : type);
    cit_pat->SetDate_issue().SetStd(*std_date);
    cit_pat->SetTitle(title.Empty() ? "" : title->GetName());

    /* Without usable author names a single empty name is stored */
    if (auth_list.Empty() || !auth_list->IsSetNames())
    {
        objects::CAuth_list& pat_auth_list = cit_pat->SetAuthors();
        pat_auth_list.SetNames().SetStr().push_back("");
    }
    else
        cit_pat->SetAuthors(*auth_list);

    if (auth_list.NotEmpty())
    {
        objects::CAffil& affil = auth_list->SetAffil();

        /* NOTE(review): 13 presumably skips the 11-character
         * "DD-MMM-YYYY" date plus its delimiter and one more
         * character - confirm. The "s < eptr" test keeps the read
         * inside the caller's buffer.
         */
        s += 13;
        if (s < eptr && *s != '\0')
            affil.SetStr(s);
        else
            affil.SetStr("");
    }

    /* First patent reference of a patent entry: build its Patent-seq-id */
    if(ibp->is_pat && ibp->psip.Empty())
    {
        ibp->psip = new objects::CPatent_seq_id;
        ibp->psip->SetCit().SetCountry(country);
        ibp->psip->SetCit().SetId().SetNumber(msg);
        ibp->psip->SetSeqid(app != NULL ? atoi(app) : 0);
        if(type)
            ibp->psip->SetCit().SetDoc_type(type);
    }

    MemFree(temp);
    return cit_pat;
}
597
598 /**********************************************************/
fta_get_part_sup(char * parts,objects::CImprint & imp)599 static void fta_get_part_sup(char* parts, objects::CImprint& imp)
600 {
601 char* start;
602 char* end;
603 char* p;
604 char* q;
605 Char ch;
606 Int4 i;
607 Int4 j;
608
609 if(parts == NULL || *parts == '\0')
610 return;
611
612 for(p = parts, i = 0, j = 0; *p != '\0'; p++)
613 {
614 if(*p == '(')
615 i++;
616 else if(*p == ')')
617 j++;
618
619 if(j > i || i - j > 1)
620 break;
621 }
622
623 if(*p != '\0' || i < 2)
624 return;
625
626 start = StringChr(parts, '(');
627 end = StringChr(start + 1, ')');
628
629 for(p = start + 1; *p == ' ';)
630 p++;
631 if(p == end)
632 return;
633
634 for(q = end - 1; *q == ' ' && q > p;)
635 q--;
636 if(*q != ' ')
637 q++;
638
639 ch = *q;
640 *q = '\0';
641
642 imp.SetPart_sup(p);
643 *q = ch;
644
645 fta_StringCpy(start, end + 1);
646 }
647
/**********************************************************
 *
 *   static bool get_parts(bptr, eptr, imp):
 *
 *      Parses the text between bptr and eptr (eptr itself
 *   excluded) of a journal reference into the volume,
 *   issue, part-sup and part-supi fields of "imp".
 *   The text is copied first; the caller's buffer is
 *   unchanged on return.
 *      Returns false if either pointer is NULL or if the
 *   parentheses left after fta_get_part_sup() are not
 *   exactly one "(...)" pair or none at all.
 *      (Per the original note: PARTS from medart2asn.c.)
 *
 **********************************************************/
static bool get_parts(char* bptr, char* eptr, objects::CImprint& imp)
{
    char* parts;
    char* p;
    char* q;
    Char ch;
    Int4 bad;

    if(bptr == NULL || eptr == NULL)
        return false;

    /* Copy [bptr, eptr): terminate temporarily at eptr, duplicate,
     * restore
     */
    ch = *eptr;
    *eptr = '\0';
    parts = StringSave(bptr);
    *eptr = ch;

    for(p = parts; *p != '\0'; p++)
        if(*p == '\t')
            *p = ' ';

    /* May take a leading parenthesized group out of "parts" */
    fta_get_part_sup(parts, imp);

    /* q - first '(', p - first ')'. Accept either none of them or a
     * single pair in the right order.
     */
    bad = 0;
    q = StringChr(parts, '(');
    p = StringChr(parts, ')');

    if(p != NULL && q != NULL)
    {
        if(p < q || StringChr(p + 1, ')') != NULL ||
           StringChr(q + 1, '(') != NULL)
            bad = 1;
    }
    else if(p != NULL || q != NULL)
        bad = 1;

    if(bad != 0)
    {
        MemFree(parts);
        return false;
    }

    if(q != NULL)
    {
        /* Cut "parts" at '(' and isolate the text inside the pair */
        *q++ = '\0';
        *p = '\0';

        /* First blank-delimited token inside the parentheses: issue */
        for(p = q; *p == ' ';)
            p++;
        for(q = p; *q != '\0' && *q != ' ';)
            q++;
        if(*q != '\0')
            *q++ = '\0';
        if(q > p)
            imp.SetIssue(p);

        /* The rest, trailing blanks removed: part-supi, stored with
         * one leading space
         */
        for(p = q; *p == ' ';)
            p++;
        for(q = p; *q != '\0';)
            q++;
        if(q > p)
        {
            for(q--; *q == ' ';)
                q--;
            *++q = '\0';

            std::string supi(" ");
            supi += p;
            imp.SetPart_supi(supi);
        }

        /* An issue that is just "PT" (any case) is glued back together
         * with the part-supi, which is then cleared
         */
        const Char* issue_str = imp.IsSetIssue() ? imp.GetIssue().c_str() : NULL;
        if (imp.IsSetPart_supi() && issue_str != NULL &&
            (issue_str[0] == 'P' || issue_str[0] == 'p') && (issue_str[1] == 'T' || issue_str[1] == 't') &&
            issue_str[2] == '\0')
        {
            std::string& issue = imp.SetIssue();
            issue += imp.GetPart_supi();
            imp.ResetPart_supi();
        }
    }

    /* Text in front of the parentheses (or the whole string when there
     * are none): first token is the volume
     */
    for(p = parts; *p == ' ';)
        p++;
    for(q = p; *q != '\0' && *q != ' ';)
        q++;
    if(*q != '\0')
        *q++ = '\0';
    if(q > p)
        imp.SetVolume(p);

    /* Anything after the volume, trailing blanks removed: part-sup
     * (replaces a value stored by fta_get_part_sup())
     */
    for(p = q; *p == ' ';)
        p++;
    for(q = p; *q != '\0';)
        q++;
    if(q > p)
    {
        for(q--; *q == ' ';)
            q--;
        *++q = '\0';
        imp.SetPart_sup(p);
    }

    MemFree(parts);
    return true;
}
758
/**********************************************************
 *
 *   static CRef<objects::CCit_art> get_art(pp, bptr, auth_list,
 *                       title, pre, has_muid, all_zeros, er):
 *
 *      Returns a Cit-art (journal article) built from a
 *   journal line of the form
 *      <title> <volume[(issue)]><delim><pages> (<year>)
 *   or an empty CRef if the line cannot be parsed.
 *   Works on a private copy of bptr.
 *
 *      pp        - parser state (format and source);
 *      bptr      - journal line;
 *      auth_list - authors, optional;
 *      title     - article title, optional;
 *      pre       - prepub status; 2 is treated as
 *                  "in-press" below;
 *      has_muid  - the reference has a Medline UID;
 *      all_zeros - out: set to true for an EMBL source
 *                  line with a year starting with '0'
 *                  and pages starting with "0-0";
 *      er        - value derived from REMARKs; greater
 *                  than 0 marks an electronic resource.
 *
 **********************************************************/
static CRef<objects::CCit_art> get_art(ParserPtr pp, char* bptr, CRef<objects::CAuth_list>& auth_list, CRef<objects::CTitle::C_E>& title,
                                       int pre, bool has_muid, bool* all_zeros, Int4 er)
{
    char* eptr;
    char* end_tit;
    char* s;
    char* p;
    char* ss;
    char* end_volume;
    char* end_pages;
    char* buf;
    char* tit = NULL;
    char* volume = NULL;
    char* pages = NULL;
    char* year;
    Char symbol;

    Int4 i;
    Int4 is_er;

    *all_zeros = false;

    /* Bit 01: flagged through REMARKs; bit 02: line starts with "(er)" */
    is_er = 0;
    if(er > 0)
        is_er |= 01;                    /* based on REMARKs */
    if(StringNCmp(bptr, "(er)", 4) == 0)
        is_er |= 02;

    CRef<objects::CCit_art> cit_art;

    /* Delimiter between volume and pages for this format */
    if(pp->format == Parser::EFormat::GenBank || pp->format == Parser::EFormat::PRF)
        symbol = ',';
    else if(pp->format == Parser::EFormat::EMBL)
        symbol = ':';
    else if(pp->format == Parser::EFormat::XML)
    {
        if(pp->source == Parser::ESource::EMBL)
            symbol = ':';
        else
            symbol = ',';
    }
    else
        return cit_art;

    end_volume = NULL;

    /* Work on a copy; strip trailing blanks, tabs and periods. What is
     * left has to end with the ')' of the year.
     */
    size_t len = StringLen(bptr);
    buf = (char*) MemNew(len + 1);
    StringCpy(buf, bptr);
    eptr = buf + len - 1;
    while(eptr > buf && (*eptr == ' ' || *eptr == '\t' || *eptr == '.'))
        *eptr-- = '\0';
    if(*eptr != ')')
    {
        MemFree(buf);
        return cit_art;
    }
    /* Walk back to the nearest '(' */
    for(s = eptr - 1; s > buf && *s != '(';)
        s--;
    if(*s != '(')
    {
        MemFree(buf);
        return cit_art;
    }

    /* PRF only: if the last group is not "(" + a four-digit year
     * starting with 1 or 2 + ")", but the group before it is, cut the
     * line at the last group and take the year from the earlier one.
     */
    if(pp->format == Parser::EFormat::PRF && s > buf &&
       (StringLen(s) != 6 || s[1] < '1' || s[1] > '2' || s[2] < '0' ||
        s[2] > '9' || s[3] < '0' || s[3] > '9' || s[4] < '0' || s[4] > '9'))
    {
        for(p = s - 1; p > buf && *p != '(';)
            p--;
        if(*p == '(' && p[5] == ')' && p[1] > '0' && p[1] < '3' &&
           p[2] >= '0' && p[2] <= '9' && p[3] >= '0' && p[3] <= '9' &&
           p[4] >= '0' && p[4] <= '9')
        {
            *s = '\0';
            s = p;
        }
    }

    /* year - first character after '('; end_pages - just past the last
     * non-blank character in front of the '('
     */
    year = s + 1;
    for(s--; s >= buf && isspace((int) *s) != 0;)
        s--;
    if(s < buf)
        s = buf;
    end_pages = s + 1;

    /* ss - first digit outside parentheses, searched up to the year; a
     * leading "G3" is skipped so that its digit is not taken for the
     * volume. ss == year afterwards means nothing was found.
     */
    if(buf[0] == 'G' && buf[1] == '3')
        ss = buf + 2;
    else
        ss = buf;
    for(i = 0; ss <= year; ss++)
    {
        if(*ss == '(')
            i++;
        else if(*ss == ')')
            i--;
        else if(*ss >= '0' && *ss <= '9' && i == 0)
            break;
    }

    /* s - last volume/pages delimiter in front of end_pages */
    for(s = end_pages; s >= buf && *s != symbol;)
        s--;
    if(s < buf)
        s = buf;
    if(*s != symbol)
    {
        /* try delimiter from other format
         */
        if(pp->format == Parser::EFormat::GenBank)
            symbol = ':';
        else if(pp->format == Parser::EFormat::EMBL)
            symbol = ',';
        else if(pp->format == Parser::EFormat::XML)
        {
            if(pp->source == Parser::ESource::EMBL)
                symbol = ',';
            else
                symbol = ':';
        }

        for(s = end_pages; s >= buf && *s != symbol;)
            s--;
        if(s < buf)
            s = buf;
    }

    if(*s == symbol && ss != year)
    {
        /* Delimiter and volume start both found: pages follow the
         * delimiter, the journal title ends right before the volume
         */
        if(ss > s)
            ss = s + 1;
        end_volume = s;
        for(pages = s + 1; IS_WHITESP(*pages) != 0;)
            pages++;
        end_tit = ss - 1;
        if(end_volume > ss)
        {
            volume = ss;
            /* Keep a '(' that directly precedes the first digit */
            if(*end_tit == '(')
                volume--;
        }
    }
    else
    {
        /* No volume/pages: treat as in-press unless pre is 1 */
        if(pre != 1)
            pre = 2;

        end_tit = end_pages;
    }

    /* A year starting with '0' is not usable */
    if(*year == '0')
    {
        if(pages != NULL && StringNCmp(pages, "0-0", 3) == 0 &&
           pp->source == Parser::ESource::EMBL)
            *all_zeros = true;
        MemFree(buf);
        return cit_art;
    }

    tit = buf;
    if(*tit == '\0')
    {
        ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                   "No journal title.");
        MemFree(buf);
        return cit_art;
    }

    cit_art.Reset(new objects::CCit_art);
    objects::CCit_jour& journal = cit_art->SetFrom().SetJournal();
    objects::CImprint& imp = journal.SetImp();

    if (pre > 0)
        imp.SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));

    /* Terminate the pages. valid_pages_range() results used here:
     * 0 - store the pages; 1 - the "pages" text stays in the title;
     * -1 - give up if this is an electronic resource.
     */
    *end_pages = '\0';
    if(pages != NULL && StringNCmp(pages, "0-0", 3) != 0)
    {
        i = valid_pages_range(pages, tit, is_er, (pre == 2));
        if(i == 0)
            imp.SetPages(pages);
        else if(i == 1)
            end_tit = end_pages;
        else if(i == -1 && is_er > 0)
        {
            MemFree(buf);
            cit_art.Reset();
            return cit_art;
        }
    }
    else if(pre != 1)
        pre = 2;

    if(volume != NULL)
    {
        if(!get_parts(volume, end_volume, imp))
        {
            MemFree(buf);
            cit_art.Reset();
            return cit_art;
        }

        /* Pages without a volume are rejected; no volume and no pages
         * means in-press
         */
        if(pre != 1 && !imp.IsSetVolume())
        {
            if(imp.IsSetPages())
            {
                MemFree(buf);
                cit_art.Reset();
                return cit_art;
            }
            pre = 2;
        }
    }
    else if(is_er > 0 && pre != 2)
    {
        MemFree(buf);
        cit_art.Reset();
        return cit_art;
    }

    /* The '0' case returned above, so the test below always holds here */
    CRef<objects::CDate> date;
    if (*year != '0')
        date = get_date(year);

    if(date.Empty())
    {
        if(is_er == 0)
            ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                       "No date in journal reference");

        MemFree(buf);
        cit_art.Reset();
        return cit_art;
    }

    *end_tit = '\0';

    CRef<objects::CTitle::C_E> journal_title(new objects::CTitle::C_E);

    /* Remove trailing characters from the title that are neither
     * alphanumeric nor '.' nor ')'
     */
    for (char* aux = end_tit - 1; aux > tit && *aux != '.' && *aux != ')' && !isalnum(*aux); --aux)
        *aux = 0;

    journal_title->SetIso_jta(NStr::Sanitize(tit));
    journal.SetTitle().Set().push_back(journal_title);

    imp.SetDate(*date);
    /* Set again: "pre" may have been changed to 2 since the first call */
    if (pre > 0)
        imp.SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));

    if((is_er & 01) == 01)
    {
        if(er == 1)
            imp.SetPubstatus(3);        /* epublish */
        else
            imp.SetPubstatus(10);       /* aheadofprint */
    }

    /* check invalid "in-press"
     */
    if(pre == 2)
    {
        if(has_muid)
        {
            ErrPostEx(SEV_WARNING, ERR_REFERENCE_InvalidInPress,
                      "Reference flagged as In-press, but Medline UID exists, In-press ignored: %s",
                      buf);
            imp.ResetPrepub();
        }

        if(imp.IsSetPages() && imp.IsSetVolume() && imp.IsSetDate())
        {
            ErrPostEx(SEV_WARNING, ERR_REFERENCE_InvalidInPress,
                      "Reference flagged as In-press, but citation is complete, In-press ignored: %s",
                      buf);
            imp.ResetPrepub();
        }
    }

    /* Title and authors are optional for cit_art
     */
    if(title != NULL)
        cit_art->SetTitle().Set().push_back(title);

    if (auth_list.NotEmpty())
        cit_art->SetAuthors(*auth_list);

    MemFree(buf);
    return cit_art;
}
1055
1056 /**********************************************************
1057 *
1058 * static CitGenPtr get_unpub(bptr, eptr, auth, title):
1059 *
1060 * Return a CitGen pointer.
1061 *
1062 * 11-14-93
1063 *
1064 **********************************************************/
get_unpub(char * bptr,char * eptr,CRef<objects::CAuth_list> & auth_list,const Char * title)1065 static CRef<objects::CCit_gen> get_unpub(char* bptr, char* eptr, CRef<objects::CAuth_list>& auth_list,
1066 const Char* title)
1067 {
1068 CRef<objects::CCit_gen> cit_gen(new objects::CCit_gen);
1069
1070 char* s;
1071 char* str;
1072
1073 if (bptr != NULL)
1074 {
1075 for(s = bptr; *s != '\0' && *s != '(';)
1076 s++;
1077 for(str = s - 1; str > bptr && IS_WHITESP(*str) != 0;)
1078 str--;
1079 if(*s == '(')
1080 s += 6;
1081
1082 if (s < eptr && *s != '\0' && auth_list.NotEmpty())
1083 auth_list->SetAffil().SetStr(NStr::Sanitize(s));
1084
1085 cit_gen->SetCit(std::string(bptr, str + 1));
1086 }
1087
1088 if (auth_list.NotEmpty())
1089 cit_gen->SetAuthors(*auth_list);
1090
1091 if (title != NULL)
1092 cit_gen->SetTitle(title);
1093
1094 return cit_gen;
1095 }
1096
/**********************************************************
 *
 *   static CRef<objects::CCit_art> get_book(bptr, auth_list,
 *                              title, pre, format, jour):
 *
 *      Returns a Cit-art (!!! that is an article from a
 *   book !!!) parsed from a line of the form
 *      (in) <editors> (Eds.); <book title>: <pages>;
 *           <publisher> (<year>)
 *   or an empty CRef if the editors or the date cannot
 *   be found; the messages posted in those cases say
 *   that a Cit-gen is generated instead.
 *      Works on a private copy of bptr.
 *
 *      bptr      - book line;
 *      auth_list - authors of the article, optional;
 *      title     - title of the article, optional;
 *      pre       - prepub status, stored when positive;
 *      format    - input format; selects the style passed
 *                  to get_auth() for the editors;
 *      jour      - passed through to get_auth().
 *
 *                                              11-14-93
 *
 **********************************************************/
static CRef<objects::CCit_art> get_book(char* bptr, CRef<objects::CAuth_list>& auth_list, CRef<objects::CTitle::C_E>& title,
                                        int pre, Parser::EFormat format, char* jour)
{
    char* s;
    char* ss;
    char* tit;
    char* volume;
    char* pages;
    char* press;

    Uint1 ref_fmt;
    bool IS_AUTH = false;
    char* tbptr;
    char* p;
    Char c;
    Int4 i;

    tit = NULL;
    ref_fmt = GB_REF;

    /* All parsing below is done in this copy */
    tbptr = (bptr == NULL) ? NULL : StringSave(bptr);

    switch(format)
    {
        case Parser::EFormat::EMBL:
            ref_fmt = EMBL_REF;
            break;
        case Parser::EFormat::GenBank:
            ref_fmt = GB_REF;
            break;
        case Parser::EFormat::PIR:
            ref_fmt = PIR_REF;
            break;
        case Parser::EFormat::SPROT:
            ref_fmt = SP_REF;
            break;
        default:
            break;
    }

    CRef<objects::CCit_art> cit_art(new objects::CCit_art);
    objects::CCit_book& cit_book = cit_art->SetFrom().SetBook();

    if (pre > 0)
        cit_book.SetImp().SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));

    /* "p" keeps the start of the copy for the error messages */
    p = tbptr;
    CRef<objects::CTitle::C_E> book_title(new objects::CTitle::C_E);

    if(StringNCmp("(in)", tbptr, 4) == 0)
    {
        /* Editors: from after "(in)" up to ';', '(' or the end. From
         * here on bptr points into the copy.
         */
        for(s = tbptr + 4; *s == ' ';)
            s++;
        for(bptr = s; *s != ';' && *s != '(' && *s != '\0';)
            s++;
        /* Only an "(Eds.)" / "(Ed.)" marker makes the names editors */
        if(StringNICmp(s, "(Eds.)", 6) == 0)
        {
            tit = s + 6;
            IS_AUTH = true;
        }
        else if(StringNICmp(s, "(Ed.)", 5) == 0)
        {
            tit = s + 5;
            IS_AUTH = true;
        }
        else if(*s == ';')
            tit = s;
        if(tit != NULL)
            while(*tit == ' ' || *tit == ';' || *tit == '\n')
                tit++;
        /* Terminate the editors' names */
        c = *s;
        *s++ = '\0';
        if(IS_AUTH && *bptr != '\0')
        {
            CRef<objects::CAuth_list> book_auth_list;
            get_auth(bptr, ref_fmt, jour, book_auth_list);
            if (book_auth_list.NotEmpty())
                cit_book.SetAuthors(*book_auth_list);
        }
        else
        {
            ErrPostEx(SEV_ERROR, ERR_REFERENCE_UnusualBookFormat,
                      "Cannot parse unusually formatted book reference (generating Cit-gen instead): %s",
                      p);
            if(tbptr != NULL)
                MemFree(tbptr);

            cit_art.Reset();
            return cit_art;
        }

        /* ss - last ';' of the remainder, or its end. The title stops
         * at the last ':' in front of ss when there is one, else at ss.
         * "c" remembers which of the two it was.
         */
        ss = StringRChr(tit, ';');
        if(ss == NULL)
            for(ss = tit; *ss != '\0';)
                ss++;
        for(s = ss; *s != ':' && s != tit;)
            s--;
        if(*s != ':')
            s = ss;
        c = *s;
        if(*s != '\0')
            *s++ = '\0';

        book_title->SetName("");
        if(*tit != '\0')
        {
            /* Cuts a trailing "Vol. N" / "Volume N" off the title */
            volume = check_book_tit(tit);
            if(volume != NULL)
                cit_book.SetImp().SetVolume(volume);

            book_title->SetName(NStr::Sanitize(tit));
        }

        if(c == ':')
        {
            /* Pages: up to ',' or ';' */
            for(pages = s; *s != '\0' && *s != ',' && *s != ';';)
                s++;
            if(*s != '\0')
                *s++ = '\0';

            while(*pages == ' ')
                pages++;

            /* "0-0" means in-press. Otherwise valid_pages_range():
             * 0 - store the pages; 1 - the text after ':' is put back
             * into the title.
             */
            if (StringNCmp(pages, "0-0", 3) == 0)
                cit_book.SetImp().SetPrepub(objects::CImprint::ePrepub_in_press);
            else
            {
                bool is_in_press = cit_book.GetImp().IsSetPrepub() && cit_book.GetImp().GetPrepub() == objects::CImprint::ePrepub_in_press;
                i = valid_pages_range(pages, book_title->GetName().c_str(), 0, is_in_press);

                if(i == 0)
                    cit_book.SetImp().SetPages(NStr::Sanitize(pages));
                else if(i == 1)
                {
                    std::string new_title = book_title->GetName();
                    new_title += ": ";
                    new_title += pages;
                    book_title->SetName(new_title);
                }
            }
        }

        /* Publisher: up to the '(' that opens the year */
        for(press = s; *s != '(' && *s != '\0';)
            s++;
        if(*s != '\0')
            *s++ = '\0';

        cit_book.SetImp().SetPub().SetStr(NStr::Sanitize(press));

        CRef<objects::CDate> date = get_date(s);
        if (date.Empty())
        {
            ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
                       "No date in book reference");
            ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
                      "Book format error (cit-gen created): %s", p);
            if(tbptr != NULL)
                MemFree(tbptr);

            cit_art.Reset();
            return cit_art;
        }

        cit_book.SetImp().SetDate(*date);
    }

    /* NOTE(review): for a line that does not start with "(in)"
     * book_title is added without a name having been set - confirm
     * callers only pass "(in)" lines.
     */
    cit_book.SetTitle().Set().push_back(book_title);

    if (title.NotEmpty())
        cit_art->SetTitle().Set().push_back(title);

    if (auth_list.NotEmpty())
        cit_art->SetAuthors(*auth_list);

    if(tbptr != NULL)
        MemFree(tbptr);

    return cit_art;
}
1287
1288 /**********************************************************
1289 *
1290 * static CitBookPtr get_thesis(bptr, auth, title, pre):
1291 *
1292 * Return a CitBook pointer.
1293 *
1294 * 11-14-93
1295 *
1296 **********************************************************/
get_thesis(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,int pre)1297 static CRef<objects::CCit_let> get_thesis(char* bptr, CRef<objects::CAuth_list>& auth_list,
1298 CRef<objects::CTitle::C_E>& title, int pre)
1299 {
1300 CRef<objects::CCit_let> cit_let(new objects::CCit_let);
1301
1302 cit_let->SetType(objects::CCit_let::eType_thesis);
1303
1304 objects::CCit_book& book = cit_let->SetCit();
1305
1306 if (pre > 0)
1307 book.SetImp().SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));
1308
1309 char* s;
1310 for (s = bptr; *s != '\0' && *s != '(';)
1311 s++;
1312
1313 if(*s == '(')
1314 {
1315 CRef<objects::CDate> date = get_date(s + 1);
1316 if (date.NotEmpty())
1317 book.SetImp().SetDate(*date);
1318
1319 s = s + 6;
1320 }
1321
1322 if (!book.GetImp().IsSetDate())
1323 {
1324 ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1325 "Fail to parse thesis: missing date");
1326
1327 cit_let.Reset();
1328 return cit_let;
1329 }
1330
1331 if(*s != '\0')
1332 book.SetImp().SetPub().SetStr(NStr::Sanitize(s));
1333
1334 if (title.NotEmpty())
1335 book.SetTitle().Set().push_back(title);
1336 else
1337 {
1338 ErrPostStr(SEV_WARNING, ERR_REFERENCE_Thesis, "Missing thesis title");
1339
1340 CRef<objects::CTitle::C_E> empty_title(new objects::CTitle::C_E);
1341 empty_title->SetName("");
1342 book.SetTitle().Set().push_back(empty_title);
1343 }
1344
1345 if (auth_list.NotEmpty())
1346 book.SetAuthors(*auth_list);
1347 return cit_let;
1348 }
1349
1350 /**********************************************************
1351 *
1352 * static CitBookPtr get_whole_book(bptr, auth, title,
1353 * pre):
1354 *
1355 * Return a CitBook pointer.
1356 *
1357 * 11-14-93
1358 *
1359 **********************************************************/
get_whole_book(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,int pre)1360 static CRef<objects::CCit_book> get_whole_book(char* bptr, CRef<objects::CAuth_list>& auth_list,
1361 CRef<objects::CTitle::C_E>& title, int pre)
1362 {
1363 CRef<objects::CCit_book> cit_book;
1364
1365 char* s;
1366
1367 for(bptr += 5; IS_WHITESP(*bptr) != 0;)
1368 bptr++;
1369
1370
1371 for(s = bptr; *s != '\0' && *s != '(';)
1372 s++;
1373
1374 if(*s != '(')
1375 {
1376 ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1377 "Fail to parse book: missing date");
1378 return cit_book;
1379 }
1380
1381 cit_book.Reset(new objects::CCit_book);
1382
1383 if (pre > 0)
1384 cit_book->SetImp().SetPrepub(static_cast<objects::CImprint::EPrepub>(pre));
1385
1386 CRef<objects::CDate> date = get_date(s + 1);
1387 if (date.NotEmpty())
1388 cit_book->SetImp().SetDate(*date);
1389
1390 *s = '\0';
1391 for(s = bptr; *s != '\0' && *s != '.';)
1392 s++;
1393
1394 CRef<objects::CTitle::C_E> book_title(new objects::CTitle::C_E);
1395 book_title->SetName(std::string(bptr, s));
1396 cit_book->SetTitle().Set().push_back(book_title);
1397
1398 if(*s == '.')
1399 {
1400 for(s++; IS_WHITESP(*s) != 0;)
1401 s++;
1402
1403 cit_book->SetImp().SetPub().SetStr(NStr::Sanitize(s));
1404 }
1405
1406 if (auth_list.Empty() || !auth_list->IsSetNames())
1407 {
1408 ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1409 "Fail to parse thesis: missing thesis author");
1410 cit_book.Reset();
1411 return cit_book;
1412 }
1413
1414 cit_book->SetAuthors(*auth_list);
1415
1416 return cit_book;
1417 }
1418
1419 /**********************************************************
1420 *
1421 * static CitSubPtr get_sub(pp, bptr, auth):
1422 *
1423 * Return a CitSub pointer.
1424 *
1425 **********************************************************/
get_sub(ParserPtr pp,char * bptr,CRef<objects::CAuth_list> & auth_list)1426 static CRef<objects::CCit_sub> get_sub(ParserPtr pp, char* bptr, CRef<objects::CAuth_list>& auth_list)
1427 {
1428 const char **b;
1429 char* s;
1430 Int2 medium = OTHER_MEDIUM;
1431
1432 CRef<objects::CCit_sub> ret;
1433
1434 for(s = bptr; *s != '(' && *s != '\0';)
1435 s++;
1436 if(*s == '\0')
1437 {
1438 ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1439 "Fail to parse submission: missing date");
1440 return ret;
1441 }
1442
1443 ret.Reset(new objects::CCit_sub);
1444 CRef<objects::CDate> date;
1445
1446 if(pp != NULL && pp->entrylist != NULL &&
1447 IsNewAccessFormat(pp->entrylist[pp->curindx]->acnum) == 0 &&
1448 (StringChr(ParFlat_LANL_AC,
1449 pp->entrylist[pp->curindx]->acnum[0]) != NULL) &&
1450 isdigit((int) *(s + 1)) == 0)
1451 {
1452 date = get_lanl_date(s);
1453 }
1454 else
1455 {
1456 CRef<objects::CDate_std> std_date = get_full_date(s + 1, true, pp->source);
1457 date.Reset(new objects::CDate);
1458 date->SetStd(*std_date);
1459 }
1460
1461 if (date.Empty())
1462 return ret;
1463
1464 ret.Reset(new objects::CCit_sub);
1465 ret->SetDate(*date);
1466
1467 s = s + 13;
1468 if(StringStr(s, "E-mail") != NULL)
1469 medium = EMAIL_MEDIUM;
1470
1471 if(StringNICmp(" on tape", s, 8) == 0)
1472 {
1473 medium = TAPE_MEDIUM;
1474 for(s += 8; *s != '\0' && *s != ':';)
1475 s++;
1476 }
1477 if(*s != '\0' && *(s + 1) != '\0')
1478 {
1479 while(*s == ' ')
1480 s++;
1481
1482 if(*s == ':')
1483 s++;
1484 for(;;)
1485 {
1486 for(b = strip_sub_str; *b != NULL; b++)
1487 {
1488 size_t l_str = StringLen(*b);
1489 if(StringNCmp(s, *b, l_str) == 0)
1490 {
1491 for(s += l_str; *s == ' ' || *s == '.';)
1492 s++;
1493 break;
1494 }
1495 }
1496 if(*b == NULL)
1497 break;
1498 }
1499
1500 if (*s != '\0' && auth_list.NotEmpty())
1501 {
1502 auth_list->SetAffil().SetStr(NStr::Sanitize(s));
1503 }
1504 }
1505
1506 if(*s == '\0')
1507 {
1508 ErrPostEx(SEV_WARNING, ERR_REFERENCE_NoContactInfo,
1509 "Missing contact info : %s", bptr);
1510 }
1511
1512 if (auth_list.Empty() || !auth_list->IsSetNames())
1513 {
1514 ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1515 "Direct submission: missing author (cit-gen created)");
1516
1517 ret.Reset();
1518 return ret;
1519 }
1520
1521 ret->SetAuthors(*auth_list);
1522 ret->SetMedium(static_cast<objects::CCit_sub::EMedium>(medium));
1523
1524 return ret;
1525 }
1526
1527 /**********************************************************
1528 *
1529 * static CitSubPtr get_sub_gsdb(bptr, auth, title, pp):
1530 *
1531 * GSDB specific format for CitSub :
1532 * REFERENCE 1 (bases 1 to 378)
1533 * AUTHORS Mundt,M.O.
1534 * TITLE Published by M.O. Mundt, Genomics LS-3,
1535 * Los Alamos National Laboratory,
1536 * Mail Stop M888, Los Alamos, NM, USA, 87545
1537 * JOURNAL Published in GSDB (11-OCT-1996)
1538 *
1539 **********************************************************/
get_sub_gsdb(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,ParserPtr pp)1540 static CRef<objects::CCit_sub> get_sub_gsdb(char* bptr, CRef<objects::CAuth_list>& auth_list,
1541 CRef<objects::CTitle::C_E>& title, ParserPtr pp)
1542 {
1543 CRef<objects::CCit_sub> cit_sub;
1544
1545 char* s;
1546
1547 for(s = bptr; *s != '(' && *s != '\0';)
1548 s++;
1549 if(*s == '\0')
1550 {
1551 ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1552 "Fail to parse submission: missing date");
1553 return cit_sub;
1554 }
1555
1556 CRef<objects::CDate_std> std_date = get_full_date(s + 1, true, pp->source);
1557 if(std_date.Empty())
1558 return cit_sub;
1559
1560 CRef<objects::CDate> date;
1561 date->SetStd(*std_date);
1562
1563 if (auth_list.Empty() || !auth_list->IsSetNames())
1564 {
1565 ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
1566 "Direct submission: missing author (cit-gen created)");
1567 return cit_sub;
1568 }
1569
1570 cit_sub.Reset(new objects::CCit_sub);
1571 cit_sub->SetAuthors(*auth_list);
1572 cit_sub->SetDate(*date);
1573
1574 if (title.NotEmpty())
1575 {
1576 const Char* s = title->GetName().c_str();
1577 size_t l_str = StringLen("Published by");
1578 if(StringNCmp(s, "Published by", l_str) == 0)
1579 {
1580 s += l_str;
1581 while(*s == ' ')
1582 s++;
1583 }
1584
1585 if(*s != '\0')
1586 {
1587 auth_list->SetAffil().SetStr(NStr::Sanitize(s));
1588 }
1589 else
1590 {
1591 ErrPostEx(SEV_WARNING, ERR_REFERENCE_NoContactInfo,
1592 "Missing contact info : %s", bptr);
1593 }
1594 }
1595 else
1596 {
1597 ErrPostEx(SEV_WARNING, ERR_REFERENCE_NoContactInfo,
1598 "Missing contact info : %s", bptr);
1599 }
1600
1601 return cit_sub;
1602 }
1603
1604 /**********************************************************/
fta_get_citgen(char * bptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title)1605 static CRef<objects::CCit_gen> fta_get_citgen(char* bptr, CRef<objects::CAuth_list>& auth_list,
1606 CRef<objects::CTitle::C_E>& title)
1607 {
1608 CRef<objects::CCit_gen> cit_gen;
1609
1610 char* p;
1611 char* q;
1612 char* r;
1613 Char ch;
1614 Int2 year;
1615
1616 if (bptr == NULL || auth_list.Empty() || !auth_list->IsSetNames() || title.Empty())
1617 return cit_gen;
1618
1619 year = 0;
1620 p = StringChr(bptr, '(');
1621 if(p != NULL)
1622 {
1623 for(p++; *p == ' ' || *p == '\t';)
1624 p++;
1625 for(q = p; *p >= '0' && *p <= '9';)
1626 p++;
1627 for(r = p; *p == ' ' || *p == '\t' || *p == ')';)
1628 p++;
1629 if(*p == '\n' || *p == '\0')
1630 {
1631 ch = *r;
1632 *r = '\0';
1633 year = atoi(q);
1634 if(year < 1900)
1635 *r = ch;
1636 else
1637 {
1638 for(q--; *q == ' ' || *q == '\t' || *q == '(';)
1639 q--;
1640 *++q = '\0';
1641 }
1642 }
1643 }
1644
1645 cit_gen.Reset(new objects::CCit_gen);
1646
1647 if(bptr != NULL)
1648 cit_gen->SetCit(bptr);
1649
1650 cit_gen->SetAuthors(*auth_list);
1651 cit_gen->SetTitle(title->GetName());
1652
1653 if(year >= 1900)
1654 cit_gen->SetDate().SetStd().SetYear(year);
1655
1656 return cit_gen;
1657 }
1658
1659 /**********************************************************
1660 *
1661 * ValNodePtr journal(pp, bptr, eptr, auth, title,
1662 * has_muid, cit_art, er):
1663 *
1664 * Return a ValNodePtr.
1665 *
1666 **********************************************************/
journal(ParserPtr pp,char * bptr,char * eptr,CRef<objects::CAuth_list> & auth_list,CRef<objects::CTitle::C_E> & title,bool has_muid,CRef<objects::CCit_art> & cit_art,Int4 er)1667 CRef<objects::CPub> journal(ParserPtr pp, char* bptr, char* eptr, CRef<objects::CAuth_list>& auth_list,
1668 CRef<objects::CTitle::C_E>& title, bool has_muid, CRef<objects::CCit_art>& cit_art, Int4 er)
1669 {
1670 int pre = 0;
1671 char* p;
1672 char* nearend;
1673 char* end;
1674 bool all_zeros;
1675 int retval = ParFlat_MISSING_JOURNAL;
1676
1677 CRef<objects::CPub> ret(new objects::CPub);
1678 if(bptr == NULL)
1679 {
1680 const Char* title_str = title.Empty() ? NULL : title->GetName().c_str();
1681 ret->SetGen(*get_unpub(bptr, eptr, auth_list, title_str));
1682 return ret;
1683 }
1684
1685 p = bptr;
1686 size_t my_len = StringLen(p);
1687 if(my_len > 7)
1688 {
1689 nearend = p + StringLen(p) - 1;
1690 while(*nearend == ' ' || *nearend == '\t' || *nearend == '.')
1691 *nearend-- = '\0';
1692
1693 nearend -= 8;
1694 end = nearend + 2;
1695 if(StringNICmp("In press", nearend + 1, 8) == 0)
1696 {
1697 pre = 2;
1698 *(nearend + 1) = '\0';
1699 }
1700 if(StringNICmp("Submitted", nearend, 9) == 0)
1701 {
1702 pre = 1;
1703 *nearend = '\0';
1704 }
1705 if(pre == 0 && *end == '(' && IS_DIGIT(*(end + 1)) != 0)
1706 {
1707 for(nearend = end - 1; nearend > bptr && *nearend != ' ';)
1708 nearend--;
1709 if(StringNICmp("In press", nearend + 1, 8) == 0)
1710 {
1711 pre = 2;
1712 *(nearend + 1) = '\0';
1713 }
1714 }
1715 }
1716
1717 if(my_len >= 6 && *p == '(')
1718 {
1719 p += 6;
1720 my_len -= 6;
1721 if(StringNCmp(" In press", p, 9) == 0)
1722 {
1723 retval = ParFlat_IN_PRESS;
1724 pre = 2;
1725 }
1726 }
1727
1728 p = bptr;
1729 my_len = StringLen(p);
1730 if(StringNCmp("Unpub", p, 5) == 0 || StringNCmp("Unknown", p, 7) == 0)
1731 {
1732 retval = ParFlat_UNPUB_JOURNAL;
1733 const Char* title_str = title.Empty() ? NULL : title->GetName().c_str();
1734 ret->SetGen(*get_unpub(bptr, eptr, auth_list, title_str));
1735 }
1736 else if(StringNCmp("(in)", p, 4) == 0)
1737 {
1738 retval = ParFlat_MONOGRAPH_NOT_JOURNAL;
1739
1740 CRef<objects::CCit_art> article = get_book(bptr, auth_list, title, pre, pp->format, p);
1741
1742 if (article.Empty())
1743 ret->SetGen(*get_error(bptr, auth_list, title));
1744 else
1745 ret->SetArticle(*article);
1746
1747 }
1748 else if (StringNCmp("Thesis", p, 6) == 0)
1749 {
1750 retval = ParFlat_THESIS_CITATION;
1751
1752 CRef<objects::CCit_let> cit_let = get_thesis(bptr, auth_list, title, pre);
1753 if (cit_let.Empty())
1754 {
1755 ret.Reset();
1756 return ret;
1757 }
1758 ret->SetMan(*cit_let);
1759 }
1760 else if (StringNCmp("Submi", p, 5) == 0)
1761 {
1762 retval = ParFlat_SUBMITTED;
1763
1764 CRef<objects::CCit_sub> cit_sub = get_sub(pp, bptr, auth_list);
1765 if (cit_sub.Empty())
1766 {
1767 ret.Reset();
1768 return ret;
1769 }
1770
1771 ret->SetSub(*cit_sub);
1772 }
1773 else if(StringNCmp("Published in GSDB", p, 17) == 0)
1774 {
1775 ErrPostEx(SEV_WARNING, ERR_REFERENCE_GsdbRefDropped,
1776 "A published-in-gsdb reference was encountered and has been dropped [%s]",
1777 bptr);
1778 retval = ParFlat_SUBMITTED;
1779
1780 CRef<objects::CCit_sub> cit_sub = get_sub_gsdb(bptr, auth_list, title, pp);
1781 if (cit_sub.Empty())
1782 {
1783 ret.Reset();
1784 return ret;
1785 }
1786
1787 ret->SetSub(*cit_sub);
1788 }
1789 else if(StringNCmp("Patent", p, 6) == 0 ||
1790 pp->source == Parser::ESource::USPTO)
1791 {
1792 retval = ParFlat_PATENT_CITATION;
1793
1794 if (pp->seqtype == objects::CSeq_id::e_Genbank || pp->seqtype == objects::CSeq_id::e_Ddbj ||
1795 pp->seqtype == objects::CSeq_id::e_Embl || pp->seqtype == objects::CSeq_id::e_Other ||
1796 pp->seqtype == objects::CSeq_id::e_Tpe || pp->seqtype == objects::CSeq_id::e_Tpg ||
1797 pp->seqtype == objects::CSeq_id::e_Tpd ||
1798 pp->source == Parser::ESource::USPTO)
1799 {
1800 CRef<objects::CCit_pat> cit_pat = get_pat(pp, bptr, auth_list, title, eptr);
1801 if (cit_pat.Empty())
1802 {
1803 ret.Reset();
1804 return ret;
1805 }
1806
1807 ret->SetPatent(*cit_pat);
1808 }
1809 else
1810 {
1811 ret.Reset();
1812 return ret;
1813 }
1814 }
1815 else if(StringNCmp("Book:", p, 5) == 0)
1816 {
1817 retval = ParFlat_BOOK_CITATION;
1818
1819 CRef<objects::CCit_book> book = get_whole_book(bptr, auth_list, title, pre);
1820 if(book.Empty())
1821 {
1822 ret.Reset();
1823 return ret;
1824 }
1825
1826 ret->SetBook(*book);
1827 }
1828 else if(StringNICmp("Published Only in Database", p, 26) == 0)
1829 {
1830 retval = ParFlat_GEN_CITATION;
1831 CRef<objects::CCit_gen> cit_gen = fta_get_citgen(bptr, auth_list, title);
1832
1833 if (cit_gen.Empty())
1834 {
1835 ret.Reset();
1836 return ret;
1837 }
1838
1839 ret->SetGen(*cit_gen);
1840 }
1841 else if(StringNICmp("Online Publication", p, 18) == 0)
1842 {
1843 retval = ParFlat_ONLINE_CITATION;
1844
1845 CRef<objects::CCit_gen> cit_gen = fta_get_citgen(bptr, auth_list, title);
1846
1847 if (cit_gen.Empty())
1848 {
1849 ret.Reset();
1850 return ret;
1851 }
1852
1853 ret->SetGen(*cit_gen);
1854 }
1855
1856 if(retval == ParFlat_MISSING_JOURNAL)
1857 {
1858 if(cit_art.NotEmpty())
1859 ret->SetArticle(*cit_art);
1860 else
1861 {
1862 CRef<objects::CCit_art> new_art = get_art(pp, bptr, auth_list, title, pre,
1863 has_muid, &all_zeros, er);
1864 if (new_art.Empty())
1865 {
1866 if(!all_zeros &&
1867 StringNCmp(bptr, "(er)", 4) != 0 && er == 0)
1868 ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
1869 "Journal format error (cit-gen created): %s",
1870 bptr);
1871
1872 ret->SetGen(*get_error(bptr, auth_list, title));
1873 }
1874 else
1875 ret->SetArticle(*new_art);
1876 }
1877 }
1878
1879 return ret;
1880 }
1881
/**********************************************************
 *
 *   static char* FindBackSemicolon(pchStart, pchCurrent):
 *
 *      Looks backwards from the character in front of
 *   "pchCurrent", skipping white space, but not further
 *   back than "pchStart". Returns a pointer to the first
 *   non-blank character found if it is ';', otherwise
 *   NULL. NULL is also returned when either argument is
 *   NULL or when "pchStart" is not in front of
 *   "pchCurrent".
 *
 **********************************************************/
static char* FindBackSemicolon(char* pchStart, char* pchCurrent)
{
    if(pchStart == NULL || pchCurrent == NULL || pchStart >= pchCurrent)
        return(NULL);

    /* The pointer is decremented only while it is still above pchStart,
     * so it never ends up in front of the buffer.
     */
    while(pchCurrent > pchStart)
    {
        pchCurrent--;

        /* isspace() must not be handed a negative value */
        if(isspace(static_cast<unsigned char>(*pchCurrent)) != 0)
            continue;

        return((*pchCurrent == ';') ? pchCurrent : NULL);
    }

    return(NULL);
}
1899
1900 /**********************************************************/
FindSemicolon(char * str)1901 static char* FindSemicolon(char* str)
1902 {
1903 if(str == NULL || *str == '\0')
1904 return(NULL);
1905
1906 str = SkipSpaces(str);
1907
1908 if(*str == ';')
1909 return(str);
1910
1911 return(NULL);
1912 }
1913
1914 /**********************************************************/
ExtractErratum(char * comm)1915 static char* ExtractErratum(char* comm)
1916 {
1917 char* start;
1918 char* pchNumber = NULL;
1919 char* end;
1920 char* p;
1921
1922 if(comm == NULL)
1923 return(NULL);
1924
1925 start = StringStr(comm, "Erratum:");
1926 if(start == NULL)
1927 return(comm);
1928
1929 end = StringChr(start, ']');
1930 if(end == NULL)
1931 return(comm);
1932
1933 pchNumber = end + 1;
1934 end = FindSemicolon(pchNumber);
1935 if(end != NULL)
1936 pchNumber = end + 1;
1937 p = FindBackSemicolon(comm, start);
1938 if(p != NULL)
1939 start = p;
1940 fta_StringCpy(start, pchNumber);
1941
1942 /* Check if the string after cutting signature is empty. If it's really
1943 * empty we have to ignore the whole string (comment).
1944 * Do you want to have a comment which contains nothing!? Probably no.
1945 */
1946 for(p = comm; *p == ' ' || *p == '\t' || *p == '\n';)
1947 p++;
1948 if(*p == '\0')
1949 *comm = '\0';
1950
1951 return(comm);
1952 }
1953
1954 /**********************************************************/
XMLGetXrefs(char * entry,XmlIndexPtr xip,TQualVector & quals)1955 static void XMLGetXrefs(char* entry, XmlIndexPtr xip, TQualVector& quals)
1956 {
1957 XmlIndexPtr xipqual;
1958
1959 if(entry == NULL || xip == NULL)
1960 return;
1961
1962 for (; xip != NULL; xip = xip->next)
1963 {
1964 if(xip->subtags == NULL)
1965 continue;
1966
1967 CRef<objects::CGb_qual> qual(new objects::CGb_qual);
1968
1969 for(xipqual = xip->subtags; xipqual != NULL; xipqual = xipqual->next)
1970 {
1971 if (xipqual->tag == INSDXREF_DBNAME)
1972 qual->SetQual(XMLGetTagValue(entry, xipqual));
1973 else if(xipqual->tag == INSDXREF_ID)
1974 qual->SetVal(XMLGetTagValue(entry, xipqual));
1975 }
1976
1977 if (qual->IsSetQual() && !qual->GetQual().empty())
1978 quals.push_back(qual);
1979 }
1980 }
1981
1982 /**********************************************************/
fta_add_article_ids(objects::CPub & pub,const std::string & doi,const std::string & agricola)1983 static void fta_add_article_ids(objects::CPub& pub, const std::string& doi, const std::string& agricola)
1984 {
1985 if (doi.empty() && agricola.empty())
1986 return;
1987
1988 if (pub.IsArticle())
1989 {
1990 objects::CCit_art& cit_art = pub.SetArticle();
1991
1992 if (!agricola.empty())
1993 {
1994 CRef<objects::CArticleId> id(new objects::CArticleId);
1995 id->SetOther().SetDb("AGRICOLA");
1996 id->SetOther().SetTag().SetStr(agricola);
1997
1998 cit_art.SetIds().Set().push_front(id);
1999 }
2000
2001 if (!doi.empty())
2002 {
2003 CRef<objects::CArticleId> id(new objects::CArticleId);
2004 id->SetDoi().Set(doi);
2005
2006 cit_art.SetIds().Set().push_front(id);
2007 }
2008 }
2009 }
2010
2011 /**********************************************************/
fta_remark_is_er(const Char * str)2012 Int4 fta_remark_is_er(const Char* str)
2013 {
2014 const char **b;
2015 char* s;
2016 Int4 i;
2017
2018 s = StringSave(str);
2019 ShrinkSpaces(s);
2020 for(i = 1, b = ERRemarks; *b != NULL; b++, i++)
2021 if(StringIStr(s, *b) != NULL)
2022 break;
2023
2024 MemFree(s);
2025 if(*b == NULL)
2026 return(0);
2027 if(i < 7)
2028 return(1); /* epublish */
2029 return(2); /* aheadofprint */
2030 }
2031
2032 /**********************************************************/
XMLRefs(ParserPtr pp,DataBlkPtr dbp,bool & no_auth,bool & rej)2033 static CRef<objects::CPubdesc> XMLRefs(ParserPtr pp, DataBlkPtr dbp, bool& no_auth, bool& rej)
2034 {
2035 char* title;
2036
2037 char* p;
2038 char* q;
2039 char* r;
2040 bool is_online;
2041 Int4 pmid;
2042 bool retstat;
2043
2044 XmlIndexPtr xip;
2045
2046 Int4 er;
2047
2048 CRef<objects::CPubdesc> desc;
2049
2050 if(pp == NULL || dbp == NULL || dbp->offset == NULL || dbp->data == NULL)
2051 return desc;
2052
2053 desc.Reset(new objects::CPubdesc);
2054
2055 p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2056 INSDREFERENCE_REFERENCE);
2057 if(p != NULL && isdigit((int) *p) != 0)
2058 {
2059 desc->SetPub().Set().push_back(get_num(p));
2060 }
2061 else
2062 {
2063 ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
2064 "No reference number.");
2065 }
2066
2067 if(p != NULL)
2068 MemFree(p);
2069
2070 p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2071 INSDREFERENCE_MEDLINE);
2072 if(p != NULL)
2073 {
2074 rej = true;
2075 MemFree(p);
2076 desc.Reset();
2077 return desc;
2078 }
2079
2080 pmid = 0;
2081 p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2082 INSDREFERENCE_PUBMED);
2083 if(p != NULL)
2084 {
2085 pmid = NStr::StringToInt(p, NStr::fAllowTrailingSymbols);
2086 MemFree(p);
2087 }
2088
2089 CRef<objects::CAuth_list> auth_list;
2090
2091 p = XMLConcatSubTags(dbp->offset, (XmlIndexPtr) dbp->data,
2092 INSDREFERENCE_AUTHORS, ',');
2093 if(p != NULL)
2094 {
2095 if(pp->xml_comp)
2096 {
2097 q = StringRChr(p, '.');
2098 if(q == NULL || q[1] != '\0')
2099 {
2100 q = (char*) MemNew(StringLen(p) + 2);
2101 StringCpy(q, p);
2102 StringCat(q, ".");
2103 MemFree(p);
2104 p = q;
2105 q = NULL;
2106 }
2107 }
2108 for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2109 q++;
2110 if(*q != '\0')
2111 {
2112 q = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2113 INSDREFERENCE_JOURNAL);
2114 get_auth(p, (pp->source == Parser::ESource::EMBL) ? EMBL_REF : GB_REF, q, auth_list);
2115 MemFree(q);
2116 }
2117 MemFree(p);
2118 }
2119
2120 p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2121 INSDREFERENCE_CONSORTIUM);
2122 if(p != NULL)
2123 {
2124 for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2125 q++;
2126
2127 if (*q != '\0')
2128 get_auth_consortium(p, auth_list);
2129
2130 MemFree(p);
2131 }
2132
2133 if (auth_list.Empty() || !auth_list->IsSetNames())
2134 no_auth = true;
2135
2136 p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2137 INSDREFERENCE_TITLE);
2138
2139 CRef<objects::CTitle::C_E> title_art(new objects::CTitle::C_E);
2140 if (p != NULL)
2141 {
2142 if(StringNCmp(p, "Direct Submission", 17) != 0 &&
2143 *p != '\0' && *p != ';')
2144 {
2145 title = clean_up(p);
2146 if(title != NULL)
2147 {
2148 title_art->SetName(tata_save(title));
2149 MemFree(title);
2150 }
2151 }
2152 MemFree(p);
2153 }
2154
2155 is_online = false;
2156 p = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2157 INSDREFERENCE_JOURNAL);
2158 if(p == NULL)
2159 {
2160 ErrPostEx(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
2161 "No JOURNAL line, reference dropped");
2162 desc.Reset();
2163 return desc;
2164 }
2165
2166 if(*p == '\0' || *p == ';')
2167 {
2168 ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
2169 "JOURNAL line is empty, reference dropped");
2170 MemFree(p);
2171 desc.Reset();
2172 return desc;
2173 }
2174
2175 if (NStr::EqualNocase(p, 0, 18, "Online Publication"))
2176 is_online = true;
2177
2178 r = XMLFindTagValue(dbp->offset, (XmlIndexPtr) dbp->data,
2179 INSDREFERENCE_REMARK);
2180 if(r != NULL)
2181 {
2182 r = ExtractErratum(r);
2183 desc->SetComment(NStr::Sanitize(r));
2184 MemFree(r);
2185
2186 if(!is_online)
2187 normalize_comment(desc->SetComment());
2188 }
2189
2190 er = fta_remark_is_er(desc->IsSetComment() ? desc->GetComment().c_str() : NULL);
2191
2192 CRef<objects::CCit_art> cit_art;
2193 if((StringNCmp(p, "(er)", 4) == 0 || er > 0) &&
2194 pmid > 0 && pp->medserver == 1)
2195 {
2196 cit_art = fta_citart_by_pmid(pmid, retstat);
2197 if(retstat && cit_art.Empty())
2198 pmid = 0;
2199 }
2200
2201 if (pmid > 0)
2202 {
2203 CRef<objects::CPub> pub(new objects::CPub);
2204 pub->SetPmid().Set(ENTREZ_ID_FROM(int, pmid));
2205 desc->SetPub().Set().push_back(pub);
2206 }
2207
2208 CRef<objects::CPub> pub_ref = journal(pp, p, p + StringLen(p), auth_list, title_art, false, cit_art, er);
2209 MemFree(p);
2210
2211 TQualVector xrefs;
2212 for (xip = (XmlIndexPtr)dbp->data; xip != NULL; xip = xip->next)
2213 {
2214 if (xip->tag == INSDREFERENCE_XREF)
2215 XMLGetXrefs(dbp->offset, xip->subtags, xrefs);
2216 }
2217
2218 std::string doi;
2219 std::string agricola;
2220 ITERATE(TQualVector, xref, xrefs)
2221 {
2222 if (!(*xref)->IsSetQual())
2223 continue;
2224
2225 if (NStr::EqualNocase((*xref)->GetQual(), "ARGICOLA") && agricola.empty())
2226 agricola = (*xref)->GetVal();
2227 else if (NStr::EqualNocase((*xref)->GetQual(), "DOI") && doi.empty())
2228 doi = (*xref)->GetVal();
2229 }
2230
2231 fta_add_article_ids(*pub_ref, doi, agricola);
2232
2233 if (pub_ref.Empty())
2234 {
2235 desc.Reset();
2236 return desc;
2237 }
2238
2239 if(dbp->type == ParFlat_REF_NO_TARGET)
2240 desc->SetReftype(3);
2241
2242 desc->SetPub().Set().push_back(pub_ref);
2243
2244 return desc;
2245 }
2246
2247 /**********************************************************/
gb_refs_common(ParserPtr pp,DataBlkPtr dbp,Int4 col_data,bool bParser,DataBlkPtr ** ppInd,bool & no_auth)2248 CRef<objects::CPubdesc> gb_refs_common(ParserPtr pp, DataBlkPtr dbp, Int4 col_data,
2249 bool bParser, DataBlkPtr** ppInd, bool& no_auth)
2250 {
2251 static DataBlkPtr ind[MAXKW+1];
2252
2253 bool has_muid;
2254 char* p;
2255 char* q;
2256 char* r;
2257 bool is_online;
2258 Int4 pmid;
2259 bool retstat;
2260 Int4 er;
2261
2262 CRef<objects::CPubdesc> desc(new objects::CPubdesc);
2263
2264 p = dbp->offset + col_data;
2265 if(bParser)
2266 {
2267 /* This branch works when this function called in context of PARSER
2268 */
2269 if(*p >= '0' && *p <= '9')
2270 desc->SetPub().Set().push_back(get_num(p));
2271 else
2272 ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
2273 "No reference number.");
2274 ind_subdbp(dbp, ind, MAXKW, Parser::EFormat::GenBank);
2275 }
2276 else
2277 {
2278 /* This branch works when this function is called in context of GBDIFF
2279 */
2280 if(ppInd != NULL)
2281 {
2282 ind_subdbp(dbp, ind, MAXKW, Parser::EFormat::GenBank);
2283 *ppInd = &ind[0];
2284
2285 return desc;
2286 }
2287
2288 if(*p < '0' || *p > '9')
2289 ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
2290 "No reference number.");
2291 }
2292
2293 has_muid = false;
2294 if(ind[ParFlat_MEDLINE] != NULL)
2295 {
2296 p = ind[ParFlat_MEDLINE]->offset;
2297 CRef<objects::CPub> pub = get_muid(p, Parser::EFormat::GenBank);
2298 if (pub.NotEmpty())
2299 {
2300 has_muid = true;
2301 desc->SetPub().Set().push_back(get_num(p));
2302 }
2303 }
2304
2305 pmid = 0;
2306 if(ind[ParFlat_PUBMED] != NULL)
2307 {
2308 p = ind[ParFlat_PUBMED]->offset;
2309 if(p != NULL)
2310 pmid = NStr::StringToInt(p, NStr::fAllowTrailingSymbols);
2311 }
2312
2313 CRef<objects::CAuth_list> auth_list;
2314 if(ind[ParFlat_AUTHORS] != NULL)
2315 {
2316 p = ind[ParFlat_AUTHORS]->offset;
2317 for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2318 q++;
2319
2320 if(*q != '\0')
2321 {
2322 if(ind[ParFlat_JOURNAL] != NULL)
2323 q = ind[ParFlat_JOURNAL]->offset;
2324
2325 get_auth(p, GB_REF, q, auth_list);
2326 }
2327 }
2328
2329 if(ind[ParFlat_CONSRTM] != NULL)
2330 {
2331 p = ind[ParFlat_CONSRTM]->offset;
2332 for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2333 q++;
2334
2335 if (*q != '\0')
2336 get_auth_consortium(p, auth_list);
2337 }
2338
2339 if (auth_list.Empty() || !auth_list->IsSetNames())
2340 no_auth = true;
2341
2342 CRef<objects::CTitle::C_E> title_art;
2343 if(ind[ParFlat_TITLE] != NULL)
2344 {
2345 p = ind[ParFlat_TITLE]->offset;
2346 if(StringNCmp(p, "Direct Submission", 17) != 0 &&
2347 *p != '\0' && *p != ';')
2348 {
2349 q = clean_up(p);
2350 if(q != NULL)
2351 {
2352 title_art.Reset(new objects::CTitle::C_E);
2353 title_art->SetName(NStr::Sanitize(q));
2354 MemFree(q);
2355 }
2356 }
2357 }
2358
2359 if(ind[ParFlat_JOURNAL] == NULL)
2360 {
2361 ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
2362 "No JOURNAL line, reference dropped");
2363
2364 desc.Reset();
2365 return desc;
2366 }
2367
2368 p = ind[ParFlat_JOURNAL]->offset;
2369 if(*p == '\0' || *p == ';')
2370 {
2371 ErrPostStr(SEV_ERROR, ERR_REFERENCE_Fail_to_parse,
2372 "JOURNAL line is empty, reference dropped");
2373
2374 desc.Reset();
2375 return desc;
2376 }
2377
2378 is_online = (StringNICmp(p, "Online Publication", 18) == 0);
2379
2380 if(ind[ParFlat_REMARK] != NULL)
2381 {
2382 r = ind[ParFlat_REMARK]->offset;
2383 r = ExtractErratum(r);
2384 desc->SetComment(NStr::Sanitize(r));
2385
2386 if(!is_online)
2387 normalize_comment(desc->SetComment());
2388 }
2389
2390 er = fta_remark_is_er(desc->IsSetComment() ? desc->GetComment().c_str() : NULL);
2391
2392 CRef<objects::CCit_art> cit_art;
2393
2394 if(pp->medserver == 1 && pmid > 0 &&
2395 (StringNCmp(p, "(er)", 4) == 0 || er > 0))
2396 {
2397 cit_art = fta_citart_by_pmid(pmid, retstat);
2398 if(retstat && cit_art == NULL)
2399 pmid = 0;
2400 }
2401
2402 if (pmid > 0)
2403 {
2404 CRef<objects::CPub> pub(new objects::CPub);
2405 pub->SetPmid().Set(ENTREZ_ID_FROM(int, pmid));
2406 desc->SetPub().Set().push_back(pub);
2407 }
2408
2409 CRef<objects::CPub> pub_ref = journal(pp, p, p + ind[ParFlat_JOURNAL]->len,
2410 auth_list, title_art, has_muid, cit_art, er);
2411
2412 if (pub_ref.Empty())
2413 {
2414 desc.Reset();
2415 return desc;
2416 }
2417
2418 if(dbp->type == ParFlat_REF_NO_TARGET)
2419 desc->SetReftype(3);
2420
2421 desc->SetPub().Set().push_back(pub_ref);
2422
2423 return desc;
2424 }
2425
2426 /**********************************************************
2427 *
2428 * static PubdescPtr embl_refs(pp, dbp, col_data, no_auth):
2429 *
2430 * Parse EMBL references. Return a Pubdesc pointer.
2431 *
2432 * 11-14-93
2433 *
2434 **********************************************************/
embl_refs(ParserPtr pp,DataBlkPtr dbp,Int4 col_data,bool & no_auth)2435 static CRef<objects::CPubdesc> embl_refs(ParserPtr pp, DataBlkPtr dbp, Int4 col_data, bool& no_auth)
2436 {
2437 static DataBlkPtr ind[MAXKW+1];
2438 char* s;
2439
2440 char* title;
2441 bool has_muid;
2442 char* p;
2443 char* q;
2444 Int4 pmid;
2445
2446 bool retstat;
2447 Int4 er;
2448
2449 CRef<objects::CPubdesc> desc(new objects::CPubdesc);
2450
2451 p = dbp->offset + col_data;
2452 while((*p < '0' || *p > '9') && dbp->len > 0)
2453 p++;
2454 if(*p >= '0' && *p <= '9')
2455 desc->SetPub().Set().push_back(get_num(p));
2456 else
2457 ErrPostEx(SEV_WARNING, ERR_REFERENCE_Illegalreference,
2458 "No reference number.");
2459
2460 ind_subdbp(dbp, ind, MAXKW, Parser::EFormat::EMBL);
2461
2462 has_muid = false;
2463 pmid = 0;
2464
2465 std::string doi;
2466 std::string agricola;
2467
2468 if(ind[ParFlat_RC] != NULL)
2469 desc->SetComment(NStr::Sanitize(ind[ParFlat_RC]->offset));
2470
2471 er = fta_remark_is_er(desc->IsSetComment() ? desc->GetComment().c_str() : NULL);
2472
2473 if(ind[ParFlat_RX] != NULL)
2474 {
2475 p = ind[ParFlat_RX]->offset;
2476 CRef<objects::CPub> pub = get_muid(p, Parser::EFormat::EMBL);
2477
2478 const Char* id = get_embl_str_pub_id(p, "DOI;");
2479 if (id)
2480 doi = id;
2481
2482 id = get_embl_str_pub_id(p, "AGRICOLA;");
2483 if (id)
2484 agricola = id;
2485
2486 if (pub.NotEmpty())
2487 {
2488 desc->SetPub().Set().push_back(pub);
2489 has_muid = true;
2490 }
2491
2492 pmid = get_embl_pmid(p);
2493 }
2494
2495 CRef<objects::CAuth_list> auth_list;
2496 if(ind[ParFlat_RA] != NULL)
2497 {
2498 p = ind[ParFlat_RA]->offset;
2499 s = p + StringLen(p) - 1;
2500 if(*s == ';')
2501 *s = '\0';
2502 for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2503 q++;
2504 if(*q != '\0')
2505 {
2506 if(ind[ParFlat_RL] != NULL)
2507 q = ind[ParFlat_RL]->offset;
2508
2509 get_auth(p, EMBL_REF, q, auth_list);
2510 }
2511 }
2512
2513 if(ind[ParFlat_RG] != NULL)
2514 {
2515 p = ind[ParFlat_RG]->offset;
2516 s = p + StringLen(p) - 1;
2517 if(*s == ';')
2518 *s = '\0';
2519
2520 for(q = p; *q == ' ' || *q == '.' || *q == ',';)
2521 q++;
2522
2523 if (*q != '\0')
2524 get_auth_consortium(p, auth_list);
2525 }
2526
2527 if (auth_list.Empty() || !auth_list->IsSetNames())
2528 no_auth = true;
2529
2530 CRef<objects::CTitle::C_E> title_art;
2531 if (ind[ParFlat_RT] != NULL)
2532 {
2533 p = ind[ParFlat_RT]->offset;
2534 if(*p != '\0' && *p != ';')
2535 {
2536 title = clean_up(p);
2537 if (title != NULL && title[0])
2538 {
2539 title_art.Reset(new objects::CTitle::C_E);
2540 title_art->SetName(NStr::Sanitize(title));
2541 }
2542 MemFree(title);
2543 }
2544 }
2545
2546 if(ind[ParFlat_RL] == NULL)
2547 {
2548 ErrPostStr(SEV_ERROR, ERR_REFERENCE_Illegalreference,
2549 "No JOURNAL line, reference dropped.");
2550
2551 desc.Reset();
2552 return desc;
2553 }
2554
2555 p = ind[ParFlat_RL]->offset;
2556 if(*p == '\0' || *p == ';')
2557 {
2558 ErrPostStr(SEV_ERROR, ERR_REFERENCE_Illegalreference,
2559 "JOURNAL line is empty, reference dropped.");
2560
2561 desc.Reset();
2562 return desc;
2563 }
2564
2565 CRef<objects::CCit_art> cit_art;
2566 if ((StringNCmp(p, "(er)", 4) == 0 || er > 0) &&
2567 pmid > 0 && pp->medserver == 1)
2568 {
2569 cit_art = fta_citart_by_pmid(pmid, retstat);
2570 if(retstat && cit_art == NULL)
2571 pmid = 0;
2572 }
2573
2574 if (pmid > 0)
2575 {
2576 CRef<objects::CPub> pub(new objects::CPub);
2577 pub->SetPmid().Set(ENTREZ_ID_FROM(int, pmid));
2578 desc->SetPub().Set().push_back(pub);
2579 }
2580
2581 CRef<objects::CPub> pub_ref = journal(pp, p, p + ind[ParFlat_RL]->len, auth_list,
2582 title_art, has_muid, cit_art, er);
2583
2584 if (pub_ref.Empty())
2585 {
2586 desc.Reset();
2587 return desc;
2588 }
2589
2590 fta_add_article_ids(*pub_ref, doi, agricola);
2591
2592 if(dbp->type == ParFlat_REF_NO_TARGET)
2593 desc->SetReftype(3);
2594
2595 desc->SetPub().Set().push_back(pub_ref);
2596
2597 return desc;
2598 }
2599
2600 /**********************************************************/
fta_sort_pubs(TPubList & pubs)2601 static void fta_sort_pubs(TPubList& pubs)
2602 {
2603 NON_CONST_ITERATE(TPubList, pub, pubs)
2604 {
2605 TPubList::iterator next_pub = pub;
2606 for (++next_pub; next_pub != pubs.end(); ++next_pub)
2607 {
2608 if ((*next_pub)->Which() > (*pub)->Which())
2609 continue;
2610
2611 if ((*next_pub)->Which() == (*pub)->Which())
2612 {
2613 if (!(*pub)->IsMuid() || (*pub)->GetMuid() >= (*next_pub)->GetMuid())
2614 continue;
2615 }
2616
2617 pub->Swap(*next_pub);
2618 }
2619 }
2620 }
2621
2622 /**********************************************************/
fta_check_long_last_name(const objects::CAuth_list & authors,bool soft_report)2623 static void fta_check_long_last_name(const objects::CAuth_list& authors, bool soft_report)
2624 {
2625 static const size_t MAX_LAST_NAME_LEN = 30;
2626
2627 ErrSev sev;
2628
2629 if (!authors.IsSetNames() || !authors.GetNames().IsStd())
2630 return;
2631
2632 ITERATE(objects::CAuth_list::C_Names::TStd, author, authors.GetNames().GetStd())
2633 {
2634 if (!(*author)->IsSetName() || !(*author)->GetName().IsName())
2635 continue;
2636
2637 const objects::CName_std& name = (*author)->GetName().GetName();
2638
2639 if (name.IsSetLast() && name.GetLast().size() > MAX_LAST_NAME_LEN)
2640 {
2641 /* Downgrade severity of this error to WARNING
2642 * if in HTGS mode. As of 7/31/2002, very long
2643 * consortium names were treated as if
2644 * they were author last names, for HTGS data.
2645 * This can be reverted to ERROR after the
2646 * consortium name slot is available and utilized
2647 * in the ASN.1.
2648 */
2649 sev = (soft_report ? SEV_WARNING : SEV_ERROR);
2650 ErrPostEx(sev, ERR_REFERENCE_LongAuthorName,
2651 "Last name of author exceeds 30 characters in length. A format error in the reference data might have caused the author name to be parsed incorrectly. Name is \"%s\".",
2652 name.GetLast().c_str());
2653 }
2654 }
2655 }
2656
2657 /**********************************************************/
fta_check_long_name_in_article(const objects::CCit_art & cit_art,bool soft_report)2658 static void fta_check_long_name_in_article(const objects::CCit_art& cit_art, bool soft_report)
2659 {
2660 if (cit_art.IsSetAuthors())
2661 fta_check_long_last_name(cit_art.GetAuthors(), soft_report);
2662
2663 if (cit_art.IsSetFrom())
2664 {
2665 const objects::CCit_book* book = nullptr;
2666 if (cit_art.GetFrom().IsBook())
2667 book = &cit_art.GetFrom().GetBook();
2668 else if (cit_art.GetFrom().IsProc())
2669 {
2670 if (cit_art.GetFrom().GetProc().IsSetBook())
2671 book = &cit_art.GetFrom().GetProc().GetBook();
2672 }
2673
2674 if (book != nullptr && book->IsSetAuthors())
2675 fta_check_long_last_name(book->GetAuthors(), soft_report);
2676 }
2677 }
2678
2679 /**********************************************************/
fta_check_long_names(const objects::CPub & pub,bool soft_report)2680 static void fta_check_long_names(const objects::CPub& pub, bool soft_report)
2681 {
2682 if (pub.IsGen()) /* CitGen */
2683 {
2684 const objects::CCit_gen& cit_gen = pub.GetGen();
2685 if (cit_gen.IsSetAuthors())
2686 fta_check_long_last_name(cit_gen.GetAuthors(), soft_report);
2687 }
2688 else if (pub.IsSub()) /* CitSub */
2689 {
2690 if (!soft_report)
2691 {
2692 const objects::CCit_sub& cit_sub = pub.GetSub();
2693 if (cit_sub.IsSetAuthors())
2694 fta_check_long_last_name(cit_sub.GetAuthors(), soft_report);
2695 }
2696 }
2697 else if (pub.IsMedline()) /* Medline */
2698 {
2699 const objects::CMedline_entry& medline = pub.GetMedline();
2700 if (medline.IsSetCit())
2701 {
2702 fta_check_long_name_in_article(medline.GetCit(), soft_report);
2703 }
2704 }
2705 else if (pub.IsArticle()) /* CitArt */
2706 {
2707 fta_check_long_name_in_article(pub.GetArticle(), soft_report);
2708 }
2709 else if (pub.IsBook() || pub.IsProc() || pub.IsMan()) /* CitBook or CitProc or
2710 CitLet */
2711 {
2712 const objects::CCit_book* book = nullptr;
2713
2714 if (pub.IsBook())
2715 book = &pub.GetBook();
2716 else if (pub.IsProc())
2717 {
2718 if (pub.GetProc().IsSetBook())
2719 book = &pub.GetProc().GetBook();
2720 }
2721 else
2722 {
2723 if (pub.GetMan().IsSetCit())
2724 book = &pub.GetMan().GetCit();
2725 }
2726
2727 if (book != nullptr && book->IsSetAuthors())
2728 fta_check_long_last_name(book->GetAuthors(), soft_report);
2729 }
2730 else if (pub.IsPatent()) /* CitPat */
2731 {
2732 const objects::CCit_pat& patent = pub.GetPatent();
2733
2734 if (patent.IsSetAuthors())
2735 fta_check_long_last_name(patent.GetAuthors(), soft_report);
2736
2737 if (patent.IsSetApplicants())
2738 fta_check_long_last_name(patent.GetApplicants(), soft_report);
2739
2740 if (patent.IsSetAssignees())
2741 fta_check_long_last_name(patent.GetAssignees(), soft_report);
2742 }
2743 else if (pub.IsEquiv()) /* PubEquiv */
2744 {
2745 ITERATE(TPubList, cur_pub, pub.GetEquiv().Get())
2746 {
2747 fta_check_long_names(*(*cur_pub), soft_report);
2748 }
2749 }
2750 }
2751
2752 /**********************************************************/
fta_propagate_pmid_muid(objects::CPub_equiv & pub_equiv)2753 static void fta_propagate_pmid_muid(objects::CPub_equiv& pub_equiv)
2754 {
2755 Int4 pmid;
2756 Int4 muid;
2757
2758 pmid = 0;
2759 muid = 0;
2760
2761 objects::CCit_art* cit_art = nullptr;
2762 NON_CONST_ITERATE(TPubList, pub, pub_equiv.Set())
2763 {
2764 if ((*pub)->IsMuid() && muid == 0)
2765 muid = ENTREZ_ID_TO(int, (*pub)->GetMuid());
2766 else if ((*pub)->IsPmid() && pmid == 0)
2767 pmid = ENTREZ_ID_TO(int, (*pub)->GetPmid().Get());
2768 else if ((*pub)->IsArticle() && cit_art == nullptr)
2769 cit_art = &(*pub)->SetArticle();
2770 }
2771
2772 if (cit_art == NULL || (muid == 0 && pmid == 0))
2773 return;
2774
2775 if(muid != 0)
2776 {
2777 CRef<objects::CArticleId> id(new objects::CArticleId);
2778 id->SetMedline().Set(ENTREZ_ID_FROM(int, muid));
2779 cit_art->SetIds().Set().push_front(id);
2780 }
2781
2782 if(pmid != 0)
2783 {
2784 CRef<objects::CArticleId> id(new objects::CArticleId);
2785 id->SetPubmed().Set(ENTREZ_ID_FROM(int, pmid));
2786 cit_art->SetIds().Set().push_front(id);
2787 }
2788 }
2789
2790 /**********************************************************
2791 *
2792 * PubdescPtr DescrRefs(pp, dbp, col_data):
2793 *
2794 * Return a Pubdesc pointer.
2795 *
2796 * 4-14-93
2797 *
2798 **********************************************************/
DescrRefs(ParserPtr pp,DataBlkPtr dbp,Int4 col_data)2799 CRef<objects::CPubdesc> DescrRefs(ParserPtr pp, DataBlkPtr dbp, Int4 col_data)
2800 {
2801 bool soft_report = false;
2802
2803 bool rej = false;
2804 bool no_auth = false;
2805
2806 if(pp->mode == Parser::EMode::HTGS)
2807 soft_report = true;
2808
2809 CRef<objects::CPubdesc> desc;
2810
2811 if (pp->format == Parser::EFormat::SPROT)
2812 desc = sp_refs(pp, dbp, col_data);
2813 else if(pp->format == Parser::EFormat::XML)
2814 desc = XMLRefs(pp, dbp, no_auth, rej);
2815 else if(pp->format == Parser::EFormat::GenBank)
2816 desc = gb_refs_common(pp, dbp, col_data, true, NULL, no_auth);
2817 else if(pp->format == Parser::EFormat::EMBL)
2818 desc = embl_refs(pp, dbp, col_data, no_auth);
2819
2820 if(desc && desc->IsSetComment())
2821 {
2822 char *comment = (char *) desc->GetComment().c_str();
2823 ShrinkSpaces(comment);
2824 desc->SetComment(comment);
2825 }
2826
2827 if(no_auth)
2828 {
2829 if(pp->source == Parser::ESource::EMBL)
2830 ErrPostEx(SEV_ERROR, ERR_REFERENCE_MissingAuthors,
2831 "Reference has no author names.");
2832 else
2833 {
2834 ErrPostEx(SEV_REJECT, ERR_REFERENCE_MissingAuthors,
2835 "Reference has no author names. Entry dropped.");
2836 pp->entrylist[pp->curindx]->drop = 1;
2837 }
2838 }
2839
2840 if(rej)
2841 {
2842 ErrPostEx(SEV_REJECT, ERR_REFERENCE_InvalidMuid,
2843 "Use of Medline ID in INSDSeq format is not alowed. Entry dropped.");
2844 pp->entrylist[pp->curindx]->drop = 1;
2845 }
2846
2847 if (desc.NotEmpty() && desc->IsSetPub())
2848 {
2849 fta_sort_pubs(desc->SetPub().Set());
2850
2851 ITERATE(TPubList, pub, desc->GetPub().Get())
2852 {
2853 fta_check_long_names(*(*pub), soft_report);
2854 }
2855
2856 fta_propagate_pmid_muid(desc->SetPub());
2857 }
2858
2859 return desc;
2860 }
2861
2862 END_NCBI_SCOPE
2863