1 /* sp_index.c
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  sp_index.c
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description:
32  * -----------------
33  *      Build SWISS-PROT format index block. Parsing SP to memory blocks.
34  *
35  */
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include "index.h"
41 #include "sprot.h"
42 #include "ftaerr.hpp"
43 #include "indx_blk.h"
44 #include "indx_def.h"
45 #include "utilfun.h"
46 #include "entry.h"
47 
48 #ifdef THIS_FILE
49 #    undef THIS_FILE
50 #endif
51 #define THIS_FILE "sp_index.cpp"
52 
53 BEGIN_NCBI_SCOPE
54 
55 KwordBlk spkwl[] = {
56     {"ID", 2}, {"AC", 2}, {"DT", 2}, {"DE", 2}, {"GN", 2}, {"OS", 2},
57     {"RN", 2}, {"CC", 2}, {"PE", 2}, {"DR", 2}, {"KW", 2}, {"FT", 2},
58     {"SQ", 2}, {"//", 2}, {NULL, 0} };
59 
60 /**********************************************************/
sp_err_field(const char * name)61 static Uint1 sp_err_field(const char *name)
62 {
63     ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
64               "Missing %s line, entry dropped", name);
65     return(1);
66 }
67 
68 /**********************************************************/
SPGetVerNum(char * str,IndexblkPtr ibp)69 static void SPGetVerNum(char* str, IndexblkPtr ibp)
70 {
71     char* p;
72     char* q;
73 
74     if(str == NULL || ibp == NULL)
75         return;
76 
77     p = StringIStr(str, "sequence version");
78     if(p == NULL)
79         return;
80 
81     for(p += 16; *p == ' ';)
82         p++;
83     for(q = p; *p >= '0' && *p <= '9';)
84         p++;
85     if(*p == '.' && (p[1] == '\0' || p[1] == '\n'))
86     {
87         *p = '\0';
88         ibp->vernum = atoi(q);
89         *p = '.';
90     }
91 }
92 
93 /**********************************************************
94  *
95  *   bool SprotIndex(pp, (*fun)()):
96  *
97  *                                              3-26-93
98  *
99  **********************************************************/
SprotIndex(ParserPtr pp,void (* fun)(IndexblkPtr entry,char * offset,Int4 len))100 bool SprotIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len))
101 {
102     TokenStatBlkPtr stoken;
103     FinfoBlkPtr     finfo;
104 
105     bool            after_AC;
106     bool            after_OS;
107     bool            after_OC;
108     bool            after_RN;
109     bool            after_SQ;
110     bool            end_of_file;
111 
112     IndexblkPtr     entry;
113     DataBlkPtr      data;
114     Int4            i;
115     Int4            indx = 0;
116     IndBlkNextPtr   ibnp;
117     IndBlkNextPtr   tibnp;
118     char*         p;
119 
120     bool            reviewed;
121 
122     finfo = (FinfoBlkPtr) MemNew(sizeof(FinfoBlk));
123 
124     end_of_file = SkipTitleBuf(pp->ffbuf, finfo, spkwl[ParFlatSP_ID].str,
125                                spkwl[ParFlatSP_ID].len);
126     if(end_of_file)
127     {
128         MsgSkipTitleFail("Swiss-Prot", finfo);
129         return false;
130     }
131 
132     ibnp = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
133     ibnp->next = NULL;
134     tibnp = ibnp;
135 
136     while (!end_of_file)
137     {
138         entry = InitialEntry(pp, finfo);
139         if(entry != NULL)
140         {
141             pp->curindx = indx;
142             tibnp->next = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
143             tibnp = tibnp->next;
144             tibnp->ibp = entry;
145             tibnp->next = NULL;
146 
147             indx++;
148 
149             after_AC = false;
150             after_OS = false;
151             after_OC = false;
152             after_RN = false;
153             after_SQ = false;
154 
155             p = PointToNextToken(finfo->str + ParFlat_COL_DATA_SP);
156             reviewed = (StringNICmp(p, "reviewed", 8) == 0);
157 
158             while(!end_of_file &&
159                   StringNCmp(finfo->str, spkwl[ParFlatSP_END].str,
160                              spkwl[ParFlatSP_END].len) != 0)
161             {
162                 if(StringNCmp(finfo->str, "RM", 2) == 0)
163                 {
164                     ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
165                               "RM line type has been replaced by RX, skipped %s",
166                               finfo->str);
167                 }
168                 if(after_SQ && IS_ALPHA(finfo->str[0]) != 0)
169                 {
170                     ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd,
171                                "Missing end of the entry, entry dropped");
172                     entry->drop = 1;
173                     break;
174                 }
175                 if(StringNCmp(finfo->str, spkwl[ParFlatSP_SQ].str,
176                               spkwl[ParFlatSP_SQ].len) == 0)
177                     after_SQ = true;
178 
179                 if(StringNCmp(finfo->str, spkwl[ParFlatSP_OS].str,
180                               spkwl[ParFlatSP_OS].len) == 0)
181                     after_OS = true;
182 
183                 if(StringNCmp(finfo->str, "OC", 2) == 0)
184                     after_OC = true;
185 
186                 if(StringNCmp(finfo->str, spkwl[ParFlatSP_RN].str,
187                               spkwl[ParFlatSP_RN].len) == 0)
188                     after_RN = true;
189 
190                 if(StringNCmp(finfo->str, spkwl[ParFlatSP_AC].str,
191                               spkwl[ParFlatSP_AC].len) == 0)
192                 {
193                     if(after_AC == false)
194                     {
195                         after_AC = true;
196                         if(!GetAccession(pp, finfo->str, entry, 2))
197                             pp->num_drop++;
198                     }
199                     else if(entry->drop == 0 && !GetAccession(pp, finfo->str, entry, 1))
200                         pp->num_drop++;
201                 }
202                 else if(StringNCmp(finfo->str, spkwl[ParFlatSP_DT].str,
203                                    spkwl[ParFlatSP_DT].len) == 0)
204                 {
205                     if(reviewed && pp->sp_dt_seq_ver && entry->vernum < 1)
206                         SPGetVerNum(finfo->str, entry);
207                     stoken = TokenString(finfo->str, ' ');
208                     if(stoken->num > 2)
209                     {
210                         entry->date = GetUpdateDate(stoken->list->next->str,
211                                                     pp->source);
212                     }
213                     FreeTokenstatblk(stoken);
214                 }
215 
216                 end_of_file = XReadFileBuf(pp->ffbuf, finfo);
217 
218             } /* while, end of one entry */
219 
220             if(entry->drop != 1)
221             {
222                 if(after_AC == false)
223                 {
224                     ErrPostStr(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
225                                "Missing AC (accession #) line, entry dropped");
226                     entry->drop = 1;
227                 }
228 
229                 if(after_OS == false)
230                     entry->drop = sp_err_field("OS (organism)");
231 
232                 if(after_OC == false)
233                     entry->drop = sp_err_field("OC (organism classification)");
234 
235                 if(after_RN == false)
236                     entry->drop = sp_err_field("RN (reference data)");
237 
238                 if(after_SQ == false)
239                     entry->drop = sp_err_field("SQ (sequence data)");
240             }
241 
242             entry->len = (size_t) (pp->ffbuf.current - pp->ffbuf.start) - entry->offset;
243 
244             if(fun != NULL)
245             {
246                 data = LoadEntry(pp, entry->offset, entry->len);
247                 (*fun)(entry, data->offset, static_cast<Int4>(data->len));
248                 FreeEntry(data);
249             }
250         } /* if, entry */
251         else
252         {
253             end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo,
254                                            spkwl[ParFlatSP_END].str,
255                                            spkwl[ParFlatSP_END].len);
256         }
257         end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo,
258                                        spkwl[ParFlatSP_ID].str,
259                                        spkwl[ParFlatSP_ID].len);
260 
261     } /* while, end_of_file */
262 
263     pp->indx = indx;
264 
265     pp->entrylist = (IndexblkPtr*) MemNew(indx* sizeof(IndexblkPtr));
266     tibnp = ibnp->next;
267     MemFree(ibnp);
268     for(i = 0; i < indx && tibnp != NULL; i++, tibnp = ibnp)
269     {
270         pp->entrylist[i] = tibnp->ibp;
271         ibnp = tibnp->next;
272         MemFree(tibnp);
273     }
274     MemFree(finfo);
275 
276     return end_of_file;
277 }
278 
279 END_NCBI_SCOPE
280