/* sp_index.cpp
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
 * File Name:  sp_index.cpp
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 * Build SWISS-PROT format index block. Parsing SP to memory blocks.
34 *
35 */
36 #include <ncbi_pch.hpp>
37
38 #include "ftacpp.hpp"
39
40 #include "index.h"
41 #include "sprot.h"
42 #include "ftaerr.hpp"
43 #include "indx_blk.h"
44 #include "indx_def.h"
45 #include "utilfun.h"
46 #include "entry.h"
47
48 #ifdef THIS_FILE
49 # undef THIS_FILE
50 #endif
51 #define THIS_FILE "sp_index.cpp"
52
53 BEGIN_NCBI_SCOPE
54
55 KwordBlk spkwl[] = {
56 {"ID", 2}, {"AC", 2}, {"DT", 2}, {"DE", 2}, {"GN", 2}, {"OS", 2},
57 {"RN", 2}, {"CC", 2}, {"PE", 2}, {"DR", 2}, {"KW", 2}, {"FT", 2},
58 {"SQ", 2}, {"//", 2}, {NULL, 0} };
59
60 /**********************************************************/
sp_err_field(const char * name)61 static Uint1 sp_err_field(const char *name)
62 {
63 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
64 "Missing %s line, entry dropped", name);
65 return(1);
66 }
67
68 /**********************************************************/
SPGetVerNum(char * str,IndexblkPtr ibp)69 static void SPGetVerNum(char* str, IndexblkPtr ibp)
70 {
71 char* p;
72 char* q;
73
74 if(str == NULL || ibp == NULL)
75 return;
76
77 p = StringIStr(str, "sequence version");
78 if(p == NULL)
79 return;
80
81 for(p += 16; *p == ' ';)
82 p++;
83 for(q = p; *p >= '0' && *p <= '9';)
84 p++;
85 if(*p == '.' && (p[1] == '\0' || p[1] == '\n'))
86 {
87 *p = '\0';
88 ibp->vernum = atoi(q);
89 *p = '.';
90 }
91 }
92
93 /**********************************************************
94 *
95 * bool SprotIndex(pp, (*fun)()):
96 *
97 * 3-26-93
98 *
99 **********************************************************/
SprotIndex(ParserPtr pp,void (* fun)(IndexblkPtr entry,char * offset,Int4 len))100 bool SprotIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len))
101 {
102 TokenStatBlkPtr stoken;
103 FinfoBlkPtr finfo;
104
105 bool after_AC;
106 bool after_OS;
107 bool after_OC;
108 bool after_RN;
109 bool after_SQ;
110 bool end_of_file;
111
112 IndexblkPtr entry;
113 DataBlkPtr data;
114 Int4 i;
115 Int4 indx = 0;
116 IndBlkNextPtr ibnp;
117 IndBlkNextPtr tibnp;
118 char* p;
119
120 bool reviewed;
121
122 finfo = (FinfoBlkPtr) MemNew(sizeof(FinfoBlk));
123
124 end_of_file = SkipTitleBuf(pp->ffbuf, finfo, spkwl[ParFlatSP_ID].str,
125 spkwl[ParFlatSP_ID].len);
126 if(end_of_file)
127 {
128 MsgSkipTitleFail("Swiss-Prot", finfo);
129 return false;
130 }
131
132 ibnp = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
133 ibnp->next = NULL;
134 tibnp = ibnp;
135
136 while (!end_of_file)
137 {
138 entry = InitialEntry(pp, finfo);
139 if(entry != NULL)
140 {
141 pp->curindx = indx;
142 tibnp->next = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
143 tibnp = tibnp->next;
144 tibnp->ibp = entry;
145 tibnp->next = NULL;
146
147 indx++;
148
149 after_AC = false;
150 after_OS = false;
151 after_OC = false;
152 after_RN = false;
153 after_SQ = false;
154
155 p = PointToNextToken(finfo->str + ParFlat_COL_DATA_SP);
156 reviewed = (StringNICmp(p, "reviewed", 8) == 0);
157
158 while(!end_of_file &&
159 StringNCmp(finfo->str, spkwl[ParFlatSP_END].str,
160 spkwl[ParFlatSP_END].len) != 0)
161 {
162 if(StringNCmp(finfo->str, "RM", 2) == 0)
163 {
164 ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
165 "RM line type has been replaced by RX, skipped %s",
166 finfo->str);
167 }
168 if(after_SQ && IS_ALPHA(finfo->str[0]) != 0)
169 {
170 ErrPostStr(SEV_ERROR, ERR_FORMAT_MissingEnd,
171 "Missing end of the entry, entry dropped");
172 entry->drop = 1;
173 break;
174 }
175 if(StringNCmp(finfo->str, spkwl[ParFlatSP_SQ].str,
176 spkwl[ParFlatSP_SQ].len) == 0)
177 after_SQ = true;
178
179 if(StringNCmp(finfo->str, spkwl[ParFlatSP_OS].str,
180 spkwl[ParFlatSP_OS].len) == 0)
181 after_OS = true;
182
183 if(StringNCmp(finfo->str, "OC", 2) == 0)
184 after_OC = true;
185
186 if(StringNCmp(finfo->str, spkwl[ParFlatSP_RN].str,
187 spkwl[ParFlatSP_RN].len) == 0)
188 after_RN = true;
189
190 if(StringNCmp(finfo->str, spkwl[ParFlatSP_AC].str,
191 spkwl[ParFlatSP_AC].len) == 0)
192 {
193 if(after_AC == false)
194 {
195 after_AC = true;
196 if(!GetAccession(pp, finfo->str, entry, 2))
197 pp->num_drop++;
198 }
199 else if(entry->drop == 0 && !GetAccession(pp, finfo->str, entry, 1))
200 pp->num_drop++;
201 }
202 else if(StringNCmp(finfo->str, spkwl[ParFlatSP_DT].str,
203 spkwl[ParFlatSP_DT].len) == 0)
204 {
205 if(reviewed && pp->sp_dt_seq_ver && entry->vernum < 1)
206 SPGetVerNum(finfo->str, entry);
207 stoken = TokenString(finfo->str, ' ');
208 if(stoken->num > 2)
209 {
210 entry->date = GetUpdateDate(stoken->list->next->str,
211 pp->source);
212 }
213 FreeTokenstatblk(stoken);
214 }
215
216 end_of_file = XReadFileBuf(pp->ffbuf, finfo);
217
218 } /* while, end of one entry */
219
220 if(entry->drop != 1)
221 {
222 if(after_AC == false)
223 {
224 ErrPostStr(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
225 "Missing AC (accession #) line, entry dropped");
226 entry->drop = 1;
227 }
228
229 if(after_OS == false)
230 entry->drop = sp_err_field("OS (organism)");
231
232 if(after_OC == false)
233 entry->drop = sp_err_field("OC (organism classification)");
234
235 if(after_RN == false)
236 entry->drop = sp_err_field("RN (reference data)");
237
238 if(after_SQ == false)
239 entry->drop = sp_err_field("SQ (sequence data)");
240 }
241
242 entry->len = (size_t) (pp->ffbuf.current - pp->ffbuf.start) - entry->offset;
243
244 if(fun != NULL)
245 {
246 data = LoadEntry(pp, entry->offset, entry->len);
247 (*fun)(entry, data->offset, static_cast<Int4>(data->len));
248 FreeEntry(data);
249 }
250 } /* if, entry */
251 else
252 {
253 end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo,
254 spkwl[ParFlatSP_END].str,
255 spkwl[ParFlatSP_END].len);
256 }
257 end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo,
258 spkwl[ParFlatSP_ID].str,
259 spkwl[ParFlatSP_ID].len);
260
261 } /* while, end_of_file */
262
263 pp->indx = indx;
264
265 pp->entrylist = (IndexblkPtr*) MemNew(indx* sizeof(IndexblkPtr));
266 tibnp = ibnp->next;
267 MemFree(ibnp);
268 for(i = 0; i < indx && tibnp != NULL; i++, tibnp = ibnp)
269 {
270 pp->entrylist[i] = tibnp->ibp;
271 ibnp = tibnp->next;
272 MemFree(tibnp);
273 }
274 MemFree(finfo);
275
276 return end_of_file;
277 }
278
279 END_NCBI_SCOPE
280