1 /* @source ajseqread **********************************************************
2 **
3 ** AJAX sequence reading functions
4 **
5 ** These functions control all aspects of AJAX sequence reading
6 **
7 ** @author Copyright (C) 2001 Peter Rice
8 ** @version $Revision: 1.334 $
9 ** @modified 2001-2011 pmr
10 ** @modified $Date: 2013/07/15 20:57:32 $ by $Author: rice $
11 ** @@
12 **
13 ** This library is free software; you can redistribute it and/or
14 ** modify it under the terms of the GNU Lesser General Public
15 ** License as published by the Free Software Foundation; either
16 ** version 2.1 of the License, or (at your option) any later version.
17 **
18 ** This library is distributed in the hope that it will be useful,
19 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 ** Lesser General Public License for more details.
22 **
23 ** You should have received a copy of the GNU Lesser General Public
24 ** License along with this library; if not, write to the Free Software
25 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
26 ** MA 02110-1301, USA.
27 **
28 ******************************************************************************/
29
30 #include "ajlib.h"
31
32 #include "ajseqread.h"
33 #include "ajseq.h"
34 #include "ajseqabi.h"
35 #include "ajseqtype.h"
36 #include "ajfeat.h"
37 #include "ajfeatread.h"
38 #include "ajcall.h"
39 #include "ajmath.h"
40 #include "ajlist.h"
41 #include "ajtable.h"
42 #include "ajquery.h"
43 #include "ajutil.h"
44 #include "ajbase.h"
45 #include "ajnexus.h"
46 #include "ajdom.h"
47 #include "ajseqbam.h"
48 #include "ajreg.h"
49 #include "ajtext.h"
50 #include "ajtextread.h"
51 #include "ajfileio.h"
52 #include "ajnam.h"
53
54 #include <limits.h>
55 #include <math.h>
56 #include <errno.h>
57
58
59 #ifdef WIN32
60 #define fileno _fileno /* WIN32 builds only: every later use of fileno in this file is rewritten to _fileno */
61 #endif /* WIN32 */
62
63 #define SCF_MAGIC (((((((ajuint)'.'<<8)+(ajuint)'s')<<8) \
64 +(ajuint)'c')<<8)+(ajuint)'f') /* character codes of ".scf" packed into one ajuint: '.' shifted left 24 bits, 's' 16, 'c' 8, 'f' 0 */
65
66 AjPTable seqDbMethods = NULL; /* table handle with external linkage (the only non-static variable in this list), NULL until assigned; presumably maps database access-method names to handlers -- confirm where it is populated */
67
68 static AjPStr seqAppendTmpstr = NULL; /* from here down to seqChain: file-local string and tokeniser handles, all NULL at start-up; allocation and release are not visible in this part of the file */
69 static AjPStrTok seqHandle = NULL;
70 static AjPStrTok seqHandle2 = NULL;
71 static AjPStrTok seqHandleSplit = NULL;
72 static AjPStr seqToken = NULL;
73 static AjPStr seqToken2 = NULL;
74 static AjPStr seqTokenSplit = NULL;
75 static AjPStr seqName = NULL;
76 static AjPStr seqChain = NULL;
77
78 static char* seqAppendFilter = NULL; /* plain C character pointer, NULL at start-up; presumably a character filter used by the seqAppend* helpers -- confirm */
79
80 static ajint seqMaxGcglines = 5000; /* signed limit, default 5000; presumably the maximum number of lines scanned for GCG markers -- confirm at the call sites */
81
82 static AjPRegexp seqRegTreeconTop = NULL; /* from here down to seqRegPhylipSeq2: file-local regular-expression handles, all NULL at start-up; nothing in this declaration list compiles them */
83 static AjPRegexp seqRegMegaCommand = NULL;
84 static AjPRegexp seqRegMegaFeat = NULL;
85 static AjPRegexp seqRegMegaSeq = NULL;
86 static AjPRegexp seqRegJackTop = NULL;
87 static AjPRegexp seqRegJackSeq = NULL;
88 static AjPRegexp seqRegGffTyp = NULL;
89 static AjPRegexp seqRegGff3Typ = NULL;
90 static AjPRegexp seqRegRawNonseq = NULL;
91 static AjPRegexp seqRegNbrfId = NULL;
92 static AjPRegexp seqRegStadenId = NULL;
93 static AjPRegexp seqRegHennigBlank = NULL;
94 static AjPRegexp seqRegHennigSeq = NULL;
95 static AjPRegexp seqRegHennigTop = NULL;
96 static AjPRegexp seqRegHennigHead = NULL;
97 static AjPRegexp seqRegFitchHead = NULL;
98 static AjPRegexp seqRegStockholmSeq = NULL;
99 static AjPRegexp seqRegAbiDots = NULL;
100 static AjPRegexp seqRegMaseHead = NULL;
101 static AjPRegexp seqRegPhylipTop = NULL;
102 static AjPRegexp seqRegPhylipHead = NULL;
103 static AjPRegexp seqRegPhylipSeq = NULL;
104 static AjPRegexp seqRegPhylipSeq2 = NULL;
105
106 static AjPRegexp seqRegGcgDot = NULL; /* seqRegGcg* group, also NULL at start-up; seqGcgRegInit() is declared later in this file and presumably compiles these -- confirm */
107 static AjPRegexp seqRegGcgChk = NULL;
108 static AjPRegexp seqRegGcgLen = NULL;
109 static AjPRegexp seqRegGcgTyp = NULL;
110 static AjPRegexp seqRegGcgNam = NULL;
111 static AjPRegexp seqRegGcgMsf = NULL;
112 static AjPRegexp seqRegGcgMsflen = NULL;
113 static AjPRegexp seqRegGcgMsfnam = NULL;
114 static AjPRegexp seqRegGcgWgt = NULL;
115
116 static AjBool seqinFormatIsset = AJFALSE; /* false at start-up; presumably records that an input format has been set explicitly -- confirm */
117
118 static AjPStr seqFtFmtEmbl = NULL; /* seqFtFmt* strings, all NULL at start-up; presumably cached feature-format names, one per sequence format -- confirm */
119 static AjPStr seqFtFmtGenbank = NULL;
120 static AjPStr seqFtFmtRefseq = NULL;
121 static AjPStr seqFtFmtRefseqp = NULL;
122 static AjPStr seqFtFmtGff = NULL;
123 static AjPStr seqFtFmtPir = NULL;
124 static AjPStr seqFtFmtSwiss = NULL;
125 static AjPStr seqUsaTest = NULL; /* remaining strings in this group are also NULL at start-up; names suggest USA/query parsing and line buffering -- confirm at the points of use */
126 static AjPStr seqQryChr = NULL;
127 static AjPStr seqQryDb = NULL;
128 static AjPStr seqQryList = NULL;
129 static AjPStr seqReadLine = NULL;
130 static AjPStr seqSaveLine = NULL;
131 static AjPStr seqSaveLine2 = NULL;
132 static AjPStr seqAppendRestStr = NULL;
133 static AjPStr seqAppendTmpSeq = NULL;
134 static AjPStr seqQualStr = NULL;
135
136 static AjPRegexp seqRegUsaAsis = NULL; /* seqRegUsa* group: file-local regular-expression handles, all NULL at start-up */
137 static AjPRegexp seqRegUsaDb = NULL;
138 static AjPRegexp seqRegUsaFmt = NULL;
139 static AjPRegexp seqRegUsaId = NULL;
140 static AjPRegexp seqRegUsaList = NULL;
141 static AjPRegexp seqRegUsaRange = NULL;
142 static AjPRegexp seqRegUsaWild = NULL;
143 static AjBool seqRegUsaInitDone = AJFALSE; /* false at start-up; presumably set once the seqRegUsa* patterns have been compiled -- confirm */
144 static AjBool seqDoWarnAppend = AJFALSE; /* false at start-up; presumably switches on warnings from seqAppendWarn -- confirm */
145
146 static float seqQualPhred[] = /* 127 entries indexed by character code 0-126: codes 0-33 give 0.0 and a code c above 33 gives c-33 (code 126 gives 93.0); presumably the offset-33 FASTQ quality encoding -- confirm in the FASTQ readers */
147 {
148 0.0, /* 0 */
149 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 1-8 */
150 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 9-16 */
151 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 17-24 */
152 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 25-32 */
153 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, /* 33-42 */
154 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, /* 43-52 */
155 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, /* 53-62 */
156 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, /* 63-72 */
157 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, /* 73-82 */
158 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, /* 83-92 */
159 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, /* 93-102 */
160 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, /* 103-112 */
161 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, /* 113-122 */
162 90.0, 91.0, 92.0, 93.0 /* 123-126 */
163 };
164
165 static double seqQualSolexa[] = /* 127 entries indexed by character code 0-126 (double, whereas the other two quality tables are float): codes 0-58 give 0.0; from code 59 the values agree with 10*log10(10^((c-64)/10) + 1), e.g. code 64 gives 3.010300; presumably Solexa-to-Phred conversion at offset 64 -- confirm in the Solexa FASTQ reader */
166 {
167 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 0-7 */
168 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 8-15 */
169 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 16-23 */
170 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 24-31 */
171 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 32-39 */
172 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 40-47 */
173 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 48-55 */
174 0.0, 0.0, 0.0, 1.193310, /* 56-59 */
175 1.455405, 1.764349, 2.124426, 2.539019, 3.010300, /* 60-64 */
176 3.539019, 4.124426, 4.764349, 5.455405, 6.193310, /* 65-69 */
177 6.973228, 7.790097, 8.638920, 9.514969, 10.413927, /* 70-74 */
178 11.331956, 12.265724, 13.212384, 14.169543, 15.135209, /* 75-79 */
179 16.107742, 17.085800, 18.068291, 19.054333, 20.043214, /* 80-84 */
180 21.034361, 22.027316, 23.021712, 24.017255, 25.013712, /* 85-89 */
181 26.010895, 27.008657, 28.006878, 29.005464, 30.004341, /* 90-94 */
182 31.003448, 32.002739, 33.002176, 34.001729, 35.001373, /* 95-99 */
183 36.001091, 37.000866, 38.000688, 39.000547, 40.000434, /* 100-104 */
184 41.000345, 42.000274, 43.000218, 44.000173, 45.000137, /* 105-109 */
185 46.000109, 47.000087, 48.000069, 49.000055, 50.000043, /* 110-114 */
186 51.000034, 52.000027, 53.000022, 54.000017, 55.000014, /* 115-119 */
187 56.000011, 57.000009, 58.000007, 59.000005, 60.000004, /* 120-124 */
188 61.000003, 62.000003 /* 125-126 */
189 };
190
191
192
193
194 static float seqQualIllumina[] = /* 127 entries indexed by character code 0-126: codes 0-64 give 0.0 and a code c above 64 gives c-64 (code 126 gives 62.0); presumably the offset-64 Illumina FASTQ quality encoding -- confirm in the Illumina FASTQ reader */
195 {
196 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 0-7 */
197 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 8-15 */
198 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 16-23 */
199 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 24-31 */
200 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 32-39 */
201 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 40-47 */
202 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 48-55 */
203 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, /* 56-63 */
204 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, /* 64-73 */
205 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, /* 74-83 */
206 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, /* 84-93 */
207 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, /* 94-103 */
208 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, /* 104-113 */
209 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, /* 114-123 */
210 60.0, 61.0, 62.0 /* 124-126 */
211 };
212
213
214
215
216
217 /* @datastatic SeqPInFormat ***************************************************
218 **
219 ** Sequence input formats data structure
220 **
221 ** @alias SeqSInFormat
222 ** @alias SeqOInFormat
223 **
224 ** @attr Name [const char*] Format name
225 ** @attr Obo [const char*] Ontology term id from EDAM
226 ** @attr Desc [const char*] Format description
227 ** @attr Alias [AjBool] Name is an alias for an identical definition
228 ** @attr Try [AjBool] If true, try for an unknown input. Duplicate names
229 ** and read-anything formats are set false
230 ** @attr Nucleotide [AjBool] True if suitable for nucleotide
231 ** @attr Protein [AjBool] True if suitable for protein
232 ** @attr Feature [AjBool] True if includes parsable feature data
233 ** @attr Gap [AjBool] True if allows gap characters
234 ** @attr Read [AjBool function] Input function, returns ajTrue on success
235 ** @attr Multiset [AjBool] If true, supports multiple sequence sets
236 ** If false, multiple sets must be in separate files
237 ** @attr Binary [AjBool] Binary file format
238 ** @@
239 ******************************************************************************/
240
241 typedef struct SeqSInFormat /* one record per sequence input format: three text fields, capability flags and the reader callback */
242 {
243 const char *Name;
244 const char *Obo;
245 const char *Desc;
246 AjBool Alias;
247 AjBool Try;
248 AjBool Nucleotide;
249 AjBool Protein;
250 AjBool Feature;
251 AjBool Gap;
252 AjBool (*Read) (AjPSeq thys, AjPSeqin seqin); /* reader callback; most seqRead* prototypes later in this file have exactly this signature */
253 AjBool Multiset;
254 AjBool Binary;
255 } SeqOInFormat;
256
257 #define SeqPInFormat SeqOInFormat* /* textual macro, not a typedef: 'const SeqPInFormat p' declares a pointer to a const object, and 'SeqPInFormat a, b;' would make only a a pointer */
258
259
260
261
262 /* @datastatic SeqPMsfData ****************************************************
263 **
264 ** Sequence alignment data, stored until written when output file is closed
265 **
266 ** @alias SeqSMsfData
267 ** @alias SeqOMsfData
268 **
269 ** @attr Table [AjPTable] Ajax table of AjPMsfItem objects
270 ** @attr Names [AjPStr*] Sequence names
271 ** @attr Count [ajuint] Undocumented
272 ** @attr Nseq [ajuint] Number of sequences
273 ** @attr Nexus [AjPNexus] Nexus alignment data
274 ** @attr Gene [AjPStr] Gene name
275 ** @attr Domain [AjPStr] Domain name
276 ** @attr NextGene [AjPStr] Next block gene name
277 ** @attr NextDomain [AjPStr] Next block domain name
278 ** @attr Bufflines [ajuint] Number of buffered lines read
279 ** @attr CommentDepth [ajint] Comment depth
280 ** @attr Resume [AjBool] Resume processing
281 ** @attr Identchar [char] Identity character
282 ** @attr Indelchar [char] Gap character
283 ** @attr Misschar [char] Missing data character
284 ** @attr Seqtype [char] Sequence type N:nucleotide P:protein
285 ** @@
286 ******************************************************************************/
287
288 typedef struct SeqSMsfData /* state kept for a multiple-sequence alignment: item table, name array, counters and per-format extras */
289 {
290 AjPTable Table;
291 AjPStr* Names; /* array of string handles; presumably Nseq entries -- confirm */
292 ajuint Count; /* not described in the header comment; presumably the number of sequences already returned to the caller -- confirm */
293 ajuint Nseq;
294 AjPNexus Nexus;
295 AjPStr Gene;
296 AjPStr Domain;
297 AjPStr NextGene;
298 AjPStr NextDomain;
299 ajuint Bufflines;
300 ajint CommentDepth; /* signed, unlike the other counters in this struct */
301 AjBool Resume;
302 char Identchar;
303 char Indelchar;
304 char Misschar; /* presumably the missing-data symbol, by analogy with Identchar and Indelchar -- confirm */
305 char Seqtype;
306 } SeqOMsfData;
307
308 #define SeqPMsfData SeqOMsfData* /* textual macro, not a typedef: 'const SeqPMsfData p' (as used by seqMsfDataTrace) is a pointer to a const object */
309
310
311
312
313 /* @datastatic SeqPMsfItem ****************************************************
314 **
315 ** MSF alignment output individual sequence data
316 **
317 ** @alias SeqSMsfItem
318 ** @alias SeqOMsfItem
319 **
320 ** @attr Name [AjPStr] Sequence name
321 ** @attr Desc [AjPStr] Sequence description
322 ** @attr Len [ajuint] Sequence length
323 ** @attr Check [ajuint] Sequence GCG checksum
324 ** @attr Seq [AjPStr] Sequence
325 ** @attr Weight [float] Weight (default 1.0)
326 ** @attr Padding [char[4]] Padding to alignment boundary
327 ** @@
328 *****************************************************************************/
329
330 typedef struct SeqSMsfItem /* one sequence within an alignment: name, description, length, checksum, residues and weight */
331 {
332 AjPStr Name;
333 AjPStr Desc;
334 ajuint Len;
335 ajuint Check;
336 AjPStr Seq;
337 float Weight;
338 char Padding[4]; /* filler bytes after Weight; carries no data */
339 } SeqOMsfItem;
340
341 #define SeqPMsfItem SeqOMsfItem* /* textual macro, not a typedef: 'SeqPMsfItem* x' (as in seqGcgMsfHeader) expands to SeqOMsfItem* * x, a pointer to a pointer */
342
343
344
345
346 /* @datastatic SeqPStockholm **************************************************
347 **
348 ** Ajax Stockholm object.
349 **
350 ** @new stockholmNew Default constructor
351 ** @delete stockholmDel Default destructor
352 **
353 ** @attr id [AjPStr] identifier
354 ** @attr ac [AjPStr] accession
355 ** @attr de [AjPStr] description
356 ** @attr au [AjPStr] author
357 ** @attr al [AjPStr] Undocumented
358 ** @attr tp [AjPStr] Undocumented
359 ** @attr se [AjPStr] Undocumented
360 ** @attr ga [ajuint[2]] Undocumented
361 ** @attr tc [float[2]] Undocumented
362 ** @attr nc [float[2]] Undocumented
363 ** @attr bm [AjPStr] Undocumented
364 ** @attr ref [AjPStr] Undocumented
365 ** @attr dc [AjPStr] Undocumented
366 ** @attr dr [AjPStr] Undocumented
367 ** @attr cc [AjPStr] Undocumented
368 ** @attr sacons [AjPStr] Undocumented
369 ** @attr sqcons [AjPStr] Undocumented
370 ** @attr sscons [AjPStr] Undocumented
371 ** @attr gs [AjPStr] Undocumented
372 ** @attr name [AjPStr*] Undocumented
373 ** @attr str [AjPStr*] Undocumented
374 ** @attr n [ajuint] Undocumented
375 ** @attr Count [ajuint] Count
376 ** @@
377 ******************************************************************************/
378
379 typedef struct SeqSStockholm /* whole-alignment Stockholm record; the short field names are presumably the format's two-letter annotation tags -- confirm against seqReadStockholm */
380 {
381 AjPStr id;
382 AjPStr ac;
383 AjPStr de;
384 AjPStr au;
385 AjPStr al;
386 AjPStr tp;
387 AjPStr se;
388 ajuint ga[2]; /* NOTE(review): unsigned here, but ga is float[2] in SeqSStockholmdata, SeqSSelex and SeqSSelexdata -- confirm which type is intended */
389 float tc[2];
390 float nc[2];
391 AjPStr bm;
392 AjPStr ref;
393 AjPStr dc;
394 AjPStr dr;
395 AjPStr cc;
396 AjPStr sacons;
397 AjPStr sqcons;
398 AjPStr sscons;
399 AjPStr gs;
400 AjPStr *name; /* array of string handles; presumably one name per sequence, n entries -- confirm */
401 AjPStr *str; /* array of string handles; presumably the matching sequence strings -- confirm */
402 ajuint n;
403 ajuint Count;
404 } SeqOStockholm;
405
406 #define SeqPStockholm SeqOStockholm* /* textual macro, not a typedef: 'SeqPStockholm a, b;' would make only a a pointer */
407
408
409
410
411 /* @datastatic SeqPStockholmdata **********************************************
412 **
413 ** Ajax Stockholm data object (individual sequences)
414 **
415 ** @new stockholmdataNew Default constructor
416 ** @delete stockholmdataDel Default destructor
417 **
418 ** @attr id [AjPStr] identifier
419 ** @attr ac [AjPStr] accession
420 ** @attr de [AjPStr] description
421 ** @attr au [AjPStr] author
422 ** @attr al [AjPStr] Undocumented
423 ** @attr tp [AjPStr] Undocumented
424 ** @attr se [AjPStr] Undocumented
425 ** @attr bm [AjPStr] Undocumented
426 ** @attr sacons [AjPStr] Undocumented
427 ** @attr sqcons [AjPStr] Undocumented
428 ** @attr sscons [AjPStr] Undocumented
429 ** @attr ref [AjPStr] Undocumented
430 ** @attr dc [AjPStr] Undocumented
431 ** @attr dr [AjPStr] Undocumented
432 ** @attr cc [AjPStr] Undocumented
433 ** @attr gs [AjPStr] Undocumented
434 ** @attr ga [float[2]] Undocumented
435 ** @attr tc [float[2]] Undocumented
436 ** @attr nc [float[2]] Undocumented
437 ** @@
438 ******************************************************************************/
439
440 typedef struct SeqSStockholmdata /* per-sequence counterpart of SeqSStockholm: same annotation strings, but no name/str arrays and no counters */
441 {
442 AjPStr id;
443 AjPStr ac;
444 AjPStr de;
445 AjPStr au;
446 AjPStr al;
447 AjPStr tp;
448 AjPStr se;
449 AjPStr bm;
450 AjPStr sacons;
451 AjPStr sqcons;
452 AjPStr sscons;
453 AjPStr ref;
454 AjPStr dc;
455 AjPStr dr;
456 AjPStr cc;
457 AjPStr gs;
458 float ga[2]; /* float here; the same field is ajuint[2] in SeqSStockholm */
459 float tc[2];
460 float nc[2];
461 } SeqOStockholmdata;
462
463 #define SeqPStockholmdata SeqOStockholmdata* /* textual macro, not a typedef: 'SeqPStockholmdata a, b;' would make only a a pointer */
464
465
466
467
468 /* @datastatic SeqPSelexseq ***************************************************
469 **
470 ** Ajax Selex object for #=SQ information.
471 **
472 ** @new selexSQNew Default constructor
473 ** @delete selexSQDel Default destructor
474 **
475 ** @attr name [AjPStr] Object name
476 ** @attr source [AjPStr] Source file
477 ** @attr ac [AjPStr] accession
478 ** @attr de [AjPStr] description
479 ** @attr wt [float] weight (default 1.0)
480 ** @attr start [ajuint] start position
481 ** @attr stop [ajuint] end position
482 ** @attr len [ajuint] length
483 ** @@
484 ******************************************************************************/
485
486 typedef struct SeqSSelexseq /* per-sequence Selex information: four strings, a weight and three unsigned positions */
487 {
488 AjPStr name;
489 AjPStr source;
490 AjPStr ac;
491 AjPStr de;
492 float wt;
493 ajuint start;
494 ajuint stop;
495 ajuint len;
496 } SeqOSelexseq;
497
498 #define SeqPSelexseq SeqOSelexseq* /* textual macro, not a typedef: 'SeqPSelexseq *sq' (as in SeqSSelex) expands to SeqOSelexseq* *sq */
499
500
501
502
503 /* @datastatic SeqPSelex ******************************************************
504 **
505 ** Ajax Selex object.
506 **
507 ** @new selexNew Default constructor
508 ** @delete selexDel Default destructor
509 **
510 ** @attr id [AjPStr] identifier
511 ** @attr ac [AjPStr] accession
512 ** @attr de [AjPStr] description
513 ** @attr au [AjPStr] author
514 ** @attr cs [AjPStr] Undocumented
515 ** @attr rf [AjPStr] Undocumented
516 ** @attr name [AjPStr*] Undocumented
517 ** @attr str [AjPStr*] Undocumented
518 ** @attr ss [AjPStr*] Undocumented
519 ** @attr ga [float[2]] Undocumented
520 ** @attr tc [float[2]] Undocumented
521 ** @attr nc [float[2]] Undocumented
522 ** @attr sq [SeqPSelexseq*] Selex sequence objects
523 ** @attr n [ajuint] Number of SeqPSelexseq sequence objects
524 ** @attr Count [ajuint] Count
525 ** @@
526 ******************************************************************************/
527
528 typedef struct SeqSSelex /* whole-alignment Selex record: header strings, per-sequence arrays, three float pairs and counters */
529 {
530 AjPStr id;
531 AjPStr ac;
532 AjPStr de;
533 AjPStr au;
534 AjPStr cs; /* presumably the Selex #=CS line -- confirm */
535 AjPStr rf; /* presumably the Selex #=RF line -- confirm */
536 AjPStr *name; /* array of string handles; presumably n entries, like sq -- confirm */
537 AjPStr *str; /* array of string handles; presumably the sequence strings matching name -- confirm */
538 AjPStr *ss; /* array of string handles; presumably per-sequence #=SS lines -- confirm */
539 float ga[2];
540 float tc[2];
541 float nc[2];
542 SeqPSelexseq *sq; /* macro expands to SeqOSelexseq* *sq: an array of pointers to per-sequence records */
543 ajuint n;
544 ajuint Count;
545 } SeqOSelex;
546
547 #define SeqPSelex SeqOSelex* /* textual macro, not a typedef: 'SeqPSelex *thys' (as in seqSelexHeader) expands to SeqOSelex* *thys */
548
549
550
551
552 /* @datastatic SeqPSelexdata **************************************************
553 **
554 ** Ajax Selex data object (individual sequences)
555 **
556 ** @new selexdataNew Default constructor
557 ** @delete selexdataDel Default destructor
558 **
559 ** @attr id [AjPStr] identifier
560 ** @attr ac [AjPStr] accession
561 ** @attr de [AjPStr] description
562 ** @attr au [AjPStr] author
563 ** @attr cs [AjPStr] Undocumented
564 ** @attr rf [AjPStr] Undocumented
565 ** @attr name [AjPStr] Undocumented
566 ** @attr str [AjPStr] Undocumented
567 ** @attr ss [AjPStr] Undocumented
568 ** @attr ga [float[2]] Undocumented
569 ** @attr tc [float[2]] Undocumented
570 ** @attr nc [float[2]] Undocumented
571 ** @attr sq [SeqPSelexseq] Selex sequence object
572 ** @@
573 ******************************************************************************/
574
575 typedef struct SeqSSelexdata /* per-sequence counterpart of SeqSSelex: name, str, ss and sq are single values here, not arrays, and there are no counters */
576 {
577 AjPStr id;
578 AjPStr ac;
579 AjPStr de;
580 AjPStr au;
581 AjPStr cs;
582 AjPStr rf;
583 AjPStr name;
584 AjPStr str;
585 AjPStr ss;
586 float ga[2];
587 float tc[2];
588 float nc[2];
589 SeqPSelexseq sq; /* macro expands to SeqOSelexseq* sq: one pointer to a per-sequence record */
590 } SeqOSelexdata;
591
592 #define SeqPSelexdata SeqOSelexdata* /* textual macro, not a typedef: 'SeqPSelexdata a, b;' would make only a a pointer */
593
594
595
596
597
598
599 typedef struct SeqSScfHeader /* SCF trace-file header: 13 ajuint fields, a 4-character version and 18 spare words (128 bytes if ajuint is 32 bits wide and no padding is inserted) */
600 {
601 ajuint magic_number; /* presumably compared with SCF_MAGIC by the SCF reader -- confirm */
602 ajuint samples; /* Number of elements in Samples matrix */
603 ajuint samples_offset; /* Byte offset from start of file */
604 ajuint bases; /* Number of bases in Bases matrix */
605 ajuint bases_left_clip; /* OBSOLETE: No. bases in left clip (vector) */
606 ajuint bases_right_clip; /* OBSOLETE: No. bases in right clip (qual) */
607 ajuint bases_offset; /* Byte offset from start of file */
608 ajuint comments_size; /* Number of bytes in Comment section */
609 ajuint comments_offset; /* Byte offset from start of file */
610 char version[4]; /* "version.revision", eg '3' '.' '0' '0' */
611 ajuint sample_size; /* Size of samples in bytes 1=8bits, 2=16bits*/
612 ajuint code_set; /* code set used (but ignored!)*/
613 ajuint private_size; /* No. of bytes of Private data, 0 if none */
614 ajuint private_offset; /* Byte offset from start of file */
615 ajuint spare[18]; /* Unused */
616 } SeqOScfHeader;
617
618 #define SeqPScfHeader SeqOScfHeader* /* textual macro, not a typedef: 'SeqPScfHeader a, b;' would make only a a pointer */
619
620
621
622
623 typedef struct SeqSScfUncertainty /* pairs a numeric code with a constant display name; element type of SeqScfUncertainCodes */
624 {
625 ajuint code;
626 const char* name;
627 } SeqOScfUncertainty;
628
629 #define SeqPScfUncertainty SeqOScfUncertainty* /* textual macro, not a typedef: 'SeqPScfUncertainty a, b;' would make only a a pointer */
630
631
632
633
634 static SeqOScfUncertainty SeqScfUncertainCodes[] = { /* ten entries, codes 0-9, each code equal to its array index; presumably names for the SCF header code_set value -- confirm at the point of use */
635 {0, "{A,C,G,T,-}"},
636 {1, "Staden"},
637 {2, "IUPAC (NC-IUB)"},
638 {3, "Pharmacia A.L.F. (NC-IUB)"},
639 {4, "{A,C,G,T,N} (ABI 373A)"},
640 {5, "IBI/Pustell"},
641 {6, "DNA*"},
642 {7, "DNASIS"},
643 {8, "IG/PC-Gene"},
644 {9, "MicroGenie"},
645 };
646
647
648
649
650 /*
651 * One called base of the sequence data: sample index, four per-base probabilities, the base character and three spare bytes
652 */
653 typedef struct SeqSScfBase {
654 ajuint peak_index; /* Index into Samples matrix for base posn */
655 unsigned char prob_A; /* Probability of it being an A */
656 unsigned char prob_C; /* Probability of it being a C */
657 unsigned char prob_G; /* Probability of it being a G */
658 unsigned char prob_T; /* Probability of it being a T */
659 char base; /* Called base character */
660 char spare[3]; /* Spare */
661 } SeqOScfBase;
662
663 #define SeqPScfBase SeqOScfBase* /* textual macro, not a typedef: 'SeqPScfBase a, b;' would make only a a pointer */
664
665
666
667
668 typedef struct SeqSScfData /* wrapper whose only member is the SCF header, held by value */
669 {
670 SeqOScfHeader header;
671 } SeqOScfData;
672
673
674
675
676
677 #define SeqPScfData SeqOScfData* /* textual macro, not a typedef: 'SeqPScfData a, b;' would make only a a pointer */
678
679
680
681
682 /* @datastatic SeqPListUsa ****************************************************
683 **
684 ** Usa processing list of USAs from a list file.
685 **
686 ** Includes data from the original USA (@listfile)
687 **
688 ** @alias SeqSListUsa
689 ** @alias SeqOListUsa
690 **
691 ** @attr Begin [ajint] Begin if defined in original USA
692 ** @attr End [ajint] End if defined in original USA
693 ** @attr Rev [AjBool] Reverse if defined in original USA
694 ** @attr Format [ajuint] Format number from original USA
695 ** @attr Formatstr [AjPStr] Format name from original USA
696 ** @attr Usa [AjPStr] Current USA
697 ** @attr Fpos [ajulong] Start position offset
698 ** @attr Features [AjBool] if true, process features
699 ** @attr Padding [char[4]] Padding to alignment boundary
700 ** @@
701 ******************************************************************************/
702
703 typedef struct SeqSListUsa /* one queued USA from a list file, together with settings copied from the original USA */
704 {
705 ajint Begin; /* signed, as is End */
706 ajint End;
707 AjBool Rev;
708 ajuint Format;
709 AjPStr Formatstr;
710 AjPStr Usa;
711 ajulong Fpos;
712 AjBool Features;
713 char Padding[4]; /* filler bytes after Features; carries no data */
714 } SeqOListUsa;
715
716 #define SeqPListUsa SeqOListUsa* /* textual macro, not a typedef: 'SeqPListUsa a, b;' would make only a a pointer */
717
718
719
720
721 /* @enumstatic SeqEPrefixGenbank **********************************************
722 **
723 ** Genbank/Refseq/GenPept/Codata record type
724 **
725 ** @value GB_UNK Unknown prefix
726 ** @value GB_AC Accession
727 ** @value GB_BASE Base count
728 ** @value GB_CC Comment
729 ** @value GB_DEF Definition
730 ** @value GB_FEAT Feature
731 ** @value GB_ID Locus
732 ** @value GB_KEY Keywords
733 ** @value GB_ORI Origin
734 ** @value GB_REF Reference
735 ** @value GB_SRC Source organism
736 ** @value GB_SQ Sequence
737 ** @value GB_VER Version
738 ** @value GB_WP GCG header
739 ** @value GB_END Final // record
740 ** @value GB_MORE Blank continuation
741 ** @value GB_MAX Beyond last value
742 ******************************************************************************/
743
744 typedef enum /* record-type codes returned by seqPrefixGenbank(); no explicit values, so they run consecutively from GB_UNK = 0 */
745 {
746 GB_UNK,
747 GB_AC,
748 GB_BASE,
749 GB_CC,
750 GB_DEF,
751 GB_FEAT,
752 GB_ID,
753 GB_KEY,
754 GB_ORI,
755 GB_REF,
756 GB_SRC,
757 GB_SQ,
758 GB_VER,
759 GB_WP,
760 GB_END,
761 GB_MORE,
762 GB_MAX /* = 16, the number of codes before it; keep it last */
763 } SeqEPrefixGenbank;
764
765
766
767
768 /* @enumstatic SeqEPrefixGenbankMore ******************************************
769 **
770 ** Genbank/Refseq/GenPept/Codata subrecord type
771 **
772 ** @value GB_MORE_UNK Unknown prefix
773 ** @value GB_MORE_STD Standard (non-prefix) line
774 ** @value GB_MORE_AUT AUTHORS
775 ** @value GB_MORE_JNL JOURNAL
776 ** @value GB_MORE_ORG ORGANISM
777 ** @value GB_MORE_TIT TITLE
778 ** @value GB_MORE_MORE Blank prefix of at least 10 characters
779 ** @value GB_MORE_MAX Beyond last value
780 ******************************************************************************/
781
782 typedef enum /* subrecord codes returned by seqPrefixGenbankMore(); no explicit values, so they run consecutively from GB_MORE_UNK = 0 */
783 {
784 GB_MORE_UNK,
785 GB_MORE_STD,
786 GB_MORE_AUT,
787 GB_MORE_JNL,
788 GB_MORE_ORG,
789 GB_MORE_TIT,
790 GB_MORE_MORE,
791 GB_MORE_MAX /* = 7, the number of codes before it; keep it last */
792 } SeqEPrefixGenbankMore;
793
794
795
796
797 /* @enumstatic SeqEPrefixSwiss ************************************************
798 **
799 ** SwissProt/EMBL 2-character line prefix
800 **
801 ** @value SWISS_UNK Unknown prefix
802 ** @value SWISS_AC Accession
803 ** @value SWISS_AS EMBL AS line
804 ** @value SWISS_AV Staden experiment AV line
805 ** @value SWISS_CC Comment
806 ** @value SWISS_CO EMBL contig entry
807 ** @value SWISS_DE Description
808 ** @value SWISS_DR Database reference
809 ** @value SWISS_DT Date
810 ** @value SWISS_EX Staden experiment data
811 ** @value SWISS_FH EMBL feature header
812 ** @value SWISS_FT Feature
813 ** @value SWISS_GN Gene name
814 ** @value SWISS_ID Identifier line
815 ** @value SWISS_IV EMBL IV record
816 ** @value SWISS_KW Keyword
817 ** @value SWISS_OC Organism classification
818 ** @value SWISS_OG Organelle
819 ** @value SWISS_OH Organism host
820 ** @value SWISS_OS Species
821 ** @value SWISS_OX NCBI TaxID
822 ** @value SWISS_PE Swissprot evidence
823 ** @value SWISS_RA Reference authors
824 ** @value SWISS_RC Reference comment
825 ** @value SWISS_RG Reference RG
826 ** @value SWISS_RL Reference location
827 ** @value SWISS_RN Reference number
828 ** @value SWISS_RP Reference RP
829 ** @value SWISS_RT Reference RT
830 ** @value SWISS_RX Reference RX
831 ** @value SWISS_SQ Sequence
832 ** @value SWISS_SV SeqVersion
833 ** @value SWISS_WP GCG header
834 ** @value SWISS_XX Spacer
835 ** @value SWISS_END Final // record
836 ** @value SWISS_MORE Blank continuation (sequence)
837 ** @value SWISS_MAX Beyond last value
838 ******************************************************************************/
839
840 typedef enum /* line-prefix codes returned by seqPrefixSwiss(); no explicit values, so they run consecutively from SWISS_UNK = 0 */
841 {
842 SWISS_UNK,
843 SWISS_AC, SWISS_AS, SWISS_AV, /* two-letter codes are listed alphabetically, one row per initial letter */
844 SWISS_CC, SWISS_CO,
845 SWISS_DE, SWISS_DR, SWISS_DT,
846 SWISS_EX,
847 SWISS_FH, SWISS_FT,
848 SWISS_GN,
849 SWISS_ID, SWISS_IV,
850 SWISS_KW,
851 SWISS_OC, SWISS_OG, SWISS_OH, SWISS_OS, SWISS_OX,
852 SWISS_PE,
853 SWISS_RA, SWISS_RC, SWISS_RG, SWISS_RL,
854 SWISS_RN, SWISS_RP, SWISS_RT, SWISS_RX,
855 SWISS_SQ, SWISS_SV,
856 SWISS_WP,
857 SWISS_XX,
858 SWISS_END,
859 SWISS_MORE,
860 SWISS_MAX /* = 36, the number of codes before it; keep it last */
861 } SeqEPrefixSwiss;
862
863
864
865
866 /* @enumstatic SeqEDesSwiss ***************************************************
867 **
868 ** SwissProt description codes
869 **
870 ** @value SWISS_DES_UNK Unknown code
871 ** @value SWISS_DES_ALT AltName:
872 ** @value SWISS_DES_CONT Contains:
873 ** @value SWISS_DES_FLG Flags:
874 ** @value SWISS_DES_INC Includes:
875 ** @value SWISS_DES_REC RecName:
876 ** @value SWISS_DES_SUB SubName:
877 ** @value SWISS_DES_MAX Beyond last value
878 ******************************************************************************/
879
880 typedef enum /* description codes returned by seqDesSwiss(); no explicit values, so they run consecutively from SWISS_DES_UNK = 0 */
881 {
882 SWISS_DES_UNK,
883 SWISS_DES_ALT, SWISS_DES_CONT, SWISS_DES_FLG,
884 SWISS_DES_INC, SWISS_DES_REC, SWISS_DES_SUB,
885 SWISS_DES_MAX /* = 7, the number of codes before it; keep it last */
886 } SeqEDesSwiss;
887
888
889
890
891 /* @enumstatic SeqESubSwiss ***************************************************
892 **
893 ** SwissProt description subcodes
894 **
895 ** @value SWISS_SUB_UNK Unknown code
896 ** @value SWISS_SUB_ALLER Allergen=
897 ** @value SWISS_SUB_BIOTECH Biotech=
898 ** @value SWISS_SUB_CDA CD_antigen=
899 ** @value SWISS_SUB_EC EC=
900 ** @value SWISS_SUB_FULL Full=
901 ** @value SWISS_SUB_INN INN=
902 ** @value SWISS_SUB_SHORT Short=
903 ** @value SWISS_SUB_MAX Beyond last value
904 ******************************************************************************/
905
906 typedef enum /* description subcodes returned by seqDessubSwiss(); no explicit values, so they run consecutively from SWISS_SUB_UNK = 0 */
907 {
908 SWISS_SUB_UNK,
909 SWISS_SUB_ALLER,
910 SWISS_SUB_BIOTECH,
911 SWISS_SUB_CDA,
912 SWISS_SUB_EC,
913 SWISS_SUB_FULL,
914 SWISS_SUB_INN,
915 SWISS_SUB_SHORT,
916 SWISS_SUB_MAX /* = 8, the number of codes before it; keep it last */
917 } SeqESubSwiss;
918
919
920
921
922 static SeqEDesSwiss seqDesSwiss(const AjPStr str); /* forward declarations of the file-local classifiers; each returns one of the enum types defined earlier in this file */
923 static SeqESubSwiss seqDessubSwiss(AjPStr *Pstr); /* the only classifier taking a non-const string pointer; presumably it edits the caller's string -- confirm */
924 static SeqEPrefixGenbank seqPrefixGenbank(const AjPStr str);
925 static SeqEPrefixGenbankMore seqPrefixGenbankMore(const AjPStr str);
926 static SeqEPrefixSwiss seqPrefixSwiss(const AjPStr str);
927
928 static AjBool seqReadAbi(AjPSeq thys, AjPSeqin seqin); /* forward declarations follow; every function in this list is static, i.e. local to this file */
929
930 static void seqAccSave(AjPSeq thys, const AjPStr acc);
931 static ajuint seqAppend(AjPStr* seq, const AjPStr line); /* seqAppend* helpers take the destination string by pointer; the meaning of the ajuint result is not visible here */
932 static ajuint seqAppendK(AjPStr* seq, char ch);
933 static const AjPStr seqAppendWarn(AjPStr* seq, const AjPStr line, /* returns a const string handle, unlike the other seqAppend* helpers, which return ajuint */
934 ajuint informat);
935 static ajuint seqAppendCommented(AjPStr* seq, AjBool* incomment,
936 const AjPStr line);
937 static AjBool seqClustalReadseq(const AjPStr rdLine,
938 const AjPTable msftable);
939 static AjBool seqDefine(AjPSeq thys, AjPSeqin seqin);
940 static AjBool seqinFormatFind(const AjPStr format, ajint *iformat);
941 static AjBool seqinFormatSet(AjPSeqin seqin, AjPSeq thys);
942 static AjBool seqGcgDots(AjPSeq thys, const AjPSeqin seqin,
943 AjPStr* pline, ajuint maxlines, ajuint *len);
944 static void seqGcgRegInit(void);
945 static AjBool seqGcgMsfDots(AjPSeq thys, const AjPSeqin seqin,
946 AjPStr* pline,
947 ajuint maxlines, ajuint *len);
948 static AjBool seqGcgMsfHeader(const AjPStr line, SeqPMsfItem* msfitem); /* SeqPMsfItem is a macro, so msfitem has type SeqOMsfItem** */
949 static AjBool seqGcgMsfReadseq(const AjPStr rdline,
950 const AjPTable msftable);
951 static AjBool seqHennig86Readseq(const AjPStr rdline,
952 const AjPTable msftable);
953 static AjBool seqinUfoLocal(const AjPSeqin thys);
954 static void seqListNoComment(AjPStr* text);
955 static AjBool seqinListProcess(AjPSeqin seqin, AjPSeq thys,
956 const AjPStr usa);
957 static void seqMsfDataDel(SeqPMsfData* pthys); /* SeqPMsfData is a macro, so pthys has type SeqOMsfData** */
958 static void seqMsfDataTrace(const SeqPMsfData thys); /* macro expansion gives const SeqOMsfData* thys: a pointer to a const object */
959 static void seqMsfItemDel(SeqPMsfItem* pthys);
960 static void seqMsfTabDel(void **key, void **value, void *cl); /* generic key/value/closure signature; presumably a table-delete callback -- confirm where it is passed */
961 static void seqMsfTabList(const void *key, void **value, void *cl); /* same shape with a const key; presumably a table-map callback -- confirm where it is passed */
962 static AjBool seqPhylipReadseq(const AjPStr rdline,
963 const AjPTable phytable,
964 const AjPStr token,
965 ajuint len, ajuint* ilen, AjBool* done);
966 static AjBool seqQueryMatch(const AjPQuery query, const AjPSeq thys);
967 static AjBool seqRead(AjPSeq thys, AjPSeqin seqin);
968 static AjBool seqReadAce(AjPSeq thys, AjPSeqin seqin); /* from here to seqReadTreecon: presumably one reader per input format; all share the signature of SeqOInFormat.Read except seqReadFmt */
969 static AjBool seqReadAcedb(AjPSeq thys, AjPSeqin seqin);
970 static AjBool seqReadBam(AjPSeq thys, AjPSeqin seqin);
971 static AjBool seqReadBiomart(AjPSeq thys, AjPSeqin seqin);
972 static AjBool seqReadClustal(AjPSeq thys, AjPSeqin seqin);
973 static AjBool seqReadCodata(AjPSeq thys, AjPSeqin seqin);
974 static AjBool seqReadDAS(AjPSeq thys, AjPSeqin seqin);
975 static AjBool seqReadDbId(AjPSeq thys, AjPSeqin seqin);
976 static AjBool seqReadEmbl(AjPSeq thys, AjPSeqin seqin);
977 static AjBool seqReadEnsembl(AjPSeq thys, AjPSeqin seqin);
978 static AjBool seqReadExperiment(AjPSeq thys, AjPSeqin seqin);
979 static AjBool seqReadFasta(AjPSeq thys, AjPSeqin seqin);
980 static AjBool seqReadFastq(AjPSeq thys, AjPSeqin seqin);
981 static AjBool seqReadFastqIllumina(AjPSeq thys, AjPSeqin seqin);
982 /*static AjBool seqReadFastqInt(AjPSeq thys, AjPSeqin seqin);*/
983 static AjBool seqReadFastqSanger(AjPSeq thys, AjPSeqin seqin);
984 static AjBool seqReadFastqSolexa(AjPSeq thys, AjPSeqin seqin);
985 static AjBool seqReadFitch(AjPSeq thys, AjPSeqin seqin);
986 static ajuint seqReadFmt(AjPSeq thys, AjPSeqin seqin, /* differs from its neighbours: returns ajuint and takes a format number as a third argument */
987 ajuint format);
988 static AjBool seqReadGcg(AjPSeq thys, AjPSeqin seqin);
989 static AjBool seqReadGde(AjPSeq thys, AjPSeqin seqin);
990 static AjBool seqReadGenbank(AjPSeq thys, AjPSeqin seqin);
991 static AjBool seqReadGenpept(AjPSeq thys, AjPSeqin seqin);
992 static AjBool seqReadGifasta(AjPSeq thys, AjPSeqin seqin);
993 static AjBool seqReadGff2(AjPSeq thys, AjPSeqin seqin);
994 static AjBool seqReadGff3(AjPSeq thys, AjPSeqin seqin);
995 static AjBool seqReadHennig86(AjPSeq thys, AjPSeqin seqin);
996 static AjBool seqReadIg(AjPSeq thys, AjPSeqin seqin);
997 static AjBool seqReadIgstrict(AjPSeq thys, AjPSeqin seqin);
998 static AjBool seqReadIguspto(AjPSeq thys, AjPSeqin seqin);
999 static AjBool seqReadJackknifer(AjPSeq thys, AjPSeqin seqin);
1000 static AjBool seqReadMase(AjPSeq thys, AjPSeqin seqin);
1001 static AjBool seqReadMega(AjPSeq thys, AjPSeqin seqin);
1002 static AjBool seqReadMsf(AjPSeq thys, AjPSeqin seqin);
1003 static AjBool seqReadNbrf(AjPSeq thys, AjPSeqin seqin);
1004 static AjBool seqReadNcbi(AjPSeq thys, AjPSeqin seqin);
1005 static AjBool seqReadNexus(AjPSeq thys, AjPSeqin seqin);
1006 static AjBool seqReadNibble(AjPSeq thys, AjPSeqin seqin);
1007 static AjBool seqReadPdb(AjPSeq thys, AjPSeqin seqin);
1008 static AjBool seqReadPdbseq(AjPSeq thys, AjPSeqin seqin);
1009 static AjBool seqReadPdbnuc(AjPSeq thys, AjPSeqin seqin);
1010 static AjBool seqReadPdbnucseq(AjPSeq thys, AjPSeqin seqin);
1011 static AjBool seqReadPhylip(AjPSeq thys, AjPSeqin seqin);
1012 static AjBool seqReadPhylipnon(AjPSeq thys, AjPSeqin seqin);
1013 static AjBool seqReadRaw(AjPSeq thys, AjPSeqin seqin);
1014 static AjBool seqReadRefseq(AjPSeq thys, AjPSeqin seqin);
1015 static AjBool seqReadRefseqp(AjPSeq thys, AjPSeqin seqin);
1016 static AjBool seqReadSam(AjPSeq thys, AjPSeqin seqin);
1017 static AjBool seqReadScf(AjPSeq thys, AjPSeqin seqin);
1018 static AjBool seqReadSelex(AjPSeq thys, AjPSeqin seqin);
1019 static AjBool seqReadStockholm(AjPSeq thys, AjPSeqin seqin);
1020 static AjBool seqReadStaden(AjPSeq thys, AjPSeqin seqin);
1021 static AjBool seqReadStrider(AjPSeq thys, AjPSeqin seqin);
1022 static AjBool seqReadSwiss(AjPSeq thys, AjPSeqin seqin);
1023 static AjBool seqReadText(AjPSeq thys, AjPSeqin seqin);
1024 static AjBool seqReadTreecon(AjPSeq thys, AjPSeqin seqin);
1025 static void seqSelexAppend(const AjPStr src, AjPStr *dest, ajuint beg,
1026 ajuint end);
1027 static void seqSelexCopy(AjPSeq *thys, SeqPSelex selex, ajuint n);
1028 static AjBool seqSelexHeader(SeqPSelex *thys, const AjPStr line,
1029 AjBool *named, ajuint *sqcnt);
1030 static void seqSelexPos(const AjPStr line, ajuint *begin, ajuint *end);
1031 static AjBool seqSelexReadBlock(SeqPSelex *thys, AjBool *named, ajuint n,
1032 AjPStr *line, AjPSeqin seqin, AjPStr *astr);
1033 static AjBool seqSetInFormat(const AjPStr format);
1034 static void seqSetName(AjPSeq thys, const AjPStr str);
1035 static void seqitemSetName(SeqPMsfItem thys, const AjPStr str);
1036 static void seqnameSetName(AjPStr *name, const AjPStr str);
1037 static void seqSetNameFile(AjPSeq thys, const AjPSeqin seqin);
1038 static void seqSetNameNospace(AjPStr* name, const AjPStr str);
1039 static void seqStockholmCopy(AjPSeq *thys, SeqPStockholm stock, ajint n);
1040 static void seqSvSave(AjPSeq thys, const AjPStr sv);
1041 static void seqTaxSave(AjPSeq thys, const AjPStr tax, ajuint level);
1042 static void seqTaxidSaveI(AjPSeq thys, ajuint tax);
1043 static void seqTaxidSaveS(AjPSeq thys, const AjPStr tax);
1044 static void seqTextSeq(AjPStr* textptr, const AjPStr seq);
1045 static void seqUsaListTrace(const AjPList list);
1046 static AjBool seqinUsaProcess(AjPSeqin seqin, AjPSeq thys);
1047 static void seqUsaRegInit(void);
1048 static void seqUsaRestore(AjPSeqin seqin, const SeqPListUsa node);
1049 static void seqUsaSave(SeqPListUsa node, const AjPSeqin seqin);
1050
1051 static void seqqualAppendWarn(AjPStr* seq, const AjPStr line);
1052
1053 static SeqPStockholm stockholmNew(ajuint i);
1054 static void stockholmDel(SeqPStockholm *thys);
1055
1056 static void selexDel(SeqPSelex *thys);
1057 static void selexseqDel(SeqPSelexseq *thys);
1058
1059 /*
1060 static SeqPStockholmdata stockholmdataNew(void);
1061 static void stockholmdataDel(SeqPStockholmdata *thys);
1062 static SeqPSelexdata seqSelexClone(const SeqPSelexdata thys);
1063 static SeqPSelexdata selexdataNew(void);
1064 static void selexdataDel(SeqPSelexdata *thys);
1065 */
1066
1067 static SeqPSelex selexNew(ajuint n);
1068 static SeqPSelexseq selexseqNew(void);
1069
1070 /* static data that needs the function definitions and so must come later */
1071
1072
1073
1074
1075 /* @funclist seqinFormatDef ***************************************************
1076 **
1077 ** Functions to read each sequence format
1078 **
1079 ** New documentation on sequence formats:
1080 ** http://www.megasoftware.net/mega4.pdf pages 55 onwards (sections 4.1, 4.2)
1081 ** describe MEGA, some other formats, simple XML (name and seq)
1082 **
1083 ** The SeqIO program supports some non-EMBOSS formats:
1084 ** http://biowulf.nih.gov/apps/seqio_docs/seqio_user.html
1085 ** notably FASTA-output, BLAST-output
1086 ** and has its own rules for database definitions (BioSeq)
1087 ** and database references
1088 **
1089 ** For XML formats see Paul Gordon's list at
1090 ** http://www.visualgenomics.ca/gordonp/xml/
1091 **
1092 ******************************************************************************/
1093
1094 static SeqOInFormat seqinFormatDef[] =
1095 {
1096 /* "Name",
1097 "Obo" "Description" */
1098 /* Alias, Try, Nucleotide, Protein */
1099 /* Feature Gap, ReadFunction, Multiset, Binary */
1100 {"unknown",
1101 "0000", "Unknown format",
1102 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1103 AJFALSE, AJTRUE, &seqReadText, AJFALSE, AJFALSE}, /* alias for text */
1104 {"gcg",
1105 "1935", "GCG sequence format",
1106 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1107 AJFALSE, AJTRUE, &seqReadGcg, AJFALSE, AJFALSE}, /* do 1st,
1108 headers mislead */
1109 {"gcg8",
1110 "1935", "GCG old (version 8) sequence format",
1111 AJTRUE, AJFALSE, AJTRUE, AJTRUE,
1112 AJFALSE, AJTRUE, &seqReadGcg, AJFALSE, AJFALSE}, /* alias for gcg
1113 (8.x too) */
1114 {"embl",
1115 "1927", "EMBL format",
1116 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1117 AJTRUE, AJTRUE, &seqReadEmbl, AJFALSE, AJFALSE},
1118 {"em",
1119 "1927", "EMBL format (alias)",
1120 AJTRUE, AJFALSE, AJTRUE, AJFALSE,
1121 AJTRUE, AJTRUE, &seqReadEmbl, AJFALSE, AJFALSE}, /* alias for embl */
1122 {"swiss",
1123 "1963", "Swissprot entry format",
1124 AJFALSE, AJTRUE, AJFALSE, AJTRUE,
1125 AJTRUE, AJTRUE, &seqReadSwiss, AJFALSE, AJFALSE},
1126 {"sw",
1127 "1963", "Swissprot entry format (alias)",
1128 AJTRUE, AJFALSE, AJFALSE, AJTRUE,
1129 AJTRUE, AJTRUE, &seqReadSwiss, AJFALSE, AJFALSE}, /* alias for swiss */
1130 {"swissprot",
1131 "1963", "Swissprot entry format (alias)",
1132 AJTRUE, AJFALSE, AJFALSE, AJTRUE,
1133 AJTRUE, AJTRUE, &seqReadSwiss, AJFALSE, AJFALSE},
1134 {"uniprot",
1135 "2188", "Swissprot entry format (alias)",
1136 AJTRUE, AJFALSE, AJFALSE, AJTRUE,
1137 AJTRUE, AJTRUE, &seqReadSwiss, AJFALSE, AJFALSE},
1138 {"nbrf",
1139 "1948", "NBRF/PIR entry format",
1140 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1141 AJTRUE, AJTRUE, &seqReadNbrf, AJFALSE, AJFALSE}, /* test before NCBI */
1142 {"pir",
1143 "1948", "NBRF/PIR entry format (alias)",
1144 AJTRUE, AJFALSE, AJTRUE, AJTRUE,
1145 AJTRUE, AJTRUE, &seqReadNbrf, AJFALSE, AJFALSE}, /* alias for nbrf */
1146 {"pdb",
1147 "1950", "PDB protein databank format ATOM lines",
1148 AJFALSE, AJTRUE, AJFALSE, AJTRUE,
1149 AJFALSE, AJFALSE, &seqReadPdb, AJFALSE, AJFALSE},
1150 {"pdbseq",
1151 "1953", "PDB protein databank format SEQRES lines",
1152 AJFALSE, AJFALSE, AJFALSE, AJTRUE,
1153 AJFALSE, AJFALSE, &seqReadPdbseq, AJFALSE, AJFALSE},
1154 {"pdbnuc",
1155 "1951", "PDB protein databank format nucleotide ATOM lines",
1156 AJFALSE, AJFALSE, AJTRUE, AJFALSE,
1157 AJFALSE, AJFALSE, &seqReadPdbnuc, AJFALSE, AJFALSE},
1158 {"pdbnucseq",
1159 "1952", "PDB protein databank format nucleotide SEQRES lines",
1160 AJFALSE, AJFALSE, AJTRUE, AJFALSE,
1161 AJFALSE, AJFALSE, &seqReadPdbnucseq, AJFALSE, AJFALSE},
1162 {"fasta",
1163 "1929", "FASTA format including NCBI-style IDs",
1164 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1165 AJFALSE, AJTRUE, &seqReadNcbi, AJFALSE, AJFALSE}, /* alias for ncbi,
1166 preferred name */
1167 {"ncbi",
1168 "1929", "FASTA format including NCBI-style IDs (alias)",
1169 AJTRUE, AJFALSE, AJTRUE, AJTRUE,
1170 AJFALSE, AJTRUE, &seqReadNcbi, AJFALSE, AJFALSE}, /* test before
1171 pearson */
1172 {"gifasta",
1173 "1940", "FASTA format including NCBI-style GIs (alias)",
1174 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1175 AJFALSE, AJTRUE, &seqReadGifasta, AJFALSE, AJFALSE}, /* NCBI with GI
1176 as ID*/
1177 {"pearson",
1178 "1954", "Plain old fasta format with IDs not parsed further",
1179 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1180 AJFALSE, AJTRUE, &seqReadFasta, AJFALSE, AJFALSE}, /* plain fasta - off
1181 by default, can
1182 read bad files */
1183 {"fastq",
1184 "1930", "FASTQ short read format ignoring quality scores",
1185 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1186 AJFALSE, AJFALSE, &seqReadFastq, AJFALSE, AJFALSE},
1187 {"fastq-sanger",
1188 "1932", "FASTQ short read format with phred quality",
1189 AJFALSE, AJFALSE, AJTRUE, AJFALSE,
1190 AJFALSE, AJFALSE, &seqReadFastqSanger, AJFALSE, AJFALSE},
1191 {"fastq-illumina",
1192 "1931", "FASTQ Illumina 1.3 short read format",
1193 AJFALSE, AJFALSE, AJTRUE, AJFALSE,
1194 AJFALSE, AJFALSE, &seqReadFastqIllumina, AJFALSE, AJFALSE},
1195 {"fastq-solexa",
1196 "1933", "FASTQ Solexa/Illumina 1.0 short read format",
1197 AJFALSE, AJFALSE, AJTRUE, AJFALSE,
1198 AJFALSE, AJFALSE, &seqReadFastqSolexa, AJFALSE, AJFALSE},
1199 /*
1200 ** {"fastq-int", "FASTQ short read format with integer Solexa scores",
1201 ** AJFALSE, AJFALSE, AJTRUE, AJFALSE,
1202 ** AJFALSE, AJFALSE, seqReadFastqInt, AJFALSE, AJFALSE},
1203 */
1204 {"sam",
1205 "2573", "Sequence Alignment/Map (SAM) format", /* biomart also tsv */
1206 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1207 AJFALSE, AJTRUE, &seqReadSam, AJFALSE, AJFALSE},
1208 {"genbank",
1209 "1936", "Genbank entry format",
1210 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1211 AJTRUE, AJTRUE, &seqReadGenbank, AJFALSE, AJFALSE},
1212 {"gb",
1213 "1936", "Genbank entry format (alias)",
1214 AJTRUE, AJFALSE, AJTRUE, AJFALSE,
1215 AJTRUE, AJTRUE, &seqReadGenbank, AJFALSE, AJFALSE}, /* alias for
1216 genbank */
1217 {"ddbj",
1218 "1936", "Genbank/DDBJ entry format (alias)",
1219 AJTRUE, AJFALSE, AJTRUE, AJFALSE,
1220 AJTRUE, AJTRUE, &seqReadGenbank, AJFALSE, AJFALSE}, /* alias for
1221 genbank */
1222 {"refseq",
1223 "1936", "Refseq entry format (alias)",
1224 AJTRUE, AJFALSE, AJTRUE, AJFALSE,
1225 AJTRUE, AJTRUE, &seqReadRefseq, AJFALSE, AJFALSE}, /* alias for
1226 genbank */
1227 {"refseqp",
1228 "1958", "Refseq protein entry format",
1229 AJFALSE, AJFALSE, AJFALSE, AJTRUE, /* genbank format proteins */
1230 AJTRUE, AJTRUE, &seqReadRefseqp, AJFALSE, AJFALSE},
1231 {"genpept", "1937", "Refseq protein entry format (alias)",
1232 AJFALSE, AJFALSE, AJFALSE, AJTRUE,
1233 AJFALSE, AJTRUE, &seqReadGenpept, AJFALSE, AJFALSE},
1234 {"codata",
1235 "1925", "Codata entry format",
1236 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1237 AJTRUE, AJTRUE, &seqReadCodata, AJFALSE, AJFALSE},
1238 {"strider",
1239 "1962", "DNA strider output format",
1240 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1241 AJFALSE, AJTRUE, &seqReadStrider, AJFALSE, AJFALSE},
1242 {"clustal",
1243 "1924", "Clustalw output format",
1244 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1245 AJFALSE, AJTRUE, &seqReadClustal, AJFALSE, AJFALSE},
1246 {"aln",
1247 "1924", "Clustalw output format (alias)",
1248 AJTRUE, AJFALSE, AJTRUE, AJTRUE,
1249 AJFALSE, AJTRUE, &seqReadClustal, AJFALSE, AJFALSE}, /* alias for
1250 clustal */
1251 {"phylip",
1252 "1955", "Phylip interleaved and non-interleaved formats",
1253 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1254 AJFALSE, AJTRUE, &seqReadPhylip, AJTRUE, AJFALSE},
1255 {"phylipnon",
1256 "1956", "Phylip non-interleaved format",
1257 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1258 AJFALSE, AJTRUE, &seqReadPhylipnon, AJTRUE, AJFALSE}, /* tried by
1259 phylip */
1260 {"ace",
1261 "3001", "ACE sequence format",
1262 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1263 AJFALSE, AJTRUE, &seqReadAce, AJFALSE, AJFALSE},
1264 {"consed",
1265 "3001", "ACE sequence format",
1266 AJTRUE, AJTRUE, AJTRUE, AJFALSE,
1267 AJFALSE, AJTRUE, &seqReadAce, AJFALSE, AJFALSE}, /* alias for ace */
1268 {"acedb",
1269 "1923", "ACEDB sequence format",
1270 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1271 AJFALSE, AJTRUE, &seqReadAcedb, AJFALSE, AJFALSE},
1272 {"dbid",
1273 "1926", "Fasta format variant with database name before ID",
1274 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1275 AJFALSE, AJTRUE, &seqReadDbId, AJFALSE, AJFALSE}, /* odd fasta with id as
1276 second token */
1277 {"msf",
1278 "1947", "GCG MSF (multiple sequence file) file format",
1279 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1280 AJFALSE, AJTRUE, &seqReadMsf, AJFALSE, AJFALSE},
1281 {"hennig86",
1282 "1941", "Hennig86 output format",
1283 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1284 AJFALSE, AJTRUE, &seqReadHennig86, AJFALSE, AJFALSE},
1285 {"jackknifer",
1286 "1944", "Jackknifer interleaved and non-interleaved formats",
1287 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1288 AJFALSE, AJTRUE, &seqReadJackknifer, AJFALSE, AJFALSE},
1289 {"nexus",
1290 "1949", "Nexus/paup interleaved format",
1291 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1292 AJFALSE, AJTRUE, &seqReadNexus, AJFALSE, AJFALSE},
1293 {"paup",
1294 "1949", "Nexus/paup interleaved format (alias)",
1295 AJTRUE, AJFALSE, AJTRUE, AJTRUE,
1296 AJFALSE, AJTRUE, &seqReadNexus, AJFALSE, AJFALSE}, /* alias for nexus */
1297 {"treecon",
1298 "1965", "Treecon output format",
1299 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1300 AJFALSE, AJTRUE, &seqReadTreecon, AJFALSE, AJFALSE},
1301 {"mega",
1302 "1946 1971", "Mega interleaved and non-interleaved formats",
1303 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1304 AJFALSE, AJTRUE, &seqReadMega, AJFALSE, AJFALSE},
1305 {"igstrict",
1306 "1943", "Intelligenetics sequence format strict parser",
1307 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1308 AJFALSE, AJTRUE, &seqReadIgstrict, AJFALSE, AJFALSE},
1309 {"iguspto",
1310 "1942", "US patent office multi-line Intelligenetics sequence format",
1311 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1312 AJFALSE, AJTRUE, &seqReadIguspto, AJFALSE, AJFALSE},
1313 {"ig",
1314 "1942", "Intelligenetics sequence format",
1315 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1316 AJFALSE, AJTRUE, &seqReadIg, AJFALSE, AJFALSE}, /* can read almost
1317 anything */
1318 {"staden",
1319 "1960", "Old staden package sequence format",
1320 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1321 AJFALSE, AJTRUE, &seqReadStaden, AJFALSE, AJFALSE},/* original staden
1322 format */
1323 {"textonly",
1324 "1964", "Plain text",
1325 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1326 AJFALSE, AJTRUE, &seqReadText, AJFALSE, AJFALSE},/* can read almost
1327 anything */
1328 {"plain",
1329 "1964", "Plain text (alias)",
1330 AJTRUE, AJFALSE, AJTRUE, AJTRUE,
1331 AJFALSE, AJTRUE, &seqReadText, AJFALSE, AJFALSE}, /* alias for text */
1332 {"asis",
1333 "1964", "Data as commandline string",
1334 AJTRUE, AJFALSE, AJTRUE, AJTRUE,
1335 AJFALSE, AJTRUE, &seqReadText, AJFALSE, AJFALSE}, /* one line only */
1336 {"gff2",
1337 "1938", "GFF feature file with sequence in the header",
1338 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1339 AJTRUE, AJTRUE, &seqReadGff2, AJFALSE, AJFALSE},
1340 {"gff3",
1341 "1939", "GFF3 feature file with sequence",
1342 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1343 AJTRUE, AJTRUE, &seqReadGff3, AJFALSE, AJFALSE},
1344 {"gff",
1345 "1939", "GFF3 feature file with sequence",
1346 AJTRUE, AJFALSE, AJTRUE, AJTRUE,
1347 AJTRUE, AJTRUE, &seqReadGff3, AJFALSE, AJFALSE},
1348 {"stockholm",
1349 "1961", "Stockholm (pfam) format",
1350 AJFALSE, AJTRUE, AJFALSE, AJTRUE,
1351 AJFALSE, AJTRUE, &seqReadStockholm, AJFALSE, AJFALSE},
1352 {"pfam",
1353 "1961", "Stockholm (pfam) format (alias)",
1354 AJTRUE, AJTRUE, AJFALSE, AJTRUE,
1355 AJFALSE, AJTRUE, &seqReadStockholm, AJFALSE, AJFALSE},
1356 {"selex",
1357 "1959", "Selex format", /* can read almost anything */
1358 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1359 AJFALSE, AJTRUE, &seqReadSelex, AJFALSE, AJFALSE},
1360 {"fitch",
1361 "1934", "Fitch program format",
1362 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1363 AJFALSE, AJTRUE, &seqReadFitch, AJFALSE, AJFALSE},
1364 {"biomart",
1365 "0000", "Biomart tab-delimited results", /* may clash with SAM */
1366 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1367 AJFALSE, AJTRUE, &seqReadBiomart, AJFALSE, AJFALSE},
1368 {"mase",
1369 "1945", "Mase program format",
1370 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1371 AJFALSE, AJTRUE, &seqReadMase, AJFALSE, AJFALSE}, /* like ig - off by
1372 default */
1373 {"experiment",
1374 "1928", "Staden experiment file",
1375 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1376 AJFALSE, AJTRUE, &seqReadExperiment, AJFALSE, AJFALSE},
1377 {"gde",
1378 "0000", "GDE program format",
1379 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1380 AJFALSE, AJTRUE, &seqReadGde, AJFALSE, AJFALSE},
1381 {"raw",
1382 "1957", "Raw sequence with no non-sequence characters",
1383 AJFALSE, AJTRUE, AJTRUE, AJTRUE,
1384 AJFALSE, AJFALSE, &seqReadRaw, AJFALSE, AJTRUE}, /* OK - only sequence
1385 chars allowed - but
1386 binary so not piped */
1387 {"nibble",
1388 "0000", "Nibble format",
1389 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1390 AJFALSE, AJFALSE, &seqReadNibble, AJFALSE, AJTRUE},
1391 {"nib",
1392 "0000", "Nibble format",
1393 AJTRUE, AJFALSE, AJTRUE, AJFALSE,
1394 AJFALSE, AJFALSE, &seqReadNibble, AJFALSE, AJTRUE},
1395 {"abi",
1396 "1628", "ABI trace file",
1397 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1398 AJFALSE, AJFALSE, &seqReadAbi, AJFALSE, AJTRUE},
1399 {"bam",
1400 "2572", "Binary Sequence Alignment/Map (BAM) format",
1401 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1402 AJFALSE, AJTRUE, &seqReadBam, AJFALSE, AJTRUE},
1403 {"ensembl",
1404 "0000", "Ensembl SQL format",
1405 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1406 AJTRUE, AJTRUE, &seqReadEnsembl, AJFALSE, AJFALSE},
1407 {"das",
1408 "1967", "DAS sequence format",
1409 AJFALSE, AJFALSE, AJTRUE, AJTRUE,
1410 AJTRUE, AJTRUE, &seqReadDAS, AJFALSE, AJFALSE},
1411 {"scf",
1412 "2057", "SCF trace file",
1413 AJFALSE, AJTRUE, AJTRUE, AJFALSE,
1414 AJFALSE, AJTRUE, &seqReadScf, AJFALSE, AJTRUE},
1415 {NULL,
1416 NULL, NULL,
1417 0, 0, 0, 0,
1418 0, 0, NULL, 0, 0}
1419 };
1420
1421
1422
1423 /* ==================================================================== */
1424 /* ========================= constructors ============================= */
1425 /* ==================================================================== */
1426
1427
1428
1429
1430 /* @section Sequence Input Constructors ***************************************
1431 **
1432 ** All constructors return a new sequence input object by pointer. It
1433 ** is the responsibility of the user to first destroy any previous
1434 ** sequence input object. The target pointer does not need to be
1435 ** initialised to NULL, but it is good programming practice to do so
1436 ** anyway.
1437 **
1438 ******************************************************************************/
1439
1440
1441
1442
1443 /* @func ajSeqinNew ***********************************************************
1444 **
1445 ** Creates a new sequence input object.
1446 **
1447 ** @return [AjPSeqin] New sequence input object.
1448 ** @category new [AjPSeqin] Default constructor
1449 **
1450 ** @release 1.0.0
1451 ** @@
1452 ******************************************************************************/
1453
ajSeqinNew(void)1454 AjPSeqin ajSeqinNew(void)
1455 {
1456 AjPSeqin pthis;
1457
1458 AJNEW0(pthis);
1459
1460 pthis->Input = ajTextinNewDatatype(AJDATATYPE_SEQUENCE);
1461 pthis->Name = ajStrNew();
1462 pthis->Acc = ajStrNew();
1463 pthis->Full = ajStrNew();
1464 pthis->Date = ajStrNew();
1465 pthis->Desc = ajStrNew();
1466 pthis->Doc = ajStrNew();
1467 pthis->Rev = ajFalse;
1468 pthis->Begin = 0;
1469 pthis->End = 0;
1470 pthis->Ufo = ajStrNew();
1471
1472 pthis->Inputtype = ajStrNew();
1473 pthis->Entryname = ajStrNew();
1474
1475 pthis->DbSequence = ajStrNew();
1476
1477 pthis->Usalist = NULL; /* create only if needed */
1478
1479 pthis->Features = ajFalse;
1480 pthis->Upper = ajFalse;
1481 pthis->Lower = ajFalse;
1482 pthis->SeqData = NULL;
1483 pthis->Ftquery = ajFeattabinNew(); /* empty object */
1484 pthis->Multiset = ajFalse;
1485
1486 return pthis;
1487 }
1488
1489
1490
1491
1492
1493 /* @func ajSeqinNewQueryC ******************************************************
1494 **
1495 ** Creates a new sequence input object.
1496 **
1497 ** @param [r] qrytxt [const char*] Query string
1498 ** @return [AjPSeqin] New sequence input object.
1499 ** @category new [AjPSeqin] Default constructor
1500 **
1501 ** @release 1.0.0
1502 ** @@
1503 ******************************************************************************/
1504
ajSeqinNewQueryC(const char * qrytxt)1505 AjPSeqin ajSeqinNewQueryC(const char* qrytxt)
1506 {
1507 AjPSeqin thys = ajSeqinNew();
1508
1509 ajStrAssignC(&thys->Input->Qry, qrytxt);
1510 return thys;
1511 }
1512
1513
1514
1515
1516 /* @func ajSeqinNewQueryS ******************************************************
1517 **
1518 ** Creates a new sequence input object.
1519 **
1520 ** @param [r] qry [const AjPStr] Query string
1521 ** @return [AjPSeqin] New sequence input object.
1522 ** @category new [AjPSeqin] Default constructor
1523 **
1524 ** @release 1.0.0
1525 ** @@
1526 ******************************************************************************/
1527
ajSeqinNewQueryS(const AjPStr qry)1528 AjPSeqin ajSeqinNewQueryS(const AjPStr qry)
1529 {
1530 AjPSeqin thys = ajSeqinNew();
1531
1532 ajStrAssignS(&thys->Input->Qry, qry);
1533 return thys;
1534 }
1535
1536
1537
1538
1539 /* ==================================================================== */
1540 /* ========================== destructors ============================= */
1541 /* ==================================================================== */
1542
1543
1544
1545
1546 /* @section Sequence Input Destructors ****************************************
1547 **
1548 ** Destruction destroys all internal data structures and frees the
1549 ** memory allocated for the sequence input object.
1550 **
1551 ******************************************************************************/
1552
1553
1554
1555
1556 /* @func ajSeqinDel ***********************************************************
1557 **
1558 ** Deletes a sequence input object.
1559 **
1560 ** @param [d] pthis [AjPSeqin*] Sequence input
1561 ** @return [void]
1562 ** @category delete [AjPSeqin] Default destructor
1563 **
1564 ** @release 1.0.0
1565 ** @@
1566 ******************************************************************************/
1567
ajSeqinDel(AjPSeqin * pthis)1568 void ajSeqinDel(AjPSeqin* pthis)
1569 {
1570 AjPSeqin thys;
1571 SeqPListUsa node = NULL;
1572
1573 if(!pthis)
1574 return;
1575
1576 thys = *pthis;
1577
1578 if(!thys)
1579 return;
1580
1581 ajDebug("ajSeqinDel called usa:'%S'\n", thys->Input->Qry);
1582
1583 ajTextinDel(&thys->Input);
1584
1585 ajStrDel(&thys->Name);
1586 ajStrDel(&thys->Acc);
1587
1588 ajStrDel(&thys->Inputtype);
1589
1590 ajStrDel(&thys->Full);
1591 ajStrDel(&thys->Date);
1592 ajStrDel(&thys->Desc);
1593 ajStrDel(&thys->Doc);
1594
1595 ajStrDel(&thys->Ufo);
1596 ajStrDel(&thys->Entryname);
1597
1598 ajStrDel(&thys->DbSequence);
1599
1600 ajStrDel(&thys->Inseq);
1601
1602 while(ajListGetLength(thys->Usalist))
1603 {
1604 ajListPop(thys->Usalist, (void**) &node);
1605 ajStrDel(&node->Usa);
1606 ajStrDel(&node->Formatstr);
1607 AJFREE(node);
1608 }
1609
1610 ajListFree(&thys->Usalist);
1611
1612 if(thys->Fttable)
1613 ajFeattableDel(&thys->Fttable);
1614
1615 if(thys->Ftquery) /* this deletes filebuff stuff above anyway */
1616 ajFeattabinDel(&thys->Ftquery);
1617
1618 AJFREE(*pthis);
1619
1620 return;
1621 }
1622
1623
1624
1625
1626 /* ==================================================================== */
1627 /* =========================== Modifiers ============================== */
1628 /* ==================================================================== */
1629
1630
1631
1632
1633 /* @section Sequence Input Modifiers ******************************************
1634 **
1635 ** These functions use the contents of a sequence input object and
1636 ** update them.
1637 **
1638 ******************************************************************************/
1639
1640
1641
1642
1643 /* @func ajSeqinUsa ***********************************************************
1644 **
1645 ** Creates or resets a sequence input object using a new Universal
1646 ** Sequence Address
1647 **
1648 ** @param [u] pthis [AjPSeqin*] Sequence input object.
1649 ** @param [r] Usa [const AjPStr] USA
1650 ** @return [void]
1651 ** @category modify [AjPSeqin] Resets using a new USA
1652 **
1653 ** @release 1.0.0
1654 ** @@
1655 ******************************************************************************/
1656
ajSeqinUsa(AjPSeqin * pthis,const AjPStr Usa)1657 void ajSeqinUsa(AjPSeqin* pthis, const AjPStr Usa)
1658 {
1659 AjPSeqin thys;
1660
1661 if(!*pthis)
1662 thys = *pthis = ajSeqinNew();
1663 else
1664 {
1665 thys = *pthis;
1666 ajSeqinClear(thys);
1667 }
1668
1669 ajStrAssignS(&thys->Input->Qry, Usa);
1670
1671 return;
1672 }
1673
1674
1675
1676
1677 /* @func ajSeqinSetNuc ********************************************************
1678 **
1679 ** Sets the type to be forced as nucleic for a sequence input object
1680 **
1681 ** @param [u] seqin [AjPSeqin] Sequence input object to be set.
1682 ** @return [void]
1683 **
1684 ** @release 1.0.0
1685 ** @@
1686 ******************************************************************************/
1687
ajSeqinSetNuc(AjPSeqin seqin)1688 void ajSeqinSetNuc(AjPSeqin seqin)
1689 {
1690 seqin->IsNuc = ajTrue;
1691
1692 return;
1693 }
1694
1695
1696
1697
1698 /* @func ajSeqinSetProt *******************************************************
1699 **
1700 ** Sets the type to be forced as protein for a sequence input object
1701 **
1702 ** @param [u] seqin [AjPSeqin] Sequence input object to be set.
1703 ** @return [void]
1704 **
1705 ** @release 1.0.0
1706 ** @@
1707 ******************************************************************************/
1708
ajSeqinSetProt(AjPSeqin seqin)1709 void ajSeqinSetProt(AjPSeqin seqin)
1710 {
1711 seqin->IsProt = ajTrue;
1712
1713 return;
1714 }
1715
1716
1717
1718
1719 /* @func ajSeqinSetRange ******************************************************
1720 **
1721 ** Sets the start and end positions for a sequence input object
1722 **
1723 ** @param [u] seqin [AjPSeqin] Sequence input object to be set.
1724 ** @param [r] ibegin [ajint] Start position. Negative values are from the end.
1725 ** @param [r] iend [ajint] End position. Negative values are from the end.
1726 ** @return [void]
1727 ** @category modify [AjPSeqin] Sets a sequence range for all input sequences
1728 **
1729 ** @release 1.0.0
1730 ** @@
1731 ******************************************************************************/
1732
ajSeqinSetRange(AjPSeqin seqin,ajint ibegin,ajint iend)1733 void ajSeqinSetRange(AjPSeqin seqin, ajint ibegin, ajint iend)
1734 {
1735 if(ibegin)
1736 seqin->Begin = ibegin;
1737
1738 if(iend)
1739 seqin->End = iend;
1740
1741 return;
1742 }
1743
1744
1745
1746
1747 /* ==================================================================== */
1748 /* ========================== Assignments ============================= */
1749 /* ==================================================================== */
1750
1751
1752
1753
1754 /* @section Sequence Input Assignments ****************************************
1755 **
1756 ** These functions overwrite the sequence input object provided as the
1757 ** first argument.
1758 **
1759 ******************************************************************************/
1760
1761
1762
1763
1764 /* ==================================================================== */
1765 /* ======================== Operators ==================================*/
1766 /* ==================================================================== */
1767
1768
1769
1770
1771 /* @section Sequence Input Operators ******************************************
1772 **
1773 ** These functions use the contents of a sequence input object but do
1774 ** not make any changes.
1775 **
1776 ******************************************************************************/
1777
1778
1779
1780
1781 /* @func ajSeqAllRead *********************************************************
1782 **
1783 ** Parse a USA Uniform Sequence Address into format, access, file and entry
1784 **
1785 ** Split at delimiters. Check for the first part as a valid format
1786 ** Check for the remaining first part as a database name or as a file
1787 ** that can be opened.
1788 ** Anything left is an entryname spec.
1789 **
1790 ** Return the results in the AjPSeq object but leave the file open for
1791 ** future calls.
1792 **
1793 ** @param [w] thys [AjPSeq] Sequence returned.
1794 ** @param [u] seqin [AjPSeqin] Sequence input definitions
1795 ** @return [AjBool] ajTrue on success.
1796 ** @category input [AjPSeq] Master sequence stream input, reads first
1797 ** sequence from an open input stream.
1798 **
1799 ** @release 1.0.0
1800 ** @@
1801 ******************************************************************************/
1802
ajSeqAllRead(AjPSeq thys,AjPSeqin seqin)1803 AjBool ajSeqAllRead(AjPSeq thys, AjPSeqin seqin)
1804 {
1805 AjBool ret = ajFalse;
1806 AjPStr tmpformat = NULL;
1807 SeqPListUsa node = NULL;
1808 AjBool listdata = ajFalse;
1809
1810 if(!seqinFormatIsset)
1811 {
1812 /* we need a copy of the formatlist */
1813 if(ajNamGetValueC("format", &tmpformat))
1814 {
1815 seqSetInFormat(tmpformat);
1816 ajDebug("seqSetInFormat '%S' from EMBOSS_FORMAT\n", tmpformat);
1817 }
1818
1819 ajStrDel(&tmpformat);
1820 seqinFormatIsset = ajTrue;
1821 }
1822
1823 if(!seqin->Input->Filebuff)
1824 {
1825 /* First call. No file open yet ... */
1826 if(!seqinUsaProcess(seqin, thys) /* ... so process the USA */
1827 && !ajListGetLength(seqin->Usalist)) /* not list with bad 1st item */
1828 return ajFalse; /* if this fails, we read no sequence at all */
1829
1830 if(ajListGetLength(seqin->Usalist))
1831 listdata = ajTrue;
1832
1833 ajTextinClearNewfile(seqin->Input);
1834 }
1835
1836
1837 ret = seqRead(thys, seqin); /* read the sequence */
1838
1839 if(ret) /* clone any specified DB or entryname */
1840 {
1841 if (ajStrGetLen(seqin->Input->Db))
1842 {
1843 ajDebug("++ajSeqallRead set db: '%S' => '%S'\n",
1844 seqin->Input->Db, thys->Db);
1845 ajStrAssignS(&thys->Db, seqin->Input->Db);
1846 }
1847
1848 if (ajStrGetLen(seqin->Entryname))
1849 {
1850 ajDebug("++ajSeqallRead set entryname: '%S' => '%S'\n",
1851 seqin->Entryname, thys->Entryname);
1852 ajStrAssignS(&thys->Entryname, seqin->Entryname);
1853 }
1854
1855 if(!ajStrGetLen(thys->Type)) /* make sure the type is set */
1856 ajSeqType(thys);
1857 }
1858
1859 while(!ret && ajListGetLength(seqin->Usalist))
1860 {
1861 /* Failed, but we have a list still - keep trying it */
1862
1863 ajErr("Failed to read sequence '%S'", seqin->Input->Qry);
1864
1865 ajListPop(seqin->Usalist, (void**) &node);
1866 ajDebug("++try again: pop from list '%S'\n", node->Usa);
1867 ajSeqinUsa(&seqin, node->Usa);
1868 ajDebug("++SAVE (AGAIN) SEQIN '%S' %d..%d(%b) '%S' %d\n",
1869 seqin->Input->Qry, seqin->Begin, seqin->End, seqin->Rev,
1870 seqin->Input->Formatstr, seqin->Input->Format);
1871 seqUsaRestore(seqin, node);
1872
1873 ajStrDel(&node->Usa);
1874 ajStrDel(&node->Formatstr);
1875 AJFREE(node);
1876
1877 /* must exit if this fails ... for bad list USAs */
1878
1879 if(!seqinUsaProcess(seqin, thys))
1880 continue;
1881
1882 ajTextinClearNewfile(seqin->Input);
1883
1884 ret = seqRead(thys, seqin);
1885 }
1886
1887 if(!ret)
1888 {
1889 if(listdata)
1890 ajErr("Failed to read sequence '%S'", seqin->Input->Qry);
1891
1892 return ajFalse;
1893 }
1894
1895 if (seqin->Usalist)
1896 ajSeqinClearPos(seqin);
1897
1898 return ret;
1899 }
1900
1901
1902
1903
1904 /* @func ajSeqallFile *********************************************************
1905 **
1906 ** Parse a USA Uniform Sequence Address
1907 **
1908 ** Return the results in the AjPSeqall object but leave the file open for
1909 ** future calls.
1910 **
1911 ** @param [r] usa [const AjPStr] sequence usa.
1912 ** @return [AjPSeqall] seqall object
1913 **
1914 ** @release 1.13.0
1915 ** @@
1916 ******************************************************************************/
1917
ajSeqallFile(const AjPStr usa)1918 AjPSeqall ajSeqallFile(const AjPStr usa)
1919 {
1920 AjPSeqall seqall = NULL;
1921 AjPSeqin seqin = NULL;
1922 AjPSeq seq = NULL;
1923
1924 seqall = ajSeqallNew();
1925
1926 seqin = seqall->Seqin;
1927 seqin->Input->Multi = ajTrue;
1928 seqin->Input->Single = ajFalse;
1929 seq = seqall->Seq;
1930
1931 ajSeqinUsa(&seqin,usa);
1932
1933 if(!ajSeqAllRead(seq,seqin))
1934 {
1935 ajSeqallDel(&seqall);
1936
1937 return NULL;
1938 }
1939
1940 return seqall;
1941 }
1942
1943
1944
1945
1946 /* @func ajSeqallNext *********************************************************
1947 **
1948 ** Reads the next sequence into a sequence stream. For the first call this
1949 ** simply returns the sequence already loaded. For later calls a new
1950 ** sequence is read.
1951 **
1952 ** @param [u] seqall [AjPSeqall] Sequence stream
1953 ** @param [w] retseq [AjPSeq*] Sequence
1954 ** @return [AjBool] ajTrue if a sequence was refound. ajFalse when all is done.
1955 ** @category input [AjPSeq] Master sequence stream input, reads next
1956 ** sequence from an open input stream.
1957 ** @category modify [AjPSeqall] Master sequence stream input,
1958 ** reads next sequence from an open input stream.
1959 **
1960 ** @release 1.0.0
1961 ** @@
1962 ******************************************************************************/
1963
ajSeqallNext(AjPSeqall seqall,AjPSeq * retseq)1964 AjBool ajSeqallNext(AjPSeqall seqall, AjPSeq* retseq)
1965 {
1966 if(!seqall->Count)
1967 {
1968 seqall->Count = 1;
1969
1970 if(seqall->Rev)
1971 ajSeqSetRangeRev(seqall->Seq, seqall->Begin, seqall->End);
1972 else
1973 ajSeqSetRange(seqall->Seq, seqall->Begin, seqall->End);
1974
1975 /*
1976 seqall->Seq->Begin = seqall->Begin;
1977 seqall->Seq->End = seqall->End;
1978 */
1979
1980 seqall->Totseqs++;
1981 seqall->Totlength += ajSeqGetLenTrimmed(seqall->Seq);;
1982
1983 *retseq = seqall->Seq;
1984 seqall->Returned = ajTrue;
1985
1986 return ajTrue;
1987 }
1988
1989
1990 if(ajSeqRead(seqall->Seq, seqall->Seqin))
1991 {
1992 seqall->Count++;
1993
1994 if(seqall->Rev)
1995 ajSeqSetRangeRev(seqall->Seq, seqall->Begin, seqall->End);
1996 else
1997 ajSeqSetRange(seqall->Seq, seqall->Begin, seqall->End);
1998
1999 seqall->Totseqs++;
2000 seqall->Totlength += ajSeqGetLenTrimmed(seqall->Seq);;
2001
2002 *retseq = seqall->Seq;
2003 seqall->Returned = ajTrue;
2004
2005 ajDebug("ajSeqallNext success\n");
2006
2007 return ajTrue;
2008 }
2009
2010 *retseq = NULL;
2011 ajDebug("ajSeqallNext failed\n");
2012 ajSeqallClear(seqall);
2013
2014 return ajFalse;
2015 }
2016
2017
2018
2019
2020 /* @func ajSeqinClearPos ******************************************************
2021 **
2022 ** Clears a Sequence input object position information as possibly read from
2023 ** a USA that included the begin, end and direction
2024 **
2025 ** @param [u] thys [AjPSeqin] Sequence input
2026 ** @return [void]
2027 **
2028 ** @release 2.9.0
2029 ** @@
2030 ******************************************************************************/
2031
ajSeqinClearPos(AjPSeqin thys)2032 void ajSeqinClearPos(AjPSeqin thys)
2033 {
2034 thys->Rev = ajFalse;
2035 thys->Begin = 0;
2036 thys->End = 0;
2037
2038 return;
2039 }
2040
2041
2042
2043
2044 /* @func ajSeqinClear *********************************************************
2045 **
2046 ** Clears a Sequence input object back to "as new" condition, except
2047 ** for the USA list and the features setting which must be preserved.
2048 **
2049 ** @param [w] thys [AjPSeqin] Sequence input
2050 ** @return [void]
2051 ** @category modify [AjPSeqin] Resets ready for reuse.
2052 **
2053 ** @release 1.0.0
2054 ** @@
2055 ******************************************************************************/
2056
ajSeqinClear(AjPSeqin thys)2057 void ajSeqinClear(AjPSeqin thys)
2058 {
2059 ajDebug("ajSeqinClear called\n");
2060
2061 if(!thys)
2062 return;
2063
2064 ajTextinClear(thys->Input);
2065
2066 ajStrSetClear(&thys->Name);
2067 ajStrSetClear(&thys->Acc);
2068 /* preserve thys->Inputtype */
2069 ajStrSetClear(&thys->Full);
2070 ajStrSetClear(&thys->Date);
2071 ajStrSetClear(&thys->Desc);
2072 ajStrSetClear(&thys->Doc);
2073 /* preserve thys->List */
2074 ajStrSetClear(&thys->Ufo);
2075 ajStrSetClear(&thys->Entryname);
2076
2077 ajStrSetClear(&thys->DbSequence);
2078
2079 ajStrSetClear(&thys->Inseq);
2080
2081 /* preserve thys->Usalist */
2082
2083 /* preserve thys->Query */
2084
2085 if(thys->Fttable)
2086 {
2087 ajFeattableDel(&thys->Fttable);
2088 }
2089
2090 if(thys->Ftquery) /* this clears filebuff stuff above anyway */
2091 ajFeattabinClear(thys->Ftquery);
2092
2093 thys->SeqData = NULL;
2094
2095 thys->Rev = ajFalse;
2096
2097 /* keep thys->Features */
2098 /* thys->Features = ajFalse;*/
2099
2100 thys->Begin = 0;
2101 thys->End = 0;
2102
2103 return;
2104 }
2105
2106
2107
2108
2109 /* ==================================================================== */
2110 /* ============================ Casts ==================================*/
2111 /* ==================================================================== */
2112
2113
2114
2115
2116 /* @section Sequence Input Casts **********************************************
2117 **
2118 ** These functions examine the contents of a sequence input object and
2119 ** return some derived information. Some of them provide access to the
2120 ** internal components of a sequence input object. They are provided
2121 ** for programming convenience but should be used with caution.
2122 **
2123 ******************************************************************************/
2124
2125
2126
2127
2128 /* ==================================================================== */
2129 /* ========================== Assignments ============================= */
2130 /* ==================================================================== */
2131
2132
2133
2134
2135 /* @section Sequence inputs **********************************************
2136 **
2137 ** These functions read the sequence provided by the first argument
2138 **
2139 ******************************************************************************/
2140
2141
2142
2143
2144 /* @func ajSeqRead ************************************************************
2145 **
2146 ** If the file is not yet open, calls seqinUsaProcess to convert the USA into
2147 ** an open file stream.
2148 **
2149 ** Uses seqRead for the actual file reading.
2150 **
2151 ** Returns the results in the AjPSeq object.
2152 **
2153 ** @param [w] thys [AjPSeq] Sequence returned.
2154 ** @param [u] seqin [AjPSeqin] Sequence input definitions
2155 ** @return [AjBool] ajTrue on success.
2156 ** @category input [AjPSeq] Master sequence input, calls specific functions
2157 ** for file access type and sequence format.
2158 **
2159 ** @release 1.0.0
2160 ** @@
2161 ******************************************************************************/
2162
ajSeqRead(AjPSeq thys,AjPSeqin seqin)2163 AjBool ajSeqRead(AjPSeq thys, AjPSeqin seqin)
2164 {
2165 AjPStr tmpformat = NULL;
2166 AjBool ret = ajFalse;
2167 SeqPListUsa node = NULL;
2168 AjBool listdata = ajFalse;
2169
2170 if(!seqinFormatIsset)
2171 {
2172 /* we need a copy of the formatlist */
2173 if(ajNamGetValueC("format", &tmpformat))
2174 {
2175 seqSetInFormat(tmpformat);
2176 ajDebug("seqSetInFormat '%S' from EMBOSS_FORMAT\n", tmpformat);
2177 }
2178
2179 ajStrDel(&tmpformat);
2180 seqinFormatIsset = ajTrue;
2181 }
2182
2183 if(seqin->Input->Filebuff)
2184 {
2185 /* (a) if file still open, keep reading */
2186 ajDebug("ajSeqRead: input file '%F' still there, try again\n",
2187 seqin->Input->Filebuff->File);
2188 ret = seqRead(thys, seqin);
2189 ajDebug("ajSeqRead: open buffer usa: '%S' returns: %B\n",
2190 seqin->Input->Qry, ret);
2191 }
2192 else
2193 {
2194 /* (b) if we have a list, try the next USA in the list */
2195 if(ajListGetLength(seqin->Usalist))
2196 {
2197 listdata = ajTrue;
2198 ajListPop(seqin->Usalist, (void**) &node);
2199
2200 ajDebug("++pop from list '%S'\n", node->Usa);
2201 ajSeqinUsa(&seqin, node->Usa);
2202 ajDebug("++SAVE SEQIN '%S' %d..%d(%b) '%S' %d\n",
2203 seqin->Input->Qry, seqin->Begin, seqin->End, seqin->Rev,
2204 seqin->Input->Formatstr, seqin->Input->Format);
2205 seqUsaRestore(seqin, node);
2206
2207 ajStrDel(&node->Usa);
2208 ajStrDel(&node->Formatstr);
2209 AJFREE(node);
2210
2211 ajDebug("ajSeqRead: open list, try '%S'\n", seqin->Input->Qry);
2212
2213 if(!seqinUsaProcess(seqin, thys) &&
2214 !ajListGetLength(seqin->Usalist))
2215 return ajFalse;
2216
2217 ajTextinClearNewfile(seqin->Input);
2218
2219 ret = seqRead(thys, seqin);
2220 ajDebug("ajSeqRead: list usa: '%S' returns: %B\n",
2221 seqin->Input->Qry, ret);
2222 }
2223 else
2224 {
2225 ajDebug("ajSeqRead: no file yet - test USA '%S'\n",
2226 seqin->Input->Qry);
2227
2228 /* (c) Must be a USA - decode it */
2229 if(!seqinUsaProcess(seqin, thys) &&
2230 !ajListGetLength(seqin->Usalist))
2231 return ajFalse;
2232
2233 if(ajListGetLength(seqin->Usalist)) /* could be a new list */
2234 listdata = ajTrue;
2235
2236 ajTextinClearNewfile(seqin->Input);
2237
2238 ret = seqRead(thys, seqin);
2239 ajDebug("ajSeqRead: new usa: '%S' returns: %B\n",
2240 seqin->Input->Qry, ret);
2241 }
2242 }
2243
2244 /* Now read whatever we got */
2245
2246 while(!ret && ajListGetLength(seqin->Usalist))
2247 {
2248 /* Failed, but we have a list still - keep trying it */
2249 if(listdata)
2250 ajErr("Failed to read sequence '%S'", seqin->Input->Qry);
2251
2252 listdata = ajTrue;
2253 ajListPop(seqin->Usalist,(void**) &node);
2254 ajDebug("++try again: pop from list '%S'\n", node->Usa);
2255 ajSeqinUsa(&seqin, node->Usa);
2256 ajDebug("++SAVE (AGAIN) SEQIN '%S' %d..%d(%b) '%S' %d\n",
2257 seqin->Input->Qry, seqin->Begin, seqin->End, seqin->Rev,
2258 seqin->Input->Formatstr, seqin->Input->Format);
2259 seqUsaRestore(seqin, node);
2260
2261 ajStrDel(&node->Usa);
2262 ajStrDel(&node->Formatstr);
2263 AJFREE(node);
2264
2265 if(!seqinUsaProcess(seqin, thys))
2266 continue;
2267
2268 ajTextinClearNewfile(seqin->Input);
2269
2270 ret = seqRead(thys, seqin);
2271 ajDebug("ajSeqRead: list retry usa: '%S' returns: %B\n",
2272 seqin->Input->Qry, ret);
2273 }
2274
2275 if(!ret)
2276 {
2277 if(listdata)
2278 ajErr("Failed to read sequence '%S'", seqin->Input->Qry);
2279
2280 return ajFalse;
2281 }
2282
2283
2284 seqDefine(thys, seqin);
2285
2286 return ajTrue;
2287 }
2288
2289
2290
2291
2292 /* ==================================================================== */
2293 /* ========================== Assignments ============================= */
2294 /* ==================================================================== */
2295
2296
2297
2298
2299 /* @section Sequence Set Inputs ******************************************
2300 **
2301 ** These functions read the sequence set object provided as the
2302 ** first argument.
2303 **
2304 ******************************************************************************/
2305
2306
2307
2308
2309 /* @func ajSeqsetRead *********************************************************
2310 **
2311 ** Parse a USA Uniform Sequence Address into format, access, file and entry
2312 **
2313 ** Split at delimiters. Check for the first part as a valid format
2314 ** Check for the remaining first part as a database name or as a file
2315 ** that can be opened.
2316 ** Anything left is an entryname spec.
2317 **
2318 ** Read all the sequences until done
2319 **
2320 ** Return the results in the AjPSeqset object.
2321 **
2322 ** @param [w] thys [AjPSeqset] Sequence set returned.
2323 ** @param [u] seqin [AjPSeqin] Sequence input definitions
2324 ** @return [AjBool] ajTrue on success.
2325 ** @category input [AjPSeqset] Master input routine for a sequence
2326 ** set
2327 **
2328 ** @release 1.0.0
2329 ** @@
2330 ******************************************************************************/
2331
ajSeqsetRead(AjPSeqset thys,AjPSeqin seqin)2332 AjBool ajSeqsetRead(AjPSeqset thys, AjPSeqin seqin)
2333 {
2334 AjPSeq seq;
2335 AjPList setlist;
2336
2337 ajuint iseq = 0;
2338
2339 seq = ajSeqNew();
2340
2341 ajDebug("ajSeqsetRead\n");
2342
2343 if(!seqinUsaProcess(seqin, seq))
2344 return ajFalse;
2345
2346 ajTextinClearNewfile(seqin->Input);
2347
2348 ajStrAssignS(&thys->Usa, seqin->Input->Qry);
2349 ajStrAssignS(&thys->Ufo, seqin->Ufo);
2350 thys->Begin = seqin->Begin;
2351 thys->End = seqin->End;
2352
2353 setlist = ajListNew();
2354
2355 ajDebug("ready to start reading format '%S' '%S' %d..%d\n",
2356 seqin->Input->Formatstr, seq->Formatstr, seqin->Begin, seqin->End);
2357
2358 while(!seqin->Multidone && ajSeqRead(seq, seqin))
2359 {
2360 if (seqin->Usalist)
2361 ajSeqinClearPos(seqin);
2362 /*ajDebug("read name '%S' length %d format '%S' '%S' seqindata: %x\n",
2363 seq->Entryname, ajSeqGetLen(seq),
2364 seqin->Input->Formatstr, seq->Formatstr, seqin->SeqData);*/
2365 ajStrAssignEmptyS(&seq->Db, seqin->Input->Db);
2366
2367 if(!ajStrGetLen(seq->Type))
2368 ajSeqType(seq);
2369
2370 if(thys->Rev)
2371 ajSeqSetRangeRev(seq, thys->Begin, thys->End);
2372 else
2373 ajSeqSetRange(seq, thys->Begin, thys->End);
2374
2375 ajDebug ("ajSeqsetRead read sequence %d %x '%S' %d..%d (%d) "
2376 "Rev:%B Reversed:%B\n",
2377 iseq, seq, ajSeqGetNameS(seq),
2378 seq->Begin, seq->End, ajSeqGetLen(seq),
2379 seq->Rev, seq->Reversed);
2380
2381 /*ajSeqTrace(seq);*/
2382 iseq++;
2383
2384 ajListPushAppend(setlist, seq);
2385
2386 /*ajDebug("appended to list\n");*/
2387
2388 /* add to a list of sequences */
2389
2390 seq = ajSeqNew();
2391 seqinFormatSet(seqin, seq);
2392 }
2393
2394 ajSeqDel(&seq);
2395
2396 if(!iseq)
2397 return ajFalse;
2398
2399 /* convert the list of sequences into a seqset structure */
2400
2401 ajSeqsetFromList(thys, setlist);
2402
2403 ajListFree(&setlist);
2404
2405 ajDebug("ajSeqsetRead total %d sequences\n", iseq);
2406
2407 return ajTrue;
2408 }
2409
2410
2411
2412
2413 /* @func ajSeqsetallRead ******************************************************
2414 **
2415 ** Parse a USA Uniform Sequence Address into format, access, file and entry
2416 **
2417 ** Split at delimiters. Check for the first part as a valid format
2418 ** Check for the remaining first part as a database name or as a file
2419 ** that can be opened.
2420 ** Anything left is an entryname spec.
2421 **
2422 ** Read all the sequences into sequence sets until done
2423 **
2424 ** Start a new set for each multiple sequence input
2425 **
2426 ** Return the results in the AjPList object with AjPSeqset nodes
2427 **
2428 ** @param [w] thys [AjPList] List of sequence sets returned.
2429 ** @param [u] seqin [AjPSeqin] Sequence input definitions
2430 ** @return [AjBool] ajTrue on success.
2431 **
2432 ** @release 2.8.0
2433 ** @@
2434 ******************************************************************************/
2435
ajSeqsetallRead(AjPList thys,AjPSeqin seqin)2436 AjBool ajSeqsetallRead(AjPList thys, AjPSeqin seqin)
2437 {
2438 AjPSeq seq = NULL;
2439 AjPList setlist = NULL;
2440 AjPSeqset seqset = NULL;
2441
2442 ajuint iseq = 0;
2443
2444 seq = ajSeqNew();
2445
2446 ajDebug("ajSeqsetallRead\n");
2447
2448 if(!seqinUsaProcess(seqin, seq))
2449 return ajFalse;
2450
2451 ajTextinClearNewfile(seqin->Input);
2452
2453 ajDebug("ready to start reading format '%S' '%S' %d..%d\n",
2454 seqin->Input->Formatstr, seq->Formatstr, seqin->Begin, seqin->End);
2455
2456 while(ajSeqRead(seq, seqin))
2457 {
2458 ajDebug("read name '%S' length %d format '%S' '%S' "
2459 "seqindata: %x multidone: %B\n",
2460 seq->Entryname, ajSeqGetLen(seq),
2461 seqin->Input->Formatstr, seq->Formatstr,
2462 seqin->SeqData, seqin->Multidone);
2463 ajStrAssignEmptyS(&seq->Db, seqin->Input->Db);
2464
2465 if(!ajStrGetLen(seq->Type))
2466 ajSeqType(seq);
2467
2468 /*ajDebug ("ajSeqsetallRead read sequence %d '%s' %d..%d\n",
2469 iseq, ajSeqGetNameC(seq), seq->Begin, seq->End);*/
2470 /*ajSeqTrace(seq);*/
2471 iseq++;
2472
2473 if(!setlist)
2474 setlist = ajListNew();
2475
2476 ajListPushAppend(setlist, seq);
2477
2478 /*ajDebug("appended to list\n");*/
2479
2480 /* add to a list of sequences */
2481
2482 seq = ajSeqNew();
2483 seqinFormatSet(seqin, seq);
2484
2485 if(seqin->Multidone)
2486 {
2487 seqset = ajSeqsetNew();
2488 ajStrAssignS(&seqset->Usa, seqin->Input->Qry);
2489 ajStrAssignS(&seqset->Ufo, seqin->Ufo);
2490 seqset->Begin = seqin->Begin;
2491 seqset->End = seqin->End;
2492
2493 ajSeqsetFromList(seqset, setlist);
2494 ajListFree(&setlist);
2495 ajListPushAppend(thys, seqset);
2496 ajDebug("ajSeqsetallRead multidone save set %Lu of %u sequences\n",
2497 ajListGetLength(thys), ajSeqsetGetSize(seqset));
2498 seqset = NULL;
2499 }
2500 }
2501
2502 ajSeqDel(&seq);
2503
2504 if(!iseq)
2505 return ajFalse;
2506
2507 /* convert the list of sequences into a seqset structure */
2508
2509 if(ajListGetLength(setlist))
2510 {
2511 seqset = ajSeqsetNew();
2512 ajStrAssignS(&seqset->Usa, seqin->Input->Qry);
2513 ajStrAssignS(&seqset->Ufo, seqin->Ufo);
2514 seqset->Begin = seqin->Begin;
2515 seqset->End = seqin->End;
2516
2517 ajSeqsetFromList(seqset, setlist);
2518 ajListFree(&setlist);
2519 ajListPushAppend(thys, seqset);
2520 seqset = NULL;
2521 }
2522
2523 ajDebug("ajSeqsetallRead total %Lu sets of %d sequences\n",
2524 ajListGetLength(thys), iseq);
2525
2526 return ajTrue;
2527 }
2528
2529
2530
2531
2532 /* @func ajSeqsetFromList *****************************************************
2533 **
2534 ** Builds a sequence set from a list of sequences
2535 **
2536 ** @param [w] thys [AjPSeqset] Sequence set
2537 ** @param [r] list [const AjPList] List of sequence objects
2538 ** @return [ajint] Number of sequences in the set.
2539 **
2540 ** @release 2.1.0
2541 ******************************************************************************/
2542
ajSeqsetFromList(AjPSeqset thys,const AjPList list)2543 ajint ajSeqsetFromList(AjPSeqset thys, const AjPList list)
2544 {
2545
2546 ajuint i;
2547 AjIList iter;
2548 AjPSeq seq;
2549
2550 ajDebug("ajSeqsetFromList length: %Lu\n", ajListGetLength(list));
2551
2552 /*ajListTrace(list);*/
2553
2554 thys->Size = (ajuint) ajListGetLength(list);
2555 thys->Seq = AJCALLOC0(thys->Size, sizeof(AjPSeq));
2556 thys->Seqweight = AJCALLOC0(thys->Size, sizeof(float));
2557
2558 i = 0;
2559 iter = ajListIterNewread(list);
2560 ajListIterTrace(iter);
2561
2562 while((seq = (AjPSeq) ajListIterGet(iter)))
2563 {
2564 if(!i)
2565 {
2566 thys->EType = seq->EType;
2567 ajStrAssignS(&thys->Type, seq->Type);
2568 thys->Format = seq->Format;
2569 ajStrAssignS(&thys->Formatstr, seq->Formatstr);
2570 ajStrAssignS(&thys->Filename, seq->Filename);
2571 ajStrAssignS(&thys->Full, seq->Full);
2572 }
2573
2574 thys->Seqweight[i] = seq->Weight;
2575 thys->Seq[i] = seq;
2576 thys->Totweight += seq->Weight;
2577
2578 if(ajSeqGetLen(seq) > thys->Len)
2579 thys->Len = ajSeqGetLen(seq);
2580
2581 /* ajDebug("seq %d '%x'\n", i, seq);*/
2582 ajDebug("seq '%x' len: %d weight: %.3f\n",
2583 seq->Name, ajSeqGetLen(seq), thys->Seq[i]->Weight);
2584 i++;
2585 }
2586 ajListIterDel(&iter);
2587
2588 return thys->Size;
2589 }
2590
2591
2592
2593
2594 /* @func ajSeqsetFromPair *****************************************************
2595 **
2596 ** Builds a sequence set from a pair of sequences
2597 **
2598 ** @param [w] thys [AjPSeqset] Sequence set
2599 ** @param [r] seqa [const AjPSeq] Sequence 1
2600 ** @param [r] seqb [const AjPSeq] Sequence 2
2601 ** @return [ajint] Number of sequences in the set.
2602 **
2603 ** @release 2.1.0
2604 ******************************************************************************/
2605
ajSeqsetFromPair(AjPSeqset thys,const AjPSeq seqa,const AjPSeq seqb)2606 ajint ajSeqsetFromPair(AjPSeqset thys, const AjPSeq seqa, const AjPSeq seqb)
2607 {
2608
2609 ajSeqsetApp(thys, seqa);
2610 ajSeqsetApp(thys, seqb);
2611
2612 return thys->Size;
2613 }
2614
2615
2616
2617
2618 /* @func ajSeqsetApp **********************************************************
2619 **
2620 ** Adds a sequence to a sequence set
2621 **
2622 ** @param [w] thys [AjPSeqset] Sequence set
2623 ** @param [r] seq [const AjPSeq] Sequence
2624 ** @return [ajint] Number of sequences in the set.
2625 **
2626 ** @release 2.1.0
2627 ******************************************************************************/
2628
ajSeqsetApp(AjPSeqset thys,const AjPSeq seq)2629 ajint ajSeqsetApp(AjPSeqset thys, const AjPSeq seq)
2630 {
2631 ajuint iseq;
2632
2633 iseq = thys->Size;
2634
2635 ajDebug("ajSeqsetApp '%S' size %d len %d add '%S' len %d\n",
2636 thys->Full, thys->Size, thys->Len,
2637 seq->Full, ajSeqGetLen(seq));
2638
2639 thys->Size ++;
2640 AJCRESIZE(thys->Seq, thys->Size);
2641 AJCRESIZE(thys->Seqweight, thys->Size);
2642
2643 if(!iseq)
2644 {
2645 thys->EType = seq->EType;
2646 ajStrAssignEmptyS(&thys->Type, seq->Type);
2647 thys->Format = seq->Format;
2648 ajStrAssignEmptyS(&thys->Formatstr, seq->Formatstr);
2649 ajStrAssignEmptyS(&thys->Filename, seq->Filename);
2650 ajStrAssignEmptyS(&thys->Full, seq->Full);
2651 }
2652
2653 thys->Seqweight[iseq] = seq->Weight;
2654 thys->Seq[iseq] = ajSeqNewSeq(seq);
2655 thys->Totweight += seq->Weight;
2656
2657 if(ajSeqGetLen(seq) > thys->Len)
2658 thys->Len = ajSeqGetLen(seq);
2659
2660 ajDebug("result '%S' size %d len\n",
2661 thys->Full, thys->Size, thys->Len);
2662
2663 return thys->Size;
2664 }
2665
2666
2667
2668
2669 /* @funcstatic seqReadFmt *****************************************************
2670 **
2671 ** Tests whether a sequence can be read using the specified format.
2672 ** Then tests whether the sequence matches sequence query criteria
2673 ** and checks any specified type. Applies upper and lower case.
2674 **
2675 ** @param [w] thys [AjPSeq] Sequence object
2676 ** @param [u] seqin [AjPSeqin] Sequence input object
2677 ** @param [r] format [ajuint] input format code
2678 ** @return [ajuint] 0 if successful.
2679 ** 1 if the query match failed.
2680 ** 2 if the sequence type failed
2681 ** 3 if it failed to read a sequence
2682 **
2683 ** @release 1.0.0
2684 ** @@
2685 ** This is the only function that calls the appropriate Read function
2686 ** seqReadXxxxxx where Xxxxxxx is the supported sequence format.
2687 **
2688 ** Some of the seqReadXxxxxx functions fail to reset the buffer correctly,
2689 ** which is a very serious problem when cycling through all of them to
2690 ** identify an unknown format. The extra ajFileBuffReset call at the end is
2691 ** intended to address this problem. The individual functions should still
2692 ** reset the buffer in case they are called from elsewhere.
2693 **
2694 ******************************************************************************/
2695
seqReadFmt(AjPSeq thys,AjPSeqin seqin,ajuint format)2696 static ajuint seqReadFmt(AjPSeq thys, AjPSeqin seqin,
2697 ajuint format)
2698 {
2699 ajDebug("++seqReadFmt format %d (%s) '%S' feat %B\n",
2700 format, seqinFormatDef[format].Name,
2701 seqin->Input->Qry, seqin->Features);
2702
2703 ajTextinClearNewinput(seqin->Input);
2704
2705 /* Calling funclist seqinFormatDef() */
2706 if((*seqinFormatDef[format].Read)(thys, seqin))
2707 {
2708 ajDebug("seqReadFmt success with format %d (%s)\n",
2709 format, seqinFormatDef[format].Name);
2710 ajDebug("id: '%S' len: %d\n",
2711 thys->Name, ajStrGetLen(thys->Seq));
2712 seqin->Input->Format = format;
2713 ajStrAssignC(&seqin->Input->Formatstr, seqinFormatDef[format].Name);
2714 ajStrAssignC(&thys->Formatstr, seqinFormatDef[format].Name);
2715 ajStrAssignEmptyS(&thys->Db, seqin->Input->Db);
2716 ajStrAssignS(&thys->Entryname, seqin->Entryname);
2717 ajStrAssignS(&thys->Filename, seqin->Input->Filename);
2718
2719 if(seqQueryMatch(seqin->Input->Query, thys))
2720 {
2721 ajStrAssignEmptyS(&thys->Entryname, thys->Name);
2722
2723 ajDebug("seqQueryMatch Features:%B FtTable: %x (%u)\n",
2724 seqin->Features, thys->Fttable,
2725 ajFeattableGetSize(thys->Fttable));
2726
2727 if(seqin->Features && !thys->Fttable)
2728 {
2729 ajStrAssignEmptyS(&seqin->Ftquery->Seqname, thys->Entryname);
2730 seqin->Fttable = ajFeattableNewReadUfo(seqin->Ftquery,
2731 seqin->Ufo);
2732 if (!seqin->Fttable)
2733 {
2734 ajDebug("seqReadFmt features input failed UFO: '%S'\n",
2735 seqin->Ufo);
2736 /*
2737 ** GWW 21 Aug 2000 - don't warn about missing feature
2738 ** tables
2739 **/
2740 }
2741 else
2742 {
2743 ajFeattableSetLength(seqin->Fttable,
2744 ajStrGetLen(thys->Seq));
2745 ajFeattableDel(&thys->Fttable);
2746 /* ajFeattableTrace(seqin->Fttable); */
2747 thys->Fttable = seqin->Fttable;
2748 seqin->Fttable = NULL;
2749 }
2750 }
2751
2752 if (!ajStrGetLen(thys->Seq)) /* empty sequence string! */
2753 return FMT_EMPTY;
2754
2755 if(ajSeqTypeCheckIn(thys, seqin))
2756 {
2757 if (!ajStrGetLen(thys->Seq)) /* removed all remaining chars */
2758 return FMT_EMPTY;
2759
2760 /* ajSeqinTrace(seqin); */
2761 if(seqin->Upper)
2762 ajSeqFmtUpper(thys);
2763
2764 if(seqin->Lower)
2765 ajSeqFmtLower(thys);
2766
2767 if(seqin->Begin)
2768 thys->Begin = seqin->Begin;
2769
2770 if(seqin->End)
2771 thys->End = seqin->End;
2772
2773 if(seqin->Rev)
2774 thys->Rev = seqin->Rev;
2775
2776 return FMT_OK;
2777 }
2778 else
2779 return FMT_BADTYPE;
2780 }
2781
2782 ajDebug("query match failed, continuing ...\n");
2783 ajSeqClear(thys);
2784
2785 if(seqinFormatDef[format].Binary)
2786 return FMT_FAIL; /* do not reread - will read whole file again */
2787 else
2788 return FMT_NOMATCH;
2789 }
2790 else
2791 {
2792 ajDebug("Testing input buffer: IsBuff: %B Eof: %B\n",
2793 ajFilebuffIsBuffered(seqin->Input->Filebuff),
2794 ajFilebuffIsEof(seqin->Input->Filebuff));
2795
2796 if (!ajFilebuffIsBuffered(seqin->Input->Filebuff) &&
2797 ajFilebuffIsEof(seqin->Input->Filebuff))
2798 return FMT_EOF;
2799
2800 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
2801 ajDebug("Format %d (%s) failed, file buffer reset by seqReadFmt\n",
2802 format, seqinFormatDef[format].Name);
2803 /* ajFilebuffTraceFull(seqin->Input->Filebuff, 10, 10);*/
2804 }
2805
2806 ajDebug("++seqReadFmt failed - nothing read\n");
2807
2808 return FMT_FAIL;
2809 }
2810
2811
2812
2813
2814 /* @funcstatic seqRead ********************************************************
2815 **
2816 ** Given data in a seqin structure, tries to read everything needed
2817 ** using the specified format or by trial and error.
2818 **
2819 ** @param [w] thys [AjPSeq] Sequence object
2820 ** @param [u] seqin [AjPSeqin] Sequence input object
2821 ** @return [AjBool] ajTrue on success
2822 **
2823 ** @release 1.0.0
2824 ** @@
2825 ******************************************************************************/
2826
seqRead(AjPSeq thys,AjPSeqin seqin)2827 static AjBool seqRead(AjPSeq thys, AjPSeqin seqin)
2828 {
2829 ajuint i;
2830 ajuint istat = 0;
2831 ajuint jstat = 0;
2832
2833 AjPTextin textin = seqin->Input;
2834 AjPFilebuff buff = textin->Filebuff;
2835 AjPQuery qry = textin->Query;
2836 AjBool regfile = ajFalse;
2837 AjBool ok;
2838 AjPTextAccess textaccess = NULL;
2839 AjPSeqAccess seqaccess = NULL;
2840
2841 ajSeqClear(thys);
2842 ajDebug("seqRead: cleared Single:%B Count:%u SeqData:%p "
2843 "TextData:%p\n",
2844 textin->Single, ajTextinGetCount(textin),
2845 seqin->SeqData, textin->TextData);
2846
2847 if(textin->Single && ajTextinGetCount(textin))
2848 {
2849 /*
2850 ** One sequence at a time is read.
2851 ** The first sequence was read by ACD
2852 ** for the following ones we need to reset the AjPSeqin
2853 **
2854 ** Single is set by the access method
2855 */
2856
2857 ajDebug("seqRead: single access - count %u - lines %u (total %u) "
2858 "call access routine again\n",
2859 ajTextinGetCount(textin),
2860 ajTextinGetRecords(textin), ajTextinGetTotrecords(textin));
2861 /* Calling funclist seqAccess() */
2862 textaccess = qry->TextAccess;
2863 seqaccess = qry->Access;
2864 if(textaccess && !(*textaccess->Access)(textin))
2865 {
2866 ajDebug("seqRead: (*textaccess->Access)(seqin->Input) "
2867 "*failed*\n");
2868
2869 return ajFalse;
2870 }
2871 if(seqaccess && !(*seqaccess->Access)(seqin))
2872 {
2873 ajDebug("seqRead: (*seqaccess->Access)(seqin) "
2874 "*failed*\n");
2875
2876 return ajFalse;
2877 }
2878 buff = textin->Filebuff;
2879 }
2880
2881 ajDebug("seqRead: seqin format %d '%S'\n", textin->Format,
2882 textin->Formatstr);
2883
2884 textin->Count++;
2885
2886 if(!textin->Filebuff)
2887 return ajFalse;
2888
2889 ok = ajFilebuffIsBuffered(textin->Filebuff);
2890
2891 if(!seqinFormatDef[textin->Format].Binary)
2892 {
2893 while(ok)
2894 { /* skip blank lines */
2895 ok = ajBuffreadLine(textin->Filebuff, &seqReadLine);
2896
2897 if(ok && !ajStrIsWhite(seqReadLine))
2898 {
2899 ajFilebuffClear(textin->Filebuff,1);
2900 break;
2901 }
2902 }
2903 }
2904
2905 if(!textin->Format)
2906 { /* no format specified, try all defaults */
2907
2908 regfile = ajFileIsFile(ajFilebuffGetFile(textin->Filebuff));
2909
2910 for(i = 1; seqinFormatDef[i].Name; i++)
2911 {
2912 if(!seqinFormatDef[i].Try) /* skip if Try is ajFalse */
2913 continue;
2914
2915 if(seqinFormatDef[i].Binary && !regfile)
2916 {
2917 ajDebug("seqRead: binary stdin skip format %d (%s)\n",
2918 i, seqinFormatDef[i].Name);
2919 continue;
2920 }
2921
2922 ajDebug("seqRead:try format %d (%s) records: %u (total %u) "
2923 "seqdata: %p\n",
2924 i, seqinFormatDef[i].Name,
2925 ajTextinGetRecords(textin),
2926 ajTextinGetTotrecords(textin),
2927 seqin->SeqData);
2928
2929 ajTextinClearNewinput(seqin->Input);
2930
2931 istat = seqReadFmt(thys, seqin, i);
2932
2933 switch(istat)
2934 {
2935 case FMT_OK:
2936 ajDebug("++seqRead OK (1), set format %d\n",
2937 textin->Format);
2938 seqDefine(thys, seqin);
2939
2940 return ajTrue;
2941 case FMT_BADTYPE:
2942 ajDebug("seqRead: (a1) seqReadFmt stat == BADTYPE *failed*\n");
2943
2944 return ajFalse;
2945 case FMT_FAIL:
2946 ajDebug("seqRead: (b1) seqReadFmt stat == FAIL *failed*\n");
2947 break; /* we can try next format */
2948 case FMT_NOMATCH:
2949 ajDebug("seqRead: (c1) seqReadFmt stat==NOMATCH try again\n");
2950 break;
2951 case FMT_EOF:
2952 ajDebug("seqRead: (d1) seqReadFmt stat == EOF *failed*\n");
2953 return ajFalse; /* EOF and unbuffered */
2954 case FMT_EMPTY:
2955 ajWarn("Sequence '%S' has zero length, ignored",
2956 ajSeqGetUsaS(thys));
2957 ajDebug("seqRead: (e1) seqReadFmt stat==EMPTY try again\n");
2958 break;
2959 default:
2960 ajDebug("unknown code %d from seqReadFmt\n", stat);
2961 }
2962
2963 ajSeqClear(thys);
2964
2965 if(textin->Format)
2966 break; /* we read something */
2967
2968 ajFilebuffTrace(textin->Filebuff);
2969 }
2970
2971 if(!textin->Format)
2972 { /* all default formats failed, give up */
2973 ajDebug("seqRead:all default formats failed, give up\n");
2974
2975 return ajFalse;
2976 }
2977
2978 ajDebug("++seqRead set format %d\n", textin->Format);
2979 }
2980 else
2981 { /* one format specified */
2982 ajDebug("seqRead: one format specified\n");
2983 ajFilebuffSetUnbuffered(textin->Filebuff);
2984
2985 ajDebug("++seqRead known format %d\n", textin->Format);
2986 istat = seqReadFmt(thys, seqin, textin->Format);
2987
2988 switch(istat)
2989 {
2990 case FMT_OK:
2991 ajDebug("++seqRead OK (2), set format %d\n",
2992 textin->Format);
2993 seqDefine(thys, seqin);
2994
2995 return ajTrue;
2996 case FMT_BADTYPE:
2997 ajDebug("seqRead: (a2) seqReadFmt stat == BADTYPE *failed*\n");
2998
2999 return ajFalse;
3000
3001 case FMT_FAIL:
3002 ajDebug("seqRead: (b2) seqReadFmt stat == FAIL *failed*\n");
3003
3004 return ajFalse;
3005
3006 case FMT_NOMATCH:
3007 ajDebug("seqRead: (c2) seqReadFmt stat == NOMATCH *try again*\n");
3008 break;
3009 case FMT_EOF:
3010 ajDebug("seqRead: (d2) seqReadFmt stat == EOF *try again*\n");
3011 if(ajTextinGetRecords(textin))
3012 ajErr("Error reading file '%F' with format '%s': "
3013 "end-of-file before end of data "
3014 "(read %u records, total %u)",
3015 ajFilebuffGetFile(textin->Filebuff),
3016 seqinFormatDef[textin->Format].Name,
3017 ajTextinGetRecords(textin),
3018 ajTextinGetTotrecords(textin));
3019 break; /* simply end-of-file */
3020 case FMT_EMPTY:
3021 ajWarn("Sequence '%S' has zero length, ignored",
3022 ajSeqGetUsaS(thys));
3023 ajDebug("seqRead: (e2) seqReadFmt stat == EMPTY *try again*\n");
3024 break;
3025 default:
3026 ajDebug("unknown code %d from seqReadFmt\n", stat);
3027 }
3028
3029 ajSeqClear(thys); /* 1 : read, failed to match id/acc/query */
3030 }
3031
3032 /* failed - probably entry/accession query failed. Can we try again? */
3033
3034 ajDebug("seqRead failed - try again with format %d '%s' code %d\n",
3035 textin->Format, seqinFormatDef[textin->Format].Name, istat);
3036
3037 ajDebug("Search:%B Chunk:%B Data:%x ajFileBuffEmpty:%B\n",
3038 textin->Search, textin->ChunkEntries,
3039 seqin->SeqData, ajFilebuffIsEmpty(buff));
3040
3041 if(ajFilebuffIsEmpty(buff) && textin->ChunkEntries)
3042 {
3043 if(textaccess && !(*textaccess->Access)(textin))
3044 return ajFalse;
3045 else if(seqaccess && !(*seqaccess->Access)(seqin))
3046 return ajFalse;
3047 buff = textin->Filebuff;
3048 }
3049
3050
3051 /* need to check end-of-file to avoid repeats */
3052 while(textin->Search &&
3053 (textin->TextData || !ajFilebuffIsEmpty(buff)))
3054 {
3055 jstat = seqReadFmt(thys, seqin, textin->Format);
3056
3057 switch(jstat)
3058 {
3059 case FMT_OK:
3060 ajDebug("++seqRead OK (3), set format %d\n",
3061 textin->Format);
3062 seqDefine(thys, seqin);
3063
3064 return ajTrue;
3065
3066 case FMT_BADTYPE:
3067 ajDebug("seqRead: (a3) seqReadFmt stat == BADTYPE *failed*\n");
3068
3069 return ajFalse;
3070
3071 case FMT_FAIL:
3072 ajDebug("seqRead: (b3) seqReadFmt stat == FAIL *failed*\n");
3073
3074 return ajFalse;
3075
3076 case FMT_NOMATCH:
3077 ajDebug("seqRead: (c3) seqReadFmt stat == NOMATCH *try again*\n");
3078 break;
3079 case FMT_EOF:
3080 ajDebug("seqRead: (d3) seqReadFmt stat == EOF *failed*\n");
3081
3082 return ajFalse; /* we already tried again */
3083
3084 case FMT_EMPTY:
3085 if(istat != FMT_EMPTY)
3086 ajWarn("Sequence '%S' has zero length, ignored",
3087 ajSeqGetUsaS(thys));
3088 ajDebug("seqRead: (e3) seqReadFmt stat == EMPTY *try again*\n");
3089 break;
3090
3091 default:
3092 ajDebug("unknown code %d from seqReadFmt\n", stat);
3093 }
3094
3095 ajSeqClear(thys); /* 1 : read, failed to match id/acc/query */
3096 }
3097
3098 if(seqin->Input->Format)
3099 ajDebug("seqRead: *failed* to read sequence %S using format %s\n",
3100 textin->Qry, seqinFormatDef[textin->Format].Name);
3101 else
3102 ajDebug("seqRead: *failed* to read sequence %S using any format\n",
3103 textin->Qry);
3104
3105 return ajFalse;
3106 }
3107
3108
3109
3110
3111 /* @funcstatic seqReadFasta ***************************************************
3112 **
3113 ** Given data in a sequence structure, tries to read everything needed
3114 ** using the FASTA format.
3115 **
3116 ** @param [w] thys [AjPSeq] Sequence object
3117 ** @param [u] seqin [AjPSeqin] Sequence input object
3118 ** @return [AjBool] ajTrue on success
3119 **
3120 ** @release 1.0.0
3121 ** @@
3122 ******************************************************************************/
3123
seqReadFasta(AjPSeq thys,AjPSeqin seqin)3124 static AjBool seqReadFasta(AjPSeq thys, AjPSeqin seqin)
3125 {
3126 AjPFilebuff buff;
3127 AjPStr id = NULL;
3128 AjPStr acc = NULL;
3129 AjPStr sv = NULL;
3130 AjPStr desc = NULL;
3131
3132 const char *cp;
3133 ajlong fpos = 0;
3134 ajlong fposb = 0;
3135 AjBool ok = ajTrue;
3136 AjPStr tmpline = NULL;
3137 const AjPStr badstr = NULL;
3138
3139 ajDebug("seqReadFasta\n");
3140
3141 buff = seqin->Input->Filebuff;
3142
3143 /* ajFilebuffTrace(buff); */
3144
3145 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3146 if(!ok)
3147 return ajFalse;
3148
3149 fpos = ajTextinGetFpos(seqin->Input);
3150
3151 ajDebug("First line: %S\n", seqReadLine);
3152
3153 /* If ; then it is really PIR format */
3154 if(ajStrGetCharPos(seqReadLine, 3) == ';')
3155 {
3156 ajStrAssignSubS(&tmpline,seqReadLine, 4, -1);
3157 ajFmtPrintS(&seqReadLine, ">%S",tmpline);
3158 ajDebug("PIR format changed line to %S\n", seqReadLine);
3159 ajStrDel(&tmpline);
3160 }
3161
3162 cp = ajStrGetPtr(seqReadLine);
3163
3164 if(*cp != '>')
3165 {
3166 ajDebug("first line is not FASTA\n");
3167 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3168
3169 return ajFalse;
3170 }
3171
3172 if(!ajSeqParseFasta(seqReadLine, &id, &acc, &sv, &desc))
3173 {
3174 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3175
3176 return ajFalse;
3177 }
3178
3179 /* we know we will succeed from here ... no way to return ajFalse */
3180
3181 ajFilebuffSetUnbuffered(buff);
3182
3183 seqSetNameNospace(&thys->Name, id);
3184
3185 if(ajStrGetLen(sv))
3186 seqSvSave(thys, sv);
3187
3188 if(ajStrGetLen(acc))
3189 seqAccSave(thys, acc);
3190
3191 ajStrAssignS(&thys->Desc, desc);
3192 ajStrDel(&id);
3193 ajStrDel(&acc);
3194 ajStrDel(&sv);
3195 ajStrDel(&desc);
3196
3197 if(ajStrGetLen(seqin->Inseq))
3198 { /* we have a sequence to use */
3199 ajDebug("++fasta use Inseq '%S'\n", seqin->Inseq);
3200 ajStrAssignS(&thys->Seq, seqin->Inseq);
3201 if(seqin->Input->Text)
3202 seqTextSeq(&thys->TextPtr, seqin->Inseq);
3203
3204 ajFilebuffClear(buff, 0);
3205 }
3206 else
3207 {
3208 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3209 while(ok && !ajStrPrefixC(seqReadLine, ">"))
3210 {
3211 badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3212 seqin->Input->Format);
3213
3214 if(badstr)
3215 ajWarn("Sequence '%S' has bad character(s) '%S'",
3216 thys->Name, badstr);
3217
3218 ajDebug("++fasta append line '%S'\n", seqReadLine);
3219 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3220 }
3221
3222 if(ok)
3223 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3224 else
3225 ajFilebuffClear(buff, 0);
3226 }
3227
3228 thys->Fpos = fpos;
3229
3230 ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);
3231
3232 return ajTrue;
3233 }
3234
3235
3236
3237
3238 /* @funcstatic seqReadFastq ***************************************************
3239 **
3240 ** Given data in a sequence structure, tries to read everything needed
3241 ** using the FASTQ format, but ignores quality values.
3242 **
3243 ** See the more specific fastq formats for parsers that read and process
3244 ** the quality scores.
3245 **
3246 ** @param [w] thys [AjPSeq] Sequence object
3247 ** @param [u] seqin [AjPSeqin] Sequence input object
3248 ** @return [AjBool] ajTrue on success
3249 **
3250 ** @release 6.1.0
3251 ** @@
3252 ******************************************************************************/
3253
seqReadFastq(AjPSeq thys,AjPSeqin seqin)3254 static AjBool seqReadFastq(AjPSeq thys, AjPSeqin seqin)
3255 {
3256 AjPFilebuff buff;
3257 AjPStr id = NULL;
3258 AjPStr acc = NULL;
3259 AjPStr sv = NULL;
3260 AjPStr desc = NULL;
3261
3262 ajuint seqlen = 0;
3263 /*AjPStr qualstr = NULL;*/
3264 char minqual;
3265 char maxqual;
3266 char comqual;
3267
3268 const char *cp;
3269 ajlong fpos = 0;
3270 ajlong fposb = 0;
3271 AjBool ok = ajTrue;
3272 const AjPStr badstr = NULL;
3273
3274 ajDebug("seqReadFastq\n");
3275
3276 buff = seqin->Input->Filebuff;
3277
3278 /* ajFilebuffTrace(buff); */
3279
3280 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3281 if(!ok)
3282 return ajFalse;
3283
3284 fpos = ajTextinGetFpos(seqin->Input);
3285
3286 ajDebug("First line: %S\n", seqReadLine);
3287
3288 cp = ajStrGetPtr(seqReadLine);
3289
3290 if(*cp != '@')
3291 {
3292 ajDebug("first line is not FASTQ\n");
3293 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3294
3295 return ajFalse;
3296 }
3297
3298 if(!ajSeqParseFastq(seqReadLine, &id, &desc))
3299 {
3300 ajDebug("first line did not parse as FASTQ\n");
3301 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3302
3303 return ajFalse;
3304 }
3305
3306 seqSetNameNospace(&thys->Name, id);
3307
3308 if(ajStrGetLen(sv))
3309 seqSvSave(thys, sv);
3310
3311 if(ajStrGetLen(acc))
3312 seqAccSave(thys, acc);
3313
3314 ajStrAssignS(&thys->Desc, desc);
3315 ajStrDel(&id);
3316 ajStrDel(&acc);
3317 ajStrDel(&sv);
3318 ajStrDel(&desc);
3319
3320 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3321 while(ok &&
3322 !ajStrPrefixC(seqReadLine, "+"))
3323 {
3324 badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3325 seqin->Input->Format);
3326
3327 if(badstr)
3328 ajWarn("Sequence '%S' has bad character(s) '%S'",
3329 thys->Name, badstr);
3330
3331 ajDebug("++fastq append line '%S'\n", seqReadLine);
3332 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3333 ajDebug("++fastq sequence %4u '%S'\n",
3334 ajStrGetLen(thys->Seq), thys->Seq);
3335 }
3336
3337 if(!ok)
3338 {
3339 ajDebug("failed to find quality scores\n");
3340 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3341
3342 return ajFalse;
3343 }
3344
3345 seqlen = ajStrGetLen(thys->Seq);
3346
3347 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3348
3349 ajStrAssignClear(&seqQualStr);
3350
3351 while(ok &&
3352 ((ajStrGetLen(seqQualStr) < seqlen) ||
3353 ajStrGetCharFirst(seqReadLine) != '@'))
3354 {
3355 seqqualAppendWarn(&seqQualStr, seqReadLine);
3356
3357 ajDebug("++fastq append qualities '%S'\n", seqReadLine);
3358 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3359 ajDebug("++fastq qualities %3u '%S'\n",
3360 ajStrGetLen(seqQualStr), seqQualStr);
3361 }
3362
3363 minqual = ajStrGetAsciiLow(seqQualStr);
3364 maxqual = ajStrGetAsciiHigh(seqQualStr);
3365 comqual = ajStrGetAsciiCommon(seqQualStr);
3366
3367 if(ajStrGetLen(seqQualStr) != seqlen)
3368 {
3369 ajDebug("length mismatch seq: %u quality: %u\n",
3370 seqlen, ajStrGetLen(seqQualStr));
3371 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3372
3373 return ajFalse;
3374 }
3375
3376 if(ok)
3377 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3378 else
3379 ajFilebuffClear(buff, 0);
3380
3381 thys->Fpos = fpos;
3382
3383 ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);
3384
3385 ajDebug("quality characters %d..%d (%d) '%c' '%c' (%c)\n",
3386 (int) minqual, (int) maxqual, (int) comqual,
3387 minqual, maxqual, comqual);
3388
3389 return ajTrue;
3390 }
3391
3392
3393
3394
3395 /* @funcstatic seqReadFastqSanger *********************************************
3396 **
3397 ** Given data in a sequence structure, tries to read everything needed
3398 ** using the FASTQ format, and interprets Sanger (phred) scores.
3399 **
3400 ** @param [w] thys [AjPSeq] Sequence object
3401 ** @param [u] seqin [AjPSeqin] Sequence input object
3402 ** @return [AjBool] ajTrue on success
3403 **
3404 ** @release 6.1.0
3405 ** @@
3406 ******************************************************************************/
3407
seqReadFastqSanger(AjPSeq thys,AjPSeqin seqin)3408 static AjBool seqReadFastqSanger(AjPSeq thys, AjPSeqin seqin)
3409 {
3410 AjPFilebuff buff;
3411 AjPStr id = NULL;
3412 AjPStr acc = NULL;
3413 AjPStr sv = NULL;
3414 AjPStr desc = NULL;
3415
3416 ajuint seqlen = 0;
3417
3418 /*
3419 ** char minqual;
3420 ** char maxqual;
3421 ** char comqual;
3422 */
3423
3424 const char *cp;
3425 ajint iqual;
3426 ajlong fpos = 0;
3427 AjBool ok = ajTrue;
3428 const AjPStr badstr = NULL;
3429
3430 /* ajint amin = 0; */
3431 ajint qmin = 33;
3432 ajint qmax = 126;
3433 ajuint i;
3434 ajuint cntseq = 0;
3435 ajuint cntqual = 0;
3436 ajuint cntnewline = 0;
3437
3438 /* ajDebug("seqReadFastqSanger\n"); */
3439
3440 buff = seqin->Input->Filebuff;
3441
3442 /* ajFilebuffTrace(buff); */
3443
3444 ok = ajTextinStoreReadline(seqin->Input, &seqSaveLine, &thys->TextPtr);
3445 if(!ok)
3446 return ajFalse;
3447
3448 fpos = ajTextinGetFpos(seqin->Input);
3449
3450 /* ajDebug("First line: %S\n", seqSaveLine); */
3451
3452 cp = MAJSTRGETPTR(seqSaveLine);
3453
3454 if(*cp != '@')
3455 {
3456 /* ajDebug("first line is not FASTQ\n"); */
3457 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3458
3459 return ajFalse;
3460 }
3461
3462 if(!ajSeqParseFastq(seqSaveLine, &id, &desc))
3463 {
3464 /* ajDebug("first line did not parse as FASTQ\n"); */
3465 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3466
3467 return ajFalse;
3468 }
3469
3470 seqSetNameNospace(&thys->Name, id);
3471
3472 if(MAJSTRGETLEN(sv))
3473 seqSvSave(thys, sv);
3474
3475 if(MAJSTRGETLEN(acc))
3476 seqAccSave(thys, acc);
3477
3478 ajStrAssignS(&thys->Desc, desc);
3479 ajStrDel(&id);
3480 ajStrDel(&acc);
3481 ajStrDel(&sv);
3482 ajStrDel(&desc);
3483
3484 i = MAJSTRGETLEN(seqSaveLine) - 1;
3485 while(ajStrGetCharPos(seqSaveLine, i) == '\n' ||
3486 ajStrGetCharPos(seqSaveLine, i) == '\r')
3487 {
3488 cntnewline++;
3489 i--;
3490 }
3491
3492 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3493 while(ok &&
3494 ajStrGetCharFirst(seqReadLine) != '+')
3495 {
3496 cntseq += MAJSTRGETLEN(seqReadLine) - cntnewline;
3497 badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3498 seqin->Input->Format);
3499
3500 if(badstr)
3501 ajWarn("Sequence '%S' has bad character(s) '%S'",
3502 thys->Name, badstr);
3503
3504 /* ajDebug("++fastq append line '%S'\n", seqReadLine); */
3505 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3506 /* ajDebug("++fastq sequence %4u '%S'\n",
3507 ajStrGetLen(thys->Seq), thys->Seq); */
3508 }
3509
3510 if(!ok)
3511 {
3512 /* ajDebug("failed to find quality scores\n"); */
3513 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3514
3515 return ajFalse;
3516 }
3517 if(MAJSTRGETLEN(seqReadLine) > (cntnewline+1))
3518 {
3519 ajStrPasteCountK(&seqReadLine, 0,'@', 1);
3520 if(!ajStrMatchS(seqReadLine, seqSaveLine))
3521 {
3522 ajStrPasteCountK(&seqReadLine, 0,'+', 1);
3523 ajWarn("Mismatch in file '%F' + line "
3524 "does not match first line '%.*S' '%.*S'",
3525 ajFilebuffGetFile(buff),
3526 (ajuint)(MAJSTRGETLEN(seqSaveLine) - cntnewline),
3527 seqSaveLine,
3528 (ajuint) (MAJSTRGETLEN(seqReadLine) - cntnewline),
3529 seqReadLine);
3530 }
3531 }
3532
3533 seqlen = MAJSTRGETLEN(thys->Seq);
3534
3535 if(seqlen < cntseq)
3536 {
3537 ajWarn("FASTQ format '%F' sequence '%S' "
3538 "sequence skipped %u character(s)",
3539 ajFilebuffGetFile(buff), thys->Name, cntseq - seqlen);
3540 }
3541 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3542
3543 ajStrAssignClear(&seqQualStr);
3544 ajStrAssignClear(&seqSaveLine2);
3545
3546 while(ok &&
3547 ((MAJSTRGETLEN(seqQualStr) < seqlen) ||
3548 ajStrGetCharFirst(seqReadLine) != '@'))
3549 {
3550 if((ajStrGetCharFirst(seqReadLine) == '@') &&
3551 !MAJSTRGETLEN(seqSaveLine2))
3552 ajStrAssignS(&seqSaveLine2, seqReadLine);
3553
3554 cntqual += MAJSTRGETLEN(seqReadLine) - cntnewline;
3555 seqqualAppendWarn(&seqQualStr, seqReadLine);
3556
3557 /* ajDebug("++fastq append qualities '%S'\n", seqReadLine); */
3558 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3559 /* ajDebug("++fastq qualities %3u '%S'\n",
3560 ajStrGetLen(seqQualStr), seqQualStr); */
3561 }
3562
3563 /*
3564 ** minqual = ajStrGetAsciiLow(seqQualStr);
3565 ** maxqual = ajStrGetAsciiHigh(seqQualStr);
3566 ** comqual = ajStrGetAsciiCommon(seqQualStr);
3567 */
3568
3569 if(MAJSTRGETLEN(seqQualStr) != seqlen)
3570 {
3571 ajWarn("FASTQ quality length mismatch '%F' '%S' "
3572 "expected: %u found: %u",
3573 ajFilebuffGetFile(buff), thys->Name,
3574 seqlen, ajStrGetLen(seqQualStr));
3575 if((MAJSTRGETLEN(seqQualStr) > seqlen) &&
3576 MAJSTRGETLEN(seqSaveLine2))
3577 {
3578 ajStrTrimEndC(&seqSaveLine2, "\n\r");
3579 ajWarn("(Possible short quality record before '%S')",
3580 seqSaveLine2);
3581 }
3582 }
3583 if(MAJSTRGETLEN(seqQualStr) < cntqual)
3584 {
3585 ajWarn("FASTQ format '%F' sequence '%S' "
3586 "quality skipped %u character(s)",
3587 ajFilebuffGetFile(buff), thys->Name,
3588 cntqual - MAJSTRGETLEN(seqQualStr));
3589 }
3590
3591
3592 if(ok)
3593 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3594 else
3595 ajFilebuffClear(buff, 0);
3596
3597 thys->Fpos = fpos;
3598
3599 /* ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb); */
3600
3601 /* ajDebug("Sanger: %d..%d (%d)\n",
3602 (ajint) minqual, (ajint) maxqual, (ajint) comqual); */
3603
3604 cp = MAJSTRGETPTR(seqQualStr);
3605 i=0;
3606
3607 if(seqlen > thys->Qualsize)
3608 AJCRESIZE(thys->Accuracy, seqlen);
3609
3610 thys->Qualsize = seqlen;
3611
3612 if(MAJSTRGETLEN(seqQualStr) > thys->Qualsize)
3613 AJCRESIZE(thys->Accuracy, MAJSTRGETLEN(seqQualStr));
3614
3615 /*
3616 ** Sanger uses Phred quality calculated from error probability p
3617 ** Qp = -10 log (p)
3618 **
3619 ** For Sanger (phred) p = 1 / 10**(Q/10)
3620 ** 10: p=0.1 20: p=0.01 etc.
3621 */
3622
3623 while (*cp)
3624 {
3625 iqual = *cp++;
3626 if(iqual < qmin)
3627 {
3628 ajWarn("FASTQ-SANGER '%F' sequence '%S' "
3629 "quality value '%c' too low",
3630 ajFilebuffGetFile(buff), thys->Name,
3631 (char) iqual);
3632 iqual = qmin;
3633 }
3634 if(iqual > qmax)
3635 {
3636 ajWarn("FASTQ-SANGER '%F' sequence '%S' "
3637 "quality value '%c' too high",
3638 ajFilebuffGetFile(buff), thys->Name,
3639 (char) iqual);
3640 iqual = qmax;
3641 }
3642 thys->Accuracy[i++] = seqQualPhred[iqual];
3643 }
3644
3645
3646 /*
3647 ** ajDebug("quality characters %d..%d (%d) '%c' '%c' (%c) "
3648 ** "scores %d..%d (%d)\n",
3649 ** (int) minqual, (int) maxqual, (int) comqual,
3650 ** minqual, maxqual, comqual,
3651 ** (amin + minqual - qmin), (amin + maxqual - qmin),
3652 ** (amin + comqual - qmin));
3653 */
3654
3655 ajStrAssignClear(&seqQualStr);
3656
3657 return ajTrue;
3658 }
3659
3660
3661
3662
3663 /* #funcstatic seqReadFastqInt ************************************************
3664 **
3665 ** Given data in a sequence structure, tries to read everything needed
3666 ** using the FASTQ numeric format, and interprets integer Solexa scores.
3667 **
3668 ** #param [w] thys [AjPSeq] Sequence object
3669 ** #param [u] seqin [AjPSeqin] Sequence input object
3670 ** #return [AjBool] ajTrue on success
3671 ** ##
3672 ******************************************************************************/
3673
3674 /*
3675 //static AjBool seqReadFastqInt(AjPSeq thys, AjPSeqin seqin)
3676 //{
3677 // AjPFilebuff buff;
3678 // AjPStr id = NULL;
3679 // AjPStr acc = NULL;
3680 // AjPStr sv = NULL;
3681 // AjPStr desc = NULL;
3682 //
3683 // ajuint seqlen = 0;
3684 // AjPStr qualstr = NULL;
3685 //
3686 // const char *cp;
3687 // ajlong fpos = 0;
3688 // ajlong fposb = 0;
3689 // AjBool ok = ajTrue;
3690 //
3691 // const AjPStr badstr = NULL;
3692 //
3693 // ajuint i;
3694 // AjBool badwarn = ajFalse;
3695 // double sval;
3696 // double pval;
3697 // double qval;
3698 //
3699 // ajDebug("seqReadFastqInt\n");
3700 //
3701 // buff = seqin->Input->Filebuff;
3702 //
3703 // ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3704 // if(!ok)
3705 // return ajFalse;
3706 //
3707 // fpos = ajTextinGetFpos(seqin->Input);
3708 //
3709 // ajDebug("First line: %S\n", seqReadLine);
3710 //
3711 // cp = ajStrGetPtr(seqReadLine);
3712 //
3713 // if(*cp != '@')
3714 // {
3715 // ajDebug("first line is not FASTQ\n");
3716 // ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3717 //
3718 // return ajFalse;
3719 // }
3720 //
3721 // if(!ajSeqParseFastq(seqReadLine, &id, &desc))
3722 // {
3723 // ajDebug("first line did not parse as FASTQ\n");
3724 // ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3725 //
3726 // return ajFalse;
3727 // }
3728 //
3729 // seqSetNameNospace(&thys->Name, id);
3730 //
3731 // if(ajStrGetLen(sv))
3732 // seqSvSave(thys, sv);
3733 //
3734 // if(ajStrGetLen(acc))
3735 // seqAccSave(thys, acc);
3736 //
3737 // ajStrAssignS(&thys->Desc, desc);
3738 // ajStrDel(&id);
3739 // ajStrDel(&acc);
3740 // ajStrDel(&sv);
3741 // ajStrDel(&desc);
3742 //
3743 // ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3744 // while(ok &&
3745 // !ajStrPrefixC(seqReadLine, "+"))
3746 // {
3747 // badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3748 seqin->Input->Format);
3749 //
3750 // if(badstr)
3751 // ajWarn("Sequence '%S' has bad character(s) '%S'",
3752 // thys->Name, badstr);
3753 //
3754 // ajDebug("++fastq append line '%S'\n", seqReadLine);
3755 // ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3756 // ajDebug("++fastq sequence %4u '%S'\n",
3757 // ajStrGetLen(thys->Seq), thys->Seq);
3758 // }
3759 //
3760 // if(!ok)
3761 // {
3762 // ajDebug("failed to find quality scores\n");
3763 // ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3764 //
3765 // return ajFalse;
3766 // }
3767 //
3768 // seqlen = ajStrGetLen(thys->Seq);
3769 //
3770 // ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3771 //
3772 // i=0;
3773 // if(seqlen > thys->Qualsize)
3774 // {
3775 // AJCRESIZE(thys->Accuracy, seqlen);
3776 // thys->Qualsize = seqlen;
3777 // }
3778 //
3779 // ajStrAssignClear(&seqQualStr);
3780 // while(ok &&
3781 // (!ajStrPrefixC(seqReadLine, "@")))
3782 // {
3783 // ajStrTokenAssignC(&handle, seqReadLine, " ,\n\r\t");
3784 // while(ajStrTokenNextParse(seqHandle, &seqQualStr))
3785 // {
3786 // if(i >= seqlen){
3787 // if(!badwarn)
3788 // ajWarn("Bad quality '%S' for base %d "
3789 // "in fastq-int format\n",
3790 // qualstr, i);
3791 // badwarn = ajTrue;
3792 // }
3793 // else if(!ajStrToDouble(seqQualStr, &sval))
3794 // {
3795 // if(!badwarn)
3796 // ajWarn("Bad quality '%S' for base %d "
3797 // "in fastq-int format\n",
3798 // qualstr, i);
3799 // badwarn = ajTrue;
3800 // i++;
3801 // }
3802 // else
3803 // {
3804 // pval = pow(10.0, (sval / -10.0));
3805 // qval = pval / (1.0 + pval);
3806 // thys->Accuracy[i++] = -10.0 * log10(qval);
3807 // }
3808 // }
3809 //
3810 // ajDebug("++fastq append qualities '%S'\n", seqReadLine);
3811 // ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3812 // }
3813 //
3814 // if(i != seqlen)
3815 // {
3816 // ajWarn("length mismatch seq: %u quality: %u\n",
3817 // seqlen, i);
3818 // ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3819 //
3820 // return ajFalse;
3821 // }
3822 //
3823 // if(ok)
3824 // ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3825 // else
3826 // ajFilebuffClear(buff, 0);
3827 //
3828 // thys->Fpos = fpos;
3829 //
3830 // ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);
3831 //
3832 // ajStrTokenDel(&handle);
3833 //
3834 // return ajTrue;
3835 //}
3836 */
3837
3838
3839
3840
3841 /* @funcstatic seqReadFastqIllumina *******************************************
3842 **
3843 ** Given data in a sequence structure, tries to read everything needed
3844 ** using the FASTQ format, and processes phred quality scores
3845 ** with Illumina encoding.
3846 **
3847 ** @param [w] thys [AjPSeq] Sequence object
3848 ** @param [u] seqin [AjPSeqin] Sequence input object
3849 ** @return [AjBool] ajTrue on success
3850 **
3851 ** @release 6.1.0
3852 ** @@
3853 ******************************************************************************/
3854
seqReadFastqIllumina(AjPSeq thys,AjPSeqin seqin)3855 static AjBool seqReadFastqIllumina(AjPSeq thys, AjPSeqin seqin)
3856 {
3857 AjPFilebuff buff;
3858 AjPStr id = NULL;
3859 AjPStr acc = NULL;
3860 AjPStr sv = NULL;
3861 AjPStr desc = NULL;
3862
3863 ajuint seqlen = 0;
3864 /*AjPStr qualstr = NULL;*/
3865 /*
3866 ** char minqual;
3867 ** char maxqual;
3868 ** char comqual;
3869 */
3870
3871 const char *cp;
3872 ajint iqual;
3873 ajlong fpos = 0;
3874 AjBool ok = ajTrue;
3875 const AjPStr badstr = NULL;
3876
3877 /*ajint amin = 0;*/
3878 ajint qmin = 64;
3879 ajint qmax = 126;
3880 ajuint i;
3881
3882 ajDebug("seqReadFastqIllumina\n");
3883
3884 buff = seqin->Input->Filebuff;
3885
3886 /* ajFilebuffTrace(buff); */
3887
3888 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3889 if(!ok)
3890 return ajFalse;
3891
3892 fpos = ajTextinGetFpos(seqin->Input);
3893
3894 ajDebug("First line: %S\n", seqReadLine);
3895
3896 cp = ajStrGetPtr(seqReadLine);
3897
3898 if(*cp != '@')
3899 {
3900 ajDebug("first line is not FASTQ\n");
3901 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3902
3903 return ajFalse;
3904 }
3905
3906 if(!ajSeqParseFastq(seqReadLine, &id, &desc))
3907 {
3908 ajDebug("first line did not parse as FASTQ\n");
3909 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3910
3911 return ajFalse;
3912 }
3913
3914 seqSetNameNospace(&thys->Name, id);
3915
3916 if(ajStrGetLen(sv))
3917 seqSvSave(thys, sv);
3918
3919 if(ajStrGetLen(acc))
3920 seqAccSave(thys, acc);
3921
3922 ajStrAssignS(&thys->Desc, desc);
3923 ajStrDel(&id);
3924 ajStrDel(&acc);
3925 ajStrDel(&sv);
3926 ajStrDel(&desc);
3927
3928 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3929 while(ok &&
3930 !ajStrPrefixC(seqReadLine, "+"))
3931 {
3932 badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3933 seqin->Input->Format);
3934
3935 if(badstr)
3936 ajWarn("Sequence '%S' has bad character(s) '%S'",
3937 thys->Name, badstr);
3938
3939 ajDebug("++fastq append line '%S'\n", seqReadLine);
3940 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3941 ajDebug("++fastq sequence %4u '%S'\n",
3942 ajStrGetLen(thys->Seq), thys->Seq);
3943 }
3944
3945 if(!ok)
3946 {
3947 ajDebug("failed to find quality scores\n");
3948 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3949
3950 return ajFalse;
3951 }
3952
3953 seqlen = ajStrGetLen(thys->Seq);
3954
3955 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3956
3957 ajStrAssignClear(&seqQualStr);
3958
3959 while(ok &&
3960 ((ajStrGetLen(seqQualStr) < seqlen) ||
3961 ajStrGetCharFirst(seqReadLine) != '@'))
3962 {
3963 seqqualAppendWarn(&seqQualStr, seqReadLine);
3964
3965 ajDebug("++fastq append qualities '%S'\n", seqReadLine);
3966 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3967 ajDebug("++fastq qualities %3u '%S'\n",
3968 ajStrGetLen(seqQualStr), seqQualStr);
3969 }
3970
3971 /*
3972 ** minqual = ajStrGetAsciiLow(seqQualStr);
3973 ** maxqual = ajStrGetAsciiHigh(seqQualStr);
3974 ** comqual = ajStrGetAsciiCommon(seqQualStr);
3975 */
3976
3977 if(ajStrGetLen(seqQualStr) != seqlen)
3978 {
3979 ajDebug("length mismatch seq: %u quality: %u\n",
3980 seqlen, ajStrGetLen(seqQualStr));
3981 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3982
3983 return ajFalse;
3984 }
3985
3986 if(ok)
3987 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3988 else
3989 ajFilebuffClear(buff, 0);
3990
3991 thys->Fpos = fpos;
3992
3993 /*ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);*/
3994
3995 /*ajDebug("Illumina: %d..%d (%d)\n",
3996 (ajint) minqual, (ajint) maxqual, (ajint) comqual);*/
3997
3998 cp = ajStrGetPtr(seqQualStr);
3999 i=0;
4000
4001 if(seqlen > thys->Qualsize)
4002 AJCRESIZE(thys->Accuracy, seqlen);
4003
4004 thys->Qualsize = seqlen;
4005
4006 /*
4007 ** Illumina uses Phred quality calculated from error probability p
4008 ** Qp = -10 log (p)
4009 **
4010 ** For Sanger (phred) p = 1 / 10**(Q/10)
4011 ** 10: p=0.1 20: p=0.01 etc.
4012 */
4013
4014 while (*cp)
4015 {
4016 iqual = *cp++;
4017 if(iqual < qmin)
4018 {
4019 ajWarn("FASTQ-ILLUMINA quality value too low '%F' '%S' '%c'",
4020 ajFilebuffGetFile(buff), thys->Name,
4021 (char) iqual);
4022 iqual = qmin;
4023 }
4024 if(iqual > qmax)
4025 {
4026 ajWarn("FASTQ-ILLUMINA quality value too high '%F' '%S' '%c'",
4027 ajFilebuffGetFile(buff), thys->Name,
4028 (char) iqual);
4029 iqual = qmax;
4030 }
4031 thys->Accuracy[i++] = seqQualIllumina[iqual];
4032 }
4033
4034 /*
4035 ** ajDebug("quality characters %d..%d (%d) '%c' '%c' (%c) "
4036 ** "scores %d..%d (%d)\n",
4037 ** (int) minqual, (int) maxqual, (int) comqual,
4038 ** minqual, maxqual, comqual,
4039 ** (amin + minqual - qmin), (amin + maxqual - qmin),
4040 ** (amin + comqual - qmin));
4041 */
4042
4043 return ajTrue;
4044 }
4045
4046
4047
4048
4049
4050 /* @funcstatic seqReadFastqSolexa *********************************************
4051 **
4052 ** Given data in a sequence structure, tries to read everything needed
4053 ** using the FASTQ format, and processes Illumina/Solexa quality scores.
4054 **
4055 ** @param [w] thys [AjPSeq] Sequence object
4056 ** @param [u] seqin [AjPSeqin] Sequence input object
4057 ** @return [AjBool] ajTrue on success
4058 **
4059 ** @release 6.1.0
4060 ** @@
4061 ******************************************************************************/
4062
seqReadFastqSolexa(AjPSeq thys,AjPSeqin seqin)4063 static AjBool seqReadFastqSolexa(AjPSeq thys, AjPSeqin seqin)
4064 {
4065 AjPFilebuff buff;
4066 AjPStr id = NULL;
4067 AjPStr acc = NULL;
4068 AjPStr sv = NULL;
4069 AjPStr desc = NULL;
4070
4071 ajuint seqlen = 0;
4072 /*AjPStr qualstr = NULL;*/
4073
4074 /*
4075 ** char minqual;
4076 ** char maxqual;
4077 ** char comqual;
4078 */
4079
4080 const char *cp;
4081 ajint iqual;
4082 ajlong fpos = 0;
4083 AjBool ok = ajTrue;
4084 const AjPStr badstr = NULL;
4085
4086 /*ajint amin = 0;*/
4087 ajint qmin = 59;
4088 ajint qmax = 126;
4089 ajuint i;
4090 /*
4091 ** double sval;
4092 ** double pval;
4093 ** double qval;
4094 */
4095
4096 /*ajDebug("seqReadFastqSolexa\n");*/
4097
4098 buff = seqin->Input->Filebuff;
4099
4100 /* ajFilebuffTrace(buff); */
4101
4102 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4103 if(!ok)
4104 return ajFalse;
4105
4106 fpos = ajTextinGetFpos(seqin->Input);
4107
4108 /*ajDebug("First line: %S\n", seqReadLine);*/
4109
4110 cp = ajStrGetPtr(seqReadLine);
4111
4112 if(*cp != '@')
4113 {
4114 ajDebug("first line is not FASTQ\n");
4115 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4116
4117 return ajFalse;
4118 }
4119
4120 if(!ajSeqParseFastq(seqReadLine, &id, &desc))
4121 {
4122 ajDebug("first line did not parse as FASTQ\n");
4123 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4124
4125 return ajFalse;
4126 }
4127
4128 seqSetNameNospace(&thys->Name, id);
4129
4130 if(ajStrGetLen(sv))
4131 seqSvSave(thys, sv);
4132
4133 if(ajStrGetLen(acc))
4134 seqAccSave(thys, acc);
4135
4136 ajStrAssignS(&thys->Desc, desc);
4137 ajStrDel(&id);
4138 ajStrDel(&acc);
4139 ajStrDel(&sv);
4140 ajStrDel(&desc);
4141
4142 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4143 while(ok &&
4144 !ajStrPrefixC(seqReadLine, "+"))
4145 {
4146 badstr = seqAppendWarn(&thys->Seq, seqReadLine,
4147 seqin->Input->Format);
4148
4149 if(badstr)
4150 ajWarn("Sequence '%S' has bad character(s) '%S'",
4151 thys->Name, badstr);
4152
4153 ajDebug("++fastq append line '%S'\n", seqReadLine);
4154 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4155 ajDebug("++fastq sequence %4u '%S'\n",
4156 ajStrGetLen(thys->Seq), thys->Seq);
4157 }
4158
4159 if(!ok)
4160 {
4161 ajDebug("failed to find quality scores\n");
4162 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4163
4164 return ajFalse;
4165 }
4166
4167 seqlen = ajStrGetLen(thys->Seq);
4168
4169 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4170
4171 ajStrAssignClear(&seqQualStr);
4172
4173 while(ok &&
4174 ((ajStrGetLen(seqQualStr) < seqlen) ||
4175 ajStrGetCharFirst(seqReadLine) != '@'))
4176 {
4177 seqqualAppendWarn(&seqQualStr, seqReadLine);
4178
4179 ajDebug("++fastq append qualities '%S'\n", seqReadLine);
4180 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4181 ajDebug("++fastq qualities %3u '%S'\n",
4182 ajStrGetLen(seqQualStr), seqQualStr);
4183 }
4184
4185 /*
4186 ** minqual = ajStrGetAsciiLow(seqQualStr);
4187 ** maxqual = ajStrGetAsciiHigh(seqQualStr);
4188 ** comqual = ajStrGetAsciiCommon(seqQualStr);
4189 */
4190
4191 if(ajStrGetLen(seqQualStr) != seqlen)
4192 {
4193 ajDebug("length mismatch seq: %u quality: %u\n",
4194 seqlen, ajStrGetLen(seqQualStr));
4195 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4196
4197 return ajFalse;
4198 }
4199
4200 if(ok)
4201 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4202 else
4203 ajFilebuffClear(buff, 0);
4204
4205 thys->Fpos = fpos;
4206
4207 /*ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);*/
4208
4209 /*ajDebug("Solexa: %d..%d (%d)\n",
4210 (ajint) minqual, (ajint) maxqual, (ajint) comqual);*/
4211
4212 cp = ajStrGetPtr(seqQualStr);
4213 i=0;
4214
4215 if(seqlen > thys->Qualsize)
4216 AJCRESIZE(thys->Accuracy, seqlen);
4217
4218 thys->Qualsize = seqlen;
4219
4220 /*
4221 ** Sanger uses Phred quality calculated from error probability p
4222 ** Qp = -10 log (p)
4223 ** Solexa adjusts for the probability of error
4224 ** Qs = -10 log ((p/(1-p))
4225 **
4226 ** For Sanger (phred) p = 1 / 10**(Q/10)
4227 ** 10: p=0.1 20: p=0.01 etc.
4228 **
4229 ** For Solexa (Illumina) ps = p / (1+p) where p is the phred probability
4230 ** calculation which we use as an intermediate value
4231 */
4232
4233 while (*cp)
4234 {
4235 iqual = *cp++;
4236 if(iqual < qmin)
4237 {
4238 ajWarn("FASTQ-SOLEXA quality value too low '%F' '%S' '%c'",
4239 ajFilebuffGetFile(buff), thys->Name,
4240 (char) iqual);
4241 iqual = qmin;
4242 }
4243 if(iqual > qmax)
4244 {
4245 ajWarn("FASTQ-SOLEXA quality value too high '%F' '%S' '%c'",
4246 ajFilebuffGetFile(buff), thys->Name,
4247 (char) iqual);
4248 iqual = qmax;
4249 }
4250 thys->Accuracy[i++] = (float) seqQualSolexa[iqual];
4251 }
4252 /*
4253 ** ajDebug("quality characters %d..%d (%d) '%c' '%c' (%c) "
4254 ** "scores %d..%d (%d)\n",
4255 ** (int) minqual, (int) maxqual, (int) comqual,
4256 ** minqual, maxqual, comqual,
4257 ** (amin + minqual - qmin), (amin + maxqual - qmin),
4258 ** (amin + comqual - qmin));
4259 */
4260
4261 return ajTrue;
4262 }
4263
4264
4265
4266
4267 /* @funcstatic seqReadDbId ****************************************************
4268 **
4269 ** Given data in a sequence structure, tries to read everything needed
4270 ** using the FASTA >db id format.
4271 **
4272 ** @param [w] thys [AjPSeq] Sequence object
4273 ** @param [u] seqin [AjPSeqin] Sequence input object
4274 ** @return [AjBool] ajTrue on success
4275 **
4276 ** @release 1.0.0
4277 ** @@
4278 ******************************************************************************/
4279
seqReadDbId(AjPSeq thys,AjPSeqin seqin)4280 static AjBool seqReadDbId(AjPSeq thys, AjPSeqin seqin)
4281 {
4282 AjPFilebuff buff;
4283
4284 const char *cp;
4285 const AjPStr vacc = NULL;
4286 ajlong fpos = 0;
4287 ajlong fposb = 0;
4288 AjBool ok = ajTrue;
4289
4290 ajDebug("seqReadDbId\n");
4291
4292 buff = seqin->Input->Filebuff;
4293 /* ajFilebuffTrace(buff); */
4294
4295 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4296 if(!ok)
4297 return ajFalse;
4298
4299 fpos = ajTextinGetFpos(seqin->Input);
4300
4301 /* If ; then it is really PIR format */
4302 if(ajStrGetCharPos(seqReadLine, 3) == ';')
4303 return ajFalse;
4304
4305 cp = ajStrGetPtr(seqReadLine);
4306
4307 if(*cp != '>')
4308 {
4309 ajDebug("first line is not FASTA\n");
4310 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4311
4312 return ajFalse;
4313 }
4314
4315 ajStrTokenAssignC(&seqHandle, seqReadLine, "> ");
4316 ajStrTokenStepC(seqHandle, " \t\n\r");
4317 ajStrTokenNextParseC(seqHandle, " \t\n\r", &seqToken);
4318 seqSetName(thys, seqToken);
4319
4320 ajStrTokenNextParse(seqHandle, &seqToken);
4321
4322 vacc = ajSeqtestIsSeqversion(seqToken);
4323 if(vacc)
4324 {
4325 seqSvSave(thys, seqToken);
4326 seqAccSave(thys, vacc);
4327 ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc);
4328 }
4329 else if(ajSeqtestIsAccession(seqToken))
4330 {
4331 seqAccSave(thys, seqToken);
4332 ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc);
4333 }
4334 else
4335 {
4336 ajStrAssignS(&thys->Desc, seqToken);
4337
4338 if(ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken))
4339 {
4340 ajStrAppendC(&thys->Desc, " ");
4341 ajStrAppendS(&thys->Desc, seqToken);
4342 }
4343 }
4344
4345 ajStrDelStatic(&seqToken);
4346 ajStrTokenReset(seqHandle);
4347
4348 if(ajStrGetLen(seqin->Inseq))
4349 { /* we have a sequence to use */
4350 ajStrAssignS(&thys->Seq, seqin->Inseq);
4351
4352 if(seqin->Input->Text)
4353 seqTextSeq(&thys->TextPtr, seqin->Inseq);
4354
4355 ajFilebuffClear(buff, 0);
4356 }
4357 else
4358 {
4359 /* we know we will succeed from here ... no way to return ajFalse */
4360
4361 ajFilebuffSetUnbuffered(buff);
4362
4363 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4364 while(ok && !ajStrPrefixC(seqReadLine, ">"))
4365 {
4366 seqAppend(&thys->Seq, seqReadLine);
4367
4368 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4369 }
4370
4371 if(ok)
4372 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4373 else
4374 ajFilebuffClear(buff, 0);
4375 }
4376
4377 thys->Fpos = fpos;
4378
4379 ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);
4380
4381 return ajTrue;
4382 }
4383
4384
4385
4386
4387 /* @funcstatic seqReadGde *****************************************************
4388 **
4389 ** Given data in a sequence structure, tries to read everything needed
4390 ** using the GDE format
4391 **
4392 ** @param [w] thys [AjPSeq] Sequence object
4393 ** @param [u] seqin [AjPSeqin] Sequence input object
4394 ** @return [AjBool] ajTrue on success
4395 **
4396 ** @release 6.6.0
4397 ** @@
4398 ******************************************************************************/
4399
seqReadGde(AjPSeq thys,AjPSeqin seqin)4400 static AjBool seqReadGde(AjPSeq thys, AjPSeqin seqin)
4401 {
4402 AjPFilebuff buff;
4403
4404 const char *cp;
4405 AjBool ok = ajTrue;
4406 const AjPStr badstr = NULL;
4407 ajlong fpos;
4408
4409 ajDebug("seqReadGde\n");
4410
4411 buff = seqin->Input->Filebuff;
4412
4413 /* ajFilebuffTrace(buff); */
4414
4415 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4416 if(!ok)
4417 return ajFalse;
4418
4419 fpos = ajTextinGetFpos(seqin->Input);
4420
4421 ajDebug("First line: %S\n", seqReadLine);
4422
4423 cp = ajStrGetPtr(seqReadLine);
4424
4425 if(*cp != '#')
4426 {
4427 ajDebug("first line is not GDE\n");
4428 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4429
4430 return ajFalse;
4431 }
4432
4433 ajStrCutStart(&seqReadLine, 1);
4434
4435 ajStrTokenAssign(&seqHandle, seqReadLine);
4436 ajStrTokenNextParse(seqHandle, &seqToken);
4437
4438 seqSetNameNospace(&thys->Name, seqToken);
4439
4440 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4441 while(ok &&
4442 !ajStrPrefixC(seqReadLine, "#"))
4443 {
4444 badstr = seqAppendWarn(&thys->Seq, seqReadLine,
4445 seqin->Input->Format);
4446
4447 if(badstr)
4448 ajWarn("Sequence '%S' has bad character(s) '%S'",
4449 thys->Name, badstr);
4450
4451 ajDebug("++fastq append line '%S'\n", seqReadLine);
4452 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4453 ajDebug("++fastq sequence %4u '%S'\n",
4454 ajStrGetLen(thys->Seq), thys->Seq);
4455 }
4456
4457 if(ok)
4458 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4459 else
4460 ajFilebuffClear(buff, 0);
4461
4462 thys->Fpos = fpos;
4463
4464 return ajTrue;
4465 }
4466
4467
4468
4469
4470 /* @funcstatic seqReadNbrf ****************************************************
4471 **
4472 ** Given data in a sequence structure, tries to read everything needed
4473 ** using NBRF format.
4474 **
4475 ** @param [w] thys [AjPSeq] Sequence object
4476 ** @param [u] seqin [AjPSeqin] Sequence input object
4477 ** @return [AjBool] ajTrue on success
4478 **
4479 ** @release 1.0.0
4480 ** @@
4481 ******************************************************************************/
4482
seqReadNbrf(AjPSeq thys,AjPSeqin seqin)4483 static AjBool seqReadNbrf(AjPSeq thys, AjPSeqin seqin)
4484 {
4485 AjPStr idline = NULL;
4486 AjPStr tmpline = NULL;
4487
4488 AjBool dofeat = ajFalse;
4489 AjBool tryfeat = ajFalse;
4490 AjPStr seqReadLine2 = NULL;
4491
4492 AjBool ok;
4493 AjPFilebuff buff;
4494 AjBool skipheader;
4495
4496 ajDebug("seqReadNbrf\n");
4497
4498 buff = seqin->Input->Filebuff;
4499
4500 if(!seqToken2)
4501 {
4502 seqToken2 = ajStrNew();
4503 seqReadLine2 = ajStrNew();
4504 }
4505
4506 if(!seqFtFmtPir)
4507 ajStrAssignC(&seqFtFmtPir, "pir");
4508
4509 if(!seqRegNbrfId)
4510 seqRegNbrfId = ajRegCompC("^>(..)[>;]([^ \t\n]+)");
4511
4512 skipheader = ajTrue;
4513 while(skipheader)
4514 {
4515 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
4516 return ajFalse;
4517
4518 if(!ajStrPrefixC(seqReadLine, "C;") && !ajStrIsWhite(seqReadLine))
4519 skipheader = ajFalse;
4520 }
4521
4522 ajDebug("nbrf first line:\n%S", seqReadLine);
4523
4524 if(!ajRegExec(seqRegNbrfId, seqReadLine))
4525 {
4526 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4527 return ajFalse;
4528 }
4529
4530 ajRegSubI(seqRegNbrfId, 1, &seqToken);
4531 ajRegSubI(seqRegNbrfId, 2, &thys->Name);
4532 ajDebug("parsed line name '%S' token '%S' token(1) '%c'\n",
4533 thys->Name, seqToken, ajStrGetCharFirst(seqToken));
4534 ajStrAssignSubS(&idline, seqReadLine, 4, -1);
4535
4536 /*
4537 ** token has the NBRF 2-char type. First char is the type
4538 ** and second char is Linear, Circular, or 1
4539 ** or, for GCG databases, this is just '>>'
4540 */
4541
4542 switch(toupper((ajint) ajStrGetCharFirst(seqToken)))
4543 {
4544 case 'P':
4545 case 'F':
4546 ajSeqSetProt(thys);
4547 break;
4548 case 'B': /* used by DIANA */
4549 case 'D': /* DNA */
4550 case 'R': /* RNA */
4551 ajSeqSetNuc(thys);
4552 break;
4553 default:
4554 ajWarn("Unknown NBRF sequence type '%S'", seqToken);
4555 }
4556
4557 /* next line is the description, with no prefix */
4558
4559 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
4560 {
4561 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4562
4563 return ajFalse;
4564 }
4565
4566 ajStrAssignS(&thys->Desc, seqReadLine);
4567
4568 if(ajStrGetCharLast(thys->Desc) == '\n')
4569 ajStrCutEnd(&thys->Desc, 1);
4570
4571 /* read on, looking for feature and sequence lines */
4572
4573 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4574
4575 dofeat = ajFalse;
4576 tryfeat = seqinUfoLocal(seqin);
4577
4578 while(ok && !ajStrPrefixC(seqReadLine, ">"))
4579 {
4580 if(ajStrGetCharPos(seqReadLine, 1) != ';')
4581 seqAppend(&thys->Seq, seqReadLine);
4582 else
4583 {
4584 if(ajStrPrefixC(seqReadLine, "C;Accession:"))
4585 {
4586 ajStrAssignC(&seqReadLine2,ajStrGetPtr(seqReadLine)+13);
4587 ajStrTokenAssignC(&seqHandle2,seqReadLine2, " ;\n\r");
4588
4589 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
4590 seqAccSave(thys, seqToken2);
4591 }
4592
4593 if(ajStrPrefixC(seqReadLine, "C;Species:"))
4594 {
4595 ajStrAssignC(&seqReadLine2,ajStrGetPtr(seqReadLine)+11);
4596 ajStrTokenAssignC(&seqHandle2,seqReadLine2, ";.\n\r");
4597
4598 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
4599 seqTaxSave(thys, seqToken2, 1);
4600 }
4601
4602 if(ajStrGetCharFirst(seqReadLine) == 'R')
4603 { /* skip reference lines with no prefix */
4604 while((ok=ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr)))
4605 if(ajStrGetCharPos(seqReadLine,1)==';' ||
4606 ajStrGetCharFirst(seqReadLine)=='>')
4607 break; /* X; line or next sequence */
4608
4609 if(ok)
4610 continue;
4611 }
4612 else if(tryfeat && ajStrGetCharFirst(seqReadLine) == 'F')
4613 { /* feature lines */
4614 if(!dofeat)
4615 {
4616 dofeat = ajTrue;
4617 ajFeattabinDel(&seqin->Ftquery);
4618 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtPir,
4619 thys->Name, "N");
4620 ajDebug("seqin->Ftquery Filebuff %x\n",
4621 seqin->Ftquery->Input->Filebuff);
4622 }
4623
4624 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff,
4625 seqReadLine);
4626 /* ajDebug("NBRF FEAT saved line:\n%S", seqReadLine); */
4627 }
4628 }
4629
4630 if(ok)
4631 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4632
4633 /* SRS 7 and SRS 8.0 put an extra ID line in here */
4634
4635 /* SRS 8.1 is even worse - it has a peculiar bug that repeats
4636 the ID line but with a few digits in front, and then repeats the
4637 description */
4638
4639 /* just for another oddity ... the extra ID line always starts >P1;
4640 even if the protein is a fragment */
4641
4642 if(ok && !ajStrGetLen(thys->Seq) &&
4643 (ajStrFindAnyK(seqReadLine, '>') != -1))
4644 {
4645 ajStrAssignS(&tmpline, seqReadLine);
4646 ajStrTrimStartC(&tmpline,"0123456789");
4647 ajStrCutStart(&tmpline, 4);
4648
4649 if(ajStrMatchS(tmpline, idline))
4650 {
4651 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4652
4653 if(ok && !ajStrIsWhite(seqReadLine)) /* SRS 8.1 desc line */
4654 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4655 }
4656 }
4657
4658 }
4659
4660 if(ajStrGetCharLast(thys->Seq) == '*')
4661 ajStrCutEnd(&thys->Seq, 1);
4662
4663 if(ok)
4664 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4665 else
4666 ajFilebuffClear(buff, 0);
4667
4668 if(dofeat)
4669 {
4670 ajDebug("seqin->Ftquery Filebuff %x\n",
4671 seqin->Ftquery->Input->Filebuff);
4672 ajFeattableDel(&seqin->Fttable);
4673 thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
4674 if(thys->Fttable)
4675 ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
4676 /* ajFeattableTrace(thys->Fttable); */
4677 ajFeattabinClear(seqin->Ftquery);
4678 }
4679
4680 ajStrDel(&idline);
4681 ajStrDel(&tmpline);
4682 ajStrDel(&seqReadLine2);
4683 ajStrTokenReset(seqHandle2);
4684 ajStrDelStatic(&seqToken);
4685 ajStrDelStatic(&seqToken2);
4686
4687 return ajTrue;
4688 }
4689
4690
4691
4692
4693 /* @funcstatic seqReadNibble **************************************************
4694 **
4695 ** Given data in a sequence structure, tries to read everything needed
** using the half-byte compressed nibble format
4697 **
4698 ** @param [w] thys [AjPSeq] Sequence object
4699 ** @param [u] seqin [AjPSeqin] Sequence input object
4700 ** @return [AjBool] ajTrue on success
4701 **
4702 ** @release 6.6.0
4703 ** @@
4704 ******************************************************************************/
4705
seqReadNibble(AjPSeq thys,AjPSeqin seqin)4706 static AjBool seqReadNibble(AjPSeq thys, AjPSeqin seqin)
4707 {
4708 AjPFilebuff buff;
4709 AjPFile fp;
4710
4711 ajulong filestat = 0L;
4712 AjBool ok = ajTrue;
4713 union lbytes
4714 {
4715 char chars[4];
4716 ajuint i;
4717 } seqbyte;
4718
4719 ajuint seqlen = 0;
4720 ajuint buflen;
4721 ajuint base1;
4722 ajuint base2;
4723 AjPStr buf = NULL;
4724 char *cbuf;
4725 ajuint i;
4726 ajuint j;
4727
4728 AjBool doreverse = AJFALSE;
4729 const char *nibblechars = "TCAGNNNNTCAGNNNN";
4730
4731 ajDebug("seqReadNibble\n");
4732
4733 buff = seqin->Input->Filebuff;
4734 fp = ajFilebuffGetFile(buff);
4735
4736 if(ajFilebuffIsEnded(buff))
4737 {
4738 ajDebug("seqReadNibble buffer already ended\n");
4739
4740 return ajFalse;
4741 }
4742
4743 filestat = ajFileSeek(fp, 0L, SEEK_SET);
4744
4745 if(filestat)
4746 {
4747 ajDebug("seqReadNibble rewind failed errno %d: %s\n",
4748 errno, strerror(errno));
4749 return ajFalse;
4750 }
4751 else
4752 {
4753 if(ajFilebuffIsEnded(buff))
4754 {
4755 ajFileSeek(fp, 0L, SEEK_END);
4756 return ajFalse;
4757 }
4758
4759 ajReadbinUint4(fp, &seqbyte.i);
4760 if(seqbyte.i == 0x6BE9eD3A)
4761 {
4762 ajDebug("seqReadNibble: Magic number found\n");
4763 }
4764 else if(seqbyte.i == 0x3AEDE96B)
4765 {
4766 ajDebug("seqReadNibble: Magic number is reversed\n");
4767 doreverse = ajTrue;
4768 }
4769 else
4770 {
4771 ajDebug("seqReadNibble: Magic number not found (%x)\n", seqbyte.i);
4772 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4773 ajStrAssignClear(&thys->Seq);
4774
4775 return ajFalse;
4776 }
4777
4778 ajReadbinUint4(fp, &seqlen);
4779 if(doreverse)
4780 ajByteRevUint(&seqlen);
4781
4782 buflen = (1+seqlen)/2;
4783 buf = ajStrNewRes(buflen);
4784 ajStrSetValidLen(&buf, buflen);
4785 cbuf = ajStrGetuniquePtr(&buf);
4786
4787 ajReadbinBinary(fp, buflen, 1, cbuf);
4788
4789 j = 0;
4790 for(i=0; i < buflen; i++)
4791 {
4792 seqbyte.chars[0] = cbuf[i];
4793 base2 = seqbyte.chars[0] & 0x0F;
4794 base1 = (seqbyte.chars[0] >> 4);
4795 seqAppendK(&thys->Seq, nibblechars[base1]);
4796 if(++j < seqlen)
4797 seqAppendK(&thys->Seq, nibblechars[base2]);
4798 ++j;
4799 }
4800
4801 if(!ok)
4802 {
4803 ajFileSeek(fp,(ajlong) filestat,0);
4804
4805 if(seqin->Input->Text)
4806 ajStrAssignC(&thys->TextPtr, "");
4807
4808 ajFilebuffResetPos(buff);
4809
4810 return ajFalse;
4811 }
4812 }
4813
4814 ajFilebuffClear(buff, -1);
4815 buff->File->End = ajTrue; /* set to avoid rereading */
4816
4817 if(!ajTextinGetRecords(seqin->Input))
4818 return ajFalse;
4819
4820 return ajTrue;
4821 }
4822
4823
4824
4825
4826 /* @funcstatic seqReadGcg *****************************************************
4827 **
4828 ** Given data in a sequence structure, tries to read everything needed
4829 ** using GCG format.
4830 **
4831 ** @param [w] thys [AjPSeq] Sequence object
4832 ** @param [u] seqin [AjPSeqin] Sequence input object
4833 ** @return [AjBool] ajTrue on success
4834 **
4835 ** @release 1.0.0
4836 ** @@
4837 ******************************************************************************/
4838
seqReadGcg(AjPSeq thys,AjPSeqin seqin)4839 static AjBool seqReadGcg(AjPSeq thys, AjPSeqin seqin)
4840 {
4841 AjBool ok;
4842
4843 ajuint len = 0;
4844 AjBool incomment = ajFalse;
4845
4846 AjPFilebuff buff;
4847
4848 buff = seqin->Input->Filebuff;
4849
4850 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4851
4852 while (ok && ajStrIsWhite(seqReadLine))
4853 {
4854 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4855 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4856 }
4857
4858 if(!ok)
4859 return ajFalse;
4860
4861 ajDebug("seqReadGcg first line ok: %B\n'%S'\n", ok, seqReadLine);
4862
4863 /* test GCG 9.x file types if available */
4864 /* any type on the .. line will override this */
4865
4866 if(ajStrPrefixC(seqReadLine, "!!NA_SEQUENCE"))
4867 ajSeqSetNuc(thys);
4868 else if(ajStrPrefixC(seqReadLine, "!!AA_SEQUENCE"))
4869 ajSeqSetProt(thys);
4870
4871 if(!seqGcgDots(thys, seqin, &seqReadLine, seqMaxGcglines, &len))
4872 {
4873 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4874 return ajFalse;
4875 }
4876
4877 ajDebug(" Gcg dots read ok len: %d\n", len);
4878
4879 while(ok && (ajSeqGetLen(thys) < len))
4880 {
4881 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4882 if(ok)
4883 {
4884 seqAppendCommented(&thys->Seq, &incomment, seqReadLine);
4885 ajDebug("line %d seqlen: %d ok: %B\n",
4886 ajTextinGetRecords(seqin->Input), ajSeqGetLen(thys), ok);
4887 }
4888 }
4889
4890 ajDebug("lines: %d ajSeqGetLen : %d len: %d ok: %B\n",
4891 ajTextinGetRecords(seqin->Input), ajSeqGetLen(thys), len, ok);
4892
4893 ajFilebuffClear(buff, 0);
4894
4895 return ok;
4896 }
4897
4898
4899
4900
4901 /* @funcstatic seqReadNcbi ****************************************************
4902 **
4903 ** Given data in a sequence structure, tries to read everything needed
4904 ** using NCBI format.
4905 **
4906 ** @param [w] thys [AjPSeq] Sequence object
4907 ** @param [u] seqin [AjPSeqin] Sequence input object
4908 ** @return [AjBool] ajTrue on success
4909 **
4910 ** @release 1.0.0
4911 ** @@
4912 ******************************************************************************/
4913
seqReadNcbi(AjPSeq thys,AjPSeqin seqin)4914 static AjBool seqReadNcbi(AjPSeq thys, AjPSeqin seqin)
4915 {
4916 AjPStr id = NULL;
4917 AjPStr acc = NULL;
4918 AjPStr sv = NULL;
4919 AjPStr gi = NULL;
4920 AjPStr db = NULL;
4921 AjPStr desc = NULL;
4922
4923 AjPFilebuff buff;
4924
4925 AjBool ok;
4926 const AjPStr badstr = NULL;
4927
4928 buff = seqin->Input->Filebuff;
4929
4930 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4931 if(!ok)
4932 return ajFalse;
4933
4934 ajStrAssignClear(&id);
4935 ajStrAssignClear(&acc);
4936 ajStrAssignClear(&sv);
4937 ajStrAssignClear(&gi);
4938 ajStrAssignClear(&desc);
4939
4940
4941 if(!ajSeqParseNcbi(seqReadLine,&id,&acc,&sv,&gi,&db,&desc))
4942 {
4943 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4944 ajStrDel(&id);
4945 ajStrDel(&acc);
4946 ajStrDel(&sv);
4947 ajStrDel(&gi);
4948 ajStrDel(&db);
4949 ajStrDel(&desc);
4950
4951 return ajFalse;
4952 }
4953
4954 ajDebug("parsed id '%S' acc '%S' sv '%S' gi '%S' db '%S' (%S) desc '%S'\n",
4955 id, acc, sv, gi, db, thys->Setdb, desc);
4956
4957 ajStrAssignS(&thys->Setdb, db);
4958 ajDebug("set setdb '%S' db '%S'\n", thys->Setdb, thys->Db);
4959
4960 if(ajStrGetLen(gi))
4961 ajStrAssignS(&thys->Gi, gi);
4962
4963 if(ajStrGetLen(sv))
4964 seqSvSave(thys, sv);
4965
4966 if(ajStrGetLen(acc))
4967 seqAccSave(thys, acc);
4968
4969 seqSetName(thys, id);
4970 ajStrAssignS(&thys->Desc, desc);
4971
4972
4973 if(ajStrGetLen(seqin->Inseq))
4974 { /* we have a sequence to use */
4975 ajStrAssignS(&thys->Seq, seqin->Inseq);
4976
4977 if(seqin->Input->Text)
4978 seqTextSeq(&thys->TextPtr, seqin->Inseq);
4979
4980 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4981 }
4982 else
4983 {
4984 /* we know we will succeed from here ... no way to return ajFalse */
4985
4986 ajFilebuffSetUnbuffered(buff);
4987
4988 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4989 while(ok && !ajStrPrefixC(seqReadLine, ">"))
4990 {
4991 badstr = seqAppendWarn(&thys->Seq, seqReadLine,
4992 seqin->Input->Format);
4993
4994 if(badstr)
4995 ajWarn("Sequence '%S' has bad character(s) '%S'",
4996 thys->Name, badstr);
4997
4998 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4999 }
5000
5001 if(ok)
5002 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
5003 else
5004 ajFilebuffClear(buff, 0);
5005 }
5006
5007 ajDebug("seqReadNcbi Setdb '%S' Db '%S'\n", thys->Setdb, thys->Db);
5008 ajStrDel(&id);
5009 ajStrDel(&acc);
5010 ajStrDel(&sv);
5011 ajStrDel(&gi);
5012 ajStrDel(&db);
5013 ajStrDel(&desc);
5014
5015 return ajTrue;
5016 }
5017
5018
5019
5020
5021 /* @funcstatic seqReadGifasta *************************************************
5022 **
5023 ** Given data in a sequence structure, tries to read everything needed
5024 ** using NCBI format. However, unlike NCBI format it uses the GI number
5025 ** as the sequence ID
5026 **
5027 ** @param [w] thys [AjPSeq] Sequence object
5028 ** @param [u] seqin [AjPSeqin] Sequence input object
5029 ** @return [AjBool] ajTrue on success
5030 **
5031 ** @release 4.1.0
5032 ** @@
5033 ******************************************************************************/
5034
seqReadGifasta(AjPSeq thys,AjPSeqin seqin)5035 static AjBool seqReadGifasta(AjPSeq thys, AjPSeqin seqin)
5036 {
5037 AjPStr id = NULL;
5038 AjPStr acc = NULL;
5039 AjPStr sv = NULL;
5040 AjPStr gi = NULL;
5041 AjPStr db = NULL;
5042 AjPStr desc = NULL;
5043
5044 AjPFilebuff buff;
5045
5046 AjBool ok;
5047
5048
5049 buff = seqin->Input->Filebuff;
5050
5051 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
5052 if(!ok)
5053 return ajFalse;
5054
5055 ajStrAssignClear(&id);
5056 ajStrAssignClear(&acc);
5057 ajStrAssignClear(&sv);
5058 ajStrAssignClear(&gi);
5059 ajStrAssignClear(&desc);
5060
5061
5062 if(!ajSeqParseNcbi(seqReadLine,&id,&acc,&sv,&gi,&db,&desc) ||
5063 !ajStrGetLen(gi))
5064 {
5065 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5066 ajStrDel(&id);
5067 ajStrDel(&db);
5068 ajStrDel(&acc);
5069 ajStrDel(&sv);
5070 ajStrDel(&gi);
5071 ajStrDel(&desc);
5072
5073 return ajFalse;
5074 }
5075
5076 ajDebug("parsed id '%S' acc '%S' sv '%S' gi '%S' db '%S' (%S) desc '%S'\n",
5077 id, acc, sv, gi, db, thys->Setdb, desc);
5078
5079 ajStrAssignS(&thys->Gi, gi);
5080
5081 ajStrAssignS(&thys->Setdb, db);
5082 ajDebug("set setdb '%S' db '%S'\n", thys->Setdb, thys->Db);
5083
5084 if(ajStrGetLen(sv))
5085 seqSvSave(thys, sv);
5086
5087 if(ajStrGetLen(acc))
5088 seqAccSave(thys, acc);
5089
5090 seqSetName(thys, gi);
5091 ajStrAssignS(&thys->Desc, desc);
5092
5093
5094 if(ajStrGetLen(seqin->Inseq))
5095 { /* we have a sequence to use */
5096 ajStrAssignS(&thys->Seq, seqin->Inseq);
5097
5098 if(seqin->Input->Text)
5099 seqTextSeq(&thys->TextPtr, seqin->Inseq);
5100
5101 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
5102 }
5103 else
5104 {
5105 /* we know we will succeed from here ... no way to return ajFalse */
5106
5107 ajFilebuffSetUnbuffered(buff);
5108
5109 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
5110
5111
5112
5113 while(ok && !ajStrPrefixC(seqReadLine, ">"))
5114 {
5115 seqAppend(&thys->Seq, seqReadLine);
5116
5117 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
5118 }
5119
5120 if(ok)
5121 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
5122 else
5123 ajFilebuffClear(buff, 0);
5124 }
5125
5126 ajStrDel(&id);
5127 ajStrDel(&db);
5128 ajStrDel(&acc);
5129 ajStrDel(&sv);
5130 ajStrDel(&gi);
5131 ajStrDel(&desc);
5132
5133 return ajTrue;
5134 }
5135
5136
5137
5138
5139 /* @funcstatic seqReadSelex ***************************************************
5140 **
5141 ** Read a Selex file. Assumed a comment on the first line but this may
5142 ** not be true.
5143 **
5144 ** This format can read anything that looks like a block of "name sequence"
5145 ** data. The names are even allowed to change in later blocks.
5146 **
5147 ** The format was used by HMMER, but that package now prefers the better
5148 ** annotated "Stockholm" format used by Pfam and Rfam.
5149 **
5150 ** @param [w] thys [AjPSeq] Sequence object
5151 ** @param [u] seqin [AjPSeqin] Sequence input object
5152 ** @return [AjBool] ajTrue on success
5153 **
5154 ** @release 2.3.0
5155 ** @@
5156 ******************************************************************************/
5157
seqReadSelex(AjPSeq thys,AjPSeqin seqin)5158 static AjBool seqReadSelex(AjPSeq thys, AjPSeqin seqin)
5159 {
5160 AjPFilebuff buff = seqin->Input->Filebuff;
5161 AjPStr line = NULL;
5162 SeqPSelex selex;
5163 ajuint n = 0;
5164 const char *p = NULL;
5165 AjBool ok = ajFalse;
5166 AjBool isseq = ajFalse;
5167 AjBool named = ajFalse;
5168 AjBool head = ajTrue;
5169 ajuint sqcnt = 0;
5170 ajuint i;
5171 char c = '\0';
5172 AjBool first = ajTrue;
5173
5174 line = ajStrNew();
5175
5176
5177 if(seqin->SeqData)
5178 selex = seqin->SeqData;
5179 else
5180 {
5181 ajFilebuffSetBuffered(buff); /* must buffer to test sequences */
5182
5183 /* First count the sequences, and get any header information */
5184 while(!isseq && (ok=ajBuffreadLine(buff,&line)))
5185 {
5186 if(first)
5187 {
5188 first=ajFalse;
5189
5190 if(!ajStrPrefixC(line,"#"))
5191 {
5192 ajStrDel(&line);
5193 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5194
5195 return ajFalse;
5196 }
5197 }
5198
5199 ajStrRemoveWhiteExcess(&line);
5200 p = ajStrGetPtr(line);
5201
5202 if(!*p || *p=='#')
5203 continue;
5204 else
5205 isseq = ajTrue;
5206 }
5207
5208 if(!ok && !isseq)
5209 return ajFalse;
5210 ++n;
5211
5212 ok = ajTrue;
5213
5214 while(ok && ajBuffreadLine(buff,&line))
5215 {
5216 ajStrRemoveWhiteExcess(&line);
5217 p = ajStrGetPtr(line);
5218
5219 if(*p=='#')
5220 continue;
5221
5222 if(!*p)
5223 ok = ajFalse;
5224 else
5225 ++n;
5226 }
5227
5228 ajFilebuffClear(buff,-1);
5229 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5230 buff->Fpos = 0;
5231 ajFileSeek(buff->File, 0L, 0);
5232 selex = selexNew(n);
5233
5234 /* now read it for real */
5235
5236 while(head && ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr))
5237 {
5238 if(ajStrPrefixC(line,"#=RF") ||ajStrPrefixC(line,"#=CS"))
5239 break;
5240
5241 if(ajStrPrefixC(line,"#="))
5242 {
5243 head=seqSelexHeader(&selex,line,&named,&sqcnt);
5244 continue;
5245 }
5246
5247 c = *ajStrGetPtr(line);
5248
5249 if(c>='0')
5250 head = ajFalse;
5251 }
5252
5253 /* Should now be at start of first block, whether RF or sequence */
5254 ajDebug("First Block Line: %S",line);
5255
5256 ok = ajTrue;
5257
5258 while(ok && !ajStrPrefixC(line, "# ID"))
5259 {
5260 seqSelexReadBlock(&selex,&named,n,&line,seqin, &thys->TextPtr);
5261 ok = ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5262 ajDebug("... in loop ok: %B\n", ok);
5263 }
5264
5265 ajDebug(" Block done. More data (ok): %B\n", ok);
5266
5267 if(ok)
5268 ajTextinStoreClear(seqin->Input, 1, line, &thys->TextPtr);
5269 else
5270 ajFilebuffClear(buff, 0);
5271
5272 seqin->SeqData = selex;
5273 }
5274
5275
5276 /* At this point the Selex structure is fully loaded */
5277 if(selex->Count >= selex->n)
5278 {
5279 selexDel(&selex);
5280 seqin->SeqData = NULL;
5281 ajStrDel(&line);
5282
5283 return ajFalse;
5284 }
5285
5286 i = selex->Count;
5287
5288 seqSelexCopy(&thys,selex,i);
5289
5290 ++selex->Count;
5291
5292 ajFilebuffClear(buff,0);
5293
5294 ajStrDel(&line);
5295
5296 return ajTrue;
5297 }
5298
5299
5300
5301
5302 /* @funcstatic seqReadStockholm ***********************************************
5303 **
5304 ** Read a Stockholm file.
5305 **
** @param [w] thys [AjPSeq] Sequence object
5307 ** @param [u] seqin [AjPSeqin] seqin object
5308 ** @return [AjBool] ajTrue if success
5309 **
5310 ** @release 2.3.0
5311 ** @@
5312 ******************************************************************************/
5313
seqReadStockholm(AjPSeq thys,AjPSeqin seqin)5314 static AjBool seqReadStockholm(AjPSeq thys, AjPSeqin seqin)
5315 {
5316 AjPFilebuff buff = seqin->Input->Filebuff;
5317 AjPStr line = NULL;
5318 AjPStr word = NULL;
5319 AjPStr post = NULL;
5320 AjPStr namstr = NULL;
5321 AjPStr seqstr = NULL;
5322 AjBool ok = ajFalse;
5323 AjBool bmf = ajTrue;
5324 AjBool dcf = ajTrue;
5325 AjBool drf = ajTrue;
5326 AjBool ccf = ajTrue;
5327 AjBool gsf = ajTrue;
5328 AjBool reff = ajTrue;
5329
5330 SeqPStockholm stock = NULL;
5331
5332 ajuint i = 0;
5333 ajuint n = 0;
5334 ajuint scnt = INT_MAX;
5335
5336 line = ajStrNew();
5337
5338 ajDebug("seqReadStockholm EOF:%B Data:%x\n",
5339 ajFilebuffIsEof(buff), seqin->SeqData);
5340 if(seqin->SeqData)
5341 stock = seqin->SeqData;
5342 else
5343 {
5344 ajFilebuffSetBuffered(buff); /* must buffer to test sequences */
5345 ok=ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5346 ajStrTrimWhiteEnd(&line);
5347
5348 if(!ok || !ajStrPrefixC(line,"# STOCKHOLM 1."))
5349 {
5350 if (ok)
5351 ajDebug("Stockholm: bad first line: %S", line);
5352 else
5353 ajDebug("Stockholm: no first line\n");
5354
5355 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5356 ajStrDel(&line);
5357
5358 return ajFalse;
5359 }
5360
5361 ajDebug("Stockholm: good first line: %S", line);
5362
5363 while(ok && (ajStrPrefixC(line, "#") || !ajStrGetLen(line)))
5364 {
5365 if(ajStrPrefixC(line,"#=GF SQ"))
5366 {
5367 ajFmtScanS(line,"%*s%*s%d",&n);
5368 ajDebug("Stockholm: parsed SQ line of %d sequences\n", n);
5369 }
5370
5371 ok=ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5372 ajStrTrimWhiteEnd(&line);
5373 ajDebug("Stockholm: SQ search: %S\n", line);
5374 }
5375
5376 if (!n) /* no SQ line, count first block */
5377 {
5378 while(ok && ajStrGetLen(line))
5379 {
5380 if(!ajStrPrefixC(line, "#") &&
5381 !ajStrMatchC(line, "//"))
5382 n++;
5383
5384 ok=ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5385 ajStrTrimWhiteEnd(&line);
5386 ajDebug("Stockholm: block %d read: %S\n", n, line);
5387 }
5388
5389 ajDebug("Stockholm: read block of %d sequences\n", n);
5390 }
5391
5392 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5393
5394 ok=ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5395 ajStrTrimWhiteEnd(&line);
5396 stock = stockholmNew(n);
5397
5398 ajDebug("Created stockholm data object size: %d\n", n);
5399
5400 word = ajStrNew();
5401 post = ajStrNew();
5402 ajStrAssignClear(&seqToken);
5403
5404 if(!seqRegStockholmSeq)
5405 seqRegStockholmSeq = ajRegCompC("^([^ \t\n]+)[ \t]+"
5406 "([^ \t\n]+)[ \t]+");
5407 while(ok && !ajStrPrefixC(line,"//"))
5408 {
5409 if(ajRegExec(seqRegStockholmSeq,line))
5410 {
5411 ajRegSubI(seqRegStockholmSeq,1,&word);
5412 ajRegSubI(seqRegStockholmSeq,2,&seqToken);
5413 ajRegPost(seqRegStockholmSeq,&post);
5414 ajStrRemoveLastNewline(&post);
5415
5416 ajDebug("Stockholm: regex word '%S' token '%S' post '%S'\n",
5417 word, seqToken, post);
5418 if(!ajStrCmpC(word,"#=GF"))
5419 {
5420 if(!ajStrCmpC(seqToken,"ID"))
5421 ajStrAssignS(&stock->id,post);
5422 else if(!ajStrCmpC(seqToken,"AC"))
5423 ajStrAssignS(&stock->ac,post);
5424 else if(!ajStrCmpC(seqToken,"DE"))
5425 ajStrAssignS(&stock->de,post);
5426 else if(!ajStrCmpC(seqToken,"AU"))
5427 ajStrAssignS(&stock->au,post);
5428 else if(!ajStrCmpC(seqToken,"AL"))
5429 ajStrAssignS(&stock->al,post);
5430 else if(!ajStrCmpC(seqToken,"SE"))
5431 ajStrAssignS(&stock->se,post);
5432 else if(!ajStrCmpC(seqToken,"TP"))
5433 ajStrAssignS(&stock->se,post);
5434 else if(!ajStrCmpC(seqToken,"GA"))
5435 ajFmtScanS(post,"%d%d",&stock->ga[0],
5436 &stock->ga[1]);
5437 else if(!ajStrCmpC(seqToken,"TC"))
5438 ajFmtScanS(post,"%f%f",&stock->tc[0],
5439 &stock->tc[1]);
5440 else if(!ajStrCmpC(seqToken,"NC"))
5441 ajFmtScanS(post,"%f%f",&stock->nc[0],
5442 &stock->nc[1]);
5443 else if(!ajStrCmpC(seqToken,"BM"))
5444 {
5445 if(bmf)
5446 {
5447 bmf = ajFalse;
5448 ajStrAssignS(&stock->bm,line);
5449 }
5450 else
5451 ajStrAppendS(&stock->bm,line);
5452 }
5453 else if(!ajStrCmpC(seqToken,"DC"))
5454 {
5455 if(dcf)
5456 {
5457 dcf = ajFalse;
5458 ajStrAssignS(&stock->dc,line);
5459 }
5460 else
5461 ajStrAppendS(&stock->dc,line);
5462 }
5463 else if(!ajStrCmpC(seqToken,"DR"))
5464 {
5465 if(drf)
5466 {
5467 drf = ajFalse;
5468 ajStrAssignS(&stock->dr,line);
5469 }
5470 else
5471 ajStrAppendS(&stock->dr,line);
5472 }
5473 else if(!ajStrCmpC(seqToken,"CC"))
5474 {
5475 if(ccf)
5476 {
5477 ccf = ajFalse;
5478 ajStrAssignS(&stock->cc,line);
5479 }
5480 else
5481 ajStrAppendS(&stock->cc,line);
5482 }
5483 else if(*ajStrGetPtr(seqToken)=='R')
5484 {
5485 if(reff)
5486 {
5487 reff = ajFalse;
5488 ajStrAssignS(&stock->ref,line);
5489 }
5490 else
5491 ajStrAppendS(&stock->ref,line);
5492 }
5493 }
5494
5495 else if(!ajStrCmpC(word,"#=GS"))
5496 {
5497 if(gsf)
5498 {
5499 gsf = ajFalse;
5500 ajStrAssignS(&stock->gs,line);
5501 }
5502 else
5503 ajStrAppendS(&stock->gs,line);
5504 }
5505
5506 else if(!ajStrCmpC(word,"#=GC"))
5507 {
5508 if(!ajStrCmpC(seqToken,"SS_cons"))
5509 ajStrAssignS(&stock->sscons,post);
5510 else if(!ajStrCmpC(seqToken,"SA_cons"))
5511 ajStrAssignS(&stock->sacons,post);
5512 else if(!ajStrCmpC(seqToken,"seq_cons"))
5513 ajStrAssignS(&stock->sqcons,post);
5514 }
5515 }
5516 else if (ajStrGetLen(line))
5517 {
5518 if(ajStrParseCount(line) > 1)
5519 {
5520 ++scnt;
5521
5522 if(scnt >= n)
5523 scnt = 0;
5524
5525 ajFmtScanS(line,"%S%S", &namstr,&seqstr);
5526 ajDebug("Stockholm: scnt: %d namstr '%S' seqstr '%S'\n",
5527 scnt,namstr,seqstr);
5528
5529 if(!ajStrGetLen(stock->name[scnt]))
5530 ajStrAppendS(&stock->name[scnt], namstr);
5531 else
5532 {
5533 if(!ajStrMatchS(namstr, stock->name[scnt]))
5534 ajWarn("Bad stockholm format found id %d '%S' "
5535 "expect '%S'",
5536 scnt, namstr, stock->name[scnt]);
5537 }
5538
5539 ajStrRemoveLastNewline(&seqstr);
5540 ajStrAppendS(&stock->str[scnt], seqstr);
5541 }
5542 else
5543 {
5544 ajStrRemoveLastNewline(&line);
5545 ajStrAppendS(&stock->str[scnt], line);
5546 }
5547
5548 }
5549
5550 ok = ajTextinStoreReadline(seqin->Input,&line, &thys->TextPtr);
5551 ajStrTrimWhiteEnd(&line);
5552 }
5553
5554 while(ok && !ajStrPrefixC(line, "# STOCKHOLM 1."))
5555 ok = ajTextinStoreReadline(seqin->Input,&line, &thys->TextPtr);
5556
5557 if(ok)
5558 ajTextinStoreClear(seqin->Input, 1, line, &thys->TextPtr);
5559 else
5560 ajFilebuffClear(buff, 0);
5561
5562 ajStrDel(&word);
5563 ajStrDel(&post);
5564 ajStrDel(&namstr);
5565 ajStrDel(&seqstr);
5566 ajStrDelStatic(&seqToken);
5567
5568 seqin->SeqData = stock;
5569
5570 ajFilebuffClear(buff,0);
5571 }
5572
5573
5574 /* At this point the Stockholm structure is fully loaded */
5575 if(stock->Count >= stock->n)
5576 {
5577 ajDebug("Stockholm count %d: All done\n", stock->Count);
5578 stockholmDel(&stock);
5579 seqin->SeqData = NULL;
5580 ajStrDel(&line);
5581
5582 return ajFalse;
5583 }
5584
5585 i = stock->Count;
5586
5587 seqStockholmCopy(&thys,stock,i);
5588
5589 ++stock->Count;
5590
5591 ajDebug("Stockholm returning %d/%d '%S' len: %d\n",
5592 stock->Count, stock->n, ajSeqGetNameS(thys),ajSeqGetLen(thys));
5593
5594 ajStrDel(&line);
5595
5596 return ajTrue;
5597 }
5598
5599
5600
5601
5602 /* @funcstatic seqSelexCopy ***************************************************
5603 **
5604 ** Copy Selex data to sequence object.
5605 ** Pad with gaps to make lengths equal.
5606 **
5607 ** @param [w] thys [AjPSeq*] sequence object
5608 ** @param [u] selex [SeqPSelex] seqin containing selex info
5609 ** @param [r] n [ajuint] index into selex object
5610 ** @return [void]
5611 **
5612 ** @release 2.0.1
5613 ** @@
5614 ******************************************************************************/
5615
seqSelexCopy(AjPSeq * thys,SeqPSelex selex,ajuint n)5616 static void seqSelexCopy(AjPSeq *thys, SeqPSelex selex, ajuint n)
5617 {
5618 AjPSeq pthis = *thys;
5619
5620 /*SeqPSelexdata sdata;*/
5621
5622 ajStrAssignS(&pthis->Seq, selex->str[n]);
5623 ajStrAssignS(&pthis->Name, selex->name[n]);
5624 pthis->Weight = selex->sq[n]->wt;
5625
5626 /*
5627 if(!(*thys)->Selexdata)
5628 (*thys)->Selexdata = selexdataNew();
5629
5630 sdata = (*thys)->Selexdata;
5631
5632 ajStrAssignS(&sdata->id,selex->id);
5633 ajStrAssignS(&sdata->ac,selex->ac);
5634 ajStrAssignS(&sdata->de,selex->de);
5635 ajStrAssignS(&sdata->au,selex->au);
5636 ajStrAssignS(&sdata->cs,selex->cs);
5637 ajStrAssignS(&sdata->rf,selex->rf);
5638 ajStrAssignS(&sdata->name,selex->name[n]);
5639 ajStrAssignS(&sdata->str,selex->str[n]);
5640 ajStrAssignS(&sdata->ss,selex->ss[n]);
5641
5642 sdata->ga[0] = selex->ga[0];
5643 sdata->ga[1] = selex->ga[1];
5644 sdata->tc[0] = selex->tc[0];
5645 sdata->tc[1] = selex->tc[1];
5646 sdata->nc[0] = selex->nc[0];
5647 sdata->nc[1] = selex->nc[1];
5648
5649 ajStrAssignS(&sdata->sq->name,selex->sq[n]->name);
5650
5651 ajStrAssignS(&sdata->sq->ac,selex->sq[n]->ac);
5652 ajStrAssignS(&sdata->sq->source,selex->sq[n]->source);
5653 ajStrAssignS(&sdata->sq->de,selex->sq[n]->de);
5654
5655 sdata->sq->wt = selex->sq[n]->wt;
5656 sdata->sq->start = selex->sq[n]->start;
5657 sdata->sq->stop = selex->sq[n]->stop;
5658 sdata->sq->len = selex->sq[n]->len;
5659 */
5660 return;
5661 }
5662
5663
5664
5665
5666 /* @funcstatic seqStockholmCopy ***********************************************
5667 **
5668 ** Copy Stockholm data to sequence object.
5669 ** Pad with gaps to make lengths equal.
5670 **
5671 ** @param [w] thys [AjPSeq*] sequence object
5672 ** @param [u] stock [SeqPStockholm] seqin containing selex info
5673 ** @param [r] n [ajint] index into stockholm object
5674 ** @return [void]
5675 **
5676 ** @release 2.3.0
5677 ** @@
5678 ******************************************************************************/
5679
seqStockholmCopy(AjPSeq * thys,SeqPStockholm stock,ajint n)5680 static void seqStockholmCopy(AjPSeq *thys, SeqPStockholm stock, ajint n)
5681 {
5682 AjPSeq pthis;
5683 /*SeqPStockholmdata sdata;*/
5684
5685 pthis = *thys;
5686
5687 ajStrAssignS(&pthis->Seq, stock->str[n]);
5688 ajStrAssignS(&pthis->Name, stock->name[n]);
5689
5690 /*
5691 if(!(*thys)->Stock)
5692 (*thys)->Stock = stockholmdataNew();
5693
5694 sdata = (*thys)->Stock;
5695
5696 ajStrAssignS(&sdata->id,stock->id);
5697 ajStrAssignS(&sdata->ac,stock->ac);
5698 ajStrAssignS(&sdata->de,stock->de);
5699 ajStrAssignS(&sdata->au,stock->au);
5700 ajStrAssignS(&sdata->al,stock->al);
5701 ajStrAssignS(&sdata->tp,stock->tp);
5702 ajStrAssignS(&sdata->se,stock->se);
5703 ajStrAssignS(&sdata->gs,stock->gs);
5704 ajStrAssignS(&sdata->dc,stock->dc);
5705 ajStrAssignS(&sdata->dr,stock->dr);
5706 ajStrAssignS(&sdata->cc,stock->cc);
5707 ajStrAssignS(&sdata->ref,stock->ref);
5708 ajStrAssignS(&sdata->sacons,stock->sacons);
5709 ajStrAssignS(&sdata->sqcons,stock->sqcons);
5710 ajStrAssignS(&sdata->sscons,stock->sscons);
5711 sdata->ga[0] = stock->ga[0];
5712 sdata->ga[1] = stock->ga[1];
5713 sdata->tc[0] = stock->tc[0];
5714 sdata->tc[1] = stock->tc[1];
5715 sdata->nc[0] = stock->nc[0];
5716 sdata->nc[1] = stock->nc[1];
5717 */
5718 return;
5719 }
5720
5721
5722
5723
5724 /* @funcstatic seqSelexAppend *************************************************
5725 **
5726 ** Append sequence and related Selex info to selex object.
5727 ** Pad with gaps to make lengths equal.
5728 **
5729 ** @param [r] src [const AjPStr] source line from Selex file
5730 ** @param [w] dest [AjPStr*] Destination in Selex object
5731 ** @param [r] beg [ajuint] start of info in src
5732 ** @param [r] end [ajuint] end of info in src
5733 ** @return [void]
5734 **
5735 ** @release 2.0.1
5736 ** @@
5737 ******************************************************************************/
5738
seqSelexAppend(const AjPStr src,AjPStr * dest,ajuint beg,ajuint end)5739 static void seqSelexAppend(const AjPStr src, AjPStr *dest,
5740 ajuint beg, ajuint end)
5741 {
5742 const char *p = NULL;
5743 char c;
5744 ajuint len;
5745 ajuint i;
5746 ajuint pad = 0;
5747
5748 len = end-beg+1;
5749 p = ajStrGetPtr(src);
5750
5751 ajDebug("seqSelexAppend srclen: %u beg: %u end: %u src '%S'\n",
5752 ajStrGetLen(src), beg, end, src);
5753
5754 if(beg>=ajStrGetLen(src))
5755 {
5756 for(i=0;i<len;++i)
5757 ajStrAppendK(dest,'-');
5758
5759 return;
5760 }
5761
5762 p += beg;
5763 pad = end - ajStrGetLen(src) + 2;
5764
5765 while((c=*p) && *p!='\n')
5766 {
5767 if(c=='.' || c=='_' || c==' ')
5768 c='-';
5769
5770 ajStrAppendK(dest,c);
5771 ++p;
5772 }
5773
5774 for(i=0;i<pad;++i)
5775 ajStrAppendK(dest,'-');
5776
5777 return;
5778 }
5779
5780
5781
5782
5783 /* @funcstatic seqSelexHeader *************************************************
5784 **
5785 ** Load a Selex object with header information for a single line
5786 **
5787 ** @param [w] thys [SeqPSelex*] Selex object
5788 ** @param [r] line [const AjPStr] Selex header line
5789 ** @param [w] named [AjBool*] Whether names of sequences have been read
5790 ** @param [w] sqcnt [ajuint*] Number of SQ names read
5791 ** @return [AjBool] ajTrue if the line contained header information
5792 **
5793 ** @release 2.0.1
5794 ** @@
5795 ******************************************************************************/
5796
seqSelexHeader(SeqPSelex * thys,const AjPStr line,AjBool * named,ajuint * sqcnt)5797 static AjBool seqSelexHeader(SeqPSelex *thys, const AjPStr line,
5798 AjBool *named, ajuint *sqcnt)
5799 {
5800 SeqPSelex pthis;
5801
5802 pthis = *thys;
5803
5804
5805 if(ajStrPrefixC(line,"#=ID"))
5806 {
5807 ajFmtScanS(line,"#=ID %S",&pthis->id);
5808
5809 return ajTrue;
5810 }
5811 else if(ajStrPrefixC(line,"#=AC"))
5812 {
5813 ajFmtScanS(line,"#=AC %S",&pthis->ac);
5814
5815 return ajTrue;
5816 }
5817 else if(ajStrPrefixC(line,"#=DE"))
5818 {
5819 ajStrAssignC(&pthis->de,ajStrGetPtr(line)+5);
5820 ajStrRemoveWhiteExcess(&pthis->de);
5821
5822 return ajTrue;
5823 }
5824 else if(ajStrPrefixC(line,"#=AU"))
5825 {
5826 ajStrAssignC(&pthis->au,ajStrGetPtr(line)+5);
5827 ajStrRemoveWhiteExcess(&pthis->au);
5828
5829 return ajTrue;
5830 }
5831 else if(ajStrPrefixC(line,"#=GA"))
5832 {
5833 ajFmtScanS(line,"%*s %f %f",&pthis->ga[0],&pthis->ga[1]);
5834
5835 return ajTrue;
5836 }
5837 else if(ajStrPrefixC(line,"#=TC"))
5838 {
5839 ajFmtScanS(line,"%*s %f %f",&pthis->tc[0],&pthis->tc[1]);
5840
5841 return ajTrue;
5842 }
5843 else if(ajStrPrefixC(line,"#=NC"))
5844 {
5845 ajFmtScanS(line,"%*s %f %f",&pthis->nc[0],&pthis->nc[1]);
5846
5847 return ajTrue;
5848 }
5849 else if(ajStrPrefixC(line,"#=SQ"))
5850 {
5851 ajStrTokenAssignC(&seqHandle,line," \t\n");
5852 ajStrTokenStep(seqHandle);
5853
5854 ajStrTokenNextParse(seqHandle,&pthis->sq[*sqcnt]->name);
5855 ajStrAssignS(&pthis->name[*sqcnt],pthis->sq[*sqcnt]->name);
5856
5857 ajStrTokenNextParse(seqHandle, &seqToken);
5858 ajStrToFloat(seqToken,&pthis->sq[*sqcnt]->wt);
5859
5860 ajStrTokenNextParse(seqHandle,&pthis->sq[*sqcnt]->source);
5861
5862 ajStrTokenNextParse(seqHandle, &pthis->sq[*sqcnt]->ac);
5863
5864 ajStrTokenNextParse(seqHandle, &seqToken);
5865 ajFmtScanS(seqToken,"%d..%d:%d",&pthis->sq[*sqcnt]->start,
5866 &pthis->sq[*sqcnt]->stop,&pthis->sq[*sqcnt]->len);
5867
5868 ajStrTokenNextParseC(seqHandle,"\n",&pthis->sq[*sqcnt]->de);
5869
5870 ajStrDelStatic(&seqToken);
5871 *named = ajTrue;
5872 ++(*sqcnt);
5873
5874 return ajTrue;
5875 }
5876
5877
5878 return ajFalse;
5879 }
5880
5881
5882
5883
5884 /* @funcstatic seqSelexPos ****************************************************
5885 **
5886 ** Find start and end positions of sequence & related Selex information
5887 **
5888 ** @param [r] line [const AjPStr] Selex sequence or related line
5889 ** @param [w] begin [ajuint*] start pos
5890 ** @param [w] end [ajuint*] end pos
5891 ** @return [void]
5892 **
5893 ** @release 2.0.1
5894 ** @@
5895 ******************************************************************************/
5896
seqSelexPos(const AjPStr line,ajuint * begin,ajuint * end)5897 static void seqSelexPos(const AjPStr line, ajuint *begin, ajuint *end)
5898 {
5899 ajuint pos = 0;
5900 ajuint len = 0;
5901
5902 const char *p;
5903
5904 /*
5905 ** Selex sequence info can start any number of spaces
5906 ** after the names so we need to find out where to
5907 ** start counting chars from and where to end
5908 */
5909
5910 len = ajStrGetLen(line) - 1;
5911
5912 if(!len)
5913 {
5914 *begin=0;
5915 *end=0;
5916
5917 return;
5918 }
5919
5920 pos = len -1;
5921 *end = (pos > *end) ? pos : *end;
5922 p = ajStrGetPtr(line);
5923
5924 while(*p && *p!=' ')
5925 ++p;
5926
5927 while(*p && *p==' ')
5928 ++p;
5929
5930 if(p)
5931 pos = p - ajStrGetPtr(line);
5932
5933 *begin = (pos < *begin) ? pos : *begin;
5934
5935 ajDebug("seqSelexPos len:%u pos:%u begin:%u end:%u\n",
5936 len, pos, *begin, *end);
5937
5938 return;
5939 }
5940
5941
5942
5943
5944 /* @funcstatic seqSelexReadBlock **********************************************
5945 **
5946 ** Read a block of sequence information from a selex file
5947 **
5948 ** @param [w] thys [SeqPSelex*] Selex object
5949 ** @param [w] named [AjBool*] Whether names of sequences have been read
5950 ** @param [r] n [ajuint] Number of sequences in Selex file
5951 ** @param [u] line [AjPStr*] Line from Selex file
5952 ** @param [u] seqin [AjPSeqin] Sequence input objext
5953 ** @param [w] astr [AjPStr*] string to append to
5954 ** @return [AjBool] ajTrue if data was read.
5955 **
5956 ** @release 2.0.1
5957 ** @@
5958 ******************************************************************************/
5959
seqSelexReadBlock(SeqPSelex * thys,AjBool * named,ajuint n,AjPStr * line,AjPSeqin seqin,AjPStr * astr)5960 static AjBool seqSelexReadBlock(SeqPSelex *thys, AjBool *named, ajuint n,
5961 AjPStr *line, AjPSeqin seqin, AjPStr *astr)
5962 {
5963 SeqPSelex pthis;
5964 AjPStr *seqs = NULL;
5965 AjPStr *ss = NULL;
5966
5967 AjPStr rf = NULL;
5968 AjPStr cs = NULL;
5969 ajuint i;
5970 ajuint begin=0;
5971 ajuint end=0;
5972 AjBool ok;
5973 ajuint cnt;
5974 AjPStr tmp = NULL;
5975 AjBool haverf = ajFalse;
5976 AjBool havecs = ajFalse;
5977 AjBool havess = ajFalse;
5978
5979 pthis = *thys;
5980
5981 begin = INT_MAX;
5982 end = 0;
5983
5984 tmp = ajStrNew();
5985 rf = ajStrNew();
5986 cs = ajStrNew();
5987 AJCNEW(seqs,n);
5988 AJCNEW(ss,n);
5989
5990 for(i=0;i<n;++i)
5991 {
5992 seqs[i] = ajStrNew();
5993 ss[i] = ajStrNew();
5994 }
5995
5996 ok = ajTrue;
5997 cnt = 0;
5998
5999
6000 while(ajStrPrefixC(*line,"\n"))
6001 ok = ajTextinStoreReadline(seqin->Input, line, astr);
6002
6003 while(ok)
6004 {
6005 seqSelexPos(*line,&begin,&end);
6006
6007 if(ajStrPrefixC(*line,"#=RF"))
6008 {
6009 haverf=ajTrue;
6010 ajStrAssignS(&rf,*line);
6011 }
6012
6013 if(ajStrPrefixC(*line,"#=CS"))
6014 {
6015 havecs=ajTrue;
6016 ajStrAssignS(&cs,*line);
6017 }
6018
6019 if(ajStrPrefixC(*line,"#=SS"))
6020 {
6021 havess=ajTrue;
6022 ajStrAssignS(&ss[--cnt],*line);
6023 ++cnt;
6024 }
6025
6026 if(!ajStrPrefixC(*line,"#"))
6027 {
6028 if(!*named)
6029 {
6030 ajFmtScanS(*line,"%S",&pthis->name[cnt]);
6031 ajStrAssignS(&pthis->sq[cnt]->name,pthis->name[cnt]);
6032 }
6033 else
6034 {
6035 ajFmtScanS(*line,"%S",&tmp);
6036
6037 if(!ajStrPrefixS(pthis->name[cnt],tmp))
6038 ajWarn("Selex format sequence names do not match "
6039 "['%S' '%S']",
6040 pthis->name[cnt],tmp);
6041 }
6042
6043 ajStrAssignS(&seqs[cnt],*line);
6044 ++cnt;
6045 }
6046
6047 ok = ajTextinStoreReadline(seqin->Input,line, astr);
6048
6049 if(ajStrPrefixC(*line,"\n"))
6050 ok = ajFalse;
6051 }
6052
6053 ajDebug("selexReadBlock block done line '%S' n: %u rf:%B cs:%B ss:%B\n",
6054 *line, n, haverf, havecs, havess);
6055
6056 if(cnt != n)
6057 ajWarn("Selex format expected %u sequences in block, found %u",
6058 n, cnt);
6059 if(cnt > n)
6060 cnt = n;
6061
6062 if(haverf)
6063 seqSelexAppend(rf,&pthis->rf,begin,end);
6064
6065 if(havecs)
6066 seqSelexAppend(cs,&pthis->cs,begin,end);
6067
6068 for(i=0;i<cnt;++i)
6069 {
6070 seqSelexAppend(seqs[i],&pthis->str[i],begin,end);
6071 if(havess)
6072 seqSelexAppend(ss[i],&pthis->ss[i],begin,end);
6073 }
6074
6075
6076 for(i=0;i<n;++i)
6077 {
6078 ajStrDel(&seqs[i]);
6079 ajStrDel(&ss[i]);
6080 }
6081
6082 AJFREE(seqs);
6083 AJFREE(ss);
6084
6085 ajStrDel(&rf);
6086 ajStrDel(&cs);
6087 ajStrDel(&tmp);
6088
6089 *named = ajTrue;
6090
6091 return ajTrue;
6092 }
6093
6094
6095
6096
6097 /* @funcstatic seqReadStaden **************************************************
6098 **
6099 ** Given data in a sequence structure, tries to read everything needed
6100 ** using the old Staden package file format.
6101 **
6102 ** @param [w] thys [AjPSeq] Sequence object
6103 ** @param [u] seqin [AjPSeqin] Sequence input object
6104 ** @return [AjBool] ajTrue on success
6105 **
6106 ** @release 1.0.0
6107 ** @@
6108 ******************************************************************************/
6109
seqReadStaden(AjPSeq thys,AjPSeqin seqin)6110 static AjBool seqReadStaden(AjPSeq thys, AjPSeqin seqin)
6111 {
6112 AjPStr tmpname = NULL;
6113 AjPFilebuff buff;
6114 AjBool incomment = ajFalse;
6115
6116 buff = seqin->Input->Filebuff;
6117
6118 if(!seqRegStadenId)
6119 seqRegStadenId = ajRegCompC("^[<]([^>-]+)[-]*[>]");
6120
6121 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
6122 return ajFalse;
6123
6124 if(ajRegExec(seqRegStadenId, seqReadLine))
6125 {
6126 ajRegSubI(seqRegStadenId, 1, &seqToken);
6127 seqSetName(thys, seqToken);
6128 ajDebug("seqReadStaden name '%S' token '%S'\n",
6129 thys->Name, seqToken);
6130 ajRegPost(seqRegStadenId, &seqToken);
6131 seqAppendCommented(&thys->Seq, &incomment, seqToken);
6132 ajStrDelStatic(&seqToken);
6133 }
6134 else
6135 {
6136 tmpname = ajStrNewS(seqin->Input->Filename);
6137 ajFilenameTrimAll(&tmpname);
6138 seqSetName(thys, tmpname);
6139 seqAppendCommented(&thys->Seq, &incomment, seqReadLine);
6140 ajStrDel(&tmpname);
6141 }
6142
6143 while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
6144 {
6145 seqAppendCommented(&thys->Seq, &incomment, seqReadLine);
6146 }
6147
6148 if(!ajTextinGetRecords(seqin->Input)) /* but we have read at least 1 line */
6149 return ajFalse;
6150
6151 ajFilebuffClear(buff, 0);
6152
6153 return ajTrue;
6154 }
6155
6156
6157
6158
6159 /* @funcstatic seqReadText ****************************************************
6160 **
6161 ** Given data in a sequence structure, tries to read everything needed
6162 ** using plain text format.
6163 **
6164 ** @param [w] thys [AjPSeq] Sequence object
6165 ** @param [u] seqin [AjPSeqin] Sequence input object
6166 ** @return [AjBool] ajTrue on success
6167 **
6168 ** @release 1.0.0
6169 ** @@
6170 ******************************************************************************/
6171
seqReadText(AjPSeq thys,AjPSeqin seqin)6172 static AjBool seqReadText(AjPSeq thys, AjPSeqin seqin)
6173 {
6174 AjPFilebuff buff;
6175
6176 ajDebug("seqReadText\n");
6177
6178 buff = seqin->Input->Filebuff;
6179
6180 while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
6181 {
6182 ajDebug("read '%S'\n", seqReadLine);
6183 seqAppend(&thys->Seq, seqReadLine);
6184 }
6185
6186 ajDebug("read %d lines\n", ajTextinGetRecords(seqin->Input));
6187 ajFilebuffClear(buff, 0);
6188
6189 if(!ajTextinGetRecords(seqin->Input))
6190 return ajFalse;
6191
6192 seqSetNameFile(thys, seqin);
6193
6194 return ajTrue;
6195 }
6196
6197
6198
6199
6200 /* @funcstatic seqReadRaw *****************************************************
6201 **
6202 ** Given data in a sequence structure, tries to read everything needed
6203 ** using raw format, which accepts only alphanumeric and whitespace
6204 ** characters or '-' for gap or '*' for a protein stop
6205 ** and rejects anything else.
6206 **
6207 ** @param [w] thys [AjPSeq] Sequence object
6208 ** @param [u] seqin [AjPSeqin] Sequence input object
6209 ** @return [AjBool] ajTrue on success
6210 **
6211 ** @release 1.0.0
6212 ** @@
6213 ******************************************************************************/
6214
seqReadRaw(AjPSeq thys,AjPSeqin seqin)6215 static AjBool seqReadRaw(AjPSeq thys, AjPSeqin seqin)
6216 {
6217 AjPFilebuff buff;
6218 const char* cp;
6219 AjPFile fp;
6220 AjBool ok = ajFalse;
6221 ajulong filestat = 0L;
6222 ajulong filesize;
6223 ajulong i;
6224 ajuint inc = 2048;
6225 AjPStr buf = NULL;
6226 char *cbuf;
6227 AjPStr tmpseq = NULL;
6228 size_t iread;
6229
6230 ajDebug("seqReadRaw\n");
6231
6232 buff = seqin->Input->Filebuff;
6233 fp = ajFilebuffGetFile(buff);
6234
6235 if(!seqRegRawNonseq)
6236 seqRegRawNonseq = ajRegCompC("[^A-Za-z0-9 \t\n\r*-]");
6237
6238 if(ajFilebuffIsEnded(buff))
6239 {
6240 ajDebug("seqReadRaw filebuff already ended\n");
6241 return ajFalse;
6242 }
6243
6244 filestat = ajFileSeek(fp, 0L, SEEK_CUR);
6245
6246 if(filestat)
6247 {
6248 ajDebug("filestat %Lu\n", filestat);
6249
6250 /* not a file - cannot use binary, so we can only read the buffer */
6251 while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
6252 {
6253 ajDebug("read '%S'\n", seqReadLine);
6254
6255 cp = ajStrGetPtr(seqReadLine);
6256 if(strlen(cp) != ajStrGetLen(seqReadLine))
6257 {
6258 ajDebug("seqReadRaw: Null character found in line: %S\n",
6259 seqReadLine);
6260 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6261 ajStrAssignClear(&thys->Seq);
6262
6263 return ajFalse;
6264 }
6265
6266 if(ajRegExec(seqRegRawNonseq, seqReadLine))
6267 {
6268 ajDebug("seqReadRaw: Bad character found in line: %S\n",
6269 seqReadLine);
6270 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6271 ajStrAssignClear(&thys->Seq);
6272
6273 return ajFalse;
6274 }
6275 seqAppend(&thys->Seq, seqReadLine);
6276 ajDebug("read %d lines\n", ajTextinGetRecords(seqin->Input));
6277 }
6278 }
6279 else
6280 {
6281 if(ajFilebuffIsEnded(buff))
6282 {
6283 ajDebug("seqReadRaw filebuff ended\n");
6284 ajFileSeek(fp, 0L, SEEK_END);
6285 return ajFalse;
6286 }
6287
6288 buf = ajStrNewRes(4096);
6289 ajStrSetValidLen(&buf, inc);
6290 cbuf = ajStrGetuniquePtr(&buf);
6291
6292 filestat = ajFileSeek(fp, 0L, SEEK_END);
6293 filesize = ajFileResetPos(fp);
6294 filestat = ajFileSeek(fp, 0L, SEEK_SET);
6295
6296 if(!filesize)
6297 {
6298 ajDebug("seqReadRaw filesize zero\n");
6299 ajFileSeek(fp,(ajlong) filesize, SEEK_SET);
6300 return ajFalse;
6301 }
6302
6303 ok = ajTrue;
6304
6305 for(i=0; i < filesize; i += inc)
6306 {
6307 if((i+inc) > filesize)
6308 {
6309 inc = (ajuint) (filesize - i);
6310 ajStrSetValidLen(&buf, inc);
6311 }
6312
6313 iread = ajReadbinBinary(fp, inc, 1, cbuf);
6314 cbuf[inc] = '\0';
6315
6316 if(strlen(cbuf) != iread)
6317 {
6318 ajDebug("seqReadRaw: Null character found in line: %s\n",
6319 cbuf);
6320 ok = ajFalse;
6321 break;
6322 }
6323
6324 if(ajRegExec(seqRegRawNonseq, buf))
6325 {
6326 ajDebug("seqReadRaw: Bad character found in line: %S\n",
6327 seqReadLine);
6328 ok = ajFalse;
6329 break;
6330 }
6331
6332 ajStrAssignC(&tmpseq, cbuf);
6333
6334 if(seqin->Input->Text)
6335 ajStrAppendS(&thys->TextPtr, tmpseq);
6336
6337 seqAppend(&thys->Seq, tmpseq);
6338 seqin->Input->Records++;
6339
6340 ajDebug("read %d lines\n", ajTextinGetRecords(seqin->Input));
6341 }
6342
6343 ajStrDel(&buf);
6344 ajStrDel(&tmpseq);
6345
6346 if(!ok)
6347 {
6348 ajDebug("seqReadRaw input OK failed\n");
6349
6350 ajFileSeek(fp,(ajlong) filestat,0);
6351
6352 if(seqin->Input->Text)
6353 {
6354 ajStrAssignC(&thys->TextPtr, "");
6355 seqin->Input->Records = 0;
6356 }
6357
6358 ajFilebuffResetPos(buff);
6359
6360 return ajFalse;
6361 }
6362
6363 }
6364
6365 buff->File->End = ajTrue;
6366
6367 if(!ajTextinGetRecords(seqin->Input))
6368 {
6369 ajDebug("seqReadRaw no records read\n");
6370 ajTextinStoreClear(seqin->Input, -1, seqReadLine, &thys->TextPtr);
6371 return ajFalse;
6372 }
6373
6374 ajFilebuffClear(buff, -1);
6375
6376 ajDebug("seqReadRaw success\n");
6377
6378 return ajTrue;
6379 }
6380
6381
6382
6383
6384 /* @funcstatic seqReadIgstrict ************************************************
6385 **
6386 ** Given data in a sequence structure, tries to read everything needed
6387 ** using IntelliGenetics format.
6388 **
6389 ** Requires a trailing number at the end of the sequence
6390 **
6391 ** @param [w] thys [AjPSeq] Sequence object
6392 ** @param [u] seqin [AjPSeqin] Sequence input object
6393 ** @return [AjBool] ajTrue on success
6394 **
6395 ** @release 6.1.0
6396 ** @@
6397 ******************************************************************************/
6398
seqReadIgstrict(AjPSeq thys,AjPSeqin seqin)6399 static AjBool seqReadIgstrict(AjPSeq thys, AjPSeqin seqin)
6400 {
6401 AjPFilebuff buff;
6402 AjBool endnum = ajFalse;
6403 AjBool ok = ajTrue;
6404
6405 buff = seqin->Input->Filebuff;
6406
6407 do
6408 {
6409 if(ajTextinGetRecords(seqin->Input))
6410 {
6411 ajStrCutStart(&seqReadLine, 1); /* trim the semi colon */
6412 ajStrRemoveWhiteExcess(&seqReadLine);
6413 if(ajStrGetLen(thys->Desc))
6414 ajStrAppendK(&thys->Desc, ' ');
6415 ajStrAppendS(&thys->Desc, seqReadLine);
6416 }
6417 /* skip comments with ';' prefix */
6418 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6419 } while(ok && ajStrPrefixC(seqReadLine, ";"));
6420
6421 if(!ok)
6422 {
6423 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6424 return ajFalse;
6425 }
6426
6427 seqSetName(thys, seqReadLine);
6428
6429 while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr) &&
6430 !ajStrPrefixC(seqReadLine, ";"))
6431 {
6432 ajStrRemoveWhiteExcess(&seqReadLine);
6433 if(ajStrSuffixC(seqReadLine, "1"))
6434 endnum = ajTrue;
6435 else if(ajStrSuffixC(seqReadLine, "2"))
6436 endnum = ajTrue;
6437 else
6438 endnum = ajFalse;
6439 seqAppend(&thys->Seq, seqReadLine);
6440 }
6441
6442 if(!endnum)
6443 {
6444 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6445 return ajFalse;
6446 }
6447
6448 if(ajStrPrefixC(seqReadLine, ";"))
6449 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
6450 else
6451 ajFilebuffClear(buff, 0);
6452
6453 return ajTrue;
6454 }
6455
6456
6457
6458
6459 /* @funcstatic seqReadIg ******************************************************
6460 **
6461 ** Given data in a sequence structure, tries to read everything needed
6462 ** using IntelliGenetics format.
6463 **
6464 ** @param [w] thys [AjPSeq] Sequence object
6465 ** @param [u] seqin [AjPSeqin] Sequence input object
6466 ** @return [AjBool] ajTrue on success
6467 **
6468 ** @release 1.0.0
6469 ** @@
6470 ******************************************************************************/
6471
seqReadIg(AjPSeq thys,AjPSeqin seqin)6472 static AjBool seqReadIg(AjPSeq thys, AjPSeqin seqin)
6473 {
6474 AjPFilebuff buff;
6475 AjBool ok = ajTrue;
6476
6477 buff = seqin->Input->Filebuff;
6478
6479 do
6480 {
6481 if(ajTextinGetRecords(seqin->Input))
6482 {
6483 ajStrCutStart(&seqReadLine, 1); /* trim the semi colon */
6484 ajStrRemoveWhiteExcess(&seqReadLine);
6485 if(ajStrGetLen(thys->Desc))
6486 ajStrAppendK(&thys->Desc, ' ');
6487 ajStrAppendS(&thys->Desc, seqReadLine);
6488 }
6489 /* skip comments with ';' prefix */
6490 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6491 } while(ok && ajStrPrefixC(seqReadLine, ";"));
6492
6493 if(!ok)
6494 {
6495 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6496 return ajFalse;
6497 }
6498
6499 /* we know we will succeed from here ... no way to return ajFalse */
6500
6501 ajFilebuffSetUnbuffered(buff);
6502
6503 seqSetName(thys, seqReadLine);
6504
6505 while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr) &&
6506 !ajStrPrefixC(seqReadLine, ";"))
6507 {
6508 seqAppend(&thys->Seq, seqReadLine);
6509 }
6510
6511 if(ajStrPrefixC(seqReadLine, ";"))
6512 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
6513 else
6514 ajFilebuffClear(buff, 0);
6515
6516 return ajTrue;
6517 }
6518
6519
6520
6521
6522 /* @funcstatic seqReadIguspto *************************************************
6523 **
6524 ** Given data in a sequence structure, tries to read everything needed
6525 ** using the US patent office multi-line IntelliGenetics format.
6526 **
6527 ** Requires a trailing number at the end of the sequence
6528 ** and allows for a trailing control-L at the end of the entry.
6529 **
6530 ** @param [w] thys [AjPSeq] Sequence object
6531 ** @param [u] seqin [AjPSeqin] Sequence input object
6532 ** @return [AjBool] ajTrue on success
6533 **
6534 ** @release 6.6.0
6535 ** @@
6536 ******************************************************************************/
6537
seqReadIguspto(AjPSeq thys,AjPSeqin seqin)6538 static AjBool seqReadIguspto(AjPSeq thys, AjPSeqin seqin)
6539 {
6540 AjPFilebuff buff;
6541 const AjPStr badstr = NULL;
6542 AjBool endnum = ajFalse;
6543 AjBool ok = ajTrue;
6544 AjBool seqok = ajFalse;
6545 AjBool isheader = ajTrue;
6546 AjBool firstline = ajTrue;
6547 AjBool firstgood = ajTrue;
6548 ajlong ipos;
6549
6550 buff = seqin->Input->Filebuff;
6551
6552 while(ok && !seqok)
6553 {
6554 if(!ajStrPrefixC(seqReadLine, ";"))
6555 {
6556 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6557
6558 return ajFalse;
6559 }
6560
6561 if(firstline)
6562 {
6563 firstline = ajFalse;
6564 firstgood = ajTrue;
6565
6566 if(!ajStrPrefixC(seqReadLine, "; Sequence "))
6567 {
6568 ajFmtPrintS(&seqToken,
6569 "'; Sequence ' not found");
6570 firstgood = ajFalse;
6571 }
6572
6573 if(firstgood)
6574 {
6575 ipos = ajStrFindC(seqReadLine, ", Application ");
6576 if(ipos < 1)
6577 {
6578 ajFmtPrintS(&seqToken2,
6579 "', Application ' not found");
6580 firstgood = ajFalse;
6581 }
6582 }
6583
6584 if(firstgood)
6585 {
6586 ajStrAssignSubS(&seqToken, seqReadLine, 11, ipos-1);
6587 if(!ajStrIsInt(seqToken))
6588 {
6589 ajFmtPrintS(&seqToken2,
6590 "Sequence number '%S' not an integer",
6591 seqToken);
6592 firstgood = ajFalse;
6593 }
6594 }
6595
6596 if(firstgood)
6597 {
6598 ajStrAssignSubS(&seqToken, seqReadLine, ipos+14, -2);
6599 if(!ajStrIsWord(seqToken))
6600 {
6601 ajFmtPrintS(&seqToken2,
6602 "Application id '%S' not a word",
6603 seqToken);
6604 firstgood = ajFalse;
6605 }
6606 }
6607
6608 if(!firstgood)
6609 {
6610 if(seqin->Input->Format)
6611 {
6612 ajStrAssignS(&seqToken, seqReadLine);
6613 ajStrTrimWhiteEnd(&seqToken);
6614 ajWarn("Iguspto: bad first line (%S): %S",
6615 seqToken2, seqToken);
6616
6617
6618 }
6619 else
6620 {
6621 return ajFalse;
6622 }
6623 }
6624 }
6625
6626 if(!thys->Fulldesc)
6627 thys->Fulldesc = ajSeqdescNew();
6628
6629 do
6630 {
6631 if(ajTextinGetRecords(seqin->Input))
6632 {
6633 ajStrRemoveLastNewline(&seqReadLine);
6634 ajStrCutStart(&seqReadLine, 1); /* trim the semi colon */
6635 if(ajStrGetCharFirst(seqReadLine) == ' ')
6636 ajStrCutStart(&seqReadLine, 1);
6637 ajListstrPushAppend(thys->Fulldesc->Multi,
6638 ajStrNewS(seqReadLine));
6639
6640 if(ajStrPrefixC(seqReadLine, "GENERAL INFORMATION"))
6641 isheader = ajFalse;
6642
6643 if(isheader)
6644 {
6645 if(ajStrGetLen(thys->Desc))
6646 ajStrAppendC(&thys->Desc, "; ");
6647 ajStrAppendS(&thys->Desc, seqReadLine);
6648 }
6649 }
6650 /* skip comments with ';' prefix */
6651 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6652 } while(ok && ajStrPrefixC(seqReadLine, ";"));
6653
6654 if(!ok)
6655 {
6656 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6657 return ajFalse;
6658 }
6659
6660 seqSetName(thys, seqReadLine);
6661
6662 endnum = ajFalse;
6663 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6664
6665 while(ok &&
6666 !ajStrPrefixC(seqReadLine, ";") &&
6667 !endnum)
6668 {
6669 ajStrRemoveWhiteExcess(&seqReadLine);
6670 if(ajStrSuffixC(seqReadLine, "1"))
6671 endnum = ajTrue;
6672 else if(ajStrSuffixC(seqReadLine, "2"))
6673 endnum = ajTrue;
6674 else
6675 endnum = ajFalse;
6676
6677 if(endnum)
6678 ajStrCutEnd(&seqReadLine, 1);
6679
6680 badstr = seqAppendWarn(&thys->Seq, seqReadLine,
6681 seqin->Input->Format);
6682 if(badstr)
6683 ajWarn("Sequence '%S' has bad character(s) '%S'",
6684 thys->Name, badstr);
6685
6686 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6687 }
6688
6689 if(endnum)
6690 seqok = ajTrue;
6691 else
6692 {
6693 if(seqin->Input->Format)
6694 ajWarn("Sequence '%S' has bad iguspto sequence format",
6695 thys->Name);
6696 ajSeqClear(thys);
6697 }
6698 }
6699
6700 /* test for, but do not store, the trailing space and ^L character */
6701
6702 if(ok)
6703 {
6704 ajStrRemoveWhiteExcess(&seqReadLine);
6705 while(ok &&
6706 (!ajStrGetLen(seqReadLine) || ajStrMatchC(seqReadLine, "\014")))
6707 {
6708 ok = ajBuffreadLine(buff, &seqReadLine);
6709 if(ok)
6710 {
6711 ajStrRemoveWhiteExcess(&seqReadLine);
6712 }
6713 }
6714 }
6715
6716 if(ajStrPrefixC(seqReadLine, ";"))
6717 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
6718 else
6719 ajFilebuffClear(buff, 0);
6720
6721 return ajTrue;
6722 }
6723
6724
6725
6726
6727 /* @funcstatic seqReadPdb *****************************************************
6728 **
6729 ** Given data in a sequence structure, tries to read everything needed
6730 ** using PDB protein databank format using ATOM records.
6731 **
6732 ** See seqReadPdbseq for parsing the SEQRES records
6733 **
6734 ** @param [w] thys [AjPSeq] Sequence object
6735 ** @param [u] seqin [AjPSeqin] Sequence input object
6736 ** @return [AjBool] ajTrue on success
6737 **
6738 ** @release 6.0.0
6739 ** @@
6740 ******************************************************************************/
6741
seqReadPdb(AjPSeq thys,AjPSeqin seqin)6742 static AjBool seqReadPdb(AjPSeq thys, AjPSeqin seqin)
6743 {
6744 AjPStr alnname = NULL;
6745 AjPTable alntable = NULL;
6746 SeqPMsfItem alnitem = NULL;
6747 const SeqPMsfItem readalnitem = NULL;
6748 AjPList alnlist = NULL;
6749 SeqPMsfData alndata = NULL;
6750 char aa;
6751 ajuint nseq = 0;
6752 ajuint i;
6753 AjBool ok = ajTrue;
6754 AjPStr aa3 = NULL;
6755 ajuint iaa = 0;
6756 ajuint lastaa = 0;
6757 AjPStr model = NULL;
6758
6759 ajDebug("seqReadPdb seqin->SeqData %x\n", seqin->SeqData);
6760
6761 if(!seqin->SeqData)
6762 { /* start of file */
6763 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6764
6765 ajDebug("first line:\n'%S'\n", seqReadLine);
6766
6767 if(!ajStrPrefixC(seqReadLine, "HEADER "))
6768 {
6769 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6770
6771 return ajFalse;
6772 }
6773
6774 ajStrAssignSubS(&seqName,seqReadLine, 62, 71);
6775 ajStrTrimWhite(&seqName);
6776
6777 ajDebug("first line OK name '%S'\n", seqName);
6778
6779 seqin->SeqData = AJNEW0(alndata);
6780 alndata->Table = alntable = ajTablestrNew(1000);
6781 alnlist = ajListstrNew();
6782 seqin->Input->Filecount = 0;
6783
6784 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6785
6786 while(ok && !ajStrMatchC(seqReadLine, "END"))
6787 {
6788 if(ajStrPrefixC(seqReadLine, "MODEL"))
6789 {
6790 ajStrAssignSubS(&model, seqReadLine, 7, 14);
6791 ajStrTrimWhite(&model);
6792 }
6793 else if(ajStrPrefixC(seqReadLine, "ATOM"))
6794 {
6795 if(!alnitem)
6796 AJNEW0(alnitem);
6797
6798 ajStrKeepRange(&seqReadLine, 0,71);
6799
6800 ajStrAssignSubS(&aa3, seqReadLine, 17, 19);
6801 ajStrAssignSubS(&seqChain, seqReadLine, 21, 21);
6802 ajStrAssignSubS(&seqToken, seqReadLine, 22, 25);
6803 ajStrToUint(seqToken, &iaa);
6804
6805 if(iaa > lastaa)
6806 {
6807 if(ajResidueFromTriplet(aa3,&aa))
6808 seqAppendK(&alnitem->Seq, aa);
6809 lastaa = iaa;
6810 }
6811
6812 }
6813
6814 else if(ajStrPrefixC(seqReadLine, "TER"))
6815 {
6816 if(alnitem && !ajStrGetLen(alnitem->Seq))
6817 {
6818 ajTableRemoveKey(alntable, alnitem->Name,
6819 (void**) &alnname);
6820 ajStrDel(&alnname);
6821 seqMsfItemDel(&alnitem);
6822 }
6823 else
6824 {
6825 nseq++;
6826 ajFmtPrintS(&seqToken, "%S_%S", seqName, seqChain);
6827
6828 if(ajStrGetLen(model))
6829 ajStrAppendS(&seqToken, model);
6830
6831 seqitemSetName(alnitem, seqToken);
6832 ajStrAssignS(&alnname, alnitem->Name);
6833 alnitem->Weight = 1.0;
6834 ajTablePut(alntable, alnname, alnitem);
6835 alnname = NULL;
6836 ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
6837 alnitem = NULL;
6838 }
6839
6840 lastaa = 0;
6841 }
6842
6843 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6844 }
6845
6846 ajStrDelStatic(&seqToken);
6847 ajStrDelStatic(&seqName);
6848 ajStrDelStatic(&seqChain);
6849 ajStrDel(&aa3);
6850 ajStrDel(&model);
6851
6852 if(!nseq)
6853 {
6854 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
6855 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6856
6857 return ajFalse;
6858 }
6859
6860 ajDebug("PDB Entry has %d sequences\n", nseq);
6861 ajListstrTrace(alnlist);
6862 ajTableTrace(alntable);
6863 ajTableMap(alntable, &seqMsfTabList, NULL);
6864
6865 alndata->Names = AJCALLOC(nseq, sizeof(*alndata->Names));
6866
6867 for(i=0; i < nseq; i++)
6868 {
6869 ajListstrPop(alnlist, &alndata->Names[i]);
6870 ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
6871 }
6872
6873 ajListstrFreeData(&alnlist);
6874
6875 ajTableMap(alntable, &seqMsfTabList, NULL);
6876 alndata->Nseq = nseq;
6877 alndata->Count = 0;
6878 alndata->Bufflines = ajTextinGetRecords(seqin->Input);
6879 ajDebug("PDB format read %d lines\n",
6880 ajTextinGetRecords(seqin->Input));
6881 }
6882
6883 alndata = seqin->SeqData;
6884 alntable = alndata->Table;
6885
6886 if(alndata->Count >= alndata->Nseq)
6887 { /* all done */
6888 ajFilebuffClear(seqin->Input->Filebuff, 0);
6889 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
6890
6891 return ajFalse;
6892 }
6893
6894 i = alndata->Count;
6895 ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
6896 readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
6897 ajStrAssignS(&thys->Name, alndata->Names[i]);
6898
6899 thys->Weight = readalnitem->Weight;
6900 ajStrAssignS(&thys->Seq, readalnitem->Seq);
6901
6902 alndata->Count++;
6903
6904 return ajTrue;
6905 }
6906
6907
6908
6909
6910 /* @funcstatic seqReadPdbseq **************************************************
6911 **
6912 ** Given data in a sequence structure, tries to read everything needed
6913 ** using PDB protein databank format using the SEQRES records.
6914 **
6915 ** This is the original sequence, see seqReadPdb for parsing the ATOM records
6916 ** which give the sequence observed in the structure.
6917 **
6918 ** @param [w] thys [AjPSeq] Sequence object
6919 ** @param [u] seqin [AjPSeqin] Sequence input object
6920 ** @return [AjBool] ajTrue on success
6921 **
6922 ** @release 6.0.0
6923 ** @@
6924 ******************************************************************************/
6925
seqReadPdbseq(AjPSeq thys,AjPSeqin seqin)6926 static AjBool seqReadPdbseq(AjPSeq thys, AjPSeqin seqin)
6927 {
6928 AjPFilebuff buff;
6929 AjPStr name = NULL;
6930 AjPStr alnname = NULL;
6931 AjPStr chain = NULL;
6932 AjPTable alntable = NULL;
6933 SeqPMsfItem alnitem = NULL;
6934 const SeqPMsfItem readalnitem = NULL;
6935 AjPList alnlist = NULL;
6936 SeqPMsfData alndata = NULL;
6937 char aa;
6938 ajuint iseq = 0;
6939 ajuint nseq = 0;
6940 ajuint i;
6941 AjBool ok = ajTrue;
6942
6943 buff = seqin->Input->Filebuff;
6944
6945 ajDebug("seqReadPdbseq seqin->SeqData %x\n", seqin->SeqData);
6946
6947 if(seqin->SeqData)
6948 {
6949 alndata = seqin->SeqData;
6950 alntable = alndata->Table;
6951 if(alndata->Nseq && (alndata->Count >= alndata->Nseq))
6952 { /* try next entry */
6953 ajFilebuffClear(buff, 0);
6954 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
6955 seqin->SeqData = NULL;
6956 }
6957 alndata = NULL;
6958 }
6959
6960 if(!seqin->SeqData)
6961 { /* start of file */
6962 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6963 if(!ok)
6964 {
6965 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6966 return ajFalse;
6967 }
6968
6969 ajDebug("first line:\n'%S'\n", seqReadLine);
6970
6971 if(!ajStrPrefixC(seqReadLine, "HEADER "))
6972 {
6973 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6974
6975 return ajFalse;
6976 }
6977
6978 ajStrAssignSubS(&name,seqReadLine, 62, 71);
6979 ajStrTrimWhite(&name);
6980
6981 ajDebug("first line OK name '%S'\n", name);
6982
6983 seqin->SeqData = AJNEW0(alndata);
6984 alndata->Table = alntable = ajTablestrNew(1000);
6985 alnlist = ajListstrNew();
6986 seqin->Input->Filecount = 0;
6987
6988 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6989
6990 while(ok && !ajStrMatchC(seqReadLine, "END"))
6991 {
6992 if(ajStrPrefixC(seqReadLine, "SEQRES"))
6993 {
6994 ajStrKeepRange(&seqReadLine, 0,71);
6995 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
6996 ajStrTokenStep(seqHandle); /* 'SEQRES' */
6997
6998 ajStrTokenNextParse(seqHandle, &seqToken); /* number */
6999 ajStrToUint(seqToken, &iseq);
7000
7001 ajStrTokenNextParse(seqHandle, &chain); /* chain letter */
7002
7003 if(iseq == 1)
7004 {
7005 if(alnitem && !ajStrGetLen(alnitem->Seq))
7006 {
7007 nseq--;
7008 ajListstrPopLast(alnlist, &alnname);
7009 ajTableRemoveKey(alntable, alnitem->Name,
7010 (void**) &alnname);
7011 ajStrDel(&alnname);
7012 seqMsfItemDel(&alnitem);
7013 }
7014
7015 nseq++;
7016 ajFmtPrintS(&seqToken, "%S_%S", name, chain);
7017 AJNEW0(alnitem);
7018 seqitemSetName(alnitem, seqToken);
7019 ajStrAssignS(&alnname, alnitem->Name);
7020 alnitem->Weight = 1.0;
7021 ajTablePut(alntable, alnname, alnitem);
7022 alnname = NULL;
7023 ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
7024 }
7025
7026 while(ajStrTokenNextParse(seqHandle, &seqToken))
7027 if(ajResidueFromTriplet(seqToken,&aa))
7028 seqAppendK(&alnitem->Seq, aa);
7029 }
7030
7031 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7032 }
7033
7034 if(alnitem && !ajStrGetLen(alnitem->Seq))
7035 {
7036 nseq--;
7037 ajListstrPopLast(alnlist, &alnname);
7038 ajTableRemoveKey(alntable, alnitem->Name,
7039 (void**) &alnname);
7040 ajStrDel(&alnname);
7041 seqMsfItemDel(&alnitem);
7042 }
7043
7044 if(!nseq)
7045 {
7046 ajStrDelStatic(&seqToken);
7047 ajStrDel(&name);
7048 ajStrDel(&chain);
7049 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7050 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7051
7052 return ajFalse;
7053 }
7054
7055
7056 ajDebug("PDB Entry has %d sequences\n", nseq);
7057 ajListstrTrace(alnlist);
7058 ajTableTrace(alntable);
7059 ajTableMap(alntable, &seqMsfTabList, NULL);
7060
7061 alndata->Names = AJCALLOC(nseq, sizeof(*alndata->Names));
7062
7063 for(i=0; i < nseq; i++)
7064 {
7065 ajListstrPop(alnlist, &alndata->Names[i]);
7066 ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
7067 }
7068
7069 ajListstrFreeData(&alnlist);
7070
7071 ajTableMap(alntable, &seqMsfTabList, NULL);
7072 alndata->Nseq = nseq;
7073 alndata->Count = 0;
7074 alndata->Bufflines = ajTextinGetRecords(seqin->Input);
7075 ajDebug("PDBSEQ format read %d lines\n",
7076 ajTextinGetRecords(seqin->Input));
7077 }
7078
7079 alndata = seqin->SeqData;
7080
7081 i = alndata->Count;
7082 ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
7083 readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
7084 ajStrAssignS(&thys->Name, alndata->Names[i]);
7085
7086 thys->Weight = readalnitem->Weight;
7087 ajStrAssignS(&thys->Seq, readalnitem->Seq);
7088
7089 alndata->Count++;
7090
7091 ajStrDelStatic(&seqToken);
7092 ajStrDel(&name);
7093 ajStrDel(&chain);
7094
7095 return ajTrue;
7096 }
7097
7098
7099
7100
7101 /* @funcstatic seqReadPdbnuc **************************************************
7102 **
7103 ** Given nucleotide data in a sequence structure,
7104 ** tries to read everything needed using PDB protein databank format
7105 ** using the SEQRES records.
7106 **
7107 ** This is the sequence observed in the structure. See seqReadPdbnucseq
7108 ** for parsing the SEQRES records which give the original sequence.
7109 **
7110 ** @param [w] thys [AjPSeq] Sequence object
7111 ** @param [u] seqin [AjPSeqin] Sequence input object
7112 ** @return [AjBool] ajTrue on success
7113 **
7114 ** @release 6.1.0
7115 ** @@
7116 ******************************************************************************/
7117
seqReadPdbnuc(AjPSeq thys,AjPSeqin seqin)7118 static AjBool seqReadPdbnuc(AjPSeq thys, AjPSeqin seqin)
7119 {
7120 AjPStr name = NULL;
7121 AjPStr alnname = NULL;
7122 AjPStr token = NULL;
7123 AjPStr chain = NULL;
7124 AjPTable alntable = NULL;
7125 SeqPMsfItem alnitem = NULL;
7126 const SeqPMsfItem readalnitem = NULL;
7127 AjPList alnlist = NULL;
7128 SeqPMsfData alndata = NULL;
7129 char aa;
7130 ajuint nseq = 0;
7131 ajuint i;
7132 AjBool ok = ajTrue;
7133 AjPStr aa3 = NULL;
7134 ajuint iaa = 0;
7135 ajuint lastaa = 0;
7136 AjPStr model = NULL;
7137
7138 ajDebug("seqReadPdbnuc seqin->SeqData %x\n", seqin->SeqData);
7139
7140 if(!seqin->SeqData)
7141 { /* start of file */
7142 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7143
7144 ajDebug("first line:\n'%S'\n", seqReadLine);
7145
7146 if(!ajStrPrefixC(seqReadLine, "HEADER "))
7147 {
7148 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7149
7150 return ajFalse;
7151 }
7152
7153 ajStrAssignSubS(&name,seqReadLine, 62, 71);
7154 ajStrTrimWhite(&name);
7155
7156 ajDebug("first line OK name '%S'\n", name);
7157
7158 seqin->SeqData = AJNEW0(alndata);
7159 alndata->Table = alntable = ajTablestrNew(1000);
7160 alnlist = ajListstrNew();
7161 seqin->Input->Filecount = 0;
7162
7163 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7164
7165 while(ok && !ajStrMatchC(seqReadLine, "END"))
7166 {
7167 if(ajStrPrefixC(seqReadLine, "MODEL"))
7168 {
7169 ajStrAssignSubS(&model, seqReadLine, 7, 14);
7170 ajStrTrimWhite(&model);
7171 }
7172 else if(ajStrPrefixC(seqReadLine, "ATOM"))
7173 {
7174 if(!alnitem)
7175 AJNEW0(alnitem);
7176
7177 ajStrKeepRange(&seqReadLine, 0,71);
7178
7179 ajStrAssignSubS(&aa3, seqReadLine, 18, 19);
7180 ajStrAssignSubS(&chain, seqReadLine, 21, 21);
7181 ajStrAssignSubS(&token, seqReadLine, 22, 25);
7182 ajStrToUint(token, &iaa);
7183
7184 if(iaa > lastaa)
7185 {
7186 if(ajBaseFromDoublet(aa3,&aa))
7187 seqAppendK(&alnitem->Seq, aa);
7188
7189 lastaa = iaa;
7190 }
7191
7192 }
7193
7194 else if(ajStrPrefixC(seqReadLine, "TER"))
7195 {
7196 if(!ajStrGetLen(alnitem->Seq))
7197 {
7198 ajDebug("TER seqlen zero\n");
7199 ajTableRemoveKey(alntable, alnitem->Name,
7200 (void**) &alnname);
7201 ajStrDel(&alnname);
7202 seqMsfItemDel(&alnitem);
7203 }
7204 else
7205 {
7206 nseq++;
7207 ajFmtPrintS(&token, "%S_%S", name, chain);
7208
7209 if(ajStrGetLen(model))
7210 ajStrAppendS(&token, model);
7211
7212 seqitemSetName(alnitem, token);
7213 ajStrAssignS(&alnname, alnitem->Name);
7214 alnitem->Weight = 1.0;
7215 ajTablePut(alntable, alnname, alnitem);
7216 alnname = NULL;
7217 ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
7218 alnitem = NULL;
7219 }
7220 lastaa = 0;
7221 }
7222
7223 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7224 }
7225
7226 ajStrDel(&aa3);
7227 ajStrDel(&token);
7228 ajStrDel(&name);
7229 ajStrDel(&chain);
7230 ajStrDel(&model);
7231
7232 if(alnitem && !ajStrGetLen(alnitem->Seq))
7233 {
7234 ajListstrPopLast(alnlist, &alnname);
7235 ajTableRemoveKey(alntable, alnitem->Name,
7236 (void**) &alnname);
7237 ajStrDel(&alnname);
7238 seqMsfItemDel(&alnitem);
7239 }
7240
7241 if(!nseq)
7242 {
7243 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7244 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7245
7246 return ajFalse;
7247 }
7248
7249 ajDebug("PDB Entry has %d sequences\n", nseq);
7250 ajListstrTrace(alnlist);
7251 ajTableTrace(alntable);
7252 ajTableMap(alntable, &seqMsfTabList, NULL);
7253
7254 alndata->Names = AJCALLOC(nseq, sizeof(*alndata->Names));
7255
7256 for(i=0; i < nseq; i++)
7257 {
7258 ajListstrPop(alnlist, &alndata->Names[i]);
7259 ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
7260 }
7261
7262 ajListstrFreeData(&alnlist);
7263
7264 ajTableMap(alntable, &seqMsfTabList, NULL);
7265 alndata->Nseq = nseq;
7266 alndata->Count = 0;
7267 alndata->Bufflines = ajTextinGetRecords(seqin->Input);
7268 ajDebug("PDB format read %d lines\n",
7269 ajTextinGetRecords(seqin->Input));
7270 }
7271
7272 alndata = seqin->SeqData;
7273 alntable = alndata->Table;
7274
7275 if(alndata->Count >= alndata->Nseq)
7276 { /* all done */
7277 ajFilebuffClear(seqin->Input->Filebuff, 0);
7278 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7279
7280 return ajFalse;
7281 }
7282
7283 i = alndata->Count;
7284 ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
7285 readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
7286 ajStrAssignS(&thys->Name, alndata->Names[i]);
7287
7288 thys->Weight = readalnitem->Weight;
7289 ajStrAssignS(&thys->Seq, readalnitem->Seq);
7290
7291 alndata->Count++;
7292
7293 return ajTrue;
7294 }
7295
7296
7297
7298
7299 /* @funcstatic seqReadPdbnucseq ***********************************************
7300 **
7301 ** Given nucleotide data in a sequence structure,
7302 ** tries to read everything needed using PDB protein databank format
7303 ** using the SEQRES records.
7304 **
7305 ** This is the original sequence, see seqReadPdbnuc for parsing the ATOM records
7306 ** which give the sequence observed in the structure.
7307 **
7308 ** @param [w] thys [AjPSeq] Sequence object
7309 ** @param [u] seqin [AjPSeqin] Sequence input object
7310 ** @return [AjBool] ajTrue on success
7311 **
7312 ** @release 6.1.0
7313 ** @@
7314 ******************************************************************************/
7315
seqReadPdbnucseq(AjPSeq thys,AjPSeqin seqin)7316 static AjBool seqReadPdbnucseq(AjPSeq thys, AjPSeqin seqin)
7317 {
7318 AjPStr name = NULL;
7319 AjPStr alnname = NULL;
7320 AjPStr token = NULL;
7321 AjPStr chain = NULL;
7322 AjPTable alntable = NULL;
7323 SeqPMsfItem alnitem = NULL;
7324 const SeqPMsfItem readalnitem = NULL;
7325 AjPList alnlist = NULL;
7326 SeqPMsfData alndata = NULL;
7327 char aa;
7328 ajuint iseq = 0;
7329 ajuint nseq = 0;
7330 ajuint i;
7331 AjBool ok = ajTrue;
7332
7333 ajDebug("seqReadPdbnucseq seqin->SeqData %x\n", seqin->SeqData);
7334
7335 if(!seqin->SeqData)
7336 { /* start of file */
7337 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7338
7339 ajDebug("first line:\n'%S'\n", seqReadLine);
7340
7341 if(!ajStrPrefixC(seqReadLine, "HEADER "))
7342 {
7343 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7344
7345 return ajFalse;
7346 }
7347
7348 ajStrAssignSubS(&name,seqReadLine, 62, 71);
7349 ajStrTrimWhite(&name);
7350
7351 ajDebug("first line OK name '%S'\n", name);
7352
7353 seqin->SeqData = AJNEW0(alndata);
7354 alndata->Table = alntable = ajTablestrNew(1000);
7355 alnlist = ajListstrNew();
7356 seqin->Input->Filecount = 0;
7357
7358 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7359
7360 while(ok && !ajStrMatchC(seqReadLine, "END"))
7361 {
7362 if(ajStrPrefixC(seqReadLine, "SEQRES"))
7363 {
7364 ajStrKeepRange(&seqReadLine, 0,71);
7365 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
7366 ajStrTokenStep(seqHandle); /* 'SEQRES' */
7367
7368 ajStrTokenNextParse(seqHandle, &seqToken); /* number */
7369 ajStrToUint(seqToken, &iseq);
7370
7371 ajStrTokenNextParse(seqHandle, &chain); /* chain letter */
7372
7373 if(iseq == 1)
7374 {
7375 if(alnitem && !ajStrGetLen(alnitem->Seq))
7376 {
7377 nseq--;
7378 ajListstrPopLast(alnlist,&alnname);
7379 ajTableRemoveKey(alntable, alnitem->Name,
7380 (void**) &alnname);
7381 ajStrDel(&alnname);
7382 seqMsfItemDel(&alnitem);
7383 }
7384
7385 nseq++;
7386 ajFmtPrintS(&token, "%S_%S", name, chain);
7387 AJNEW0(alnitem);
7388 seqitemSetName(alnitem, token);
7389 ajStrAssignS(&alnname, alnitem->Name);
7390 alnitem->Weight = 1.0;
7391 ajTablePut(alntable, alnname, alnitem);
7392 alnname = NULL;
7393 ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
7394 }
7395
7396 while(ajStrTokenNextParse(seqHandle, &seqToken))
7397 if(ajBaseFromDoublet(seqToken,&aa))
7398 seqAppendK(&alnitem->Seq, aa);
7399 }
7400
7401 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7402 }
7403
7404 if(alnitem && !ajStrGetLen(alnitem->Seq))
7405 {
7406 nseq--;
7407 ajListstrPopLast(alnlist,&alnname);
7408 ajTableRemoveKey(alntable, alnitem->Name,
7409 (void**) &alnname);
7410 ajStrDel(&alnname);
7411 seqMsfItemDel(&alnitem);
7412 }
7413
7414 if(!nseq)
7415 {
7416 ajStrDel(&token);
7417 ajStrDel(&name);
7418 ajStrDel(&chain);
7419 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7420 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7421
7422 return ajFalse;
7423 }
7424
7425
7426 ajDebug("PDB Entry has %d sequences\n", nseq);
7427 ajListstrTrace(alnlist);
7428 ajTableTrace(alntable);
7429 ajTableMap(alntable, &seqMsfTabList, NULL);
7430
7431 alndata->Names = AJCALLOC(nseq, sizeof(*alndata->Names));
7432
7433 for(i=0; i < nseq; i++)
7434 {
7435 ajListstrPop(alnlist, &alndata->Names[i]);
7436 ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
7437 }
7438
7439 ajListstrFreeData(&alnlist);
7440
7441 ajTableMap(alntable, &seqMsfTabList, NULL);
7442 alndata->Nseq = nseq;
7443 alndata->Count = 0;
7444 alndata->Bufflines = ajTextinGetRecords(seqin->Input);
7445 ajDebug("PDBNUCSEQ format read %d lines\n",
7446 ajTextinGetRecords(seqin->Input));
7447 }
7448
7449 alndata = seqin->SeqData;
7450 alntable = alndata->Table;
7451
7452 if(alndata->Count >= alndata->Nseq)
7453 { /* all done */
7454 ajFilebuffClear(seqin->Input->Filebuff, 0);
7455 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7456
7457 return ajFalse;
7458 }
7459
7460 i = alndata->Count;
7461 ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
7462 readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
7463 ajStrAssignS(&thys->Name, alndata->Names[i]);
7464
7465 thys->Weight = readalnitem->Weight;
7466 ajStrAssignS(&thys->Seq, readalnitem->Seq);
7467
7468 alndata->Count++;
7469
7470 ajStrDel(&token);
7471 ajStrDel(&name);
7472 ajStrDel(&chain);
7473
7474 return ajTrue;
7475 }
7476
7477
7478
7479
7480 /* @funcstatic seqReadClustal *************************************************
7481 **
7482 ** Tries to read input in Clustal ALN format.
7483 **
7484 ** @param [w] thys [AjPSeq] Sequence object
7485 ** @param [u] seqin [AjPSeqin] Sequence input object
7486 ** @return [AjBool] ajTrue on success
7487 **
7488 ** @release 1.0.0
7489 ** @@
7490 ******************************************************************************/
7491
seqReadClustal(AjPSeq thys,AjPSeqin seqin)7492 static AjBool seqReadClustal(AjPSeq thys, AjPSeqin seqin)
7493 {
7494 AjPStr seqstr = NULL;
7495 AjPStr name = NULL;
7496 AjBool ok = ajFalse;
7497 ajuint iseq = 0;
7498 AjPTable alntable = NULL;
7499 SeqPMsfItem alnitem = NULL;
7500 const SeqPMsfItem readalnitem = NULL;
7501 AjPList alnlist = NULL;
7502 SeqPMsfData alndata = NULL;
7503
7504 ajuint i;
7505
7506 ajDebug("seqReadClustal seqin->SeqData %x\n", seqin->SeqData);
7507
7508 if(!seqin->SeqData)
7509 { /* start of file */
7510 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7511
7512 if(!ok)
7513 return ajFalse;
7514
7515 ajDebug("first line:\n'%S'\n", seqReadLine);
7516
7517 if(!ajStrPrefixC(seqReadLine, "CLUSTAL"))
7518 {
7519 /* first line test */
7520 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7521
7522 return ajFalse;
7523 }
7524
7525 ajDebug("first line OK: '%S'\n", seqReadLine);
7526
7527 while(ok)
7528 { /* skip blank lines */
7529 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7530 if(ok && !ajStrIsWhite(seqReadLine))
7531 break;
7532 }
7533
7534 if(!ok)
7535 {
7536 ajDebug("FAIL (blank lines only)\n");
7537 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7538
7539 return ajFalse;
7540 }
7541
7542 seqin->SeqData = AJNEW0(alndata);
7543 alndata->Table = alntable = ajTablestrNew(1000);
7544 alnlist = ajListstrNew();
7545 seqin->Input->Filecount = 0;
7546
7547 /* first set - create table */
7548 ok = ajTrue;
7549
7550 while(ok && ajStrExtractFirst(seqReadLine, &seqstr, &name))
7551 {
7552 AJNEW0(alnitem);
7553 ajStrAssignS(&alnitem->Name, name);
7554 alnitem->Weight = 1.0;
7555 seqAppend(&alnitem->Seq, seqstr);
7556
7557 iseq++;
7558 ajDebug("first set %d: '%S'\n line: '%S'\n",
7559 iseq, name, seqReadLine);
7560
7561 ajTablePut(alntable, name, alnitem);
7562 name = NULL;
7563 ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
7564
7565 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7566 }
7567
7568 ajStrDel(&seqstr);
7569
7570 ajDebug("Header has %d sequences\n", iseq);
7571 ajListstrTrace(alnlist);
7572 ajTableTrace(alntable);
7573 ajTableMap(alntable, &seqMsfTabList, NULL);
7574
7575 alndata->Names = AJCALLOC(iseq, sizeof(*alndata->Names));
7576
7577 for(i=0; i < iseq; i++)
7578 {
7579 ajListstrPop(alnlist, &alndata->Names[i]);
7580 ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
7581 }
7582
7583 ajListstrFreeData(&alnlist);
7584
7585 while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
7586 { /* now read the rest */
7587 seqClustalReadseq(seqReadLine, alntable);
7588 }
7589
7590 ajTableMap(alntable, &seqMsfTabList, NULL);
7591 alndata->Nseq = iseq;
7592 alndata->Count = 0;
7593 alndata->Bufflines = ajTextinGetRecords(seqin->Input);
7594 ajDebug("ALN format read %d lines\n",
7595 ajTextinGetRecords(seqin->Input));
7596 }
7597
7598 alndata = seqin->SeqData;
7599 alntable = alndata->Table;
7600
7601 if(alndata->Count >= alndata->Nseq)
7602 { /* all done */
7603 ajFilebuffClear(seqin->Input->Filebuff, 0);
7604 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7605
7606 return ajFalse;
7607 }
7608
7609 i = alndata->Count;
7610 ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
7611 readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
7612 ajStrAssignS(&thys->Name, alndata->Names[i]);
7613
7614 thys->Weight = readalnitem->Weight;
7615 ajStrAssignS(&thys->Seq, readalnitem->Seq);
7616
7617 alndata->Count++;
7618
7619 return ajTrue;
7620 }
7621
7622
7623
7624
7625 /* @funcstatic seqClustalReadseq **********************************************
7626 **
7627 ** Reads sequence name from first token on the input line, and appends
7628 ** the sequence data to that sequence in the alntable structure.
7629 **
7630 ** @param [r] rdline [const AjPStr] Line from input file.
7631 ** @param [r] msftable [const AjPTable] MSF format sequence table.
7632 ** @return [AjBool] ajTrue on success
7633 **
7634 ** @release 1.0.0
7635 ** @@
7636 ******************************************************************************/
7637
seqClustalReadseq(const AjPStr rdline,const AjPTable msftable)7638 static AjBool seqClustalReadseq(const AjPStr rdline, const AjPTable msftable)
7639 {
7640 SeqPMsfItem msfitem;
7641 AjPStr token = NULL;
7642 AjPStr seqstr = NULL;
7643
7644 if(!ajStrExtractFirst(rdline, &seqstr, &token))
7645 return ajFalse;
7646
7647 msfitem = ajTableFetchmodS(msftable, token);
7648 ajStrDel(&token);
7649
7650 if(!msfitem)
7651 {
7652 ajStrDel(&seqstr);
7653
7654 return ajFalse;
7655 }
7656
7657 seqAppend(&msfitem->Seq, seqstr);
7658 ajStrDel(&seqstr);
7659
7660 return ajTrue;
7661 }
7662
7663
7664
7665
7666 /* @funcstatic seqReadPhylipnon ***********************************************
7667 **
7668 ** Tries to read input in Phylip non-interleaved format.
7669 **
7670 ** @param [w] thys [AjPSeq] Sequence object
7671 ** @param [u] seqin [AjPSeqin] Sequence input object
7672 ** @return [AjBool] ajTrue on success
7673 **
7674 ** @release 3.0.0
7675 ** @@
7676 ******************************************************************************/
7677
seqReadPhylipnon(AjPSeq thys,AjPSeqin seqin)7678 static AjBool seqReadPhylipnon(AjPSeq thys, AjPSeqin seqin)
7679 {
7680 AjPStr seqstr = NULL;
7681 AjPStr tmpstr = NULL;
7682 AjBool ok = ajFalse;
7683 ajuint iseq = 0;
7684 ajuint jseq = 0;
7685 ajuint len = 0;
7686 ajuint ilen = 0;
7687
7688 AjPTable phytable = NULL;
7689 SeqPMsfItem phyitem = NULL;
7690 const SeqPMsfItem readphyitem = NULL;
7691 SeqPMsfData phydata = NULL;
7692 ajuint i;
7693 AjBool done = ajFalse;
7694
7695 ajDebug("seqReadPhylipnon seqin->SeqData %x\n", seqin->SeqData);
7696
7697 if(!seqRegPhylipTop)
7698 seqRegPhylipTop = ajRegCompC("^ *([0-9]+) +([0-9]+)");
7699
7700 if(!seqRegPhylipHead)
7701 seqRegPhylipHead = ajRegCompC("^(..........) ?"); /* 10 chars */
7702
7703 if(!seqin->SeqData)
7704 { /* start of file */
7705 seqin->Multidone = ajFalse;
7706 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7707 if(!ok)
7708 return ajFalse;
7709
7710 ajDebug("first line:\n'%-20.20S'\n", seqReadLine);
7711
7712 if(!ajRegExec(seqRegPhylipTop, seqReadLine))
7713 { /* first line test */
7714 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7715
7716 return ajFalse;
7717 }
7718
7719 ajRegSubI(seqRegPhylipTop, 1, &tmpstr);
7720 ajStrToUint(tmpstr, &iseq);
7721 ajDebug("seqRegPhylipTop1 '%S' %d\n", tmpstr, iseq);
7722 ajRegSubI(seqRegPhylipTop, 2, &tmpstr);
7723 ajStrToUint(tmpstr, &len);
7724 ajDebug("seqRegPhylipTop2 '%S' %d\n", tmpstr,len);
7725 ajDebug("first line OK: '%S' iseq: %d len: %d\n",
7726 seqReadLine, iseq, len);
7727 ajStrDel(&tmpstr);
7728
7729 seqin->SeqData = AJNEW0(phydata);
7730 phydata->Table = phytable = ajTablestrNew(1000);
7731 phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
7732 seqin->Input->Filecount = 0;
7733
7734 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7735 ilen = 0;
7736
7737 while(ok && (jseq < iseq))
7738 {
7739 /* first set - create table */
7740 if(!ajRegExec(seqRegPhylipHead, seqReadLine))
7741 {
7742 ajDebug("FAIL (not seqRegPhylipHead): '%S'\n", seqReadLine);
7743 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7744 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7745
7746 return ajFalse;
7747 }
7748
7749 ajDebug("line: '%S'\n", seqReadLine);
7750 ajRegSubI(seqRegPhylipHead, 1, &tmpstr);
7751
7752 if(!ajStrIsWhite(tmpstr))
7753 {
7754 /* check previous sequence */
7755 if(jseq)
7756 {
7757 if(ilen != len)
7758 {
7759 ajDebug("phylipnon format length mismatch at %d "
7760 "(length %d)\n",
7761 len, ilen);
7762 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7763 ajStrDel(&tmpstr);
7764 return ajFalse;
7765 }
7766 }
7767
7768 /* new sequence */
7769 AJNEW0(phyitem);
7770 seqitemSetName(phyitem, tmpstr);
7771 ajStrAssignS(&phydata->Names[jseq], phyitem->Name);
7772 ajDebug("name: '%S' => '%S'\n", tmpstr, phyitem->Name);
7773 phyitem->Weight = 1.0;
7774 ajRegPost(seqRegPhylipHead, &seqstr);
7775 seqAppend(&phyitem->Seq, seqstr);
7776 ajStrDel(&seqstr);
7777 ilen = ajStrGetLen(phyitem->Seq);
7778
7779 if(ilen == len)
7780 done = ajTrue;
7781 else if(ilen > len)
7782 {
7783 ajDebug("Phylipnon format: sequence %S "
7784 "header size %d exceeded\n",
7785 phyitem->Name, len);
7786 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7787 seqMsfItemDel(&phyitem);
7788 ajStrDel(&tmpstr);
7789
7790 return ajFalse;
7791 }
7792
7793 ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
7794 ajDebug("seq %d: (%d) '%-20.20S'\n", jseq, ilen, seqReadLine);
7795 }
7796 else
7797 {
7798 /* more sequence to append */
7799 if(seqPhylipReadseq(seqReadLine, phytable, phyitem->Name,
7800 len, &ilen, &done))
7801 {
7802 ajDebug("read to len %d\n", ilen);
7803
7804 if (done)
7805 jseq++;
7806 }
7807
7808 }
7809 ajStrDel(&tmpstr);
7810
7811 if(jseq < iseq)
7812 {
7813 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7814 }
7815 }
7816
7817 if(ilen != len)
7818 {
7819 ajDebug("phylipnon format final length mismatch at %d "
7820 "(length %d)\n",
7821 len, ilen);
7822 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7823
7824 return ajFalse;
7825 }
7826
7827 ajDebug("Header has %d sequences\n", jseq);
7828 ajTableTrace(phytable);
7829 ajTableMap(phytable, &seqMsfTabList, NULL);
7830
7831 phydata->Nseq = iseq;
7832 phydata->Count = 0;
7833 phydata->Bufflines = ajTextinGetRecords(seqin->Input);
7834 ajDebug("PHYLIP format read %d lines\n",
7835 ajTextinGetRecords(seqin->Input));
7836 }
7837
7838 phydata = seqin->SeqData;
7839 phytable = phydata->Table;
7840
7841 i = phydata->Count;
7842 ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
7843 readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
7844 ajStrAssignS(&thys->Name, phydata->Names[i]);
7845 ajStrDel(&phydata->Names[i]);
7846
7847 thys->Weight = readphyitem->Weight;
7848 ajStrAssignS(&thys->Seq, readphyitem->Seq);
7849
7850 phydata->Count++;
7851
7852 if(phydata->Count >= phydata->Nseq)
7853 {
7854 seqin->Multidone = ajTrue;
7855 ajFilebuffClear(seqin->Input->Filebuff, 0);
7856 ajDebug("seqReadPhylip multidone\n");
7857 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7858 }
7859
7860 ajStrDel(&seqstr);
7861 ajStrDel(&tmpstr);
7862 seqMsfDataTrace(seqin->SeqData);
7863
7864 return ajTrue;
7865 }
7866
7867
7868
7869
7870 /* @funcstatic seqReadPhylip **************************************************
7871 **
7872 ** Tries to read input in Phylip interleaved format.
7873 **
7874 ** @param [w] thys [AjPSeq] Sequence object
7875 ** @param [u] seqin [AjPSeqin] Sequence input object
7876 ** @return [AjBool] ajTrue on success
7877 **
7878 ** @release 1.0.0
7879 ** @@
7880 ******************************************************************************/
7881
seqReadPhylip(AjPSeq thys,AjPSeqin seqin)7882 static AjBool seqReadPhylip(AjPSeq thys, AjPSeqin seqin)
7883 {
7884 AjPStr seqstr = NULL;
7885 AjPStr tmpstr = NULL;
7886 AjBool ok = ajFalse;
7887 ajuint iseq = 0;
7888 ajuint jseq = 0;
7889 ajuint len = 0;
7890 ajuint ilen = 0;
7891 ajuint maxlen = 0;
7892 AjPFilebuff buff;
7893
7894 AjPTable phytable = NULL;
7895 SeqPMsfItem phyitem = NULL;
7896 const SeqPMsfItem readphyitem = NULL;
7897 AjPList phylist = NULL;
7898 SeqPMsfData phydata = NULL;
7899 ajuint i;
7900 AjBool done = ajFalse;
7901
7902 ajDebug("seqReadPhylip seqin->SeqData %x\n", seqin->SeqData);
7903
7904 buff = seqin->Input->Filebuff;
7905 ajFilebuffSetBuffered(buff); /* must buffer to test non-interleaved */
7906
7907 if(!seqRegPhylipTop)
7908 seqRegPhylipTop = ajRegCompC("^ *([0-9]+) +([0-9]+)");
7909
7910 if(!seqRegPhylipHead)
7911 seqRegPhylipHead = ajRegCompC("^(..........) ?"); /* 10 chars */
7912
7913 if(!seqRegPhylipSeq)
7914 seqRegPhylipSeq = ajRegCompC("^[ \t\n\r]*$");
7915
7916 if(!seqin->SeqData)
7917 { /* start of file */
7918 seqin->Multidone = ajFalse;
7919 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7920 while (ok && ajStrIsWhite(seqReadLine))
7921 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7922
7923 if(!ok)
7924 return ajFalse;
7925
7926 /* ajDebug("first line:\n'%-20.20S'\n", seqReadLine);*/
7927
7928 if(!ajRegExec(seqRegPhylipTop, seqReadLine))
7929 { /* first line test */
7930 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7931
7932 return ajFalse;
7933 }
7934
7935 ajRegSubI(seqRegPhylipTop, 1, &tmpstr);
7936 ajStrToUint(tmpstr, &iseq);
7937 ajRegSubI(seqRegPhylipTop, 2, &tmpstr);
7938 ajStrToUint(tmpstr, &len);
7939 ajStrDel(&tmpstr);
7940 /*ajDebug("first line OK: '%S' iseq: %d len: %d\n",
7941 seqReadLine, iseq, len);*/
7942
7943 seqin->SeqData = AJNEW0(phydata);
7944 phydata->Table = phytable = ajTablestrNew(1000);
7945 phylist = ajListstrNew();
7946 seqin->Input->Filecount = 0;
7947
7948 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7949 ilen = 0;
7950
7951 while(ok && (jseq < iseq))
7952 {
7953 /* first set - create table */
7954 if(!ajRegExec(seqRegPhylipHead, seqReadLine))
7955 {
7956 ajDebug("FAIL (not seqRegPhylipHead): '%S'\n", seqReadLine);
7957 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7958 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7959
7960 return ajFalse;
7961 }
7962
7963 /* ajDebug("line: '%S'\n", seqReadLine); */
7964 AJNEW0(phyitem);
7965 ajRegSubI(seqRegPhylipHead, 1, &tmpstr);
7966 seqitemSetName(phyitem, tmpstr);
7967 ajStrDel(&tmpstr);
7968 /* ajDebug("name: '%S' => '%S'\n", tmpstr, phyitem->Name); */
7969 phyitem->Weight = 1.0;
7970 ajRegPost(seqRegPhylipHead, &seqstr);
7971 seqAppend(&phyitem->Seq, seqstr);
7972 ajStrDel(&seqstr);
7973 ilen = ajStrGetLen(phyitem->Seq);
7974
7975 if(ilen == len)
7976 done = ajTrue;
7977 else if(ilen > len)
7978 {
7979 ajDebug("Phylip format: sequence %S header size %d exceeded\n",
7980 phyitem->Name, len);
7981 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7982
7983 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7984 seqMsfItemDel(&phyitem);
7985
7986 ajListstrFreeData(&phylist);
7987
7988 return ajFalse;
7989 }
7990
7991 if(ajStrIsWhite(phyitem->Name) ||
7992 ajTableFetchS(phytable, phyitem->Name))
7993 {
7994 ajFilebuffSetBuffered(buff);
7995 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7996 ajDebug("phytable repeated name '%S'\n",
7997 phyitem->Name);
7998
7999 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8000 seqMsfItemDel(&phyitem);
8001
8002 ajListstrFreeData(&phylist);
8003
8004 return seqReadPhylipnon(thys, seqin);
8005 }
8006
8007 ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
8008 ajListstrPushAppend(phylist, ajStrNewS(phyitem->Name));
8009 ajDebug("added '%S' list:%Lu table:%Lu\n",
8010 phyitem->Name, ajListGetLength(phylist),
8011 ajTableGetLength(phytable));
8012
8013 if(!jseq)
8014 maxlen = ilen;
8015 else
8016 {
8017 if(ilen != maxlen)
8018 {
8019 ajDebug("phylip format length mismatch in header "
8020 "iseq: %d jseq: %d ilen: %d maxlen: %d\n",
8021 iseq, jseq, ilen, maxlen);
8022 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8023 ajDebug("phytable deleted size:%Lu\n",
8024 ajTableGetLength(phytable));
8025 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8026 ajListstrFreeData(&phylist);
8027
8028 if(seqReadPhylipnon(thys, seqin))
8029 return ajTrue;
8030 else
8031 {
8032 ajWarn("phylip format length mismatch in header");
8033
8034 return ajFalse;
8035 }
8036 }
8037 }
8038
8039 jseq++;
8040 /* ajDebug("first set %d: (%d) '%-20.20S'\n",
8041 jseq, ilen, seqReadLine); */
8042
8043 if(jseq < iseq)
8044 {
8045 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8046 }
8047 }
8048
8049 /* ajDebug("Header has %d sequences\n", jseq);*/
8050 ajListstrTrace(phylist);
8051 ajTableTrace(phytable);
8052 ajTableMap(phytable, &seqMsfTabList, NULL);
8053
8054 phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
8055
8056 for(i=0; i < iseq; i++)
8057 {
8058 ajListstrPop(phylist, &phydata->Names[i]);
8059 /* ajDebug("list [%d] '%S'\n", i, phydata->Names[i]); */
8060 }
8061
8062 ajListstrFreeData(&phylist);
8063
8064 if(ilen < len)
8065 {
8066 jseq=0;
8067
8068 while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
8069 { /* now read the rest */
8070 /* ajDebug("seqReadPhylip line '%S\n", seqReadLine); */
8071
8072 if(seqPhylipReadseq(seqReadLine, phytable,
8073 phydata->Names[jseq],
8074 len, &ilen, &done))
8075 {
8076 if(!jseq)
8077 maxlen = ilen;
8078 else
8079 {
8080 if(ilen != maxlen)
8081 {
8082 ajDebug("phylip format length mismatch at %d "
8083 "(length %d)\n",
8084 maxlen, ilen);
8085 ajFilebuffSetBuffered(buff);
8086 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8087 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8088 ajDebug("File reset, try seqReadPhylipnon\n");
8089
8090 return seqReadPhylipnon(thys, seqin);
8091 }
8092 }
8093
8094 jseq++;
8095
8096 if(jseq == iseq)
8097 jseq = 0;
8098
8099 if(!jseq && done)
8100 {
8101 /* ajDebug("seqReadPhylip set done\n"); */
8102 break;
8103 }
8104 done = ajTrue; /* for end-of-file */
8105 }
8106 }
8107
8108 if(!done)
8109 {
8110 ajDebug("seqReadPhylip read failed, try seqReadPhylipnon\n");
8111 ajFilebuffSetBuffered(buff);
8112 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8113 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8114
8115 return seqReadPhylipnon(thys, seqin);
8116 }
8117
8118 if(jseq)
8119 {
8120 ajDebug("Phylip format %d sequences partly read at end\n",
8121 iseq-jseq);
8122 ajFilebuffSetBuffered(buff);
8123 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8124 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8125
8126 return seqReadPhylipnon(thys, seqin);
8127 }
8128 }
8129
8130 ajTableMap(phytable, &seqMsfTabList, NULL);
8131 phydata->Nseq = iseq;
8132 phydata->Count = 0;
8133 phydata->Bufflines = ajTextinGetRecords(seqin->Input);
8134 /* ajDebug("PHYLIP format read %d lines\n",
8135 ajTextinGetRecords(seqin->Input));*/
8136 }
8137
8138 phydata = seqin->SeqData;
8139 phytable = phydata->Table;
8140
8141 i = phydata->Count;
8142 /* ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]); */
8143 readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
8144 ajStrAssignS(&thys->Name, phydata->Names[i]);
8145
8146 thys->Weight = readphyitem->Weight;
8147 ajStrAssignS(&thys->Seq, readphyitem->Seq);
8148
8149 phydata->Count++;
8150
8151 if(phydata->Count >= phydata->Nseq)
8152 {
8153 seqin->Multidone = ajTrue;
8154 ajDebug("seqReadPhylip multidone\n");
8155 ajFilebuffClear(seqin->Input->Filebuff, 0);
8156 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8157 }
8158
8159 seqMsfDataTrace(seqin->SeqData);
8160
8161 return ajTrue;
8162 }
8163
8164
8165
8166
8167 /* @funcstatic seqPhylipReadseq ***********************************************
8168 **
8169 ** Reads sequence from the input line, and appends the sequence data
8170 ** to the named sequence in the phytable structure.
8171 **
8172 ** @param [r] rdline [const AjPStr] Line from input file.
8173 ** @param [r] phytable [const AjPTable] MSF format sequence table.
8174 ** @param [r] token [const AjPStr] Name of sequence so it can append
8175 ** @param [r] len [ajuint] Final length of each sequence (from file header)
8176 ** @param [w] ilen [ajuint*] Length of each sequence so far
8177 ** @param [w] done [AjBool*] ajTrue if sequence was completed
8178 ** @return [AjBool] ajTrue on success
8179 **
8180 ** @release 1.0.0
8181 ** @@
8182 ******************************************************************************/
8183
seqPhylipReadseq(const AjPStr rdline,const AjPTable phytable,const AjPStr token,ajuint len,ajuint * ilen,AjBool * done)8184 static AjBool seqPhylipReadseq(const AjPStr rdline, const AjPTable phytable,
8185 const AjPStr token,
8186 ajuint len, ajuint* ilen, AjBool* done)
8187 {
8188 SeqPMsfItem phyitem;
8189
8190 *done = ajFalse;
8191
8192 if(!seqRegPhylipSeq2)
8193 seqRegPhylipSeq2 = ajRegCompC("[^ \t\n\r]");
8194
8195 if(!ajRegExec(seqRegPhylipSeq2, rdline))
8196 return ajFalse;
8197
8198 phyitem = ajTableFetchmodS(phytable, token);
8199
8200 if(!phyitem)
8201 {
8202 ajDebug("seqPhylipReadseq failed to find '%S' in phytable\n",
8203 token);
8204
8205 return ajFalse;
8206 }
8207
8208 seqAppend(&phyitem->Seq, rdline);
8209 *ilen = ajStrGetLen(phyitem->Seq);
8210
8211 if(*ilen == len)
8212 *done = ajTrue;
8213 else if(*ilen > len)
8214 {
8215 ajDebug("Phylip format error, sequence %S length %d exceeded\n",
8216 token, len);
8217
8218 return ajFalse;
8219 }
8220
8221 ajDebug("seqPhylipReadSeq '%S' len: %d ilen: %d done: %B\n",
8222 token, len, *ilen, *done);
8223
8224 return ajTrue;
8225 }
8226
8227
8228
8229
8230 /* @funcstatic seqReadHennig86 ************************************************
8231 **
8232 ** Tries to read input in Hennig86 format.
8233 **
8234 ** @param [w] thys [AjPSeq] Sequence object
8235 ** @param [u] seqin [AjPSeqin] Sequence input object
8236 ** @return [AjBool] ajTrue on success
8237 **
8238 ** @release 1.0.0
8239 ** @@
8240 ******************************************************************************/
8241
seqReadHennig86(AjPSeq thys,AjPSeqin seqin)8242 static AjBool seqReadHennig86(AjPSeq thys, AjPSeqin seqin)
8243 {
8244 AjPStr seqstr = NULL;
8245 AjPStr tmpstr = NULL;
8246 AjBool ok = ajFalse;
8247 ajuint iseq = 0;
8248 ajuint len = 0;
8249 AjPTable fmttable = NULL;
8250 SeqPMsfItem fmtitem = NULL;
8251 const SeqPMsfItem readfmtitem = NULL;
8252 AjPList fmtlist = NULL;
8253 SeqPMsfData fmtdata = NULL;
8254 char *cp;
8255
8256 ajuint i;
8257 ajuint jseq = 0;
8258
8259 ajDebug("seqReadHennig86 seqin->SeqData %x\n", seqin->SeqData);
8260
8261 if(!seqRegHennigHead)
8262 seqRegHennigHead = ajRegCompC("[^1-4? \t]");
8263
8264 if(!seqRegHennigTop)
8265 seqRegHennigTop = ajRegCompC("^ *([0-9]+) +([0-9]+)");
8266
8267 if(!seqRegHennigBlank)
8268 seqRegHennigBlank = ajRegCompC("^[ \t\n\r]*$");
8269
8270 if(!seqRegHennigSeq)
8271 seqRegHennigSeq = ajRegCompC("^([^ \t\n\r]+)");
8272
8273 if(!seqin->SeqData)
8274 {
8275 /* start: load in file */
8276 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8277 if(!ok)
8278 return ajFalse;
8279
8280 ajDebug("first line:\n'%S'\n", seqReadLine);
8281
8282 if(!ajStrPrefixC(seqReadLine, "xread"))
8283 {
8284 /* first line test */
8285 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8286
8287 return ajFalse;
8288 }
8289
8290 ajDebug("first line OK: '%S'\n", seqReadLine);
8291
8292 /* skip title line */
8293 for(i=0; i<2; i++)
8294 {
8295 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8296 if(!ok)
8297 {
8298 ajDebug("FAIL (bad header)\n");
8299 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8300
8301 return ajFalse;
8302 }
8303 }
8304
8305 if(!ajRegExec(seqRegHennigTop, seqReadLine)) /* first line test */
8306 return ajFalse;
8307
8308 ajRegSubI(seqRegHennigTop, 1, &tmpstr);
8309 ajStrToUint(tmpstr, &iseq);
8310 ajRegSubI(seqRegHennigTop, 2, &tmpstr);
8311 ajStrToUint(tmpstr, &len);
8312 ajDebug("first line OK: '%S' iseq: %d len: %d\n",
8313 seqReadLine, iseq, len);
8314 ajStrDel(&tmpstr);
8315
8316 seqin->SeqData = AJNEW0(fmtdata);
8317 fmtdata->Table = fmttable = ajTablestrNew(1000);
8318 fmtlist = ajListstrNew();
8319 seqin->Input->Filecount = 0;
8320
8321 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8322
8323 while(ok && (jseq < iseq))
8324 { /* first set - create table */
8325 if(!ajRegExec(seqRegHennigHead, seqReadLine))
8326 {
8327 ajDebug("FAIL (not seqRegHennigHead): '%S'\n", seqReadLine);
8328
8329 return ajFalse;
8330 }
8331
8332 AJNEW0(fmtitem);
8333 ajStrAssignS(&fmtitem->Name, seqReadLine);
8334 fmtitem->Weight = 1.0;
8335 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8336 while(ok && ajRegExec(seqRegHennigSeq, seqReadLine))
8337 {
8338 ajRegPost(seqRegHennigSeq, &seqstr);
8339
8340 for(cp = ajStrGetuniquePtr(&seqstr); cp; cp++)
8341 switch(*cp)
8342 {
8343 case 0: *cp = 'A';break;
8344 case 1: *cp = 'T';break;
8345 case 2: *cp = 'G';break;
8346 case 3: *cp = 'C';break;
8347 default: *cp = '.';break;
8348 }
8349
8350 seqAppend(&fmtitem->Seq, seqstr);
8351 }
8352
8353 ajStrDel(&seqstr);
8354
8355 ajTablePut(fmttable, ajStrNewS(fmtitem->Name), fmtitem);
8356 ajListstrPushAppend(fmtlist, ajStrNewS(fmtitem->Name));
8357 jseq++;
8358 ajDebug("first set %d: '%S'\n", jseq, seqReadLine);
8359
8360 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8361 }
8362
8363 ajDebug("Header has %d sequences\n", iseq);
8364 ajListstrTrace(fmtlist);
8365 ajTableTrace(fmttable);
8366 ajTableMap(fmttable, &seqMsfTabList, NULL);
8367
8368 fmtdata->Names = AJCALLOC(iseq, sizeof(*fmtdata->Names));
8369
8370 for(i=0; i < iseq; i++)
8371 {
8372 ajListstrPop(fmtlist, &fmtdata->Names[i]);
8373 ajDebug("list [%d] '%S'\n", i, fmtdata->Names[i]);
8374 }
8375
8376 ajListstrFreeData(&fmtlist);
8377
8378 while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
8379 { /* now read the rest */
8380 seqHennig86Readseq(seqReadLine, fmttable);
8381 }
8382
8383 ajTableMap(fmttable, &seqMsfTabList, NULL);
8384 fmtdata->Nseq = iseq;
8385 fmtdata->Count = 0;
8386 fmtdata->Bufflines = ajTextinGetRecords(seqin->Input);
8387 ajDebug("... format read %d lines\n",
8388 ajTextinGetRecords(seqin->Input));
8389 }
8390
8391 /* processing entries */
8392
8393 fmtdata = seqin->SeqData;
8394 fmttable = fmtdata->Table;
8395
8396 if(fmtdata->Count >=fmtdata->Nseq)
8397 { /* all done */
8398 ajFilebuffClear(seqin->Input->Filebuff, 0);
8399 ajTableMapDel(fmttable, &seqMsfTabDel, NULL);
8400 ajTableFree(&fmttable);
8401 AJFREE(fmtdata->Names);
8402 AJFREE(fmtdata);
8403 seqin->SeqData = NULL;
8404
8405 return ajFalse;
8406 }
8407
8408 i = fmtdata->Count;
8409 ajDebug("returning [%d] '%S'\n", i, fmtdata->Names[i]);
8410 readfmtitem = ajTableFetchS(fmttable, fmtdata->Names[i]);
8411 ajStrAssignS(&thys->Name, fmtdata->Names[i]);
8412
8413 thys->Weight = readfmtitem->Weight;
8414 ajStrAssignS(&thys->Seq, readfmtitem->Seq);
8415
8416 fmtdata->Count++;
8417
8418 return ajTrue;
8419 }
8420
8421
8422
8423
8424 /* @funcstatic seqHennig86Readseq *********************************************
8425 **
8426 ** Reads sequence name from first token on the input line, and appends
8427 ** the sequence data to that sequence in the fmttable structure.
8428 **
8429 ** @param [r] rdline [const AjPStr] Line from input file.
8430 ** @param [r] msftable [const AjPTable] MSF format sequence table.
8431 ** @return [AjBool] ajTrue on success
8432 **
8433 ** @release 1.0.0
8434 ** @@
8435 ******************************************************************************/
8436
seqHennig86Readseq(const AjPStr rdline,const AjPTable msftable)8437 static AjBool seqHennig86Readseq(const AjPStr rdline, const AjPTable msftable)
8438 {
8439 SeqPMsfItem msfitem;
8440 AjPStr token = NULL;
8441 AjPStr seqstr = NULL;
8442
8443 if(!seqRegHennigSeq)
8444 seqRegHennigSeq = ajRegCompC("^[^ \t\n\r]+"); /* must be line start */
8445
8446 if(!ajRegExec(seqRegHennigSeq, rdline))
8447 return ajFalse;
8448
8449 ajRegSubI(seqRegHennigSeq, 0, &token);
8450 msfitem = ajTableFetchmodS(msftable, token);
8451 ajStrDel(&token);
8452
8453 if(!msfitem)
8454 return ajFalse;
8455
8456 ajRegPost(seqRegHennigSeq, &seqstr);
8457 seqAppend(&msfitem->Seq, seqstr);
8458
8459 ajStrDel(&seqstr);
8460
8461 return ajTrue;
8462 }
8463
8464
8465
8466
8467 /* @funcstatic seqReadTreecon *************************************************
8468 **
8469 ** Tries to read input in Treecon format.
8470 **
8471 ** Treecon is a windows program for tree drawing.
8472 **
8473 ** Van de Peer, Y., De Wachter, R. (1994)
8474 ** TREECON for Windows: a software package for the construction and
8475 ** drawing of evolutionary trees for the Microsoft Windows environment.
8476 ** Comput. Applic. Biosci. 10, 569-570.
8477 **
8478 ** @param [w] thys [AjPSeq] Sequence object
8479 ** @param [u] seqin [AjPSeqin] Sequence input object
8480 ** @return [AjBool] ajTrue on success
8481 **
8482 ** @release 2.0.0
8483 ** @@
8484 ******************************************************************************/
8485
seqReadTreecon(AjPSeq thys,AjPSeqin seqin)8486 static AjBool seqReadTreecon(AjPSeq thys, AjPSeqin seqin)
8487 {
8488 AjPStr tmpstr = NULL;
8489 AjBool ok = ajFalse;
8490 ajint len = 0;
8491 ajint ilen = 0;
8492 ajuint iseq;
8493 ajuint i;
8494
8495 AjPTable phytable = NULL;
8496 SeqPMsfItem phyitem = NULL;
8497 const SeqPMsfItem readphyitem = NULL;
8498 AjPList phylist = NULL;
8499 SeqPMsfData phydata = NULL;
8500
8501 if(!seqRegTreeconTop)
8502 seqRegTreeconTop = ajRegCompC("^ *([0-9]+)");
8503
8504 if(!seqin->SeqData) /* first time - read the data */
8505 {
8506 iseq = 0;
8507 seqin->Multidone = ajFalse;
8508 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8509 if(!ok)
8510 return ajFalse;
8511
8512 if(!ajRegExec(seqRegTreeconTop, seqReadLine))
8513 { /* first line test */
8514 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8515
8516 return ajFalse;
8517 }
8518
8519 ajRegSubI(seqRegTreeconTop, 1, &tmpstr);
8520 ajStrToInt(tmpstr, &len);
8521 ajDebug("first line OK: len: %d\n",
8522 len);
8523 ajStrDel(&tmpstr);
8524
8525 seqin->SeqData = AJNEW0(phydata);
8526 phydata->Table = phytable = ajTablestrNew(1000);
8527 phylist = ajListstrNew();
8528 seqin->Input->Filecount = 0;
8529
8530 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8531 ilen = UINT_MAX;
8532
8533 if(!ok)
8534 {
8535 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8536 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8537
8538 return ajFalse;
8539 }
8540
8541 while (ok)
8542 {
8543 if (ilen < 0)
8544 {
8545 ajStrRemoveWhiteExcess(&seqReadLine);
8546
8547 if (!ajStrGetLen(seqReadLine)) /* empty line after sequence */
8548 {
8549 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8550 continue;
8551 }
8552
8553 AJNEW0(phyitem);
8554 phyitem->Weight = 1.0;
8555 seqitemSetName(phyitem, seqReadLine);
8556 ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
8557 ajListstrPushAppend(phylist, ajStrNewS(phyitem->Name));
8558 iseq++;
8559 ilen = 0;
8560 }
8561 else
8562 {
8563 ajStrRemoveWhite(&seqReadLine);
8564 ilen += ajStrGetLen(seqReadLine);
8565 seqAppend(&phyitem->Seq, seqReadLine);
8566
8567 if (ilen > len)
8568 {
8569 ajDebug("Treecon format: '%S' too long, read %d/%d\n",
8570 phyitem->Name, ilen, len);
8571 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8572 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8573
8574 return ajFalse;
8575 }
8576
8577 if (ilen == len)
8578 ilen = -1;
8579 }
8580
8581 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8582 }
8583
8584 if (ilen >= 0)
8585 {
8586 ajDebug("Treecon format: unfinished sequence '%S' read %d/%d\n",
8587 phyitem->Name, ilen, len);
8588 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8589
8590 return ajFalse;
8591 }
8592
8593 phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
8594
8595 for(i=0; i < iseq; i++)
8596 {
8597 ajListstrPop(phylist, &phydata->Names[i]);
8598 ajDebug("list [%d] '%S'\n", i, phydata->Names[i]);
8599 }
8600
8601 ajListstrFreeData(&phylist);
8602 phydata->Nseq = iseq;
8603 phydata->Count = 0;
8604 phydata->Bufflines = ajTextinGetRecords(seqin->Input);
8605 ajDebug("Treecon format read %d lines\n",
8606 ajTextinGetRecords(seqin->Input));
8607 }
8608
8609 phydata = seqin->SeqData;
8610 phytable = phydata->Table;
8611
8612 i = phydata->Count;
8613 ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
8614 readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
8615 ajStrAssignS(&thys->Name, phydata->Names[i]);
8616
8617 thys->Weight = readphyitem->Weight;
8618 ajStrAssignS(&thys->Seq, readphyitem->Seq);
8619
8620 phydata->Count++;
8621
8622 if(phydata->Count >=phydata->Nseq)
8623 {
8624 seqin->Multidone = ajTrue;
8625 ajDebug("seqReadTreecon multidone\n");
8626 ajFilebuffClear(seqin->Input->Filebuff, 0);
8627 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8628 }
8629
8630 return ajTrue;
8631 }
8632
8633
8634
8635
8636 /* @funcstatic seqReadJackknifer **********************************************
8637 **
8638 ** Tries to read input in Jackknifer format.
8639 **
8640 ** The Jackknifer program by Farris is a parsimony program that also
8641 ** implements the jackknife method to test the reliability of branches.
8642 ** The format is similar to the MEGA format.
8643 **
8644 ** On the first line a title/description is placed in between single quotes.
8645 ** The alignment can be written in sequential or interleaved format,
8646 ** but the sequence names have to be placed between brackets.
8647 ** Also no blanks are allowed in the names.
8648 ** They should be replaced by underscores ( _ ).
8649 ** The file is ended by a semicolon.
8650 **
8651 ** @param [w] thys [AjPSeq] Sequence object
8652 ** @param [u] seqin [AjPSeqin] Sequence input object
8653 ** @return [AjBool] ajTrue on success
8654 **
8655 ** @release 2.0.0
8656 ** @@
8657 ******************************************************************************/
8658
seqReadJackknifer(AjPSeq thys,AjPSeqin seqin)8659 static AjBool seqReadJackknifer(AjPSeq thys, AjPSeqin seqin)
8660 {
8661 AjPStr tmpstr = NULL;
8662 AjPStr tmpname = NULL;
8663 AjBool ok = ajFalse;
8664 ajuint iseq;
8665 ajuint i;
8666
8667 AjPTable phytable = NULL;
8668 SeqPMsfItem phyitem = NULL;
8669 const SeqPMsfItem readphyitem = NULL;
8670 AjPList phylist = NULL;
8671 SeqPMsfData phydata = NULL;
8672
8673 if(!seqRegJackTop)
8674 seqRegJackTop = ajRegCompC("^'(.*)'\\s*$");
8675
8676 if(!seqRegJackSeq)
8677 seqRegJackSeq = ajRegCompC("^[(]([^)]+)(.*)$");
8678
8679 if(!seqin->SeqData) /* first time - read the data */
8680 {
8681 iseq = 0;
8682 seqin->Multidone = ajFalse;
8683 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8684 if(!ok)
8685 return ajFalse;
8686
8687 if(!ajRegExec(seqRegJackTop, seqReadLine))
8688 { /* first line test */
8689 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8690
8691 return ajFalse;
8692 }
8693
8694 ajDebug("JackKnifer format: First line ok '%S'\n", seqReadLine);
8695
8696 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8697
8698 seqin->SeqData = AJNEW0(phydata);
8699 phydata->Table = phytable = ajTablestrNew(1000);
8700 phylist = ajListstrNew();
8701 seqin->Input->Filecount = 0;
8702
8703 while (ok)
8704 {
8705 if (!ajStrGetLen(seqReadLine)) /* empty line after sequence */
8706 {
8707 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8708 continue;
8709 }
8710
8711 if (ajStrPrefixC(seqReadLine, ";"))
8712 break; /* done */
8713
8714 if (ajStrPrefixC(seqReadLine, "("))
8715 {
8716 if (!ajRegExec(seqRegJackSeq, seqReadLine))
8717 {
8718 ajDebug("JackKnifer format: bad (id) line\n");
8719 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8720
8721 return ajFalse;
8722 }
8723
8724 ajRegSubI(seqRegJackSeq, 1, &tmpstr);
8725 seqnameSetName(&tmpname, tmpstr);
8726 phyitem = ajTableFetchmodS(phytable, tmpname);
8727
8728 if (!phyitem)
8729 {
8730 ajDebug("JackKnifer format: new (id) '%S'\n", tmpname);
8731 AJNEW0(phyitem);
8732 phyitem->Weight = 1.0;
8733 ajStrAssignS(&phyitem->Name,tmpname);
8734 ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
8735 ajListstrPushAppend(phylist, ajStrNewS(phyitem->Name));
8736 iseq++;
8737 }
8738 else
8739 ajDebug("JackKnifer format: More for (id) '%S'\n",
8740 tmpname);
8741
8742 ajRegSubI(seqRegJackSeq, 2, &tmpstr);
8743 ajStrAssignS(&seqReadLine, tmpstr);
8744 }
8745
8746 seqAppend(&phyitem->Seq, seqReadLine);
8747
8748 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8749 }
8750
8751 phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
8752
8753 for(i=0; i < iseq; i++)
8754 {
8755 ajListstrPop(phylist, &phydata->Names[i]);
8756 ajDebug("list [%d] '%S'\n", i, phydata->Names[i]);
8757 }
8758
8759 ajListstrFreeData(&phylist);
8760 phydata->Nseq = iseq;
8761 phydata->Count = 0;
8762 phydata->Bufflines = ajTextinGetRecords(seqin->Input);
8763 ajDebug("JackKnifer format read %d lines\n",
8764 ajTextinGetRecords(seqin->Input));
8765 }
8766
8767 ajStrDel(&tmpstr);
8768 ajStrDel(&tmpname);
8769
8770 phydata = seqin->SeqData;
8771 phytable = phydata->Table;
8772
8773 i = phydata->Count;
8774 ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
8775 readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
8776 ajStrAssignS(&thys->Name, phydata->Names[i]);
8777 ajStrDel(&phydata->Names[i]);
8778
8779 thys->Weight = readphyitem->Weight;
8780 ajStrAssignS(&thys->Seq, readphyitem->Seq);
8781
8782 phydata->Count++;
8783
8784 if(phydata->Count >=phydata->Nseq)
8785 {
8786 seqin->Multidone = ajTrue;
8787 ajDebug("seqReadJackKnifer multidone\n");
8788 ajFilebuffClear(seqin->Input->Filebuff, 0);
8789 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8790 }
8791
8792 return ajTrue;
8793 }
8794
8795
8796
8797
8798 /* @funcstatic seqReadNexus ***************************************************
8799 **
8800 ** Tries to read input in Nexus format.
8801 **
8802 ** Nexus files contain many things.
8803 ** All Nexus files begin with a #NEXUS line
8804 ** Data is in begin ... end blocks
8805 ** Sequence data is in a "begin character" block
8806 **
8807 ** @param [w] thys [AjPSeq] Sequence object
8808 ** @param [u] seqin [AjPSeqin] Sequence input object
8809 ** @return [AjBool] ajTrue on success
8810 **
8811 ** @release 2.0.0
8812 ** @@
8813 ******************************************************************************/
8814
seqReadNexus(AjPSeq thys,AjPSeqin seqin)8815 static AjBool seqReadNexus(AjPSeq thys, AjPSeqin seqin)
8816 {
8817 AjBool ok = ajFalse;
8818 ajuint i;
8819 ajuint j;
8820 AjPFilebuff buff;
8821 AjPStr* seqs = NULL;
8822 AjPStr* names = NULL;
8823 AjPNexus nexus = NULL;
8824
8825 SeqPMsfData phydata = NULL;
8826
8827 buff = seqin->Input->Filebuff;
8828
8829 if(!seqin->SeqData) /* first time - read the data */
8830 {
8831 seqin->Multidone = ajFalse;
8832
8833 ajFilebuffSetBuffered(buff);
8834
8835 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8836 ajDebug("Nexus format: Testing first line '%S'\n", seqReadLine);
8837
8838 if(!ok)
8839 return ajFalse;
8840
8841 if(!ajStrPrefixCaseC(seqReadLine, "#NEXUS"))
8842 { /* first line test */
8843 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8844 return ajFalse;
8845 }
8846
8847 ajDebug("Nexus format: First line ok '%S'\n", seqReadLine);
8848
8849 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8850
8851 while(ok && !ajStrPrefixCaseC(seqReadLine, "#NEXUS"))
8852 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8853
8854 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8855
8856 AJNEW0(phydata);
8857 phydata->Nexus = ajNexusParse(buff);
8858
8859 if (!phydata->Nexus)
8860 {
8861 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8862 ajDebug("Failed to parse in nexus format\n");
8863
8864 return ajFalse;
8865 }
8866
8867 phydata->Count = 0;
8868 phydata->Nseq = ajNexusGetNtaxa(phydata->Nexus);
8869 /* GetTaxa may fail if names are only defined in the sequences */
8870 seqs = ajNexusGetTaxa(phydata->Nexus);
8871 phydata->Names = AJCALLOC(phydata->Nseq, sizeof(*phydata->Names));
8872
8873 if(seqs)
8874 {
8875 for(j=0;j<phydata->Nseq;j++)
8876 ajStrAssignS(&phydata->Names[j], seqs[j]);
8877 }
8878
8879 seqin->SeqData = phydata;
8880 ajDebug("Nexus parsed %d sequences\n", phydata->Nseq);
8881 }
8882
8883 phydata = seqin->SeqData;
8884 nexus = phydata->Nexus;
8885
8886 i = phydata->Count;
8887
8888 seqs = ajNexusGetSequences(nexus);
8889 if (!seqs)
8890 {
8891 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8892
8893 return ajFalse;
8894 }
8895
8896 thys->Weight = 1.0;
8897 ajStrAssignS(&thys->Seq, seqs[i]);
8898
8899 if (!phydata->Names)
8900 phydata->Names = AJCALLOC(phydata->Nseq, sizeof(*phydata->Names));
8901
8902 if (!phydata->Names[0]) /* finally set from the sequences */
8903 {
8904 names = ajNexusGetTaxa(phydata->Nexus);
8905
8906 for(j=0;j<phydata->Nseq;j++)
8907 ajStrAssignS(&phydata->Names[j], names[j]);
8908 }
8909
8910 ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
8911
8912 ajStrAssignS(&thys->Name, phydata->Names[i]);
8913
8914 phydata->Count++;
8915
8916 if(phydata->Count >= phydata->Nseq)
8917 {
8918 seqin->Multidone = ajTrue;
8919 ajDebug("seqReadNexus multidone\n");
8920 ajFilebuffClear(seqin->Input->Filebuff, 0);
8921 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8922 }
8923
8924 return ajTrue;
8925 }
8926
8927
8928
8929
8930 /* @funcstatic seqReadMega ****************************************************
8931 **
8932 ** Tries to read input in Mega interleaved or non-interleaved format.
8933 **
8934 ** The Molecular Evolutionary Genetic Analysis program by
8935 ** Kumar, Tamura & Nei is a tree construction program
8936 ** based on distance- and parsimony methods.
8937 **
8938 ** http://evolgen.biol.metro-u.ac.jp/MEGA/manual/DataFormat.html
8939 **
8940 ** @param [w] thys [AjPSeq] Sequence object
8941 ** @param [u] seqin [AjPSeqin] Sequence input object
8942 ** @return [AjBool] ajTrue on success
8943 **
8944 ** @release 2.0.0
8945 ** @@
8946 ******************************************************************************/
8947
seqReadMega(AjPSeq thys,AjPSeqin seqin)8948 static AjBool seqReadMega(AjPSeq thys, AjPSeqin seqin)
8949 {
8950 AjPStr tmpstr = NULL;
8951 AjPStr tmpdesc = NULL;
8952 AjPStr tmpname = NULL;
8953 AjPStr prestr = NULL;
8954 AjPStr poststr = NULL;
8955 AjBool ok = ajFalse;
8956 ajuint iseq = 0;
8957 ajuint i;
8958 AjBool istitle = ajFalse;
8959 AjBool isformat = ajFalse;
8960 AjBool iscommand = ajFalse;
8961 AjBool resume = ajFalse;
8962 AjPStr genestr = NULL;
8963 AjPStr domainstr = NULL;
8964 AjPStr nextgenestr = NULL;
8965 AjPStr nextdomainstr = NULL;
8966
8967 ajlong ipos;
8968 ajlong istart;
8969 ajlong ilast;
8970 char ichar;
8971
8972 AjPStr formatType = NULL;
8973 AjPStr formatValue = NULL;
8974
8975 char identchar = '.';
8976 char indelchar = '-';
8977 char misschar = '?';
8978 char seqtype = ' ';
8979
8980 char* cp;
8981 const char *cq;
8982
8983 AjPTable phytable = NULL;
8984 SeqPMsfItem phyitem = NULL;
8985 const SeqPMsfItem readphyitem = NULL;
8986 const SeqPMsfItem firstitem = NULL;
8987 AjPList phylist = NULL;
8988 SeqPMsfData phydata = NULL;
8989
8990 AjPSeqGene seqgene = NULL;
8991
8992 if(!seqRegMegaCommand)
8993 seqRegMegaCommand = ajRegCompC("([^ =!]+)=([^ ;]+)");
8994
8995 if(!seqRegMegaFeat)
8996 seqRegMegaFeat = ajRegCompC("^(.*)\"[^\"]*\"(.*)$");
8997
8998 if(!seqRegMegaSeq)
8999 seqRegMegaSeq = ajRegCompC("^#([^ \t\n\r]+)(.*)$");
9000
9001 if(seqin->SeqData)
9002 {
9003 phydata = seqin->SeqData;
9004 if(seqin->Multidone)
9005 resume = phydata->Resume;
9006 }
9007
9008 if(!seqin->SeqData || /* first time - read the data */
9009 (seqin->Multidone && resume)) /* resuming gene/domain block */
9010 {
9011 iseq = 0;
9012 seqin->Multidone = ajFalse;
9013
9014 if(!seqin->SeqData)
9015 {
9016 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9017 ajDebug("Mega format: Testing first line '%S'\n", seqReadLine);
9018
9019 if(!ok)
9020 return ajFalse;
9021
9022 if(!ajStrPrefixCaseC(seqReadLine, "#MEGA"))
9023 { /* first line test */
9024 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9025
9026 return ajFalse;
9027 }
9028
9029 ajDebug("Mega format: First line ok '%S'\n", seqReadLine);
9030
9031 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9032 if(!ok)
9033 return ajFalse;
9034
9035 if(!ajStrPrefixCaseC(seqReadLine, "TITLE") &&
9036 !ajStrPrefixCaseC(seqReadLine, "!TITLE"))
9037 { /* first line test */
9038 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9039
9040 return ajFalse;
9041 }
9042 ajStrAssignSubS(&tmpdesc, seqReadLine, 6, -1);
9043 ajStrTrimStartC(&tmpdesc, ": \t");
9044 ajStrTrimEndC(&tmpdesc, "; \t\n\r");
9045
9046 if(ajStrGetCharFirst(seqReadLine) == '!')
9047 {
9048 istitle = ajTrue;
9049 if(ajStrFindAnyK(seqReadLine, ';') != -1)
9050 istitle = ajFalse;
9051 }
9052
9053 ajDebug("Mega format: Second line ok '%S'\n", seqReadLine);
9054
9055 isformat = ajFalse;
9056
9057 while(ok && !ajStrPrefixC(seqReadLine, "#"))
9058
9059 { /* skip comments in header */
9060 if(iscommand)
9061 {
9062 if(ajStrFindAnyK(seqReadLine, ';') != -1)
9063 iscommand = ajFalse;
9064 }
9065
9066 else if(istitle)
9067 {
9068 ajStrAssignS(&tmpstr, seqReadLine);
9069 ajStrTrimStartC(&tmpstr, ": \t");
9070 ajStrTrimEndC(&tmpstr, "; \t\n\r");
9071 ajStrAppendK(&tmpdesc, ' ');
9072 ajStrAppendS(&tmpdesc, tmpstr);
9073 if(ajStrFindAnyK(seqReadLine, ';') != -1)
9074 istitle = ajFalse;
9075 }
9076
9077 else
9078 {
9079 if(ajStrPrefixCaseC(seqReadLine, "!FORMAT"))
9080 isformat = ajTrue;
9081
9082 if(isformat)
9083 {
9084 ajDebug("Format line: %S", seqReadLine);
9085 ajStrAssignS(&tmpstr, seqReadLine);
9086
9087 while(ajRegExec(seqRegMegaCommand, tmpstr))
9088 {
9089 ajRegSubI(seqRegMegaCommand, 1, &formatType);
9090 ajRegSubI(seqRegMegaCommand, 2, &formatValue);
9091 if(ajStrPrefixCaseC(formatType, "indel"))
9092 indelchar = ajStrGetCharFirst(formatValue);
9093 if(ajStrPrefixCaseC(formatType, "ident"))
9094 identchar = ajStrGetCharFirst(formatValue);
9095 if(ajStrPrefixCaseC(formatType, "match"))
9096 identchar = ajStrGetCharFirst(formatValue);
9097 if(ajStrPrefixCaseC(formatType, "miss"))
9098 misschar = ajStrGetCharFirst(formatValue);
9099 if(ajStrPrefixCaseC(formatType, "DataType"))
9100 seqtype = ajStrGetCharFirst(formatValue);
9101 ajRegPost(seqRegMegaCommand, &tmpstr);
9102 ajDebug("'%S' = '%S' (%S) indel '%c' ident '%c' "
9103 "missing '%c'\n",
9104 formatType, formatValue, tmpstr,
9105 indelchar, identchar, misschar);
9106 }
9107
9108 if(ajStrFindAnyK(seqReadLine, ';') == -1)
9109 isformat = ajFalse;
9110 }
9111
9112 else
9113 {
9114 if(ajStrGetCharFirst(seqReadLine) == '!')
9115 {
9116 ajStrAssignS(&tmpstr, seqReadLine);
9117 while(ajRegExec(seqRegMegaCommand, tmpstr))
9118 {
9119 ajRegSubI(seqRegMegaCommand, 1, &formatType);
9120 ajRegSubI(seqRegMegaCommand, 2, &formatValue);
9121 if(ajStrMatchCaseC(formatType, "gene"))
9122 {
9123 ajStrAssignS(&genestr, formatValue);
9124 ajDebug("command: Gene='%S'\n",
9125 formatValue);
9126 }
9127
9128 if(ajStrMatchCaseC(formatType, "domain"))
9129 {
9130 ajStrAssignS(&domainstr, formatValue);
9131 ajDebug("command: Domain='%S'\n",
9132 formatValue);
9133 }
9134 ajRegPost(seqRegMegaCommand, &tmpstr);
9135 }
9136
9137 if(ajStrFindAnyK(seqReadLine, ';') == -1)
9138 iscommand = ajTrue;
9139 }
9140 }
9141 }
9142
9143 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9144
9145 }
9146 ajStrDel(&tmpstr);
9147
9148 if(isformat || istitle || iscommand)
9149 {
9150 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9151
9152 return ajFalse;
9153 }
9154
9155 /*
9156 ** read through looking for #id
9157 ** Some day we could stop at #mega and read multiple files
9158 */
9159
9160 seqin->SeqData = AJNEW0(phydata);
9161 phydata->Table = phytable = ajTablestrNew(1000);
9162 phylist = ajListstrNew();
9163 seqin->Input->Filecount = 0;
9164
9165 phydata->Identchar = identchar;
9166 phydata->Indelchar = indelchar;
9167 phydata->Misschar = misschar;
9168 phydata->Seqtype = seqtype;
9169 }
9170
9171 /*
9172 ** Resume from here
9173 */
9174
9175 if(resume)
9176 {
9177 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9178
9179 resume = ajFalse;
9180 phydata->Resume = ajFalse;
9181 ajTableMapDel(phydata->Table, &seqMsfTabDel, NULL);
9182 phylist = ajListstrNew();
9183 phytable = phydata->Table;
9184 ajStrAssignS(&phydata->Gene, phydata->NextGene);
9185 ajStrAssignS(&phydata->Domain, phydata->NextDomain);
9186 ajStrAssignClear(&phydata->NextGene);
9187 ajStrAssignClear(&phydata->NextDomain);
9188 }
9189
9190 while (ok)
9191 {
9192 ipos = ajStrFindAnyC(seqReadLine, "[]");
9193 istart = 0;
9194 ichar = ' ';
9195
9196 while((ipos != -1) ||
9197 (phydata->CommentDepth &&
9198 (istart < (ajint) ajStrGetLen(seqReadLine))))
9199 {
9200 ilast = ipos;
9201
9202 if(ipos > -1)
9203 ichar = ajStrGetCharPos(seqReadLine, ipos);
9204
9205 if(!phydata->CommentDepth)
9206 {
9207 istart = ipos;
9208 }
9209
9210 if(ichar == '[')
9211 phydata->CommentDepth++;
9212 else if((ichar == ']') && phydata->CommentDepth)
9213 phydata->CommentDepth--;
9214
9215 ajStrCutRange(&seqReadLine, istart, ilast);
9216 ipos = ajStrFindAnyC(seqReadLine, "[]");
9217 ichar = ' ';
9218 }
9219
9220 /* empty line after a sequence */
9221 if (!ajStrGetLen(seqReadLine))
9222 {
9223 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9224 continue;
9225 }
9226
9227 if (ajStrPrefixC(seqReadLine, "!"))
9228 {
9229 iscommand = ajTrue;
9230 }
9231
9232 if(!iscommand)
9233 {
9234 if(ajStrPrefixC(seqReadLine, "#"))
9235 {
9236 if (!ajRegExec(seqRegMegaSeq, seqReadLine))
9237 {
9238 ajDebug("Mega format: bad #id line\n");
9239 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
9240
9241 return ajFalse;
9242 }
9243
9244 ajRegSubI(seqRegMegaSeq, 1, &tmpstr);
9245 seqnameSetName(&tmpname, tmpstr);
9246 phyitem = ajTableFetchmodS(phytable, tmpname);
9247
9248 if (!phyitem)
9249 {
9250 AJNEW0(phyitem);
9251 phyitem->Weight = 1.0;
9252 ajStrAssignS(&phyitem->Name,tmpname);
9253 ajStrAssignS(&phyitem->Desc, tmpdesc);
9254 ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
9255 ajListstrPushAppend(phylist, ajStrNewS(phyitem->Name));
9256 iseq++;
9257 }
9258 else
9259 ajDebug("Mega format: More for #id '%S'\n", tmpname);
9260
9261 ajRegSubI(seqRegMegaSeq, 2, &tmpstr);
9262 ajStrAssignS(&seqReadLine, tmpstr);
9263 }
9264
9265 while (ajRegExec(seqRegMegaFeat, seqReadLine))
9266 {
9267 ajDebug("Quotes found: '%S'\n", seqReadLine);
9268 ajRegSubI(seqRegMegaFeat, 1, &prestr);
9269 ajRegSubI(seqRegMegaFeat, 2, &poststr);
9270 ajStrAssignS(&seqReadLine, prestr);
9271 ajStrAppendS(&seqReadLine, poststr);
9272 ajDebug("Quotes removed: '%S'\n", seqReadLine);
9273 }
9274
9275 seqAppend(&phyitem->Seq, seqReadLine);
9276 ajDebug("Append '%S' len %u\n",
9277 phyitem->Name, ajStrGetLen(phyitem->Seq));
9278 }
9279
9280 else
9281 {
9282 ajStrAssignS(&tmpstr, seqReadLine);
9283
9284 while(ajRegExec(seqRegMegaCommand, tmpstr))
9285 {
9286 ajRegSubI(seqRegMegaCommand, 1, &formatType);
9287 ajRegSubI(seqRegMegaCommand, 2, &formatValue);
9288 if(ajStrMatchCaseC(formatType, "gene"))
9289 {
9290 if(iseq)
9291 resume = ajTrue;
9292 ajStrAssignS(&nextgenestr, formatValue);
9293 ajDebug("command: Gene='%S'\n",
9294 formatValue);
9295 }
9296
9297 if(ajStrMatchCaseC(formatType, "domain"))
9298 {
9299 if(iseq)
9300 resume = ajTrue;
9301 ajStrAssignS(&nextdomainstr, formatValue);
9302 ajDebug("command: Domain='%S'\n",
9303 formatValue);
9304 }
9305 ajRegPost(seqRegMegaCommand, &tmpstr);
9306 }
9307 if(ajStrFindAnyK(seqReadLine, ';') != -1)
9308 iscommand = ajFalse;
9309 }
9310
9311 if(resume)
9312 break;
9313
9314 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9315 }
9316
9317 if(phydata->Names)
9318 AJCRESIZE0(phydata->Names, phydata->Nseq, iseq);
9319 else
9320 phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
9321
9322 for(i=0; i < iseq; i++)
9323 {
9324 ajListstrPop(phylist, &phydata->Names[i]);
9325 }
9326
9327 ajListstrFreeData(&phylist);
9328 phydata->Nseq = iseq;
9329 phydata->Count = 0;
9330 phydata->Bufflines = ajTextinGetRecords(seqin->Input);
9331 }
9332
9333 ajStrDel(&formatType);
9334 ajStrDel(&formatValue);
9335 ajStrDel(&tmpstr);
9336 ajStrDel(&tmpname);
9337 ajStrDel(&tmpdesc);
9338 ajStrDel(&prestr);
9339 ajStrDel(&poststr);
9340
9341 phydata = seqin->SeqData;
9342 phytable = phydata->Table;
9343
9344 firstitem = ajTableFetchS(phytable, phydata->Names[0]);
9345 i = phydata->Count;
9346 ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
9347 readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
9348 ajStrAssignS(&thys->Name, phydata->Names[i]);
9349 if(i)
9350 ajStrDel(&phydata->Names[i]);
9351
9352 if(ajStrGetLen(genestr))
9353 ajStrAssignS(&phydata->Gene, genestr);
9354
9355 if(ajStrGetLen(domainstr))
9356 ajStrAssignS(&phydata->Domain, domainstr);
9357
9358 if(resume)
9359 {
9360 phydata->Resume = ajTrue;
9361 if(ajStrGetLen(nextgenestr))
9362 ajStrAssignS(&phydata->NextGene, nextgenestr);
9363 else
9364 ajStrAssignClear(&phydata->NextGene);
9365 if(ajStrGetLen(nextdomainstr))
9366 ajStrAssignS(&phydata->NextDomain, nextdomainstr);
9367 else
9368 ajStrAssignClear(&phydata->NextDomain);
9369 }
9370
9371 thys->Weight = readphyitem->Weight;
9372 ajStrAssignS(&thys->Desc, readphyitem->Desc);
9373 ajStrAssignS(&thys->Seq, readphyitem->Seq);
9374 if(ajStrGetLen(phydata->Gene))
9375 {
9376 seqgene = ajSeqgeneNewName(phydata->Gene);
9377 ajSeqAddGene(thys, seqgene);
9378 seqgene = NULL;
9379 }
9380
9381 if(strchr("nNrRdD", phydata->Seqtype))
9382 ajSeqSetNuc(thys);
9383 else if(strchr("pP", phydata->Seqtype))
9384 ajSeqSetProt(thys);
9385
9386 cp = ajStrGetuniquePtr(&thys->Seq);
9387 cq = ajStrGetPtr(firstitem->Seq);
9388
9389 while(*cp)
9390 {
9391 if(*cp == phydata->Indelchar)
9392 *cp = '-';
9393 else if (*cp == phydata->Identchar)
9394 *cp = *cq;
9395
9396 cp++;
9397 cq++;
9398 }
9399
9400 phydata->Count++;
9401
9402 if(phydata->Count >= phydata->Nseq)
9403 {
9404 seqin->Multidone = ajTrue;
9405
9406 ajStrDel(&phydata->Names[0]);
9407 if(!phydata->Resume)
9408 {
9409 ajFilebuffClear(seqin->Input->Filebuff, 0);
9410 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
9411 }
9412 }
9413
9414 ajStrDel(&genestr);
9415 ajStrDel(&nextgenestr);
9416 ajStrDel(&domainstr);
9417 ajStrDel(&nextdomainstr);
9418 ajStrDel(&formatType);
9419 ajStrDel(&formatValue);
9420
9421 return ajTrue;
9422 }
9423
9424
9425
9426
9427 /* @funcstatic seqReadCodata **************************************************
9428 **
9429 ** Given data in a sequence structure, tries to read everything needed
9430 ** using CODATA format.
9431 **
9432 ** @param [w] thys [AjPSeq] Sequence object
9433 ** @param [u] seqin [AjPSeqin] Sequence input object
9434 ** @return [AjBool] ajTrue on success
9435 **
9436 ** @release 1.0.0
9437 ** @@
9438 ******************************************************************************/
9439
seqReadCodata(AjPSeq thys,AjPSeqin seqin)9440 static AjBool seqReadCodata(AjPSeq thys, AjPSeqin seqin)
9441 {
9442 AjPFilebuff buff;
9443 AjBool ok = ajTrue;
9444 AjBool done = ajFalse;
9445
9446 buff = seqin->Input->Filebuff;
9447
9448 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
9449 return ajFalse;
9450
9451 ajDebug("first line '%S'\n", seqReadLine);
9452
9453 if(!ajStrPrefixC(seqReadLine, "ENTRY "))
9454 {
9455 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9456
9457 return ajFalse;
9458 }
9459
9460 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
9461 ajStrTokenStep(seqHandle); /* 'ENTRY' */
9462 ajStrTokenNextParse(seqHandle, &seqToken); /* entry name */
9463
9464 seqSetName(thys, seqToken);
9465
9466 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9467
9468 while(ok && !ajStrPrefixC(seqReadLine, "SEQUENCE"))
9469 {
9470 done = ajFalse;
9471
9472 if(ajStrPrefixC(seqReadLine, "ACCESSION "))
9473 {
9474 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
9475 ajStrTokenStep(seqHandle); /* 'ACCESSION' */
9476 ajStrTokenNextParse(seqHandle, &seqToken); /* accnum */
9477 seqAccSave(thys, seqToken);
9478 }
9479
9480 if(ajStrPrefixC(seqReadLine, "TITLE "))
9481 {
9482 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
9483 ajStrTokenStep(seqHandle); /* 'TITLE' */
9484 ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc); /* desc */
9485
9486 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9487 done = ajTrue;
9488
9489 while(ok && ajStrPrefixC(seqReadLine, " "))
9490 {
9491 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
9492 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
9493 ajStrAppendC(&thys->Desc, " ");
9494 ajStrAppendS(&thys->Desc, seqToken);
9495 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9496 }
9497 }
9498
9499 if(!done)
9500 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9501 }
9502
9503 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9504
9505 while(ok && !ajStrPrefixC(seqReadLine, "///"))
9506 {
9507 seqAppend(&thys->Seq, seqReadLine);
9508 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9509 }
9510
9511 ajFilebuffClear(buff, 0);
9512
9513 ajStrTokenReset(seqHandle);
9514 ajStrDelStatic(&seqToken);
9515
9516 return ajTrue;
9517 }
9518
9519
9520
9521
9522 /* @funcstatic seqReadAce *****************************************************
9523 **
9524 ** Given data in a sequence structure, tries to read everything needed
9525 ** using ACE format as defined by the consed assembly editor.
9526 **
9527 ** @param [w] thys [AjPSeq] Sequence object
9528 ** @param [u] seqin [AjPSeqin] Sequence input object
9529 ** @return [AjBool] ajTrue on success
9530 **
9531 ** @release 6.2.0
9532 ** @@
9533 ******************************************************************************/
9534
seqReadAce(AjPSeq thys,AjPSeqin seqin)9535 static AjBool seqReadAce(AjPSeq thys, AjPSeqin seqin)
9536 {
9537 AjPFilebuff buff;
9538 AjBool ok = ajTrue;
9539 ajuint icontig;
9540 ajuint iseq;
9541 AjPTable acetable = NULL;
9542 const SeqPMsfItem aceitem = NULL;
9543 SeqPMsfData acedata = NULL;
9544 ajuint i;
9545
9546 ajDebug("seqReadAcedb\n");
9547
9548 buff = seqin->Input->Filebuff;
9549
9550 if(!seqin->SeqData)
9551 { /* start of file */
9552 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9553 if(!ok)
9554 return ajFalse;
9555
9556 ajDebug("first line:\n'%S'\n", seqReadLine);
9557
9558 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
9559 ajStrTokenNextParseC(seqHandle, " \t", &seqToken); /* 'AS ncontig nseq' */
9560 ajDebug("Token 1 '%S'\n", seqToken);
9561
9562 if(!ajStrMatchCaseC(seqToken, "AS"))
9563 {
9564 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9565 ajStrDelStatic(&seqToken);
9566 ajStrTokenDel(&seqHandle);
9567 return ajFalse;
9568 }
9569
9570 ajStrTokenNextParseC(seqHandle, " \t", &seqToken); /* number of contigs */
9571 ajStrToUint(seqToken, &icontig);
9572 ajStrTokenNextParseC(seqHandle, " \t", &seqToken); /* number of reads */
9573 ajStrToUint(seqToken, &iseq);
9574
9575 seqin->SeqData = AJNEW0(acedata);
9576 acedata->Table = acetable = ajTablestrNew(1000);
9577 seqin->Input->Filecount = 0;
9578
9579 /*
9580 ** read sequence from CO (* for gap)
9581 ** read accuracy from BQ (no quality for gaps)
9582 **
9583 ** Read with gaps
9584 */
9585 }
9586
9587 acedata = seqin->SeqData;
9588 acetable = acedata->Table;
9589
9590 i = acedata->Count;
9591 /* ajDebug("returning [%d] '%S'\n", i, acedata->Names[i]); */
9592 aceitem = ajTableFetchS(acetable, acedata->Names[i]);
9593 ajStrAssignS(&thys->Name, acedata->Names[i]);
9594
9595 thys->Weight = aceitem->Weight;
9596 ajStrAssignS(&thys->Seq, aceitem->Seq);
9597
9598 acedata->Count++;
9599
9600 if(acedata->Count >= acedata->Nseq)
9601 {
9602 seqin->Multidone = ajTrue;
9603 ajDebug("seqReadAce Multidone\n");
9604 ajFilebuffClear(seqin->Input->Filebuff, 0);
9605 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
9606 }
9607
9608 ajSeqSetNuc(thys);
9609
9610 ajFilebuffClear(buff, 0);
9611
9612 ajStrTokenReset(seqHandle);
9613 ajStrDelStatic(&seqToken);
9614
9615 return ajTrue;
9616 }
9617
9618
9619
9620
9621 /* @funcstatic seqReadAcedb ***************************************************
9622 **
9623 ** Given data in a sequence structure, tries to read everything needed
9624 ** using ACEDB format.
9625 **
9626 ** @param [w] thys [AjPSeq] Sequence object
9627 ** @param [u] seqin [AjPSeqin] Sequence input object
9628 ** @return [AjBool] ajTrue on success
9629 **
9630 ** @release 1.0.0
9631 ** @@
9632 ******************************************************************************/
9633
seqReadAcedb(AjPSeq thys,AjPSeqin seqin)9634 static AjBool seqReadAcedb(AjPSeq thys, AjPSeqin seqin)
9635 {
9636 AjPFilebuff buff;
9637 AjBool ok = ajTrue;
9638
9639 ajDebug("seqReadAcedb\n");
9640
9641 buff = seqin->Input->Filebuff;
9642
9643 do
9644 {
9645 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9646 } while(ok &&
9647 (ajStrPrefixC(seqReadLine, "//") ||
9648 ajStrPrefixC(seqReadLine, "\n")));
9649
9650 if(!ok)
9651 {
9652 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9653
9654 return ajFalse;
9655 }
9656
9657 ajDebug("first line:\n'%S'\n", seqReadLine);
9658
9659
9660 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
9661 ajStrTokenNextParseC(seqHandle, " \t", &seqToken); /* 'DNA' or 'Peptide'*/
9662 ajDebug("Token 1 '%S'\n", seqToken);
9663
9664 if(ajStrMatchCaseC(seqToken, "Peptide"))
9665 {
9666 ajDebug("Protein\n");
9667 ajSeqSetProt(thys);
9668 }
9669 else if(ajStrMatchCaseC(seqToken, "DNA"))
9670 {
9671 ajDebug("DNA\n");
9672 ajSeqSetNuc(thys);
9673 }
9674 else
9675 {
9676 ajDebug("unknown - failed\n");
9677 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9678 ajStrTokenReset(seqHandle);
9679 ajStrDelStatic(&seqToken);
9680
9681 return ajFalse;
9682 }
9683
9684 ajStrTokenNextParseC(seqHandle, " \t\"", &seqToken); /* : */
9685
9686 if(!ajStrMatchC(seqToken, ":"))
9687 {
9688 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9689 ajStrTokenReset(seqHandle);
9690 ajStrDelStatic(&seqToken);
9691
9692 return ajFalse;
9693 }
9694
9695 ajStrTokenNextParseC(seqHandle, "\"", &seqToken); /* name */
9696
9697 if(!ajStrGetLen(seqToken))
9698 {
9699 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9700 ajStrTokenReset(seqHandle);
9701 ajStrDelStatic(&seqToken);
9702
9703 return ajFalse;
9704 }
9705
9706 /* we know we will succeed from here ... no way to return ajFalse */
9707
9708 ajFilebuffSetUnbuffered(buff);
9709
9710 seqSetName(thys, seqToken);
9711
9712 /* OK, we have the name. Now look for the sequence */
9713
9714 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9715 while(ok && !ajStrPrefixC(seqReadLine,"\n"))
9716 {
9717 seqAppend(&thys->Seq, seqReadLine);
9718 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9719 }
9720
9721 ajFilebuffClear(buff, 0);
9722
9723 ajStrTokenReset(seqHandle);
9724 ajStrDelStatic(&seqToken);
9725
9726 return ajTrue;
9727 }
9728
9729
9730
9731
9732 /* @funcstatic seqReadBiomart *************************************************
9733 **
9734 ** Given data in a sequence structure, tries to read everything needed
9735 ** using BioMart tab-delimited format.
9736 **
9737 ** @param [w] thys [AjPSeq] Sequence object
9738 ** @param [u] seqin [AjPSeqin] Sequence input object
9739 ** @return [AjBool] ajTrue on success
9740 **
9741 ** @release 6.3.0
9742 ** @@
9743 ******************************************************************************/
9744
seqReadBiomart(AjPSeq thys,AjPSeqin seqin)9745 static AjBool seqReadBiomart(AjPSeq thys, AjPSeqin seqin)
9746 {
9747 AjPFilebuff buff;
9748 AjBool ok = ajTrue;
9749 ajulong ifields = 0;
9750 ajuint i;
9751
9752 buff = seqin->Input->Filebuff;
9753
9754 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9755 if(!ok)
9756 return ajFalse;
9757
9758 ajDebug("seqReadBiomart record '%S'%u\n",
9759 seqReadLine);
9760
9761 ifields = ajStrCalcCountK(seqReadLine, '\t');
9762 ++ifields;
9763
9764 ajDebug("fields: %u\n", ifields);
9765
9766 if(ifields < 2)
9767 return ajFalse;
9768
9769 ajStrTokenAssignC(&seqHandle, seqReadLine, "\t\n");
9770
9771 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* sequence */
9772 seqAppend(&thys->Seq, seqToken);
9773
9774 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* identifier*/
9775 seqSetName(thys, seqToken);
9776
9777 for(i = 2; i < ifields; i++)
9778 {
9779 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* non-sequence*/
9780
9781 if(ajStrGetLen(seqToken))
9782 {
9783 if(i > 2)
9784 ajStrAppendK(&thys->Desc, ' ');
9785
9786 ajStrAppendS(&thys->Desc, seqToken);
9787 }
9788 }
9789
9790 ajFilebuffClear(buff, 0);
9791
9792 ajStrTokenReset(seqHandle);
9793 ajStrDelStatic(&seqToken);
9794
9795 return ajTrue;
9796 }
9797
9798
9799
9800
9801 /* @funcstatic seqReadDAS *****************************************************
9802 **
9803 ** Reads sequences from given DAS XML buffer.
9804 **
9805 ** @param [w] thys [AjPSeq] Sequence object
9806 ** @param [u] seqin [AjPSeqin] Sequence input object
9807 ** @return [AjBool] ajTrue on success
9808 **
9809 ** @release 6.4.0
9810 ** @@
9811 ******************************************************************************/
9812
seqReadDAS(AjPSeq thys,AjPSeqin seqin)9813 static AjBool seqReadDAS(AjPSeq thys, AjPSeqin seqin)
9814 {
9815 AjPDomDocument doc = NULL;
9816 AjPDomNodeList segments = NULL;
9817 AjPDomNode segment = NULL;
9818
9819 AjPFilebuff buff = NULL;
9820 AjPStr attval = NULL;
9821 AjPStr elmtxt = NULL;
9822 AjPStr seqname = NULL;
9823
9824 AjBool ret = AJTRUE;
9825
9826 ajint r = 0;
9827
9828 buff = seqin->Input->Filebuff;
9829
9830 r = ajTextinGetCount(seqin->Input);
9831
9832 if(r==1)
9833 {
9834 doc = ajDomImplementationCreateDocument(NULL,NULL,NULL);
9835
9836 if (ajDomReadFilebuff(doc,buff) == -1)
9837 {
9838 ajDomDocumentDestroyNode(doc,&doc);
9839 return AJFALSE;
9840 }
9841
9842 ajFilebuffClear(buff, 0);
9843 seqin->SeqData = doc;
9844 }
9845 else
9846 doc = seqin->SeqData;
9847
9848 segments = ajDomDocumentGetElementsByTagNameC(doc, "SEQUENCE");
9849
9850
9851 if(segments==NULL || ajDomNodeListGetLen(segments) < r)
9852 {
9853 ajDomDocumentDestroyNodeList(doc,&segments,AJDOMKEEP);
9854 ajDomDocumentDestroyNode(doc,&doc);
9855 return AJFALSE;
9856 }
9857
9858 segment = ajDomNodeListItem(segments, r-1);
9859
9860 elmtxt = ajDomElementGetText(segment);
9861 seqAppend(&thys->Seq, elmtxt);
9862
9863 attval = ajDomElementGetAttributeC(segment,"id");
9864
9865 if(ajStrGetLen(attval) == 0)
9866 {
9867 ajStrDel(&attval);
9868 ajDomDocumentDestroyNodeList(doc,&segments,AJDOMKEEP);
9869 ajDomDocumentDestroyNode(doc,&doc);
9870 return AJFALSE;
9871 }
9872
9873
9874 if(seqin->Begin && seqin->End)
9875 {
9876
9877 ajFmtPrintS(&seqname,"%S %u,%u",
9878 attval, seqin->Begin, seqin->End);
9879
9880 seqSetName(thys, seqname);
9881
9882 ajStrDel(&seqname);
9883 }
9884 else ajSeqSetName(thys, attval);
9885
9886 ajStrDel(&attval);
9887
9888 /*
9889 * TODO: modifying seqin obj doesn't sound correct
9890 * but I was unable to stop calling function modifying sequence
9891 * Begin and End attributes apparently in a wrong way -- mahmut
9892 */
9893 seqin->Begin = thys->Begin;
9894 seqin->End = thys->End;
9895
9896 /* TODO: how to read features in parallel to reading sequences
9897 * - get sequence query url
9898 * - construct features query url based on sequence query url
9899 */
9900 /*
9901 if(seqin->Features)
9902 {
9903 AjPStr ftq, host, port, fqpath;
9904 ajFeattabInDel(&seqin->Ftquery);
9905 ajFilebuffClear(seqin->Input->Filebuff, -1);
9906
9907 // get sequence query url
9908
9909 // construct features query url based on sequence query url
9910
9911 ajDasdbQueryGet(seqin, host, port, fqpath);
9912
9913 seqin->Ftquery = ajFeattabInNewCSF("das", thys->Name,
9914 ajStrGetPtr(seqin->Type),
9915 seqin->Input->Filebuff);
9916 ajDebug("GFF FEAT TabIn %x\n", seqin->Ftquery);
9917 //ftfile = NULL; // now copied to seqin->FeattabIn
9918 ajFeattableDel(&seqin->Fttable);
9919 seqin->Fttable = ajFeattableNewRead(seqin->Ftquery);
9920 if(seqin->Fttable)
9921 ajFeattableSetLength(seqin->Fttable, ajStrGetLen(thys->Seq));
9922 ajFeattableTrace(seqin->Fttable);
9923 ajFeattableDel(&thys->Fttable);
9924 thys->Fttable = seqin->Fttable;
9925 seqin->Fttable = NULL;
9926 }
9927 */
9928
9929 ajDomDocumentDestroyNodeList(doc,&segments,AJDOMKEEP);
9930
9931
9932 return ret;
9933 }
9934
9935
9936
9937
9938 /* @funcstatic seqReadFitch ***************************************************
9939 **
9940 ** Given data in a sequence structure, tries to read everything needed
9941 ** using fitch format.
9942 **
9943 ** @param [w] thys [AjPSeq] Sequence object
9944 ** @param [u] seqin [AjPSeqin] Sequence input object
9945 ** @return [AjBool] ajTrue on success
9946 **
9947 ** @release 2.8.0
9948 ** @@
9949 ******************************************************************************/
9950
seqReadFitch(AjPSeq thys,AjPSeqin seqin)9951 static AjBool seqReadFitch(AjPSeq thys, AjPSeqin seqin)
9952 {
9953 AjPStr token = NULL;
9954 AjPFilebuff buff;
9955 AjBool ok = ajTrue;
9956 ajuint ilen = 0;
9957
9958 if (!seqRegFitchHead)
9959 seqRegFitchHead = ajRegCompC("^(\\S+),\\s+(\\d+)\\s+bases\n");
9960
9961 buff = seqin->Input->Filebuff;
9962
9963 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9964 ajDebug("seqReadFitch first line '%S'%u\n",
9965 seqReadLine);
9966
9967 if (!ajRegExec(seqRegFitchHead, seqReadLine))
9968 {
9969 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9970
9971 return ajFalse;
9972 }
9973
9974 ajRegSubI(seqRegFitchHead, 1, &token);
9975 seqSetName(thys, token);
9976
9977 ajRegSubI(seqRegFitchHead, 2, &token);
9978 ajStrToUint(token, &ilen);
9979
9980 ajDebug("seqReadFitch header name '%S' bases %u\n",
9981 thys->Name, ilen);
9982
9983 /* we know we will succeed from here ... no way to return ajFalse */
9984
9985 ajFilebuffSetUnbuffered(buff);
9986
9987 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9988
9989 while (ok && (ajStrGetLen(thys->Seq) < ilen))
9990 {
9991 seqAppend(&thys->Seq, seqReadLine);
9992 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9993 ajDebug("seqReadFitch new length %u '%S'\n",
9994 ajStrGetLen(thys->Seq), seqReadLine);
9995 }
9996
9997 ajStrDel(&token);
9998
9999 if(ok)
10000 ajFilebuffClear(buff, 1);
10001 else
10002 ajFilebuffClear(buff, 0);
10003
10004 return ajTrue;
10005 }
10006
10007
10008
10009
10010 /* @funcstatic seqReadMase ****************************************************
10011 **
10012 ** Given data in a sequence structure, tries to read everything needed
10013 ** using mase format.
10014 **
10015 ** @param [w] thys [AjPSeq] Sequence object
10016 ** @param [u] seqin [AjPSeqin] Sequence input object
10017 ** @return [AjBool] ajTrue on success
10018 **
10019 ** @release 2.8.0
10020 ** @@
10021 ******************************************************************************/
10022
seqReadMase(AjPSeq thys,AjPSeqin seqin)10023 static AjBool seqReadMase(AjPSeq thys, AjPSeqin seqin)
10024 {
10025 AjPStr token = NULL;
10026 AjPStr des = NULL;
10027 AjPFilebuff buff;
10028 AjBool ok = ajTrue;
10029
10030 if (!seqRegMaseHead)
10031 seqRegMaseHead = ajRegCompC("^(;+)");
10032
10033 buff = seqin->Input->Filebuff;
10034
10035 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10036 if(!ajRegExec(seqRegMaseHead, seqReadLine))
10037 {
10038 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
10039
10040 return ajFalse;
10041 }
10042
10043 /* we know we will succeed from here ... no way to return ajFalse */
10044
10045 ajFilebuffSetUnbuffered(buff);
10046
10047 while(ok && ajRegExec(seqRegMaseHead, seqReadLine))
10048 {
10049 if(ajRegLenI(seqRegMaseHead, 1) == 1)
10050 {
10051 ajRegPost(seqRegMaseHead, &token);
10052
10053 if(des)
10054 ajStrAppendK(&des, ' ');
10055
10056 ajStrAppendS(&des, token);
10057 }
10058
10059 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10060 }
10061
10062 ajStrRemoveWhiteExcess(&seqReadLine);
10063 seqSetName(thys, seqReadLine);
10064 ajStrRemoveWhiteExcess(&des);
10065 ajSeqAssignDescS(thys, des);
10066
10067 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10068 while(ok && !ajRegExec(seqRegMaseHead, seqReadLine))
10069 {
10070 seqAppend(&thys->Seq, seqReadLine);
10071 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10072 }
10073
10074 ajStrDel(&token);
10075 ajStrDel(&des);
10076
10077 if(ok)
10078 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
10079 else
10080 ajFilebuffClear(buff, 0);
10081
10082 return ajTrue;
10083 }
10084
10085
10086
10087
10088 /* @funcstatic seqReadBam *****************************************************
10089 **
10090 ** Given data in a sequence structure, tries to read everything needed
10091 ** using binary alignment/map (BAM) format.
10092 **
10093 ** @param [w] thys [AjPSeq] Sequence object
10094 ** @param [u] seqin [AjPSeqin] Sequence input object
10095 ** @return [AjBool] ajTrue on success
10096 **
10097 ** @release 6.3.0
10098 ** @@
10099 ******************************************************************************/
10100
seqReadBam(AjPSeq thys,AjPSeqin seqin)10101 static AjBool seqReadBam(AjPSeq thys, AjPSeqin seqin)
10102 {
10103 AjPFilebuff buff;
10104 AjPFile infile;
10105 ajuint i;
10106 AjPSeqBam b = NULL;
10107 AjPSeqBamCore c;
10108 ajint ret = 0;
10109 struct bamdata
10110 {
10111 ajuint Count;
10112 ajuint Nref;
10113 AjPSeqBamBgzf gzfile;
10114 AjPSeqBam bam;
10115 } *bamdata = NULL;
10116 static AjBool called = ajFalse;
10117 static AjBool bigendian = ajFalse;
10118 unsigned char* d;
10119 ajuint dpos;
10120 int cigop;
10121 ajuint cigend;
10122 ajuint cigint;
10123 AjPStr cigarstr = NULL;
10124 AjPStr namestr = NULL;
10125 AjPStr seqstr = NULL;
10126 AjPStr qualstr = NULL;
10127 AjPStr tagstr = NULL;
10128 unsigned char dp;
10129 AjPSeqBamHeader header = NULL;
10130 ajint filestat;
10131
10132
10133 if(!called)
10134 {
10135 called = ajTrue;
10136 bigendian = ajUtilGetBigendian();
10137 ajDebug("seqReadBam bam bigendian: %B\n", bigendian);
10138 }
10139
10140 buff = seqin->Input->Filebuff;
10141 infile = ajFilebuffGetFile(buff);
10142
10143 if(!seqin->SeqData)
10144 {
10145 ajFileTrace(infile);
10146 ajFilebuffTrace(buff);
10147
10148 /* reset to beginning of file -
10149 ** has at least been tested for blank lines */
10150 filestat = ajFileSeek(infile, 0L, SEEK_SET);
10151 if(filestat != 0)
10152 {
10153 ajDebug("seqReadBam rewind failed errno %d: %s\n",
10154 errno, strerror(errno));
10155 return ajFalse;
10156 }
10157
10158 AJNEW0(bamdata);
10159
10160 bamdata->gzfile = ajSeqBamBgzfNew(ajFilebuffGetFileptr(buff),"r");
10161
10162 ajDebug("gzfile %x fd:%d file:%x ubs:%d cbs:%d blen:%d boff:%d "
10163 "cache:%d open:'%c'\n",
10164 bamdata->gzfile, bamdata->gzfile->file_descriptor,
10165 bamdata->gzfile->file,
10166 bamdata->gzfile->uncompressed_block_size,
10167 bamdata->gzfile->compressed_block_size,
10168 bamdata->gzfile->block_length, bamdata->gzfile->block_offset,
10169 bamdata->gzfile->cache_size,
10170 bamdata->gzfile->open_mode);
10171
10172
10173 /* BAM header */
10174
10175 /* read plain text and the number of reference sequences */
10176 header = ajSeqBamHeaderRead(bamdata->gzfile);
10177 if (!header)
10178 {
10179 ajDebug("failed ajSeqBamHeaderRead, seqReadBam returns ajFalse\n");
10180 ajSeqBamBgzfClose(bamdata->gzfile);
10181 AJFREE(bamdata);
10182 ajFileSeek(infile,filestat,0);
10183 ajFilebuffResetPos(buff);
10184 ajFileTrace(infile);
10185 ajFilebuffTrace(buff);
10186 return ajFalse;
10187 }
10188
10189 ajSeqBamHeaderDel(&header);
10190
10191 bamdata->bam = (AjPSeqBam)calloc(1, sizeof(AjOSeqBam));
10192 seqin->SeqData = bamdata;
10193 }
10194
10195 /* next BAM record */
10196
10197 bamdata = seqin->SeqData;
10198 b = bamdata->bam;
10199 ret = ajSeqBamRead(bamdata->gzfile, b);
10200 if(ret < -1)
10201 ajErr("seqReadBam truncated file return %d\n", ret);
10202
10203 if(ret == -1)
10204 {
10205 ajSeqBamBgzfClose(bamdata->gzfile);
10206 ajFilebuffClear(seqin->Input->Filebuff, 0);
10207 /*seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);*/
10208 free(bamdata->bam->data); free(bamdata->bam);
10209
10210 AJFREE(seqin->SeqData);
10211 return ajFalse;
10212 }
10213
10214 c = &b->core;
10215 ajDebug("rID: %d pos: %d bin: %hd mapQual: %d read_name_len: %d"
10216 " flag_nc: %hd cigar_len: %hd read_len: %d"
10217 " mate_rID: %d mate_pos: %d ins_size: %d\n",
10218 c->tid, c->pos, c->bin, c->qual, c->l_qname,
10219 c->flag, c->n_cigar, c->l_qseq,
10220 c->mtid, c->mpos, c->isize);
10221 ajDebug("l_aux: %d data_len:%d m_data:%d\n",
10222 b->l_aux, b->data_len, b->m_data);
10223 d = b->data;
10224 dpos = 0;
10225 ajStrAssignC(&namestr, (const char*) &d[dpos]);
10226 ajSeqSetName(thys, namestr);
10227 ajStrDel(&namestr);
10228 ajDebug("read name: %p '%s'\n", dpos, &d[dpos]);
10229 dpos += (c->l_qname); /* l_qname includes trailing null */
10230 ajStrAssignC(&cigarstr, "");
10231 ajDebug("start of cigar %p\n", dpos);
10232
10233 for(i=0; i < c->n_cigar; i++)
10234 {
10235 memcpy(&cigint, &d[dpos], 4);
10236 cigop = cigint & BAM_CIGAR_MASK;
10237 cigend = cigint >> BAM_CIGAR_SHIFT;
10238
10239 ajFmtPrintAppS(&cigarstr, " %u%c",
10240 cigend, cigarcode[cigop]);
10241 dpos += 4;
10242 }
10243
10244 ajDebug("cigar: %p %S\n", dpos, cigarstr);
10245 ajStrDel(&cigarstr);
10246
10247 ajStrAssignC(&seqstr, "");
10248 for(i=0; i < (ajuint) c->l_qseq; i++)
10249 {
10250 ajStrAppendK(&seqstr,
10251 bam_nt16_rev_table[MAJSEQBAMSEQI(&d[dpos], i)]);
10252 }
10253 dpos += (c->l_qseq+1)/2;
10254 ajDebug("seq: %p '%S'\n", dpos, seqstr);
10255
10256 ajStrAssignRef(&thys->Seq, seqstr);
10257 ajStrDel(&seqstr);
10258
10259 if(d[dpos] == 0xFF)
10260 {
10261 AJFREE(thys->Accuracy);
10262 thys->Qualsize = 0;
10263 ajDebug("qual: MISSING\n");
10264 dpos += c->l_qseq;
10265 }
10266 else
10267 {
10268 ajStrAssignC(&qualstr, "");
10269
10270 if(thys->Qualsize < (ajuint) c->l_qseq)
10271 {
10272 AJCRESIZE(thys->Accuracy, c->l_qseq);
10273 thys->Qualsize = c->l_qseq;
10274 }
10275
10276 for(i=0; i < (ajuint) c->l_qseq; i++)
10277 {
10278 ajFmtPrintAppS(&qualstr, " %02x", 33 + d[dpos]);
10279 thys->Accuracy[i] = (float) d[dpos++];
10280 }
10281
10282 ajDebug("qual: %p %S\n", dpos, qualstr);
10283 ajStrDel(&qualstr);
10284 }
10285
10286 ajStrAssignC(&tagstr, "");
10287
10288 while (dpos < (ajuint) b->data_len)
10289 {
10290 ajStrAppendK(&tagstr, ' ');
10291 ajStrAppendK(&tagstr, d[dpos++]);
10292 ajStrAppendK(&tagstr, d[dpos++]);
10293 ajStrAppendK(&tagstr, ':');
10294 dp = d[dpos++];
10295 ajStrAppendK(&tagstr, dp);
10296 ajStrAppendK(&tagstr, ':');
10297
10298 ajDebug("tag type: '%c\n",dp);
10299
10300 if (dp == 'Z' || dp == 'H')
10301 {
10302 ajFmtPrintAppS(&tagstr,"%s", &d[dpos]);
10303 while(d[dpos])
10304 dpos++;
10305 dpos++;
10306 }
10307 else if (dp == 'f')
10308 {
10309 ajFmtPrintAppS(&tagstr,"%f", (float) *(&d[dpos]));
10310 dpos += 4;
10311 }
10312 else if (dp == 'd')
10313 {
10314 ajFmtPrintAppS(&tagstr,"%lf", (double) *(&d[dpos]));
10315 dpos += 8;
10316 }
10317 else if (dp == 'A')
10318 {
10319 ajFmtPrintAppS(&tagstr,"%c", &d[dpos++]);
10320 }
10321 else if (dp == 'c')
10322 {
10323 ajFmtPrintAppS(&tagstr,"%d",
10324 (ajint) (signed char) d[dpos++]);
10325 }
10326 else if (dp == 's')
10327 {
10328 ajFmtPrintAppS(&tagstr,"%hd",
10329 (ajshort) *(&d[dpos]));
10330 dpos += 2;
10331 }
10332 else if (dp == 'i')
10333 {
10334 ajFmtPrintAppS(&tagstr,"%d",
10335 (ajint) *(&d[dpos]));
10336 dpos += 4;
10337 }
10338 else if (dp == 'C')
10339 {
10340 ajFmtPrintAppS(&tagstr,"%u",
10341 (ajuint) d[dpos++]);
10342 }
10343 else if (dp == 'S')
10344 {
10345 ajFmtPrintAppS(&tagstr,"%hu",
10346 (ajushort) *(&d[dpos]));
10347 dpos += 2;
10348 }
10349 else if (dp == 'I')
10350 {
10351 ajFmtPrintAppS(&tagstr,"%u",
10352 (ajuint) d[dpos]);
10353 dpos += 4;
10354 }
10355 else {
10356 ajWarn("Unknown BAM aux type char(%d) '%c'", (ajint) dp, dp);
10357 ajFmtPrintAppS(&tagstr,"???");
10358 }
10359 }
10360
10361 ajDebug("tags: %p '%S'\n", dpos, tagstr);
10362 ajStrDel(&tagstr);
10363
10364 bamdata->Count++;
10365
10366 return ajTrue;
10367 }
10368
10369
10370
10371
10372
10373 /* @funcstatic seqReadSam *****************************************************
10374 **
10375 ** Given data in a sequence structure, tries to read everything needed
10376 ** using sequence alignment/map (SAM) format.
10377 **
10378 ** @param [w] thys [AjPSeq] Sequence object
10379 ** @param [u] seqin [AjPSeqin] Sequence input object
10380 ** @return [AjBool] ajTrue on success
10381 **
10382 ** @release 6.2.0
10383 ** @@
10384 ******************************************************************************/
10385
seqReadSam(AjPSeq thys,AjPSeqin seqin)10386 static AjBool seqReadSam(AjPSeq thys, AjPSeqin seqin)
10387 {
10388 AjBool ok = ajTrue;
10389 AjPFilebuff buff;
10390 AjPFile infile = NULL;
10391 AjBool badformat = ajFalse;
10392 ajuint seqlen = 0;
10393 const char *cp;
10394 ajuint i;
10395 ajint iqual;
10396 ajint qmin = 33;
10397 ajint qmax = 126;
10398 ajuint flags;
10399 ajint iflags;
10400
10401 buff = seqin->Input->Filebuff;
10402 infile = ajFilebuffGetFile(buff);
10403
10404 /* === header section === */
10405
10406 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10407
10408 if(ajTextinGetCount(seqin->Input) == 1)
10409 {
10410 while(ok && ajStrGetCharFirst(seqReadLine) == '@')
10411 {
10412 ajStrTokenAssignC(&seqHandle, seqReadLine, "\t");
10413 ajStrTokenNextParse(seqHandle,&seqToken);
10414 switch(ajStrGetCharPos(seqToken, 1))
10415 {
10416 case 'H':
10417 /* @HD header VN:
10418 **
10419 */
10420 if(!ajStrMatchC(seqToken, "@HD"))
10421 badformat = ajTrue;
10422 break;
10423 case 'S':
10424 /* @SQ sequence dictionary SN: LN:
10425 **
10426 */
10427 if(!ajStrMatchC(seqToken, "@SQ"))
10428 badformat = ajTrue;
10429 break;
10430 case 'R':
10431 /* @RG read group ID: SM:
10432 **
10433 */
10434 if(!ajStrMatchC(seqToken, "@RG"))
10435 badformat = ajTrue;
10436 break;
10437 case 'P':
10438 /* @PG program name ID:
10439 **
10440 */
10441 if(!ajStrMatchC(seqToken, "@PG"))
10442 badformat = ajTrue;
10443 break;
10444 case 'C':
10445 /* @CO comment
10446 **
10447 */
10448 if(!ajStrMatchC(seqToken, "@CO"))
10449 badformat = ajTrue;
10450 break;
10451 default:
10452 badformat = ajTrue;
10453 break;
10454 }
10455 if(badformat)
10456 {
10457 ajErr("bad sam format header record '%S'", seqReadLine);
10458 return ajFalse;
10459 }
10460 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10461 }
10462 }
10463
10464 if(!ok)
10465 return ajFalse;
10466
10467 /* === alignment section === */
10468
10469 if(ajStrParseCountC(seqReadLine, "\t") < 11)
10470 return ajFalse;
10471
10472 ajStrTokenAssignC(&seqHandle, seqReadLine, "\t\n");
10473
10474 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* QNAME */
10475 seqSetNameNospace(&thys->Name, seqToken);
10476 ajDebug("QNAME '%S' '%S'\n", seqToken, thys->Name);
10477
10478 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* FLAG */
10479 ajDebug("FLAG '%S'\n", seqToken);
10480
10481 if(ajStrGetLen(seqToken))
10482 {
10483 if(!ajStrToUint(seqToken, &flags))
10484 {
10485 ajErr("SAM %F '%S' invalid FLAG value %S\n",
10486 infile, thys->Name, seqToken);
10487 return ajFalse;
10488 }
10489 }
10490
10491 ajDebug("flags %x\n", flags);
10492
10493 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* RNAME */
10494 ajDebug("RNAME '%S'\n", seqToken);
10495
10496 /*
10497 if(ajStrGetLen(seqToken))
10498 seqAccSave(thys, seqToken);
10499 */
10500
10501 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* POS */
10502 ajDebug("POS '%S'\n", seqToken);
10503
10504 if(ajStrGetLen(seqToken))
10505 {
10506 if(!ajStrToUint(seqToken, &flags))
10507 {
10508 ajErr("SAM %F '%S' invalid POS value %S\n",
10509 infile, thys->Name, seqToken);
10510 return ajFalse;
10511 }
10512 }
10513
10514 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* MAPQ */
10515 ajDebug("MAPQ '%S'\n", seqToken);
10516
10517 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* CIGAR */
10518 ajDebug("CIGAR '%S'\n", seqToken);
10519
10520 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* MRNM */
10521 ajDebug("MRNM '%S'\n", seqToken);
10522
10523 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* MPOS */
10524 ajDebug("MPOS '%S'\n", seqToken);
10525
10526 if(ajStrGetLen(seqToken))
10527 {
10528 if(!ajStrToUint(seqToken, &flags))
10529 {
10530 ajErr("SAM %F '%S' invalid MPOS value %S\n",
10531 infile, thys->Name, seqToken);
10532 return ajFalse;
10533 }
10534 }
10535
10536 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* ISIZE */
10537 ajDebug("ISIZE '%S'\n", seqToken);
10538
10539 if(ajStrGetLen(seqToken))
10540 {
10541 if(!ajStrToInt(seqToken, &iflags))
10542 {
10543 ajErr("SAM %F '%S' invalid ISIZE value %S\n",
10544 infile, thys->Name, seqToken);
10545 return ajFalse;
10546 }
10547 }
10548
10549
10550 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* SEQ */
10551 ajDebug("SEQ '%S'\n", seqToken);
10552 seqAppend(&thys->Seq, seqToken);
10553 seqlen = MAJSTRGETLEN(seqToken);
10554
10555 ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* QUAL */
10556 ajDebug("QUAL '%S'", seqToken);
10557
10558 if(ajStrCmpC(seqToken,"*")!=0)
10559 {
10560 if(MAJSTRGETLEN(seqToken) != seqlen)
10561 {
10562 ajErr("SAM quality length mismatch '%F' '%S' "
10563 "expected: %u found: %u '%S' '%S'",
10564 infile, thys->Name,
10565 seqlen, ajStrGetLen(seqQualStr), thys->Seq, seqToken);
10566 return ajFalse;
10567 }
10568
10569 cp = MAJSTRGETPTR(seqToken);
10570 i=0;
10571
10572 if(seqlen > thys->Qualsize)
10573 AJCRESIZE(thys->Accuracy, seqlen);
10574
10575 thys->Qualsize = seqlen;
10576
10577 if(MAJSTRGETLEN(seqToken) > thys->Qualsize)
10578 AJCRESIZE(thys->Accuracy, MAJSTRGETLEN(seqQualStr));
10579
10580 while (*cp)
10581 {
10582 iqual = *cp++;
10583 if(iqual < qmin)
10584 {
10585 ajWarn("SAM '%F' sequence '%S' "
10586 "quality value %d '%c' too low",
10587 infile, thys->Name,
10588 (ajint) (cp - MAJSTRGETPTR(seqToken)), (char) iqual);
10589 iqual = qmin;
10590 }
10591 if(iqual > qmax)
10592 {
10593 ajWarn("SAM '%F' sequence '%S' "
10594 "quality value '%c' too high",
10595 infile, thys->Name,
10596 (char) iqual);
10597 iqual = qmax;
10598 }
10599 thys->Accuracy[i++] = seqQualPhred[iqual];
10600 }
10601 }
10602
10603 /* 11 fields then (tag:vtype:value)... */
10604
10605 ajStrDelStatic(&seqToken);
10606 ajStrTokenReset(seqHandle);
10607
10608 return ajTrue;
10609 }
10610
10611
10612
10613
10614 /* @funcstatic seqReadScf *****************************************************
10615 **
10616 ** Given data in a sequence structure, tries to read everything needed
10617 ** using stored chromatogram format (SCF)
10618 **
10619 ** @param [w] thys [AjPSeq] Sequence object
10620 ** @param [u] seqin [AjPSeqin] Sequence input object
10621 ** @return [AjBool] ajTrue on success
10622 **
10623 ** @release 6.2.0
10624 ** @@
10625 ******************************************************************************/
10626
seqReadScf(AjPSeq thys,AjPSeqin seqin)10627 static AjBool seqReadScf(AjPSeq thys, AjPSeqin seqin)
10628 {
10629 AjPFilebuff buff;
10630 AjPFile infile = NULL;
10631 ajuint i;
10632 ajulong filestat = 0L;
10633 size_t status;
10634 SeqOScfData scfdata;
10635 ajuint magicnum = SCF_MAGIC;
10636 ajuint seqlen;
10637 ajuint iqual;
10638 AjBool revint = ajFalse;
10639 AjBool hasqual = ajFalse;
10640 SeqOScfBase scfbase;
10641 AjPStr tmpstr = NULL;
10642 ajuint scfversion = 0;
10643 ajuint *iprob = NULL;
10644 ajuint *peakoffset = NULL;
10645 unsigned char *probA = NULL;
10646 unsigned char *probC = NULL;
10647 unsigned char *probG = NULL;
10648 unsigned char *probT = NULL;
10649 char *bases = NULL;
10650
10651 buff = seqin->Input->Filebuff;
10652 infile = ajFilebuffGetFile(buff);
10653
10654 if(ajFilebuffIsEnded(buff))
10655 return ajFalse;
10656
10657 filestat = ajFileSeek(infile, 0L, SEEK_SET);
10658 if(filestat != 0)
10659 {
10660 ajDebug("seqReadScf rewind failed errno %d: %s\n",
10661 errno, strerror(errno));
10662 return ajFalse;
10663 }
10664
10665 /* === header section === */
10666
10667 status = ajReadbinBinary(infile, 1, 128, &scfdata.header);
10668 if(!status)
10669 return ajFalse;
10670
10671 if(scfdata.header.magic_number != magicnum)
10672 {
10673 ajByteRevLen4u(&scfdata.header.magic_number);
10674 if(scfdata.header.magic_number != magicnum)
10675 {
10676 ajDebug("SCF magic number expected '%x' reversed '%x'\n",
10677 magicnum, scfdata.header.magic_number);
10678 return ajFalse;
10679 }
10680
10681 ajDebug("SCF magic number '%x' reversed Bigendian: %B\n",
10682 scfdata.header.magic_number,
10683 ajUtilGetBigendian());
10684
10685 revint = ajTrue;
10686
10687 ajByteRevLen4u(&scfdata.header.samples);
10688 ajByteRevLen4u(&scfdata.header.samples_offset);
10689 ajByteRevLen4u(&scfdata.header.bases);
10690 ajByteRevLen4u(&scfdata.header.bases_left_clip);
10691 ajByteRevLen4u(&scfdata.header.bases_right_clip);
10692 ajByteRevLen4u(&scfdata.header.bases_offset);
10693 ajByteRevLen4u(&scfdata.header.comments_size);
10694 ajByteRevLen4u(&scfdata.header.comments_offset);
10695 ajByteRevLen4u(&scfdata.header.sample_size);
10696 ajByteRevLen4u(&scfdata.header.code_set);
10697 ajByteRevLen4u(&scfdata.header.private_size);
10698 ajByteRevLen4u(&scfdata.header.private_offset);
10699 for(i=0; i < 18; i++)
10700 ajByteRevLen4u(&scfdata.header.spare[i]);
10701 }
10702
10703 scfversion = scfdata.header.version[0] - '0';
10704
10705 ajDebug("version %u '%c%c%c%c' uncertainty %u '%s' precision %u %ubit\n",
10706 scfversion, scfdata.header.version[0],
10707 scfdata.header.version[1],
10708 scfdata.header.version[2],
10709 scfdata.header.version[3],
10710 scfdata.header.code_set,
10711 SeqScfUncertainCodes[scfdata.header.code_set].name,
10712 scfdata.header.sample_size,
10713 (scfdata.header.sample_size - 1) ? 8 : 16
10714 );
10715
10716 ajDebug("%u samples at %u\n",
10717 scfdata.header.samples,
10718 scfdata.header.samples_offset);
10719
10720 ajDebug("%u bases at %u\n",
10721 scfdata.header.bases,
10722 scfdata.header.bases_offset);
10723
10724 ajDebug("%u char comment at %u\n",
10725 scfdata.header.comments_size,
10726 scfdata.header.comments_offset);
10727
10728 ajDebug("%u private records at %u\n",
10729 scfdata.header.private_size,
10730 scfdata.header.private_offset);
10731
10732
10733 filestat = ajFileSeek(infile, scfdata.header.bases_offset, SEEK_SET);
10734 if(filestat != 0)
10735 {
10736 ajDebug("seqReadScf seek failed errno %d: %s\n",
10737 errno, strerror(errno));
10738 return ajFalse;
10739 }
10740
10741 seqlen = scfdata.header.bases;
10742 AJCNEW(bases, seqlen+1);
10743 AJCNEW(iprob, seqlen);
10744 bases[seqlen] = '\0';
10745
10746 if(scfversion < 3)
10747 {
10748 for(i=0; i < seqlen; i++)
10749 {
10750 ajReadbinBinary(infile, 1, 12, &scfbase);
10751 if(revint)
10752 {
10753 if(revint)
10754 ajByteRevLen4u(&scfbase.peak_index);
10755 bases[i] = scfbase.base;
10756 switch(scfbase.base)
10757 {
10758 case 'A':
10759 case 'a':
10760 iqual = scfbase.prob_A;
10761 break;
10762 case 'C':
10763 case 'c':
10764 iqual = scfbase.prob_C;
10765 break;
10766 case 'G':
10767 case 'g':
10768 iqual = scfbase.prob_G;
10769 break;
10770 case 'T':
10771 case 't':
10772 iqual = scfbase.prob_T;
10773 break;
10774 default:
10775 bases[i] = 'N';
10776 iqual = 0;
10777 }
10778 if(iqual)
10779 hasqual = ajTrue;
10780 iprob[i] = iqual;
10781 }
10782 }
10783 }
10784 else if (scfversion == 3)
10785 {
10786 AJCNEW(peakoffset, seqlen);
10787 AJCNEW(probA, seqlen);
10788 AJCNEW(probC, seqlen);
10789 AJCNEW(probG, seqlen);
10790 AJCNEW(probT, seqlen);
10791 ajReadbinBinary(infile, seqlen, 4, peakoffset);
10792 ajReadbinBinary(infile, seqlen, 1, probA);
10793 ajReadbinBinary(infile, seqlen, 1, probC);
10794 ajReadbinBinary(infile, seqlen, 1, probG);
10795 ajReadbinBinary(infile, seqlen, 1, probT);
10796 ajReadbinBinary(infile, seqlen, 1, bases);
10797
10798 for(i=0; i < seqlen; i++)
10799 {
10800 if(revint)
10801 ajByteRevLen4u(&peakoffset[i]);
10802 switch(bases[i])
10803 {
10804 case 'A':
10805 case 'a':
10806 iqual = probA[i];
10807 break;
10808 case 'C':
10809 case 'c':
10810 iqual = probC[i];
10811 break;
10812 case 'G':
10813 case 'g':
10814 iqual = probG[i];
10815 break;
10816 case 'T':
10817 case 't':
10818 iqual = probT[i];
10819 break;
10820 default:
10821 bases[i] = 'N';
10822 iqual = 0;
10823 }
10824 if(iqual)
10825 hasqual = ajTrue;
10826 iprob[i] = iqual;
10827 }
10828 }
10829 else
10830 {
10831 ajDebug("Unknown SCF version '%c%c%c%c'",
10832 scfdata.header.version[0],
10833 scfdata.header.version[1],
10834 scfdata.header.version[2],
10835 scfdata.header.version[3]);
10836 return ajFalse;
10837 }
10838
10839 filestat = ajFileSeek(infile, scfdata.header.comments_offset, SEEK_SET);
10840 if(filestat != 0)
10841 {
10842 ajDebug("seqReadScf seek failed errno %d: %s\n",
10843 errno, strerror(errno));
10844 return ajFalse;
10845 }
10846 ajReadbinStr(infile, scfdata.header.comments_size, &tmpstr);
10847
10848 ajStrExchangeCC(&tmpstr, "\r", "\n");
10849 ajStrExchangeCC(&tmpstr, "\n\n", "\n");
10850 ajStrExchangeCC(&tmpstr, "\n", "; ");
10851 ajStrExchangeCC(&tmpstr, " ;", ";");
10852 ajStrTrimWhiteEnd(&tmpstr);
10853 ajStrAssignS(&thys->Desc, tmpstr);
10854
10855 ajStrAssignC(&tmpstr, bases);
10856 seqAppendWarn(&thys->Seq, tmpstr,
10857 seqin->Input->Format);
10858 if(hasqual)
10859 {
10860 AJCRESIZE(thys->Accuracy, seqlen);
10861 thys->Qualsize = seqlen;
10862 for(i=0; i < seqlen; i++)
10863 thys->Accuracy[i] = seqQualPhred[iprob[i]];
10864 }
10865
10866 ajStrDel(&tmpstr);
10867
10868 seqSetNameFile(thys, seqin);
10869
10870 ajFilebuffClear(buff, 0);
10871 buff->File->End = ajTrue;
10872
10873 return ajTrue;
10874 }
10875
10876
10877
10878
10879 /* @funcstatic seqReadStrider *************************************************
10880 **
10881 ** Given data in a sequence structure, tries to read everything needed
10882 ** using DNA strider format.
10883 **
10884 ** @param [w] thys [AjPSeq] Sequence object
10885 ** @param [u] seqin [AjPSeqin] Sequence input object
10886 ** @return [AjBool] ajTrue on success
10887 **
10888 ** @release 1.0.0
10889 ** @@
10890 ******************************************************************************/
10891
seqReadStrider(AjPSeq thys,AjPSeqin seqin)10892 static AjBool seqReadStrider(AjPSeq thys, AjPSeqin seqin)
10893 {
10894 AjPFilebuff buff;
10895 AjBool ok = ajTrue;
10896
10897 buff = seqin->Input->Filebuff;
10898
10899 do
10900 {
10901 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10902
10903 if(ok)
10904 {
10905 if(ajStrPrefixC(seqReadLine, "; DNA sequence"))
10906 {
10907 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\t,\n");
10908 ajStrTokenStep(seqHandle); /* 'DNA' */
10909 ajStrTokenStep(seqHandle); /* sequence */
10910 ajStrTokenNextParse(seqHandle, &seqToken); /* entry name */
10911 }
10912 }
10913
10914 } while(ok && ajStrPrefixC(seqReadLine, ";"));
10915
10916 ajStrTokenReset(seqHandle);
10917
10918 if(!ok || !ajStrGetLen(seqToken))
10919 {
10920 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
10921 ajStrDelStatic(&seqToken);
10922
10923 return ajFalse;
10924 }
10925
10926 /* we know we will succeed from here ... no way to return ajFalse */
10927
10928 ajFilebuffSetUnbuffered(buff);
10929
10930 seqSetName(thys, seqToken);
10931
10932 /* OK, we have the name. Now look for the sequence */
10933
10934 while(ok && !ajStrPrefixC(seqReadLine, "//"))
10935 {
10936 seqAppend(&thys->Seq, seqReadLine);
10937 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10938 }
10939
10940 ajFilebuffClear(buff, 0);
10941 ajStrDelStatic(&seqToken);
10942
10943 return ajTrue;
10944 }
10945
10946
10947
10948
10949 /* @funcstatic seqReadMsf *****************************************************
10950 **
10951 ** Tries to read input in MSF format. If successful, can repeat for the
10952 ** next call to return the second, third, ... sequence from the same file.
10953 **
10954 ** @param [w] thys [AjPSeq] Sequence object
10955 ** @param [u] seqin [AjPSeqin] Sequence input object
10956 ** @return [AjBool] ajTrue on success
10957 **
10958 ** @release 1.0.0
10959 ** @@
10960 ******************************************************************************/
10961
seqReadMsf(AjPSeq thys,AjPSeqin seqin)10962 static AjBool seqReadMsf(AjPSeq thys, AjPSeqin seqin)
10963 {
10964 ajuint len;
10965 AjBool ok = ajFalse;
10966 ajuint iseq = 0;
10967
10968 AjPFilebuff buff;
10969 AjPTable msftable = NULL;
10970 SeqPMsfItem msfitem = NULL;
10971 const SeqPMsfItem readmsfitem = NULL;
10972 AjPList msflist = NULL;
10973 SeqPMsfData msfdata = NULL;
10974
10975 ajuint i;
10976
10977 ajDebug("seqReadMsf seqin->SeqData %x\n", seqin->SeqData);
10978
10979 buff = seqin->Input->Filebuff;
10980
10981 if(!seqin->SeqData)
10982 {
10983 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10984 if(!ok)
10985 return ajFalse;
10986
10987 if(ajStrPrefixC(seqReadLine, "!!"))
10988 {
10989 if(ajStrPrefixC(seqReadLine, "!!AA_MULTIPLE_ALIGNMENT"))
10990 ajSeqSetProt(thys);
10991
10992 if(ajStrPrefixC(seqReadLine, "!!NA_MULTIPLE_ALIGNMENT"))
10993 ajSeqSetNuc(thys);
10994 }
10995
10996 if(!seqGcgMsfDots(thys, seqin, &seqReadLine, seqMaxGcglines, &len))
10997 {
10998 ajDebug("seqGcgMsfDots failed\n");
10999 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
11000
11001 return ajFalse;
11002 }
11003
11004 /* we know we will succeed from here ... no way to return ajFalse */
11005
11006 ajFilebuffSetUnbuffered(buff);
11007
11008 seqin->SeqData = AJNEW0(msfdata);
11009 msfdata->Table = msftable = ajTablestrNew(1000);
11010 msflist = ajListstrNew();
11011 seqin->Input->Filecount = 0;
11012 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
11013
11014 while(ok && !ajStrPrefixC(seqReadLine, "//"))
11015 {
11016 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
11017 if(seqGcgMsfHeader(seqReadLine, &msfitem))
11018 {
11019 ajTablePut(msftable, ajStrNewS(msfitem->Name), msfitem);
11020 ajListstrPushAppend(msflist, ajStrNewS(msfitem->Name));
11021 iseq++;
11022 }
11023 }
11024
11025 ajDebug("Header has %d sequences\n", iseq);
11026 ajListstrTrace(msflist);
11027 ajTableTrace(msftable);
11028 ajTableMap(msftable, &seqMsfTabList, NULL);
11029
11030 msfdata->Names = AJCALLOC(iseq, sizeof(*msfdata->Names));
11031
11032 for(i=0; i < iseq; i++)
11033 {
11034 ajListstrPop(msflist, &msfdata->Names[i]);
11035 ajDebug("list [%d] '%S'\n", i, msfdata->Names[i]);
11036 }
11037
11038 ajListstrFreeData(&msflist);
11039
11040 while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
11041 {
11042 seqGcgMsfReadseq(seqReadLine, msftable);
11043 }
11044
11045 ajTableMap(msftable, &seqMsfTabList, NULL);
11046 msfdata->Nseq = iseq;
11047 msfdata->Count = 0;
11048 msfdata->Bufflines = ajTextinGetRecords(seqin->Input);
11049 ajDebug("MSF format read %d lines\n",
11050 ajTextinGetRecords(seqin->Input));
11051 }
11052
11053 msfdata = seqin->SeqData;
11054 msftable = msfdata->Table;
11055
11056 if(msfdata->Count >= msfdata->Nseq)
11057 {
11058 ajFilebuffClear(seqin->Input->Filebuff, 0);
11059 seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
11060
11061 return ajFalse;
11062 }
11063
11064 i = msfdata->Count;
11065 ajDebug("returning [%d] '%S'\n", i, msfdata->Names[i]);
11066 readmsfitem = ajTableFetchS(msftable, msfdata->Names[i]);
11067 ajStrAssignS(&thys->Name, msfdata->Names[i]);
11068
11069 thys->Weight = readmsfitem->Weight;
11070 ajStrAssignS(&thys->Seq, readmsfitem->Seq);
11071
11072 msfdata->Count++;
11073
11074 return ajTrue;
11075 }
11076
11077
11078
11079
11080 /* @funcstatic seqGcgMsfReadseq ***********************************************
11081 **
11082 ** Reads sequence name from first token on the input line, and appends
11083 ** the sequence data to that sequence in the msftable structure.
11084 **
11085 ** @param [r] rdline [const AjPStr] Line from input file.
11086 ** @param [r] msftable [const AjPTable] MSF format sequence table.
11087 ** @return [AjBool] ajTrue on success
11088 **
11089 ** @release 1.0.0
11090 ** @@
11091 ******************************************************************************/
11092
seqGcgMsfReadseq(const AjPStr rdline,const AjPTable msftable)11093 static AjBool seqGcgMsfReadseq(const AjPStr rdline, const AjPTable msftable)
11094 {
11095 SeqPMsfItem msfitem;
11096 AjPStr token = NULL;
11097 AjPStr seqstr = NULL;
11098 AjBool status;
11099
11100 status = ajStrExtractWord(rdline, &seqstr, &token);
11101
11102 if(!status)
11103 {
11104 ajStrDel(&token);
11105 ajStrDel(&seqstr);
11106
11107 return ajFalse;
11108 }
11109
11110 ajDebug("seqGcgMsfReadseq '%S' '%S'\n", token, seqstr);
11111
11112 msfitem = ajTableFetchmodS(msftable, token);
11113
11114 if(!msfitem)
11115 {
11116 ajStrDel(&token);
11117 ajStrDel(&seqstr);
11118
11119 return ajFalse;
11120 }
11121
11122 seqAppend(&msfitem->Seq, seqstr);
11123
11124 ajStrDel(&token);
11125 ajStrDel(&seqstr);
11126
11127 return ajTrue;
11128 }
11129
11130
11131
11132
11133 /* @funcstatic seqMsfDataDel **************************************************
11134 **
11135 ** Destructor for SeqPMsfData objects
11136 **
11137 ** @param [d] pthys [SeqPMsfData*] MSF data object
11138 ** @return [void]
11139 **
11140 ** @release 4.1.0
11141 ** @@
11142 ******************************************************************************/
11143
seqMsfDataDel(SeqPMsfData * pthys)11144 static void seqMsfDataDel(SeqPMsfData* pthys)
11145 {
11146 SeqPMsfData thys;
11147 ajuint i;
11148
11149 if(!pthys)
11150 return;
11151
11152 if(!*pthys)
11153 return;
11154
11155 thys = *pthys;
11156
11157 ajDebug("seqMsfDataDel Nseq:%u Count:%u Table:%u Nexus:%Lu\n",
11158 thys->Nseq, thys->Count, ajTableGetLength(thys->Table),
11159 ajNexusGetNtaxa(thys->Nexus));
11160
11161 for(i=0; i < thys->Nseq; i++)
11162 {
11163 ajStrDel(&thys->Names[i]);
11164 }
11165
11166
11167 AJFREE(thys->Names);
11168
11169 ajNexusDel(&thys->Nexus);
11170 ajStrDel(&thys->Gene);
11171 ajStrDel(&thys->Domain);
11172 ajStrDel(&thys->NextGene);
11173 ajStrDel(&thys->NextDomain);
11174 ajTableMapDel(thys->Table, &seqMsfTabDel, NULL);
11175 ajTableFree(&thys->Table);
11176
11177 AJFREE(*pthys);
11178
11179 return;
11180 }
11181
11182
11183
11184
11185 /* @funcstatic seqMsfItemDel **************************************************
11186 **
11187 ** Destructor for SeqPMsfItem objects
11188 **
11189 ** @param [d] pthys [SeqPMsfItem*] MSF item object
11190 ** @return [void]
11191 **
11192 ** @release 4.1.0
11193 ** @@
11194 ******************************************************************************/
11195
seqMsfItemDel(SeqPMsfItem * pthys)11196 static void seqMsfItemDel(SeqPMsfItem* pthys)
11197 {
11198 SeqPMsfItem thys;
11199
11200 if(!pthys)
11201 return;
11202
11203 if(!*pthys)
11204 return;
11205
11206 thys = *pthys;
11207
11208 ajStrDel(&thys->Name);
11209 ajStrDel(&thys->Desc);
11210 ajStrDel(&thys->Seq);
11211
11212 AJFREE(*pthys);
11213
11214 return;
11215 }
11216
11217
11218
11219
11220 /* @funcstatic seqMsfTabList **************************************************
11221 **
11222 ** Writes a debug report of the contents of an MSF table.
11223 **
11224 ** @param [r] key [const void*] Standard argument, key from current table item
11225 ** which is a string for MSF internal tables.
11226 ** @param [r] value [void**] Standard argument, data from current table item,
11227 ** converted to an MSF internal table item.
11228 ** @param [r] cl [void*] Standard argument, usually NULL.
11229 ** @return [void]
11230 **
11231 ** @release 1.0.0
11232 ** @@
11233 ******************************************************************************/
11234
seqMsfTabList(const void * key,void ** value,void * cl)11235 static void seqMsfTabList(const void* key, void** value, void* cl)
11236 {
11237 SeqPMsfItem msfitem;
11238
11239 (void) cl;
11240
11241 msfitem = (SeqPMsfItem) *value;
11242
11243 ajDebug("key '%S' Name '%S' Seqlen %d\n",
11244 key, msfitem->Name, ajStrGetLen(msfitem->Seq));
11245
11246 return;
11247 }
11248
11249
11250
11251
11252 /* @funcstatic seqMsfDataTrace ************************************************
11253 **
11254 ** Debug trace report for SeqPMsfData objects
11255 **
11256 ** @param [r] thys [const SeqPMsfData] MSF data object
11257 ** @return [void]
11258 **
11259 ** @release 4.1.0
11260 ** @@
11261 ******************************************************************************/
11262
seqMsfDataTrace(const SeqPMsfData thys)11263 static void seqMsfDataTrace(const SeqPMsfData thys)
11264 {
11265 ajuint i;
11266
11267 if(!thys)
11268 {
11269 ajDebug("seqMsfDataTrace <null>\n");
11270 return;
11271 }
11272
11273 ajDebug("seqMsfDataTrace Nseq:%u Count:%u Table:%u Nexus:%Lu\n",
11274 thys->Nseq, thys->Count, ajTableGetLength(thys->Table),
11275 ajNexusGetNtaxa(thys->Nexus));
11276
11277 for(i=0; i < thys->Nseq; i++)
11278 if(i < thys->Count)
11279 ajDebug("* [%u] '%S'\n", i, thys->Names[i]);
11280 else
11281 ajDebug(" [%u] '%S'\n", i, thys->Names[i]);
11282
11283 ajTableMap(thys->Table, &seqMsfTabList, NULL);
11284
11285 return;
11286 }
11287
11288
11289
11290
11291 /* @funcstatic seqMsfTabDel ***************************************************
11292 **
11293 ** Deletes entries from the MSF internal table. Called for each entry in turn.
11294 **
11295 ** @param [d] key [void**] Standard argument, table key.
11296 ** @param [d] value [void**] Standard argument, table data item.
11297 ** @param [r] cl [void*] Standard argument, usually NULL
11298 ** @return [void]
11299 **
11300 ** @release 1.0.0
11301 ** @@
11302 ******************************************************************************/
11303
seqMsfTabDel(void ** key,void ** value,void * cl)11304 static void seqMsfTabDel(void** key, void** value, void* cl)
11305 {
11306 SeqPMsfItem msfitem;
11307 AjPStr keystr;
11308
11309 (void) cl;
11310
11311 keystr = (AjPStr) *key;
11312 msfitem = (SeqPMsfItem) *value;
11313
11314 ajStrDel(&keystr);
11315
11316 seqMsfItemDel(&msfitem);
11317
11318 *key = NULL;
11319 *value = NULL;
11320
11321 return;
11322 }
11323
11324
11325
11326
11327 /* @funcstatic seqReadSwiss ***************************************************
11328 **
11329 ** Given data in a sequence structure, tries to read everything needed
11330 ** using SWISS format.
11331 **
11332 ** @param [w] thys [AjPSeq] Sequence object
11333 ** @param [u] seqin [AjPSeqin] Sequence input object
11334 ** @return [AjBool] ajTrue on success
11335 **
11336 ** @release 1.0.0
11337 ** @@
11338 ******************************************************************************/
11339
seqReadSwiss(AjPSeq thys,AjPSeqin seqin)11340 static AjBool seqReadSwiss(AjPSeq thys, AjPSeqin seqin)
11341 {
11342 AjBool ok;
11343 AjPFilebuff buff;
11344 AjBool dodes = ajFalse;
11345 AjBool dofeat = ajFalse;
11346 AjBool tryfeat = ajFalse;
11347 AjPStr liststr; /* for lists, do not delete */
11348 AjPStr datestr = NULL;
11349 AjPStr datetype = NULL;
11350 AjPStr relstr = NULL;
11351 AjPStr taxstr = NULL;
11352 AjPStr cmtstr = NULL; /* stored in AjPSeq - do not delete */
11353 ajuint icount = 0;
11354 AjPSeqRef seqref = NULL;
11355 AjPSeqXref xref = NULL;
11356 AjPSeqGene seqgene = NULL;
11357 AjPSeqDesc desctop = NULL;
11358 AjPSeqDesc descmaster = NULL;
11359 AjPSeqSubdesc subdesc = NULL;
11360 AjBool descistop = ajTrue;
11361 AjBool isdescflag = ajFalse;
11362 AjPStr *Pdescstr = NULL;
11363 AjPStr newdescstr = NULL;
11364 AjPStr genetoken = NULL;
11365 const AjPStr tmpstr = NULL;
11366 ajuint refnum;
11367 ajuint itaxtype = 0;
11368 AjBool isnewgene = ajFalse;
11369 AjBool isgenetoken = ajFalse;
11370 AjIList iter;
11371 AjIList itb;
11372 AjIList itc;
11373 SeqEPrefixSwiss lineprefix = SWISS_UNK;
11374
11375 /*
11376 ** To be done: 12-Feb-09
11377 ** input line wrapping test GN,
11378 ** continue lines for OS
11379 **
11380 ** New line types:
11381 ** OH organism host: list of tax ids
11382 **
11383 ** CC line blocks -!- TOPIC:
11384 ** can do this by parsing the stored comment block
11385 **
11386 ** DR lines - can parse out the details
11387 */
11388
11389 buff = seqin->Input->Filebuff;
11390
11391 if(!seqFtFmtSwiss)
11392 ajStrAssignC(&seqFtFmtSwiss, "swissprot");
11393
11394 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
11395 return ajFalse;
11396
11397 lineprefix = seqPrefixSwiss(seqReadLine);
11398
11399 /* for GCG formatted databases */
11400
11401 while(lineprefix == SWISS_WP) /* "WP" */
11402 {
11403 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
11404 return ajFalse;
11405
11406 lineprefix = seqPrefixSwiss(seqReadLine);
11407 }
11408
11409 /* extra blank lines */
11410
11411 while(ajStrIsWhite(seqReadLine))
11412 {
11413 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
11414 return ajFalse;
11415
11416 lineprefix = seqPrefixSwiss(seqReadLine);
11417 }
11418
11419 ajDebug("seqReadSwiss first line '%S'\n", seqReadLine);
11420
11421 if(lineprefix != SWISS_ID) /* "ID" */
11422 {
11423 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
11424
11425 return ajFalse;
11426 }
11427
11428 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11429 ajStrTokenStep(seqHandle); /* 'ID' */
11430 ajStrTokenNextParse(seqHandle, &seqToken); /* entry name */
11431
11432 seqSetName(thys, seqToken);
11433
11434 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
11435 lineprefix = seqPrefixSwiss(seqReadLine);
11436
11437 if(!seqin->Minimal && !thys->Fulldesc)
11438 thys->Fulldesc = ajSeqdescNew();
11439
11440 if(seqin->Minimal)
11441 dodes = ajTrue;
11442
11443 dofeat = ajFalse;
11444 tryfeat = seqinUfoLocal(seqin);
11445
11446 while(ok && lineprefix != SWISS_SQ) /* read until "SQ" */
11447 {
11448 /* check for Staden Experiment format instead */
11449
11450 if(lineprefix == SWISS_EX) /* EN/EX/TN */
11451 {
11452 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
11453 ajStrTokenReset(seqHandle);
11454 ajStrDelStatic(&seqToken);
11455
11456 return ajFalse;;
11457 }
11458
11459 else if(lineprefix == SWISS_AC) /* AC */
11460 {
11461 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
11462 ajStrTokenStep(seqHandle); /* 'AC' */
11463
11464 while(ajStrTokenNextParse(seqHandle, &seqToken))
11465 seqAccSave(thys, seqToken);
11466 }
11467
11468 if(tryfeat && lineprefix == SWISS_FT) /* FT */
11469 {
11470 if(!dofeat) /* set up feature buffer */
11471 {
11472 dofeat = ajTrue;
11473 ajFeattabinDel(&seqin->Ftquery);
11474 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtSwiss,
11475 thys->Name, "N");
11476 ajDebug("seqin->Ftquery ftfile %x\n",
11477 seqin->Ftquery->Input->Filebuff);
11478 }
11479
11480 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
11481 /* ajDebug("SWISS FEAT saved line:\n%S", seqReadLine); */
11482 }
11483
11484 if(seqin->Minimal)
11485 {
11486 /*
11487 ** only simple description needed
11488 ** test DE line, extract basic text if any
11489 ** then go to next record
11490 */
11491
11492 if(lineprefix == SWISS_DE) /* DE minimal processing */
11493 {
11494 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
11495 ajStrTokenStep(seqHandle); /* 'DE' */
11496
11497 while(ajStrTokenNextParseC(seqHandle, " ;\n\r", &seqToken))
11498 {
11499 if(MAJSTRGETCHARLAST(seqToken) == ':')
11500 {
11501 switch(seqDesSwiss(seqToken))
11502 {
11503 case SWISS_DES_REC:
11504 dodes = ajTrue;
11505 break;
11506
11507 case SWISS_DES_UNK:
11508 if(dodes)
11509 {
11510 if(MAJSTRGETLEN(thys->Desc))
11511 ajStrAppendK(&thys->Desc, ' ');
11512 ajStrAppendS(&thys->Desc, seqToken);
11513 }
11514 break;
11515
11516 default:
11517 if(MAJSTRGETLEN(thys->Desc))
11518 dodes = ajFalse;
11519 break;
11520 }
11521 }
11522 else if(ajStrFindK(seqToken, '=') > 0)
11523 {
11524 switch(seqDessubSwiss(&seqToken))
11525 {
11526 case SWISS_SUB_FULL:
11527 if(!MAJSTRGETLEN(thys->Desc))
11528 dodes = ajTrue;
11529 break;
11530 case SWISS_SUB_UNK:
11531 break;
11532 default:
11533 dodes = ajFalse;
11534 break;
11535 }
11536 if(dodes)
11537 {
11538 if(MAJSTRGETLEN(thys->Desc))
11539 ajStrAppendK(&thys->Desc, ' ');
11540 ajStrAppendS(&thys->Desc, seqToken);
11541 }
11542 }
11543 else
11544 {
11545 if(dodes)
11546 {
11547 if(MAJSTRGETLEN(thys->Desc))
11548 ajStrAppendK(&thys->Desc, ' ');
11549 ajStrAppendS(&thys->Desc, seqToken);
11550 }
11551 }
11552 }
11553 }
11554
11555 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
11556 lineprefix = seqPrefixSwiss(seqReadLine);
11557 continue;
11558 }
11559
11560 switch(lineprefix) /* all other line types */
11561 {
11562 case SWISS_DE:
11563 if(!desctop)
11564 {
11565 desctop = thys->Fulldesc;
11566 descmaster = thys->Fulldesc;
11567 Pdescstr = &thys->Desc;
11568 }
11569
11570 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
11571 ajStrTokenStep(seqHandle); /* 'DE' */
11572
11573 while(ajStrTokenNextParseC(seqHandle, " ;\n\r", &seqToken))
11574 {
11575 if(ajStrGetCharLast(seqToken) == ':')
11576 {
11577 isdescflag = ajFalse;
11578
11579 switch(seqDesSwiss(seqToken))
11580 {
11581 case SWISS_DES_REC:
11582 Pdescstr = &descmaster->Name;
11583 descistop = ajTrue;
11584 break;
11585
11586 case SWISS_DES_ALT:
11587 subdesc = ajSeqsubdescNew();
11588 descistop = ajFalse;
11589 Pdescstr = &subdesc->Name;
11590 ajListPushAppend(descmaster->AltNames, subdesc);
11591 break;
11592
11593 case SWISS_DES_SUB:
11594 subdesc = ajSeqsubdescNew();
11595 descistop = ajFalse;
11596 Pdescstr = &subdesc->Name;
11597 ajListPushAppend(descmaster->SubNames, subdesc);
11598 break;
11599
11600 case SWISS_DES_INC:
11601 descmaster = ajSeqdescNew();
11602 descistop = ajTrue;
11603 ajListPushAppend(thys->Fulldesc->Includes,
11604 descmaster);
11605 Pdescstr = &descmaster->Name;
11606 break;
11607
11608 case SWISS_DES_CONT:
11609 descmaster = ajSeqdescNew();
11610 descistop = ajTrue;
11611 ajListPushAppend(thys->Fulldesc->Contains,
11612 descmaster);
11613 Pdescstr = &descmaster->Name;
11614 break;
11615
11616 case SWISS_DES_FLG:
11617 isdescflag = ajTrue;
11618 break;
11619
11620 default:
11621 ajDebug("Swissprot DE line"
11622 "UNKNOWN token '%S'\n",
11623 seqToken);
11624
11625 if(ajStrGetLen(*Pdescstr))
11626 ajStrAppendK(Pdescstr, ' ');
11627
11628 ajStrAppendS(Pdescstr, seqToken);
11629 }
11630 }
11631
11632 else if(ajStrFindK(seqToken, '=') > 0)
11633 {
11634 switch(seqDessubSwiss(&seqToken))
11635 {
11636 case SWISS_SUB_FULL:
11637 if(descistop)
11638 {
11639 Pdescstr = &descmaster->Name;
11640 }
11641 else
11642 {
11643 Pdescstr = &subdesc->Name;
11644 }
11645
11646 ajStrAssignS(Pdescstr, seqToken);
11647 break;
11648
11649 case SWISS_SUB_SHORT:
11650 newdescstr = ajStrNewC("");
11651 Pdescstr = &newdescstr;
11652
11653 if(descistop)
11654 ajListstrPushAppend(descmaster->Short,
11655 newdescstr);
11656 else
11657 ajListstrPushAppend(subdesc->Short,
11658 newdescstr);
11659
11660 ajStrAssignS(Pdescstr, seqToken);
11661 break;
11662
11663 case SWISS_SUB_EC:
11664 newdescstr = ajStrNewC("");
11665 Pdescstr = &newdescstr;
11666
11667 if(descistop)
11668 ajListstrPushAppend(descmaster->EC,
11669 newdescstr);
11670 else
11671 ajListstrPushAppend(subdesc->EC,
11672 newdescstr);
11673
11674 ajStrAssignS(Pdescstr, seqToken);
11675 xref = ajSeqxrefNewDbC(*Pdescstr, "ENZYME",
11676 XREF_EC);
11677 ajSeqAddXref(thys, xref);
11678 xref = NULL;
11679 break;
11680
11681 case SWISS_SUB_ALLER:
11682 newdescstr = ajStrNewC("");
11683 Pdescstr = &newdescstr;
11684 ajListstrPushAppend(subdesc->Allergen,
11685 newdescstr);
11686 ajStrAssignS(Pdescstr, seqToken);
11687 xref = ajSeqxrefNewDbC(*Pdescstr, "Allergen",
11688 XREF_DESC);
11689 ajSeqAddXref(thys, xref);
11690 xref = NULL;
11691 break;
11692
11693 case SWISS_SUB_BIOTECH:
11694 newdescstr = ajStrNewC("");
11695 Pdescstr = &newdescstr;
11696 ajListstrPushAppend(subdesc->Biotech,
11697 newdescstr);
11698 ajStrAssignS(Pdescstr, seqToken);
11699 break;
11700
11701 case SWISS_SUB_CDA:
11702 newdescstr = ajStrNewC("");
11703 Pdescstr = &newdescstr;
11704 ajListstrPushAppend(subdesc->Cdantigen,
11705 newdescstr);
11706 ajStrAssignS(Pdescstr, seqToken);
11707 xref = ajSeqxrefNewDbC(*Pdescstr, "CD_Antigen",
11708 XREF_DESC);
11709 ajSeqAddXref(thys, xref);
11710 xref = NULL;
11711 break;
11712
11713 case SWISS_SUB_INN:
11714 newdescstr = ajStrNewC("");
11715 Pdescstr = &newdescstr;
11716 ajListstrPushAppend(subdesc->Inn, newdescstr);
11717 ajStrAssignSubS(Pdescstr, seqToken, 4, -1);
11718 break;
11719
11720 default:
11721 ajDebug("Swissprot DE line "
11722 "UNKNOWN subtoken '%S'\n",
11723 seqToken);
11724
11725 if(ajStrGetLen(*Pdescstr))
11726 ajStrAppendK(Pdescstr, ' ');
11727
11728 ajStrAppendS(Pdescstr, seqToken);
11729 break;
11730 }
11731 }
11732 else
11733 {
11734 if(isdescflag)
11735 {
11736 if(ajStrMatchC(seqToken,"Precursor"))
11737 thys->Fulldesc->Precursor = ajTrue;
11738 else if(ajStrMatchC(seqToken,"Fragments"))
11739 thys->Fulldesc->Fragments = 2;
11740 else if(ajStrMatchC(seqToken,"Fragment"))
11741 thys->Fulldesc->Fragments = 1;
11742 else
11743 {
11744 ajDebug("unknown description flag text '%S'\n",
11745 seqToken);
11746 if(ajStrGetLen(*Pdescstr))
11747 ajStrAppendK(Pdescstr, ' ');
11748
11749 ajStrAppendC(Pdescstr, "Flags: ");
11750 ajStrAppendS(Pdescstr, seqToken);
11751 }
11752
11753 }
11754 else
11755 {
11756 if(ajStrGetLen(*Pdescstr))
11757 ajStrAppendK(Pdescstr, ' ');
11758
11759 ajStrAppendS(Pdescstr, seqToken);
11760 }
11761 }
11762 }
11763 break;
11764
11765 /* needs a little work for wrapped lines - save token and
11766 ** append rather than set at the current level
11767 */
11768
11769 case SWISS_GN:
11770 if(!seqgene)
11771 {
11772 isnewgene = ajTrue;
11773 seqgene = ajSeqgeneNew();
11774 }
11775
11776 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
11777 ajStrTokenStep(seqHandle); /* 'GN' */
11778 ajStrTokenNextParseC(seqHandle, ";=\n\r", &seqToken);
11779
11780 if(ajStrMatchC(seqToken, "and")) /* test 'and' between genes */
11781 {
11782 isnewgene = ajTrue;
11783 seqgene = ajSeqgeneNew();
11784 }
11785 else
11786 {
11787 while(ajStrGetLen(seqToken))
11788 {
11789 isgenetoken = ajTrue;
11790 ajStrTrimWhite(&seqToken);
11791
11792 if(ajStrMatchC(seqToken, "Name"))
11793 {
11794 ajStrTokenNextParseC(seqHandle, ";\n\r",
11795 &seqToken2);
11796 ajSeqgeneSetName(seqgene, seqToken2);
11797 }
11798 else if (ajStrMatchC(seqToken, "Synonyms"))
11799 {
11800 ajStrTokenNextParseC(seqHandle, ";\n\r",
11801 &seqToken2);
11802 ajSeqgeneSetSynonyms(seqgene, seqToken2);
11803 }
11804 else if (ajStrMatchC(seqToken, "OrderedLocusNames"))
11805 {
11806 ajStrTokenNextParseC(seqHandle, ";\n\r",
11807 &seqToken2);
11808 ajSeqgeneSetOln(seqgene, seqToken2);
11809 }
11810 else if (ajStrMatchC(seqToken, "ORFNames"))
11811 {
11812 ajStrTokenNextParseC(seqHandle, ";\n\r",
11813 &seqToken2);
11814 ajSeqgeneSetOrf(seqgene, seqToken2);
11815 }
11816 else
11817 {
11818 isgenetoken = ajFalse;
11819 ajDebug("Swissnew GN line unexpected '%S' (%S)",
11820 seqToken, genetoken);
11821
11822 if(ajStrMatchC(genetoken, "Name"))
11823 ajSeqgeneAppendName(seqgene, seqToken);
11824 else if (ajStrMatchC(genetoken, "Synonyms"))
11825 ajSeqgeneAppendSynonyms(seqgene, seqToken);
11826 else if (ajStrMatchC(genetoken,
11827 "OrderedLocusNames"))
11828 ajSeqgeneAppendOln(seqgene, seqToken);
11829 else if (ajStrMatchC(genetoken, "ORFNames"))
11830 ajSeqgeneAppendOrf(seqgene, seqToken);
11831 }
11832
11833 ajStrTokenNextParseC(seqHandle, "=;\n\r", &seqToken);
11834
11835 if(isgenetoken)
11836 ajStrAssignS(&genetoken, seqToken);
11837 }
11838
11839 if(isnewgene)
11840 {
11841 isnewgene = ajFalse;
11842 ajSeqAddGene(thys, seqgene);
11843 }
11844 /* keep seqgene so we can add to it if the line wraps */
11845 }
11846 break;
11847
11848 case SWISS_PE:
11849 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11850 ajStrTokenStep(seqHandle); /* PE */
11851 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
11852
11853 if(ajStrGetLen(seqToken))
11854 ajStrAssignS(&thys->Evidence, seqToken);
11855 break;
11856
11857 case SWISS_KW:
11858 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11859 ajStrTokenStep(seqHandle); /* 'KW' */
11860
11861 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
11862 {
11863 liststr = ajStrNewS(seqToken);
11864 ajStrTrimWhite(&liststr);
11865 ajSeqAddKey(thys, liststr);
11866 liststr = NULL;
11867 }
11868 break;
11869
11870 case SWISS_OS:
11871 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11872 ajStrTokenStep(seqHandle); /* 'OS' */
11873
11874 while(ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken))
11875 {
11876 if(ajStrGetLen(taxstr))
11877 ajStrAppendK(&taxstr, ' ');
11878
11879 ajStrAppendS(&taxstr, seqToken);
11880 }
11881 break;
11882
11883 case SWISS_OC:
11884 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11885 ajStrTokenStep(seqHandle); /* 'OC' */
11886
11887 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
11888 {
11889 ajStrTrimWhite(&seqToken);
11890 seqTaxSave(thys, seqToken, 0);
11891 }
11892 break;
11893
11894 case SWISS_OG:
11895 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11896 ajStrTokenStep(seqHandle); /* 'OG' */
11897 ajStrTokenNextParse(seqHandle, &seqToken2);
11898
11899 while(ajStrTokenNextParse(seqHandle, &seqToken))
11900 {
11901 ajStrAppendK(&seqToken2, ' ');
11902 ajStrAppendS(&seqToken2, seqToken);
11903 }
11904
11905 if(ajStrGetCharLast(seqToken2) == '.')
11906 ajStrCutEnd(&seqToken2, 1);
11907
11908 seqTaxSave(thys, seqToken2, 2);
11909 break;
11910
11911 case SWISS_OH:
11912 ajStrTokenAssignC(&seqHandle, seqReadLine, " =;\n\r");
11913 ajStrTokenStep(seqHandle); /* 'OH' */
11914 ajStrTokenNextParse(seqHandle, &seqToken);
11915
11916 if(ajStrMatchC(seqToken, "NCBI_TaxID"))
11917 {
11918 ajStrTokenNextParse(seqHandle, &seqToken2);
11919 seqTaxidSaveS(thys, seqToken2);
11920 xref = ajSeqxrefNewDbC(seqToken2, "taxon", XREF_TAX);
11921 ajSeqAddXref(thys, xref);
11922 xref = NULL;
11923 }
11924 break;
11925
11926 case SWISS_OX:
11927 ajStrTokenAssignC(&seqHandle, seqReadLine, " =;\n\r");
11928 ajStrTokenStep(seqHandle); /* 'OX' */
11929 ajStrTokenNextParse(seqHandle, &seqToken);
11930
11931 if(ajStrMatchC(seqToken, "NCBI_TaxID"))
11932 {
11933 ajStrTokenNextParse(seqHandle, &seqToken2);
11934 seqTaxidSaveS(thys, seqToken2);
11935 xref = ajSeqxrefNewDbC(seqToken2, "taxon", XREF_TAX);
11936 ajSeqAddXref(thys, xref);
11937 xref = NULL;
11938 }
11939 break;
11940
11941 case SWISS_CC:
11942 ajStrAssignSubS(&seqToken, seqReadLine, 5, -1);
11943
11944 if(ajStrGetLen(cmtstr))
11945 {
11946 ajStrAppendC(&cmtstr, "\n");
11947
11948 if(ajStrPrefixC(seqToken, "-!- ") ||
11949 (ajStrPrefixC(seqToken, "--------") &&
11950 ajStrPrefixC(cmtstr, "-!- ")))
11951 {
11952 ajSeqAddCmt(thys, cmtstr);
11953 cmtstr = NULL;
11954 }
11955 }
11956
11957 ajStrAppendS(&cmtstr, seqToken);
11958 break;
11959
11960 case SWISS_DR:
11961 AJNEW0(xref);
11962 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
11963 ajStrTokenStep(seqHandle); /* 'DR' */
11964 ajStrTokenNextParseC(seqHandle, ";\n\r",
11965 &xref->Db); /* dbname */
11966 ajStrTrimWhite(&xref->Db);
11967 ajStrTokenNextParse(seqHandle, &xref->Id); /* primary */
11968 ajStrTrimWhite(&xref->Id);
11969 ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
11970
11971 if(!ajStrGetLen(seqToken))
11972 {
11973 if(ajStrGetCharLast(xref->Id) == '.')
11974 ajStrCutEnd(&xref->Id, 1);
11975 }
11976 else
11977 {
11978 if(ajStrGetCharLast(seqToken) == '.')
11979 ajStrCutEnd(&seqToken, 1);
11980 ajStrTrimWhite(&seqToken);
11981 ajStrAssignS(&xref->Secid, seqToken);
11982
11983 ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
11984
11985 if(!ajStrGetLen(seqToken))
11986 {
11987 if(ajStrGetCharLast(xref->Secid) == '.')
11988 ajStrCutEnd(&xref->Secid, 1);
11989 }
11990 else
11991 {
11992 if(ajStrGetCharLast(seqToken) == '.')
11993 ajStrCutEnd(&seqToken, 1);
11994 ajStrTrimWhite(&seqToken);
11995 ajStrAssignS(&xref->Terid, seqToken);
11996
11997 ajStrTokenNextParse(seqHandle, &seqToken);/* secondary*/
11998
11999 if(!ajStrGetLen(seqToken))
12000 {
12001 if(ajStrGetCharLast(xref->Terid) == '.')
12002 ajStrCutEnd(&xref->Terid, 1);
12003 }
12004 else
12005 {
12006 if(ajStrGetCharLast(seqToken) == '.')
12007 ajStrCutEnd(&seqToken, 1);
12008 ajStrTrimWhite(&seqToken);
12009 ajStrAssignS(&xref->Quatid, seqToken);
12010 }
12011 }
12012 }
12013 xref->Type = XREF_DR;
12014 ajSeqAddXref(thys, xref);
12015 xref = NULL;
12016 break;
12017
12018 case SWISS_RN:
12019 if(seqref)
12020 {
12021 ajSeqrefStandard(seqref);
12022 ajSeqAddRef(thys, seqref);
12023 seqref = NULL;
12024 }
12025
12026 seqref = ajSeqrefNew();
12027 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12028 ajStrTokenStep(seqHandle); /* 'RN' */
12029 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* [num] */
12030 ajStrAssignSubS(&seqToken2, seqToken, 1, -2);
12031 ajStrToUint(seqToken2, &refnum);
12032 ajSeqrefSetnumNumber(seqref, refnum);
12033 break;
12034
12035 case SWISS_RG:
12036 if(!seqref)
12037 seqref = ajSeqrefNew();
12038
12039 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12040 ajStrTokenStep(seqHandle); /* 'RG' */
12041 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* groupname */
12042 ajSeqrefAppendGroupname(seqref, seqToken);
12043 break;
12044
12045 case SWISS_RX:
12046 if(!seqref)
12047 seqref = ajSeqrefNew();
12048
12049 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12050 ajStrTokenStep(seqHandle); /* 'RX' */
12051 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* xref */
12052 ajSeqrefAppendXref(seqref, seqToken);
12053 break;
12054
12055 case SWISS_RP:
12056 if(!seqref)
12057 seqref = ajSeqrefNew();
12058
12059 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12060 ajStrTokenStep(seqHandle); /* 'RP' */
12061 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* position */
12062 ajSeqrefAppendPosition(seqref, seqToken);
12063 break;
12064
12065 case SWISS_RA:
12066 if(!seqref)
12067 seqref = ajSeqrefNew();
12068
12069 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12070 ajStrTokenStep(seqHandle); /* 'RA' */
12071 ajStrTokenNextParseC(seqHandle, "\n\r;", &seqToken); /* authors */
12072 ajSeqrefAppendAuthors(seqref, seqToken);
12073 break;
12074
12075 case SWISS_RT:
12076 if(!seqref)
12077 seqref = ajSeqrefNew();
12078
12079 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12080 ajStrTokenStep(seqHandle); /* 'RT' */
12081 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
12082
12083 if(!ajStrMatchC(seqToken, ";"))
12084 ajSeqrefAppendTitle(seqref, seqToken);
12085 break;
12086
12087 case SWISS_RL:
12088 if(!seqref)
12089 seqref = ajSeqrefNew();
12090
12091 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12092 ajStrTokenStep(seqHandle); /* 'RL' */
12093 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* location */
12094 ajSeqrefAppendLocation(seqref, seqToken);
12095 break;
12096
12097 case SWISS_RC:
12098 if(!seqref)
12099 seqref = ajSeqrefNew();
12100
12101 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12102 ajStrTokenStep(seqHandle); /* 'RC' */
12103 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
12104 ajSeqrefAppendComment(seqref, seqToken);
12105 break;
12106
12107 case SWISS_DT:
12108 if(!thys->Date)
12109 thys->Date = ajSeqdateNew();
12110
12111 ajStrTokenAssignC(&seqHandle, seqReadLine, " (),.\n\r");
12112 icount = 0;
12113
12114 while(ajStrTokenNextParse(seqHandle, &seqToken))
12115 {
12116 icount++;
12117
12118 if(icount==2)
12119 ajStrAssignS(&datestr, seqToken);
12120 else if(icount == 3)
12121 ajStrAssignS(&datetype, seqToken);
12122 else if(icount == 5)
12123 ajStrAssignS(&relstr, seqToken);
12124 }
12125
12126 if(ajStrMatchC(datetype, "integrated"))
12127 {
12128 ajSeqdateSetCreateS(thys->Date, datestr);
12129 ajStrAssignS(&thys->Date->CreVer, relstr);
12130 }
12131 else if (ajStrMatchC(datetype, "sequence"))
12132 {
12133 ajSeqdateSetModseqS(thys->Date, datestr);
12134 ajStrAssignS(&thys->Date->SeqVer, relstr);
12135 }
12136 else if (ajStrMatchC(datetype, "entry"))
12137 {
12138 ajSeqdateSetModifyS(thys->Date, datestr);
12139 ajStrAssignS(&thys->Date->ModVer, relstr);
12140 }
12141 else
12142 {
12143 ajDebug("unknown datetype '%S' '%S'",
12144 datetype, seqReadLine);
12145 }
12146 break;
12147
12148 case SWISS_UNK:
12149 case SWISS_END:
12150 case SWISS_MORE:
12151 case SWISS_XX:
12152 case SWISS_SV:
12153 case SWISS_MAX:
12154 ajWarn("Unknown swissprot line type '%2.2S'", seqReadLine);
12155 break;
12156
12157 default:
12158 break;
12159 }
12160
12161 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
12162 lineprefix = seqPrefixSwiss(seqReadLine);
12163 }
12164
12165 if(MAJSTRGETLEN(taxstr))
12166 {
12167 ajStrTrimWhite(&taxstr);
12168
12169 if(MAJSTRGETCHARLAST(taxstr) == '.')
12170 ajStrCutEnd(&taxstr, 1);
12171
12172 ajStrTokenAssignC(&seqHandle, taxstr, "()");
12173 itaxtype=1;
12174
12175 while(ajStrTokenNextParse(seqHandle, &seqToken))
12176 {
12177 ajStrTrimWhite(&seqToken);
12178 seqTaxSave(thys, seqToken, itaxtype);
12179 itaxtype = 3;
12180 }
12181 }
12182
12183 if(seqref) /* clean up the last reference */
12184 {
12185 ajSeqrefStandard(seqref);
12186 ajSeqAddRef(thys, seqref);
12187 seqref = NULL;
12188 }
12189
12190 if(MAJSTRGETLEN(cmtstr))
12191 {
12192 ajSeqAddCmt(thys, cmtstr);
12193 cmtstr = NULL;
12194 }
12195
12196 if(dofeat)
12197 {
12198 ajDebug("EMBL FEAT TabIn %x\n", seqin->Ftquery);
12199 ajFeattableDel(&thys->Fttable);
12200 thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
12201 /* ajFeattableTrace(thys->Fttable); */
12202 ajFeattabinClear(seqin->Ftquery);
12203 }
12204
12205 if(MAJSTRGETLEN(seqin->Inseq))
12206 {
12207 /* we have a sequence to use */
12208 ajStrAssignS(&thys->Seq, seqin->Inseq);
12209
12210 if(seqin->Input->Text)
12211 {
12212 seqTextSeq(&thys->TextPtr, seqin->Inseq);
12213 ajFmtPrintAppS(&thys->TextPtr, "//\n");
12214 }
12215 }
12216 else
12217 {
12218 /* read the sequence and terminator */
12219 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
12220 lineprefix = seqPrefixSwiss(seqReadLine);
12221
12222 while(ok && lineprefix != SWISS_END)
12223 {
12224 seqAppend(&thys->Seq, seqReadLine);
12225 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
12226 lineprefix = seqPrefixSwiss(seqReadLine);
12227 }
12228 }
12229
12230 if(thys->Fttable)
12231 ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
12232
12233 if(!MAJSTRGETLEN(thys->Desc) && thys->Fulldesc)
12234 {
12235 ajStrAssignS(&thys->Desc, thys->Fulldesc->Name);
12236
12237 iter = ajListIterNewread(thys->Fulldesc->Short);
12238
12239 while((tmpstr = (const AjPStr) ajListIterGet(iter)))
12240 {
12241 if(MAJSTRGETLEN(tmpstr))
12242 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12243 }
12244
12245 ajListIterDel(&iter);
12246
12247 iter = ajListIterNewread(thys->Fulldesc->EC);
12248
12249 while((tmpstr = (const AjPStr) ajListIterGet(iter)))
12250 {
12251 if(MAJSTRGETLEN(tmpstr))
12252 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12253 }
12254
12255 ajListIterDel(&iter);
12256
12257 iter = ajListIterNewread(thys->Fulldesc->AltNames);
12258
12259 while((subdesc = (AjPSeqSubdesc) ajListIterGet(iter)))
12260 {
12261 if(MAJSTRGETLEN(subdesc->Name))
12262 {
12263 ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12264 }
12265
12266 itb = ajListIterNewread(subdesc->Inn);
12267 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12268 {
12269 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12270 }
12271
12272 ajListIterDel(&itb);
12273
12274 itb = ajListIterNewread(subdesc->Short);
12275
12276 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12277 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12278
12279 ajListIterDel(&itb);
12280
12281 itb = ajListIterNewread(subdesc->EC);
12282
12283 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12284 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12285 ajListIterDel(&itb);
12286
12287 itb = ajListIterNewread(subdesc->Allergen);
12288
12289 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12290 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12291 ajListIterDel(&itb);
12292
12293 itb = ajListIterNewread(subdesc->Biotech);
12294
12295 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12296 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12297 ajListIterDel(&itb);
12298
12299 itb = ajListIterNewread(subdesc->Cdantigen);
12300
12301 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12302 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12303
12304 ajListIterDel(&itb);
12305 }
12306
12307 ajListIterDel(&iter);
12308
12309 iter = ajListIterNewread(thys->Fulldesc->SubNames);
12310
12311 while((subdesc = (AjPSeqSubdesc) ajListIterGet(iter)))
12312 {
12313 ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12314
12315 itb = ajListIterNewread(subdesc->Short);
12316
12317 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12318 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12319
12320 ajListIterDel(&itb);
12321
12322 itb = ajListIterNewread(subdesc->EC);
12323
12324 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12325 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12326
12327 ajListIterDel(&itb);
12328 }
12329
12330 ajListIterDel(&iter);
12331
12332 iter = ajListIterNewread(thys->Fulldesc->Includes);
12333
12334 while((desctop = (AjPSeqDesc) ajListIterGet(iter)))
12335 {
12336 ajFmtPrintAppS(&thys->Desc, " (%S)", desctop->Name);
12337 itb = ajListIterNewread(desctop->Short);
12338
12339 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12340 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12341
12342 ajListIterDel(&itb);
12343
12344 itb = ajListIterNewread(desctop->EC);
12345
12346 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12347 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12348
12349 ajListIterDel(&itb);
12350
12351 itb = ajListIterNewread(desctop->AltNames);
12352
12353 while((subdesc = (AjPSeqSubdesc) ajListIterGet(itb)))
12354 {
12355 if(ajStrGetLen(subdesc->Name))
12356 ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12357
12358 itc = ajListIterNewread(subdesc->Inn);
12359
12360 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12361 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12362
12363 ajListIterDel(&itc);
12364
12365 itc = ajListIterNewread(subdesc->Short);
12366
12367 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12368 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12369
12370 ajListIterDel(&itc);
12371
12372 itc = ajListIterNewread(subdesc->EC);
12373
12374 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12375 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12376
12377 ajListIterDel(&itc);
12378
12379 itc = ajListIterNewread(subdesc->Allergen);
12380
12381 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12382 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12383
12384 ajListIterDel(&itc);
12385
12386 itc = ajListIterNewread(subdesc->Biotech);
12387
12388 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12389 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12390
12391 ajListIterDel(&itc);
12392
12393 itc = ajListIterNewread(subdesc->Cdantigen);
12394
12395 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12396 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12397
12398 ajListIterDel(&itc);
12399 }
12400
12401 ajListIterDel(&itb);
12402
12403 itb = ajListIterNewread(desctop->SubNames);
12404
12405 while((subdesc = (AjPSeqSubdesc) ajListIterGet(itb)))
12406 {
12407 ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12408
12409 itc = ajListIterNewread(subdesc->Short);
12410
12411 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12412 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12413
12414 ajListIterDel(&itc);
12415
12416 itc = ajListIterNewread(subdesc->EC);
12417
12418 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12419 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12420
12421 ajListIterDel(&itc);
12422
12423 }
12424
12425 ajListIterDel(&itb);
12426
12427 }
12428
12429 ajListIterDel(&iter);
12430
12431 iter = ajListIterNewread(thys->Fulldesc->Contains);
12432
12433 while((desctop = (AjPSeqDesc) ajListIterGet(iter)))
12434 {
12435 ajFmtPrintAppS(&thys->Desc, " (%S)", desctop->Name);
12436
12437 itb = ajListIterNewread(desctop->Short);
12438
12439 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12440 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12441
12442 ajListIterDel(&itb);
12443
12444 itb = ajListIterNewread(desctop->EC);
12445
12446 while((tmpstr = (AjPStr) ajListIterGet(itb)))
12447 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12448
12449 ajListIterDel(&itb);
12450
12451 itb = ajListIterNewread(desctop->AltNames);
12452
12453 while((subdesc = (AjPSeqSubdesc) ajListIterGet(itb)))
12454 {
12455 if(ajStrGetLen(subdesc->Name))
12456 ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12457
12458 itc = ajListIterNewread(subdesc->Inn);
12459
12460 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12461 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12462
12463 ajListIterDel(&itc);
12464
12465 itc = ajListIterNewread(subdesc->Short);
12466
12467 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12468 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12469
12470 ajListIterDel(&itc);
12471
12472 itc = ajListIterNewread(subdesc->EC);
12473
12474 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12475 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12476
12477 ajListIterDel(&itc);
12478
12479 itc = ajListIterNewread(subdesc->Allergen);
12480
12481 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12482 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12483
12484 ajListIterDel(&itc);
12485
12486 itc = ajListIterNewread(subdesc->Biotech);
12487
12488 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12489 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12490
12491 ajListIterDel(&itc);
12492
12493 itc = ajListIterNewread(subdesc->Cdantigen);
12494
12495 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12496 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12497
12498 ajListIterDel(&itc);
12499
12500 }
12501
12502 ajListIterDel(&itb);
12503
12504 itb = ajListIterNewread(desctop->SubNames);
12505
12506 while((subdesc = (AjPSeqSubdesc) ajListIterGet(itb)))
12507 {
12508 ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12509 itc = ajListIterNewread(subdesc->Short);
12510
12511 itc = ajListIterNewread(subdesc->Cdantigen);
12512
12513 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12514 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12515
12516 ajListIterDel(&itc);
12517
12518 itc = ajListIterNewread(subdesc->EC);
12519
12520 itc = ajListIterNewread(subdesc->Cdantigen);
12521
12522 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12523 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12524
12525 ajListIterDel(&itc);
12526
12527 }
12528
12529 ajListIterDel(&itb);
12530 }
12531
12532 ajListIterDel(&iter);
12533
12534 if(thys->Fulldesc->Fragments || thys->Fulldesc->Precursor)
12535 {
12536 if(thys->Fulldesc->Fragments == 1)
12537 ajFmtPrintAppS(&thys->Desc, " (Fragment)");
12538
12539 if(thys->Fulldesc->Fragments == 2)
12540 ajFmtPrintAppS(&thys->Desc, " (Fragments)");
12541
12542 if(thys->Fulldesc->Precursor)
12543 ajFmtPrintAppS(&thys->Desc, " (Precursor)");
12544 }
12545 if(MAJSTRGETCHARFIRST(thys->Desc) == ' ')
12546 ajStrCutStart(&thys->Desc, 1);
12547
12548 tmpstr = NULL;
12549 }
12550
12551 ajSeqSetProt(thys);
12552
12553 if(thys->Reflist)
12554 ajSeqreflistGetXrefs(thys->Reflist, &thys->Xreflist);
12555
12556 ajFilebuffClear(buff, 0);
12557
12558 ajStrDel(&datestr);
12559 ajStrDel(&datetype);
12560 ajStrDel(&relstr);
12561 ajStrDel(&taxstr);
12562 ajStrDel(&genetoken);
12563
12564 ajStrDelStatic(&seqToken);
12565 ajStrDelStatic(&seqToken2);
12566 ajStrTokenReset(seqHandle);
12567
12568 return ajTrue;
12569 }
12570
12571
12572
12573
12574 /* @funcstatic seqReadEmbl ****************************************************
12575 **
12576 ** Given data in a sequence structure, tries to read everything needed
12577 ** using EMBL format.
12578 **
12579 ** @param [w] thys [AjPSeq] Sequence object
12580 ** @param [u] seqin [AjPSeqin] Sequence input object
12581 ** @return [AjBool] ajTrue on success
12582 **
12583 ** @release 1.0.0
12584 ** @@
12585 ******************************************************************************/
12586
seqReadEmbl(AjPSeq thys,AjPSeqin seqin)12587 static AjBool seqReadEmbl(AjPSeq thys, AjPSeqin seqin)
12588 {
12589 AjBool ok;
12590 /* AjBool okdate; */
12591 AjPFilebuff buff;
12592 AjBool dofeat = ajFalse;
12593 AjBool tryfeat = ajFalse;
12594 AjPStr liststr; /* for lists, do not delete */
12595 AjPStr datestr = NULL;
12596 AjPStr relstr = NULL;
12597 AjPStr cmtstr = NULL; /* stored in AjPSeq - do not delete */
12598 ajuint icount;
12599 AjPSeqRef seqref = NULL;
12600 AjPSeqXref xref = NULL;
12601 ajuint refnum;
12602 ajuint seqlen=1024;
12603 ajuint tmplen;
12604 ajuint itmp;
12605 ajuint i;
12606 ajuint taxid = 0;
12607 ajuint itaxtype = 0;
12608 SeqEPrefixSwiss lineprefix = SWISS_UNK;
12609 AjPStrTok handle = NULL;
12610 AjPSeqin conseqin = NULL;
12611 AjPSeq conseq = NULL;
12612 AjPStr conqry = NULL;
12613 AjPStr condb = NULL;
12614 AjPStr confield = NULL;
12615 AjPStr constr = NULL;
12616 AjPStr numstr = NULL;
12617 AjPStr token = NULL;
12618 ajuint gaplen = 0;
12619 ajuint start = 0;
12620 ajuint end = 0;
12621 ajint dotpos;
12622 ajint colonpos;
12623 ajint istat = 0;
12624 AjBool conrev = ajFalse;
12625
12626 buff = seqin->Input->Filebuff;
12627
12628 if(!seqFtFmtEmbl)
12629 ajStrAssignC(&seqFtFmtEmbl, "embl");
12630
12631 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
12632 return ajFalse;
12633 lineprefix = seqPrefixSwiss(seqReadLine);
12634
12635 /* for GCG formatted databases */
12636
12637 while(lineprefix == SWISS_WP)
12638 {
12639 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
12640 return ajFalse;
12641
12642 lineprefix = seqPrefixSwiss(seqReadLine);
12643 }
12644
12645 /* extra blank lines */
12646
12647 while(ajStrIsWhite(seqReadLine))
12648 {
12649 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
12650 return ajFalse;
12651
12652 lineprefix = seqPrefixSwiss(seqReadLine);
12653 }
12654
12655 ajDebug("seqReadEmbl first line '%S'\n", seqReadLine);
12656
12657 if(lineprefix != SWISS_ID)
12658 {
12659 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
12660
12661 return ajFalse;
12662 }
12663
12664 if(seqin->Input->Text)
12665 ajStrAssignS(&thys->TextPtr, seqReadLine);
12666
12667 ajDebug("seqReadEmbl ID line found\n");
12668 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\t\n\r");
12669 ajStrTokenStep(seqHandle); /* 'ID' */
12670 ajStrTokenNextParse(seqHandle, &seqToken); /* entry name */
12671
12672 seqSetName(thys, seqToken);
12673
12674 ajStrTokenNextParse(seqHandle, &seqToken); /* SV for new syntax */
12675
12676 if(ajStrMatchC(seqToken, "SV")) /* new post-2006 EMBL line */
12677 {
12678 ajStrTokenNextParse(seqHandle, &seqToken); /* SV */
12679 ajStrInsertK(&seqToken, 0, '.');
12680 ajStrInsertS(&seqToken, 0, thys->Name);
12681 seqSvSave(thys, seqToken);
12682
12683 ajStrTokenNextParse(seqHandle, &seqToken); /* linear or circular */
12684
12685 if(ajStrMatchC(seqToken, "circular"))
12686 thys->Circular = ajTrue;
12687
12688 ajStrTokenNextParseC(seqHandle, ";\t\n\r", &seqToken);
12689 ajStrTrimWhite(&seqToken);
12690 ajSeqmolSetEmbl(&thys->Molecule, seqToken);
12691
12692 ajStrTokenNextParse(seqHandle, &seqToken);
12693 ajStrTrimWhite(&seqToken);
12694 ajStrAssignS(&thys->Class, seqToken);
12695
12696 ajStrTokenNextParse(seqHandle, &seqToken);
12697 ajStrTrimWhite(&seqToken);
12698 ajStrAssignS(&thys->Division, seqToken);
12699
12700 ajStrTokenNextParse(seqHandle, &seqToken);
12701 ajStrTrimEndC(&seqToken, "BP.");
12702 ajStrTrimWhite(&seqToken);
12703 ajStrToUint(seqToken, &seqlen);
12704 }
12705 else /* test for a SwissProt/SpTrEMBL entry */
12706 {
12707 if(ajStrFindC(seqReadLine, " PRT; ")>= 0 ||
12708 ajStrFindC(seqReadLine, " Unreviewed; ") >= 0 ||
12709 ajStrFindC(seqReadLine, " Reviewed; ") >= 0 ||
12710 ajStrFindC(seqReadLine, " Preliminary; ") >= 0
12711 )
12712 {
12713 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
12714 ajStrTokenReset(seqHandle);
12715 ajStrDelStatic(&seqToken);
12716
12717 return ajFalse;
12718 }
12719 }
12720
12721 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
12722 lineprefix = seqPrefixSwiss(seqReadLine);
12723
12724 dofeat = ajFalse;
12725 tryfeat = seqinUfoLocal(seqin);
12726
12727 while(ok &&
12728 lineprefix != SWISS_SQ &&
12729 lineprefix != SWISS_END)
12730 {
12731 /* check for Staden Experiment format instead */
12732 if(lineprefix == SWISS_EX)
12733 {
12734 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
12735 ajStrDelStatic(&seqToken);
12736
12737 return ajFalse;;
12738 }
12739
12740 else if(lineprefix == SWISS_FH)
12741 ok = ajTrue; /* ignore these lines */
12742
12743 else if(lineprefix == SWISS_AC) /* emblcds database format */
12744 {
12745 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
12746 ajStrTokenStep(seqHandle); /* 'AC' */
12747
12748 while(ajStrTokenNextParse(seqHandle, &seqToken))
12749 seqAccSave(thys, seqToken);
12750 }
12751
12752 else if(lineprefix==SWISS_SV)
12753 {
12754 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12755 ajStrTokenStep(seqHandle); /* 'SV' */
12756 ajStrTokenNextParse(seqHandle, &seqToken); /* version */
12757 seqSvSave(thys, seqToken);
12758 }
12759
12760 else if(lineprefix == SWISS_DE)
12761 {
12762 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12763 ajStrTokenStep(seqHandle); /* 'DE' */
12764 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* desc */
12765
12766 if(ajStrGetLen(thys->Desc))
12767 {
12768 ajStrAppendC(&thys->Desc, " ");
12769 ajStrAppendS(&thys->Desc, seqToken);
12770 }
12771 else
12772 ajStrAssignS(&thys->Desc, seqToken);
12773 }
12774
12775 else if(lineprefix == SWISS_KW)
12776 {
12777 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12778 ajStrTokenStep(seqHandle); /* 'KW' */
12779
12780 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
12781 {
12782 liststr = ajStrNewS(seqToken);
12783 ajStrTrimWhite(&liststr);
12784 ajSeqAddKey(thys, liststr);
12785 liststr = NULL;
12786 }
12787 }
12788
12789 else if(lineprefix == SWISS_OS)
12790 {
12791 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12792 ajStrTokenStep(seqHandle); /* 'OS' */
12793
12794 /* maybe better remove . from this, and trim from end */
12795 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
12796 {
12797 ajStrTrimWhite(&seqToken);
12798 ajStrTokenAssignC(&seqHandle2, seqToken, "()");
12799 itaxtype=1;
12800
12801 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
12802 {
12803 ajStrTrimWhite(&seqToken2);
12804 seqTaxSave(thys, seqToken2, itaxtype);
12805 itaxtype = 3;
12806 }
12807 }
12808 }
12809
12810 else if(lineprefix == SWISS_OC)
12811 {
12812 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12813 ajStrTokenStep(seqHandle); /* 'OC' */
12814
12815 /* maybe better remove . from this, and trim from end */
12816 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
12817 {
12818 ajStrTrimWhite(&seqToken);
12819 seqTaxSave(thys, seqToken, 0);
12820 }
12821 }
12822
12823 else if(lineprefix == SWISS_OG)
12824 {
12825 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12826 ajStrTokenStep(seqHandle); /* 'OG' */
12827
12828 /* maybe better remove . from this, and trim from end */
12829 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
12830 {
12831 ajStrTrimWhite(&seqToken);
12832 seqTaxSave(thys, seqToken, 2);
12833 }
12834 }
12835
12836 else if(lineprefix == SWISS_CC)
12837 {
12838 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12839 ajStrTokenStep(seqHandle); /* 'CC' */
12840 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
12841
12842 if(ajStrGetLen(cmtstr))
12843 ajStrAppendC(&cmtstr, "\n");
12844 ajStrAppendS(&cmtstr, seqToken);
12845
12846 /* trying to keep comments in one long string with embedded returns
12847 ** probably fails for long comments - and also fails for contact details
12848 ** which have very short comment lines
12849 ** switch to just keeping original lines */
12850
12851 /*
12852 if(ajStrGetLen(cmtstr))
12853 {
12854 if(ajStrGetLen(seqToken))
12855 {
12856 if(ajStrGetCharLast(cmtstr) != '\n')
12857 ajStrAppendK(&cmtstr, ' ');
12858 ajStrAppendS(&cmtstr, seqToken);
12859 }
12860 else
12861 {
12862 if(ajStrGetCharLast(cmtstr) != '\n')
12863 ajStrAppendK(&cmtstr, '\n');
12864 ajStrAppendC(&cmtstr, " \n");
12865 }
12866 }
12867 else
12868 ajStrAssignS(&cmtstr, seqToken);
12869 if(ajStrGetCharLast(seqToken) == '.')
12870 ajStrAppendK(&cmtstr, '\n');
12871 */
12872 }
12873
12874 else if(lineprefix == SWISS_DR)
12875 {
12876 AJNEW0(xref);
12877 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
12878 ajStrTokenStep(seqHandle); /* 'DR' */
12879
12880 ajStrTokenNextParseC(seqHandle, ";\n\r", &seqToken); /* dbname */
12881 ajStrTrimWhite(&seqToken);
12882 ajStrAssignS(&xref->Db, seqToken);
12883
12884 ajStrTokenNextParse(seqHandle, &seqToken); /* primary */
12885 ajStrTrimWhite(&seqToken);
12886 ajStrAssignS(&xref->Id, seqToken);
12887
12888 ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
12889
12890 if(!ajStrGetLen(seqToken))
12891 {
12892 if(ajStrGetCharLast(xref->Id) == '.')
12893 ajStrCutEnd(&xref->Id, 1);
12894 }
12895 else
12896 {
12897 if(ajStrGetCharLast(seqToken) == '.')
12898 ajStrCutEnd(&seqToken, 1);
12899 ajStrTrimWhite(&seqToken);
12900 ajStrAssignS(&xref->Secid, seqToken);
12901
12902 ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
12903
12904 if(!ajStrGetLen(seqToken))
12905 {
12906 if(ajStrGetCharLast(xref->Secid) == '.')
12907 ajStrCutEnd(&xref->Secid, 1);
12908 }
12909 else
12910 {
12911 if(ajStrGetCharLast(seqToken) == '.')
12912 ajStrCutEnd(&seqToken, 1);
12913 ajStrTrimWhite(&seqToken);
12914 ajStrAssignS(&xref->Terid, seqToken);
12915
12916 ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
12917
12918 if(!ajStrGetLen(seqToken))
12919 {
12920 if(ajStrGetCharLast(xref->Terid) == '.')
12921 ajStrCutEnd(&xref->Terid, 1);
12922 }
12923 else
12924 {
12925 if(ajStrGetCharLast(seqToken) == '.')
12926 ajStrCutEnd(&seqToken, 1);
12927 ajStrTrimWhite(&seqToken);
12928 ajStrAssignS(&xref->Quatid, seqToken);
12929 }
12930 }
12931 }
12932 xref->Type = XREF_DR;
12933 ajSeqAddXref(thys, xref);
12934 xref = NULL;
12935 }
12936
12937 else if(lineprefix == SWISS_RN)
12938 {
12939 if(seqref)
12940 {
12941 ajSeqrefStandard(seqref);
12942 ajSeqAddRef(thys, seqref);
12943 seqref = NULL;
12944 }
12945
12946 seqref = ajSeqrefNew();
12947 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12948 ajStrTokenStep(seqHandle); /* 'RN' */
12949 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* [num] */
12950 ajStrAssignSubS(&seqToken2, seqToken, 1, -2);
12951 ajStrToUint(seqToken2, &refnum);
12952 ajSeqrefSetnumNumber(seqref, refnum);
12953 }
12954
12955 else if(lineprefix == SWISS_RG)
12956 {
12957 if(!seqref)
12958 seqref = ajSeqrefNew();
12959
12960 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12961 ajStrTokenStep(seqHandle); /* 'RG' */
12962 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* groupname */
12963 ajSeqrefAppendGroupname(seqref, seqToken);
12964 }
12965
12966 else if(lineprefix == SWISS_RX)
12967 {
12968 if(!seqref)
12969 seqref = ajSeqrefNew();
12970
12971 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12972 ajStrTokenStep(seqHandle); /* 'RX' */
12973 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* xref */
12974 ajSeqrefAppendXref(seqref, seqToken);
12975 }
12976
12977 else if(lineprefix == SWISS_RP)
12978 {
12979 if(!seqref)
12980 seqref = ajSeqrefNew();
12981
12982 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12983 ajStrTokenStep(seqHandle); /* 'RP' */
12984 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* position */
12985 ajSeqrefAppendPosition(seqref, seqToken);
12986 }
12987
12988 else if(lineprefix == SWISS_RA)
12989 {
12990 if(!seqref)
12991 seqref = ajSeqrefNew();
12992
12993 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12994 ajStrTokenStep(seqHandle); /* 'RA' */
12995 ajStrTokenNextParseC(seqHandle, "\n\r;", &seqToken); /* authors */
12996 ajSeqrefAppendAuthors(seqref, seqToken);
12997 }
12998
12999 else if(lineprefix == SWISS_RT)
13000 {
13001 if(!seqref)
13002 seqref = ajSeqrefNew();
13003
13004 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13005 ajStrTokenStep(seqHandle); /* 'RT' */
13006 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
13007
13008 if(!ajStrMatchC(seqToken, ";"))
13009 ajSeqrefAppendTitle(seqref, seqToken);
13010 }
13011
13012 else if(lineprefix == SWISS_RL)
13013 {
13014 if(!seqref)
13015 seqref = ajSeqrefNew();
13016
13017 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13018 ajStrTokenStep(seqHandle); /* 'RL' */
13019 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* authors */
13020 ajSeqrefAppendLocation(seqref, seqToken);
13021 }
13022
13023 else if(lineprefix == SWISS_RC)
13024 {
13025 if(!seqref)
13026 seqref = ajSeqrefNew();
13027
13028 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13029 ajStrTokenStep(seqHandle); /* 'RC' */
13030 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
13031 ajSeqrefAppendComment(seqref, seqToken);
13032 }
13033
13034 else if(tryfeat && lineprefix == SWISS_FT)
13035 {
13036 if(!dofeat)
13037 {
13038 dofeat = ajTrue;
13039 ajFeattabinDel(&seqin->Ftquery);
13040 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtEmbl,
13041 thys->Name, "N");
13042 ajDebug("seqReadEmbl: seqin->Ftquery Filebuff %x\n",
13043 seqin->Ftquery->Input->Filebuff);
13044 }
13045
13046 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
13047 /* ajDebug("EMBL FEAT saved line:\n%S", seqReadLine); */
13048 }
13049
13050 else if(lineprefix == SWISS_DT)
13051 {
13052 if(!thys->Date)
13053 thys->Date = ajSeqdateNew();
13054
13055 ajStrTokenAssignC(&seqHandle, seqReadLine, " (),");
13056 icount = 0;
13057
13058 while(ajStrTokenNextParse(seqHandle, &seqToken))
13059 {
13060 icount++;
13061
13062 if(icount==2)
13063 ajStrAssignS(&datestr, seqToken);
13064 else if(icount==4)
13065 ajStrAssignS(&relstr, seqToken);
13066 else if(icount==5)
13067 {
13068 if(ajStrMatchC(
13069 seqToken, "Created"))
13070 {
13071 ajSeqdateSetCreateS(thys->Date, datestr);
13072 ajStrAssignS(&thys->Date->CreRel, relstr);
13073 }
13074 }
13075 else if(icount==8)
13076 {
13077 ajSeqdateSetModifyS(thys->Date, datestr);
13078 ajStrAssignS(&thys->Date->ModRel, relstr);
13079 ajStrAssignS(&thys->Date->ModVer, seqToken);
13080 }
13081 }
13082 }
13083
13084
13085 else if(lineprefix == SWISS_XX)
13086 {
13087 if(seqref)
13088 {
13089 ajSeqrefStandard(seqref);
13090 ajSeqAddRef(thys, seqref);
13091 seqref = NULL;
13092 }
13093
13094 if(ajStrGetLen(cmtstr))
13095 {
13096 ajSeqAddCmt(thys, cmtstr);
13097 cmtstr = NULL;
13098 }
13099
13100 }
13101
13102 else if(lineprefix == SWISS_CO)
13103 {
13104 if(!constr)
13105 constr = ajStrNewRes(4096);
13106
13107 ajStrTrimWhiteEnd(&seqReadLine);
13108 ajStrAppendSubS(&constr, seqReadLine, 5, -1);
13109 }
13110
13111 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13112 lineprefix = seqPrefixSwiss(seqReadLine);
13113 }
13114
13115 if(lineprefix == SWISS_END && ajStrGetLen(constr))
13116 {
13117 conseq = ajSeqNew();
13118
13119 if(ajStrPrefixC(constr, "join(") && ajStrSuffixC(constr, ")"))
13120 {
13121 ajStrCutEnd(&constr, 1);
13122 ajStrCutStart(&constr, 5);
13123 }
13124
13125 ajStrTokenAssignC(&handle, constr, ",");
13126
13127 while(ajStrTokenNextParse(handle, &token))
13128 {
13129 ajDebug("CO parsing token '%S'\n", token);
13130
13131 if(ajStrPrefixC(token, "gap("))
13132 {
13133 ajDebug("CO gap: '%S'\n", token);
13134 ajStrCutEnd(&token, 1);
13135 ajStrCutStart(&token, 4);
13136 if(ajStrToUint(token, &gaplen))
13137 {
13138 ajDebug("gap %u bases total %u\n",
13139 gaplen, ajSeqGetLen(thys));
13140 }
13141 else
13142 ajWarn("Unknown gap length in '%S'", constr);
13143
13144 ajStrAppendCountK(&thys->Seq, 'N', gaplen);
13145 }
13146 else
13147 {
13148 if(ajStrPrefixC(token, "complement("))
13149 {
13150 ajStrCutEnd(&token, 1);
13151 ajStrCutStart(&token, 11);
13152 conrev = ajTrue;
13153 }
13154
13155 if(!condb)
13156 {
13157 if(!ajNamDbGetAttrSpecialC(seqin->Input->Db, "ConDatabase",
13158 &condb))
13159 ajStrAssignS(&condb, seqin->Input->Db);
13160 if(!ajNamDbGetAttrSpecialC(seqin->Input->Db, "ConField",
13161 &confield))
13162 ajStrAssignC(&confield, "acc");
13163 }
13164
13165 dotpos = (ajint) ajStrFindAnyK(token, '.');
13166 colonpos = (ajint) ajStrFindAnyK(token, ':');
13167 ajStrAssignSubS(&numstr, token, colonpos+1, -1);
13168 istat = ajFmtScanS(numstr, "%u..%u", &start, &end);
13169 if(istat != 2)
13170 {
13171 ajWarn("EMBLCON badly formed fragment '%S'", token);
13172 start = 1;
13173 end = 0;
13174 }
13175
13176 if(ajStrMatchC(confield, "sv"))
13177 {
13178 ajFmtPrintS(&conqry, "%S-sv:%S", seqin->Input->Db, token);
13179 if(conrev)
13180 ajStrAppendC(&conqry, ":r");
13181 }
13182 else
13183 {
13184 if((dotpos > 0) && (dotpos < colonpos))
13185 {
13186 ajStrCutRange(&token, dotpos, colonpos-1);
13187 ajFmtPrintS(&conqry, "%S-%S:%S",
13188 condb, confield, token);
13189 if(conrev)
13190 ajStrAppendC(&conqry, ":r");
13191 }
13192 }
13193 ajDebug("CO done: '%S' '%S' rev:%B\n",
13194 token, conqry, conrev);
13195
13196 ajSeqinUsa(&conseqin, conqry);
13197
13198 if(!ajSeqRead(conseq, conseqin))
13199 ajErr("EMBLCON entry '%S' failed to read '%S'",
13200 thys->Name, conqry);
13201 else
13202 {
13203 ajSeqTrim(conseq);
13204 if(conrev)
13205 ajSeqReverseDo(conseq);
13206 seqAppend(&thys->Seq, ajSeqGetSeqS(conseq));
13207 ajDebug("Read %u bases total %u\n",
13208 ajSeqGetLen(conseq), ajSeqGetLen(thys));
13209 }
13210 }
13211 }
13212
13213 ajDebug("CO processed seqlen: %u\n", ajSeqGetLen(thys));
13214
13215 ajStrTokenDel(&handle);
13216
13217 while(ok && lineprefix != SWISS_END)
13218 {
13219 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13220 lineprefix = seqPrefixSwiss(seqReadLine);
13221 }
13222
13223 ajSeqinDel(&conseqin);
13224 ajSeqDel(&conseq);
13225
13226 }
13227
13228 if(ok && lineprefix == SWISS_SQ)
13229 {
13230 /* now we are on the SQ line - or there was nothing */
13231
13232 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13233 ajStrTokenStep(seqHandle); /* 'SQ' */
13234 ajStrTokenStep(seqHandle); /* 'Sequence' */
13235 ajStrTokenNextParse(seqHandle, &seqToken); /* len */
13236 ajStrToUint(seqToken, &tmplen);
13237
13238 if(tmplen > seqlen)
13239 seqlen = tmplen;
13240
13241 ajStrTokenStep(seqHandle); /* BP; */
13242 tmplen = 0;
13243
13244 for(i=0;i<4;i++)
13245 {
13246 ajStrTokenNextParse(seqHandle, &seqToken); /* count */
13247 ajStrToUint(seqToken, &itmp);
13248 ajStrTokenNextParse(seqHandle,
13249 &seqToken); /* 'A' 'C' 'G' 'T' 'other' */
13250 tmplen += itmp;
13251 }
13252
13253 if(tmplen > seqlen)
13254 seqlen = tmplen;
13255
13256 if(dofeat)
13257 {
13258 ajFeattableDel(&thys->Fttable);
13259 thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
13260 /* ajFeattableTrace(thys->Fttable); */
13261 ajDebug("EMBL FEAT SQ TabIn filebuff: %x features: %u\n",
13262 seqin->Ftquery->Input->Filebuff,
13263 ajFeattableGetSize(thys->Fttable));
13264 ajFeattabinClear(seqin->Ftquery);
13265 }
13266 else if(tryfeat) /* but no features in entry */
13267 {
13268 ajDebug("EMBL FEAT SQ empty filebuff: %x\n",
13269 seqin->Ftquery->Input->Filebuff);
13270 thys->Fttable = ajFeattableNewSeq(thys);
13271 }
13272
13273 if(ajStrGetLen(seqin->Inseq))
13274 {
13275 /* we have a sequence to use ...perhaps from GCG/NBRF format */
13276 ajStrAssignS(&thys->Seq, seqin->Inseq);
13277
13278 if(seqin->Input->Text)
13279 {
13280 seqTextSeq(&thys->TextPtr, seqin->Inseq);
13281 ajFmtPrintAppS(&thys->TextPtr, "//\n");
13282 }
13283 }
13284 else
13285 {
13286 /* read the sequence and terminator */
13287 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13288 lineprefix = seqPrefixSwiss(seqReadLine);
13289 ajStrSetRes(&thys->Seq, seqlen+1);
13290
13291 while(ok && lineprefix != SWISS_END)
13292 {
13293 seqAppend(&thys->Seq, seqReadLine);
13294 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13295 lineprefix = seqPrefixSwiss(seqReadLine);
13296 }
13297
13298 }
13299 }
13300
13301 if(!ajSeqIsNuc(thys))
13302 {
13303 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13304 ajStrDel(&datestr);
13305 ajStrDel(&relstr);
13306 ajStrDelStatic(&seqToken);
13307 ajStrTokenReset(seqHandle);
13308
13309 return ajFalse;
13310 }
13311
13312 ajSeqSetNuc(thys);
13313
13314 if(thys->Fttable)
13315 ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
13316
13317 if(ajFeattableGetSize(thys->Fttable))
13318 {
13319 ajFeattableGetXrefs(thys->Fttable, &thys->Xreflist, &taxid);
13320 if(taxid)
13321 seqTaxidSaveI(thys, taxid);
13322 }
13323
13324 ajSeqreflistGetXrefs(thys->Reflist, &thys->Xreflist);
13325
13326 if(!taxid)
13327 taxid = ajSeqGetTaxid(thys);
13328
13329 ajFilebuffClear(buff, 0);
13330
13331 ajStrDel(&datestr);
13332 ajStrDel(&relstr);
13333 ajStrDel(&condb);
13334 ajStrDel(&confield);
13335 ajStrDel(&constr);
13336 ajStrDel(&numstr);
13337 ajStrDel(&conqry);
13338 ajStrDel(&token);
13339
13340 ajStrDelStatic(&seqToken);
13341 ajStrDelStatic(&seqToken2);
13342
13343 ajStrTokenReset(seqHandle);
13344 ajStrTokenReset(seqHandle2);
13345
13346 /* ajSeqTrace(thys); */
13347
13348 return ajTrue;
13349 }
13350
13351
13352
13353
13354 /* @funcstatic seqReadExperiment **********************************************
13355 **
13356 ** Given data in a sequence structure, tries to read everything needed
13357 ** using Staden experiment format.
13358 **
13359 ** @param [w] thys [AjPSeq] Sequence object
13360 ** @param [u] seqin [AjPSeqin] Sequence input object
13361 ** @return [AjBool] ajTrue on success
13362 **
13363 ** @release 3.0.0
13364 ** @@
13365 ******************************************************************************/
13366
seqReadExperiment(AjPSeq thys,AjPSeqin seqin)13367 static AjBool seqReadExperiment(AjPSeq thys, AjPSeqin seqin)
13368 {
13369
13370 AjBool ok;
13371 AjPFilebuff buff;
13372 AjBool dofeat = ajFalse;
13373 AjBool tryfeat = ajFalse;
13374 AjPStr liststr; /* for lists, do not delete */
13375 AjPStr accvalstr = NULL;
13376 ajuint i;
13377 ajint ja;
13378 ajuint ilen;
13379 ajuint itaxtype;
13380 SeqEPrefixSwiss lineprefix = SWISS_UNK;
13381
13382 buff = seqin->Input->Filebuff;
13383
13384 if(!seqFtFmtEmbl)
13385 ajStrAssignC(&seqFtFmtEmbl, "embl");
13386
13387 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
13388 return ajFalse;
13389
13390 lineprefix = seqPrefixSwiss(seqReadLine);
13391
13392 ajDebug("seqReadExperiment first line '%S'\n", seqReadLine);
13393
13394 if(lineprefix != SWISS_ID)
13395 {
13396 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13397
13398 return ajFalse;
13399 }
13400
13401 if(seqin->Input->Text)
13402 ajStrAssignS(&thys->TextPtr, seqReadLine);
13403
13404 ajDebug("seqReadExperiment ID line found\n");
13405 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r\t");
13406 ajStrTokenStep(seqHandle); /* 'ID' */
13407 ajStrTokenNextParse(seqHandle, &seqToken); /* entry name */
13408
13409 seqSetName(thys, seqToken);
13410
13411 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13412 lineprefix = seqPrefixSwiss(seqReadLine);
13413
13414 while(ok && lineprefix != SWISS_SQ)
13415 {
13416 if(lineprefix == SWISS_EX)
13417 {
13418 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13419 ajStrTokenStep(seqHandle); /* 'EX'*/
13420 ajStrTokenNextParseC(seqHandle, "\n\r",
13421 &seqToken); /* expt. desc. */
13422
13423 if(ajStrGetLen(thys->Desc))
13424 {
13425 ajStrAppendC(&thys->Desc, " ");
13426 ajStrAppendS(&thys->Desc, seqToken);
13427 }
13428 else
13429 ajStrAssignS(&thys->Desc, seqToken);
13430 }
13431
13432 if(lineprefix == SWISS_AV)
13433 {
13434 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13435 ajStrTokenStep(seqHandle); /* 'AV' */
13436 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* desc */
13437
13438 if(ajStrGetLen(accvalstr))
13439 {
13440 ajStrAppendC(&accvalstr, " ");
13441 ajStrAppendS(&accvalstr, seqToken);
13442 }
13443 else
13444 ajStrAssignS(&accvalstr, seqToken);
13445 }
13446
13447 /* standard EMBL records are allowed */
13448
13449 if(lineprefix == SWISS_AC)
13450 {
13451 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
13452 ajStrTokenStep(seqHandle); /* 'AC' */
13453
13454 while(ajStrTokenNextParse(seqHandle, &seqToken))
13455 seqAccSave(thys, seqToken);
13456 }
13457
13458 if(lineprefix == SWISS_SV)
13459 {
13460 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13461 ajStrTokenStep(seqHandle); /* 'SV' */
13462 ajStrTokenNextParse(seqHandle, &seqToken); /* version */
13463 seqSvSave(thys, seqToken);
13464 }
13465
13466 if(lineprefix == SWISS_DE)
13467 {
13468 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13469 ajStrTokenStep(seqHandle); /* 'DE' */
13470 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* desc */
13471
13472 if(ajStrGetLen(thys->Desc))
13473 {
13474 ajStrAppendC(&thys->Desc, " ");
13475 ajStrAppendS(&thys->Desc, seqToken);
13476 }
13477 else
13478 ajStrAssignS(&thys->Desc, seqToken);
13479 }
13480
13481 if(lineprefix == SWISS_KW)
13482 {
13483 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13484 ajStrTokenStep(seqHandle); /* 'KW' */
13485
13486 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
13487 {
13488 liststr = ajStrNewS(seqToken);
13489 ajStrTrimWhite(&liststr);
13490 ajSeqAddKey(thys, liststr);
13491 liststr = NULL;
13492 }
13493 }
13494
13495 if(lineprefix == SWISS_OS)
13496 {
13497 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13498 ajStrTokenStep(seqHandle); /* 'OS' */
13499
13500 /* maybe better remove . from this, and trim from end */
13501 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
13502 {
13503 ajStrTrimWhite(&seqToken);
13504 ajStrTokenAssignC(&seqHandle2, seqToken, "()");
13505 itaxtype=1;
13506
13507 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
13508 {
13509 ajStrTrimWhite(&seqToken2);
13510 seqTaxSave(thys, seqToken2, itaxtype);
13511 itaxtype = 3;
13512 }
13513
13514 ajStrTokenReset(seqHandle2);
13515 }
13516 }
13517
13518 if(lineprefix == SWISS_OC)
13519 {
13520 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13521 ajStrTokenStep(seqHandle); /* 'OC' */
13522
13523 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
13524 {
13525 ajStrTrimWhite(&seqToken);
13526 seqTaxSave(thys, seqToken, 0);
13527 }
13528 }
13529
13530 if(tryfeat && lineprefix == SWISS_FT)
13531 {
13532 if(!dofeat)
13533 {
13534 dofeat = ajTrue;
13535 ajFeattabinDel(&seqin->Ftquery);
13536 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtEmbl,
13537 thys->Name, "N");
13538 /* ajDebug("seqin->Ftquery Filebuff %x\n",
13539 seqin->Ftquery->Input->Filebuff); */
13540
13541 }
13542
13543 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
13544 /* ajDebug("EMBL FEAT saved line:\n%S", seqReadLine); */
13545 }
13546
13547 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13548 lineprefix = seqPrefixSwiss(seqReadLine);
13549 }
13550
13551 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13552 lineprefix = seqPrefixSwiss(seqReadLine);
13553
13554 while(ok && lineprefix != SWISS_END)
13555 {
13556 seqAppend(&thys->Seq, seqReadLine);
13557 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13558 lineprefix = seqPrefixSwiss(seqReadLine);
13559 }
13560 ajDebug("Sequence read %d bases\n", ajStrGetLen(thys->Seq));
13561
13562 if(thys->Fttable)
13563 ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
13564
13565 while(ok && lineprefix != SWISS_ID)
13566 {
13567 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13568 lineprefix = seqPrefixSwiss(seqReadLine);
13569 }
13570
13571 if(ok)
13572 ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
13573 else
13574 ajFilebuffClear(buff, 0);
13575
13576 if(dofeat)
13577 {
13578 /* ajDebug("EMBL FEAT TabIn %x\n", seqin->Ftquery); */
13579 ajFeattableDel(&thys->Fttable);
13580 thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
13581 /* ajFeattableTrace(thys->Fttable); */
13582 ajFeattabinClear(seqin->Ftquery);
13583 }
13584
13585 if(ajStrGetLen(accvalstr))
13586 {
13587 ilen = ajStrGetLen(thys->Seq);
13588 if(ilen > thys->Qualsize)
13589 {
13590 AJCRESIZE(thys->Accuracy, ilen);
13591 thys->Qualsize = ilen;
13592 }
13593
13594 ajStrTokenAssignC(&seqHandle, accvalstr, " ");
13595
13596 for(i=0;i<ilen;i++)
13597 {
13598 thys->Accuracy[i] = INT_MIN;
13599 if(!ajStrTokenNextParse(seqHandle, &seqToken))
13600 {
13601 ajWarn("Missing accuracy for base %d in experiment format\n",
13602 i+1);
13603 break;
13604 }
13605
13606 ajStrTokenAssignC(&seqHandle2, seqToken, ",");
13607
13608 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
13609 {
13610 if(ajStrToInt(seqToken2, &ja))
13611 {
13612 if(ja > thys->Accuracy[i])
13613 thys->Accuracy[i] = (float) ja;
13614 }
13615 else
13616 {
13617 ajWarn("Bad accuracy '%S' for base %d "
13618 "in experiment format\n",
13619 seqToken, i+1);
13620 break;
13621 }
13622 }
13623 ajDebug("Accval[%u] %3f '%S'\n", i+1, thys->Accuracy[i], seqToken);
13624 }
13625 }
13626
13627 ajStrDelStatic(&seqToken);
13628 ajStrDelStatic(&seqToken2);
13629 ajStrDel(&accvalstr);
13630
13631 ajStrTokenReset(seqHandle);
13632 ajStrTokenReset(seqHandle2);
13633
13634
13635 /* ajSeqTrace(thys); */
13636
13637 return ajTrue;
13638 }
13639
13640
13641
13642
13643 /* @funcstatic seqReadGenbank *************************************************
13644 **
13645 ** Given data in a sequence structure, tries to read everything needed
13646 ** using Genbank format.
13647 **
13648 ** @param [w] thys [AjPSeq] Sequence object
13649 ** @param [u] seqin [AjPSeqin] Sequence input object
13650 ** @return [AjBool] ajTrue on success
13651 **
13652 ** @release 1.0.0
13653 ** @@
13654 ******************************************************************************/
13655
seqReadGenbank(AjPSeq thys,AjPSeqin seqin)13656 static AjBool seqReadGenbank(AjPSeq thys, AjPSeqin seqin)
13657 {
13658 AjBool ok;
13659 AjBool done = ajFalse;
13660 AjPFilebuff buff;
13661 AjPStr cmtstr = NULL;
13662 AjBool dofeat = ajFalse;
13663 AjBool tryfeat = ajFalse;
13664 AjPQuery qry;
13665 AjPStr liststr; /* for lists, do not delete */
13666 AjPSeqRef seqref = NULL;
13667 ajuint refnum;
13668 ajuint seqlen = 1024;
13669 ajint i;
13670 ajint nfields;
13671 ajuint taxid = 0;
13672 ajuint itaxtype = 0;
13673 SeqEPrefixGenbank lineprefix = GB_UNK;
13674 SeqEPrefixGenbankMore moreprefix = GB_MORE_UNK;
13675
13676 ajDebug("seqReadGenbank\n");
13677
13678 buff = seqin->Input->Filebuff;
13679 qry = seqin->Input->Query;
13680
13681 if(!seqFtFmtGenbank)
13682 ajStrAssignC(&seqFtFmtGenbank, "genbank");
13683
13684 if(!ajBuffreadLine(buff, &seqReadLine))
13685 return ajFalse;
13686
13687 lineprefix = seqPrefixGenbank(seqReadLine);
13688
13689 ajDebug("++seqReadGenbank first line '%S'\n", seqReadLine);
13690
13691 ok = ajTrue;
13692
13693 /* extra blank lines */
13694
13695 while(ajStrIsWhite(seqReadLine))
13696 {
13697 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
13698 return ajFalse;
13699 lineprefix = seqPrefixGenbank(seqReadLine);
13700 }
13701
13702 /* for GCG formatted databases */
13703
13704 if(lineprefix == GB_WP)
13705 {
13706 ok = ajBuffreadLine(buff, &seqReadLine);
13707 lineprefix = seqPrefixGenbank(seqReadLine);
13708
13709 while(ok && lineprefix == GB_MORE)
13710 {
13711 ok = ajBuffreadLine(buff, &seqReadLine);
13712 lineprefix = seqPrefixGenbank(seqReadLine);
13713 }
13714 }
13715
13716 /* This loop necessary owing to headers on GB distro files */
13717 if(ajStrFindC(seqReadLine,"Genetic Sequence Data Bank") >= 0)
13718 while(ok && lineprefix != GB_ID) /* LOCUS */
13719 {
13720 ok = ajBuffreadLine(buff, &seqReadLine);
13721 lineprefix = seqPrefixGenbank(seqReadLine);
13722 }
13723
13724 if(!ok)
13725 {
13726 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13727
13728 return ajFalse;
13729 }
13730
13731 if(lineprefix != GB_ID) /* LOCUS */
13732 {
13733 ajDebug("failed - LOCUS not found - first line was\n%S\n",
13734 seqReadLine);
13735 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13736
13737 return ajFalse;
13738 }
13739
13740 nfields = ajStrParseCountC(seqReadLine, " \n\r");
13741
13742 if(nfields == 9)
13743 {
13744 ajFilebuffSetBuffered(buff);
13745 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13746
13747 return seqReadGenpept(thys,seqin);
13748 }
13749
13750 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13751 i=0;
13752
13753 while(ajStrTokenNextParse(seqHandle, &seqToken))
13754 {
13755 switch(++i)
13756 {
13757 case 1:
13758 break;
13759 case 2:
13760 seqSetName(thys, seqToken);
13761 break;
13762 case 3:
13763 ajStrToUint(seqToken, &seqlen);
13764 break;
13765 case 4:
13766 if(ajStrMatchC(seqToken, "aa"))
13767 {
13768 ajFilebuffSetBuffered(buff);
13769 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13770 ajStrDelStatic(&seqToken);
13771 ajStrTokenReset(seqHandle);
13772
13773 ajDebug("first line %d aa pass to refseqp '%S'\n",
13774 buff->Pos, seqReadLine);
13775 return seqReadRefseqp(thys,seqin);
13776 }
13777 if(!ajStrMatchC(seqToken, "bp"))
13778 ajWarn("bad Genbank LOCUS line '%S'", seqReadLine);
13779 break;
13780 case 5:
13781 ajSeqmolSetGb(&thys->Molecule, seqToken);
13782 break;
13783 case 6:
13784 if(ajStrMatchC(seqToken, "circular"))
13785 thys->Circular = ajTrue;
13786 break;
13787 case 7:
13788 ajSeqdivSetGb(&thys->Division, seqToken);
13789 ajSeqclsSetGb(&thys->Class, seqToken);
13790 break;
13791 case 8:
13792 if(!thys->Date)
13793 thys->Date = ajSeqdateNew();
13794 ajSeqdateSetModifyS(thys->Date, seqToken);
13795 break;
13796 default:
13797 break;
13798 }
13799 }
13800
13801 if(seqin->Input->Text)
13802 ajStrAssignS(&thys->TextPtr, seqReadLine);
13803
13804 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13805 lineprefix = seqPrefixGenbank(seqReadLine);
13806
13807 dofeat = ajFalse;
13808 tryfeat = seqinUfoLocal(seqin);
13809
13810 while(ok &&
13811 lineprefix != GB_END &&
13812 lineprefix != GB_ORI &&
13813 lineprefix != GB_BASE)
13814 {
13815 done = ajFalse;
13816
13817 if(lineprefix == GB_DEF)
13818 {
13819 ajDebug("definition found\n");
13820 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13821 ajStrTokenStep(seqHandle); /* 'DEFINITION' */
13822 ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc); /* desc */
13823 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13824 lineprefix = seqPrefixGenbank(seqReadLine);
13825 done = ajTrue;
13826
13827 while(ok && lineprefix == GB_MORE)
13828 {
13829 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13830 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
13831 ajStrAppendC(&thys->Desc, " ");
13832 ajStrAppendS(&thys->Desc, seqToken);
13833 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13834 lineprefix = seqPrefixGenbank(seqReadLine);
13835 }
13836 }
13837
13838 else if(lineprefix == GB_AC)
13839 {
13840 ajDebug("accession found\n");
13841
13842 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
13843 ajStrTokenStep(seqHandle); /* 'ACCESSION' */
13844
13845 while(ajStrTokenNextParse(seqHandle, &seqToken))
13846 seqAccSave(thys, seqToken);
13847 }
13848
13849 else if(lineprefix == GB_VER)
13850 {
13851 ajDebug("seqversion found\n");
13852
13853 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13854 ajStrTokenStep(seqHandle); /* 'VERSION' */
13855 ajStrTokenNextParse(seqHandle, &seqToken);
13856 seqSvSave(thys, seqToken);
13857
13858 if(ajStrTokenStepC(seqHandle, ": \n\r")) /* GI: */
13859 {
13860 ajStrTokenNextParse(seqHandle, &thys->Gi);
13861 }
13862 }
13863
13864 else if(lineprefix == GB_SRC)
13865 {
13866 ajDebug("source found\n");
13867 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13868 ajStrTokenStep(seqHandle); /* 'SOURCE' */
13869 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* source */
13870 ajStrTokenAssignC(&seqHandle2, seqToken, "()");
13871 itaxtype=1;
13872
13873 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
13874 {
13875 ajStrTrimWhite(&seqToken2);
13876 seqTaxSave(thys, seqToken2, itaxtype);
13877 itaxtype = 3;
13878 }
13879
13880 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13881 lineprefix = seqPrefixGenbank(seqReadLine);
13882 done = ajTrue;
13883
13884 while(ok && lineprefix == GB_MORE)
13885 {
13886 done = ajFalse;
13887 /* process organism lines */
13888
13889 moreprefix = seqPrefixGenbankMore(seqReadLine);
13890
13891 if(moreprefix == GB_MORE_ORG)
13892 {
13893 ajDebug("organism found\n");
13894 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13895 ajStrTokenNextParse(seqHandle, &seqToken); /* 'ORGANISM' */
13896
13897 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
13898 {
13899 ajStrTrimWhite(&seqToken);
13900 seqTaxSave(thys, seqToken, 1);
13901 }
13902
13903 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13904
13905 moreprefix = seqPrefixGenbankMore(seqReadLine);
13906 done = ajTrue;
13907
13908 while(ok && moreprefix == GB_MORE_MORE)
13909 {
13910 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13911
13912 while(ajStrTokenNextParseC(seqHandle, ".;\n\r",
13913 &seqToken))
13914 {
13915 ajStrTrimWhite(&seqToken);
13916 seqTaxSave(thys, seqToken, 0);
13917 }
13918
13919 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13920 moreprefix = seqPrefixGenbankMore(seqReadLine);
13921 }
13922 }
13923
13924 if(!done)
13925 {
13926 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13927 }
13928 lineprefix = seqPrefixGenbank(seqReadLine);
13929 }
13930 }
13931
13932 else if(tryfeat && lineprefix == GB_FEAT)
13933 {
13934 ajDebug("features found\n");
13935
13936 if(!dofeat)
13937 {
13938 dofeat = ajTrue;
13939 ajFeattabinDel(&seqin->Ftquery);
13940 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtGenbank,
13941 thys->Name, "N");
13942 ajDebug("seqin->Ftquery Filebuff %x\n",
13943 seqin->Ftquery->Input->Filebuff);
13944 /* ajDebug("GENBANK FEAT first line:\n%S", seqReadLine); */
13945 }
13946
13947 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
13948 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13949 lineprefix = seqPrefixGenbank(seqReadLine);
13950 done = ajTrue;
13951
13952 while(ok && lineprefix == GB_MORE)
13953 {
13954 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff,
13955 seqReadLine);
13956 /* ajDebug("GENBANK FEAT saved line:\n%S", seqReadLine); */
13957 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13958 lineprefix = seqPrefixGenbank(seqReadLine);
13959 }
13960 }
13961
13962 else if(lineprefix == GB_REF)
13963 {
13964 ajDebug("reference found\n");
13965 seqref = ajSeqrefNew();
13966 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13967 ajStrTokenStep(seqHandle); /* 'REFERENCE' */
13968 ajStrTokenNextParse(seqHandle, &seqToken); /* number */
13969 ajStrToUint(seqToken, &refnum);
13970 ajSeqrefSetnumNumber(seqref, refnum);
13971 ajStrAssignClear(&seqToken2);
13972
13973 while (ajStrTokenNextParse(seqHandle, &seqToken))
13974 {
13975 if(ajStrMatchC(seqToken, "(bases"))
13976 continue;
13977
13978 if(ajStrMatchC(seqToken, "(residues"))
13979 continue;
13980
13981 if(ajStrMatchC(seqToken, "to"))
13982 continue;
13983
13984 if(!ajStrGetLen(seqToken2))
13985 ajStrAssignS(&seqToken2, seqToken);
13986
13987 if(ajStrSuffixC(seqToken, ")"))
13988 {
13989 ajStrTrimEndC(&seqToken, ")");
13990 ajStrAppendK(&seqToken2, '-');
13991 ajStrAppendS(&seqToken2, seqToken);
13992 }
13993 }
13994
13995 ajSeqrefSetPosition(seqref, seqToken2);
13996
13997 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13998 moreprefix = seqPrefixGenbankMore(seqReadLine);
13999 done = ajTrue;
14000
14001 ajSeqrefStandard(seqref);
14002 ajSeqAddRef(thys, seqref);
14003
14004 if(ok && moreprefix == GB_MORE_AUT)
14005 {
14006 ajDebug("authors found\n");
14007 if(!seqref)
14008 seqref = ajSeqrefNew();
14009 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14010 ajStrTokenStep(seqHandle); /* 'AUTHORS' */
14011 ajStrTokenNextParseC(seqHandle, "\n\r",
14012 &seqToken2); /* authors */
14013
14014 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14015 moreprefix = seqPrefixGenbankMore(seqReadLine);
14016
14017 while(ok && moreprefix == GB_MORE_MORE)
14018 {
14019 ajStrAssignS(&seqToken, seqReadLine);
14020 ajStrTrimWhite(&seqToken);
14021 if(ajStrSuffixC(seqToken2, ".,") ||
14022 ajStrPrefixC(seqToken, "and "))
14023 ajStrAppendC(&seqToken2, " ");
14024 ajStrAppendS(&seqToken2, seqToken);
14025 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14026 moreprefix = seqPrefixGenbankMore(seqReadLine);
14027 }
14028
14029 /* append here - genbank splits author names across lines */
14030 ajSeqrefAppendAuthors(seqref, seqToken2);
14031 }
14032
14033 if(ok && moreprefix == GB_MORE_TIT)
14034 {
14035 ajDebug("title found\n");
14036 if(!seqref)
14037 seqref = ajSeqrefNew();
14038
14039 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14040 ajStrTokenStep(seqHandle); /* 'TITLE' */
14041 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
14042
14043 ajSeqrefAppendTitle(seqref, seqToken);
14044
14045 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14046 moreprefix = seqPrefixGenbankMore(seqReadLine);
14047
14048 while(ok && moreprefix == GB_MORE_MORE)
14049 {
14050 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14051 ajStrTokenStepC(seqHandle, "\n\r"); /* title */
14052 ajSeqrefAppendTitle(seqref, seqToken);
14053
14054 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14055 moreprefix = seqPrefixGenbankMore(seqReadLine);
14056 }
14057 }
14058
14059 if(ok && moreprefix == GB_MORE_JNL)
14060 {
14061 ajDebug("journal location found\n");
14062 if(!seqref)
14063 seqref = ajSeqrefNew();
14064
14065 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14066 ajStrTokenStep(seqHandle); /* 'JOURNAL' */
14067 ajStrTokenNextParseC(seqHandle, "\n\r",
14068 &seqToken); /* location */
14069
14070 ajSeqrefAppendLocation(seqref, seqToken);
14071
14072 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14073 moreprefix = seqPrefixGenbankMore(seqReadLine);
14074 }
14075
14076 while(ok && moreprefix == GB_MORE_MORE)
14077 {
14078 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14079 moreprefix = seqPrefixGenbankMore(seqReadLine);
14080 }
14081
14082 seqref = NULL;
14083 lineprefix = seqPrefixGenbank(seqReadLine);
14084 }
14085
14086 else if(ok && lineprefix == GB_CC)
14087 {
14088 ajDebug("comment found\n");
14089
14090 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14091 ajStrTokenStep(seqHandle); /* 'COMMENT' */
14092 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
14093
14094 if(ajStrGetLen(cmtstr))
14095 ajStrAppendC(&cmtstr, "\n");
14096 ajStrAppendS(&cmtstr, seqToken);
14097
14098 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14099 moreprefix = seqPrefixGenbankMore(seqReadLine);
14100 done = ajTrue;
14101
14102 while(ok && moreprefix == GB_MORE_MORE)
14103 {
14104 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14105 ajStrTokenNextParseC(seqHandle, "\n\r",
14106 &seqToken); /* comment */
14107
14108 if(ajStrGetLen(seqToken))
14109 {
14110 if(ajStrGetLen(cmtstr))
14111 ajStrAppendC(&cmtstr, "\n");
14112 ajStrAppendS(&cmtstr, seqToken);
14113 }
14114 else
14115 {
14116 ajSeqAddCmt(thys, cmtstr);
14117 cmtstr = NULL;
14118 }
14119
14120 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14121 moreprefix = seqPrefixGenbankMore(seqReadLine);
14122 }
14123
14124 if(ajStrGetLen(cmtstr))
14125 ajSeqAddCmt(thys, cmtstr);
14126
14127 lineprefix = seqPrefixGenbank(seqReadLine);
14128 cmtstr = NULL;
14129 }
14130
14131 else if(lineprefix == GB_KEY)
14132 {
14133 ajDebug("keywords found\n");
14134 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14135 ajStrTokenStep(seqHandle); /* 'KEYWORDS' */
14136
14137 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14138 {
14139 liststr = ajStrNewS(seqToken);
14140 ajStrTrimWhite(&liststr);
14141 ajSeqAddKey(thys, liststr);
14142 liststr = NULL;
14143 }
14144
14145 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14146 lineprefix = seqPrefixGenbank(seqReadLine);
14147 done = ajTrue;
14148
14149 while(ok && lineprefix == GB_MORE)
14150 {
14151 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14152
14153 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14154 {
14155 liststr = ajStrNewS(seqToken);
14156 ajStrTrimWhite(&liststr);
14157 ajSeqAddKey(thys, liststr);
14158 liststr = NULL;
14159 }
14160
14161 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14162 lineprefix = seqPrefixGenbank(seqReadLine);
14163 }
14164 }
14165
14166 if(!done)
14167 {
14168 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14169 lineprefix = seqPrefixGenbank(seqReadLine);
14170 }
14171 }
14172
14173 if(dofeat)
14174 {
14175 ajDebug("GENBANK FEAT TabIn %x\n", seqin->Ftquery);
14176 ajFeattableDel(&thys->Fttable);
14177 thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
14178 /* ajFeattableTrace(thys->Fttable); */
14179 ajFeattabinClear(seqin->Ftquery);
14180 }
14181
14182 if(ajStrGetLen(seqin->Inseq))
14183 {
14184 /* we have a sequence to use */
14185 ajDebug("Got an Inseq sequence\n");
14186
14187 if(ajStrMatchC(qry->Method,"gcg"))
14188 {
14189 while(ok && lineprefix != GB_ORI)
14190 {
14191 ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14192 lineprefix = seqPrefixGenbank(seqReadLine);
14193 }
14194 }
14195
14196 ajStrAssignS(&thys->Seq, seqin->Inseq);
14197
14198 if(seqin->Input->Text)
14199 {
14200 seqTextSeq(&thys->TextPtr, seqin->Inseq);
14201 ajFmtPrintAppS(&thys->TextPtr, "//\n");
14202 }
14203 }
14204 else
14205 {
14206 /* read the sequence and terminator */
14207 ajDebug("sequence start at '%S'\n", seqReadLine);
14208
14209 while(ok &&
14210 lineprefix != GB_END &&
14211 lineprefix != GB_ORI &&
14212 lineprefix != GB_BASE)
14213 {
14214 ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14215 lineprefix = seqPrefixGenbank(seqReadLine);
14216
14217 if(!ok)
14218 break;
14219 }
14220
14221 if(ok && lineprefix != GB_END)
14222 {
14223 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14224 lineprefix = seqPrefixGenbank(seqReadLine);
14225 }
14226
14227 ajStrSetRes(&thys->Seq, seqlen+1);
14228
14229 while(ok && lineprefix != GB_END)
14230 {
14231 if(lineprefix != GB_ORI &&
14232 lineprefix != GB_BASE)
14233 seqAppend(&thys->Seq, seqReadLine);
14234
14235 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14236 lineprefix = seqPrefixGenbank(seqReadLine);
14237 }
14238 }
14239
14240 if(!ajStrMatchC(qry->Method,"gcg"))
14241 {
14242 while(ok && lineprefix != GB_END)
14243 {
14244 ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14245 lineprefix = seqPrefixGenbank(seqReadLine);
14246 }
14247 }
14248
14249 if(thys->Fttable)
14250 ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
14251
14252 if(ajFeattableGetSize(thys->Fttable))
14253 {
14254 ajFeattableGetXrefs(thys->Fttable, &thys->Xreflist, &taxid);
14255 if(taxid)
14256 seqTaxidSaveI(thys, taxid);
14257 }
14258
14259 if(!taxid)
14260 taxid = ajSeqGetTaxid(thys);
14261
14262 ajFilebuffClear(buff, 0);
14263
14264 ajStrTokenReset(seqHandle);
14265 ajStrTokenReset(seqHandle2);
14266 ajStrDelStatic(&seqToken);
14267 ajStrDelStatic(&seqToken2);
14268
14269 return ajTrue;
14270 }
14271
14272
14273
14274
14275 /* @funcstatic seqReadRefseq **************************************************
14276 **
14277 ** Given data in a sequence structure, tries to read everything needed
14278 ** using Refseq format.
14279 **
14280 ** @param [w] thys [AjPSeq] Sequence object
14281 ** @param [u] seqin [AjPSeqin] Sequence input object
14282 ** @return [AjBool] ajTrue on success
14283 **
14284 ** @release 6.1.0
14285 ** @@
14286 ******************************************************************************/
14287
seqReadRefseq(AjPSeq thys,AjPSeqin seqin)14288 static AjBool seqReadRefseq(AjPSeq thys, AjPSeqin seqin)
14289 {
14290 return seqReadGenbank(thys, seqin);
14291 }
14292
14293
14294
14295
14296 /* @funcstatic seqReadGenpept *************************************************
14297 **
14298 ** Given data in a sequence structure, tries to read everything needed
14299 ** using Genpept format.
14300 **
14301 ** @param [w] thys [AjPSeq] Sequence object
14302 ** @param [u] seqin [AjPSeqin] Sequence input object
14303 ** @return [AjBool] ajTrue on success
14304 **
14305 ** @release 6.1.0
14306 ** @@
14307 ******************************************************************************/
14308
seqReadGenpept(AjPSeq thys,AjPSeqin seqin)14309 static AjBool seqReadGenpept(AjPSeq thys, AjPSeqin seqin)
14310 {
14311 AjBool ok;
14312 AjBool done = ajFalse;
14313 AjPFilebuff buff;
14314 AjPStr cmtstr = NULL;
14315 /*
14316 // AjBool dofeat = ajFalse;
14317 // AjBool tryfeat = ajFalse;
14318 */
14319 AjPQuery qry;
14320 AjPStr liststr; /* for lists, do not delete */
14321 AjPSeqRef seqref = NULL;
14322 ajuint refnum;
14323 ajuint seqlen = 1024;
14324 ajint i;
14325 ajint nfields;
14326 ajuint itaxtype = 0;
14327 SeqEPrefixGenbank lineprefix = GB_UNK;
14328
14329 ajDebug("seqReadGenpept\n");
14330
14331 buff = seqin->Input->Filebuff;
14332 qry = seqin->Input->Query;
14333
14334 if(!ajBuffreadLine(buff, &seqReadLine))
14335 return ajFalse;
14336
14337 lineprefix = seqPrefixGenbank(seqReadLine);
14338
14339 ok = ajTrue;
14340
14341 /* extra blank lines */
14342
14343 while(ajStrIsWhite(seqReadLine))
14344 {
14345 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
14346 return ajFalse;
14347
14348 lineprefix = seqPrefixGenbank(seqReadLine);
14349 }
14350
14351 /* for GCG formatted databases */
14352
14353 if(lineprefix == GB_WP)
14354 {
14355 ok = ajBuffreadLine(buff, &seqReadLine);
14356 lineprefix = seqPrefixGenbank(seqReadLine);
14357
14358 while(ok && lineprefix == GB_MORE)
14359 {
14360 ok = ajBuffreadLine(buff, &seqReadLine);
14361 lineprefix = seqPrefixGenbank(seqReadLine);
14362 }
14363 }
14364
14365 /* This loop necessary owing to headers on GB distro files */
14366 if(ajStrFindC(seqReadLine,"Genetic Sequence Data Bank") >= 0)
14367 {
14368 while(ok && lineprefix != GB_ID) /* LOCUS */
14369 {
14370 ok = ajBuffreadLine(buff, &seqReadLine);
14371 lineprefix = seqPrefixGenbank(seqReadLine);
14372 }
14373 }
14374
14375 if(!ok)
14376 {
14377 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14378
14379 return ajFalse;
14380 }
14381
14382 if(lineprefix != GB_ID) /* LOCUS */
14383 {
14384 ajDebug("failed - LOCUS not found - first line was\n%S\n",
14385 seqReadLine);
14386 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14387
14388 return ajFalse;
14389 }
14390
14391 nfields = ajStrParseCountC(seqReadLine, " \n\r");
14392
14393 if(nfields == 8)
14394 {
14395 ajFilebuffSetBuffered(buff);
14396 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14397
14398 return seqReadRefseqp(thys,seqin);
14399 }
14400
14401 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14402 i=0;
14403
14404 while(ajStrTokenNextParse(seqHandle, &seqToken))
14405 {
14406 switch(++i)
14407 {
14408 case 1:
14409 break;
14410 case 2:
14411 seqSetName(thys, seqToken);
14412 break;
14413 case 3:
14414 ajStrToUint(seqToken, &seqlen);
14415 break;
14416 case 4:
14417 if(!ajStrMatchC(seqToken, "aa"))
14418 ajWarn("bad Genpept LOCUS line '%S'", seqReadLine);
14419 break;
14420 case 5:
14421 break;
14422 case 6:
14423 ajSeqdivSetGb(&thys->Division, seqToken);
14424 ajSeqclsSetGb(&thys->Class, seqToken);
14425 break;
14426 case 7:
14427 if(!thys->Date)
14428 thys->Date = ajSeqdateNew();
14429 ajSeqdateSetModifyS(thys->Date, seqToken);
14430 break;
14431 default:
14432 break;
14433 }
14434 }
14435
14436 if(seqin->Input->Text)
14437 ajStrAssignS(&thys->TextPtr, seqReadLine);
14438
14439 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14440 lineprefix = seqPrefixGenbank(seqReadLine);
14441
14442 /*
14443 // dofeat = ajFalse;
14444 // tryfeat = seqinUfoLocal(seqin);
14445 */
14446
14447 while(ok &&
14448 !ajStrPrefixC(seqReadLine, "ORIGIN") &&
14449 !ajStrPrefixC(seqReadLine, "BASE COUNT"))
14450 {
14451 done = ajFalse;
14452
14453 if(ajStrPrefixC(seqReadLine, "DEFINITION"))
14454 {
14455 ajDebug("definition found\n");
14456 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14457 ajStrTokenStep(seqHandle); /* 'DEFINITION' */
14458 ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc); /* desc */
14459 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14460 lineprefix = seqPrefixGenbank(seqReadLine);
14461 done = ajTrue;
14462
14463 while(ok && ajStrPrefixC(seqReadLine, " "))
14464 {
14465 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14466 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
14467 ajStrAppendC(&thys->Desc, " ");
14468 ajStrAppendS(&thys->Desc, seqToken);
14469 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14470 lineprefix = seqPrefixGenbank(seqReadLine);
14471 }
14472 }
14473
14474 else if(ajStrPrefixC(seqReadLine, "ACCESSION"))
14475 {
14476 ajDebug("accession found\n");
14477
14478 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
14479 ajStrTokenStep(seqHandle); /* 'ACCESSION' */
14480
14481 while(ajStrTokenNextParse(seqHandle, &seqToken))
14482 seqAccSave(thys, seqToken);
14483 }
14484
14485 else if(ajStrPrefixC(seqReadLine, "VERSION"))
14486 {
14487 ajDebug("seqversion found\n");
14488
14489 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14490 ajStrTokenStep(seqHandle); /* 'VERSION' */
14491 ajStrTokenNextParse(seqHandle, &seqToken);
14492 seqSvSave(thys, seqToken);
14493
14494 if(ajStrTokenStepC(seqHandle, ": \n\r")) /* GI: */
14495 {
14496 ajStrTokenNextParse(seqHandle, &thys->Gi);
14497 }
14498 }
14499
14500 else if(ajStrPrefixC(seqReadLine, "SOURCE"))
14501 {
14502 ajDebug("source found\n");
14503 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14504 ajStrTokenStep(seqHandle); /* 'SOURCE' */
14505 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* source */
14506 ajStrTokenAssignC(&seqHandle2, seqToken, "()");
14507 itaxtype=1;
14508
14509 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
14510 {
14511 ajStrTrimWhite(&seqToken2);
14512 seqTaxSave(thys, seqToken2, itaxtype);
14513 itaxtype = 3;
14514 }
14515
14516 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14517 lineprefix = seqPrefixGenbank(seqReadLine);
14518 done = ajTrue;
14519
14520 while(ok && ajStrPrefixC(seqReadLine, " "))
14521 {
14522 done = ajFalse;
14523 /* process organism lines */
14524
14525 if(ajStrPrefixC(seqReadLine, " ORGANISM"))
14526 {
14527 ajDebug("organism found\n");
14528 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14529 ajStrTokenStep(seqHandle); /* 'ORGANISM' */
14530
14531 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14532 {
14533 ajStrTrimWhite(&seqToken);
14534 seqTaxSave(thys, seqToken, 1);
14535 }
14536
14537 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14538 lineprefix = seqPrefixGenbank(seqReadLine);
14539 done = ajTrue;
14540
14541 while(ok && ajStrPrefixC(seqReadLine, " "))
14542 {
14543 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14544
14545 while(ajStrTokenNextParseC(seqHandle, ".;\n\r",
14546 &seqToken))
14547 {
14548 ajStrAssignS(&seqToken2, seqToken);
14549 ajStrTrimWhite(&seqToken2);
14550 seqTaxSave(thys, seqToken2, 0);
14551 }
14552
14553 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14554 lineprefix = seqPrefixGenbank(seqReadLine);
14555 }
14556 }
14557
14558 if(!done)
14559 {
14560 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14561 lineprefix = seqPrefixGenbank(seqReadLine);
14562 }
14563 }
14564 }
14565
14566 else if(ajStrPrefixC(seqReadLine, "REFERENCE"))
14567 {
14568 ajDebug("reference found\n");
14569 seqref = ajSeqrefNew();
14570 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14571 ajStrTokenStep(seqHandle); /* 'REFERENCE' */
14572 ajStrTokenNextParse(seqHandle, &seqToken); /* number */
14573 ajStrToUint(seqToken, &refnum);
14574 ajSeqrefSetnumNumber(seqref, refnum);
14575 ajStrAssignClear(&seqToken2);
14576
14577 while (ajStrTokenNextParse(seqHandle, &seqToken))
14578 {
14579 if(ajStrMatchC(seqToken, "(bases"))
14580 continue;
14581
14582 if(ajStrMatchC(seqToken, "(residues"))
14583 continue;
14584
14585 if(ajStrMatchC(seqToken, "to"))
14586 continue;
14587
14588 if(!ajStrGetLen(seqToken2))
14589 ajStrAssignS(&seqToken2, seqToken);
14590
14591 if(ajStrSuffixC(seqToken, ")"))
14592 {
14593 ajStrTrimEndC(&seqToken, ")");
14594 ajStrAppendK(&seqToken2, '-');
14595 ajStrAppendS(&seqToken2, seqToken);
14596 }
14597 }
14598
14599 ajSeqrefSetPosition(seqref, seqToken2);
14600
14601 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine,&thys->TextPtr);
14602 lineprefix = seqPrefixGenbank(seqReadLine);
14603 done = ajTrue;
14604
14605 ajSeqrefStandard(seqref);
14606 ajSeqAddRef(thys, seqref);
14607
14608 if(ok && ajStrPrefixC(seqReadLine, " AUTHORS"))
14609 {
14610 ajDebug("authors found\n");
14611 if(!seqref)
14612 seqref = ajSeqrefNew();
14613 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14614 ajStrTokenStep(seqHandle); /* 'AUTHORS' */
14615 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken2); /* authors */
14616
14617 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14618 lineprefix = seqPrefixGenbank(seqReadLine);
14619
14620 while(ok && ajStrPrefixC(seqReadLine, " "))
14621 {
14622 ajStrAssignS(&seqToken, seqReadLine);
14623 ajStrTrimWhite(&seqToken);
14624 if(ajStrSuffixC(seqToken2, ".,") ||
14625 ajStrPrefixC(seqToken2, "and "))
14626 ajStrAppendC(&seqToken2, " ");
14627 ajStrAppendS(&seqToken2, seqToken);
14628 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14629 lineprefix = seqPrefixGenbank(seqReadLine);
14630 }
14631
14632 /* append here - genbank splits author names across lines */
14633 ajSeqrefAppendAuthors(seqref, seqToken2);
14634 }
14635
14636 if(ok && ajStrPrefixC(seqReadLine, " TITLE"))
14637 {
14638 ajDebug("title found\n");
14639 if(!seqref)
14640 seqref = ajSeqrefNew();
14641
14642 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14643 ajStrTokenStep(seqHandle); /* 'TITLE' */
14644 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
14645
14646 ajSeqrefAppendTitle(seqref, seqToken);
14647
14648 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14649 lineprefix = seqPrefixGenbank(seqReadLine);
14650
14651 while(ok && ajStrPrefixC(seqReadLine, " "))
14652 {
14653 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14654 ajStrTokenNextParseC(seqHandle, "\n\r",
14655 &seqToken); /* title */
14656 ajSeqrefAppendTitle(seqref, seqToken);
14657
14658 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14659 lineprefix = seqPrefixGenbank(seqReadLine);
14660 }
14661 }
14662
14663 if(ok && ajStrPrefixC(seqReadLine, " JOURNAL"))
14664 {
14665 ajDebug("journal location found\n");
14666 if(!seqref)
14667 seqref = ajSeqrefNew();
14668
14669 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14670 ajStrTokenStep(seqHandle); /* 'JOURNAL' */
14671 ajStrTokenNextParseC(seqHandle, "\n\r",
14672 &seqToken); /* location */
14673
14674 ajSeqrefAppendLocation(seqref, seqToken);
14675
14676 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14677 lineprefix = seqPrefixGenbank(seqReadLine);
14678 }
14679
14680 while(ok && ajStrPrefixC(seqReadLine, " "))
14681 {
14682 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14683 lineprefix = seqPrefixGenbank(seqReadLine);
14684 }
14685
14686 seqref = NULL;
14687 }
14688
14689 else if(ok && ajStrPrefixC(seqReadLine, "COMMENT"))
14690 {
14691 ajDebug("comment found\n");
14692
14693 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14694 ajStrTokenStep(seqHandle); /* 'COMMENT' */
14695 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
14696
14697 if(ajStrGetLen(cmtstr))
14698 ajStrAppendC(&cmtstr, "\n");
14699 ajStrAppendS(&cmtstr, seqToken);
14700
14701 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14702 lineprefix = seqPrefixGenbank(seqReadLine);
14703 done = ajTrue;
14704
14705 while(ok && ajStrPrefixC(seqReadLine, " "))
14706 {
14707 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14708 ajStrTokenNextParseC(seqHandle, "\n\r",
14709 &seqToken); /* comment */
14710
14711 if(ajStrGetLen(seqToken))
14712 {
14713 if(ajStrGetLen(cmtstr))
14714 ajStrAppendC(&cmtstr, "\n");
14715 ajStrAppendS(&cmtstr, seqToken);
14716 }
14717 else
14718 {
14719 ajSeqAddCmt(thys, cmtstr);
14720 cmtstr = NULL;
14721 }
14722
14723 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14724 lineprefix = seqPrefixGenbank(seqReadLine);
14725 }
14726
14727 if(ajStrGetLen(cmtstr))
14728 ajSeqAddCmt(thys, cmtstr);
14729
14730 cmtstr = NULL;
14731 }
14732
14733 else if(ajStrPrefixC(seqReadLine, "KEYWORDS"))
14734 {
14735 ajDebug("keywords found\n");
14736 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14737 ajStrTokenStep(seqHandle); /* 'KEYWORDS' */
14738
14739 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14740 {
14741 liststr = ajStrNewS(seqToken);
14742 ajStrTrimWhite(&liststr);
14743 ajSeqAddKey(thys, liststr);
14744 liststr = NULL;
14745 }
14746
14747 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14748 lineprefix = seqPrefixGenbank(seqReadLine);
14749 done = ajTrue;
14750
14751 while(ok && ajStrPrefixC(seqReadLine, " "))
14752 {
14753 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14754
14755 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14756 {
14757 liststr = ajStrNewS(seqToken);
14758 ajStrTrimWhite(&liststr);
14759 ajSeqAddKey(thys, liststr);
14760 liststr = NULL;
14761 }
14762
14763 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14764 lineprefix = seqPrefixGenbank(seqReadLine);
14765 }
14766 }
14767
14768 else if(ajStrPrefixC(seqReadLine, " ORGANISM"))
14769 {
14770 ajDebug("organism found\n");
14771 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14772 ajStrTokenStep(seqHandle); /* 'ORGANISM' */
14773
14774 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14775 {
14776 ajStrTrimWhite(&seqToken);
14777 seqTaxSave(thys, seqToken, 0);
14778 }
14779
14780 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14781 lineprefix = seqPrefixGenbank(seqReadLine);
14782 done = ajTrue;
14783
14784 while(ok && ajStrPrefixC(seqReadLine, " "))
14785 {
14786 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14787
14788 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14789 {
14790 ajStrTrimWhite(&seqToken);
14791 seqTaxSave(thys, seqToken, 0);
14792 }
14793
14794 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14795 lineprefix = seqPrefixGenbank(seqReadLine);
14796 }
14797 }
14798
14799 if(!done)
14800 {
14801 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14802 lineprefix = seqPrefixGenbank(seqReadLine);
14803 }
14804
14805 }
14806 /*
14807 // if(dofeat)
14808 // {
14809 // ajDebug("GENPEPT FEAT TabIn %x\n", seqin->Ftquery);
14810 // ajFeattableDel(&thys->Fttable);
14811 // thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
14812 // /# ajFeattableTrace(thys->Fttable); #/
14813 // ajFeattabinClear(seqin->Ftquery);
14814 // }
14815 */
14816
14817 if(ajStrGetLen(seqin->Inseq))
14818 {
14819 /* we have a sequence to use */
14820 ajDebug("Got an Inseq sequence\n");
14821
14822 if(ajStrMatchC(qry->Method,"gcg"))
14823 while(ok && !ajStrPrefixC(seqReadLine,"ORIGIN"))
14824 ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14825
14826 ajStrAssignS(&thys->Seq, seqin->Inseq);
14827
14828 if(seqin->Input->Text)
14829 {
14830 seqTextSeq(&thys->TextPtr, seqin->Inseq);
14831 ajFmtPrintAppS(&thys->TextPtr, "//\n");
14832 }
14833 }
14834 else
14835 {
14836 /* read the sequence and terminator */
14837 ajDebug("sequence start at '%S'\n", seqReadLine);
14838
14839 while(!ajStrPrefixC(seqReadLine,"ORIGIN") &&
14840 !ajStrPrefixC(seqReadLine,"BASE COUNT"))
14841 if(!ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr))
14842 break;
14843
14844 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14845 ajStrSetRes(&thys->Seq, seqlen+1);
14846
14847 while(ok && !ajStrPrefixC(seqReadLine, "//"))
14848 {
14849 if(!ajStrPrefixC(seqReadLine, "ORIGIN") &&
14850 !ajStrPrefixC(seqReadLine,"BASE COUNT"))
14851 seqAppend(&thys->Seq, seqReadLine);
14852 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14853 }
14854 }
14855
14856 if(!ajStrMatchC(qry->Method,"gcg"))
14857 while(ok && !ajStrPrefixC(seqReadLine,"//"))
14858 ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14859
14860
14861 if(thys->Fttable)
14862 ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
14863
14864 ajFilebuffClear(buff, 0);
14865
14866 ajStrTokenReset(seqHandle);
14867 ajStrTokenReset(seqHandle2);
14868 ajStrDelStatic(&seqToken);
14869 ajStrDelStatic(&seqToken2);
14870
14871 return ajTrue;
14872 }
14873
14874
14875
14876
14877 /* @funcstatic seqReadRefseqp *************************************************
14878 **
14879 ** Given data in a sequence structure, tries to read everything needed
14880 ** using Refseq protein format.
14881 **
14882 ** @param [w] thys [AjPSeq] Sequence object
14883 ** @param [u] seqin [AjPSeqin] Sequence input object
14884 ** @return [AjBool] ajTrue on success
14885 **
14886 ** @release 6.1.0
14887 ** @@
14888 ******************************************************************************/
14889
seqReadRefseqp(AjPSeq thys,AjPSeqin seqin)14890 static AjBool seqReadRefseqp(AjPSeq thys, AjPSeqin seqin)
14891 {
14892 AjBool ok;
14893 AjBool done = ajFalse;
14894 AjPFilebuff buff;
14895 AjPStr cmtstr = NULL;
14896 AjBool dofeat = ajFalse;
14897 AjBool tryfeat = ajFalse;
14898 AjPQuery qry;
14899 AjPStr liststr; /* for lists, do not delete */
14900 AjPSeqRef seqref = NULL;
14901 ajuint refnum;
14902 ajuint seqlen = 1024;
14903 ajint i;
14904 ajuint itaxtype = 0;
14905
14906 ajDebug("seqReadRefseqp\n");
14907
14908 buff = seqin->Input->Filebuff;
14909 qry = seqin->Input->Query;
14910
14911 if(!seqFtFmtRefseqp)
14912 ajStrAssignC(&seqFtFmtRefseqp, "refseqp");
14913
14914 if(!ajBuffreadLine(buff, &seqReadLine))
14915 return ajFalse;
14916
14917 ajDebug("++seqReadRefseqp %d first line '%S'\n", buff->Pos, seqReadLine);
14918
14919 ok = ajTrue;
14920
14921 /* extra blank lines */
14922
14923 while(ajStrIsWhite(seqReadLine))
14924 {
14925 if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
14926 return ajFalse;
14927 }
14928
14929 /* for GCG formatted databases */
14930
14931 if(ajStrPrefixC(seqReadLine, "WPCOMMENT"))
14932 {
14933 ok = ajBuffreadLine(buff, &seqReadLine);
14934
14935 while(ok && ajStrPrefixC(seqReadLine, " "))
14936 {
14937 ok = ajBuffreadLine(buff, &seqReadLine);
14938 }
14939 }
14940
14941 if(!ok)
14942 {
14943 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14944
14945 return ajFalse;
14946 }
14947
14948 if(!ajStrPrefixC(seqReadLine, "LOCUS"))
14949 {
14950 ajDebug("failed - LOCUS not found - first line was\n%S\n",
14951 seqReadLine);
14952 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14953 return ajFalse;
14954 }
14955
14956 if(seqin->Input->Text)
14957 ajStrAssignS(&thys->TextPtr,seqReadLine);
14958
14959 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14960 i=0;
14961
14962 while(ajStrTokenNextParse(seqHandle, &seqToken))
14963 {
14964 switch(++i)
14965 {
14966 case 1: /* 'LOCUS' */
14967 break;
14968 case 2: /* locus name */
14969 seqSetName(thys, seqToken);
14970 break;
14971 case 3: /* length */
14972 ajStrToUint(seqToken, &seqlen);
14973 break;
14974 case 4: /* 'aa' */
14975 if(!ajStrMatchC(seqToken, "aa"))
14976 ajWarn("bad RefseqP LOCUS line '%S'", seqReadLine);
14977 break;
14978 case 5: /* linear etc. */
14979 break;
14980 case 6:
14981 ajSeqdivSetGb(&thys->Division, seqToken);
14982 ajSeqclsSetGb(&thys->Class, seqToken);
14983 break;
14984 case 7:
14985 if(!thys->Date)
14986 thys->Date = ajSeqdateNew();
14987 ajSeqdateSetModifyS(thys->Date, seqToken);
14988 break;
14989 default:
14990 break;
14991 }
14992 }
14993
14994 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14995
14996 dofeat = ajFalse;
14997 tryfeat = seqinUfoLocal(seqin);
14998
14999 while(ok &&
15000 !ajStrPrefixC(seqReadLine, "ORIGIN") &&
15001 !ajStrPrefixC(seqReadLine, "BASE COUNT"))
15002 {
15003 done = ajFalse;
15004
15005 if(ajStrPrefixC(seqReadLine, "DEFINITION"))
15006 {
15007 ajDebug("definition found\n");
15008 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15009 ajStrTokenStep(seqHandle); /* 'DEFINITION' */
15010 ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc); /* desc */
15011 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15012 done = ajTrue;
15013
15014 while(ok && ajStrPrefixC(seqReadLine, " "))
15015 {
15016 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15017 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
15018 ajStrAppendC(&thys->Desc, " ");
15019 ajStrAppendS(&thys->Desc, seqToken);
15020 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15021 }
15022 }
15023
15024 else if(ajStrPrefixC(seqReadLine, "ACCESSION"))
15025 {
15026 ajDebug("accession found\n");
15027
15028 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
15029 ajStrTokenStep(seqHandle); /* 'ACCESSION' */
15030
15031 while(ajStrTokenNextParse(seqHandle, &seqToken))
15032 seqAccSave(thys, seqToken);
15033 }
15034
15035 else if(ajStrPrefixC(seqReadLine, "VERSION"))
15036 {
15037 ajDebug("seqversion found\n");
15038
15039 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
15040 ajStrTokenStep(seqHandle); /* 'VERSION' */
15041 ajStrTokenNextParse(seqHandle, &seqToken);
15042 seqSvSave(thys, seqToken);
15043
15044 if(ajStrTokenStepC(seqHandle, ": \n\r")) /* GI: */
15045 {
15046 ajStrTokenNextParse(seqHandle, &thys->Gi);
15047 }
15048 }
15049
15050 else if(ajStrPrefixC(seqReadLine, "SOURCE"))
15051 {
15052 ajDebug("source found\n");
15053 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15054 ajStrTokenStep(seqHandle); /* 'SOURCE' */
15055 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* source */
15056 ajStrTokenAssignC(&seqHandle2, seqToken, "()");
15057 itaxtype=1;
15058
15059 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
15060 {
15061 ajStrTrimWhite(&seqToken2);
15062 seqTaxSave(thys, seqToken2, itaxtype);
15063 itaxtype = 3;
15064 }
15065
15066 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15067 done = ajTrue;
15068
15069 while(ok && ajStrPrefixC(seqReadLine, " "))
15070 {
15071 done = ajFalse;
15072 /* process organism lines */
15073
15074 if(ajStrPrefixC(seqReadLine, " ORGANISM"))
15075 {
15076 ajDebug("organism found\n");
15077 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15078 ajStrTokenStep(seqHandle); /* 'ORGANISM' */
15079
15080 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15081 {
15082 ajStrTrimWhite(&seqToken);
15083 seqTaxSave(thys, seqToken, 1);
15084 }
15085
15086 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15087
15088 done = ajTrue;
15089
15090 while(ok && ajStrPrefixC(seqReadLine, " "))
15091 {
15092 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15093
15094 while(ajStrTokenNextParseC(seqHandle, ".;\n\r",
15095 &seqToken))
15096 {
15097 ajStrTrimWhite(&seqToken);
15098 seqTaxSave(thys, seqToken, 0);
15099 }
15100
15101 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15102 }
15103 }
15104
15105 if(!done)
15106 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15107 }
15108 }
15109
15110 else if(tryfeat && ajStrPrefixC(seqReadLine, "FEATURES"))
15111 {
15112 ajDebug("features found\n");
15113
15114 if(!dofeat)
15115 {
15116 dofeat = ajTrue;
15117 ajFeattabinDel(&seqin->Ftquery);
15118 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtRefseqp,
15119 thys->Name, "N");
15120 ajDebug("seqin->Ftquery Filebuff %x\n",
15121 seqin->Ftquery->Input->Filebuff);
15122 /* ajDebug("REFSEQP FEAT first line:\n%S", seqReadLine); */
15123 }
15124
15125 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
15126 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15127 done = ajTrue;
15128
15129 while(ok && ajStrPrefixC(seqReadLine, " "))
15130 {
15131 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff,
15132 seqReadLine);
15133 /* ajDebug("REFSEQP FEAT saved line:\n%S", seqReadLine); */
15134 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15135 }
15136 }
15137
15138 else if(ajStrPrefixC(seqReadLine, "REFERENCE"))
15139 {
15140 ajDebug("reference found\n");
15141 seqref = ajSeqrefNew();
15142 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
15143 ajStrTokenStep(seqHandle); /* 'REFERENCE' */
15144 ajStrTokenNextParse(seqHandle, &seqToken); /* number */
15145 ajStrToUint(seqToken, &refnum);
15146 ajSeqrefSetnumNumber(seqref, refnum);
15147 ajStrAssignClear(&seqToken2);
15148
15149 while (ajStrTokenNextParse(seqHandle, &seqToken))
15150 {
15151 if(ajStrMatchC(seqToken, "(bases"))
15152 continue;
15153
15154 if(ajStrMatchC(seqToken, "(residues"))
15155 continue;
15156
15157 if(ajStrMatchC(seqToken, "to"))
15158 continue;
15159
15160 if(!ajStrGetLen(seqToken2))
15161 ajStrAssignS(&seqToken2, seqToken);
15162
15163 if(ajStrSuffixC(seqToken, ")"))
15164 {
15165 ajStrTrimEndC(&seqToken, ")");
15166 ajStrAppendK(&seqToken2, '-');
15167 ajStrAppendS(&seqToken2, seqToken);
15168 }
15169 }
15170
15171 ajSeqrefSetPosition(seqref, seqToken2);
15172
15173 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15174 done = ajTrue;
15175
15176 ajSeqrefStandard(seqref);
15177 ajSeqAddRef(thys, seqref);
15178
15179 if(ok && ajStrPrefixC(seqReadLine, " AUTHORS"))
15180 {
15181 ajDebug("authors found\n");
15182 if(!seqref)
15183 seqref = ajSeqrefNew();
15184 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
15185 ajStrTokenStep(seqHandle); /* 'AUTHORS' */
15186 ajStrTokenNextParseC(seqHandle, "\n\r",
15187 &seqToken2); /* authors */
15188
15189 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15190
15191 while(ok && ajStrPrefixC(seqReadLine, " "))
15192 {
15193 ajStrAssignS(&seqToken, seqReadLine);
15194 ajStrTrimWhite(&seqToken);
15195 if(ajStrSuffixC(seqToken2, ".,") ||
15196 ajStrPrefixC(seqToken, "and "))
15197 ajStrAppendC(&seqToken2, " ");
15198 ajStrAppendS(&seqToken2, seqToken);
15199 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15200 }
15201
15202 /* append here - genbank splits author names across lines */
15203 ajSeqrefAppendAuthors(seqref, seqToken2);
15204 }
15205
15206 if(ok && ajStrPrefixC(seqReadLine, " TITLE"))
15207 {
15208 ajDebug("title found\n");
15209 if(!seqref)
15210 seqref = ajSeqrefNew();
15211
15212 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15213 ajStrTokenStep(seqHandle); /* 'TITLE' */
15214 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
15215
15216 ajSeqrefAppendTitle(seqref, seqToken);
15217
15218 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15219
15220 while(ok && ajStrPrefixC(seqReadLine, " "))
15221 {
15222 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15223 ajStrTokenNextParseC(seqHandle, "\n\r",
15224 &seqToken); /* title */
15225 ajSeqrefAppendTitle(seqref, seqToken);
15226
15227 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15228 }
15229 }
15230
15231 if(ok && ajStrPrefixC(seqReadLine, " JOURNAL"))
15232 {
15233 ajDebug("journal location found\n");
15234 if(!seqref)
15235 seqref = ajSeqrefNew();
15236
15237 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15238 ajStrTokenStep(seqHandle); /* 'JOURNAL' */
15239 ajStrTokenNextParseC(seqHandle, "\n\r",
15240 &seqToken); /* location */
15241
15242 ajSeqrefAppendLocation(seqref, seqToken);
15243
15244 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15245 }
15246
15247 while(ok && ajStrPrefixC(seqReadLine, " "))
15248 {
15249 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15250 }
15251
15252 seqref = NULL;
15253 }
15254
15255 else if(ok && ajStrPrefixC(seqReadLine, "COMMENT"))
15256 {
15257 ajDebug("comment found\n");
15258
15259 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15260 ajStrTokenStep(seqHandle); /* 'COMMENT' */
15261 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
15262
15263 if(ajStrGetLen(cmtstr))
15264 ajStrAppendC(&cmtstr, "\n");
15265 ajStrAppendS(&cmtstr, seqToken);
15266
15267 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15268 done = ajTrue;
15269
15270 while(ok && ajStrPrefixC(seqReadLine, " "))
15271 {
15272 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15273 ajStrTokenNextParseC(seqHandle, "\n\r",
15274 &seqToken); /* comment */
15275
15276 if(ajStrGetLen(seqToken))
15277 {
15278 if(ajStrGetLen(cmtstr))
15279 ajStrAppendC(&cmtstr, "\n");
15280 ajStrAppendS(&cmtstr, seqToken);
15281 }
15282 else
15283 {
15284 ajSeqAddCmt(thys, cmtstr);
15285 cmtstr = NULL;
15286 }
15287
15288 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15289 }
15290
15291 if(ajStrGetLen(cmtstr))
15292 ajSeqAddCmt(thys, cmtstr);
15293
15294 cmtstr = NULL;
15295 }
15296
15297 else if(ajStrPrefixC(seqReadLine, "KEYWORDS"))
15298 {
15299 ajDebug("keywords found\n");
15300 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15301 ajStrTokenStep(seqHandle); /* 'KEYWORDS' */
15302
15303 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15304 {
15305 liststr = ajStrNewS(seqToken);
15306 ajStrTrimWhite(&liststr);
15307 ajSeqAddKey(thys, liststr);
15308 liststr = NULL;
15309 }
15310
15311 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15312 done = ajTrue;
15313
15314 while(ok && ajStrPrefixC(seqReadLine, " "))
15315 {
15316 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15317
15318 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15319 {
15320 liststr = ajStrNewS(seqToken);
15321 ajStrTrimWhite(&liststr);
15322 ajSeqAddKey(thys, liststr);
15323 liststr = NULL;
15324 }
15325
15326 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15327 }
15328 }
15329
15330 else if(ajStrPrefixC(seqReadLine, " ORGANISM"))
15331 {
15332 ajDebug("organism found\n");
15333 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15334 ajStrTokenStep(seqHandle); /* 'ORGANISM' */
15335
15336 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15337 {
15338 ajStrTrimWhite(&seqToken);
15339 seqTaxSave(thys, seqToken, 0);
15340 }
15341
15342 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15343 done = ajTrue;
15344
15345 while(ok && ajStrPrefixC(seqReadLine, " "))
15346 {
15347 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15348
15349 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15350 {
15351 ajStrTrimWhite(&seqToken);
15352 seqTaxSave(thys, seqToken, 0);
15353 }
15354
15355 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15356 }
15357 }
15358
15359 if(!done)
15360 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15361 }
15362
15363 if(dofeat)
15364 {
15365 ajDebug("REFSEQP FEAT TabIn %x\n", seqin->Ftquery);
15366 ajFeattableDel(&thys->Fttable);
15367 thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
15368 /* ajFeattableTrace(thys->Fttable); */
15369 ajFeattabinClear(seqin->Ftquery);
15370 }
15371
15372 if(ajStrGetLen(seqin->Inseq))
15373 {
15374 /* we have a sequence to use */
15375 ajDebug("Got an Inseq sequence\n");
15376
15377 if(ajStrMatchC(qry->Method,"gcg"))
15378 while(ok && !ajStrPrefixC(seqReadLine,"ORIGIN"))
15379 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15380
15381 ajStrAssignS(&thys->Seq, seqin->Inseq);
15382
15383 if(seqin->Input->Text)
15384 {
15385 seqTextSeq(&thys->TextPtr, seqin->Inseq);
15386 ajFmtPrintAppS(&thys->TextPtr, "//\n");
15387 }
15388 }
15389 else
15390 {
15391 /* read the sequence and terminator */
15392 ajDebug("sequence start at '%S'\n", seqReadLine);
15393
15394 while(!ajStrPrefixC(seqReadLine,"ORIGIN") &&
15395 !ajStrPrefixC(seqReadLine,"BASE COUNT"))
15396 if(!ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr))
15397 break;
15398
15399 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15400 ajStrSetRes(&thys->Seq, seqlen+1);
15401
15402 while(ok && !ajStrPrefixC(seqReadLine, "//"))
15403 {
15404 if(!ajStrPrefixC(seqReadLine, "ORIGIN") &&
15405 !ajStrPrefixC(seqReadLine,"BASE COUNT"))
15406 seqAppend(&thys->Seq, seqReadLine);
15407
15408 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15409 }
15410 }
15411
15412 if(!ajStrMatchC(qry->Method,"gcg"))
15413 while(ok && !ajStrPrefixC(seqReadLine,"//"))
15414 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15415
15416 if(thys->Fttable)
15417 ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
15418
15419 ajFilebuffClear(buff, 0);
15420 ajDebug("++last line %d '%S'\n", buff->Pos, seqReadLine);
15421
15422 ajStrTokenReset(seqHandle);
15423 ajStrDelStatic(&seqToken);
15424 ajStrDelStatic(&seqToken2);
15425
15426 return ajTrue;
15427 }
15428
15429
15430
15431
15432 /* @funcstatic seqReadGff2 ****************************************************
15433 **
15434 ** Given data in a sequence structure, tries to read everything needed
15435 ** using GFF2 format.
15436 **
15437 ** GFF1 only offers the sequence, and the type, with the DNA, RNA and
15438 ** Protein and End-xxx headers. GFF2 allows other header lines to be defined,
15439 ** so EMBOSS can add more lines for accession number and description
15440 **
15441 ** GFF2 also defines Type and sequence-region headers, but they only
15442 ** provide information that is also in the DNA, RNA or Protein header
15443 ** and these are required for sequence storage so we ignore the alternatives.
15444 **
15445 ** @param [w] thys [AjPSeq] Sequence object
15446 ** @param [u] seqin [AjPSeqin] Sequence input object
15447 ** @return [AjBool] ajTrue on success
15448 **
15449 ** @release 6.4.0
15450 ** @@
15451 ******************************************************************************/
15452
seqReadGff2(AjPSeq thys,AjPSeqin seqin)15453 static AjBool seqReadGff2(AjPSeq thys, AjPSeqin seqin)
15454 {
15455 AjBool ok;
15456 AjBool isseq = ajFalse;
15457 AjPFilebuff buff;
15458 AjPFilebuff ftfile = NULL;
15459 AjBool dofeat = ajFalse;
15460 AjPStr typstr = NULL;
15461 AjPStr verstr = NULL; /* copy of version line */
15462 AjPStr outstr = NULL; /* generated Type line */
15463
15464 buff = seqin->Input->Filebuff;
15465
15466 if(!seqRegGffTyp)
15467 seqRegGffTyp = ajRegCompC("^##([DR]NA|Protein) +([^ \t\r\n]+)");
15468
15469 if(!seqFtFmtGff)
15470 ajStrAssignC(&seqFtFmtGff, "gff");
15471
15472 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15473 if(!ok)
15474 return ajFalse;
15475
15476 ajDebug("seqReadGff2 first line '%S'\n", seqReadLine);
15477
15478 if(!ajStrPrefixC(seqReadLine, "##gff-version "))
15479 {
15480 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15481
15482 return ajFalse;
15483 }
15484
15485 ajStrAssignS(&verstr, seqReadLine);
15486
15487 if(seqin->Input->Text)
15488 ajStrAssignS(&thys->TextPtr,seqReadLine);
15489
15490 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15491
15492 /* read the main header */
15493 while(ok && ajStrPrefixC(seqReadLine, "##"))
15494 {
15495 if(ajRegExec(seqRegGffTyp, seqReadLine))
15496 {
15497 isseq = ajTrue;
15498 ajRegSubI(seqRegGffTyp, 1, &typstr);
15499 ajRegSubI(seqRegGffTyp, 2, &thys->Name);
15500 ajFmtPrintS(&outstr, "##Type %S %S", typstr, thys->Name);
15501 }
15502 else if(ajStrPrefixC(seqReadLine, "##end-"))
15503 isseq = ajFalse;
15504 else if(isseq)
15505 seqAppend(&thys->Seq, seqReadLine);
15506
15507 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15508 }
15509
15510 if(!ajSeqGetLen(thys))
15511 {
15512 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15513 return ajFalse;
15514 }
15515
15516 /* do we want the features now? */
15517
15518 if(ok & seqinUfoLocal(seqin))
15519 {
15520 dofeat = ajTrue;
15521 ftfile = ajFilebuffNewNofile();
15522 ajFilebuffLoadS(ftfile, verstr);
15523 ajFilebuffLoadS(ftfile, outstr);
15524
15525 while(ok && !ajStrPrefixC(seqReadLine, "##"))
15526 {
15527 ajFilebuffLoadS(ftfile, seqReadLine);
15528 /* ajDebug("GFF FEAT saved line:\n%S", seqReadLine); */
15529 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15530 }
15531 }
15532
15533 if(dofeat)
15534 {
15535 ajFeattabinDel(&seqin->Ftquery);
15536 seqin->Ftquery = ajFeattabinNewSeqinSSF(seqin, seqFtFmtGff,
15537 thys->Name,
15538 ajStrGetPtr(seqin->Type),
15539 ftfile);
15540 ajDebug("GFF FEAT TabIn %x type: '%S'\n",
15541 seqin->Ftquery, seqin->Type);
15542 ftfile = NULL; /* now copied to seqin->Feattabin */
15543 ajFeattableDel(&seqin->Fttable);
15544 seqin->Fttable = ajFeattableNewRead(seqin->Ftquery);
15545 /* ajFeattableTrace(seqin->Fttable); */
15546 ajFeattableDel(&thys->Fttable);
15547 thys->Fttable = seqin->Fttable;
15548 seqin->Fttable = NULL;
15549 }
15550
15551 if(ajStrMatchC(typstr, "Protein"))
15552 ajSeqSetProt(thys);
15553 else if(ajSeqIsNuc(thys))
15554 ajSeqSetNuc(thys);
15555 else
15556 ajSeqSetProt(thys);
15557
15558 if(thys->Fttable)
15559 ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
15560
15561 ajFilebuffClear(buff, 0);
15562
15563 ajStrDel(&typstr);
15564 ajStrDel(&verstr);
15565 ajStrDel(&outstr);
15566
15567 return ajTrue;
15568 }
15569
15570
15571
15572
15573 /* @funcstatic seqReadGff3 ****************************************************
15574 **
15575 ** Given data in a sequence structure, tries to read everything needed
15576 ** using GFF3 format.
15577 **
15578 ** GFF3 is far stricter than GFF2 but does include a sequence in FASTA format
15579 **
15580 ** GFF also defines Type and sequence-region headers, but they only
15581 ** provide information that is also in the DNA, RNA or Protein header
15582 ** and these are required for sequence storage so we ignore the alternatives.
15583 **
15584 ** @param [w] thys [AjPSeq] Sequence object
15585 ** @param [u] seqin [AjPSeqin] Sequence input object
15586 ** @return [AjBool] ajTrue on success
15587 **
15588 ** @release 6.0.0
15589 ** @@
15590 ******************************************************************************/
15591
seqReadGff3(AjPSeq thys,AjPSeqin seqin)15592 static AjBool seqReadGff3(AjPSeq thys, AjPSeqin seqin)
15593 {
15594 AjBool ok;
15595 AjPFilebuff buff;
15596 AjPFilebuff ftfile = NULL;
15597 AjBool dofeat = ajFalse;
15598 AjPStr verstr = NULL; /* copy of version line */
15599 AjPStr outstr = NULL; /* generated Type line */
15600 AjPStr typstr = NULL;
15601 AjPStr rest = NULL;
15602 AjBool wantseq = ajFalse;
15603
15604 buff = seqin->Input->Filebuff;
15605
15606 if(!seqFtFmtGff)
15607 ajStrAssignC(&seqFtFmtGff, "gff3");
15608
15609 if(!seqRegGff3Typ)
15610 seqRegGff3Typ = ajRegCompC("^#!Type (.*)");
15611
15612 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15613 if(!ok)
15614 return ajFalse;
15615
15616 ajDebug("seqReadGff3 first line '%S'\n", seqReadLine);
15617
15618 ajStrRemoveWhiteExcess(&seqReadLine);
15619
15620 if(!ajStrMatchC(seqReadLine, "##gff-version 3"))
15621 {
15622 ajDebug("bad gff3 version line '%S'\n", seqReadLine);
15623 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15624
15625 return ajFalse;
15626 }
15627
15628 ajStrAssignS(&verstr, seqReadLine);
15629
15630 if(seqin->Input->Text)
15631 ajStrAssignS(&thys->TextPtr,seqReadLine);
15632
15633 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15634
15635 while(ok && ajStrPrefixC(seqReadLine, "#"))
15636 {
15637 if(ajStrPrefixC(seqReadLine, "##sequence-region"))
15638 {
15639 ajStrTokenAssignC(&seqHandle, seqReadLine, " \t");
15640 ajStrTokenStep(seqHandle);
15641 ajStrTokenNextParse(seqHandle, &thys->Name);
15642 ajStrTokenReset(seqHandle);
15643 }
15644 else if(ajStrPrefixC(seqReadLine, "##feature-ontology"))
15645 {
15646 }
15647 else if(ajStrPrefixC(seqReadLine, "##attribute-ontology"))
15648 {
15649 }
15650 else if(ajStrPrefixC(seqReadLine, "##source-ontology"))
15651 {
15652 }
15653 else if(ajStrPrefixC(seqReadLine, "###"))
15654 {
15655 }
15656 else if(ajStrPrefixC(seqReadLine, "##FASTA"))
15657 {
15658 break;
15659 }
15660 else if(ajStrPrefixC(seqReadLine, "##"))
15661 {
15662 ajDebug("GFF3: Unrecognized header directive '%S'\n",
15663 seqReadLine);
15664 }
15665
15666 if(ajRegExec(seqRegGff3Typ, seqReadLine))
15667 {
15668 ajRegSubI(seqRegGff3Typ, 1, &typstr);
15669 ajFmtPrintS(&outstr, "#!Type %S", typstr);
15670 }
15671
15672 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15673 }
15674
15675 /* do we want the features now? */
15676
15677 if(ok & seqinUfoLocal(seqin))
15678 {
15679 dofeat = ajTrue;
15680
15681 ftfile = ajFilebuffNewNofile();
15682 ajFilebuffLoadS(ftfile, verstr);
15683 ajFilebuffLoadS(ftfile, outstr);
15684 }
15685
15686 while(ok)
15687 {
15688 if(ajStrPrefixC(seqReadLine, "##"))
15689 {
15690 if(ajStrPrefixCaseC(seqReadLine, "##FASTA"))
15691 {
15692 break;
15693 }
15694 else if(ajStrPrefixC(seqReadLine, "##gff-version "))
15695 {
15696 return ajFalse;break;
15697 }
15698 }
15699
15700 if(dofeat)
15701 ajFilebuffLoadS(ftfile, seqReadLine);
15702 else if(!ajStrGetLen(thys->Name))
15703 {
15704 if(ajStrExtractFirst(seqReadLine, &rest, &seqToken))
15705 ajStrAssignS(&thys->Name, seqToken);
15706 }
15707
15708 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15709 }
15710
15711 if(!ajStrPrefixCaseC(seqReadLine, "##FASTA")) /* no sequence at end */
15712 {
15713 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15714
15715 ajDebug("No GFF3 ##FASTA line\n");
15716 return ajFalse;
15717 }
15718
15719 if(dofeat)
15720 {
15721 ajFeattabinDel(&seqin->Ftquery);
15722 seqin->Ftquery = ajFeattabinNewSeqinSSF(seqin, seqFtFmtGff,
15723 thys->Name,
15724 ajStrGetPtr(seqin->Type),
15725 ftfile);
15726 ajDebug("GFF3 FEAT TabIn %x\n", seqin->Ftquery);
15727 ftfile = NULL;
15728 ajFeattableDel(&seqin->Fttable);
15729 ajFeattableDel(&thys->Fttable);
15730 thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
15731 if(thys->Fttable)
15732 ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
15733 if(ajFeattableIsCircular(thys->Fttable))
15734 ajSeqSetCircular(thys);
15735 }
15736
15737 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15738 wantseq = ajFalse;
15739
15740 while(ok && !ajStrPrefixC(seqReadLine, "##"))
15741 {
15742 while(ok && ajStrPrefixC(seqReadLine, ">"))
15743 {
15744 ajStrCutStart(&seqReadLine, 1);
15745 if(wantseq)
15746 {
15747 wantseq = ajFalse;
15748 }
15749 else
15750 {
15751 ajStrExtractFirst(seqReadLine, &rest, &seqToken);
15752
15753 if(dofeat)
15754 {
15755 if(ajStrMatchS(seqToken, ajFeattableGetName(thys->Fttable)))
15756 {
15757 wantseq = ajTrue;
15758 ajStrAssignS(&thys->Name, seqToken);
15759 }
15760 }
15761 else
15762 {
15763 if(ajStrMatchS(seqToken, thys->Name))
15764 {
15765 wantseq = ajTrue;
15766 }
15767 }
15768
15769 if(wantseq)
15770 {
15771 ajStrRemoveWhiteExcess(&rest);
15772 ajStrAssignS(&thys->Desc, rest);
15773 }
15774 }
15775 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15776 }
15777
15778 if(wantseq)
15779 seqAppend(&thys->Seq, seqReadLine);
15780
15781 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15782 }
15783
15784 if(!ajSeqGetLen(thys))
15785 {
15786 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15787
15788 ajDebug("No sequence data\n");
15789 return ajFalse;
15790 }
15791
15792 if(ajStrMatchC(typstr, "Protein"))
15793 ajSeqSetProt(thys);
15794 else if(ajSeqIsNuc(thys))
15795 ajSeqSetNuc(thys);
15796 else
15797 ajSeqSetProt(thys);
15798
15799 ajFilebuffClear(buff, 0);
15800
15801 ajStrDel(&typstr);
15802 ajStrDel(&verstr);
15803 ajStrDel(&outstr);
15804 ajStrDelStatic(&seqToken);
15805 ajStrDel(&rest);
15806
15807 return ajTrue;
15808 }
15809
15810
15811
15812
15813 /* @funcstatic seqReadAbi *****************************************************
15814 **
15815 ** Given data in a sequence structure, tries to read everything needed
15816 ** using ABI format.
15817 **
15818 ** @param [w] thys [AjPSeq] Sequence object
15819 ** @param [u] seqin [AjPSeqin] Sequence input object
15820 ** @return [AjBool] ajTrue on success
15821 **
15822 ** @release 1.8.0
15823 ** @@
15824 ******************************************************************************/
15825
seqReadAbi(AjPSeq thys,AjPSeqin seqin)15826 static AjBool seqReadAbi(AjPSeq thys, AjPSeqin seqin)
15827 {
15828 AjPFilebuff buff;
15829 AjBool ok = ajFalse;
15830 ajlong baseO = 0L;
15831 ajlong pconO = 0L;
15832 ajlong numBases = 0L;
15833 AjPStr sample = NULL;
15834 AjPStr smpl = NULL;
15835 AjPFile fp;
15836 ajint filestat;
15837
15838 buff = seqin->Input->Filebuff;
15839 fp = ajFilebuffGetFile(buff);
15840
15841 ajDebug("seqReadAbi file %F\n", fp);
15842
15843 /* ajFilebuffTraceFull(buff, 10, 10); */
15844
15845 if(ajFilebuffIsEnded(buff))
15846 return ajFalse;
15847
15848 if(!ajSeqABITest(fp))
15849 {
15850 ajDebug("seqReadAbi ajSeqABITest failed on %F\n", fp);
15851 ajFilebuffResetPos(buff);
15852
15853 return ajFalse;
15854 }
15855
15856 if(seqin->Input->Text)
15857 ajWarn("Failed to read text from binary ABI file %F", fp);
15858
15859 filestat = ajFileSeek(fp,0L,0);
15860 ajDebug("filestat %d\n", filestat);
15861
15862 numBases = ajSeqABIGetNBase(fp);
15863
15864 ok = ajFalse;
15865
15866 /* Find BASE tag & get offset */
15867 baseO = ajSeqABIGetBaseOffset(fp);
15868 /* Read in sequence */
15869 if(baseO)
15870 ok = ajSeqABIReadSeq(fp,baseO,numBases,&thys->Seq);
15871
15872 if(!ok)
15873 {
15874 ajFileSeek(fp,filestat,0);
15875 ajFilebuffResetPos(buff);
15876
15877 return ajFalse;
15878 }
15879
15880 ok = ajFalse;
15881
15882 pconO = ajSeqABIGetConfidOffset(fp);
15883 if(numBases > (ajlong) thys->Qualsize)
15884 {
15885 AJCRESIZE(thys->Accuracy, (size_t) numBases);
15886 thys->Qualsize = (ajuint) numBases; /* Possibly lossy */
15887 }
15888 if(pconO)
15889 ok = ajSeqABIReadConfid(fp, pconO, numBases, thys->Accuracy);
15890
15891 sample = ajStrNew();
15892 ajSeqABISampleName(fp, &sample);
15893
15894 /* replace dots in the sample name with underscore */
15895 if(!seqRegAbiDots)
15896 seqRegAbiDots = ajRegCompC("^(.*)[.](.*)$");
15897
15898 smpl = ajStrNew();
15899
15900 while(ajRegExec(seqRegAbiDots,sample))
15901 {
15902 ajStrSetClear(&sample);
15903 ajRegSubI(seqRegAbiDots,1,&smpl);
15904 ajStrAppendC(&smpl,"_");
15905 ajStrAppendS(&sample,smpl);
15906 ajRegSubI(seqRegAbiDots,2,&smpl);
15907 ajStrAppendS(&sample,smpl);
15908 }
15909
15910 ajStrAssignS(&thys->Name,sample);
15911 ajFilenameTrimAll(&thys->Name);
15912
15913 ajDebug("seqReadAbi name '%S' sample '%S'\n", thys->Name, sample);
15914
15915 ajSeqSetNuc(thys);
15916
15917 ajFilebuffClear(buff, -1);
15918 buff->File->End=ajTrue;
15919
15920 ajStrDel(&smpl);
15921 ajStrDel(&sample);
15922
15923 return ajTrue;
15924 }
15925
15926
15927
15928
15929 /* @funcstatic seqReadEnsembl *************************************************
15930 **
15931 ** Given data in a sequence structure, tries to read everything needed
15932 ** using Ensembl SQL access.
15933 **
15934 ** @param [w] thys [AjPSeq] Sequence object
15935 ** @param [u] seqin [AjPSeqin] Sequence input object
15936 ** @return [AjBool] ajTrue on success
15937 **
15938 ** @release 6.3.0
15939 ** @@
15940 ******************************************************************************/
15941
seqReadEnsembl(AjPSeq thys,AjPSeqin seqin)15942 static AjBool seqReadEnsembl(AjPSeq thys, AjPSeqin seqin)
15943 {
15944 AjBool debug = AJFALSE;
15945
15946 AjPSeqAccess seqaccess = NULL;
15947
15948 debug = ajDebugTest("seqReadEnsembl");
15949
15950 if(debug)
15951 ajDebug("seqReadEnsembl\n"
15952 " thys %p\n"
15953 " seqin %p\n",
15954 thys,
15955 seqin);
15956
15957 /*
15958 ** Use the SeqData member of the AJAX Sequence Input structure
15959 ** to pass the AJAX Sequence object between the AJAX Sequence Reading
15960 ** (seqReadEnsembl) and AJAX Sequence Database (seqAccessEnsembl) modules.
15961 */
15962
15963 seqin->SeqData = (void*) thys;
15964
15965 seqaccess = seqin->Input->Query->Access;
15966
15967 if(((*seqaccess->Access)(seqin)) == ajFalse)
15968 return ajFalse;
15969
15970 return ajTrue;
15971 }
15972
15973
15974
15975
15976 /* @funcstatic seqPrefixGenbank ***********************************************
15977 **
15978 ** Returns an enumerated prefix for a record in genbank format
15979 **
15980 ** @param [r] str [const AjPStr] Input record
15981 **
15982 ** @return [SeqEPrefixGenbank] Enumerated record prefix
15983 ** @@
15984 ******************************************************************************/
15985
seqPrefixGenbank(const AjPStr str)15986 static SeqEPrefixGenbank seqPrefixGenbank(const AjPStr str)
15987 {
15988 SeqEPrefixGenbank ipref = GB_UNK;
15989 const char* cp = MAJSTRGETPTR(str);
15990 const char* cq = (cp+1);
15991
15992 switch (*cp)
15993 {
15994 case 'A':
15995 if(*cq == 'C' && !strncmp(cp, "ACCESSION",9)) ipref = GB_AC;
15996 break;
15997 case 'B':
15998 if(*cq == 'A' && !strncmp(cp, "BASE COUNT",10)) ipref = GB_BASE;
15999 break;
16000 case 'C':
16001 if(*cq == 'O' && !strncmp(cp, "COMMENT",7)) ipref = GB_CC;
16002 break;
16003 case 'D':
16004 if(*cq == 'E' && !strncmp(cp, "DEFINITION",10)) ipref = GB_DEF;
16005 break;
16006 case 'F':
16007 if(*cq == 'E' && !strncmp(cp, "FEATURES",8)) ipref = GB_FEAT;
16008 break;
16009 case 'K':
16010 if(*cq == 'E' && !strncmp(cp, "KEYWORDS",8)) ipref = GB_KEY;
16011 break;
16012 case 'L':
16013 if(*cq == 'O' && !strncmp(cp, "LOCUS",5)) ipref = GB_ID;
16014 break;
16015 case 'O':
16016 if(*cq == 'R' && !strncmp(cp, "ORIGIN",6)) ipref = GB_ORI;
16017 break;
16018 case 'R':
16019 if(*cq == 'E' && !strncmp(cp, "REFERENCE",9)) ipref = GB_REF;
16020 break;
16021 case 'S':
16022 if(*cq == 'E' && !strncmp(cp, "SEQVERSION",10)) ipref = GB_VER;
16023 if(*cq == 'O' && !strncmp(cp, "SOURCE",6)) ipref = GB_SRC;
16024 break;
16025 case 'W':
16026 if(*cq == 'P' && !strncmp(cp, "WPCOMMENT",9)) ipref = GB_WP;
16027 break;
16028 case '/':
16029 if(*cq == '/' && !strncmp(cp, "//",2)) ipref = GB_END;
16030 break;
16031 case ' ':
16032 if(*cq == ' ' && !strncmp(cp, " ",2)) ipref = GB_MORE;
16033 break;
16034 default:
16035 ipref = GB_UNK;
16036 break;
16037 }
16038
16039 return ipref;
16040 }
16041
16042
16043
16044
16045 /* @funcstatic seqPrefixGenbankMore *******************************************
16046 **
16047 ** Returns an enumerated prefix for a subrecord in genbank format
16048 **
16049 ** @param [r] str [const AjPStr] Input record
16050 **
16051 ** @return [SeqEPrefixGenbankMore] Enumerated record prefix
16052 ** @@
16053 ******************************************************************************/
16054
seqPrefixGenbankMore(const AjPStr str)16055 static SeqEPrefixGenbankMore seqPrefixGenbankMore(const AjPStr str)
16056 {
16057 SeqEPrefixGenbankMore imore = GB_MORE_UNK;
16058 const char* cp = MAJSTRGETPTR(str);
16059 const char* cq = (cp+1);
16060 const char* cr = (cp+1);
16061
16062 if(*cp != ' ' || *cq != ' ')
16063 return GB_MORE_STD;
16064
16065 switch (*cr)
16066 {
16067 case 'A':
16068 if(!strncmp(cr, "AUTHORS",7)) imore = GB_MORE_AUT;
16069 break;
16070 case 'B':
16071 break;
16072 case 'C':
16073 break;
16074 case 'D':
16075 break;
16076 case 'F':
16077 break;
16078 case 'J':
16079 if(!strncmp(cr, "JOURNAL",7)) imore = GB_MORE_JNL;
16080 break;
16081 case 'O':
16082 if(!strncmp(cr, "ORGANISM",8)) imore = GB_MORE_ORG;
16083 break;
16084 case 'T':
16085 break;
16086 if(!strncmp(cr, "TITLE",5)) imore = GB_MORE_TIT;
16087 case 'W':
16088 break;
16089 case '/':
16090 break;
16091 case ' ':
16092 if(!strncmp(cr, " ",8)) imore = GB_MORE_MORE;
16093 break;
16094 default:
16095 imore = GB_MORE_UNK;
16096 break;
16097 }
16098
16099 return imore;
16100 }
16101
16102
16103
16104
16105 /* @funcstatic seqPrefixSwiss *************************************************
16106 **
16107 ** Returns an enumerated prefix for a record in swissprot format
16108 **
16109 ** @param [r] str [const AjPStr] Input record
16110 **
16111 ** @return [SeqEPrefixSwiss] Enumerated record prefix
16112 ** @@
16113 ******************************************************************************/
16114
seqPrefixSwiss(const AjPStr str)16115 static SeqEPrefixSwiss seqPrefixSwiss(const AjPStr str)
16116 {
16117 SeqEPrefixSwiss ipref = SWISS_UNK;
16118 const char* cp = MAJSTRGETPTR(str);
16119 const char* cq = (cp+1);
16120
16121 switch (*cp)
16122 {
16123 case 'A':
16124 switch(*cq)
16125 {
16126 case 'C':
16127 ipref = SWISS_AC;
16128 break;
16129 case 'H':
16130 ipref = SWISS_FH; /* Align header ignored with FH */
16131 break;
16132 case 'S':
16133 ipref = SWISS_AS;
16134 break;
16135 case 'V':
16136 ipref = SWISS_AV; /* staden experiment */
16137 break;
16138 }
16139 break;
16140 case 'C':
16141 switch(*cq)
16142 {
16143 case 'C':
16144 ipref = SWISS_CC;
16145 break;
16146 case 'O':
16147 ipref = SWISS_CO;
16148 break;
16149 }
16150 break;
16151 case 'D':
16152 switch(*cq)
16153 {
16154 case 'E':
16155 ipref = SWISS_DE;
16156 break;
16157 case 'R':
16158 ipref = SWISS_DR;
16159 break;
16160 case 'T':
16161 ipref = SWISS_DT;
16162 break;
16163 }
16164 break;
16165 case 'E':
16166 switch(*cq)
16167 {
16168 case 'N':
16169 case 'X':
16170 ipref = SWISS_EX;
16171 break;
16172 }
16173 break;
16174 case 'F':
16175 switch(*cq)
16176 {
16177 case 'H':
16178 ipref = SWISS_FH;
16179 break;
16180 case 'T':
16181 ipref = SWISS_FT;
16182 break;
16183 }
16184 break;
16185 case 'G':
16186 if(*cq == 'N') ipref = SWISS_GN;
16187 break;
16188 case 'I':
16189 switch(*cq)
16190 {
16191 case 'D':
16192 ipref = SWISS_ID;
16193 break;
16194 case 'V':
16195 ipref = SWISS_SV; /* EMBLCDS Sv equivalent */
16196 break;
16197 }
16198 break;
16199 case 'K':
16200 if(*cq == 'W') ipref = SWISS_KW;
16201 break;
16202 case 'O':
16203 switch (*cq)
16204 {
16205 case 'C':
16206 ipref = SWISS_OC;
16207 break;
16208 case 'G':
16209 ipref = SWISS_OG;
16210 break;
16211 case 'H':
16212 ipref = SWISS_OH;
16213 break;
16214 case 'S':
16215 ipref = SWISS_OS;
16216 break;
16217 case 'X':
16218 ipref = SWISS_OX;
16219 break;
16220 }
16221 break;
16222 case 'P':
16223 switch(*cq)
16224 {
16225 case 'A':
16226 ipref = SWISS_AC; /* PA records in EMBLCDS */
16227 break;
16228 case 'E':
16229 ipref = SWISS_PE;
16230 break;
16231 }
16232 break;
16233 case 'R':
16234 switch(*cq)
16235 {
16236 case 'A':
16237 ipref = SWISS_RA;
16238 break;
16239 case 'C':
16240 ipref = SWISS_RC;
16241 break;
16242 case 'G':
16243 ipref = SWISS_RG;
16244 break;
16245 case 'L':
16246 ipref = SWISS_RL;
16247 break;
16248 case 'N':
16249 ipref = SWISS_RN;
16250 break;
16251 case 'P':
16252 ipref = SWISS_RP;
16253 break;
16254 case 'T':
16255 ipref = SWISS_RT;
16256 break;
16257 case 'X':
16258 ipref = SWISS_RX;
16259 break;
16260 }
16261 break;
16262 case 'S':
16263 switch(*cq)
16264 {
16265 case 'Q':
16266 ipref = SWISS_SQ;
16267 break;
16268 case 'V':
16269 ipref = SWISS_SV;
16270 break;
16271 }
16272 break;
16273 case 'T':
16274 if(*cq == 'N') ipref = SWISS_EX;
16275 break;
16276 case 'W':
16277 if(*cq == 'P') ipref = SWISS_WP;
16278 break;
16279 case 'X':
16280 if(*cq == 'X') ipref = SWISS_XX;
16281 break;
16282 case '/':
16283 if(*cq == '/') ipref = SWISS_END;
16284 break;
16285 case ' ':
16286 if(*cq == ' ') ipref = SWISS_MORE;
16287 break;
16288 default:
16289 ipref = SWISS_UNK;
16290 break;
16291 }
16292
16293 return ipref;
16294 }
16295
16296
16297
16298
16299 /* @funcstatic seqDesSwiss ****************************************************
16300 **
16301 ** Returns an enumerated code for a description record token
16302 **
16303 ** @param [r] str [const AjPStr] Input record
16304 **
16305 ** @return [SeqEDesSwiss] Enumerated record prefix
16306 ** @@
16307 ******************************************************************************/
16308
seqDesSwiss(const AjPStr str)16309 static SeqEDesSwiss seqDesSwiss(const AjPStr str)
16310 {
16311 SeqEDesSwiss ides = SWISS_DES_UNK;
16312 const char* cp = MAJSTRGETPTR(str);
16313
16314 switch (*cp)
16315 {
16316 case 'A':
16317 if(!strcmp(cp, "AltName:")) ides = SWISS_DES_ALT;
16318 break;
16319 case 'C':
16320 if(!strcmp(cp, "Contains:")) ides = SWISS_DES_CONT;
16321 break;
16322 case 'F':
16323 if(!strcmp(cp, "Flags:")) ides = SWISS_DES_FLG;
16324 break;
16325 case 'I':
16326 if(!strcmp(cp, "Includes:")) ides = SWISS_DES_INC;
16327 break;
16328 case 'R':
16329 if(!strcmp(cp, "RecName:")) ides = SWISS_DES_REC;
16330 break;
16331 case 'S':
16332 if(!strcmp(cp, "SubName:")) ides = SWISS_DES_SUB;
16333 break;
16334 default:
16335 ides = SWISS_DES_UNK;
16336 break;
16337 }
16338
16339 return ides;
16340 }
16341
16342
16343
16344
16345 /* @funcstatic seqDessubSwiss *************************************************
16346 **
16347 ** Returns an enumerated subcode for a description record token
16348 **
16349 ** @param [u] Pstr [AjPStr*] Input record
16350 **
16351 ** @return [SeqESubSwiss] Enumerated record prefix
16352 ** @@
16353 ******************************************************************************/
16354
seqDessubSwiss(AjPStr * Pstr)16355 static SeqESubSwiss seqDessubSwiss(AjPStr *Pstr)
16356 {
16357 SeqESubSwiss isub = SWISS_SUB_UNK;
16358 const char* cp = MAJSTRGETPTR(*Pstr);
16359
16360 switch (*cp)
16361 {
16362 case 'A':
16363 if(!strncmp(cp, "Allergen=", 9))
16364 {
16365 isub = SWISS_SUB_ALLER;
16366 ajStrCutStart(Pstr, 9);
16367 }
16368 break;
16369 case 'B':
16370 if(!strncmp(cp, "Biotech=", 8))
16371 {
16372 isub = SWISS_SUB_BIOTECH;
16373 ajStrCutStart(Pstr, 8);
16374 }
16375 break;
16376 case 'C':
16377 if(!strncmp(cp, "CD_antigen=", 11))
16378 {
16379 isub = SWISS_SUB_CDA;
16380 ajStrCutStart(Pstr, 11);
16381 }
16382 break;
16383 case 'E':
16384 if(!strncmp(cp, "EC=", 3))
16385 {
16386 isub = SWISS_SUB_EC;
16387 ajStrCutStart(Pstr, 3);
16388 }
16389 break;
16390 case 'F':
16391 if(!strncmp(cp, "Full=", 5))
16392 {
16393 isub = SWISS_SUB_FULL;
16394 ajStrCutStart(Pstr, 5);
16395 }
16396 break;
16397 case 'I':
16398 if(!strncmp(cp, "INN=", 4))
16399 {
16400 isub = SWISS_SUB_INN;
16401 ajStrCutStart(Pstr, 4);
16402 }
16403 break;
16404 case 'S':
16405 if(!strncmp(cp, "Short=", 6))
16406 {
16407 isub = SWISS_SUB_SHORT;
16408 ajStrCutStart(Pstr, 6);
16409 }
16410 break;
16411 default:
16412 isub = SWISS_DES_UNK;
16413 break;
16414 }
16415
16416 return isub;
16417 }
16418
16419
16420
16421
16422 /* @func ajSeqPrintInFormat ***************************************************
16423 **
16424 ** Reports the internal data structures
16425 **
16426 ** @param [u] outf [AjPFile] Output file
16427 ** @param [r] full [AjBool] Full report (usually ajFalse)
16428 ** @return [void]
16429 **
16430 ** @release 1.0.0
16431 ** @@
16432 ******************************************************************************/
16433
ajSeqPrintInFormat(AjPFile outf,AjBool full)16434 void ajSeqPrintInFormat(AjPFile outf, AjBool full)
16435 {
16436 ajuint i = 0;
16437
16438 ajFmtPrintF(outf, "\n");
16439 ajFmtPrintF(outf, "# Sequence input formats\n");
16440 ajFmtPrintF(outf, "# Name Format name (or alias)\n");
16441 ajFmtPrintF(outf, "# Alias Alias name\n");
16442 ajFmtPrintF(outf, "# Try Test for unknown input files\n");
16443 ajFmtPrintF(outf, "# Nuc Can read nucleotide input\n");
16444 ajFmtPrintF(outf, "# Pro Can read protein input\n");
16445 ajFmtPrintF(outf, "# Feat Can read feature annotation\n");
16446 ajFmtPrintF(outf, "# Gap Can read gap characters\n");
16447 ajFmtPrintF(outf, "# Mset Can read seqsetall (multiple seqsets)\n");
16448 ajFmtPrintF(outf, "# Name Alias Try Nuc Pro Feat Gap MSet "
16449 "Description");
16450 ajFmtPrintF(outf, "\n");
16451 ajFmtPrintF(outf, "InFormat {\n");
16452
16453 for(i=0; seqinFormatDef[i].Name; i++)
16454 if(full || !seqinFormatDef[i].Alias)
16455 ajFmtPrintF(outf,
16456 " %-12s %5B %3B %3B %3B %3B %3B %3B \"%s\"\n",
16457 seqinFormatDef[i].Name,
16458 seqinFormatDef[i].Alias,
16459 seqinFormatDef[i].Try,
16460 seqinFormatDef[i].Nucleotide,
16461 seqinFormatDef[i].Protein,
16462 seqinFormatDef[i].Feature,
16463 seqinFormatDef[i].Gap,
16464 seqinFormatDef[i].Multiset,
16465 seqinFormatDef[i].Desc);
16466
16467 ajFmtPrintF(outf, "}\n\n");
16468
16469 return;
16470 }
16471
16472
16473
16474
16475 /* @func ajSeqPrintbookInFormat ***********************************************
16476 **
16477 ** Reports the internal data structures as a Docbook table
16478 **
16479 ** @param [u] outf [AjPFile] Output file
16480 ** @return [void]
16481 **
16482 ** @release 6.2.0
16483 ** @@
16484 ******************************************************************************/
16485
ajSeqPrintbookInFormat(AjPFile outf)16486 void ajSeqPrintbookInFormat(AjPFile outf)
16487 {
16488 ajuint i = 0;
16489 ajuint j = 0;
16490 AjPStr namestr = NULL;
16491 AjPList fmtlist;
16492 AjPStr* names;
16493
16494 fmtlist = ajListstrNew();
16495
16496 ajFmtPrintF(outf, "<para>The supported sequence formats are summarised "
16497 "in the table below. "
16498 "The columns are as follows: "
16499 "<emphasis>Input format</emphasis> (format name), "
16500 "<emphasis>Output format</emphasis> (format name), "
16501 "<emphasis>Sngl</emphasis> "
16502 "(indicates whether each sequence is written to a new file. "
16503 "This behaviour is the default and can be set by the "
16504 "<option>-ossingle</option> command line qualifier. "
16505 "<emphasis>Save</emphasis> (indicates that sequence data is "
16506 "stored internally and written when the output is closed. "
16507 "This is needed for 'interleaved' formats such as Phylip "
16508 "and MSF), <emphasis>Try</emphasis> (indicates whether the "
16509 "format can be detected automatically on input), "
16510 "<emphasis>Nuc</emphasis> (\"true\" indicates nucleotide "
16511 "sequence data may be represented), <emphasis>Pro</emphasis> "
16512 "(\"true\" indicates protein sequence data may be represented, "
16513 "<emphasis>Feat</emphasis> (whether the format includes "
16514 "feature annotation data. "
16515 "EMBOSS can also read feature data from a separate "
16516 "feature file). "
16517 "<emphasis>Gap</emphasis> (whether the format supports "
16518 "sequence data with gap characters, for example the results "
16519 "of an alignment), "
16520 "<emphasis>Mset</emphasis> (\"true\" indicates that more "
16521 "than one set of sequences can be stored in a single file. "
16522 "This is used by, for example, phylogenetic analysis "
16523 "applications to store many versions of a multiple alignment "
16524 "for statistical analysis) and "
16525 "<emphasis>Description</emphasis> (short description of "
16526 "the format).</para>\n\n");
16527
16528 ajFmtPrintF(outf, "<table frame=\"box\" rules=\"cols\">\n");
16529 ajFmtPrintF(outf, " <caption>Input sequence formats</caption>\n");
16530 ajFmtPrintF(outf, " <thead>\n");
16531 ajFmtPrintF(outf, " <tr align=\"center\">\n");
16532 ajFmtPrintF(outf, " <th>Input Format</th>\n");
16533 ajFmtPrintF(outf, " <th>Try</th>\n");
16534 ajFmtPrintF(outf, " <th>Nuc</th>\n");
16535 ajFmtPrintF(outf, " <th>Pro</th>\n");
16536 ajFmtPrintF(outf, " <th>Feat</th>\n");
16537 ajFmtPrintF(outf, " <th>Gap</th>\n");
16538 ajFmtPrintF(outf, " <th>Mset</th>\n");
16539 ajFmtPrintF(outf, " <th>Description</th>\n");
16540 ajFmtPrintF(outf, " </tr>\n");
16541 ajFmtPrintF(outf, " </thead>\n");
16542 ajFmtPrintF(outf, " <tbody>\n");
16543
16544 for(i=1; seqinFormatDef[i].Name; i++)
16545 {
16546 if(!seqinFormatDef[i].Alias)
16547 {
16548 namestr = ajStrNewC(seqinFormatDef[i].Name);
16549 ajListPush(fmtlist, namestr);
16550 namestr = NULL;
16551 }
16552 }
16553
16554 ajListSort(fmtlist, &ajStrVcmp);
16555 ajListstrToarray(fmtlist, &names);
16556
16557 for(i=0; names[i]; i++)
16558 {
16559 for(j=0; seqinFormatDef[j].Name; j++)
16560 {
16561 if(ajStrMatchC(names[i],seqinFormatDef[j].Name))
16562 {
16563 ajFmtPrintF(outf, " <tr>\n");
16564 ajFmtPrintF(outf, " <td>%s</td>\n",
16565 seqinFormatDef[j].Name);
16566 ajFmtPrintF(outf, " <td>%B</td>\n",
16567 seqinFormatDef[j].Try);
16568 ajFmtPrintF(outf, " <td>%B</td>\n",
16569 seqinFormatDef[j].Nucleotide);
16570 ajFmtPrintF(outf, " <td>%B</td>\n",
16571 seqinFormatDef[j].Protein);
16572 ajFmtPrintF(outf, " <td>%B</td>\n",
16573 seqinFormatDef[j].Feature);
16574 ajFmtPrintF(outf, " <td>%B</td>\n",
16575 seqinFormatDef[j].Gap);
16576 ajFmtPrintF(outf, " <td>%B</td>\n",
16577 seqinFormatDef[j].Multiset);
16578 ajFmtPrintF(outf, " <td>%s</td>\n",
16579 seqinFormatDef[j].Desc);
16580 ajFmtPrintF(outf, " </tr>\n");
16581 }
16582 }
16583 }
16584
16585
16586 ajFmtPrintF(outf, " </tbody>\n");
16587 ajFmtPrintF(outf, "</table>\n");
16588 ajStrDel(&namestr);
16589
16590 names = NULL;
16591 ajListstrFreeData(&fmtlist);
16592
16593 return;
16594 }
16595
16596
16597
16598
16599 /* @func ajSeqPrinthtmlInFormat ***********************************************
16600 **
16601 ** Reports the internal data structures as an HTML table
16602 **
16603 ** @param [u] outf [AjPFile] Output file
16604 ** @return [void]
16605 **
16606 ** @release 6.2.0
16607 ** @@
16608 ******************************************************************************/
16609
ajSeqPrinthtmlInFormat(AjPFile outf)16610 void ajSeqPrinthtmlInFormat(AjPFile outf)
16611 {
16612 ajuint i = 0;
16613 ajuint j = 0;
16614
16615 AjPStr namestr = NULL;
16616
16617 ajFmtPrintF(outf, "<table border=3>");
16618 ajFmtPrintF(outf, "<tr><th>Input Format</th><th>Auto</th>\n");
16619 ajFmtPrintF(outf, "<th>Nuc</th><th>Pro</th><th>Feat</th><th>Gap</th>\n");
16620 ajFmtPrintF(outf, "<th>Multi</th><th>Description</th></tr>\n");
16621
16622 for(i=1; seqinFormatDef[i].Name; i++)
16623 {
16624 ajStrAssignC(&namestr, seqinFormatDef[i].Name);
16625
16626 if(!seqinFormatDef[i].Alias)
16627 {
16628 for(j=i+1; seqinFormatDef[j].Name; j++)
16629 {
16630 if(seqinFormatDef[j].Read == seqinFormatDef[i].Read)
16631 {
16632 ajFmtPrintAppS(&namestr, " %s", seqinFormatDef[j].Name);
16633 if(!seqinFormatDef[j].Alias)
16634 {
16635 ajWarn("Input format '%s' same as '%s' but not alias",
16636 seqinFormatDef[j].Name, seqinFormatDef[i].Name);
16637 }
16638 }
16639 }
16640
16641 ajFmtPrintF(outf, "<tr><td>\n%S\n</td><td>%B</td>\n",
16642 namestr,
16643 seqinFormatDef[i].Try);
16644 ajFmtPrintF(outf, "<td>%B</td><td>%B</td><td>%B</td><td>%B</td>\n",
16645 seqinFormatDef[i].Nucleotide,
16646 seqinFormatDef[i].Protein,
16647 seqinFormatDef[i].Feature,
16648 seqinFormatDef[i].Gap);
16649 ajFmtPrintF(outf, "<td>%B</td><td>\n%s\n</td></tr>\n",
16650 seqinFormatDef[i].Multiset,
16651 seqinFormatDef[i].Desc);
16652 }
16653
16654 }
16655
16656 ajFmtPrintF(outf, "</table>\n");
16657 ajStrDel(&namestr);
16658
16659 return;
16660 }
16661
16662
16663
16664
16665 /* @func ajSeqPrintwikiInFormat ***********************************************
16666 **
16667 ** Reports the internal data structures as a wiki table
16668 **
16669 ** @param [u] outf [AjPFile] Output file
16670 ** @return [void]
16671 **
16672 ** @release 6.2.0
16673 ** @@
16674 ******************************************************************************/
16675
ajSeqPrintwikiInFormat(AjPFile outf)16676 void ajSeqPrintwikiInFormat(AjPFile outf)
16677 {
16678 ajuint i = 0;
16679 ajuint j = 0;
16680
16681 AjPStr namestr = NULL;
16682
16683 ajFmtPrintF(outf, "{| class=\"wikitable sortable\" border=\"2\"\n");
16684 ajFmtPrintF(outf, "|-\n");
16685 ajFmtPrintF(outf, "!Format!!Try!!Nuc!!Pro!!Feat!!Gap!!MSet!!"
16686 "class=\"unsortable\"|Description\n");
16687
16688 for(i=1; seqinFormatDef[i].Name; i++)
16689 {
16690 ajStrAssignC(&namestr, seqinFormatDef[i].Name);
16691
16692 if(!seqinFormatDef[i].Alias)
16693 {
16694 for(j=i+1; seqinFormatDef[j].Name; j++)
16695 {
16696 if(seqinFormatDef[j].Read == seqinFormatDef[i].Read)
16697 {
16698 ajFmtPrintAppS(&namestr, "<br>%s", seqinFormatDef[j].Name);
16699 if(!seqinFormatDef[j].Alias)
16700 {
16701 ajWarn("Input format '%s' same as '%s' but not alias",
16702 seqinFormatDef[j].Name, seqinFormatDef[i].Name);
16703 }
16704 }
16705 }
16706
16707 ajFmtPrintF(outf, "|-\n");
16708 ajFmtPrintF(outf,
16709 "|%S||%B||%B||%B||%B||%B||%B||%s\n",
16710 namestr,
16711 seqinFormatDef[i].Try,
16712 seqinFormatDef[i].Nucleotide,
16713 seqinFormatDef[i].Protein,
16714 seqinFormatDef[i].Feature,
16715 seqinFormatDef[i].Gap,
16716 seqinFormatDef[i].Multiset,
16717 seqinFormatDef[i].Desc);
16718 }
16719
16720 }
16721
16722 ajFmtPrintF(outf, "|}\n\n");
16723 ajStrDel(&namestr);
16724
16725 return;
16726 }
16727
16728
16729
16730
16731 /* @funcstatic seqinFormatFind ************************************************
16732 **
16733 ** Looks for the specified format(s) in the internal definitions and
16734 ** returns the index.
16735 **
16736 ** Sets iformat as the recognised format, and returns ajTrue.
16737 **
16738 ** @param [r] format [const AjPStr] Format required.
16739 ** @param [w] iformat [ajint*] Index
16740 ** @return [AjBool] ajTrue on success.
16741 **
16742 ** @release 6.4.0
16743 ** @@
16744 ******************************************************************************/
16745
seqinFormatFind(const AjPStr format,ajint * iformat)16746 static AjBool seqinFormatFind(const AjPStr format, ajint* iformat)
16747 {
16748 AjPStr tmpformat = NULL;
16749 ajuint i = 0;
16750
16751 ajDebug("seqinFormatFind '%S'\n", format);
16752 if(!ajStrGetLen(format))
16753 return ajFalse;
16754
16755 ajStrAssignS(&tmpformat, format);
16756 ajStrFmtLower(&tmpformat);
16757
16758 for(i=0; seqinFormatDef[i].Name; i++)
16759 {
16760 /*ajDebug("test %d '%s' '%s' '%s' \n",
16761 i, seqinFormatDef[i].Name,
16762 seqinFormatDef[i].Obo,
16763 seqinFormatDef[i].Desc);*/
16764 if(ajStrMatchC(tmpformat, seqinFormatDef[i].Name) ||
16765 ajStrMatchC(format, seqinFormatDef[i].Obo))
16766 {
16767 *iformat = i;
16768 ajStrDel(&tmpformat);
16769 /*ajDebug("found '%s' at %d\n", seqinFormatDef[i].Name, i);*/
16770 return ajTrue;
16771 }
16772 }
16773
16774 ajStrDel(&tmpformat);
16775
16776 return ajFalse;
16777 }
16778
16779
16780
16781
16782 /* @func ajSeqFormatTest ******************************************************
16783 **
16784 ** tests whether a named format is known
16785 **
16786 ** @param [r] format [const AjPStr] Format
16787 ** @return [AjBool] ajTrue if formats was accepted
16788 **
16789 ** @release 2.7.0
16790 ** @@
16791 ******************************************************************************/
16792
ajSeqFormatTest(const AjPStr format)16793 AjBool ajSeqFormatTest(const AjPStr format)
16794 {
16795 ajuint i;
16796
16797 for(i=0; seqinFormatDef[i].Name; i++)
16798 {
16799 if(ajStrMatchCaseC(format, seqinFormatDef[i].Name))
16800 return ajTrue;
16801 if(ajStrMatchC(format, seqinFormatDef[i].Obo))
16802 return ajTrue;
16803 }
16804
16805 return ajFalse;
16806 }
16807
16808
16809
16810
16811 /* @funcstatic seqSetInFormat *************************************************
16812 **
16813 ** Steps through a list of default formats, setting the Try value for
16814 ** each known format to ajTrue if it is in the list, and ajFalse
16815 ** if not.
16816 **
16817 ** @param [r] format [const AjPStr] Format list, punctuated by whitespace
16818 ** or commas
16819 ** @return [AjBool] ajTrue if all formats were accepted
16820 **
16821 ** @release 1.0.0
16822 ** @@
16823 ******************************************************************************/
16824
seqSetInFormat(const AjPStr format)16825 static AjBool seqSetInFormat(const AjPStr format)
16826 {
16827 ajuint i;
16828 ajuint ifound;
16829 AjBool ret = ajTrue;
16830
16831 for(i=0; seqinFormatDef[i].Name; i++)
16832 seqinFormatDef[i].Try = ajFalse;
16833
16834 ajDebug("seqSetInformat '%S'\n", format);
16835
16836 ajStrTokenAssignC(&seqHandle, format, " \t\n\r,;:");
16837
16838 while(ajStrTokenNextParseC(seqHandle, " \t\n\r,;:", &seqToken))
16839 {
16840 ifound = 0;
16841
16842 for(i=0; seqinFormatDef[i].Name; i++)
16843 if(ajStrMatchCaseC(seqToken, seqinFormatDef[i].Name))
16844 {
16845 /* ajDebug("found '%S' %d\n", fmtstr, i); */
16846 seqinFormatDef[i].Try = ajTrue;
16847 ifound = 1;
16848 break;
16849 }
16850
16851 if(!ifound)
16852 {
16853 /* ajDebug("not found '%S'\n", fmtstr); */
16854
16855 ajErr("Input format '%S' not known", seqToken);
16856 ret = ajFalse;
16857 }
16858 }
16859
16860 ajStrTokenReset(seqHandle);
16861
16862 return ret;
16863 }
16864
16865
16866
16867
16868 /* @funcstatic seqAppend ******************************************************
16869 **
16870 ** Appends sequence characters in the input line to a growing sequence.
16871 ** Non sequence characters are simply ignored.
16872 **
16873 ** @param [u] pseq [AjPStr*] Sequence as a string
16874 ** @param [r] line [const AjPStr] Input line.
16875 ** @return [ajuint] Sequence length to date.
16876 **
16877 ** @release 1.0.0
16878 ** @@
16879 ******************************************************************************/
16880
seqAppend(AjPStr * pseq,const AjPStr line)16881 static ajuint seqAppend(AjPStr* pseq, const AjPStr line)
16882 {
16883 ajuint ret = 0;
16884
16885 ajStrAssignS(&seqAppendTmpstr, line);
16886 ajStrKeepSetAlphaC(&seqAppendTmpstr, "*.~?#+-");
16887 ajStrAppendS(pseq, seqAppendTmpstr);
16888
16889 ret = ajStrGetLen(*pseq);
16890
16891 ajStrDelStatic(&seqAppendTmpstr);
16892
16893 return ret;
16894 }
16895
16896
16897
16898
16899 /* @funcstatic seqAppendK *****************************************************
16900 **
16901 ** Appends single sequence character in the input line to a growing sequence.
16902 ** Non sequence characters are simply ignored.
16903 **
16904 ** @param [u] pseq [AjPStr*] Sequence as a string
16905 ** @param [r] ch [char] Input character.
16906 ** @return [ajuint] Sequence length to date.
16907 **
16908 ** @release 6.0.0
16909 ** @@
16910 ******************************************************************************/
16911
seqAppendK(AjPStr * pseq,char ch)16912 static ajuint seqAppendK(AjPStr* pseq, char ch)
16913 {
16914 AjPStr tmpstr = NULL;
16915 ajuint ret = 0;
16916
16917 ajStrAssignK(&tmpstr, ch);
16918 ajStrKeepSetAlphaC(&tmpstr, "*.~?#+-");
16919 ajStrAppendS(pseq, tmpstr);
16920
16921 ret = ajStrGetLen(*pseq);
16922 ajStrDel(&tmpstr);
16923
16924 return ret;
16925 }
16926
16927
16928
16929
16930 /* @funcstatic seqAppendCommented *********************************************
16931 **
16932 ** Appends sequence characters in the input line to a growing sequence.
16933 ** Non sequence characters are simply ignored.
16934 **
16935 ** This version of seqAppend removes comments in the angle brackets style
16936 ** used first by Staden and then later by GCG.
16937 **
16938 ** @param [u] pseq [AjPStr*] Sequence as a string
16939 ** @param [u] incomment [AjBool*] Currently processing a comment
16940 ** @param [r] line [const AjPStr] Input line.
16941 ** @return [ajuint] Sequence length to date.
16942 **
16943 ** @release 3.0.0
16944 ** @@
16945 ******************************************************************************/
16946
seqAppendCommented(AjPStr * pseq,AjBool * incomment,const AjPStr line)16947 static ajuint seqAppendCommented(AjPStr* pseq, AjBool* incomment,
16948 const AjPStr line)
16949 {
16950 AjPStr tmpstr = NULL;
16951 ajlong i;
16952 ajuint ret = 0;
16953
16954 ajStrAssignS(&tmpstr, line);
16955 ajStrKeepSetAlphaC(&tmpstr, "*.~?#+-<>");
16956
16957 ajDebug("seqAppendCommented %B '%S'\n", *incomment, tmpstr);
16958
16959 while(ajStrGetLen(tmpstr))
16960 {
16961 /* if we are in a comment, look for the end of it */
16962 /* Staden comments are <comment> */
16963 /* GCG comments are <comment< or >comment> */
16964
16965 /* there should be no case of >comment<
16966 but in a broken file we can't tell */
16967
16968 /* so we test for both kinds of angle brackets at both ends */
16969
16970 if(*incomment)
16971 {
16972 i = ajStrFindAnyC(tmpstr, "<>");
16973
16974 if(i >= 0) /* comment ends in this line */
16975 {
16976 ajStrCutStart(&tmpstr, (size_t) i+1);
16977 *incomment = ajFalse;
16978 }
16979 else
16980 {
16981 ajStrAssignClear(&tmpstr); /* all comment */
16982 }
16983 }
16984 else
16985 {
16986 i = ajStrFindAnyC(tmpstr, "<>");
16987
16988 if(i >= 0) /* comment starts in this line */
16989 {
16990 if(i)
16991 ajStrAppendSubS(pseq, tmpstr, 0, i-1);
16992
16993 ajDebug("before comment saved '%S'\n", *pseq);
16994 ajStrCutStart(&tmpstr, (size_t) (i+1));
16995 *incomment = ajTrue;
16996 }
16997 else
16998 {
16999 ajStrAppendS(pseq, tmpstr);
17000 ajDebug("all saved '%S'\n", *pseq);
17001 ajStrAssignClear(&tmpstr);
17002 }
17003 }
17004
17005 if(ajStrGetLen(tmpstr))
17006 ajDebug("continuing %B '%S'\n", *incomment, tmpstr);
17007 else
17008 ajDebug("done %B '%S'\n", *incomment, tmpstr);
17009 }
17010
17011 ret = ajStrGetLen(*pseq);
17012 ajStrDel(&tmpstr);
17013
17014 return ret;
17015 }
17016
17017
17018
17019
17020 /* @funcstatic seqAppendWarn **************************************************
17021 **
17022 ** Appends sequence characters in the input line to a growing sequence.
17023 **
17024 ** Non sequence characters are reported in the return value
17025 ** if EMBOSS_SEQWARN is set
17026 **
17027 ** @param [u] pseq [AjPStr*] Sequence as a string
17028 ** @param [r] line [const AjPStr] Input line.
17029 ** @param [r] informat [ajuint] Input format, zero for unknown
17030 ** @return [const AjPStr] Any rejected non-space characters
17031 **
17032 ** @release 5.0.0
17033 ** @@
17034 ******************************************************************************/
17035
seqAppendWarn(AjPStr * pseq,const AjPStr line,ajuint informat)17036 static const AjPStr seqAppendWarn(AjPStr* pseq, const AjPStr line,
17037 ajuint informat)
17038 {
17039 AjPStr tmpstr = NULL;
17040
17041 if(!seqAppendRestStr)
17042 {
17043 if(ajNamGetValueC("seqwarn", &tmpstr))
17044 ajStrToBool(tmpstr, &seqDoWarnAppend);
17045 seqAppendRestStr = ajStrNew();
17046 }
17047
17048 ajStrAssignS(&seqAppendTmpSeq, line);
17049
17050 if(seqDoWarnAppend || informat)
17051 {
17052 ajStrKeepSetAlphaRestC(&seqAppendTmpSeq, "*.~?#+-", &seqAppendRestStr);
17053 ajStrAppendS(pseq, seqAppendTmpSeq);
17054
17055 ajStrDelStatic(&seqAppendTmpSeq);
17056
17057 if(!ajStrGetLen(seqAppendRestStr))
17058 return NULL;
17059
17060 return seqAppendRestStr;
17061 }
17062
17063 if(!seqAppendFilter)
17064 seqAppendFilter = ajCharGetfilter( "*.~?#+-"
17065 "abcdefghijklmnopqrstuvwxyz"
17066 "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
17067
17068 ajStrKeepSetFilter(&seqAppendTmpSeq, seqAppendFilter);
17069
17070 ajStrAppendS(pseq, seqAppendTmpSeq);
17071
17072 ajStrDelStatic(&seqAppendTmpSeq);
17073
17074 return NULL;
17075 }
17076
17077
17078
17079
17080 /* @funcstatic seqqualAppendWarn **********************************************
17081 **
17082 ** Appends sequence quality characters in the input line to a growing string.
17083 **
17084 ** Non sequence characters are reported in the return value
17085 ** if EMBOSS_SEQWARN is set
17086 **
17087 ** @param [u] Pqual [AjPStr*] Quality values as a string
17088 ** @param [r] line [const AjPStr] Input line.
17089 ** @return [void]
17090 **
17091 ** @release 6.1.0
17092 ** @@
17093 ******************************************************************************/
17094
seqqualAppendWarn(AjPStr * Pqual,const AjPStr line)17095 static void seqqualAppendWarn(AjPStr* Pqual, const AjPStr line)
17096 {
17097 ajStrAssignS(&seqAppendTmpSeq, line);
17098
17099 ajStrKeepSetAscii(&seqAppendTmpSeq, 33, 126);
17100 ajStrAppendS(Pqual, seqAppendTmpSeq);
17101
17102 ajStrDelStatic(&seqAppendTmpSeq);
17103
17104 return;
17105 }
17106
17107
17108
17109
17110 /* @funcstatic seqGcgRegInit **************************************************
17111 **
17112 ** Initialises regular expressions for GCG and MSF format parsing
17113 **
17114 **
17115 ** @return [void]
17116 **
17117 ** @release 4.0.0
17118 ******************************************************************************/
17119
seqGcgRegInit(void)17120 static void seqGcgRegInit(void)
17121 {
17122 if(!seqRegGcgDot)
17123 seqRegGcgDot = ajRegCompC("[.][.]");
17124
17125 if(!seqRegGcgChk)
17126 seqRegGcgChk = ajRegCompC("[Cc][Hh][Ee][Cc][Kk]:[ \t]*([0-9]+)");
17127
17128 if(!seqRegGcgLen)
17129 seqRegGcgLen = ajRegCompC("[Ll][Ee][Nn][Gg][Tt][Hh]:[ \t]*([0-9]+)");
17130
17131 if(!seqRegGcgTyp)
17132 seqRegGcgTyp = ajRegCompC("[Tt][Yy][Pp][Ee]:[ \t]*([NP])");
17133
17134 if(!seqRegGcgNam)
17135 seqRegGcgNam = ajRegCompC("[^ \t>]+");
17136
17137 if(!seqRegGcgMsf)
17138 seqRegGcgMsf = ajRegCompC("[Mm][Ss][Ff]:[ \t]*([0-9]+)");
17139
17140 if(!seqRegGcgMsflen)
17141 seqRegGcgMsflen = ajRegCompC("[Ll][Ee][Nn]:[ \t]*([0-9]+)");
17142
17143 if(!seqRegGcgWgt)
17144 seqRegGcgWgt = ajRegCompC("[Ww][Ee][Ii][Gg][Hh][Tt]:[ \t]*([0-9.]+)");
17145
17146 if(!seqRegGcgMsfnam)
17147 seqRegGcgMsfnam = ajRegCompC("[Nn][Aa][Mm][Ee]:[ \t]*([^ \t]+)");
17148
17149 return;
17150 }
17151
17152
17153
17154
17155 /* @funcstatic seqGcgDots *****************************************************
17156 **
17157 ** Looks for the ".." line in the header of a GCG format sequence.
17158 ** Care is needed to make sure this is not an MSF header which
17159 ** has a very similar format.
17160 **
17161 ** Data found on the header line is extracted and returned.
17162 **
17163 ** The number of lines searched is limited to avoid parsing large data
17164 ** files that are not in GCG format. The user should set this limit to
17165 ** be large enough to handle large EMBL/Genbank annotations
17166 **
17167 ** @param [u] thys [AjPSeq] Sequence.
17168 ** @param [r] seqin [const AjPSeqin] Sequence input.
17169 ** @param [u] Pline [AjPStr*] Input buffer.
17170 ** @param [r] maxlines [ajuint] Maximum number of lines to read
17171 ** before giving up
17172 ** @param [w] len [ajuint*] Length of sequence read.
17173 ** @return [AjBool] ajTrue on success. ajFalse on failure or aborting.
17174 **
17175 ** @release 1.0.0
17176 ** @@
17177 ******************************************************************************/
17178
seqGcgDots(AjPSeq thys,const AjPSeqin seqin,AjPStr * Pline,ajuint maxlines,ajuint * len)17179 static AjBool seqGcgDots(AjPSeq thys, const AjPSeqin seqin,
17180 AjPStr* Pline,
17181 ajuint maxlines, ajuint* len)
17182 {
17183 AjPStr token = NULL;
17184 ajuint check = 0;
17185 ajuint nlines = 0;
17186
17187 seqGcgRegInit();
17188
17189 while(nlines < maxlines)
17190 {
17191 if(nlines++)
17192 if(!ajTextinStoreReadline(seqin->Input, Pline, &thys->TextPtr))
17193 return ajFalse;
17194
17195 if(nlines > maxlines)
17196 return ajFalse;
17197
17198 if(!ajRegExec(seqRegGcgDot, *Pline))
17199 continue;
17200
17201 ajDebug("seqGcgDots .. found\n'%S'\n", *Pline);
17202
17203 if(!ajRegExec(seqRegGcgChk, *Pline)) /* checksum required */
17204 return ajFalse;
17205
17206 if(ajRegExec(seqRegGcgMsf, *Pline)) /* oops - it's an MSF file */
17207 return ajFalse;
17208
17209 ajRegSubI(seqRegGcgChk, 1, &token);
17210 ajStrToUint(token, &check);
17211
17212 ajDebug(" checksum %d\n", check);
17213
17214 if(ajRegExec(seqRegGcgLen, *Pline))
17215 {
17216 ajRegSubI(seqRegGcgLen, 1, &token);
17217 ajStrToUint(token, len);
17218 ajDebug(" length %d\n", *len);
17219 }
17220
17221 if(ajRegExec(seqRegGcgNam, *Pline))
17222 {
17223 ajRegSubI(seqRegGcgNam, 0, &thys->Name);
17224 ajDebug(" name '%S'\n", thys->Name);
17225 }
17226
17227 if(ajRegExec(seqRegGcgTyp, *Pline))
17228 {
17229 ajRegSubI(seqRegGcgTyp, 1, &thys->Type);
17230 ajDebug(" type '%S'\n", thys->Type);
17231 }
17232
17233 ajStrDel(&token);
17234
17235 return ajTrue;
17236 }
17237
17238 return ajFalse;
17239 }
17240
17241
17242
17243
17244 /* @funcstatic seqGcgMsfDots **************************************************
17245 **
17246 ** Looks for the ".." line in the header of an MSF format sequence.
17247 ** Care is needed to make sure this is not a simple GCG header which
17248 ** has a very similar format.
17249 **
17250 ** Data found on the header line is extracted and returned.
17251 **
17252 ** The number of lines searched is limited to avoid parsing large data
17253 ** files that are not in GCG format. The user should set this limit to
17254 ** be large enough to handle large EMBL/Genbank annotations
17255 **
17256 ** @param [u] thys [AjPSeq] Sequence.
17257 ** @param [r] seqin [const AjPSeqin] Sequence input.
17258 ** @param [u] Pline [AjPStr*] Input buffer.
17259 ** @param [r] maxlines [ajuint] Maximum number of lines to read
17260 ** before giving up
17261 ** @param [w] len [ajuint*] Length of sequence read.
17262 ** @return [AjBool] ajTrue on success. ajFalse on failure or aborting.
17263 **
17264 ** @release 1.0.0
17265 ** @@
17266 ******************************************************************************/
17267
seqGcgMsfDots(AjPSeq thys,const AjPSeqin seqin,AjPStr * Pline,ajuint maxlines,ajuint * len)17268 static AjBool seqGcgMsfDots(AjPSeq thys, const AjPSeqin seqin, AjPStr* Pline,
17269 ajuint maxlines, ajuint* len)
17270 {
17271 AjPStr token = NULL;
17272 ajuint check = 0;
17273 ajuint nlines = 0;
17274
17275 ajDebug("seqGcgMsfDots maxlines: %d\nline: '%S'\n", maxlines,*Pline);
17276
17277 seqGcgRegInit();
17278
17279 while(nlines < maxlines)
17280 {
17281 if(nlines++)
17282 if(!ajTextinStoreReadline(seqin->Input, Pline, &thys->TextPtr))
17283 return ajFalse;
17284
17285 ajDebug("testing line %d\n'%S'\n", nlines,*Pline);
17286
17287 if(nlines > maxlines)
17288 return ajFalse;
17289
17290 if(!ajRegExec(seqRegGcgDot, *Pline))
17291 continue;
17292
17293 /* dots found. This must be the line if this is MSF format */
17294
17295 if(!ajRegExec(seqRegGcgChk, *Pline)) /* check: is required */
17296 return ajFalse;
17297
17298 if(!ajRegExec(seqRegGcgMsf, *Pline)) /* MSF: len required for GCG*/
17299 return ajFalse;
17300
17301
17302 ajRegSubI(seqRegGcgMsf, 1, &token);
17303 ajStrToUint(token, len);
17304
17305 ajRegSubI(seqRegGcgChk, 1, &token);
17306 ajStrToUint(token, &check);
17307
17308 if(ajRegExec(seqRegGcgNam, *Pline))
17309 ajRegSubI(seqRegGcgNam, 0, &thys->Name);
17310
17311 if(ajRegExec(seqRegGcgTyp, *Pline))
17312 ajRegSubI(seqRegGcgTyp, 1, &thys->Type);
17313
17314 ajStrDel(&token);
17315 ajDebug("seqGcgMsfDots '%S' '%S' len: %d check: %d\n",
17316 thys->Name, thys->Type, *len, check);
17317
17318 return ajTrue;
17319 }
17320
17321 return ajFalse;
17322 }
17323
17324
17325
17326
17327 /* @funcstatic seqGcgMsfHeader ************************************************
17328 **
17329 ** Parses data from a line of an MSF file header. The header stores
17330 ** names and other data for all sequences in the file. Each file
17331 ** is defined on a separate line. The results are stored
17332 ** in the MSF internal table. The sequence data is read later in the
17333 ** input file and added to the table.
17334 **
17335 ** @param [r] line [const AjPStr] Input line.
17336 ** @param [u] Pmsfitem [SeqPMsfItem*] MSF internal table item.
17337 ** @return [AjBool] ajTrue on success.
17338 **
17339 ** @release 1.0.0
17340 ** @@
17341 ******************************************************************************/
17342
seqGcgMsfHeader(const AjPStr line,SeqPMsfItem * Pmsfitem)17343 static AjBool seqGcgMsfHeader(const AjPStr line, SeqPMsfItem* Pmsfitem)
17344 {
17345 AjPStr name = NULL; /* NOTE: not static. New each time for list */
17346 AjPStr token = NULL;
17347 SeqPMsfItem msfitem = NULL;
17348
17349 ajDebug("seqGcgMsfHeader '%S'\n", line);
17350
17351 if(!ajRegExec(seqRegGcgMsfnam, line))
17352 return ajFalse;
17353
17354 ajRegSubI(seqRegGcgMsfnam, 1, &name);
17355 /*ajDebug("Name found\n");*/
17356
17357 if(!ajRegExec(seqRegGcgChk, line))
17358 return ajFalse;
17359
17360 /*ajDebug("Check found\n");*/
17361
17362 *Pmsfitem = AJNEW0(msfitem);
17363 msfitem->Name = name;
17364
17365 ajRegSubI(seqRegGcgChk, 1, &token);
17366 ajStrToUint(token, &msfitem->Check);
17367
17368 if(ajRegExec(seqRegGcgMsflen, line))
17369 {
17370 ajRegSubI(seqRegGcgMsflen, 1, &token);
17371 ajStrToUint(token, &msfitem->Len);
17372 }
17373 else
17374 msfitem->Len = 0;
17375
17376 msfitem->Seq = ajStrNewRes(msfitem->Len+1);
17377
17378 if(ajRegExec(seqRegGcgWgt, line))
17379 {
17380 ajRegSubI(seqRegGcgWgt, 1, &token);
17381 ajStrToFloat(token, &msfitem->Weight);
17382 }
17383 else
17384 msfitem->Weight = 1.0;
17385
17386 ajDebug("MSF header name '%S' check %d len %d weight %.3f\n",
17387 msfitem->Name, msfitem->Check, msfitem->Len, msfitem->Weight);
17388
17389 ajStrDel(&token);
17390
17391 return ajTrue;
17392 }
17393
17394
17395
17396
17397 /* @funcstatic seqUsaRegInit **************************************************
17398 **
17399 ** Initialised regular expressions for parsing USAs
17400 **
17401 ** @return [void]
17402 **
17403 ** @release 6.1.0
17404 ******************************************************************************/
17405
seqUsaRegInit(void)17406 static void seqUsaRegInit(void)
17407 {
17408 if(seqRegUsaInitDone)
17409 return;
17410
17411 if(!seqRegUsaFmt)
17412 seqRegUsaFmt = ajRegCompC("^([A-Za-z0-9-]*)::(.*)$");
17413 /* \1 format letters and numbers only */
17414 /* \2 remainder (filename, etc.)*/
17415
17416 if(!seqRegUsaDb)
17417 seqRegUsaDb = ajRegCompC("^([A-Za-z][A-Za-z0-9_]+)([-]([A-Za-z]+))?"
17418 "([:{]([^}]*)}?)?$");
17419
17420 /* \1 dbname (start with a letter, then alphanumeric) */
17421 /* \2 -id or -acc etc. */
17422 /* \3 qry->SingleField (id or acc etc.) */
17423 /* \4 :qry->QryString */
17424 /* \5 qry->QryString */
17425
17426 if(!seqRegUsaId)
17427 #ifndef WIN32
17428 /* \1 is filename \5 is the qry->SingleField \6 is the qry->QryString */
17429 seqRegUsaId = ajRegCompC("^([^|]+[|]|[^:{%]+)"
17430 "(([:{%])(([^:}]+):)?([^:}]*)}?)?$");
17431 #else /* WIN32 */
17432 /* Windows file names can start with e.g.: 'C:\' */
17433 /* But allow e.g. 'C:/...', for Staden spin */
17434
17435 /* \1 is filename \6 is the qry->SingleField \7 is the qry->QryString */
17436 seqRegUsaId = ajRegCompC ("^(([a-zA-Z]:[\\\\/])?[^:{%]+)"
17437 "(([:{%])(([^:}]+):)?([^:}]*)}?)?$");
17438 #endif /* !WIN32 */
17439
17440
17441 if(!seqRegUsaList) /* \1 is filename \3 is the qry->QryString */
17442 seqRegUsaList = ajRegCompC("^(@|[Ll][Ii][Ss][Tt]:+)(.+)$");
17443
17444 if(!seqRegUsaAsis) /* \1 is filename \3 is the qry->QryString */
17445 seqRegUsaAsis = ajRegCompC("^[Aa][Ss][Ii][Ss]:+(.+)$");
17446
17447 if(!seqRegUsaWild)
17448 seqRegUsaWild = ajRegCompC("(.*[*].*)");
17449 /* \1 wildcard query */
17450
17451 if(!seqRegUsaRange) /* \1 is rest of USA \2 start \3 end \5 reverse*/
17452 seqRegUsaRange = ajRegCompC("(.*)[[](-?[0-9]*):(-?[0-9]*)(:([Rr])?)?[]]$");
17453
17454 seqRegUsaInitDone = ajTrue;
17455
17456 return;
17457 }
17458
17459
17460
17461
17462 /* @func ajSeqUsaGetBase ******************************************************
17463 **
17464 ** Extracts the base part from a USA, suitable for use in fetching other
17465 **sequences from the same source
17466 **
17467 ** @param [r] usa [const AjPStr] Original USA
17468 ** @param [u] Pbaseusa [AjPStr*] Base part of USA
17469 ** @return [AjBool] True on success
17470 **
17471 ** @release 6.1.0
17472 ** @@
17473 ******************************************************************************/
17474
ajSeqUsaGetBase(const AjPStr usa,AjPStr * Pbaseusa)17475 AjBool ajSeqUsaGetBase(const AjPStr usa, AjPStr* Pbaseusa)
17476 {
17477 AjPStr tmpstr = NULL;
17478
17479 AjBool regstat = ajFalse;
17480 #ifdef __CYGWIN__
17481 AjPStr usatmp = NULL;
17482 #endif /* __CYGWIN__ */
17483
17484 seqUsaRegInit();
17485
17486 ajStrAssignC(Pbaseusa, "");
17487
17488 ajStrAssignS(&seqUsaTest, usa);
17489
17490 /* Strip any leading spaces */
17491 ajStrTrimC(&seqUsaTest," \t\n");
17492
17493 #ifdef __CYGWIN__
17494 if(*(ajStrGetPtr(seqUsaTest)+1)==':')
17495 {
17496 usatmp = ajStrNew();
17497 ajFmtPrintS(&usatmp,"/cygdrive/%c/%s",*ajStrGetPtr(seqUsaTest),
17498 ajStrGetPtr(seqUsaTest)+2);
17499 ajStrAssignRef(&seqUsaTest,usatmp);
17500 ajStrDel(&usatmp);
17501 }
17502 #endif /* __CYGWIN__ */
17503
17504 ajDebug("USA to test: '%S'\n\n", seqUsaTest);
17505
17506 /* trim any range */
17507
17508 if(ajRegExec(seqRegUsaRange, seqUsaTest))
17509 {
17510 ajRegPre(seqRegUsaRange, &tmpstr);
17511 ajStrAssignS(&seqUsaTest, tmpstr);
17512 }
17513
17514 /* no base for an ASIS:: USA */
17515
17516 if(ajRegExec(seqRegUsaAsis, seqUsaTest))
17517 return ajFalse;
17518
17519 /* no base for a listfile USA */
17520
17521 if(ajRegExec(seqRegUsaList, seqUsaTest))
17522 return ajFalse;
17523
17524 if(ajRegExec(seqRegUsaFmt, seqUsaTest))
17525 {
17526 ajRegSubI(seqRegUsaFmt, 1, &tmpstr);
17527 ajStrAppendS(Pbaseusa, tmpstr);
17528 ajStrAppendC(Pbaseusa, "::");
17529 ajRegSubI(seqRegUsaFmt, 2,&tmpstr);
17530 ajStrAssignS(&seqUsaTest, tmpstr);
17531 }
17532
17533 regstat = ajRegExec(seqRegUsaDb, seqUsaTest);
17534
17535 if(regstat)
17536 {
17537 ajRegSubI(seqRegUsaDb, 1, &tmpstr);
17538 if(!ajNamDatabase(tmpstr))
17539 regstat = ajFalse;
17540 }
17541
17542 if(regstat)
17543 ajStrAppendS(Pbaseusa, tmpstr);
17544 else
17545 {
17546 if(ajRegExec(seqRegUsaId, seqUsaTest))
17547 {
17548 #ifndef WIN32
17549 ajRegSubI(seqRegUsaId, 1, &tmpstr);
17550 #else /* WIN32 */
17551 ajRegSubI(seqRegUsaId, 1, &tmpstr);
17552 #endif /* !WIN32 */
17553 ajDebug("found filename %S\n", tmpstr);
17554 ajStrAppendS(Pbaseusa, tmpstr);
17555 }
17556
17557 }
17558 ajStrDel(&tmpstr);
17559
17560 if(!ajStrGetLen(*Pbaseusa))
17561 return ajFalse;
17562
17563 return ajTrue;
17564 }
17565
17566
17567
17568
17569 /* @funcstatic seqinUsaProcess ************************************************
17570 **
17571 ** Converts a USA Universal Sequence Address into an open file.
17572 **
17573 ** First tests for "[n:n:r]" range and sets this if it is found
17574 **
17575 ** Then tests for asis:: in which the "filename" is really the sequence
17576 ** and no format is needed.
17577 **
17578 ** Then tests for "format::" and sets this if it is found
17579 **
17580 ** Then tests for "list:" or "@" and processes as a list file
17581 ** using seqinListProcess which in turn invokes seqinUsaProcess
17582 ** until a valid USA is found.
17583 **
17584 ** Then tests for dbname:query and opens the file (at the correct position
17585 ** if the database definition defines it)
17586 **
17587 ** If there is no database, looks for file:query and opens the file.
17588 ** In this case the file position is not known and sequence reading
17589 ** will have to scan for the entry/entries we need.
17590 **
17591 ** @param [u] seqin [AjPSeqin] Sequence input structure.
17592 ** @param [u] thys [AjPSeq] Sequence to be read.
17593 ** @return [AjBool] ajTrue on success.
17594 **
17595 ** @release 6.4.0
17596 ** @@
17597 ******************************************************************************/
17598
seqinUsaProcess(AjPSeqin seqin,AjPSeq thys)17599 static AjBool seqinUsaProcess(AjPSeqin seqin, AjPSeq thys)
17600 {
17601 AjBool ret = ajTrue;
17602 AjPStr qrystr = NULL;
17603 AjBool seqmethod = ajFalse;
17604 const AjPStr fmtstr = NULL;
17605 AjPTextin textin;
17606 AjPQuery qry;
17607 AjPSeqAccess seqaccess = NULL;
17608
17609 textin = seqin->Input;
17610 qry = textin->Query;
17611
17612 /* pick up the original query string */
17613 qrystr = ajStrNewS(textin->Qry);
17614
17615 ajDebug("seqinUsaProcess '%S'\n", qrystr);
17616
17617 /* look for a format:: prefix */
17618 fmtstr = ajQuerystrParseFormat(&qrystr, textin, seqinFormatFind);
17619 ajDebug("seqinUsaProcess ... fmtstr '%S' '%S'\n", fmtstr, qrystr);
17620
17621 /* (seq/feat) look for a [range] suffix */
17622 ajQuerystrParseRange(&qrystr, &seqin->Begin, &seqin->End, &seqin->Rev);
17623 ajDebug("seqinUsaProcess ... range %d..%d rev:%B '%S'\n",
17624 seqin->Begin, seqin->End, seqin->Rev, qrystr);
17625
17626 /* look for a list:: or @:: listfile of queries - process and return */
17627 if(ajQuerystrParseListfile(&qrystr))
17628 {
17629 ajDebug("seqinUsaProcess ... listfile '%S'\n", qrystr);
17630 ret = seqinListProcess(seqin, thys, qrystr);
17631 ajStrDel(&qrystr);
17632 return ret;
17633 }
17634
17635 /* try general text access methods (file, asis, text database access */
17636 ajDebug("seqinUsaProcess ... no listfile '%S'\n", qrystr);
17637 if(!ajQuerystrParseRead(&qrystr, textin, seqinFormatFind, &seqmethod))
17638 {
17639 ajStrDel(&qrystr);
17640 return ajFalse;
17641 }
17642
17643 seqinFormatSet(seqin, thys);
17644
17645 ajDebug("seqinUsaProcess ... read nontext: %B '%S'\n",
17646 seqmethod, qrystr);
17647 ajStrDel(&qrystr);
17648
17649 /* we found a non-text method */
17650 if(seqmethod)
17651 {
17652 ajDebug("seqinUsaProcess ... call method '%S'\n", qry->Method);
17653 ajDebug("seqinUsaProcess ... textin format %d '%S'\n",
17654 textin->Format, textin->Formatstr);
17655 ajDebug("seqinUsaProcess ... query format '%S'\n",
17656 qry->Formatstr);
17657 qry->Access = ajCallTableGetS(seqDbMethods,qry->Method);
17658 seqaccess = qry->Access;
17659
17660 if(!seqaccess)
17661 {
17662 ajErr("sequence access method '%S' not found", qry->Method);
17663 return ajFalse;
17664 }
17665
17666 return (*seqaccess->Access)(seqin);
17667 }
17668
17669 ajDebug("seqinUsaProcess text method '%S' success\n", qry->Method);
17670
17671 return ajTrue;
17672 }
17673
17674
17675
17676
17677 /* @funcstatic seqUsaRestore **************************************************
17678 **
17679 ** Restores a sequence input specification from a SeqPListUsa node
17680 **
17681 ** @param [w] seqin [AjPSeqin] Sequence input object
17682 ** @param [r] node [const SeqPListUsa] Usa list node
17683 ** @return [void]
17684 **
17685 ** @release 2.1.0
17686 ******************************************************************************/
17687
seqUsaRestore(AjPSeqin seqin,const SeqPListUsa node)17688 static void seqUsaRestore(AjPSeqin seqin, const SeqPListUsa node)
17689 {
17690 ajDebug("seqUsaRestore node %d..%d rev:%B '%S' (%u) feat %B '%S'\n",
17691 node->Begin, node->End, node->Rev,
17692 node->Formatstr, node->Format, node->Features, node->Usa);
17693 seqin->Begin = node->Begin;
17694 seqin->End = node->End;
17695 seqin->Rev = node->Rev;
17696 seqin->Input->Format = node->Format;
17697 seqin->Input->Fpos = node->Fpos;
17698 seqin->Features = node->Features;
17699 ajStrAssignS(&seqin->Input->Formatstr, node->Formatstr);
17700
17701 return;
17702 }
17703
17704
17705
17706
17707 /* @funcstatic seqUsaSave *****************************************************
17708 **
17709 ** Saves a sequence input specification in a SeqPListUsa node
17710 **
17711 ** @param [w] node [SeqPListUsa] Usa list node
17712 ** @param [r] seqin [const AjPSeqin] Sequence input object
17713 ** @return [void]
17714 **
17715 ** @release 2.1.0
17716 ******************************************************************************/
17717
seqUsaSave(SeqPListUsa node,const AjPSeqin seqin)17718 static void seqUsaSave(SeqPListUsa node, const AjPSeqin seqin)
17719 {
17720 ajDebug("seqUsaSave seqin %d..%d rev:%B '%S' (%u) feat %B '%S'\n",
17721 seqin->Begin, seqin->End, seqin->Rev,
17722 seqin->Input->Formatstr, seqin->Input->Format,
17723 seqin->Features, seqin->Input->Qry);
17724 node->Begin = seqin->Begin;
17725 node->End = seqin->End;
17726 node->Rev = seqin->Rev;
17727 node->Format = seqin->Input->Format;
17728 node->Fpos = seqin->Input->Fpos;
17729 node->Features = seqin->Features;
17730 ajStrAssignS(&node->Formatstr, seqin->Input->Formatstr);
17731
17732 return;
17733 }
17734
17735
17736
17737
17738 /* @funcstatic seqUsaListTrace ************************************************
17739 **
17740 ** Traces the nodes in a USA list
17741 **
17742 ** @param [r] list [const AjPList] The USA list
17743 ** @return [void]
17744 **
17745 ** @release 2.1.0
17746 ******************************************************************************/
17747
seqUsaListTrace(const AjPList list)17748 static void seqUsaListTrace(const AjPList list)
17749 {
17750 AjIList iter;
17751 SeqPListUsa node;
17752 ajuint i = 0;
17753
17754 iter = ajListIterNewread(list);
17755
17756 ajDebug("SeqUsaListTrace %Lu nodes\n", ajListGetLength(list));
17757
17758 while(!ajListIterDone(iter))
17759 {
17760 node = (SeqPListUsa) ajListIterGet(iter);
17761 ajDebug("%3d: '%S' %4d..%d (%b) '%S' %d\n",
17762 ++i, node->Usa, node->Begin, node->End, node->Rev,
17763 node->Formatstr, node->Format);
17764 }
17765
17766 ajListIterDel(&iter);
17767 ajDebug("...Done...\n");
17768
17769 return;
17770 }
17771
17772
17773
17774
17775 /* @funcstatic seqinListProcess ***********************************************
17776 **
17777 ** Processes a file of USAs.
17778 ** This function is called by, and calls, seqinUsaProcess. There is
17779 ** a depth check to avoid infinite loops, for example where a list file
17780 ** refers to itself.
17781 **
17782 ** This function produces a list (AjPList) of USAs with all list references
17783 ** expanded into lists of USAs.
17784 **
17785 ** Because USAs in a list can have their own begin, end and reverse settings
17786 ** the prior settings are stored with each USA in the list node so that they
17787 ** can be restored after.
17788 **
17789 ** @param [u] seqin [AjPSeqin] Sequence input
17790 ** @param [u] seq [AjPSeq] Sequence
17791 ** @param [r] listfile [const AjPStr] Name of list file.,
17792 ** @return [AjBool] ajTrue on success.
17793 **
17794 ** @release 6.4.0
17795 ** @@
17796 ******************************************************************************/
17797
seqinListProcess(AjPSeqin seqin,AjPSeq seq,const AjPStr listfile)17798 static AjBool seqinListProcess(AjPSeqin seqin, AjPSeq seq,
17799 const AjPStr listfile)
17800 {
17801 AjPList list = NULL;
17802 AjPFile file = NULL;
17803 AjPStr token = NULL;
17804 AjPStr rest = NULL;
17805 AjBool ret = ajFalse;
17806 SeqPListUsa node = NULL;
17807
17808 ajuint recnum = 0;
17809 static ajint depth = 0;
17810 static ajint MAXDEPTH = 16;
17811
17812 depth++;
17813 ajDebug("++seqinListProcess %S depth %d Rev: %B\n",
17814 listfile, depth, seqin->Rev);
17815
17816 if(depth > MAXDEPTH)
17817 ajFatal("USA List too deep");
17818
17819 if(!seqin->Usalist)
17820 seqin->Usalist = ajListNew();
17821
17822 list = ajListNew();
17823
17824 file = ajFileNewInNameS(listfile);
17825
17826 if(!file)
17827 {
17828 ajErr("Failed to open list file '%S'", listfile);
17829 depth--;
17830
17831 return ret;
17832 }
17833
17834 while(ajReadlineTrim(file, &seqReadLine))
17835 {
17836 ++recnum;
17837 seqListNoComment(&seqReadLine);
17838 if(ajStrExtractWord(seqReadLine, &rest, &token))
17839 {
17840 if(ajStrGetLen(rest))
17841 {
17842 ajErr("Bad record %u in list file '%S'\n'%S'",
17843 recnum, listfile, seqReadLine);
17844 }
17845 else if(ajStrGetLen(token))
17846 {
17847 ajDebug("++Add to list: '%S'\n", token);
17848 AJNEW0(node);
17849 ajStrAssignS(&node->Usa, token);
17850 seqUsaSave(node, seqin);
17851 ajListPushAppend(list, node);
17852 }
17853 }
17854 }
17855
17856 ajFileClose(&file);
17857 ajStrDel(&token);
17858 ajStrDel(&rest);
17859
17860 ajDebug("Trace seqin->Usalist\n");
17861 seqUsaListTrace(seqin->Usalist);
17862 ajDebug("Trace new list\n");
17863 seqUsaListTrace(list);
17864 ajListPushlist(seqin->Usalist, &list);
17865
17866 ajDebug("Trace combined seqin->Usalist\n");
17867 seqUsaListTrace(seqin->Usalist);
17868
17869 /*
17870 ** now try the first item on the list
17871 ** this can descend recursively if it is also a list
17872 ** which is why we check the depth above
17873 */
17874
17875 if(ajListPop(seqin->Usalist, (void**) &node))
17876 {
17877 ajDebug("++pop first item '%S'\n", node->Usa);
17878 ajSeqinUsa(&seqin, node->Usa);
17879 seqUsaRestore(seqin, node);
17880 ajStrDel(&node->Usa);
17881 ajStrDel(&node->Formatstr);
17882 AJFREE(node);
17883 ajDebug("descending with usa '%S'\n", seqin->Input->Qry);
17884 ret = seqinUsaProcess(seqin, seq);
17885 }
17886
17887 depth--;
17888 ajDebug("++seqinListProcess depth: %d returns: %B\n", depth, ret);
17889
17890 return ret;
17891 }
17892
17893
17894
17895
17896 /* @funcstatic seqListNoComment ***********************************************
17897 **
17898 ** Strips comments from a character string (a line from a list file).
17899 ** Comments are blank lines or any text following a "#" character.
17900 **
17901 ** @param [u] text [AjPStr*] Line of text from input file.
17902 ** @return [void]
17903 **
17904 ** @release 1.0.0
17905 ** @@
17906 ******************************************************************************/
17907
seqListNoComment(AjPStr * text)17908 static void seqListNoComment(AjPStr* text)
17909 {
17910 ajuint i;
17911 char *cp;
17912
17913 i = ajStrGetLen(*text);
17914
17915 if(!i) /* empty string */
17916 return;
17917
17918 MAJSTRGETUNIQUESTR(text);
17919
17920 cp = strchr(ajStrGetPtr(*text), '#');
17921
17922 if(cp)
17923 { /* comment found */
17924 *cp = '\0';
17925 ajStrSetValid(text);
17926 }
17927
17928 return;
17929 }
17930
17931
17932
17933
17934 /* @funcstatic seqinFormatSet *************************************************
17935 **
17936 ** Sets the input format for a sequence using the sequence input object's
17937 ** defined format, or a default from variable 'EMBOSS_FORMAT'.
17938 **
17939 ** @param [u] seqin [AjPSeqin] Sequence input.
17940 ** @param [u] thys [AjPSeq] Sequence.
17941 ** @return [AjBool] ajTrue on success.
17942 **
17943 ** @release 6.4.0
17944 ** @@
17945 ******************************************************************************/
17946
seqinFormatSet(AjPSeqin seqin,AjPSeq thys)17947 static AjBool seqinFormatSet(AjPSeqin seqin, AjPSeq thys)
17948 {
17949 AjPTextin textin = seqin->Input;
17950
17951 if(ajStrGetLen(textin->Formatstr))
17952 {
17953 ajDebug("... input format value '%S'\n", textin->Formatstr);
17954
17955 if(seqinFormatFind(textin->Formatstr, &textin->Format))
17956 {
17957 ajStrAssignS(&thys->Formatstr, textin->Formatstr);
17958 thys->Format = textin->Format;
17959 ajDebug("...format OK '%S' = %d\n", textin->Formatstr,
17960 textin->Format);
17961 }
17962 else
17963 {
17964 ajDebug("...format unknown '%S'\n", textin->Formatstr);
17965 ajErr("Unknown input format '%S'", textin->Formatstr);
17966 }
17967
17968 return ajTrue;
17969 }
17970 else
17971 ajDebug("...input format not set\n");
17972
17973
17974 return ajFalse;
17975 }
17976
17977
17978
17979
17980 /* @funcstatic seqinUfoLocal **************************************************
17981 **
17982 ** Tests whether a sequence input object will read features from the
17983 ** sequence input file. The alternative is to use a separate UFO.
17984 **
17985 ** @param [r] thys [const AjPSeqin] Sequen input object.
17986 ** @return [AjBool] ajTrue if the features will be read from the sequence
17987 **
17988 ** @release 1.13.0
17989 ** @@
17990 ******************************************************************************/
17991
seqinUfoLocal(const AjPSeqin thys)17992 static AjBool seqinUfoLocal(const AjPSeqin thys)
17993 {
17994 if(thys->Features && ! ajStrGetLen(thys->Ufo))
17995 return ajTrue;
17996
17997 return ajFalse;
17998 }
17999
18000
18001
18002
18003 /* @funcstatic seqSetName *****************************************************
18004 **
18005 ** Sets the name for a sequence object by applying simple conversion
18006 ** rules to the input which could be, for example, the name from a
18007 ** FASTA format file.
18008 **
18009 ** @param [u] thys [AjPSeq] Sequence object
18010 ** @param [r] str [const AjPStr] User supplied name.
18011 ** @return [void]
18012 **
18013 ** @release 1.0.0
18014 ** @@
18015 ******************************************************************************/
18016
seqSetName(AjPSeq thys,const AjPStr str)18017 static void seqSetName(AjPSeq thys, const AjPStr str)
18018 {
18019 if(!ajStrGetLen(str))
18020 {
18021 ajSeqSetNameMulti(thys, NULL);
18022 }
18023 else if(ajStrIsWord(str))
18024 {
18025 ajDebug("seqSetName word '%S'\n", str);
18026 ajStrTokenAssignC(&seqHandleSplit, str, ":");
18027
18028 while(ajStrTokenNextParse(seqHandleSplit, &seqTokenSplit))
18029 if(ajStrGetLen(seqTokenSplit))
18030 ajStrAssignS(&thys->Name, seqTokenSplit);
18031
18032 ajStrExchangeSetCC(&thys->Name, ",/\\", "___");
18033
18034 ajStrTokenReset(seqHandleSplit);
18035 }
18036 else
18037 {
18038 ajDebug("seqSetName non-word '%S'\n", str);
18039 ajStrAssignS(&thys->Name, str);
18040 ajStrRemoveWhiteExcess(&thys->Name);
18041 ajStrExchangeSetCC(&thys->Name, " ,;:/\\", "______");
18042 ajDebug("seqSetName cleaned '%S'\n", thys->Name);
18043 }
18044
18045 ajDebug("seqSetName '%S' result: '%S'\n", str, thys->Name);
18046
18047 ajStrDelStatic(&seqTokenSplit);
18048
18049 return;
18050 }
18051
18052
18053
18054
18055 /* @funcstatic seqitemSetName *************************************************
18056 **
18057 ** Sets the name for a multiple sequence item object by applying simple
18058 ** conversion rules to the input which could be, for example, the name from a
18059 ** FASTA format file.
18060 **
18061 ** @param [u] thys [SeqPMsfItem] Sequence item object
18062 ** @param [r] str [const AjPStr] User supplied name.
18063 ** @return [void]
18064 **
18065 ** @release 6.2.0
18066 ** @@
18067 ******************************************************************************/
18068
seqitemSetName(SeqPMsfItem thys,const AjPStr str)18069 static void seqitemSetName(SeqPMsfItem thys, const AjPStr str)
18070 {
18071 if(ajStrIsWord(str))
18072 {
18073 ajDebug("seqitemSetName word '%S'\n", str);
18074 ajStrTokenAssignC(&seqHandleSplit, str, ":");
18075
18076 while(ajStrTokenNextParse(seqHandleSplit, &seqTokenSplit))
18077 if(ajStrGetLen(seqTokenSplit))
18078 ajStrAssignS(&thys->Name, seqTokenSplit);
18079
18080 ajStrTokenReset(seqHandleSplit);
18081 }
18082 else
18083 {
18084 ajDebug("seqitemSetName non-word '%S'\n", str);
18085 ajStrAssignS(&thys->Name, str);
18086 ajStrRemoveWhiteExcess(&thys->Name);
18087 ajStrExchangeKK(&thys->Name, ' ', '_');
18088 ajDebug("seqitemSetName cleaned '%S'\n", thys->Name);
18089 }
18090
18091 ajDebug("seqitemSetName '%S' result: '%S'\n", str, thys->Name);
18092 ajStrDelStatic(&seqToken);
18093
18094 return;
18095 }
18096
18097
18098
18099
18100 /* @funcstatic seqnameSetName *************************************************
18101 **
18102 ** Sets the name usable by a sequence object by applying simple conversion
18103 ** rules to the input which could be, for example, the name from a
18104 ** FASTA format file.
18105 **
18106 ** @param [u] name [AjPStr*] Sequence name derived.
18107 ** @param [r] str [const AjPStr] User supplied name.
18108 ** @return [void]
18109 **
18110 ** @release 6.2.0
18111 ** @@
18112 ******************************************************************************/
18113
seqnameSetName(AjPStr * name,const AjPStr str)18114 static void seqnameSetName(AjPStr *name, const AjPStr str)
18115 {
18116 if(ajStrIsWord(str))
18117 {
18118 ajDebug("seqnameSetName word '%S'\n", str);
18119 ajStrTokenAssignC(&seqHandleSplit, str, ":");
18120
18121 while(ajStrTokenNextParse(seqHandleSplit, &seqTokenSplit))
18122 if(ajStrGetLen(seqTokenSplit))
18123 ajStrAssignS(name, seqTokenSplit);
18124
18125 ajStrTokenReset(seqHandleSplit);
18126 }
18127 else
18128 {
18129 ajDebug("seqnameSetName non-word '%S'\n", str);
18130 ajStrAssignS(name, str);
18131 ajStrRemoveWhiteExcess(name);
18132 ajStrExchangeKK(name, ' ', '_');
18133 ajDebug("seqnameSetName cleaned '%S'\n", *name);
18134 }
18135
18136 ajDebug("seqnameSetName '%S' result: '%S'\n", str, *name);
18137
18138 ajStrDelStatic(&seqTokenSplit);
18139
18140 return;
18141 }
18142
18143
18144
18145
18146 /* @funcstatic seqSetNameNospace **********************************************
18147 **
18148 ** Sets the name for a sequence object by applying simple conversion
18149 ** rules to the input which could be, for example, the name from a
18150 ** FASTA format file.
18151 **
18152 ** @param [u] name [AjPStr*] Sequence name derived.
18153 ** @param [r] str [const AjPStr] User supplied name.
18154 ** @return [void]
18155 **
18156 ** @release 4.1.0
18157 ** @@
18158 ******************************************************************************/
18159
seqSetNameNospace(AjPStr * name,const AjPStr str)18160 static void seqSetNameNospace(AjPStr* name, const AjPStr str)
18161 {
18162 ajStrAssignS(name, str);
18163
18164 if(!ajStrIsWord(str))
18165 {
18166 ajDebug("seqSetNameNospace non-word '%S'\n", str);
18167 ajStrRemoveWhiteExcess(name);
18168 ajStrExchangeKK(name, ' ', '_');
18169 ajDebug("seqSetNameNospace cleaned '%S'\n", *name);
18170 }
18171
18172 ajDebug("seqSetNameNospace '%S' result: '%S'\n", str, *name);
18173
18174 return;
18175 }
18176
18177
18178
18179
18180 /* @funcstatic seqSetNameFile *************************************************
18181 **
18182 ** Sets the name for a sequence object by applying simple conversion
18183 ** rules to the input source file..
18184 **
18185 ** @param [u] thys [AjPSeq] Sequence object
18186 ** @param [r] seqin [const AjPSeqin] Sequence input object
18187 ** @return [void]
18188 **
18189 ** @release 2.8.0
18190 ** @@
18191 ******************************************************************************/
18192
seqSetNameFile(AjPSeq thys,const AjPSeqin seqin)18193 static void seqSetNameFile(AjPSeq thys, const AjPSeqin seqin)
18194 {
18195 AjPStr tmpname = NULL;
18196
18197 ajStrAssignS(&tmpname, seqin->Input->Filename);
18198
18199 seqSetName(thys, tmpname);
18200
18201 if(ajTextinGetCount(seqin->Input) > 1)
18202 ajFmtPrintAppS(&thys->Name, "_%3d", ajTextinGetCount(seqin->Input));
18203
18204 ajDebug("seqSetNameFile '%S' result: '%S'\n", tmpname, thys->Name);
18205 ajStrDel(&tmpname);
18206
18207 return;
18208 }
18209
18210
18211
18212
18213 /* @funcstatic seqAccSave *****************************************************
18214 **
18215 ** Adds an accession number to the stored list for a sequence.
18216 ** The first accession number is also saved as the primary number.
18217 **
18218 ** @param [u] thys [AjPSeq] Sequence object
18219 ** @param [r] acc [const AjPStr] Accession number
18220 ** @return [void]
18221 **
18222 ** @release 1.0.0
18223 ** @@
18224 ******************************************************************************/
18225
seqAccSave(AjPSeq thys,const AjPStr acc)18226 static void seqAccSave(AjPSeq thys, const AjPStr acc)
18227 {
18228 if(!thys->Acclist)
18229 thys->Acclist = ajListstrNew();
18230
18231 ajListstrPushAppend(thys->Acclist, ajStrNewS(acc));
18232
18233 if(!ajStrGetLen(thys->Acc))
18234 ajStrAssignS(&thys->Acc, acc);
18235
18236 return;
18237 }
18238
18239
18240
18241
18242 /* @funcstatic seqTaxSave *****************************************************
18243 **
18244 ** Adds an organism taxonomy level to the stored list for a sequence.
18245 ** The first is also saved as the primary 'Tax' (should be the species).
18246 **
18247 ** @param [u] thys [AjPSeq] Sequence object
18248 ** @param [r] tax [const AjPStr] Organism taxonomy
18249 ** @param [r] level [ajuint] 0: taxon level 1: species
18250 ** 2: organelle 3: common name
18251 ** @return [void]
18252 **
18253 ** @release 2.4.0
18254 ** @@
18255 ******************************************************************************/
18256
seqTaxSave(AjPSeq thys,const AjPStr tax,ajuint level)18257 static void seqTaxSave(AjPSeq thys, const AjPStr tax, ajuint level)
18258 {
18259 AjPStr newstr = NULL;
18260 AjBool done = ajFalse;
18261
18262 switch(level)
18263 {
18264 case 1:
18265 if(!ajStrGetLen(thys->Tax))
18266 ajStrAssignS(&thys->Tax, tax);
18267 done = ajTrue;
18268 break;
18269 case 2:
18270 if(!ajStrGetLen(thys->Organelle))
18271 ajStrAssignS(&thys->Organelle, tax);
18272 done = ajTrue;
18273 break;
18274 case 3:
18275 if(!ajStrGetLen(thys->Taxcommon))
18276 ajStrAssignS(&thys->Taxcommon, tax);
18277 done = ajTrue;
18278 break;
18279 default:
18280 done = ajFalse;
18281 break;
18282 }
18283
18284 if(!done)
18285 {
18286 if(!thys->Taxlist)
18287 thys->Taxlist = ajListstrNew();
18288 newstr = ajStrNewS(tax);
18289 ajListstrPushAppend(thys->Taxlist, newstr);
18290 }
18291
18292 return;
18293 }
18294
18295
18296
18297
18298 /* @funcstatic seqTaxidSaveI **************************************************
18299 **
18300 ** Adds an organism NCBI taxonomy id to the stored list for a sequence.
18301 **
18302 ** @param [u] thys [AjPSeq] Sequence object
18303 ** @param [r] tax [ajuint] Organism NCBI taxonomy id
18304 ** @return [void]
18305 **
18306 ** @release 6.1.0
18307 ** @@
18308 ******************************************************************************/
18309
seqTaxidSaveI(AjPSeq thys,ajuint tax)18310 static void seqTaxidSaveI(AjPSeq thys, ajuint tax)
18311 {
18312 if(tax && !ajStrGetLen(thys->Taxid))
18313 ajStrFromUint(&thys->Taxid, tax);
18314
18315 return;
18316 }
18317
18318
18319
18320
18321 /* @funcstatic seqTaxidSaveS **************************************************
18322 **
18323 ** Adds an organism NCBI taxonomy id to the stored list for a sequence.
18324 **
18325 ** @param [u] thys [AjPSeq] Sequence object
18326 ** @param [r] tax [const AjPStr] Organism NCBI taxonomy id
18327 ** @return [void]
18328 **
18329 ** @release 6.1.0
18330 ** @@
18331 ******************************************************************************/
18332
seqTaxidSaveS(AjPSeq thys,const AjPStr tax)18333 static void seqTaxidSaveS(AjPSeq thys, const AjPStr tax)
18334 {
18335 if(!ajStrGetLen(thys->Taxid))
18336 ajStrAssignS(&thys->Taxid, tax);
18337
18338 return;
18339 }
18340
18341
18342
18343
18344 /* @funcstatic seqSvSave ******************************************************
18345 **
18346 ** Adds a sequence version number to the stored data for a sequence.
18347 **
18348 ** @param [u] thys [AjPSeq] Sequence object
18349 ** @param [r] sv [const AjPStr] SeqVersion number
18350 ** @return [void]
18351 **
18352 ** @release 2.4.0
18353 ** @@
18354 ******************************************************************************/
18355
seqSvSave(AjPSeq thys,const AjPStr sv)18356 static void seqSvSave(AjPSeq thys, const AjPStr sv)
18357 {
18358 if(!ajStrGetLen(thys->Sv))
18359 ajStrAssignS(&thys->Sv, sv);
18360
18361 return;
18362 }
18363
18364
18365
18366
18367 /* ==================================================================== */
18368 /* ========================= constructors ============================= */
18369 /* ==================================================================== */
18370
18371
18372
18373
18374
18375
18376
18377 /* ==================================================================== */
18378 /* ======================== Operators ==================================*/
18379 /* ==================================================================== */
18380
18381
18382
18383
18384 /* @section Sequence Query Operators ******************************************
18385 **
18386 ** These functions use the contents of a sequence query object but do
18387 ** not make any changes.
18388 **
18389 ******************************************************************************/
18390
18391
18392
18393
18394 /* @funcstatic seqQueryMatch **************************************************
18395 **
18396 ** Compares a sequence to a query and returns true if they match.
18397 **
18398 ** @param [r] thys [const AjPQuery] Sequence query.
18399 ** @param [r] seq [const AjPSeq] Sequence.
18400 ** @return [AjBool] ajTrue if the sequence matches the query.
18401 **
18402 ** @release 1.0.0
18403 ** @@
18404 ******************************************************************************/
18405
seqQueryMatch(const AjPQuery thys,const AjPSeq seq)18406 static AjBool seqQueryMatch(const AjPQuery thys, const AjPSeq seq)
18407 {
18408 AjBool tested = ajFalse;
18409 AjIList iter = NULL;
18410 AjIList iterfield = NULL;
18411 AjPStr accstr; /* from list, do not delete */
18412 AjPStr keystr; /* from list, do not delete */
18413 AjPStr taxstr; /* from list, do not delete */
18414 AjPQueryField field = NULL;
18415 AjBool ok = ajFalse;
18416
18417 ajDebug("seqQueryMatch '%S' fields: %Lu Case %B Done %B\n",
18418 seq->Name, ajListGetLength(thys->QueryFields),
18419 thys->CaseId, thys->QryDone);
18420
18421 if(!thys) /* no query to test, that's fine */
18422 return ajTrue;
18423
18424 if(thys->QryDone) /* do we need to test here? */
18425 return ajTrue;
18426
18427 /* test the query field(s) */
18428
18429 iterfield = ajListIterNewread(thys->QueryFields);
18430 while(!ajListIterDone(iterfield))
18431 {
18432 field = ajListIterGet(iterfield);
18433
18434 ajDebug(" field: '%S' Query: '%S'\n",
18435 field->Field, field->Wildquery);
18436 if(ajStrMatchC(field->Field, "id"))
18437 {
18438 ajDebug(" id test: '%S'\n",
18439 seq->Name);
18440 if(thys->CaseId)
18441 {
18442 if(ajStrMatchWildS(seq->Name, field->Wildquery))
18443 {
18444 ajListIterDel(&iterfield);
18445 return ajTrue;
18446 }
18447 }
18448 else
18449 {
18450 if(ajStrMatchWildCaseS(seq->Name, field->Wildquery))
18451 {
18452 ajListIterDel(&iterfield);
18453 return ajTrue;
18454 }
18455 }
18456
18457 ajDebug("id test failed\n");
18458 tested = ajTrue;
18459 }
18460
18461 else if(ajStrMatchC(field->Field, "sv")) /* test Sv and Gi */
18462 {
18463 ajDebug(" sv test: '%S'\n",
18464 seq->Sv);
18465 if(ajStrMatchWildCaseS(seq->Sv, field->Wildquery))
18466 {
18467 ajListIterDel(&iterfield);
18468 return ajTrue;
18469 }
18470
18471 ajDebug("sv test failed\n");
18472 tested = ajTrue;
18473 }
18474
18475 else if(ajStrMatchC(field->Field, "gi")) /* test Sv and Gi */
18476 {
18477 ajDebug(" gi test: '%S'\n",
18478 seq->Gi);
18479 if(ajStrMatchWildCaseS(seq->Gi, field->Wildquery))
18480 {
18481 ajListIterDel(&iterfield);
18482 return ajTrue;
18483 }
18484
18485 ajDebug("gi test failed\n");
18486 tested = ajTrue;
18487 }
18488
18489 else if(ajStrMatchC(field->Field, "acc"))
18490 {
18491 ajDebug(" acc test:%Lu\n",
18492 ajListGetLength(seq->Acclist));
18493 if(ajListGetLength(seq->Acclist))
18494 { /* accession number test - check the entire list */
18495 iter = ajListIterNewread(seq->Acclist);
18496
18497 while(!ajListIterDone(iter))
18498 {
18499 accstr = ajListIterGet(iter);
18500 ajDebug("... try accession '%S' '%S'\n", accstr,
18501 field->Wildquery);
18502
18503 if(ajStrMatchWildCaseS(accstr, field->Wildquery))
18504 {
18505 ajListIterDel(&iterfield);
18506 ajListIterDel(&iter);
18507
18508 return ajTrue;
18509 }
18510 }
18511 }
18512
18513 tested = ajTrue;
18514 ajDebug("acc test failed\n");
18515 ajListIterDel(&iter);
18516 }
18517
18518 else if(ajStrMatchC(field->Field, "org"))
18519 {
18520 ajDebug(" org test:%Lu\n",
18521 ajListGetLength(seq->Taxlist));
18522 if(ajListGetLength(seq->Taxlist))
18523 { /* taxonomy test - check the entire list */
18524 iter = ajListIterNewread(seq->Taxlist);
18525
18526 while(!ajListIterDone(iter))
18527 {
18528 taxstr = ajListIterGet(iter);
18529 ajDebug("... try organism '%S' '%S'\n", taxstr,
18530 field->Wildquery);
18531
18532 if(ajStrMatchWildCaseS(taxstr, field->Wildquery))
18533 {
18534 ajListIterDel(&iterfield);
18535 ajListIterDel(&iter);
18536
18537 return ajTrue;
18538 }
18539 }
18540
18541 tested = ajTrue;
18542 ajDebug("org test failed\n");
18543 ajListIterDel(&iter);
18544 }
18545 else
18546 {
18547 ajDebug("org test failed - nothing to test\n");
18548
18549 return ajFalse;
18550 }
18551 }
18552
18553 else if(ajStrMatchC(field->Field, "key"))
18554 {
18555 ajDebug(" key test:%Lu\n",
18556 ajListGetLength(seq->Keylist));
18557 if(ajListGetLength(seq->Keylist))
18558 { /* keyword test - check the entire list */
18559 iter = ajListIterNewread(seq->Keylist);
18560
18561 while(!ajListIterDone(iter))
18562 {
18563 keystr = ajListIterGet(iter);
18564 ajDebug("... try keyword '%S' '%S'\n", keystr,
18565 field->Wildquery);
18566
18567 if(ajStrMatchWildCaseS(keystr, field->Wildquery))
18568 {
18569 ajListIterDel(&iterfield);
18570 ajListIterDel(&iter);
18571
18572 return ajTrue;
18573 }
18574 }
18575
18576 tested = ajTrue;
18577 ajDebug("key test failed\n");
18578 ajListIterDel(&iter);
18579 }
18580 else
18581 {
18582 ajDebug("key test failed - nothing to test\n");
18583 ajListIterDel(&iterfield);
18584
18585 return ajFalse;
18586 }
18587 }
18588
18589 else if(ajStrMatchC(field->Field, "des"))
18590 {
18591 ajDebug(" des test: '%S'\n",
18592 seq->Desc);
18593 if(ajStrGetLen(seq->Desc))
18594 { /* description test - check the string */
18595 ajDebug("... try description '%S' '%S'\n", seq->Desc,
18596 field->Wildquery);
18597
18598 if(ajStrMatchWildWordCaseS(seq->Desc, field->Wildquery))
18599 {
18600 ajListIterDel(&iterfield);
18601 return ajTrue;
18602 }
18603
18604 tested = ajTrue;
18605 ajDebug("des test failed\n");
18606 ajListIterDel(&iter);
18607 }
18608 else
18609 {
18610 ajDebug("des test failed - nothing to test\n");
18611 ajListIterDel(&iterfield);
18612 return ajFalse;
18613 }
18614 }
18615 else
18616 {
18617 ajErr("Unknown query field '%S' in query '%S'",
18618 thys->SingleField, thys->QryString);
18619 tested = ajTrue;
18620 }
18621
18622 }
18623
18624 ajListIterDel(&iterfield);
18625
18626 if(!tested) /* nothing to test, so accept it anyway */
18627 {
18628 if(ajListGetLength(thys->QueryFields))
18629 {
18630 ajErr("");
18631 return ajFalse;
18632 }
18633
18634 ajDebug(" no tests: assume OK\n");
18635 return ajTrue;
18636 }
18637
18638 ajDebug("result: %B\n", ok);
18639
18640 return ok;
18641 }
18642
18643
18644
18645
18646
18647 /* @func ajSeqParseFasta ******************************************************
18648 **
18649 ** Parse an NCBI format fasta line. Return id acc sv and description
18650 **
18651 ** @param [r] instr [const AjPStr] fasta line.
18652 ** @param [w] id [AjPStr*] id.
18653 ** @param [w] acc [AjPStr*] accession number.
18654 ** @param [w] sv [AjPStr*] sequence version number.
18655 ** @param [w] desc [AjPStr*] description.
18656 ** @return [AjBool] ajTrue if fasta format
18657 **
18658 ** @release 2.0.0
18659 ** @@
18660 ******************************************************************************/
18661
ajSeqParseFasta(const AjPStr instr,AjPStr * id,AjPStr * acc,AjPStr * sv,AjPStr * desc)18662 AjBool ajSeqParseFasta(const AjPStr instr, AjPStr* id, AjPStr* acc,
18663 AjPStr* sv, AjPStr* desc)
18664 {
18665 AjBool ok = ajFalse;
18666
18667 ajDebug("ajSeqParseFasta '%S'\n", instr);
18668
18669 if(!ajStrPrefixC(instr, ">"))
18670 return ajFalse;
18671
18672 ajStrTokenAssignC(&seqHandle, instr, "> ");
18673 ajStrTokenNextParseC(seqHandle, " \t\n\r", id);
18674
18675 ok = ajStrTokenNextParse(seqHandle, &seqToken);
18676 ajStrAssignS(&seqToken2, seqToken);
18677 ajStrRemoveSetC(&seqToken2, "()");
18678
18679 if(ok && ajSeqtestIsSeqversion(seqToken2))
18680 {
18681 ajStrAssignS(acc, ajSeqtestIsSeqversion(seqToken2));
18682 ajStrAssignS(sv, seqToken2);
18683 ajStrTokenNextParseC(seqHandle, "\n\r", desc);
18684 }
18685 else if(ok && ajSeqtestIsAccession(seqToken2))
18686 {
18687 ajStrAssignS(acc, seqToken2);
18688 ajStrAssignClear(sv);
18689 ajStrTokenNextParseC(seqHandle, "\n\r", desc);
18690 }
18691 else if(ok)
18692 {
18693 ajStrAssignClear(acc);
18694 ajStrAssignClear(sv);
18695 ajStrAssignS(desc, seqToken);
18696
18697 if(ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken))
18698 {
18699 ajStrAppendC(desc, " ");
18700 ajStrAppendS(desc, seqToken);
18701 }
18702 }
18703
18704 ajStrDelStatic(&seqToken); /* duplicate of accession or description */
18705 ajStrDelStatic(&seqToken2);
18706 ajStrTokenReset(seqHandle);
18707
18708 ajDebug("result id: '%S' acc: '%S' desc: '%S'\n", *id, *acc, *desc);
18709
18710 return ajTrue;
18711 }
18712
18713
18714
18715
18716 /* @func ajSeqParseNcbi *******************************************************
18717 **
18718 ** Parse an NCBI format fasta line. Return id acc and description.
18719 **
18720 ** Tries to cope with the amazing variety of identifiers NCBI inflicts
18721 ** on us all - see the BLAST document README.formatdb from NCBI for
18722 ** some of the gory detail, and look at some real files for clues
18723 ** to what can really happen. Sadly,'real files' also includes
18724 ** internal IDs in blast databases reformatted by formatdb.
18725 **
18726 ** @param [r] instr [const AjPStr] fasta line.
18727 ** @param [w] id [AjPStr*] id.
18728 ** @param [w] acc [AjPStr*] accession number.
18729 ** @param [w] sv [AjPStr*] sequence version number.
18730 ** @param [w] gi [AjPStr*] GI version number.
18731 ** @param [w] db [AjPStr*] NCBI database name
18732 ** @param [w] desc [AjPStr*] description.
18733 ** @return [AjBool] ajTrue if ncbi format
18734 **
18735 ** @release 1.0.0
18736 ** @@
18737 ******************************************************************************/
18738
ajSeqParseNcbi(const AjPStr instr,AjPStr * id,AjPStr * acc,AjPStr * sv,AjPStr * gi,AjPStr * db,AjPStr * desc)18739 AjBool ajSeqParseNcbi(const AjPStr instr, AjPStr* id, AjPStr* acc,
18740 AjPStr* sv, AjPStr* gi, AjPStr* db, AjPStr* desc)
18741 {
18742 AjPStr idstr = NULL;
18743 AjPStr reststr = NULL;
18744 AjPStr prefix = NULL;
18745 AjPStr numtoken = NULL;
18746 AjPStr str = NULL;
18747 const AjPStr vacc = NULL;
18748 const char *q;
18749 ajuint i;
18750 ajuint nt;
18751 AjBool ret = ajFalse;
18752
18753 ajStrAssignClear(db);
18754
18755 /* NCBI's list of standard identifiers June 2001
18756 ** ftp://ncbi.nlm.nih.gov/blast/db/README.formatdb
18757 **
18758 ** Database Name Identifier Syntax
18759 **
18760 ** GenBank gb|accession|locus
18761 ** EMBL Data Library emb|accession|locus
18762 ** DDBJ, DNA Database of Japan dbj|accession|locus
18763 ** SWISS-PROT sp|accession|entry name
18764 ** NCBI Reference Sequence ref|accession|locus
18765 **
18766 ** General database identifier gnl|database|identifier
18767 ** BLAST formatdb gnl|BL_ORD_ID|number
18768 ** (prefix for normal FASTA header - remove)
18769 **
18770 ** NBRF PIR pir||entry
18771 ** Protein Research Foundation prf||name
18772 ** (Japanese SEQDB protein DB)
18773 **
18774 ** Brookhaven Protein Data Bank pdb|entry|chain
18775 **
18776 ** Patents pat|country|number
18777 **
18778 ** GenInfo Backbone Id bbs|number
18779 ** Local Sequence identifier lcl|identifier
18780 **
18781 ** GenInfo identifier prefix gi|gi_identifier
18782 ** (prefix - remove)
18783 */
18784
18785 /* ajDebug("ajSeqParseNcbi '%S'\n", instr);*/
18786
18787 if(ajStrGetCharPos(instr, 3) == ';') /* then it is really PIR format */
18788 {
18789 ajDebug("ajSeqParseNcbi failed: this is PIR format\n");
18790
18791 return ajFalse;
18792 }
18793
18794 ajStrAssignS(&str, instr);
18795
18796 /* ajDebug("id test %B %B\n",
18797 !strchr(MAJSTRGETPTR(str), (ajint)'|'),
18798 (*MAJSTRGETPTR(str)!='>')); */
18799
18800 /* Line must start with '>', and include '|' bar, hopefully in the ID */
18801
18802 if(*MAJSTRGETPTR(str)!='>')
18803 {
18804 ajDebug("ajSeqParseNcbi failed: no '>' at start\n");
18805 ajStrDel(&str);
18806
18807 return ajFalse;
18808 }
18809
18810 /* pick out the ID */
18811
18812 ajStrTokenAssignC(&seqHandle2,str,"> \t\r\n");
18813 ajStrTokenNextParse(seqHandle2, &idstr);
18814 ajStrTokenNextParseC(seqHandle2, "\r\n", &reststr);
18815 ajStrTokenReset(seqHandle2);
18816
18817 /* check we have an ID */
18818
18819 if(!ajStrGetLen(idstr))
18820 {
18821 ajDebug("No ID string found - but try FASTA\n");
18822 ret = ajSeqParseFasta(str, id, acc, sv, desc);
18823 ajStrDel(&str);
18824 ajStrDel(&idstr);
18825 ajStrDel(&reststr);
18826
18827 return ret;
18828 }
18829
18830 /* NCBI ids always have | somewhere. Else we try a simple FASTA format */
18831
18832 if(!strchr(MAJSTRGETPTR(idstr),(ajint)'|'))
18833 {
18834 ajDebug("trying ajSeqParseFasta\n");
18835 ret = ajSeqParseFasta(str, id, acc, sv, desc);
18836 ajStrDel(&str);
18837 ajStrDel(&idstr);
18838 ajStrDel(&reststr);
18839
18840 return ret;
18841 }
18842
18843 ajStrAssignClear(id);
18844 ajStrTokenAssignC(&seqHandle,idstr,"|");
18845
18846 ajStrTokenNextParse(seqHandle, &prefix);
18847 q = MAJSTRGETPTR(prefix);
18848
18849 /*
18850 // ajDebug(" idstr: '%S'\n", idstr);
18851 // ajDebug("prefix: '%S'\n", prefix);
18852 */
18853
18854 if(!strncmp(q,"gi",2))
18855 {
18856 /* ajDebug("gi prefix\n"); */
18857 ajStrTokenNextParse(seqHandle, gi);
18858
18859 if(! ajStrTokenNextParse(seqHandle, &prefix))
18860 {
18861 /* we only have a gi prefix */
18862 ajDebug("*only* gi prefix\n");
18863 ajStrAssignS(id, *gi);
18864 ajStrAssignClear(acc);
18865 ajStrAssignS(desc, reststr);
18866 ajDebug("found pref: '%S' id: '%S', acc: '%S' "
18867 "desc: '%S'\n",
18868 prefix, *id, *acc, *desc);
18869 ajStrDel(&str);
18870 ajStrDel(&idstr);
18871 ajStrDel(&reststr);
18872 ajStrDel(&prefix);
18873 ajStrDelStatic(&seqToken);
18874 ajStrTokenReset(seqHandle);
18875
18876 return ajTrue;
18877 }
18878
18879 /* otherwise we continue to parse the rest */
18880 q = MAJSTRGETPTR(prefix);
18881 ajDebug("continue with '%S'\n", prefix);
18882 }
18883
18884
18885 /*
18886 * This next routine and associated function could be used if
18887 * whatever is appended to gnl lines is consistent
18888 */
18889
18890 if(!strncmp(MAJSTRGETPTR(idstr),"gnl|BL_ORD_ID|",14))
18891 {
18892 /* ajDebug("gnl|BL_ORD_ID stripping\n"); */
18893 ajStrTokenStep(seqHandle); /* BL_ORD_ID */
18894 ajStrTokenStep(seqHandle); /* number */
18895 ajStrInsertC(&reststr, 0, ">");
18896 ajStrTokenReset(seqHandle);
18897
18898 if(ajSeqParseNcbi(reststr,id,acc,sv,gi,db,desc))
18899 {
18900 ajStrAssignEmptyC(db, "BL_ORD_ID");
18901 /* recursive ... */
18902 ajDebug("ajSeqParseNcbi recursive success '%S'\n", reststr);
18903 /* ajDebug("found pref: '%S' id: '%S', acc: '%S' "
18904 "sv: '%S' desc: '%S'\n",
18905 prefix, *id, *acc, *sv, *desc); */
18906 ajStrDel(&str);
18907 ajStrDel(&idstr);
18908 ajStrDel(&reststr);
18909 ajStrDel(&prefix);
18910 ajStrDel(&numtoken);
18911 ajStrDelStatic(&seqToken);
18912 ajStrTokenReset(seqHandle);
18913
18914 return ajTrue;
18915 }
18916 ajDebug("ajSeqParseNcbi recursive failed '%S' - use gnl id\n",
18917 reststr);
18918 ajStrAssignS(id,numtoken);
18919 ajStrAssignClear(acc);
18920 /* ajDebug("found pref: '%S' id: '%S', acc: '%S' "
18921 "sv: '%S' desc: '%S'\n",
18922 prefix, *id, *acc, *sv, *desc); */
18923 ajStrDel(&str);
18924 ajStrDel(&idstr);
18925 ajStrDel(&reststr);
18926 ajStrDel(&prefix);
18927 ajStrDel(&numtoken);
18928 ajStrDelStatic(&seqToken);
18929 ajStrTokenDel(&seqHandle);
18930
18931 return ajTrue;
18932 }
18933
18934 /* works for NCBI formatdb reformatted blast databases
18935 ** still checking for any mis-formatted databases elsewhere */
18936
18937 if(!strcmp(q,"bbs") || !strcmp(q,"lcl"))
18938 {
18939 if(!strcmp(q, "lcl"))
18940 ajStrAssignS(db, prefix);
18941
18942 /* ajDebug("bbs or lcl prefix\n"); */
18943 ajStrTokenNextParse(seqHandle, id);
18944 ajStrAssignClear(acc);
18945 ajStrAssignS(desc, reststr);
18946 /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
18947 prefix, *id, *acc, *desc); */
18948 ajStrDel(&str);
18949 ajStrDel(&idstr);
18950 ajStrDel(&reststr);
18951 ajStrDel(&prefix);
18952 ajStrDel(&numtoken);
18953 ajStrDelStatic(&seqToken);
18954 ajStrTokenReset(seqHandle);
18955
18956 return ajTrue;
18957 }
18958
18959 if(!strcmp(q,"gnl") || !strcmp(q,"pat"))
18960 {
18961 /* ajDebug("gnl or pat prefix\n"); */
18962 if(!strcmp(q,"gnl"))
18963 ajStrTokenNextParse(seqHandle, db);
18964 else
18965 ajStrTokenStep(seqHandle);
18966
18967 ajStrTokenNextParse(seqHandle, id);
18968 ajStrAssignClear(acc); /* no accession number */
18969 ajStrAssignS(desc, reststr);
18970 /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
18971 prefix, *id, *acc, *desc); */
18972 ajStrDel(&str);
18973 ajStrDel(&idstr);
18974 ajStrDel(&reststr);
18975 ajStrDel(&prefix);
18976 ajStrDel(&numtoken);
18977 ajStrDel(&seqToken);
18978 ajStrTokenReset(seqHandle);
18979
18980 return ajTrue;
18981 }
18982
18983
18984 if(!strcmp(q,"pdb"))
18985 {
18986 ajStrAssignS(db, prefix);
18987 /* ajDebug("gnl or pat or pdb prefix\n"); */
18988 ajStrTokenNextParse(seqHandle, id);
18989
18990 if(ajStrTokenNextParse(seqHandle, &seqToken))
18991 {
18992 /* chain identifier to append */
18993 ajStrAppendS(id, seqToken);
18994 }
18995
18996 ajStrAssignClear(acc); /* no accession number */
18997 ajStrAssignS(desc, reststr);
18998 /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
18999 prefix, *id, *acc, *desc); */
19000 ajStrDel(&str);
19001 ajStrDel(&idstr);
19002 ajStrDel(&reststr);
19003 ajStrDel(&prefix);
19004 ajStrDel(&numtoken);
19005 ajStrDelStatic(&seqToken);
19006 ajStrTokenReset(seqHandle);
19007
19008 return ajTrue;
19009 }
19010
19011
19012 if(!strcmp(q,"gb") || !strcmp(q,"emb") || !strcmp(q,"dbj")
19013 || !strcmp(q,"tpd") || !strcmp(q,"tpd") || !strcmp(q,"tpg")
19014 || !strcmp(q,"sp") || !strcmp(q,"ref"))
19015 {
19016 /* ajDebug("gb,emb,dbj,sp,ref prefix\n"); */
19017 ajStrAssignS(db, prefix);
19018 ajStrTokenNextParse(seqHandle, &seqToken);
19019 vacc = ajSeqtestIsSeqversion(seqToken);
19020
19021 if(vacc)
19022 {
19023 ajStrAssignS(sv,seqToken);
19024 ajStrAssignS(acc,vacc);
19025 }
19026 else if(ajSeqtestIsAccession(seqToken))
19027 ajStrAssignS(acc,seqToken);
19028
19029 if(!ajStrTokenNextParse(seqHandle, id))
19030 {
19031 /* no ID, reuse accession token */
19032 ajStrAssignS(id, seqToken);
19033 }
19034
19035 ajStrAssignS(desc, reststr);
19036 /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
19037 prefix, *id, *acc, *desc); */
19038 ajStrDel(&str);
19039 ajStrDel(&idstr);
19040 ajStrDel(&reststr);
19041 ajStrDel(&prefix);
19042 ajStrDel(&numtoken);
19043 ajStrDelStatic(&seqToken);
19044 ajStrTokenReset(seqHandle);
19045
19046 return ajTrue;
19047 }
19048
19049
19050 if(!strcmp(q,"pir") || !strcmp(q,"prf"))
19051 {
19052 ajStrAssignS(db, prefix);
19053 /* ajDebug("pir,prf prefix\n"); */
19054 ajStrTokenNextParse(seqHandle, id);
19055 ajStrAssignS(desc, reststr);
19056 ajStrAssignClear(acc);
19057 /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
19058 prefix, *id, *acc, *desc); */
19059 ajStrDel(&str);
19060 ajStrDel(&idstr);
19061 ajStrDel(&reststr);
19062 ajStrDel(&prefix);
19063 ajStrDel(&numtoken);
19064 ajStrDelStatic(&seqToken);
19065 ajStrTokenReset(seqHandle);
19066
19067 return ajTrue;
19068 }
19069
19070
19071 /* else assume that the last two barred tokens contain [acc]|id */
19072
19073 ajDebug("No prefix accepted - try the last 2 fields\n");
19074
19075 nt = ajStrParseCountC(idstr,"|");
19076
19077 if(ajStrGetCharLast(idstr) == '|')
19078 nt++;
19079
19080 ajDebug("Barred tokens - %d found\n", nt);
19081
19082 if(nt < 2)
19083 {
19084 ajStrDel(&str);
19085 ajStrDel(&idstr);
19086 ajStrDel(&reststr);
19087 ajStrDel(&prefix);
19088 ajStrDel(&numtoken);
19089 ajStrDelStatic(&seqToken);
19090 ajStrTokenReset(seqHandle);
19091
19092 return ajFalse;
19093 }
19094
19095 /* restart parsing with only bars */
19096
19097 ajStrTokenAssignC(&seqHandle,idstr,"|");
19098
19099 for(i=0;i<nt-3;++i)
19100 ajStrTokenStep(seqHandle);
19101
19102 ajStrTokenNextParse(seqHandle, &seqToken);
19103
19104 ajStrAssignS(db, seqToken);
19105 ajStrTokenNextParse(seqHandle, &seqToken);
19106 ajDebug("token acc: '%S'\n", seqToken);
19107 vacc = ajSeqtestIsSeqversion(seqToken);
19108
19109 if(vacc)
19110 {
19111 ajStrAssignS(sv,seqToken);
19112 ajStrAssignS(acc,vacc);
19113 ajStrAssignS(id,vacc);
19114 }
19115 else if(ajSeqtestIsAccession(seqToken))
19116 {
19117 ajStrAssignS(acc,seqToken);
19118 ajStrAssignS(id,seqToken);
19119 }
19120 else
19121 {
19122 ajStrAssignS(id,seqToken);
19123 }
19124
19125
19126 if(ajStrTokenNextParseC(seqHandle, " \n\t\r", &seqToken))
19127 {
19128 ajDebug("token id: '%S'\n", seqToken);
19129
19130 if(ajStrGetLen(seqToken))
19131 ajStrAssignS(id,seqToken);
19132 }
19133
19134 ajStrTokenStepC(seqHandle, "\n\r");
19135 ajStrAssignS(desc, reststr);
19136 ajStrTokenReset(seqHandle);
19137 ajStrDelStatic(&seqToken);
19138 /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
19139 prefix, *id, *acc, *desc); */
19140
19141 ajStrDel(&str);
19142 ajStrDel(&idstr);
19143 ajStrDel(&reststr);
19144 ajStrDel(&prefix);
19145 ajStrDel(&numtoken);
19146
19147 return ajTrue;
19148 }
19149
19150
19151
19152
19153 /* @func ajSeqParseFastq ******************************************************
19154 **
19155 ** Parse a fastq id line. Return id acc sv and description
19156 **
19157 ** @param [r] instr [const AjPStr] fastq line.
19158 ** @param [w] id [AjPStr*] id.
19159 ** @param [w] desc [AjPStr*] description.
19160 ** @return [AjBool] ajTrue if fastq format
19161 **
19162 ** @release 6.1.0
19163 ** @@
19164 ******************************************************************************/
19165
ajSeqParseFastq(const AjPStr instr,AjPStr * id,AjPStr * desc)19166 AjBool ajSeqParseFastq(const AjPStr instr, AjPStr* id, AjPStr* desc)
19167 {
19168 AjPStr str = NULL;
19169
19170 /*ajDebug("ajSeqParseFastq '%S'\n", instr);*/
19171
19172 if(!ajStrPrefixC(instr, "@"))
19173 return ajFalse;
19174
19175 ajStrExtractWord(instr, desc, &str);
19176 ajStrTrimC(desc, "\n");
19177 ajStrAssignSubS(id, str, 1, -1);
19178
19179 ajStrDel(&str);
19180 return ajTrue;
19181 }
19182
19183
19184
19185
19186 /* @func ajSeqGetFromUsaRange *************************************************
19187 **
19188 ** Returns a sequence given a USA
19189 **
19190 ** @param [r] thys [const AjPStr] USA
19191 ** @param [r] protein [AjBool] True if protein
19192 ** @param [r] ibegin [ajint] sequence start position
19193 ** @param [r] iend [ajint] sequence end position
19194 ** @param [u] seq [AjPSeq] sequence
19195 ** @return [AjBool] ajTrue on success
19196 **
19197 ** @release 6.4.0
19198 ** @@
19199 ******************************************************************************/
19200
ajSeqGetFromUsaRange(const AjPStr thys,AjBool protein,ajint ibegin,ajint iend,AjPSeq seq)19201 AjBool ajSeqGetFromUsaRange(const AjPStr thys, AjBool protein,
19202 ajint ibegin, ajint iend, AjPSeq seq)
19203 {
19204 AjPSeqin seqin;
19205 AjBool ok;
19206
19207 seqin = NULL;
19208
19209 ajSeqinUsa(&seqin, thys);
19210
19211 if(ibegin!=0 || iend!=0)
19212 ajSeqinSetRange(seqin, ibegin, iend);
19213
19214 seqin->Input->Multi = ajFalse;
19215 seqin->Input->Text = ajFalse;
19216
19217 if(!protein)
19218 ajSeqinSetNuc(seqin);
19219 else
19220 ajSeqinSetProt(seqin);
19221
19222 ok = ajSeqRead(seq, seqin);
19223 ajSeqinDel(&seqin);
19224
19225 if(!ok)
19226 return ajFalse;
19227
19228 return ajTrue;
19229 }
19230
19231
19232
19233
19234 /* @func ajSeqGetFromUsa ******************************************************
19235 **
19236 ** Returns a sequence given a USA
19237 **
19238 ** @param [r] thys [const AjPStr] USA
19239 ** @param [r] protein [AjBool] True if protein
19240 ** @param [u] seq [AjPSeq] sequence
19241 ** @return [AjBool] ajTrue on success
19242 **
19243 ** @release 1.8.0
19244 ** @@
19245 ******************************************************************************/
19246
ajSeqGetFromUsa(const AjPStr thys,AjBool protein,AjPSeq seq)19247 AjBool ajSeqGetFromUsa(const AjPStr thys, AjBool protein, AjPSeq seq)
19248 {
19249
19250 return ajSeqGetFromUsaRange(thys, protein, 0, 0, seq);
19251 }
19252
19253
19254
19255
19256 /* @func ajSeqsetGetFromUsa ***************************************************
19257 **
19258 ** Return a seqset given a usa
19259 **
19260 ** @param [r] thys [const AjPStr] usa
19261 ** @param [w] seq [AjPSeqset*] seqset
19262 ** @return [AjBool] ajTrue on success
19263 **
19264 ** @release 2.7.0
19265 ******************************************************************************/
19266
ajSeqsetGetFromUsa(const AjPStr thys,AjPSeqset * seq)19267 AjBool ajSeqsetGetFromUsa(const AjPStr thys, AjPSeqset *seq)
19268 {
19269 AjPSeqin seqin;
19270 AjBool ok;
19271
19272 seqin = ajSeqinNew();
19273 seqin->Input->Multi = ajTrue;
19274 seqin->Input->Text = ajFalse;
19275
19276 ajSeqinUsa(&seqin, thys);
19277 ok = ajSeqsetRead(*seq, seqin);
19278 ajSeqinDel(&seqin);
19279
19280 if(!ok)
19281 return ajFalse;
19282
19283 return ajTrue;
19284 }
19285
19286
19287
19288
19289 /* @funcstatic seqTextSeq *****************************************************
19290 **
19291 ** Saves a sequence from a string into the text output pointer
19292 **
19293 ** Could do some extra formatting here (left margin, numbering)
19294 ** but as the EMBOSS formats are not too fussy that can wait.
19295 **
19296 ** @param [w] textptr [AjPStr*] Text output
19297 ** @param [r] seq [const AjPStr] sequence as a string
19298 ** @return [void]
19299 **
19300 ** @release 2.4.0
19301 ******************************************************************************/
19302
seqTextSeq(AjPStr * textptr,const AjPStr seq)19303 static void seqTextSeq(AjPStr* textptr, const AjPStr seq)
19304 {
19305 ajuint i;
19306 ajuint istart;
19307 ajuint iend;
19308 ajuint ilen;
19309 ajuint iwidth;
19310 AjPStr tmpstr = NULL;
19311
19312 ilen = ajStrGetLen(seq);
19313 iwidth = 60;
19314
19315 for(i=0; i < ilen; i += iwidth)
19316 {
19317 istart = i;
19318 iend = AJMIN(ilen-1, istart+iwidth-1);
19319 ajStrAssignSubS(&tmpstr, seq, istart, iend);
19320 ajFmtPrintAppS(textptr, "%S\n", tmpstr);
19321 }
19322
19323 ajStrDel(&tmpstr);
19324
19325 return;
19326 }
19327
19328
19329
19330
19331 /* @func ajSeqReadExit ********************************************************
19332 **
19333 ** Cleans up sequence reading internal memory
19334 **
19335 ** @return [void]
19336 **
19337 ** @release 4.0.0
19338 ** @@
19339 ******************************************************************************/
19340
ajSeqReadExit(void)19341 void ajSeqReadExit(void)
19342 {
19343 /* USA processing regular expressions */
19344
19345 ajRegFree(&seqRegUsaAsis);
19346 ajRegFree(&seqRegUsaDb);
19347 ajRegFree(&seqRegUsaFmt);
19348 ajRegFree(&seqRegUsaId);
19349 ajRegFree(&seqRegUsaList);
19350 ajRegFree(&seqRegUsaRange);
19351 ajRegFree(&seqRegUsaWild);
19352
19353 /* sequence reading regular expressions */
19354
19355 ajRegFree(&seqRegTreeconTop);
19356 ajRegFree(&seqRegMegaCommand);
19357 ajRegFree(&seqRegMegaFeat);
19358 ajRegFree(&seqRegMegaSeq);
19359 ajRegFree(&seqRegJackTop);
19360 ajRegFree(&seqRegJackSeq);
19361 ajRegFree(&seqRegGffTyp);
19362 ajRegFree(&seqRegGff3Typ);
19363 ajRegFree(&seqRegGcgDot);
19364 ajRegFree(&seqRegGcgChk);
19365 ajRegFree(&seqRegGcgLen);
19366 ajRegFree(&seqRegGcgNam);
19367 ajRegFree(&seqRegGcgTyp);
19368 ajRegFree(&seqRegGcgMsf);
19369 ajRegFree(&seqRegGcgMsflen);
19370 ajRegFree(&seqRegGcgMsfnam);
19371 ajRegFree(&seqRegGcgWgt);
19372 ajRegFree(&seqRegNbrfId);
19373 ajRegFree(&seqRegStadenId);
19374 ajRegFree(&seqRegHennigBlank);
19375 ajRegFree(&seqRegHennigSeq);
19376 ajRegFree(&seqRegHennigTop);
19377 ajRegFree(&seqRegHennigHead);
19378 ajRegFree(&seqRegFitchHead);
19379 ajRegFree(&seqRegStockholmSeq);
19380 ajRegFree(&seqRegAbiDots);
19381 ajRegFree(&seqRegRawNonseq);
19382 ajRegFree(&seqRegMaseHead);
19383 ajRegFree(&seqRegPhylipTop);
19384 ajRegFree(&seqRegPhylipHead);
19385 ajRegFree(&seqRegPhylipSeq);
19386 ajRegFree(&seqRegPhylipSeq2);
19387
19388 /* sequence reading strings */
19389 ajStrDel(&seqFtFmtEmbl);
19390 ajStrDel(&seqFtFmtGenbank);
19391 ajStrDel(&seqFtFmtRefseq);
19392 ajStrDel(&seqFtFmtRefseqp);
19393 ajStrDel(&seqFtFmtGff);
19394 ajStrDel(&seqFtFmtPir);
19395 ajStrDel(&seqFtFmtSwiss);
19396 ajStrDel(&seqUsaTest);
19397 ajStrDel(&seqQryChr);
19398 ajStrDel(&seqQryDb);
19399 ajStrDel(&seqQryList);
19400 ajStrDel(&seqAppendRestStr);
19401 ajStrDel(&seqAppendTmpSeq);
19402 ajStrDel(&seqQualStr);
19403
19404 ajStrDel(&seqReadLine);
19405 ajStrDel(&seqSaveLine);
19406 ajStrDel(&seqSaveLine2);
19407
19408 ajTableDel(&seqDbMethods);
19409
19410 AJFREE(seqAppendFilter);
19411
19412 ajStrTokenDel(&seqHandle);
19413 ajStrTokenDel(&seqHandle2);
19414 ajStrTokenDel(&seqHandleSplit);
19415
19416 ajStrDel(&seqName);
19417 ajStrDel(&seqChain);
19418 ajStrDel(&seqToken);
19419 ajStrDel(&seqToken2);
19420 ajStrDel(&seqTokenSplit);
19421 ajStrDel(&seqAppendTmpstr);
19422
19423 return;
19424 }
19425
19426
19427
19428
19429 /* @section Internals *********************************************************
19430 **
19431 ** Functions to return internal values
19432 **
19433 ** @nam3rule Type Internals for sequence datatype
19434 ** @nam4rule Get Return a value
19435 ** @nam5rule Fields Known query fields for ajSeqRead
19436 ** @nam5rule Qlinks Known query link operators for ajSeqRead
19437 **
19438 ** @valrule * [const char*] Internal value
19439 **
19440 ** @fcategory misc
19441 **
19442 ******************************************************************************/
19443
19444
19445
19446
/* @func ajSeqinTypeGetFields *************************************************
**
** Returns the list of known query field names for sequence reading, as
** a single space-separated string.
**
** @return [const char*] List of field names
**
** @release 6.4.0
** @@
******************************************************************************/

const char* ajSeqinTypeGetFields(void)
{
    static const char fieldnames[] = "id acc sv gi des key org";

    return fieldnames;
}
19461
19462
19463
19464
/* @func ajSeqinTypeGetQlinks *************************************************
**
** Returns the list of known query link operators for sequence reading,
** as a string with one character per operator.
**
** @return [const char*] List of query link operators
**
** @release 6.4.0
** @@
******************************************************************************/

const char* ajSeqinTypeGetQlinks(void)
{
    static const char linkops[] = "|&!^=";

    return linkops;
}
19479
19480
19481
19482
19483 /* @func ajSeqinTrace *********************************************************
19484 **
19485 ** Debug calls to trace the data in a sequence input object.
19486 **
19487 ** @param [r] thys [const AjPSeqin] Sequence input object.
19488 ** @return [void]
19489 **
19490 ** @release 1.0.0
19491 ** @@
19492 ******************************************************************************/
19493
ajSeqinTrace(const AjPSeqin thys)19494 void ajSeqinTrace(const AjPSeqin thys)
19495 {
19496 ajDebug("Sequence input trace\n");
19497 ajDebug( "====================\n\n");
19498 ajDebug( " Name: '%S'\n", thys->Name);
19499
19500 ajTextinTrace(thys->Input);
19501
19502 if(ajStrGetLen(thys->Acc))
19503 ajDebug( " Accession: '%S'\n", thys->Acc);
19504
19505 if(ajStrGetLen(thys->Inputtype))
19506 ajDebug( " Inputtype: '%S'\n", thys->Inputtype);
19507
19508 if(ajStrGetLen(thys->Desc))
19509 ajDebug( " Description: '%S'\n", thys->Desc);
19510
19511 if(ajStrGetLen(thys->Inseq))
19512 ajDebug( " Inseq len: %d\n", ajStrGetLen(thys->Inseq));
19513
19514 if(thys->Rev)
19515 ajDebug( " Rev: %B\n", thys->Rev);
19516
19517 if(thys->Begin)
19518 ajDebug( " Begin: %d\n", thys->Begin);
19519
19520 if(thys->End)
19521 ajDebug( " End: %d\n", thys->End);
19522
19523 if(ajStrGetLen(thys->Full))
19524 ajDebug( " Full name: '%S'\n", thys->Full);
19525
19526 if(ajStrGetLen(thys->Date))
19527 ajDebug( " Date: '%S'\n", thys->Date);
19528
19529 if(ajStrGetLen(thys->Ufo))
19530 ajDebug( " Ufo: '%S'\n", thys->Ufo);
19531
19532 if(thys->Fttable)
19533 ajDebug( " Fttable: exists\n");
19534
19535 if(thys->Ftquery)
19536 ajDebug( " Ftquery: exists\n");
19537
19538 if(ajStrGetLen(thys->Entryname))
19539 ajDebug( " Entryname: '%S'\n", thys->Entryname);
19540
19541 if(ajStrGetLen(thys->DbSequence))
19542 ajDebug( " DbSequence: '%S'\n", thys->DbSequence);
19543
19544 if(thys->Features)
19545 ajDebug( " Features: %B\n", thys->Features);
19546
19547 if(thys->IsNuc)
19548 ajDebug( " IsNuc: %B\n", thys->IsNuc);
19549
19550 if(thys->IsProt)
19551 ajDebug( " IsProt: %B\n", thys->IsProt);
19552
19553 if(thys->SeqData)
19554 ajDebug( " SeqData: exists\n");
19555
19556 if(ajStrGetLen(thys->Doc))
19557 ajDebug( " Documentation:...\n%S\n", thys->Doc);
19558
19559 return;
19560 }
19561
19562
19563
19564
19565 /* @funcstatic stockholmNew ***************************************************
19566 **
19567 ** Creates and initialises a Stockholm object.
19568 **
19569 ** @param [r] i [ajuint] Number of sequences
19570 ** @return [SeqPStockholm] New sequence object.
19571 **
19572 ** @release 4.0.0
19573 ** @@
19574 ******************************************************************************/
19575
stockholmNew(ajuint i)19576 static SeqPStockholm stockholmNew(ajuint i)
19577 {
19578 SeqPStockholm thys = NULL;
19579
19580 AJNEW0(thys);
19581
19582 thys->id = ajStrNew();
19583 thys->ac = ajStrNew();
19584 thys->de = ajStrNew();
19585 thys->au = ajStrNew();
19586 thys->al = ajStrNew();
19587 thys->tp = ajStrNew();
19588 thys->se = ajStrNew();
19589 thys->bm = ajStrNew();
19590 thys->dc = ajStrNew();
19591 thys->dr = ajStrNew();
19592 thys->cc = ajStrNew();
19593 thys->gs = ajStrNew();
19594 thys->ref = ajStrNew();
19595 thys->sacons = ajStrNew();
19596 thys->sqcons = ajStrNew();
19597 thys->sscons = ajStrNew();
19598
19599 thys->n = i;
19600
19601 AJCNEW0(thys->name,i);
19602 AJCNEW0(thys->str,i);
19603
19604 for(i=0;i<thys->n;++i)
19605 {
19606 thys->name[i] = ajStrNew();
19607 thys->str[i] = ajStrNew();
19608 }
19609
19610 return thys;
19611 }
19612
19613
19614
19615
19616 /* #funcstatic stockholmdataNew ***********************************************
19617 **
19618 ** Creates and initialises a Stockholm data object.
19619 **
19620 ** #return [SeqPStockholmdata] New sequence object.
19621 ** ##
19622 ******************************************************************************/
19623
19624 /*static SeqPStockholmdata stockholmdataNew(void)
19625 {
19626 SeqPStockholmdata thys = NULL;
19627
19628 AJNEW0(thys);
19629
19630 thys->id = ajStrNew();
19631 thys->ac = ajStrNew();
19632 thys->de = ajStrNew();
19633 thys->au = ajStrNew();
19634 thys->al = ajStrNew();
19635 thys->tp = ajStrNew();
19636 thys->se = ajStrNew();
19637 thys->bm = ajStrNew();
19638 thys->dc = ajStrNew();
19639 thys->dr = ajStrNew();
19640 thys->cc = ajStrNew();
19641 thys->gs = ajStrNew();
19642 thys->ref = ajStrNew();
19643 thys->sacons = ajStrNew();
19644 thys->sqcons = ajStrNew();
19645 thys->sscons = ajStrNew();
19646
19647 return thys;
19648 }*/
19649
19650
19651
19652
19653 /* @funcstatic stockholmDel ***************************************************
19654 **
19655 ** Deletes a Stockholm object.
19656 **
19657 ** @param [d] Pseq [SeqPStockholm*] Stockholm object
19658 ** @return [void]
19659 **
19660 ** @release 4.0.0
19661 ** @@
19662 ******************************************************************************/
19663
stockholmDel(SeqPStockholm * Pseq)19664 static void stockholmDel(SeqPStockholm *Pseq)
19665 {
19666 SeqPStockholm pthis = NULL;
19667 ajuint i;
19668
19669 if(!Pseq)
19670 return;
19671
19672 pthis = *Pseq;
19673
19674 if(!pthis)
19675 return;
19676
19677 ajStrDel(&pthis->id);
19678 ajStrDel(&pthis->ac);
19679 ajStrDel(&pthis->de);
19680 ajStrDel(&pthis->au);
19681 ajStrDel(&pthis->al);
19682 ajStrDel(&pthis->tp);
19683 ajStrDel(&pthis->se);
19684 ajStrDel(&pthis->bm);
19685 ajStrDel(&pthis->dc);
19686 ajStrDel(&pthis->dr);
19687 ajStrDel(&pthis->cc);
19688 ajStrDel(&pthis->gs);
19689 ajStrDel(&pthis->ref);
19690 ajStrDel(&pthis->sacons);
19691 ajStrDel(&pthis->sqcons);
19692 ajStrDel(&pthis->sscons);
19693
19694 for(i=0;i<pthis->n;++i)
19695 {
19696 ajStrDel(&pthis->name[i]);
19697 ajStrDel(&pthis->str[i]);
19698 }
19699
19700 AJFREE(pthis->name);
19701 AJFREE(pthis->str);
19702 AJFREE(*Pseq);
19703
19704 return;
19705 }
19706
19707
19708
19709
19710 /* #funcstatic stockholmdataDel ***********************************************
19711 **
19712 ** Deletes a Stockholm data object.
19713 **
19714 ** #param [d] Pseq [SeqPStockholmdata*] Stockholm object
19715 ** #return [void]
19716 ** ##
19717 ******************************************************************************/
19718
19719 /*static void stockholmdataDel(SeqPStockholmdata *Pseq)
19720 {
19721 SeqPStockholmdata pthis = NULL;
19722
19723 if(!Pseq)
19724 return;
19725 pthis = *Pseq;
19726 if(!pthis)
19727 return;
19728
19729 ajStrDel(&pthis->id);
19730 ajStrDel(&pthis->ac);
19731 ajStrDel(&pthis->de);
19732 ajStrDel(&pthis->au);
19733 ajStrDel(&pthis->al);
19734 ajStrDel(&pthis->tp);
19735 ajStrDel(&pthis->se);
19736 ajStrDel(&pthis->bm);
19737 ajStrDel(&pthis->dc);
19738 ajStrDel(&pthis->dr);
19739 ajStrDel(&pthis->cc);
19740 ajStrDel(&pthis->gs);
19741 ajStrDel(&pthis->ref);
19742 ajStrDel(&pthis->sacons);
19743 ajStrDel(&pthis->sqcons);
19744 ajStrDel(&pthis->sscons);
19745
19746 AJFREE(*Pseq);
19747
19748 return;
19749 }*/
19750
19751
19752
19753
19754 /* @funcstatic selexNew *******************************************************
19755 **
19756 ** Creates and initialises a selex #=SQ line object.
19757 **
19758 ** @param [r] n [ajuint] Number of sequences
19759 ** @return [SeqPSelex] New sequence object.
19760 **
19761 ** @release 4.0.0
19762 ** @@
19763 ******************************************************************************/
19764
selexNew(ajuint n)19765 static SeqPSelex selexNew(ajuint n)
19766 {
19767 SeqPSelex thys = NULL;
19768 ajuint i;
19769
19770 AJNEW0(thys);
19771 thys->id = ajStrNew();
19772 thys->ac = ajStrNew();
19773 thys->de = ajStrNew();
19774 thys->au = ajStrNew();
19775 thys->cs = ajStrNew();
19776 thys->rf = ajStrNew();
19777 thys->n = n;
19778
19779 AJCNEW(thys->name,n);
19780 AJCNEW(thys->str,n);
19781 AJCNEW(thys->ss,n);
19782 AJCNEW(thys->sq,n);
19783
19784 for(i=0;i<n;++i)
19785 {
19786 thys->name[i] = ajStrNew();
19787 thys->str[i] = ajStrNew();
19788 thys->ss[i] = ajStrNew();
19789 thys->sq[i] = selexseqNew();
19790 }
19791
19792 return thys;
19793 }
19794
19795
19796
19797
19798 /* @funcstatic selexseqNew ****************************************************
19799 **
19800 ** Creates and initialises a selex #=SQ line object.
19801 **
19802 ** @return [SeqPSelexseq] New sequence object.
19803 **
19804 ** @release 4.0.0
19805 ** @@
19806 ******************************************************************************/
19807
selexseqNew(void)19808 static SeqPSelexseq selexseqNew(void)
19809 {
19810 SeqPSelexseq thys = NULL;
19811
19812 AJNEW0(thys);
19813
19814 thys->name = ajStrNew();
19815 thys->source = ajStrNew();
19816 thys->ac = ajStrNew();
19817 thys->de = ajStrNew();
19818
19819 return thys;
19820 }
19821
19822
19823
19824
19825 /* #funcstatic selexdataNew ***************************************************
19826 **
19827 ** Creates and initialises a selex #=SQ line object.
19828 **
19829 ** #return [SeqPSelexdata] New sequence object.
19830 ** ##
19831 ******************************************************************************/
19832
19833 /*static SeqPSelexdata selexdataNew(void)
19834 {
19835 SeqPSelexdata thys = NULL;
19836
19837 AJNEW0(thys);
19838 thys->id = ajStrNew();
19839 thys->ac = ajStrNew();
19840 thys->de = ajStrNew();
19841 thys->au = ajStrNew();
19842 thys->cs = ajStrNew();
19843 thys->rf = ajStrNew();
19844
19845 thys->name = ajStrNew();
19846 thys->str = ajStrNew();
19847 thys->ss = ajStrNew();
19848 thys->sq = selexseqNew();
19849
19850 return thys;
19851 }*/
19852
19853
19854
19855
19856 /* @funcstatic selexseqDel ****************************************************
19857 **
19858 ** Deletes a Selex object.
19859 **
19860 ** @param [d] Pseq [SeqPSelexseq*] Selex #=SQ object
19861 ** @return [void]
19862 ** **
19863 **
19864 ** @release 4.1.0
19865 ******************************************************************************/
19866
selexseqDel(SeqPSelexseq * Pseq)19867 static void selexseqDel(SeqPSelexseq *Pseq)
19868 {
19869 SeqPSelexseq pthis;
19870
19871 pthis = *Pseq;
19872
19873 if(!Pseq || !pthis)
19874 return;
19875
19876 ajStrDel(&pthis->name);
19877 ajStrDel(&pthis->source);
19878 ajStrDel(&pthis->ac);
19879 ajStrDel(&pthis->de);
19880
19881 AJFREE(pthis);
19882 *Pseq = NULL;
19883
19884 return;
19885 }
19886
19887
19888
19889
19890 /* @funcstatic selexDel *******************************************************
19891 **
19892 ** Deletes a Selex object.
19893 **
19894 ** @param [d] Pseq [SeqPSelex*] Selex object
19895 ** @return [void]
19896 **
19897 ** @release 4.1.0
19898 ** @@
19899 ******************************************************************************/
19900
selexDel(SeqPSelex * Pseq)19901 static void selexDel(SeqPSelex *Pseq)
19902 {
19903 SeqPSelex pthis;
19904 ajuint i;
19905 ajuint n;
19906
19907 pthis = *Pseq;
19908
19909 if(!Pseq || !pthis)
19910 return;
19911
19912 n = pthis->n;
19913
19914 for(i=0;i<n;++i)
19915 {
19916 ajStrDel(&pthis->name[i]);
19917 ajStrDel(&pthis->str[i]);
19918 ajStrDel(&pthis->ss[i]);
19919 selexseqDel(&pthis->sq[i]);
19920 }
19921
19922 if(n)
19923 {
19924 AJFREE(pthis->name);
19925 AJFREE(pthis->str);
19926 AJFREE(pthis->ss);
19927 AJFREE(pthis->sq);
19928 }
19929
19930 ajStrDel(&pthis->id);
19931 ajStrDel(&pthis->ac);
19932 ajStrDel(&pthis->de);
19933 ajStrDel(&pthis->au);
19934 ajStrDel(&pthis->cs);
19935 ajStrDel(&pthis->rf);
19936
19937 AJFREE(pthis);
19938 *Pseq = NULL;
19939
19940 return;
19941 }
19942
19943
19944
19945
19946 /* #funcstatic selexdataDel ***************************************************
19947 **
19948 ** Deletes a Selex data object.
19949 **
19950 ** #param [d] Pseq [SeqPSelexdata*] Selex data object
19951 ** #return [void]
19952 ** ##
19953 ******************************************************************************/
19954
19955 /*static void selexdataDel(SeqPSelexdata *Pseq)
19956 {
19957 SeqPSelexdata pthis;
19958
19959 pthis = *Pseq;
19960
19961 if(!Pseq || !pthis)
19962 return;
19963
19964
19965 ajStrDel(&pthis->name);
19966 ajStrDel(&pthis->str);
19967 ajStrDel(&pthis->ss);
19968 selexseqDel(&pthis->sq);
19969
19970 ajStrDel(&pthis->id);
19971 ajStrDel(&pthis->ac);
19972 ajStrDel(&pthis->de);
19973 ajStrDel(&pthis->au);
19974 ajStrDel(&pthis->cs);
19975 ajStrDel(&pthis->rf);
19976
19977 AJFREE(pthis);
19978 *Pseq = NULL;
19979
19980 return;
19981 }*/
19982
19983
19984
19985
19986 /* #funcstatic seqSelexClone *************************************************
19987 **
19988 ** Clone a Selexdata object
19989 **
19990 ** #param [r] thys [const SeqPSelexdata] selex data object
19991 **
19992 ** #return [SeqPSelexdata] New selex data object.
19993 ** ##
19994 ******************************************************************************/
19995
19996 /*static SeqPSelexdata seqSelexClone(const SeqPSelexdata thys)
19997 {
19998 SeqPSelexdata pthis;
19999
20000 pthis = selexdataNew();
20001
20002 ajStrAssignS(&pthis->id, thys->id);
20003 ajStrAssignS(&pthis->ac, thys->ac);
20004 ajStrAssignS(&pthis->de, thys->de);
20005 ajStrAssignS(&pthis->au, thys->au);
20006 ajStrAssignS(&pthis->cs, thys->cs);
20007 ajStrAssignS(&pthis->rf, thys->rf);
20008 ajStrAssignS(&pthis->name, thys->name);
20009 ajStrAssignS(&pthis->str, thys->str);
20010 ajStrAssignS(&pthis->ss, thys->ss);
20011
20012 pthis->ga[0] = thys->ga[0];
20013 pthis->ga[1] = thys->ga[1];
20014 pthis->tc[0] = thys->tc[0];
20015 pthis->tc[1] = thys->tc[1];
20016 pthis->nc[0] = thys->nc[0];
20017 pthis->nc[1] = thys->nc[1];
20018
20019 ajStrAssignS(&pthis->sq->name, thys->sq->name);
20020 ajStrAssignS(&pthis->sq->source, thys->sq->source);
20021 ajStrAssignS(&pthis->sq->ac, thys->sq->ac);
20022 ajStrAssignS(&pthis->sq->de, thys->sq->de);
20023
20024 pthis->sq->wt = thys->sq->wt;
20025 pthis->sq->start = thys->sq->start;
20026 pthis->sq->stop = thys->sq->stop;
20027 pthis->sq->len = thys->sq->len;
20028
20029
20030 return pthis;
20031 }*/
20032
20033
20034
20035
20036 /* @funcstatic seqDefine ******************************************************
20037 **
20038 ** Make sure all sequence object attributes are defined
20039 ** using values from the sequence input object if needed
20040 **
20041 ** @param [w] thys [AjPSeq] Sequence returned.
20042 ** @param [u] seqin [AjPSeqin] Sequence input definitions
20043 ** @return [AjBool] ajTrue on success.
20044 **
20045 ** @release 4.1.0
20046 ** @@
20047 ******************************************************************************/
20048
seqDefine(AjPSeq thys,AjPSeqin seqin)20049 static AjBool seqDefine(AjPSeq thys, AjPSeqin seqin)
20050 {
20051
20052 /* if values are missing in the sequence object, we can use defaults
20053 from seqin or calculate where possible */
20054
20055 /*ajDebug("seqDefine: thys->Db '%S', seqin->Db '%S'\n",
20056 thys->Db, seqin->Db);*/
20057 /*ajDebug("seqDefine: thys->Name '%S' type: %S\n",
20058 thys->Name, thys->Type);*/
20059 /*ajDebug("seqDefine: thys->Entryname '%S', seqin->Entryname '%S'\n",
20060 thys->Entryname, seqin->Entryname);*/
20061
20062 /* assign the dbname and entryname if defined in the seqin object */
20063 if(ajStrGetLen(seqin->Input->Db))
20064 ajStrAssignS(&thys->Db, seqin->Input->Db);
20065
20066 if(ajStrGetLen(seqin->Entryname))
20067 ajStrAssignEmptyS(&thys->Entryname, seqin->Entryname);
20068
20069 if(ajStrGetLen(thys->Entryname))
20070 ajStrAssignS(&thys->Name, thys->Entryname);
20071
20072 /*ajDebug("seqDefine: returns thys->Name '%S' type: %S\n",
20073 thys->Name, thys->Type);*/
20074
20075 if(!ajStrGetLen(thys->Type))
20076 {
20077 if(thys->Format)
20078 {
20079 if(seqinFormatDef[thys->Format].Nucleotide &&
20080 !seqinFormatDef[thys->Format].Protein)
20081 ajSeqSetNuc(thys);
20082
20083 if(!seqinFormatDef[thys->Format].Nucleotide &&
20084 seqinFormatDef[thys->Format].Protein)
20085 ajSeqSetProt(thys);
20086 }
20087 }
20088
20089 if(!ajStrGetLen(thys->Type))
20090 ajSeqType(thys);
20091
20092 if(seqin->Circular)
20093 thys->Circular = ajTrue;
20094
20095 if(thys->Fttable)
20096 {
20097 if(thys->Circular)
20098 ajFeattableSetCircular(thys->Fttable);
20099 else if(ajFeattableIsCircular(thys->Fttable))
20100 thys->Circular = ajTrue;
20101 }
20102
20103 return ajTrue;
20104 }
20105
20106
20107
20108
20109 /* @func ajSeqaccessGetDb *****************************************************
20110 **
20111 ** returns the table in which sequence database access details are registered
20112 **
20113 ** @return [AjPTable] Access functions hash table
20114 **
20115 ** @release 6.4.0
20116 ** @@
20117 ******************************************************************************/
20118
ajSeqaccessGetDb(void)20119 AjPTable ajSeqaccessGetDb(void)
20120 {
20121 if(!seqDbMethods)
20122 seqDbMethods = ajCallTableNew();
20123 return seqDbMethods;
20124 }
20125
20126
20127
20128
20129
20130 /* @func ajSeqaccessMethodGetQlinks *******************************************
20131 **
20132 ** Tests for a named method for sequence reading and returns the
20133 ** known query link operators
20134 **
20135 ** @param [r] method [const AjPStr] Method required.
20136 ** @return [const char*] Known link operators
20137 **
20138 ** @release 6.4.0
20139 ** @@
20140 ******************************************************************************/
20141
ajSeqaccessMethodGetQlinks(const AjPStr method)20142 const char* ajSeqaccessMethodGetQlinks(const AjPStr method)
20143 {
20144 AjPSeqAccess methoddata;
20145
20146 methoddata = ajCallTableGetS(seqDbMethods, method);
20147 if(!methoddata)
20148 return NULL;
20149
20150 return methoddata->Qlink;
20151 }
20152
20153
20154
20155
20156 /* @func ajSeqaccessMethodGetScope ********************************************
20157 **
20158 ** Tests for a named method for sequence reading and returns the scope
20159 ** (entry, query or all).
20160 *
20161 ** @param [r] method [const AjPStr] Method required.
20162 ** @return [ajuint] Scope flags
20163 **
20164 ** @release 6.4.0
20165 ** @@
20166 ******************************************************************************/
20167
ajSeqaccessMethodGetScope(const AjPStr method)20168 ajuint ajSeqaccessMethodGetScope(const AjPStr method)
20169 {
20170 AjPSeqAccess methoddata;
20171 ajuint ret = 0;
20172
20173 methoddata = ajCallTableGetS(seqDbMethods, method);
20174 if(!methoddata)
20175 return 0;
20176
20177 if(methoddata->Entry)
20178 ret |= AJMETHOD_ENTRY;
20179 if(methoddata->Query)
20180 ret |= AJMETHOD_QUERY;
20181 if(methoddata->All)
20182 ret |= AJMETHOD_ALL;
20183
20184 return ret;
20185 }
20186
20187
20188
20189
20190 /* @func ajSeqaccessMethodTest ************************************************
20191 **
20192 ** Tests for a named method for sequence reading.
20193 **
20194 ** @param [r] method [const AjPStr] Method required.
20195 ** @return [AjBool] ajTrue on success.
20196 **
20197 ** @release 6.4.0
20198 ** @@
20199 ******************************************************************************/
20200
ajSeqaccessMethodTest(const AjPStr method)20201 AjBool ajSeqaccessMethodTest(const AjPStr method)
20202 {
20203 if(ajCallTableGetS(seqDbMethods, method))
20204 return ajTrue;
20205
20206 return ajFalse;
20207 }
20208
20209
20210
20211
20212 /* @func ajSeqinformatTerm ****************************************************
20213 **
20214 ** Tests whether a data input format term is known
20215 **
20216 ** @param [r] term [const AjPStr] Format term EDAM ID
20217 ** @return [AjBool] ajTrue if term was accepted
20218 **
20219 ** @release 6.4.0
20220 ** @@
20221 ******************************************************************************/
20222
ajSeqinformatTerm(const AjPStr term)20223 AjBool ajSeqinformatTerm(const AjPStr term)
20224 {
20225 ajuint i;
20226
20227 for(i=0; seqinFormatDef[i].Name; i++)
20228 if(ajStrMatchC(term, seqinFormatDef[i].Obo))
20229 return ajTrue;
20230
20231 return ajFalse;
20232 }
20233
20234
20235
20236
20237 /* @func ajSeqinformatTest ****************************************************
20238 **
20239 ** Tests whether a named sequence data input format is known
20240 **
20241 ** @param [r] format [const AjPStr] Format
20242 ** @return [AjBool] ajTrue if formats was accepted
20243 **
20244 ** @release 6.4.0
20245 ** @@
20246 ******************************************************************************/
20247
ajSeqinformatTest(const AjPStr format)20248 AjBool ajSeqinformatTest(const AjPStr format)
20249 {
20250 ajuint i;
20251
20252 for(i=0; seqinFormatDef[i].Name; i++)
20253 if(ajStrMatchCaseC(format, seqinFormatDef[i].Name))
20254 return ajTrue;
20255
20256 return ajFalse;
20257 }
20258
20259
20260
20261
20262 #ifdef AJ_COMPILE_DEPRECATED_BOOK
20263 #endif /* AJ_COMPILE_DEPRECATED_BOOK */
20264
20265
20266
20267
20268 #ifdef AJ_COMPILE_DEPRECATED
20269 /* @obsolete ajSeqMethodGetScope
20270 ** @rename ajSeqaccessMethodGetScope
20271 */
20272
ajSeqMethodGetScope(const AjPStr method)20273 __deprecated ajuint ajSeqMethodGetScope(const AjPStr method)
20274 {
20275 return ajSeqaccessMethodGetScope(method);
20276 }
20277
20278
20279
20280
20281 /* @obsolete ajSeqMethodTest
20282 ** @rename ajSeqaccessMethodTest
20283 */
20284
ajSeqMethodTest(const AjPStr method)20285 __deprecated AjBool ajSeqMethodTest(const AjPStr method)
20286 {
20287 return ajSeqaccessMethodTest(method);
20288 }
20289
20290 #endif /* AJ_COMPILE_DEPRECATED */
20291