1 /* @source ajseqread **********************************************************
2 **
3 ** AJAX sequence reading functions
4 **
5 ** These functions control all aspects of AJAX sequence reading
6 **
7 ** @author Copyright (C) 2001 Peter Rice
8 ** @version $Revision: 1.334 $
9 ** @modified 2001-2011 pmr
10 ** @modified $Date: 2013/07/15 20:57:32 $ by $Author: rice $
11 ** @@
12 **
13 ** This library is free software; you can redistribute it and/or
14 ** modify it under the terms of the GNU Lesser General Public
15 ** License as published by the Free Software Foundation; either
16 ** version 2.1 of the License, or (at your option) any later version.
17 **
18 ** This library is distributed in the hope that it will be useful,
19 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21 ** Lesser General Public License for more details.
22 **
23 ** You should have received a copy of the GNU Lesser General Public
24 ** License along with this library; if not, write to the Free Software
25 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
26 ** MA  02110-1301,  USA.
27 **
28 ******************************************************************************/
29 
30 #include "ajlib.h"
31 
32 #include "ajseqread.h"
33 #include "ajseq.h"
34 #include "ajseqabi.h"
35 #include "ajseqtype.h"
36 #include "ajfeat.h"
37 #include "ajfeatread.h"
38 #include "ajcall.h"
39 #include "ajmath.h"
40 #include "ajlist.h"
41 #include "ajtable.h"
42 #include "ajquery.h"
43 #include "ajutil.h"
44 #include "ajbase.h"
45 #include "ajnexus.h"
46 #include "ajdom.h"
47 #include "ajseqbam.h"
48 #include "ajreg.h"
49 #include "ajtext.h"
50 #include "ajtextread.h"
51 #include "ajfileio.h"
52 #include "ajnam.h"
53 
54 #include <limits.h>
55 #include <math.h>
56 #include <errno.h>
57 
58 
/* On WIN32 builds every use of fileno in this file is redirected to _fileno */
#ifdef WIN32
#define fileno _fileno
#endif /* WIN32 */
62 
/* SCF magic number: the four characters '.', 's', 'c', 'f' packed into a
** single ajuint, with '.' in the highest of the four bytes and 'f' in the
** lowest. */
#define SCF_MAGIC ((((ajuint) '.') << 24) | \
                   (((ajuint) 's') << 16) | \
                   (((ajuint) 'c') <<  8) | \
                    ((ajuint) 'f'))
65 
/* Table with external linkage: unlike everything below it is not static,
** so other translation units can refer to it by name. */
AjPTable seqDbMethods = NULL;

/* File-scope strings and string tokenisers, all starting out as NULL */
static AjPStr    seqAppendTmpstr = NULL;
static AjPStrTok seqHandle  = NULL;
static AjPStrTok seqHandle2  = NULL;
static AjPStrTok seqHandleSplit = NULL;
static AjPStr    seqToken = NULL;
static AjPStr    seqToken2 = NULL;
static AjPStr    seqTokenSplit = NULL;
static AjPStr    seqName = NULL;
static AjPStr    seqChain = NULL;

static char* seqAppendFilter = NULL;

/* NOTE(review): presumably the line limit handed to seqGcgDots and
** seqGcgMsfDots as "maxlines" - confirm against the callers */
static ajint     seqMaxGcglines = 5000;

/* Regular expressions for individual input formats, all starting as NULL */
static AjPRegexp seqRegTreeconTop  = NULL;
static AjPRegexp seqRegMegaCommand = NULL;
static AjPRegexp seqRegMegaFeat = NULL;
static AjPRegexp seqRegMegaSeq  = NULL;
static AjPRegexp seqRegJackTop  = NULL;
static AjPRegexp seqRegJackSeq  = NULL;
static AjPRegexp seqRegGffTyp = NULL;
static AjPRegexp seqRegGff3Typ = NULL;
static AjPRegexp seqRegRawNonseq = NULL;
static AjPRegexp seqRegNbrfId  = NULL;
static AjPRegexp seqRegStadenId = NULL;
static AjPRegexp seqRegHennigBlank = NULL;
static AjPRegexp seqRegHennigSeq   = NULL;
static AjPRegexp seqRegHennigTop   = NULL;
static AjPRegexp seqRegHennigHead  = NULL;
static AjPRegexp seqRegFitchHead = NULL;
static AjPRegexp seqRegStockholmSeq  = NULL;
static AjPRegexp seqRegAbiDots = NULL;
static AjPRegexp seqRegMaseHead = NULL;
static AjPRegexp seqRegPhylipTop  = NULL;
static AjPRegexp seqRegPhylipHead = NULL;
static AjPRegexp seqRegPhylipSeq  = NULL;
static AjPRegexp seqRegPhylipSeq2 = NULL;

/* Regular expressions with the "Gcg" prefix (a static seqGcgRegInit function
** is declared in this file; NOTE(review): presumably it builds these) */
static AjPRegexp seqRegGcgDot = NULL;
static AjPRegexp seqRegGcgChk = NULL;
static AjPRegexp seqRegGcgLen = NULL;
static AjPRegexp seqRegGcgTyp = NULL;
static AjPRegexp seqRegGcgNam = NULL;
static AjPRegexp seqRegGcgMsf = NULL;
static AjPRegexp seqRegGcgMsflen = NULL;
static AjPRegexp seqRegGcgMsfnam = NULL;
static AjPRegexp seqRegGcgWgt = NULL;

static AjBool seqinFormatIsset = AJFALSE;

/* Further file-scope strings, all starting out as NULL */
static AjPStr seqFtFmtEmbl    = NULL;
static AjPStr seqFtFmtGenbank = NULL;
static AjPStr seqFtFmtRefseq  = NULL;
static AjPStr seqFtFmtRefseqp = NULL;
static AjPStr seqFtFmtGff     = NULL;
static AjPStr seqFtFmtPir     = NULL;
static AjPStr seqFtFmtSwiss   = NULL;
static AjPStr seqUsaTest      = NULL;
static AjPStr seqQryChr       = NULL;
static AjPStr seqQryDb        = NULL;
static AjPStr seqQryList      = NULL;
static AjPStr seqReadLine     = NULL;
static AjPStr seqSaveLine     = NULL;
static AjPStr seqSaveLine2    = NULL;
static AjPStr seqAppendRestStr = NULL;
static AjPStr seqAppendTmpSeq = NULL;
static AjPStr seqQualStr      = NULL;

/* Regular expressions with the "Usa" prefix, plus a flag that starts false
** (NOTE(review): presumably set once these expressions have been built) */
static AjPRegexp seqRegUsaAsis  = NULL;
static AjPRegexp seqRegUsaDb    = NULL;
static AjPRegexp seqRegUsaFmt   = NULL;
static AjPRegexp seqRegUsaId    = NULL;
static AjPRegexp seqRegUsaList  = NULL;
static AjPRegexp seqRegUsaRange = NULL;
static AjPRegexp seqRegUsaWild  = NULL;
static AjBool seqRegUsaInitDone = AJFALSE;
static AjBool seqDoWarnAppend = AJFALSE;
145 
/* Quality score lookup table with one entry per character code 0-126.
** Codes 0 to 33 map to 0.0; each later code maps to (code - 33), so the
** last entry (code 126) is 93.0.  Rows are aligned on multiples of 8. */
static float seqQualPhred[] =
{
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*   0-7   */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*   8-15  */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*  16-23  */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*  24-31  */
    0.0,  0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,   /*  32-39  */
    7.0,  8.0,  9.0,  10.0, 11.0, 12.0, 13.0, 14.0,  /*  40-47  */
    15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0,  /*  48-55  */
    23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,  /*  56-63  */
    31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0,  /*  64-71  */
    39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,  /*  72-79  */
    47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0,  /*  80-87  */
    55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0,  /*  88-95  */
    63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0,  /*  96-103 */
    71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0,  /* 104-111 */
    79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0,  /* 112-119 */
    87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0         /* 120-126 */
};
164 
/* Quality score lookup table with one entry per character code 0-126.
** Codes 0 to 58 map to 0.0; the tabulated values start at code 59
** (1.193310) and rise steadily to 62.000003 at code 126.
** Rows hold four entries each, aligned on multiples of 4. */
static double seqQualSolexa[] =
{
    0.0,       0.0,       0.0,       0.0,        /*   0-3   */
    0.0,       0.0,       0.0,       0.0,        /*   4-7   */
    0.0,       0.0,       0.0,       0.0,        /*   8-11  */
    0.0,       0.0,       0.0,       0.0,        /*  12-15  */
    0.0,       0.0,       0.0,       0.0,        /*  16-19  */
    0.0,       0.0,       0.0,       0.0,        /*  20-23  */
    0.0,       0.0,       0.0,       0.0,        /*  24-27  */
    0.0,       0.0,       0.0,       0.0,        /*  28-31  */
    0.0,       0.0,       0.0,       0.0,        /*  32-35  */
    0.0,       0.0,       0.0,       0.0,        /*  36-39  */
    0.0,       0.0,       0.0,       0.0,        /*  40-43  */
    0.0,       0.0,       0.0,       0.0,        /*  44-47  */
    0.0,       0.0,       0.0,       0.0,        /*  48-51  */
    0.0,       0.0,       0.0,       0.0,        /*  52-55  */
    0.0,       0.0,       0.0,       1.193310,   /*  56-59  */
    1.455405,  1.764349,  2.124426,  2.539019,   /*  60-63  */
    3.010300,  3.539019,  4.124426,  4.764349,   /*  64-67  */
    5.455405,  6.193310,  6.973228,  7.790097,   /*  68-71  */
    8.638920,  9.514969,  10.413927, 11.331956,  /*  72-75  */
    12.265724, 13.212384, 14.169543, 15.135209,  /*  76-79  */
    16.107742, 17.085800, 18.068291, 19.054333,  /*  80-83  */
    20.043214, 21.034361, 22.027316, 23.021712,  /*  84-87  */
    24.017255, 25.013712, 26.010895, 27.008657,  /*  88-91  */
    28.006878, 29.005464, 30.004341, 31.003448,  /*  92-95  */
    32.002739, 33.002176, 34.001729, 35.001373,  /*  96-99  */
    36.001091, 37.000866, 38.000688, 39.000547,  /* 100-103 */
    40.000434, 41.000345, 42.000274, 43.000218,  /* 104-107 */
    44.000173, 45.000137, 46.000109, 47.000087,  /* 108-111 */
    48.000069, 49.000055, 50.000043, 51.000034,  /* 112-115 */
    52.000027, 53.000022, 54.000017, 55.000014,  /* 116-119 */
    56.000011, 57.000009, 58.000007, 59.000005,  /* 120-123 */
    60.000004, 61.000003, 62.000003              /* 124-126 */
};
190 
191 
192 
193 
/* Quality score lookup table with one entry per character code 0-126.
** Codes 0 to 64 map to 0.0; each later code maps to (code - 64), so the
** last entry (code 126) is 62.0.  Rows are aligned on multiples of 8. */
static float seqQualIllumina[] =
{
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*   0-7   */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*   8-15  */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*  16-23  */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*  24-31  */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*  32-39  */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*  40-47  */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*  48-55  */
    0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,  0.0,   /*  56-63  */
    0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,   /*  64-71  */
    8.0,  9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0,  /*  72-79  */
    16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,  /*  80-87  */
    24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,  /*  88-95  */
    32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0,  /*  96-103 */
    40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0,  /* 104-111 */
    48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0,  /* 112-119 */
    56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0         /* 120-126 */
};
212 
213 
214 
215 
216 
/* @datastatic SeqPInFormat ***************************************************
**
** Sequence input formats data structure
**
** Each record describes one named input format: its name, ontology term
** and description, a set of capability flags, and the function called to
** read a sequence in that format.
**
** @alias SeqSInFormat
** @alias SeqOInFormat
**
** @attr Name [const char*] Format name
** @attr Obo  [const char*] Ontology term id from EDAM
** @attr Desc [const char*] Format description
** @attr Alias [AjBool] Name is an alias for an identical definition
** @attr Try [AjBool] If true, try for an unknown input. Duplicate names
**                    and read-anything formats are set false
** @attr Nucleotide [AjBool] True if suitable for nucleotide
** @attr Protein [AjBool] True if suitable for protein
** @attr Feature [AjBool] True if includes parsable feature data
** @attr Gap [AjBool] True if allows gap characters
** @attr Read [AjBool function] Input function taking the sequence and the
**                              sequence input object, returns ajTrue on
**                              success
** @attr Multiset [AjBool] If true, supports multiple sequence sets
**                         If false, multiple sets must be in separate files
** @attr Binary [AjBool] Binary file format
** @@
******************************************************************************/

typedef struct SeqSInFormat
{
    const char *Name;
    const char *Obo;
    const char *Desc;
    AjBool Alias;
    AjBool Try;
    AjBool Nucleotide;
    AjBool Protein;
    AjBool Feature;
    AjBool Gap;
    AjBool (*Read) (AjPSeq thys, AjPSeqin seqin);
    AjBool Multiset;
    AjBool Binary;
} SeqOInFormat;

/* The pointer alias is a textual macro, not a typedef: "const SeqPInFormat p"
** expands to a pointer to const data, and "SeqPInFormat a, b" would make
** only a a pointer. Declare one variable per statement. */
#define SeqPInFormat SeqOInFormat*
258 
259 
260 
261 
/* @datastatic SeqPMsfData ****************************************************
**
** Sequence alignment data, stored until written when output file is closed
**
** @alias SeqSMsfData
** @alias SeqOMsfData
**
** @attr Table [AjPTable] Ajax table of AjPMsfItem objects
** @attr Names [AjPStr*] Sequence names
** @attr Count [ajuint] Undocumented
** @attr Nseq [ajuint] Number of sequences
** @attr Nexus [AjPNexus] Nexus alignment data
** @attr Gene [AjPStr] Gene name
** @attr Domain [AjPStr] Domain name
** @attr NextGene [AjPStr] Next block gene name
** @attr NextDomain [AjPStr] Next block domain name
** @attr Bufflines [ajuint] Number of buffered lines read
** @attr CommentDepth [ajint] Comment depth
** @attr Resume [AjBool] Resume processing
** @attr Identchar [char] Identity character
** @attr Indelchar [char] Gap character
** @attr Misschar [char] Missing data character (NOTE(review): inferred from
**                       the member name; the previous text repeated the
**                       Indelchar description - confirm)
** @attr Seqtype [char] Sequence type N:nucleotide P:protein
** @@
******************************************************************************/

typedef struct SeqSMsfData
{
    AjPTable Table;
    AjPStr* Names;
    ajuint Count;
    ajuint Nseq;
    AjPNexus Nexus;
    AjPStr Gene;
    AjPStr Domain;
    AjPStr NextGene;
    AjPStr NextDomain;
    ajuint Bufflines;
    ajint CommentDepth;
    AjBool Resume;
    char Identchar;
    char Indelchar;
    char Misschar;
    char Seqtype;
} SeqOMsfData;

/* Textual macro, not a typedef: "const SeqPMsfData p" is a pointer to const
** data. Declare one variable per statement. */
#define SeqPMsfData SeqOMsfData*
309 
310 
311 
312 
/* @datastatic SeqPMsfItem ****************************************************
**
** MSF alignment output individual sequence data
**
** Holds the name, description, length, checksum, sequence text and weight
** of a single sequence.
**
** @alias SeqSMsfItem
** @alias SeqOMsfItem
**
** @attr Name [AjPStr] Sequence name
** @attr Desc [AjPStr] Sequence description
** @attr Len [ajuint] Sequence length
** @attr Check [ajuint] Sequence GCG checksum
** @attr Seq [AjPStr] Sequence
** @attr Weight [float] Weight (default 1.0)
** @attr Padding [char[4]] Padding to alignment boundary
** @@
******************************************************************************/

typedef struct SeqSMsfItem
{
    AjPStr Name;
    AjPStr Desc;
    ajuint Len;
    ajuint Check;
    AjPStr Seq;
    float Weight;
    char Padding[4];
} SeqOMsfItem;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPMsfItem SeqOMsfItem*
342 
343 
344 
345 
/* @datastatic SeqPStockholm **************************************************
**
** Ajax Stockholm object.
**
** NOTE(review): the two-letter member names look like Stockholm per-file
** annotation tags, but nothing in this part of the file confirms their
** meaning; the "Undocumented" entries are left for someone who can check
** the reader code.
**
** Note that ga is declared as ajuint[2] here, while the per-sequence
** SeqPStockholmdata structure declares its ga as float[2].
**
** @new stockholmNew Default constructor
** @delete stockholmDel Default destructor
**
** @attr id [AjPStr] identifier
** @attr ac [AjPStr] accession
** @attr de [AjPStr] description
** @attr au [AjPStr] author
** @attr al [AjPStr] Undocumented
** @attr tp [AjPStr] Undocumented
** @attr se [AjPStr] Undocumented
** @attr ga [ajuint[2]] Undocumented
** @attr tc [float[2]] Undocumented
** @attr nc [float[2]] Undocumented
** @attr bm [AjPStr] Undocumented
** @attr ref [AjPStr] Undocumented
** @attr dc [AjPStr] Undocumented
** @attr dr [AjPStr] Undocumented
** @attr cc [AjPStr] Undocumented
** @attr sacons [AjPStr] Undocumented
** @attr sqcons [AjPStr] Undocumented
** @attr sscons [AjPStr] Undocumented
** @attr gs [AjPStr] Undocumented
** @attr name [AjPStr*] Undocumented
** @attr str [AjPStr*] Undocumented
** @attr n [ajuint] Undocumented
** @attr Count [ajuint] Count
** @@
******************************************************************************/

typedef struct SeqSStockholm
{
    AjPStr id;
    AjPStr ac;
    AjPStr de;
    AjPStr au;
    AjPStr al;
    AjPStr tp;
    AjPStr se;
    ajuint  ga[2];
    float  tc[2];
    float  nc[2];
    AjPStr bm;
    AjPStr ref;
    AjPStr dc;
    AjPStr dr;
    AjPStr cc;
    AjPStr sacons;
    AjPStr sqcons;
    AjPStr sscons;
    AjPStr gs;
    AjPStr *name;
    AjPStr *str;
    ajuint  n;
    ajuint  Count;
} SeqOStockholm;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPStockholm SeqOStockholm*
407 
408 
409 
410 
/* @datastatic SeqPStockholmdata **********************************************
**
** Ajax Stockholm data object (individual sequences)
**
** Carries single AjPStr values for each annotation member rather than
** arrays, and holds ga, tc and nc as pairs of floats.
**
** @new stockholmdataNew Default constructor
** @delete stockholmdataDel Default destructor
**
** @attr id [AjPStr] identifier
** @attr ac [AjPStr] accession
** @attr de [AjPStr] description
** @attr au [AjPStr] author
** @attr al [AjPStr] Undocumented
** @attr tp [AjPStr] Undocumented
** @attr se [AjPStr] Undocumented
** @attr bm [AjPStr] Undocumented
** @attr sacons [AjPStr] Undocumented
** @attr sqcons [AjPStr] Undocumented
** @attr sscons [AjPStr] Undocumented
** @attr ref [AjPStr] Undocumented
** @attr dc [AjPStr] Undocumented
** @attr dr [AjPStr] Undocumented
** @attr cc [AjPStr] Undocumented
** @attr gs [AjPStr] Undocumented
** @attr ga [float[2]] Undocumented
** @attr tc [float[2]] Undocumented
** @attr nc [float[2]] Undocumented
** @@
******************************************************************************/

typedef struct SeqSStockholmdata
{
    AjPStr id;
    AjPStr ac;
    AjPStr de;
    AjPStr au;
    AjPStr al;
    AjPStr tp;
    AjPStr se;
    AjPStr bm;
    AjPStr sacons;
    AjPStr sqcons;
    AjPStr sscons;
    AjPStr ref;
    AjPStr dc;
    AjPStr dr;
    AjPStr cc;
    AjPStr gs;
    float  ga[2];
    float  tc[2];
    float  nc[2];
} SeqOStockholmdata;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPStockholmdata SeqOStockholmdata*
464 
465 
466 
467 
/* @datastatic SeqPSelexseq ***************************************************
**
** Ajax Selex object for #=SQ information.
**
** One record per sequence: naming and source strings, a weight, and the
** start, stop and length values.
**
** @new selexSQNew Default constructor
** @delete selexSQDel Default destructor
**
** @attr name [AjPStr] Object name
** @attr source [AjPStr] Source file
** @attr ac [AjPStr] accession
** @attr de [AjPStr] description
** @attr wt [float] weight (default 1.0)
** @attr start [ajuint] start position
** @attr stop [ajuint] end position
** @attr len [ajuint] length
** @@
******************************************************************************/

typedef struct SeqSSelexseq
{
    AjPStr name;
    AjPStr source;
    AjPStr ac;
    AjPStr de;
    float  wt;
    ajuint  start;
    ajuint  stop;
    ajuint  len;
} SeqOSelexseq;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPSelexseq SeqOSelexseq*
499 
500 
501 
502 
/* @datastatic SeqPSelex ******************************************************
**
** Ajax Selex object.
**
** Whole-alignment record: the name, str, ss and sq members are arrays,
** and n is documented as the number of sq entries.
**
** @new selexNew Default constructor
** @delete selexDel Default destructor
**
** @attr id [AjPStr] identifier
** @attr ac [AjPStr] accession
** @attr de [AjPStr] description
** @attr au [AjPStr] author
** @attr cs [AjPStr] Undocumented
** @attr rf [AjPStr] Undocumented
** @attr name [AjPStr*] Undocumented
** @attr str [AjPStr*] Undocumented
** @attr ss [AjPStr*] Undocumented
** @attr ga [float[2]] Undocumented
** @attr tc [float[2]] Undocumented
** @attr nc [float[2]] Undocumented
** @attr sq [SeqPSelexseq*] Selex sequence objects
** @attr n [ajuint] Number of SeqPSelexseq sequence objects
** @attr Count [ajuint] Count
** @@
******************************************************************************/

typedef struct SeqSSelex
{
    AjPStr id;
    AjPStr ac;
    AjPStr de;
    AjPStr au;
    AjPStr cs;
    AjPStr rf;
    AjPStr *name;
    AjPStr *str;
    AjPStr *ss;
    float  ga[2];
    float  tc[2];
    float  nc[2];
    SeqPSelexseq *sq;
    ajuint  n;
    ajuint  Count;
} SeqOSelex;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPSelex SeqOSelex*
548 
549 
550 
551 
/* @datastatic SeqPSelexdata **************************************************
**
** Ajax Selex data object (individual sequences)
**
** Has the same member names as the SeqPSelex record, but name, str, ss
** and sq are single values here rather than arrays, and there are no
** count members.
**
** @new selexdataNew Default constructor
** @delete selexdataDel Default destructor
**
** @attr id [AjPStr] identifier
** @attr ac [AjPStr] accession
** @attr de [AjPStr] description
** @attr au [AjPStr] author
** @attr cs [AjPStr] Undocumented
** @attr rf [AjPStr] Undocumented
** @attr name [AjPStr] Undocumented
** @attr str [AjPStr] Undocumented
** @attr ss [AjPStr] Undocumented
** @attr ga [float[2]] Undocumented
** @attr tc [float[2]] Undocumented
** @attr nc [float[2]] Undocumented
** @attr sq [SeqPSelexseq] Selex sequence object
** @@
******************************************************************************/

typedef struct SeqSSelexdata
{
    AjPStr id;
    AjPStr ac;
    AjPStr de;
    AjPStr au;
    AjPStr cs;
    AjPStr rf;
    AjPStr name;
    AjPStr str;
    AjPStr ss;
    float  ga[2];
    float  tc[2];
    float  nc[2];
    SeqPSelexseq sq;
} SeqOSelexdata;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPSelexdata SeqOSelexdata*
593 
594 
595 
596 
597 
598 
/* SCF file header record.
**
** Gives the size and byte offset (from the start of the file) of the
** samples, bases, comments and private data sections, plus the format
** version and sample size.  Member order and widths are left exactly as
** declared.
**
** NOTE(review): presumably magic_number is compared with SCF_MAGIC and the
** layout mirrors the on-disk header - confirm against the SCF reader.
*/
typedef struct SeqSScfHeader
{
    ajuint magic_number;
    ajuint samples;          /* Number of elements in Samples matrix */
    ajuint samples_offset;   /* Byte offset from start of file */
    ajuint bases;            /* Number of bases in Bases matrix */
    ajuint bases_left_clip;  /* OBSOLETE: No. bases in left clip (vector) */
    ajuint bases_right_clip; /* OBSOLETE: No. bases in right clip (qual) */
    ajuint bases_offset;     /* Byte offset from start of file */
    ajuint comments_size;    /* Number of bytes in Comment section */
    ajuint comments_offset;  /* Byte offset from start of file */
    char version[4];         /* "version.revision", eg '3' '.' '0' '0' */
    ajuint sample_size;      /* Size of samples in bytes 1=8bits, 2=16bits*/
    ajuint code_set;         /* code set used (but ignored!)*/
    ajuint private_size;     /* No. of bytes of Private data, 0 if none */
    ajuint private_offset;   /* Byte offset from start of file */
    ajuint spare[18];        /* Unused */
} SeqOScfHeader;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPScfHeader SeqOScfHeader*
619 
620 
621 
622 
/* Pairs a numeric code with a descriptive name; the element type of the
** SeqScfUncertainCodes table. */
typedef struct SeqSScfUncertainty
{
    ajuint code;        /* Numeric code */
    const char* name;   /* Name text for that code */
} SeqOScfUncertainty;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPScfUncertainty SeqOScfUncertainty*
630 
631 
632 
633 
/* Ten code/name pairs with consecutive codes 0-9, so each entry's array
** index equals its code.
** NOTE(review): presumably indexed by the SCF header code_set value -
** confirm against the SCF reader. */
static SeqOScfUncertainty SeqScfUncertainCodes[] = {
    {0,       "{A,C,G,T,-}"},
    {1,       "Staden"},
    {2,       "IUPAC (NC-IUB)"},
    {3,       "Pharmacia A.L.F. (NC-IUB)"},
    {4,       "{A,C,G,T,N}   (ABI 373A)"},
    {5,       "IBI/Pustell"},
    {6,       "DNA*"},
    {7,       "DNASIS"},
    {8,       "IG/PC-Gene"},
    {9,       "MicroGenie"},
};
646 
647 
648 
649 
/*
 * Type definition for the sequence data
 *
 * One record per called base: the position of its peak in the Samples
 * matrix, one probability value for each of A, C, G and T, and the called
 * base character itself.
 */
typedef struct SeqSScfBase {
    ajuint    peak_index;  /* Index into Samples matrix for base posn */
    unsigned char prob_A;  /* Probability of it being an A */
    unsigned char prob_C;  /* Probability of it being a C */
    unsigned char prob_G;  /* Probability of it being a G */
    unsigned char prob_T;  /* Probability of it being a T */
    char base;             /* Called base character */
    char spare[3];         /* Spare */
} SeqOScfBase;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPScfBase SeqOScfBase*
664 
665 
666 
667 
/* SCF data record; its only member is the file header, held by value */
typedef struct SeqSScfData
{
    SeqOScfHeader header;   /* SCF header record */
} SeqOScfData;





/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPScfData SeqOScfData*
678 
679 
680 
681 
/* @datastatic SeqPListUsa ****************************************************
**
** Usa processing list of USAs from a list file.
**
** Includes data from the original USA (@listfile)
**
** Each record holds one USA together with the begin, end, reverse and
** format settings given in the original USA.
**
** @alias SeqSListUsa
** @alias SeqOListUsa
**
** @attr Begin [ajint] Begin if defined in original USA
** @attr End [ajint] End if defined in original USA
** @attr Rev [AjBool] Reverse if defined in original USA
** @attr Format [ajuint] Format number from original USA
** @attr Formatstr [AjPStr] Format name from original USA
** @attr Usa [AjPStr] Current USA
** @attr Fpos [ajulong] Start position offset
** @attr Features [AjBool] if true, process features
** @attr Padding [char[4]] Padding to alignment boundary
** @@
******************************************************************************/

typedef struct SeqSListUsa
{
    ajint Begin;
    ajint End;
    AjBool Rev;
    ajuint Format;
    AjPStr Formatstr;
    AjPStr Usa;
    ajulong Fpos;
    AjBool Features;
    char Padding[4];
} SeqOListUsa;

/* Textual macro, not a typedef: declare one variable per statement */
#define SeqPListUsa SeqOListUsa*
717 
718 
719 
720 
/* @enumstatic SeqEPrefixGenbank **********************************************
**
** Record types recognised in Genbank, Refseq, GenPept and Codata entries.
** Values are consecutive from zero, so GB_MAX is the number of real values.
**
** @value GB_UNK  Prefix not recognised
** @value GB_AC   Accession
** @value GB_BASE Base count
** @value GB_CC   Comment
** @value GB_DEF  Definition
** @value GB_FEAT Feature
** @value GB_ID   Locus
** @value GB_KEY  Keywords
** @value GB_ORI  Origin
** @value GB_REF  Reference
** @value GB_SRC  Source organism
** @value GB_SQ   Sequence
** @value GB_VER  Version
** @value GB_WP   GCG header
** @value GB_END  Final // record
** @value GB_MORE Blank continuation
** @value GB_MAX  Beyond last value
******************************************************************************/

typedef enum
{
    GB_UNK = 0,
    GB_AC,   GB_BASE, GB_CC,  GB_DEF,
    GB_FEAT, GB_ID,   GB_KEY, GB_ORI,
    GB_REF,  GB_SRC,  GB_SQ,  GB_VER,
    GB_WP,
    GB_END,  GB_MORE,
    GB_MAX
} SeqEPrefixGenbank;
764 
765 
766 
767 
/* @enumstatic SeqEPrefixGenbankMore ******************************************
**
** Subrecord types within Genbank, Refseq, GenPept and Codata records.
** Values are consecutive from zero.
**
** @value GB_MORE_UNK  Prefix not recognised
** @value GB_MORE_STD  Standard (non-prefix) line
** @value GB_MORE_AUT  AUTHORS
** @value GB_MORE_JNL  JOURNAL
** @value GB_MORE_ORG  ORGANISM
** @value GB_MORE_TIT  TITLE
** @value GB_MORE_MORE Blank prefix of at least 10 characters
** @value GB_MORE_MAX  Beyond last value
******************************************************************************/

typedef enum
{
    GB_MORE_UNK = 0,
    GB_MORE_STD,
    GB_MORE_AUT, GB_MORE_JNL, GB_MORE_ORG, GB_MORE_TIT,
    GB_MORE_MORE,
    GB_MORE_MAX
} SeqEPrefixGenbankMore;
793 
794 
795 
796 
/* @enumstatic SeqEPrefixSwiss ************************************************
**
** Two-character line prefixes of SwissProt and EMBL entries.
** Values are consecutive from zero, in the order listed here.
**
** @value SWISS_UNK  Prefix not recognised
** @value SWISS_AC   Accession
** @value SWISS_AS   EMBL AS line
** @value SWISS_AV   Staden experiment AV line
** @value SWISS_CC   Comment
** @value SWISS_CO   EMBL contig entry
** @value SWISS_DE   Description
** @value SWISS_DR   Database reference
** @value SWISS_DT   Date
** @value SWISS_EX   Staden experiment data
** @value SWISS_FH   EMBL feature header
** @value SWISS_FT   Feature
** @value SWISS_GN   Gene name
** @value SWISS_ID   Identifier line
** @value SWISS_IV   EMBL IV record
** @value SWISS_KW   Keyword
** @value SWISS_OC   Organism classification
** @value SWISS_OG   Organelle
** @value SWISS_OH   Organism host
** @value SWISS_OS   Species
** @value SWISS_OX   NCBI TaxID
** @value SWISS_PE   Swissprot evidence
** @value SWISS_RA   Reference authors
** @value SWISS_RC   Reference comment
** @value SWISS_RG   Reference RG
** @value SWISS_RL   Reference location
** @value SWISS_RN   Reference number
** @value SWISS_RP   Reference RP
** @value SWISS_RT   Reference RT
** @value SWISS_RX   Reference RX
** @value SWISS_SQ   Sequence
** @value SWISS_SV   SeqVersion
** @value SWISS_WP   GCG header
** @value SWISS_XX   Spacer
** @value SWISS_END  Final // record
** @value SWISS_MORE Blank continuation (sequence)
** @value SWISS_MAX  Beyond last value
******************************************************************************/

typedef enum
{
    SWISS_UNK = 0,
    SWISS_AC,
    SWISS_AS,
    SWISS_AV,
    SWISS_CC,
    SWISS_CO,
    SWISS_DE,
    SWISS_DR,
    SWISS_DT,
    SWISS_EX,
    SWISS_FH,
    SWISS_FT,
    SWISS_GN,
    SWISS_ID,
    SWISS_IV,
    SWISS_KW,
    SWISS_OC,
    SWISS_OG,
    SWISS_OH,
    SWISS_OS,
    SWISS_OX,
    SWISS_PE,
    SWISS_RA,
    SWISS_RC,
    SWISS_RG,
    SWISS_RL,
    SWISS_RN,
    SWISS_RP,
    SWISS_RT,
    SWISS_RX,
    SWISS_SQ,
    SWISS_SV,
    SWISS_WP,
    SWISS_XX,
    SWISS_END,
    SWISS_MORE,
    SWISS_MAX
} SeqEPrefixSwiss;
862 
863 
864 
865 
/* @enumstatic SeqEDesSwiss ***************************************************
**
** Codes for the keywords of a SwissProt description.
** Values are consecutive from zero.
**
** @value SWISS_DES_UNK  Code not recognised
** @value SWISS_DES_ALT  AltName:
** @value SWISS_DES_CONT Contains:
** @value SWISS_DES_FLG  Flags:
** @value SWISS_DES_INC  Includes:
** @value SWISS_DES_REC  RecName:
** @value SWISS_DES_SUB  SubName:
** @value SWISS_DES_MAX  Beyond last value
******************************************************************************/

typedef enum
{
    SWISS_DES_UNK = 0,
    SWISS_DES_ALT,
    SWISS_DES_CONT,
    SWISS_DES_FLG,
    SWISS_DES_INC,
    SWISS_DES_REC,
    SWISS_DES_SUB,
    SWISS_DES_MAX
} SeqEDesSwiss;
887 
888 
889 
890 
/* @enumstatic SeqESubSwiss ***************************************************
**
** Subcodes for the "name=" keys of a SwissProt description.
** Values are consecutive from zero.
**
** @value SWISS_SUB_UNK     Code not recognised
** @value SWISS_SUB_ALLER   Allergen=
** @value SWISS_SUB_BIOTECH Biotech=
** @value SWISS_SUB_CDA     CD_antigen=
** @value SWISS_SUB_EC      EC=
** @value SWISS_SUB_FULL    Full=
** @value SWISS_SUB_INN     INN=
** @value SWISS_SUB_SHORT   Short=
** @value SWISS_SUB_MAX     Beyond last value
******************************************************************************/

typedef enum
{
    SWISS_SUB_UNK = 0,
    SWISS_SUB_ALLER, SWISS_SUB_BIOTECH, SWISS_SUB_CDA,
    SWISS_SUB_EC,    SWISS_SUB_FULL,    SWISS_SUB_INN,
    SWISS_SUB_SHORT,
    SWISS_SUB_MAX
} SeqESubSwiss;
918 
919 
920 
921 
/* Forward declarations of file-local (static) functions.
**
** Every format reader declared here has the same signature as the Read
** member of SeqOInFormat: it takes the sequence and the sequence input
** object and returns an AjBool.
*/

/* Line prefix and description code classifiers returning the enum types */
static SeqEDesSwiss      seqDesSwiss(const AjPStr str);
static SeqESubSwiss      seqDessubSwiss(AjPStr *Pstr);
static SeqEPrefixGenbank seqPrefixGenbank(const AjPStr str);
static SeqEPrefixGenbankMore seqPrefixGenbankMore(const AjPStr str);
static SeqEPrefixSwiss   seqPrefixSwiss(const AjPStr str);

static AjBool     seqReadAbi(AjPSeq thys, AjPSeqin seqin);

/* General helpers and format readers */
static void       seqAccSave(AjPSeq thys, const AjPStr acc);
static ajuint     seqAppend(AjPStr* seq, const AjPStr line);
static ajuint     seqAppendK(AjPStr* seq, char ch);
static const AjPStr seqAppendWarn(AjPStr* seq, const AjPStr line,
                                  ajuint informat);
static ajuint     seqAppendCommented(AjPStr* seq, AjBool* incomment,
                                     const AjPStr line);
static AjBool     seqClustalReadseq(const AjPStr rdLine,
                                    const AjPTable msftable);
static AjBool     seqDefine(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqinFormatFind(const AjPStr format, ajint *iformat);
static AjBool     seqinFormatSet(AjPSeqin seqin, AjPSeq thys);
static AjBool     seqGcgDots(AjPSeq thys, const AjPSeqin seqin,
                             AjPStr* pline, ajuint maxlines, ajuint *len);
static void       seqGcgRegInit(void);
static AjBool     seqGcgMsfDots(AjPSeq thys, const AjPSeqin seqin,
                                AjPStr* pline,
                                ajuint maxlines, ajuint *len);
static AjBool     seqGcgMsfHeader(const AjPStr line, SeqPMsfItem* msfitem);
static AjBool     seqGcgMsfReadseq(const AjPStr rdline,
                                   const AjPTable msftable);
static AjBool     seqHennig86Readseq(const AjPStr rdline,
                                     const AjPTable msftable);
static AjBool     seqinUfoLocal(const AjPSeqin thys);
static void       seqListNoComment(AjPStr* text);
static AjBool     seqinListProcess(AjPSeqin seqin, AjPSeq thys,
                                   const AjPStr usa);
static void       seqMsfDataDel(SeqPMsfData* pthys);
/* SeqPMsfData is a macro, so this parameter is a pointer to const data */
static void       seqMsfDataTrace(const SeqPMsfData thys);
static void       seqMsfItemDel(SeqPMsfItem* pthys);
static void       seqMsfTabDel(void **key, void **value, void *cl);
static void       seqMsfTabList(const void *key, void **value, void *cl);
static AjBool     seqPhylipReadseq(const AjPStr rdline,
                                   const AjPTable phytable,
                                   const AjPStr token,
                                   ajuint len, ajuint* ilen, AjBool* done);
static AjBool     seqQueryMatch(const AjPQuery query, const AjPSeq thys);
static AjBool     seqRead(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadAce(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadAcedb(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadBam(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadBiomart(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadClustal(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadCodata(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadDAS(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadDbId(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadEmbl(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadEnsembl(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadExperiment(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadFasta(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadFastq(AjPSeq thys, AjPSeqin seqin);
static AjBool     seqReadFastqIllumina(AjPSeq thys, AjPSeqin seqin);
/*static AjBool     seqReadFastqInt(AjPSeq thys, AjPSeqin seqin);*/
static AjBool     seqReadFastqSanger(AjPSeq thys, AjPSeqin seqin);
984 static AjBool     seqReadFastqSolexa(AjPSeq thys, AjPSeqin seqin);
985 static AjBool     seqReadFitch(AjPSeq thys, AjPSeqin seqin);
986 static ajuint     seqReadFmt(AjPSeq thys, AjPSeqin seqin,
987                              ajuint format);
988 static AjBool     seqReadGcg(AjPSeq thys, AjPSeqin seqin);
989 static AjBool     seqReadGde(AjPSeq thys, AjPSeqin seqin);
990 static AjBool     seqReadGenbank(AjPSeq thys, AjPSeqin seqin);
991 static AjBool     seqReadGenpept(AjPSeq thys, AjPSeqin seqin);
992 static AjBool     seqReadGifasta(AjPSeq thys, AjPSeqin seqin);
993 static AjBool     seqReadGff2(AjPSeq thys, AjPSeqin seqin);
994 static AjBool     seqReadGff3(AjPSeq thys, AjPSeqin seqin);
995 static AjBool     seqReadHennig86(AjPSeq thys, AjPSeqin seqin);
996 static AjBool     seqReadIg(AjPSeq thys, AjPSeqin seqin);
997 static AjBool     seqReadIgstrict(AjPSeq thys, AjPSeqin seqin);
998 static AjBool     seqReadIguspto(AjPSeq thys, AjPSeqin seqin);
999 static AjBool     seqReadJackknifer(AjPSeq thys, AjPSeqin seqin);
1000 static AjBool     seqReadMase(AjPSeq thys, AjPSeqin seqin);
1001 static AjBool     seqReadMega(AjPSeq thys, AjPSeqin seqin);
1002 static AjBool     seqReadMsf(AjPSeq thys, AjPSeqin seqin);
1003 static AjBool     seqReadNbrf(AjPSeq thys, AjPSeqin seqin);
1004 static AjBool     seqReadNcbi(AjPSeq thys, AjPSeqin seqin);
1005 static AjBool     seqReadNexus(AjPSeq thys, AjPSeqin seqin);
1006 static AjBool     seqReadNibble(AjPSeq thys, AjPSeqin seqin);
1007 static AjBool     seqReadPdb(AjPSeq thys, AjPSeqin seqin);
1008 static AjBool     seqReadPdbseq(AjPSeq thys, AjPSeqin seqin);
1009 static AjBool     seqReadPdbnuc(AjPSeq thys, AjPSeqin seqin);
1010 static AjBool     seqReadPdbnucseq(AjPSeq thys, AjPSeqin seqin);
1011 static AjBool     seqReadPhylip(AjPSeq thys, AjPSeqin seqin);
1012 static AjBool     seqReadPhylipnon(AjPSeq thys, AjPSeqin seqin);
1013 static AjBool     seqReadRaw(AjPSeq thys, AjPSeqin seqin);
1014 static AjBool     seqReadRefseq(AjPSeq thys, AjPSeqin seqin);
1015 static AjBool     seqReadRefseqp(AjPSeq thys, AjPSeqin seqin);
1016 static AjBool     seqReadSam(AjPSeq thys, AjPSeqin seqin);
1017 static AjBool     seqReadScf(AjPSeq thys, AjPSeqin seqin);
1018 static AjBool     seqReadSelex(AjPSeq thys, AjPSeqin seqin);
1019 static AjBool     seqReadStockholm(AjPSeq thys, AjPSeqin seqin);
1020 static AjBool     seqReadStaden(AjPSeq thys, AjPSeqin seqin);
1021 static AjBool     seqReadStrider(AjPSeq thys, AjPSeqin seqin);
1022 static AjBool     seqReadSwiss(AjPSeq thys, AjPSeqin seqin);
1023 static AjBool     seqReadText(AjPSeq thys, AjPSeqin seqin);
1024 static AjBool     seqReadTreecon(AjPSeq thys, AjPSeqin seqin);
1025 static void       seqSelexAppend(const AjPStr src, AjPStr *dest, ajuint beg,
1026                                  ajuint end);
1027 static void       seqSelexCopy(AjPSeq *thys, SeqPSelex selex, ajuint n);
1028 static AjBool     seqSelexHeader(SeqPSelex *thys, const AjPStr line,
1029                                  AjBool *named, ajuint *sqcnt);
1030 static void       seqSelexPos(const AjPStr line, ajuint *begin, ajuint *end);
1031 static AjBool     seqSelexReadBlock(SeqPSelex *thys, AjBool *named, ajuint n,
1032                                     AjPStr *line, AjPSeqin seqin, AjPStr *astr);
1033 static AjBool     seqSetInFormat(const AjPStr format);
1034 static void       seqSetName(AjPSeq thys, const AjPStr str);
1035 static void       seqitemSetName(SeqPMsfItem thys, const AjPStr str);
1036 static void       seqnameSetName(AjPStr *name, const AjPStr str);
1037 static void       seqSetNameFile(AjPSeq thys, const AjPSeqin seqin);
1038 static void       seqSetNameNospace(AjPStr* name, const AjPStr str);
1039 static void       seqStockholmCopy(AjPSeq *thys, SeqPStockholm stock, ajint n);
1040 static void       seqSvSave(AjPSeq thys, const AjPStr sv);
1041 static void       seqTaxSave(AjPSeq thys, const AjPStr tax, ajuint level);
1042 static void       seqTaxidSaveI(AjPSeq thys, ajuint tax);
1043 static void       seqTaxidSaveS(AjPSeq thys, const AjPStr tax);
1044 static void       seqTextSeq(AjPStr* textptr, const AjPStr seq);
1045 static void       seqUsaListTrace(const AjPList list);
1046 static AjBool     seqinUsaProcess(AjPSeqin seqin, AjPSeq thys);
1047 static void       seqUsaRegInit(void);
1048 static void       seqUsaRestore(AjPSeqin seqin, const SeqPListUsa node);
1049 static void       seqUsaSave(SeqPListUsa node, const AjPSeqin seqin);
1050 
1051 static void       seqqualAppendWarn(AjPStr* seq, const AjPStr line);
1052 
1053 static SeqPStockholm stockholmNew(ajuint i);
1054 static void         stockholmDel(SeqPStockholm *thys);
1055 
1056 static void         selexDel(SeqPSelex *thys);
1057 static void         selexseqDel(SeqPSelexseq *thys);
1058 
1059 /*
1060   static SeqPStockholmdata stockholmdataNew(void);
1061   static void         stockholmdataDel(SeqPStockholmdata *thys);
1062   static SeqPSelexdata seqSelexClone(const SeqPSelexdata thys);
1063   static SeqPSelexdata selexdataNew(void);
1064   static void         selexdataDel(SeqPSelexdata *thys);
1065 */
1066 
1067 static SeqPSelex     selexNew(ajuint n);
1068 static SeqPSelexseq  selexseqNew(void);
1069 
/* static data that refers to the functions declared above,
** and so must be defined after those declarations */
1071 
1072 
1073 
1074 
1075 /* @funclist seqinFormatDef ***************************************************
1076 **
1077 ** Functions to read each sequence format
1078 **
1079 ** New documentation on sequence formats:
1080 ** http://www.megasoftware.net/mega4.pdf pages 55 onwards (sections 4.1, 4.2)
1081 ** describe MEGA, some other formats, simple XML (name and seq)
1082 **
1083 ** The SeqIO program supports some non-EMBOSS formats:
1084 ** http://biowulf.nih.gov/apps/seqio_docs/seqio_user.html
1085 ** notably FASTA-output, BLAST-output
1086 ** and has its own rules for database definitions (BioSeq)
1087 ** and database references
1088 **
1089 ** For XML formats see Paul Gordon's list at
1090 ** http://www.visualgenomics.ca/gordonp/xml/
1091 **
1092 ******************************************************************************/
1093 
1094 static SeqOInFormat seqinFormatDef[] =
1095 {
1096 /*   "Name",
1097      "Obo"      "Description" */
1098 /*   Alias,   Try,     Nucleotide, Protein   */
1099 /*   Feature  Gap,     ReadFunction,  Multiset, Binary */
1100     {"unknown",
1101      "0000", "Unknown format",
1102      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1103      AJFALSE, AJTRUE,  &seqReadText, AJFALSE, AJFALSE}, /* alias for text */
1104     {"gcg",
1105      "1935", "GCG sequence format",
1106      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1107      AJFALSE, AJTRUE,  &seqReadGcg, AJFALSE, AJFALSE}, /* do 1st,
1108                                                          headers mislead */
1109     {"gcg8",
1110      "1935", "GCG old (version 8) sequence format",
1111      AJTRUE,  AJFALSE, AJTRUE,  AJTRUE,
1112      AJFALSE, AJTRUE,  &seqReadGcg, AJFALSE, AJFALSE}, /* alias for gcg
1113                                                          (8.x too) */
1114     {"embl",
1115      "1927", "EMBL format",
1116      AJFALSE, AJTRUE,  AJTRUE,  AJFALSE,
1117      AJTRUE,  AJTRUE,  &seqReadEmbl, AJFALSE, AJFALSE},
1118     {"em",
1119      "1927", "EMBL format (alias)",
1120      AJTRUE,  AJFALSE, AJTRUE,  AJFALSE,
1121      AJTRUE,  AJTRUE,  &seqReadEmbl, AJFALSE, AJFALSE}, /* alias for embl */
1122     {"swiss",
1123      "1963", "Swissprot entry format",
1124      AJFALSE, AJTRUE,  AJFALSE, AJTRUE,
1125      AJTRUE,  AJTRUE,  &seqReadSwiss, AJFALSE, AJFALSE},
1126     {"sw",
1127      "1963", "Swissprot entry format (alias)",
1128      AJTRUE,  AJFALSE, AJFALSE, AJTRUE,
1129      AJTRUE,  AJTRUE,  &seqReadSwiss, AJFALSE, AJFALSE}, /* alias for swiss */
1130     {"swissprot",
1131      "1963", "Swissprot entry format (alias)",
1132      AJTRUE,  AJFALSE, AJFALSE, AJTRUE,
1133      AJTRUE,  AJTRUE,  &seqReadSwiss, AJFALSE, AJFALSE},
1134     {"uniprot",
1135      "2188", "Swissprot entry format (alias)",
1136      AJTRUE,  AJFALSE, AJFALSE, AJTRUE,
1137      AJTRUE,  AJTRUE,  &seqReadSwiss, AJFALSE, AJFALSE},
1138     {"nbrf",
1139      "1948", "NBRF/PIR entry format",
1140      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1141      AJTRUE,  AJTRUE,  &seqReadNbrf, AJFALSE, AJFALSE}, /* test before NCBI */
1142     {"pir",
1143      "1948", "NBRF/PIR entry format (alias)",
1144      AJTRUE,  AJFALSE, AJTRUE,  AJTRUE,
1145      AJTRUE,  AJTRUE,  &seqReadNbrf, AJFALSE, AJFALSE}, /* alias for nbrf */
1146     {"pdb",
1147      "1950", "PDB protein databank format ATOM lines",
1148      AJFALSE, AJTRUE,  AJFALSE, AJTRUE,
1149      AJFALSE, AJFALSE, &seqReadPdb, AJFALSE, AJFALSE},
1150     {"pdbseq",
1151      "1953", "PDB protein databank format SEQRES lines",
1152      AJFALSE, AJFALSE, AJFALSE, AJTRUE,
1153      AJFALSE, AJFALSE, &seqReadPdbseq, AJFALSE, AJFALSE},
1154     {"pdbnuc",
1155      "1951", "PDB protein databank format nucleotide ATOM lines",
1156      AJFALSE, AJFALSE, AJTRUE,  AJFALSE,
1157      AJFALSE, AJFALSE, &seqReadPdbnuc, AJFALSE, AJFALSE},
1158     {"pdbnucseq",
1159      "1952", "PDB protein databank format nucleotide SEQRES lines",
1160      AJFALSE, AJFALSE, AJTRUE,  AJFALSE,
1161      AJFALSE, AJFALSE, &seqReadPdbnucseq, AJFALSE, AJFALSE},
1162     {"fasta",
1163      "1929", "FASTA format including NCBI-style IDs",
1164      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1165      AJFALSE, AJTRUE,  &seqReadNcbi, AJFALSE, AJFALSE}, /* alias for ncbi,
1166                                                           preferred name */
1167     {"ncbi",
1168      "1929", "FASTA format including NCBI-style IDs (alias)",
1169      AJTRUE,  AJFALSE, AJTRUE,  AJTRUE,
1170      AJFALSE, AJTRUE,  &seqReadNcbi, AJFALSE, AJFALSE}, /* test before
1171                                                           pearson */
1172     {"gifasta",
1173      "1940", "FASTA format including NCBI-style GIs (alias)",
1174      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1175      AJFALSE, AJTRUE,  &seqReadGifasta, AJFALSE, AJFALSE}, /* NCBI with GI
1176                                                              as ID*/
1177     {"pearson",
1178      "1954", "Plain old fasta format with IDs not parsed further",
1179      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1180      AJFALSE, AJTRUE,  &seqReadFasta, AJFALSE, AJFALSE}, /* plain fasta - off
1181                                                            by default, can
1182                                                            read bad files */
1183     {"fastq",
1184      "1930", "FASTQ short read format ignoring quality scores",
1185      AJFALSE, AJTRUE,  AJTRUE,  AJFALSE,
1186      AJFALSE, AJFALSE, &seqReadFastq, AJFALSE, AJFALSE},
1187     {"fastq-sanger",
1188      "1932", "FASTQ short read format with phred quality",
1189      AJFALSE, AJFALSE, AJTRUE,  AJFALSE,
1190      AJFALSE, AJFALSE, &seqReadFastqSanger, AJFALSE, AJFALSE},
1191     {"fastq-illumina",
1192      "1931", "FASTQ Illumina 1.3 short read format",
1193      AJFALSE, AJFALSE, AJTRUE,  AJFALSE,
1194      AJFALSE, AJFALSE, &seqReadFastqIllumina, AJFALSE, AJFALSE},
1195     {"fastq-solexa",
1196      "1933", "FASTQ Solexa/Illumina 1.0 short read format",
1197      AJFALSE, AJFALSE, AJTRUE,  AJFALSE,
1198      AJFALSE, AJFALSE, &seqReadFastqSolexa, AJFALSE, AJFALSE},
1199 /*
1200 **  {"fastq-int",  "FASTQ short read format with integer Solexa scores",
1201 **       AJFALSE, AJFALSE, AJTRUE,  AJFALSE,
1202 **       AJFALSE, AJFALSE, seqReadFastqInt, AJFALSE, AJFALSE},
1203 */
1204     {"sam",
1205      "2573", "Sequence Alignment/Map (SAM) format", /* biomart also tsv */
1206      AJFALSE, AJTRUE,  AJTRUE,  AJFALSE,
1207      AJFALSE, AJTRUE, &seqReadSam, AJFALSE, AJFALSE},
1208     {"genbank",
1209      "1936", "Genbank entry format",
1210      AJFALSE, AJTRUE,  AJTRUE,  AJFALSE,
1211      AJTRUE,  AJTRUE,  &seqReadGenbank, AJFALSE, AJFALSE},
1212     {"gb",
1213      "1936", "Genbank entry format (alias)",
1214      AJTRUE,  AJFALSE, AJTRUE,  AJFALSE,
1215      AJTRUE,  AJTRUE,  &seqReadGenbank, AJFALSE, AJFALSE}, /* alias for
1216                                                              genbank */
1217     {"ddbj",
1218      "1936", "Genbank/DDBJ entry format (alias)",
1219      AJTRUE,  AJFALSE, AJTRUE,  AJFALSE,
1220      AJTRUE,  AJTRUE,  &seqReadGenbank, AJFALSE, AJFALSE}, /* alias for
1221                                                              genbank */
1222     {"refseq",
1223      "1936", "Refseq entry format (alias)",
1224      AJTRUE,  AJFALSE, AJTRUE,  AJFALSE,
1225      AJTRUE,  AJTRUE,  &seqReadRefseq, AJFALSE, AJFALSE}, /* alias for
1226                                                             genbank */
1227     {"refseqp",
1228      "1958", "Refseq protein entry format",
1229      AJFALSE, AJFALSE, AJFALSE, AJTRUE,       /* genbank format proteins */
1230      AJTRUE,  AJTRUE,  &seqReadRefseqp, AJFALSE, AJFALSE},
1231     {"genpept",     "1937", "Refseq protein entry format (alias)",
1232      AJFALSE, AJFALSE, AJFALSE, AJTRUE,
1233      AJFALSE, AJTRUE,  &seqReadGenpept, AJFALSE, AJFALSE},
1234     {"codata",
1235      "1925", "Codata entry format",
1236      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1237      AJTRUE,  AJTRUE,  &seqReadCodata, AJFALSE, AJFALSE},
1238     {"strider",
1239      "1962", "DNA strider output format",
1240      AJFALSE, AJTRUE,  AJTRUE,  AJFALSE,
1241      AJFALSE, AJTRUE,  &seqReadStrider, AJFALSE, AJFALSE},
1242     {"clustal",
1243      "1924", "Clustalw output format",
1244      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1245      AJFALSE, AJTRUE,  &seqReadClustal, AJFALSE, AJFALSE},
1246     {"aln",
1247      "1924", "Clustalw output format (alias)",
1248      AJTRUE,  AJFALSE, AJTRUE,  AJTRUE,
1249      AJFALSE, AJTRUE,  &seqReadClustal, AJFALSE, AJFALSE}, /* alias for
1250                                                              clustal */
1251     {"phylip",
1252      "1955", "Phylip interleaved and non-interleaved formats",
1253      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1254      AJFALSE, AJTRUE,  &seqReadPhylip, AJTRUE, AJFALSE},
1255     {"phylipnon",
1256      "1956", "Phylip non-interleaved format",
1257      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1258      AJFALSE, AJTRUE,  &seqReadPhylipnon, AJTRUE, AJFALSE}, /* tried by
1259                                                               phylip */
1260     {"ace",
1261      "3001", "ACE sequence format",
1262      AJFALSE, AJTRUE,  AJTRUE,  AJFALSE,
1263      AJFALSE, AJTRUE,  &seqReadAce, AJFALSE, AJFALSE},
1264     {"consed",
1265      "3001", "ACE sequence format",
1266      AJTRUE,  AJTRUE,  AJTRUE,  AJFALSE,
1267      AJFALSE, AJTRUE,  &seqReadAce, AJFALSE, AJFALSE}, /* alias for ace */
1268     {"acedb",
1269      "1923", "ACEDB sequence format",
1270      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1271      AJFALSE, AJTRUE,  &seqReadAcedb, AJFALSE, AJFALSE},
1272     {"dbid",
1273      "1926", "Fasta format variant with database name before ID",
1274      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1275      AJFALSE, AJTRUE,  &seqReadDbId, AJFALSE, AJFALSE}, /* odd fasta with id as
1276                                                           second token */
1277     {"msf",
1278      "1947", "GCG MSF (multiple sequence file) file format",
1279      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1280      AJFALSE, AJTRUE,  &seqReadMsf, AJFALSE, AJFALSE},
1281     {"hennig86",
1282      "1941", "Hennig86 output format",
1283      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1284      AJFALSE, AJTRUE,  &seqReadHennig86, AJFALSE, AJFALSE},
1285     {"jackknifer",
1286      "1944", "Jackknifer interleaved and non-interleaved formats",
1287      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1288      AJFALSE, AJTRUE,  &seqReadJackknifer, AJFALSE, AJFALSE},
1289     {"nexus",
1290      "1949", "Nexus/paup interleaved format",
1291      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1292      AJFALSE, AJTRUE,  &seqReadNexus, AJFALSE, AJFALSE},
1293     {"paup",
1294      "1949", "Nexus/paup interleaved format (alias)",
1295      AJTRUE,  AJFALSE, AJTRUE,  AJTRUE,
1296      AJFALSE, AJTRUE,  &seqReadNexus, AJFALSE, AJFALSE}, /* alias for nexus */
1297     {"treecon",
1298      "1965", "Treecon output format",
1299      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1300      AJFALSE, AJTRUE,  &seqReadTreecon, AJFALSE, AJFALSE},
1301     {"mega",
1302      "1946 1971", "Mega interleaved and non-interleaved formats",
1303      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1304      AJFALSE, AJTRUE,  &seqReadMega, AJFALSE, AJFALSE},
1305     {"igstrict",
1306      "1943", "Intelligenetics sequence format strict parser",
1307      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1308      AJFALSE, AJTRUE,  &seqReadIgstrict, AJFALSE, AJFALSE},
1309     {"iguspto",
1310      "1942", "US patent office multi-line Intelligenetics sequence format",
1311      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1312      AJFALSE, AJTRUE,  &seqReadIguspto, AJFALSE, AJFALSE},
1313     {"ig",
1314      "1942", "Intelligenetics sequence format",
1315      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1316      AJFALSE, AJTRUE,  &seqReadIg, AJFALSE, AJFALSE}, /* can read almost
1317                                                         anything */
1318     {"staden",
1319      "1960", "Old staden package sequence format",
1320      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1321      AJFALSE, AJTRUE,  &seqReadStaden, AJFALSE, AJFALSE},/* original staden
1322                                                            format */
1323     {"textonly",
1324      "1964", "Plain text",
1325      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1326      AJFALSE, AJTRUE,  &seqReadText, AJFALSE, AJFALSE},/* can read almost
1327                                                          anything */
1328     {"plain",
1329      "1964", "Plain text (alias)",
1330      AJTRUE,  AJFALSE, AJTRUE,  AJTRUE,
1331      AJFALSE, AJTRUE,  &seqReadText, AJFALSE, AJFALSE}, /* alias for text */
1332     {"asis",
1333      "1964", "Data as commandline string",
1334      AJTRUE,  AJFALSE, AJTRUE,  AJTRUE,
1335      AJFALSE, AJTRUE,  &seqReadText, AJFALSE, AJFALSE}, /* one line only */
1336     {"gff2",
1337      "1938",  "GFF feature file with sequence in the header",
1338      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1339      AJTRUE,  AJTRUE,  &seqReadGff2, AJFALSE, AJFALSE},
1340     {"gff3",
1341      "1939",  "GFF3 feature file with sequence",
1342      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1343      AJTRUE,  AJTRUE,  &seqReadGff3, AJFALSE, AJFALSE},
1344     {"gff",
1345      "1939",  "GFF3 feature file with sequence",
1346      AJTRUE,  AJFALSE,  AJTRUE,  AJTRUE,
1347      AJTRUE,  AJTRUE,  &seqReadGff3, AJFALSE, AJFALSE},
1348     {"stockholm",
1349      "1961",  "Stockholm (pfam) format",
1350      AJFALSE, AJTRUE,  AJFALSE,  AJTRUE,
1351      AJFALSE, AJTRUE,  &seqReadStockholm, AJFALSE, AJFALSE},
1352     {"pfam",
1353      "1961",  "Stockholm (pfam) format (alias)",
1354      AJTRUE,  AJTRUE,  AJFALSE,  AJTRUE,
1355      AJFALSE, AJTRUE,  &seqReadStockholm, AJFALSE, AJFALSE},
1356     {"selex",
1357      "1959",  "Selex format",                /* can read almost anything */
1358      AJFALSE, AJFALSE, AJTRUE,   AJTRUE,
1359      AJFALSE, AJTRUE,  &seqReadSelex, AJFALSE, AJFALSE},
1360     {"fitch",
1361      "1934",  "Fitch program format",
1362      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1363      AJFALSE, AJTRUE,  &seqReadFitch, AJFALSE, AJFALSE},
1364     {"biomart",
1365      "0000", "Biomart tab-delimited results", /* may clash with SAM */
1366      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1367      AJFALSE, AJTRUE,  &seqReadBiomart, AJFALSE, AJFALSE},
1368     {"mase",
1369      "1945", "Mase program format",
1370      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1371      AJFALSE, AJTRUE,  &seqReadMase, AJFALSE, AJFALSE}, /* like ig - off by
1372                                                           default */
1373     {"experiment",
1374      "1928", "Staden experiment file",
1375      AJFALSE, AJTRUE, AJTRUE,  AJFALSE,
1376      AJFALSE, AJTRUE,  &seqReadExperiment, AJFALSE, AJFALSE},
1377     {"gde",
1378      "0000", "GDE program format",
1379      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1380      AJFALSE, AJTRUE,  &seqReadGde, AJFALSE, AJFALSE},
1381      {"raw",
1382      "1957", "Raw sequence with no non-sequence characters",
1383      AJFALSE, AJTRUE,  AJTRUE,  AJTRUE,
1384      AJFALSE, AJFALSE, &seqReadRaw, AJFALSE, AJTRUE}, /* OK - only sequence
1385                                                         chars allowed - but
1386                                                         binary so not piped */
1387     {"nibble",
1388      "0000", "Nibble format",
1389      AJFALSE, AJTRUE,  AJTRUE,  AJFALSE,
1390      AJFALSE, AJFALSE, &seqReadNibble, AJFALSE, AJTRUE},
1391     {"nib",
1392      "0000", "Nibble format",
1393      AJTRUE,  AJFALSE, AJTRUE,  AJFALSE,
1394      AJFALSE, AJFALSE, &seqReadNibble, AJFALSE, AJTRUE},
1395     {"abi",
1396      "1628", "ABI trace file",
1397      AJFALSE, AJTRUE,  AJTRUE,  AJFALSE,
1398      AJFALSE, AJFALSE, &seqReadAbi, AJFALSE, AJTRUE},
1399     {"bam",
1400      "2572", "Binary Sequence Alignment/Map (BAM) format",
1401      AJFALSE, AJTRUE, AJTRUE,  AJFALSE,
1402      AJFALSE, AJTRUE, &seqReadBam, AJFALSE, AJTRUE},
1403     {"ensembl",
1404      "0000", "Ensembl SQL format",
1405      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1406      AJTRUE, AJTRUE, &seqReadEnsembl, AJFALSE, AJFALSE},
1407     {"das",
1408      "1967", "DAS sequence format",
1409      AJFALSE, AJFALSE, AJTRUE,  AJTRUE,
1410      AJTRUE, AJTRUE, &seqReadDAS, AJFALSE, AJFALSE},
1411     {"scf",
1412      "2057", "SCF trace file",
1413      AJFALSE, AJTRUE,  AJTRUE,  AJFALSE,
1414      AJFALSE, AJTRUE,  &seqReadScf, AJFALSE, AJTRUE},
1415     {NULL,
1416      NULL, NULL,
1417      0, 0, 0, 0,
1418      0, 0, NULL, 0, 0}
1419 };
1420 
1421 
1422 
1423 /* ==================================================================== */
1424 /* ========================= constructors ============================= */
1425 /* ==================================================================== */
1426 
1427 
1428 
1429 
1430 /* @section Sequence Input Constructors ***************************************
1431 **
1432 ** All constructors return a new sequence input object by pointer. It
1433 ** is the responsibility of the user to first destroy any previous
1434 ** sequence input object. The target pointer does not need to be
1435 ** initialised to NULL, but it is good programming practice to do so
1436 ** anyway.
1437 **
1438 ******************************************************************************/
1439 
1440 
1441 
1442 
1443 /* @func ajSeqinNew ***********************************************************
1444 **
1445 ** Creates a new sequence input object.
1446 **
1447 ** @return [AjPSeqin] New sequence input object.
1448 ** @category new [AjPSeqin] Default constructor
1449 **
1450 ** @release 1.0.0
1451 ** @@
1452 ******************************************************************************/
1453 
ajSeqinNew(void)1454 AjPSeqin ajSeqinNew(void)
1455 {
1456     AjPSeqin pthis;
1457 
1458     AJNEW0(pthis);
1459 
1460     pthis->Input = ajTextinNewDatatype(AJDATATYPE_SEQUENCE);
1461     pthis->Name  = ajStrNew();
1462     pthis->Acc   = ajStrNew();
1463     pthis->Full  = ajStrNew();
1464     pthis->Date  = ajStrNew();
1465     pthis->Desc  = ajStrNew();
1466     pthis->Doc   = ajStrNew();
1467     pthis->Rev   = ajFalse;
1468     pthis->Begin = 0;
1469     pthis->End   = 0;
1470     pthis->Ufo   = ajStrNew();
1471 
1472     pthis->Inputtype = ajStrNew();
1473     pthis->Entryname = ajStrNew();
1474 
1475     pthis->DbSequence = ajStrNew();
1476 
1477     pthis->Usalist = NULL; /* create only if needed */
1478 
1479     pthis->Features  = ajFalse;
1480     pthis->Upper     = ajFalse;
1481     pthis->Lower     = ajFalse;
1482     pthis->SeqData      = NULL;
1483     pthis->Ftquery   = ajFeattabinNew(); /* empty object */
1484     pthis->Multiset  = ajFalse;
1485 
1486     return pthis;
1487 }
1488 
1489 
1490 
1491 
1492 
1493 /* @func ajSeqinNewQueryC ******************************************************
1494 **
1495 ** Creates a new sequence input object.
1496 **
1497 ** @param [r] qrytxt [const char*] Query string
1498 ** @return [AjPSeqin] New sequence input object.
1499 ** @category new [AjPSeqin] Default constructor
1500 **
1501 ** @release 1.0.0
1502 ** @@
1503 ******************************************************************************/
1504 
ajSeqinNewQueryC(const char * qrytxt)1505 AjPSeqin ajSeqinNewQueryC(const char* qrytxt)
1506 {
1507     AjPSeqin thys = ajSeqinNew();
1508 
1509     ajStrAssignC(&thys->Input->Qry, qrytxt);
1510     return thys;
1511 }
1512 
1513 
1514 
1515 
1516 /* @func ajSeqinNewQueryS ******************************************************
1517 **
1518 ** Creates a new sequence input object.
1519 **
1520 ** @param [r] qry [const AjPStr] Query string
1521 ** @return [AjPSeqin] New sequence input object.
1522 ** @category new [AjPSeqin] Default constructor
1523 **
1524 ** @release 1.0.0
1525 ** @@
1526 ******************************************************************************/
1527 
ajSeqinNewQueryS(const AjPStr qry)1528 AjPSeqin ajSeqinNewQueryS(const AjPStr qry)
1529 {
1530     AjPSeqin thys = ajSeqinNew();
1531 
1532     ajStrAssignS(&thys->Input->Qry, qry);
1533     return thys;
1534 }
1535 
1536 
1537 
1538 
1539 /* ==================================================================== */
1540 /* ========================== destructors ============================= */
1541 /* ==================================================================== */
1542 
1543 
1544 
1545 
1546 /* @section Sequence Input Destructors ****************************************
1547 **
1548 ** Destruction destroys all internal data structures and frees the
1549 ** memory allocated for the sequence input object.
1550 **
1551 ******************************************************************************/
1552 
1553 
1554 
1555 
1556 /* @func ajSeqinDel ***********************************************************
1557 **
1558 ** Deletes a sequence input object.
1559 **
1560 ** @param [d] pthis [AjPSeqin*] Sequence input
1561 ** @return [void]
1562 ** @category delete [AjPSeqin] Default destructor
1563 **
1564 ** @release 1.0.0
1565 ** @@
1566 ******************************************************************************/
1567 
ajSeqinDel(AjPSeqin * pthis)1568 void ajSeqinDel(AjPSeqin* pthis)
1569 {
1570     AjPSeqin thys;
1571     SeqPListUsa node = NULL;
1572 
1573     if(!pthis)
1574         return;
1575 
1576     thys = *pthis;
1577 
1578     if(!thys)
1579         return;
1580 
1581     ajDebug("ajSeqinDel called usa:'%S'\n", thys->Input->Qry);
1582 
1583     ajTextinDel(&thys->Input);
1584 
1585     ajStrDel(&thys->Name);
1586     ajStrDel(&thys->Acc);
1587 
1588     ajStrDel(&thys->Inputtype);
1589 
1590     ajStrDel(&thys->Full);
1591     ajStrDel(&thys->Date);
1592     ajStrDel(&thys->Desc);
1593     ajStrDel(&thys->Doc);
1594 
1595     ajStrDel(&thys->Ufo);
1596     ajStrDel(&thys->Entryname);
1597 
1598     ajStrDel(&thys->DbSequence);
1599 
1600     ajStrDel(&thys->Inseq);
1601 
1602     while(ajListGetLength(thys->Usalist))
1603     {
1604         ajListPop(thys->Usalist, (void**) &node);
1605         ajStrDel(&node->Usa);
1606         ajStrDel(&node->Formatstr);
1607         AJFREE(node);
1608     }
1609 
1610     ajListFree(&thys->Usalist);
1611 
1612     if(thys->Fttable)
1613         ajFeattableDel(&thys->Fttable);
1614 
1615     if(thys->Ftquery)           /* this deletes filebuff stuff above anyway */
1616         ajFeattabinDel(&thys->Ftquery);
1617 
1618     AJFREE(*pthis);
1619 
1620     return;
1621 }
1622 
1623 
1624 
1625 
1626 /* ==================================================================== */
1627 /* =========================== Modifiers ============================== */
1628 /* ==================================================================== */
1629 
1630 
1631 
1632 
1633 /* @section Sequence Input Modifiers ******************************************
1634 **
1635 ** These functions use the contents of a sequence input object and
1636 ** update them.
1637 **
1638 ******************************************************************************/
1639 
1640 
1641 
1642 
1643 /* @func ajSeqinUsa ***********************************************************
1644 **
1645 ** Creates or resets a sequence input object using a new Universal
1646 ** Sequence Address
1647 **
1648 ** @param [u] pthis [AjPSeqin*] Sequence input object.
1649 ** @param [r] Usa [const AjPStr] USA
1650 ** @return [void]
1651 ** @category modify [AjPSeqin] Resets using a new USA
1652 **
1653 ** @release 1.0.0
1654 ** @@
1655 ******************************************************************************/
1656 
ajSeqinUsa(AjPSeqin * pthis,const AjPStr Usa)1657 void ajSeqinUsa(AjPSeqin* pthis, const AjPStr Usa)
1658 {
1659     AjPSeqin thys;
1660 
1661     if(!*pthis)
1662         thys = *pthis = ajSeqinNew();
1663     else
1664     {
1665         thys = *pthis;
1666         ajSeqinClear(thys);
1667     }
1668 
1669     ajStrAssignS(&thys->Input->Qry, Usa);
1670 
1671     return;
1672 }
1673 
1674 
1675 
1676 
1677 /* @func ajSeqinSetNuc ********************************************************
1678 **
1679 ** Sets the type to be forced as nucleic for a sequence input object
1680 **
1681 ** @param [u] seqin [AjPSeqin] Sequence input object to be set.
1682 ** @return [void]
1683 **
1684 ** @release 1.0.0
1685 ** @@
1686 ******************************************************************************/
1687 
ajSeqinSetNuc(AjPSeqin seqin)1688 void ajSeqinSetNuc(AjPSeqin seqin)
1689 {
1690     seqin->IsNuc = ajTrue;
1691 
1692     return;
1693 }
1694 
1695 
1696 
1697 
1698 /* @func ajSeqinSetProt *******************************************************
1699 **
1700 ** Sets the type to be forced as protein for a sequence input object
1701 **
1702 ** @param [u] seqin [AjPSeqin] Sequence input object to be set.
1703 ** @return [void]
1704 **
1705 ** @release 1.0.0
1706 ** @@
1707 ******************************************************************************/
1708 
ajSeqinSetProt(AjPSeqin seqin)1709 void ajSeqinSetProt(AjPSeqin seqin)
1710 {
1711     seqin->IsProt = ajTrue;
1712 
1713     return;
1714 }
1715 
1716 
1717 
1718 
1719 /* @func ajSeqinSetRange ******************************************************
1720 **
1721 ** Sets the start and end positions for a sequence input object
1722 **
1723 ** @param [u] seqin [AjPSeqin] Sequence input object to be set.
1724 ** @param [r] ibegin [ajint] Start position. Negative values are from the end.
1725 ** @param [r] iend [ajint] End position. Negative values are from the end.
1726 ** @return [void]
1727 ** @category modify [AjPSeqin] Sets a sequence range for all input sequences
1728 **
1729 ** @release 1.0.0
1730 ** @@
1731 ******************************************************************************/
1732 
ajSeqinSetRange(AjPSeqin seqin,ajint ibegin,ajint iend)1733 void ajSeqinSetRange(AjPSeqin seqin, ajint ibegin, ajint iend)
1734 {
1735     if(ibegin)
1736         seqin->Begin = ibegin;
1737 
1738     if(iend)
1739         seqin->End = iend;
1740 
1741     return;
1742 }
1743 
1744 
1745 
1746 
1747 /* ==================================================================== */
1748 /* ========================== Assignments ============================= */
1749 /* ==================================================================== */
1750 
1751 
1752 
1753 
1754 /* @section Sequence Input Assignments ****************************************
1755 **
1756 ** These functions overwrite the sequence input object provided as the
1757 ** first argument.
1758 **
1759 ******************************************************************************/
1760 
1761 
1762 
1763 
1764 /* ==================================================================== */
1765 /* ======================== Operators ==================================*/
1766 /* ==================================================================== */
1767 
1768 
1769 
1770 
1771 /* @section Sequence Input Operators ******************************************
1772 **
1773 ** These functions use the contents of a sequence input object but do
1774 ** not make any changes.
1775 **
1776 ******************************************************************************/
1777 
1778 
1779 
1780 
1781 /* @func ajSeqAllRead *********************************************************
1782 **
1783 ** Parse a USA Uniform Sequence Address into format, access, file and entry
1784 **
1785 ** Split at delimiters. Check for the first part as a valid format
1786 ** Check for the remaining first part as a database name or as a file
1787 ** that can be opened.
1788 ** Anything left is an entryname spec.
1789 **
1790 ** Return the results in the AjPSeq object but leave the file open for
1791 ** future calls.
1792 **
1793 ** @param [w] thys [AjPSeq] Sequence returned.
1794 ** @param [u] seqin [AjPSeqin] Sequence input definitions
1795 ** @return [AjBool] ajTrue on success.
1796 ** @category input [AjPSeq] Master sequence stream input, reads first
1797 **           sequence from an open input stream.
1798 **
1799 ** @release 1.0.0
1800 ** @@
1801 ******************************************************************************/
1802 
ajSeqAllRead(AjPSeq thys,AjPSeqin seqin)1803 AjBool ajSeqAllRead(AjPSeq thys, AjPSeqin seqin)
1804 {
1805     AjBool ret       = ajFalse;
1806     AjPStr tmpformat = NULL;
1807     SeqPListUsa node = NULL;
1808     AjBool listdata  = ajFalse;
1809 
1810     if(!seqinFormatIsset)
1811     {
1812         /* we need a copy of the formatlist */
1813         if(ajNamGetValueC("format", &tmpformat))
1814         {
1815             seqSetInFormat(tmpformat);
1816             ajDebug("seqSetInFormat '%S' from EMBOSS_FORMAT\n", tmpformat);
1817         }
1818 
1819         ajStrDel(&tmpformat);
1820         seqinFormatIsset = ajTrue;
1821     }
1822 
1823     if(!seqin->Input->Filebuff)
1824     {
1825         /* First call. No file open yet ... */
1826         if(!seqinUsaProcess(seqin, thys) /* ... so process the USA */
1827            && !ajListGetLength(seqin->Usalist))  /* not list with bad 1st item */
1828             return ajFalse; /* if this fails, we read no sequence at all */
1829 
1830         if(ajListGetLength(seqin->Usalist))
1831             listdata = ajTrue;
1832 
1833         ajTextinClearNewfile(seqin->Input);
1834     }
1835 
1836 
1837     ret = seqRead(thys, seqin); /* read the sequence */
1838 
1839     if(ret)                     /* clone any specified DB or entryname */
1840     {
1841         if (ajStrGetLen(seqin->Input->Db))
1842         {
1843             ajDebug("++ajSeqallRead set db: '%S' => '%S'\n",
1844                     seqin->Input->Db, thys->Db);
1845             ajStrAssignS(&thys->Db, seqin->Input->Db);
1846         }
1847 
1848         if (ajStrGetLen(seqin->Entryname))
1849         {
1850             ajDebug("++ajSeqallRead set entryname: '%S' => '%S'\n",
1851                     seqin->Entryname, thys->Entryname);
1852             ajStrAssignS(&thys->Entryname, seqin->Entryname);
1853         }
1854 
1855         if(!ajStrGetLen(thys->Type)) /* make sure the type is set */
1856             ajSeqType(thys);
1857     }
1858 
1859     while(!ret && ajListGetLength(seqin->Usalist))
1860     {
1861         /* Failed, but we have a list still - keep trying it */
1862 
1863         ajErr("Failed to read sequence '%S'", seqin->Input->Qry);
1864 
1865         ajListPop(seqin->Usalist, (void**) &node);
1866         ajDebug("++try again: pop from list '%S'\n", node->Usa);
1867         ajSeqinUsa(&seqin, node->Usa);
1868         ajDebug("++SAVE (AGAIN) SEQIN '%S' %d..%d(%b) '%S' %d\n",
1869                 seqin->Input->Qry, seqin->Begin, seqin->End, seqin->Rev,
1870                 seqin->Input->Formatstr, seqin->Input->Format);
1871         seqUsaRestore(seqin, node);
1872 
1873         ajStrDel(&node->Usa);
1874         ajStrDel(&node->Formatstr);
1875         AJFREE(node);
1876 
1877         /* must exit if this fails ... for bad list USAs */
1878 
1879         if(!seqinUsaProcess(seqin, thys))
1880             continue;
1881 
1882         ajTextinClearNewfile(seqin->Input);
1883 
1884         ret = seqRead(thys, seqin);
1885     }
1886 
1887     if(!ret)
1888     {
1889         if(listdata)
1890             ajErr("Failed to read sequence '%S'", seqin->Input->Qry);
1891 
1892         return ajFalse;
1893     }
1894 
1895     if (seqin->Usalist)
1896         ajSeqinClearPos(seqin);
1897 
1898     return ret;
1899 }
1900 
1901 
1902 
1903 
1904 /* @func ajSeqallFile *********************************************************
1905 **
1906 ** Parse a USA Uniform Sequence Address
1907 **
1908 ** Return the results in the AjPSeqall object but leave the file open for
1909 ** future calls.
1910 **
1911 ** @param [r] usa [const AjPStr] sequence usa.
1912 ** @return [AjPSeqall] seqall object
1913 **
1914 ** @release 1.13.0
1915 ** @@
1916 ******************************************************************************/
1917 
ajSeqallFile(const AjPStr usa)1918 AjPSeqall ajSeqallFile(const AjPStr usa)
1919 {
1920     AjPSeqall seqall = NULL;
1921     AjPSeqin  seqin  = NULL;
1922     AjPSeq    seq    = NULL;
1923 
1924     seqall = ajSeqallNew();
1925 
1926     seqin = seqall->Seqin;
1927     seqin->Input->Multi  = ajTrue;
1928     seqin->Input->Single = ajFalse;
1929     seq = seqall->Seq;
1930 
1931     ajSeqinUsa(&seqin,usa);
1932 
1933     if(!ajSeqAllRead(seq,seqin))
1934     {
1935         ajSeqallDel(&seqall);
1936 
1937         return NULL;
1938     }
1939 
1940     return seqall;
1941 }
1942 
1943 
1944 
1945 
1946 /* @func ajSeqallNext *********************************************************
1947 **
1948 ** Reads the next sequence into a sequence stream. For the first call this
1949 ** simply returns the sequence already loaded. For later calls a new
1950 ** sequence is read.
1951 **
1952 ** @param [u] seqall [AjPSeqall] Sequence stream
1953 ** @param [w] retseq [AjPSeq*] Sequence
1954 ** @return [AjBool] ajTrue if a sequence was refound. ajFalse when all is done.
1955 ** @category input [AjPSeq] Master sequence stream input, reads next
1956 **                         sequence from an open input stream.
1957 ** @category modify [AjPSeqall] Master sequence stream input,
1958 **                 reads next sequence from an open input stream.
1959 **
1960 ** @release 1.0.0
1961 ** @@
1962 ******************************************************************************/
1963 
ajSeqallNext(AjPSeqall seqall,AjPSeq * retseq)1964 AjBool ajSeqallNext(AjPSeqall seqall, AjPSeq* retseq)
1965 {
1966     if(!seqall->Count)
1967     {
1968         seqall->Count = 1;
1969 
1970         if(seqall->Rev)
1971             ajSeqSetRangeRev(seqall->Seq, seqall->Begin, seqall->End);
1972         else
1973             ajSeqSetRange(seqall->Seq, seqall->Begin, seqall->End);
1974 
1975         /*
1976           seqall->Seq->Begin = seqall->Begin;
1977           seqall->Seq->End   = seqall->End;
1978         */
1979 
1980         seqall->Totseqs++;
1981         seqall->Totlength += ajSeqGetLenTrimmed(seqall->Seq);;
1982 
1983         *retseq = seqall->Seq;
1984         seqall->Returned = ajTrue;
1985 
1986         return ajTrue;
1987     }
1988 
1989 
1990     if(ajSeqRead(seqall->Seq, seqall->Seqin))
1991     {
1992         seqall->Count++;
1993 
1994         if(seqall->Rev)
1995             ajSeqSetRangeRev(seqall->Seq, seqall->Begin, seqall->End);
1996         else
1997             ajSeqSetRange(seqall->Seq, seqall->Begin, seqall->End);
1998 
1999         seqall->Totseqs++;
2000         seqall->Totlength += ajSeqGetLenTrimmed(seqall->Seq);;
2001 
2002         *retseq = seqall->Seq;
2003         seqall->Returned = ajTrue;
2004 
2005         ajDebug("ajSeqallNext success\n");
2006 
2007         return ajTrue;
2008     }
2009 
2010     *retseq = NULL;
2011     ajDebug("ajSeqallNext failed\n");
2012     ajSeqallClear(seqall);
2013 
2014     return ajFalse;
2015 }
2016 
2017 
2018 
2019 
2020 /* @func ajSeqinClearPos ******************************************************
2021 **
2022 ** Clears a Sequence input object position information as possibly read from
2023 ** a USA that included the begin, end and direction
2024 **
2025 ** @param [u] thys [AjPSeqin] Sequence input
2026 ** @return [void]
2027 **
2028 ** @release 2.9.0
2029 ** @@
2030 ******************************************************************************/
2031 
ajSeqinClearPos(AjPSeqin thys)2032 void ajSeqinClearPos(AjPSeqin thys)
2033 {
2034     thys->Rev    = ajFalse;
2035     thys->Begin = 0;
2036     thys->End = 0;
2037 
2038     return;
2039 }
2040 
2041 
2042 
2043 
2044 /* @func ajSeqinClear *********************************************************
2045 **
2046 ** Clears a Sequence input object back to "as new" condition, except
2047 ** for the USA list and the features setting which must be preserved.
2048 **
2049 ** @param [w] thys [AjPSeqin] Sequence input
2050 ** @return [void]
2051 ** @category modify [AjPSeqin] Resets ready for reuse.
2052 **
2053 ** @release 1.0.0
2054 ** @@
2055 ******************************************************************************/
2056 
ajSeqinClear(AjPSeqin thys)2057 void ajSeqinClear(AjPSeqin thys)
2058 {
2059     ajDebug("ajSeqinClear called\n");
2060 
2061     if(!thys)
2062         return;
2063 
2064     ajTextinClear(thys->Input);
2065 
2066     ajStrSetClear(&thys->Name);
2067     ajStrSetClear(&thys->Acc);
2068     /* preserve thys->Inputtype */
2069     ajStrSetClear(&thys->Full);
2070     ajStrSetClear(&thys->Date);
2071     ajStrSetClear(&thys->Desc);
2072     ajStrSetClear(&thys->Doc);
2073     /* preserve thys->List */
2074     ajStrSetClear(&thys->Ufo);
2075     ajStrSetClear(&thys->Entryname);
2076 
2077     ajStrSetClear(&thys->DbSequence);
2078 
2079     ajStrSetClear(&thys->Inseq);
2080 
2081     /* preserve thys->Usalist */
2082 
2083     /* preserve thys->Query */
2084 
2085     if(thys->Fttable)
2086     {
2087         ajFeattableDel(&thys->Fttable);
2088     }
2089 
2090     if(thys->Ftquery)           /* this clears filebuff stuff above anyway */
2091         ajFeattabinClear(thys->Ftquery);
2092 
2093     thys->SeqData = NULL;
2094 
2095     thys->Rev    = ajFalse;
2096 
2097     /* keep thys->Features */
2098     /* thys->Features = ajFalse;*/
2099 
2100     thys->Begin = 0;
2101     thys->End = 0;
2102 
2103     return;
2104 }
2105 
2106 
2107 
2108 
2109 /* ==================================================================== */
2110 /* ============================ Casts ==================================*/
2111 /* ==================================================================== */
2112 
2113 
2114 
2115 
2116 /* @section Sequence Input Casts **********************************************
2117 **
2118 ** These functions examine the contents of a sequence input object and
2119 ** return some derived information. Some of them provide access to the
2120 ** internal components of a sequence input object. They are provided
2121 ** for programming convenience but should be used with caution.
2122 **
2123 ******************************************************************************/
2124 
2125 
2126 
2127 
2128 /* ==================================================================== */
2129 /* ========================== Assignments ============================= */
2130 /* ==================================================================== */
2131 
2132 
2133 
2134 
2135 /* @section Sequence inputs **********************************************
2136 **
2137 ** These functions read the sequence provided by the first argument
2138 **
2139 ******************************************************************************/
2140 
2141 
2142 
2143 
/* @func ajSeqRead ************************************************************
**
** Reads one sequence using a sequence input object.
**
** Three starting situations are handled:
**
** (a) An input file buffer is already open: reading simply continues.
** (b) No buffer is open but the input holds a list of USAs: the next
**     list entry is popped and processed.
** (c) Otherwise the USA in the input object is processed by
**     seqinUsaProcess to convert it into an open file stream.
**
** Uses seqRead for the actual file reading. If that fails and USA list
** entries remain, each one is tried in turn until a sequence is read or
** the list is exhausted.
**
** On success seqDefine is called for the sequence before returning.
**
** Returns the results in the AjPSeq object.
**
** @param [w] thys [AjPSeq] Sequence returned.
** @param [u] seqin [AjPSeqin] Sequence input definitions
** @return [AjBool] ajTrue on success.
** @category input [AjPSeq] Master sequence input, calls specific functions
**                  for file access type and sequence format.
**
** @release 1.0.0
** @@
******************************************************************************/

AjBool ajSeqRead(AjPSeq thys, AjPSeqin seqin)
{
    AjPStr tmpformat = NULL;
    AjBool ret       = ajFalse;
    SeqPListUsa node = NULL;
    AjBool listdata  = ajFalse;    /* true once a USA list is involved */

    /* one-time setup, guarded by the file-level seqinFormatIsset flag */
    if(!seqinFormatIsset)
    {
        /* we need a copy of the formatlist */
        if(ajNamGetValueC("format", &tmpformat))
        {
            seqSetInFormat(tmpformat);
            ajDebug("seqSetInFormat '%S' from EMBOSS_FORMAT\n", tmpformat);
        }

        ajStrDel(&tmpformat);
        seqinFormatIsset = ajTrue;
    }

    if(seqin->Input->Filebuff)
    {
        /* (a) if file still open, keep reading */
        ajDebug("ajSeqRead: input file '%F' still there, try again\n",
                seqin->Input->Filebuff->File);
        ret = seqRead(thys, seqin);
        ajDebug("ajSeqRead: open buffer  usa: '%S' returns: %B\n",
                seqin->Input->Qry, ret);
    }
    else
    {
        /* (b) if we have a list, try the next USA in the list */
        if(ajListGetLength(seqin->Usalist))
        {
            listdata = ajTrue;
            ajListPop(seqin->Usalist, (void**) &node);

            ajDebug("++pop from list '%S'\n", node->Usa);
            /* seqin is non-NULL here, so this resets it and stores the USA */
            ajSeqinUsa(&seqin, node->Usa);
            ajDebug("++SAVE SEQIN '%S' %d..%d(%b) '%S' %d\n",
                    seqin->Input->Qry, seqin->Begin, seqin->End, seqin->Rev,
                    seqin->Input->Formatstr, seqin->Input->Format);
            /* NOTE(review): presumably copies the settings saved in the */
            /* list node back into seqin - confirm against seqUsaRestore */
            seqUsaRestore(seqin, node);

            /* the popped node is owned here: free its strings, then it */
            ajStrDel(&node->Usa);
            ajStrDel(&node->Formatstr);
            AJFREE(node);

            ajDebug("ajSeqRead: open list, try '%S'\n", seqin->Input->Qry);

            /* give up only if this USA is bad and no list entries remain */
            if(!seqinUsaProcess(seqin, thys) &&
               !ajListGetLength(seqin->Usalist))
                return ajFalse;

            ajTextinClearNewfile(seqin->Input);

            ret = seqRead(thys, seqin);
            ajDebug("ajSeqRead: list usa: '%S' returns: %B\n",
                    seqin->Input->Qry, ret);
        }
        else
        {
            ajDebug("ajSeqRead: no file yet - test USA '%S'\n",
                    seqin->Input->Qry);

            /* (c) Must be a USA - decode it */
            if(!seqinUsaProcess(seqin, thys) &&
               !ajListGetLength(seqin->Usalist))
                return ajFalse;

            if(ajListGetLength(seqin->Usalist)) /* could be a new list */
                listdata = ajTrue;

            ajTextinClearNewfile(seqin->Input);

            ret = seqRead(thys, seqin);
            ajDebug("ajSeqRead: new usa: '%S' returns: %B\n",
                    seqin->Input->Qry, ret);
        }
    }

    /* Nothing read yet: work through any remaining USA list entries */

    while(!ret && ajListGetLength(seqin->Usalist))
    {
        /* Failed, but we have a list still - keep trying it */
        /* the failed query is reported only if it was itself list data */
        if(listdata)
            ajErr("Failed to read sequence '%S'", seqin->Input->Qry);

        listdata = ajTrue;
        ajListPop(seqin->Usalist,(void**) &node);
        ajDebug("++try again: pop from list '%S'\n", node->Usa);
        ajSeqinUsa(&seqin, node->Usa);
        ajDebug("++SAVE (AGAIN) SEQIN '%S' %d..%d(%b) '%S' %d\n",
                seqin->Input->Qry, seqin->Begin, seqin->End, seqin->Rev,
                seqin->Input->Formatstr, seqin->Input->Format);
        seqUsaRestore(seqin, node);

        ajStrDel(&node->Usa);
        ajStrDel(&node->Formatstr);
        AJFREE(node);

        /* a USA that cannot be processed is skipped: try the next entry */
        if(!seqinUsaProcess(seqin, thys))
            continue;

        ajTextinClearNewfile(seqin->Input);

        ret = seqRead(thys, seqin);
        ajDebug("ajSeqRead: list retry usa: '%S' returns: %B\n",
                seqin->Input->Qry, ret);
    }

    if(!ret)
    {
        /* report the last failed query only when a list was involved */
        if(listdata)
            ajErr("Failed to read sequence '%S'", seqin->Input->Qry);

        return ajFalse;
    }


    /* success, whether from the first attempt or from a list retry */
    seqDefine(thys, seqin);

    return ajTrue;
}
2288 
2289 
2290 
2291 
2292 /* ==================================================================== */
2293 /* ========================== Assignments ============================= */
2294 /* ==================================================================== */
2295 
2296 
2297 
2298 
2299 /* @section Sequence Set Inputs ******************************************
2300 **
2301 ** These functions read the sequence set object provided as the
2302 ** first argument.
2303 **
2304 ******************************************************************************/
2305 
2306 
2307 
2308 
2309 /* @func ajSeqsetRead *********************************************************
2310 **
2311 ** Parse a USA Uniform Sequence Address into format, access, file and entry
2312 **
2313 ** Split at delimiters. Check for the first part as a valid format
2314 ** Check for the remaining first part as a database name or as a file
2315 ** that can be opened.
2316 ** Anything left is an entryname spec.
2317 **
2318 ** Read all the sequences until done
2319 **
2320 ** Return the results in the AjPSeqset object.
2321 **
2322 ** @param [w] thys [AjPSeqset] Sequence set returned.
2323 ** @param [u] seqin [AjPSeqin] Sequence input definitions
2324 ** @return [AjBool] ajTrue on success.
2325 ** @category input [AjPSeqset] Master input routine for a sequence
2326 **                set
2327 **
2328 ** @release 1.0.0
2329 ** @@
2330 ******************************************************************************/
2331 
ajSeqsetRead(AjPSeqset thys,AjPSeqin seqin)2332 AjBool ajSeqsetRead(AjPSeqset thys, AjPSeqin seqin)
2333 {
2334     AjPSeq seq;
2335     AjPList setlist;
2336 
2337     ajuint iseq = 0;
2338 
2339     seq = ajSeqNew();
2340 
2341     ajDebug("ajSeqsetRead\n");
2342 
2343     if(!seqinUsaProcess(seqin, seq))
2344         return ajFalse;
2345 
2346     ajTextinClearNewfile(seqin->Input);
2347 
2348     ajStrAssignS(&thys->Usa, seqin->Input->Qry);
2349     ajStrAssignS(&thys->Ufo, seqin->Ufo);
2350     thys->Begin = seqin->Begin;
2351     thys->End = seqin->End;
2352 
2353     setlist = ajListNew();
2354 
2355     ajDebug("ready to start reading format '%S' '%S' %d..%d\n",
2356             seqin->Input->Formatstr, seq->Formatstr, seqin->Begin, seqin->End);
2357 
2358     while(!seqin->Multidone && ajSeqRead(seq, seqin))
2359     {
2360         if (seqin->Usalist)
2361             ajSeqinClearPos(seqin);
2362         /*ajDebug("read name '%S' length %d format '%S' '%S' seqindata: %x\n",
2363           seq->Entryname, ajSeqGetLen(seq),
2364           seqin->Input->Formatstr, seq->Formatstr, seqin->SeqData);*/
2365         ajStrAssignEmptyS(&seq->Db, seqin->Input->Db);
2366 
2367         if(!ajStrGetLen(seq->Type))
2368             ajSeqType(seq);
2369 
2370         if(thys->Rev)
2371             ajSeqSetRangeRev(seq, thys->Begin, thys->End);
2372         else
2373             ajSeqSetRange(seq, thys->Begin, thys->End);
2374 
2375         ajDebug ("ajSeqsetRead read sequence %d %x '%S' %d..%d (%d) "
2376                  "Rev:%B Reversed:%B\n",
2377                  iseq, seq, ajSeqGetNameS(seq),
2378                  seq->Begin, seq->End, ajSeqGetLen(seq),
2379                  seq->Rev, seq->Reversed);
2380 
2381         /*ajSeqTrace(seq);*/
2382         iseq++;
2383 
2384         ajListPushAppend(setlist, seq);
2385 
2386         /*ajDebug("appended to list\n");*/
2387 
2388         /* add to a list of sequences */
2389 
2390         seq = ajSeqNew();
2391         seqinFormatSet(seqin, seq);
2392     }
2393 
2394     ajSeqDel(&seq);
2395 
2396     if(!iseq)
2397         return ajFalse;
2398 
2399     /* convert the list of sequences into a seqset structure */
2400 
2401     ajSeqsetFromList(thys, setlist);
2402 
2403     ajListFree(&setlist);
2404 
2405     ajDebug("ajSeqsetRead total %d sequences\n", iseq);
2406 
2407     return ajTrue;
2408 }
2409 
2410 
2411 
2412 
2413 /* @func ajSeqsetallRead ******************************************************
2414 **
2415 ** Parse a USA Uniform Sequence Address into format, access, file and entry
2416 **
2417 ** Split at delimiters. Check for the first part as a valid format
2418 ** Check for the remaining first part as a database name or as a file
2419 ** that can be opened.
2420 ** Anything left is an entryname spec.
2421 **
2422 ** Read all the sequences into sequence sets until done
2423 **
2424 ** Start a new set for each multiple sequence input
2425 **
2426 ** Return the results in the AjPList object with AjPSeqset nodes
2427 **
2428 ** @param [w] thys [AjPList] List of sequence sets returned.
2429 ** @param [u] seqin [AjPSeqin] Sequence input definitions
2430 ** @return [AjBool] ajTrue on success.
2431 **
2432 ** @release 2.8.0
2433 ** @@
2434 ******************************************************************************/
2435 
ajSeqsetallRead(AjPList thys,AjPSeqin seqin)2436 AjBool ajSeqsetallRead(AjPList thys, AjPSeqin seqin)
2437 {
2438     AjPSeq seq = NULL;
2439     AjPList setlist = NULL;
2440     AjPSeqset seqset = NULL;
2441 
2442     ajuint iseq = 0;
2443 
2444     seq = ajSeqNew();
2445 
2446     ajDebug("ajSeqsetallRead\n");
2447 
2448     if(!seqinUsaProcess(seqin, seq))
2449         return ajFalse;
2450 
2451     ajTextinClearNewfile(seqin->Input);
2452 
2453     ajDebug("ready to start reading format '%S' '%S' %d..%d\n",
2454             seqin->Input->Formatstr, seq->Formatstr, seqin->Begin, seqin->End);
2455 
2456     while(ajSeqRead(seq, seqin))
2457     {
2458         ajDebug("read name '%S' length %d format '%S' '%S' "
2459                 "seqindata: %x multidone: %B\n",
2460                 seq->Entryname, ajSeqGetLen(seq),
2461                 seqin->Input->Formatstr, seq->Formatstr,
2462                 seqin->SeqData, seqin->Multidone);
2463         ajStrAssignEmptyS(&seq->Db, seqin->Input->Db);
2464 
2465         if(!ajStrGetLen(seq->Type))
2466             ajSeqType(seq);
2467 
2468         /*ajDebug ("ajSeqsetallRead read sequence %d '%s' %d..%d\n",
2469           iseq, ajSeqGetNameC(seq), seq->Begin, seq->End);*/
2470         /*ajSeqTrace(seq);*/
2471         iseq++;
2472 
2473         if(!setlist)
2474             setlist = ajListNew();
2475 
2476         ajListPushAppend(setlist, seq);
2477 
2478         /*ajDebug("appended to list\n");*/
2479 
2480         /* add to a list of sequences */
2481 
2482         seq = ajSeqNew();
2483         seqinFormatSet(seqin, seq);
2484 
2485         if(seqin->Multidone)
2486         {
2487             seqset = ajSeqsetNew();
2488             ajStrAssignS(&seqset->Usa, seqin->Input->Qry);
2489             ajStrAssignS(&seqset->Ufo, seqin->Ufo);
2490             seqset->Begin = seqin->Begin;
2491             seqset->End = seqin->End;
2492 
2493             ajSeqsetFromList(seqset, setlist);
2494             ajListFree(&setlist);
2495             ajListPushAppend(thys, seqset);
2496             ajDebug("ajSeqsetallRead multidone save set %Lu of %u sequences\n",
2497                     ajListGetLength(thys), ajSeqsetGetSize(seqset));
2498             seqset = NULL;
2499         }
2500     }
2501 
2502     ajSeqDel(&seq);
2503 
2504     if(!iseq)
2505         return ajFalse;
2506 
2507     /* convert the list of sequences into a seqset structure */
2508 
2509     if(ajListGetLength(setlist))
2510     {
2511         seqset = ajSeqsetNew();
2512         ajStrAssignS(&seqset->Usa, seqin->Input->Qry);
2513         ajStrAssignS(&seqset->Ufo, seqin->Ufo);
2514         seqset->Begin = seqin->Begin;
2515         seqset->End = seqin->End;
2516 
2517         ajSeqsetFromList(seqset, setlist);
2518         ajListFree(&setlist);
2519         ajListPushAppend(thys, seqset);
2520         seqset = NULL;
2521     }
2522 
2523     ajDebug("ajSeqsetallRead total %Lu sets of %d sequences\n",
2524             ajListGetLength(thys), iseq);
2525 
2526     return ajTrue;
2527 }
2528 
2529 
2530 
2531 
2532 /* @func ajSeqsetFromList *****************************************************
2533 **
2534 ** Builds a sequence set from a list of sequences
2535 **
2536 ** @param [w] thys [AjPSeqset] Sequence set
2537 ** @param [r] list [const AjPList] List of sequence objects
2538 ** @return [ajint] Number of sequences in the set.
2539 **
2540 ** @release 2.1.0
2541 ******************************************************************************/
2542 
ajSeqsetFromList(AjPSeqset thys,const AjPList list)2543 ajint ajSeqsetFromList(AjPSeqset thys, const AjPList list)
2544 {
2545 
2546     ajuint i;
2547     AjIList iter;
2548     AjPSeq seq;
2549 
2550     ajDebug("ajSeqsetFromList length: %Lu\n", ajListGetLength(list));
2551 
2552     /*ajListTrace(list);*/
2553 
2554     thys->Size      = (ajuint) ajListGetLength(list);
2555     thys->Seq       = AJCALLOC0(thys->Size, sizeof(AjPSeq));
2556     thys->Seqweight = AJCALLOC0(thys->Size, sizeof(float));
2557 
2558     i = 0;
2559     iter = ajListIterNewread(list);
2560     ajListIterTrace(iter);
2561 
2562     while((seq = (AjPSeq) ajListIterGet(iter)))
2563     {
2564         if(!i)
2565         {
2566             thys->EType = seq->EType;
2567             ajStrAssignS(&thys->Type, seq->Type);
2568             thys->Format = seq->Format;
2569             ajStrAssignS(&thys->Formatstr, seq->Formatstr);
2570             ajStrAssignS(&thys->Filename, seq->Filename);
2571             ajStrAssignS(&thys->Full, seq->Full);
2572         }
2573 
2574         thys->Seqweight[i] = seq->Weight;
2575         thys->Seq[i] = seq;
2576         thys->Totweight += seq->Weight;
2577 
2578         if(ajSeqGetLen(seq) > thys->Len)
2579             thys->Len = ajSeqGetLen(seq);
2580 
2581         /*      ajDebug("seq %d '%x'\n", i, seq);*/
2582         ajDebug("seq '%x' len: %d weight: %.3f\n",
2583                 seq->Name, ajSeqGetLen(seq), thys->Seq[i]->Weight);
2584         i++;
2585     }
2586     ajListIterDel(&iter);
2587 
2588     return thys->Size;
2589 }
2590 
2591 
2592 
2593 
2594 /* @func ajSeqsetFromPair *****************************************************
2595 **
2596 ** Builds a sequence set from a pair of sequences
2597 **
2598 ** @param [w] thys [AjPSeqset] Sequence set
2599 ** @param [r] seqa [const AjPSeq] Sequence 1
2600 ** @param [r] seqb [const AjPSeq] Sequence 2
2601 ** @return [ajint] Number of sequences in the set.
2602 **
2603 ** @release 2.1.0
2604 ******************************************************************************/
2605 
ajSeqsetFromPair(AjPSeqset thys,const AjPSeq seqa,const AjPSeq seqb)2606 ajint ajSeqsetFromPair(AjPSeqset thys, const AjPSeq seqa, const AjPSeq seqb)
2607 {
2608 
2609     ajSeqsetApp(thys, seqa);
2610     ajSeqsetApp(thys, seqb);
2611 
2612     return thys->Size;
2613 }
2614 
2615 
2616 
2617 
2618 /* @func ajSeqsetApp **********************************************************
2619 **
2620 ** Adds a sequence to a sequence set
2621 **
2622 ** @param [w] thys [AjPSeqset] Sequence set
2623 ** @param [r] seq [const AjPSeq] Sequence
2624 ** @return [ajint] Number of sequences in the set.
2625 **
2626 ** @release 2.1.0
2627 ******************************************************************************/
2628 
ajSeqsetApp(AjPSeqset thys,const AjPSeq seq)2629 ajint ajSeqsetApp(AjPSeqset thys, const AjPSeq seq)
2630 {
2631     ajuint iseq;
2632 
2633     iseq = thys->Size;
2634 
2635     ajDebug("ajSeqsetApp '%S' size %d len %d add '%S' len %d\n",
2636             thys->Full, thys->Size, thys->Len,
2637             seq->Full, ajSeqGetLen(seq));
2638 
2639     thys->Size ++;
2640     AJCRESIZE(thys->Seq, thys->Size);
2641     AJCRESIZE(thys->Seqweight, thys->Size);
2642 
2643     if(!iseq)
2644     {
2645         thys->EType = seq->EType;
2646         ajStrAssignEmptyS(&thys->Type, seq->Type);
2647         thys->Format = seq->Format;
2648         ajStrAssignEmptyS(&thys->Formatstr, seq->Formatstr);
2649         ajStrAssignEmptyS(&thys->Filename, seq->Filename);
2650         ajStrAssignEmptyS(&thys->Full, seq->Full);
2651     }
2652 
2653     thys->Seqweight[iseq] = seq->Weight;
2654     thys->Seq[iseq] = ajSeqNewSeq(seq);
2655     thys->Totweight += seq->Weight;
2656 
2657     if(ajSeqGetLen(seq) > thys->Len)
2658         thys->Len = ajSeqGetLen(seq);
2659 
2660     ajDebug("result '%S' size %d len\n",
2661             thys->Full, thys->Size, thys->Len);
2662 
2663     return thys->Size;
2664 }
2665 
2666 
2667 
2668 
2669 /* @funcstatic seqReadFmt *****************************************************
2670 **
2671 ** Tests whether a sequence can be read using the specified format.
2672 ** Then tests whether the sequence matches sequence query criteria
2673 ** and checks any specified type. Applies upper and lower case.
2674 **
2675 ** @param [w] thys [AjPSeq] Sequence object
2676 ** @param [u] seqin [AjPSeqin] Sequence input object
2677 ** @param [r] format [ajuint] input format code
2678 ** @return [ajuint] 0 if successful.
2679 **                  1 if the query match failed.
2680 **                  2 if the sequence type failed
2681 **                  3 if it failed to read a sequence
2682 **
2683 ** @release 1.0.0
2684 ** @@
2685 ** This is the only function that calls the appropriate Read function
2686 ** seqReadXxxxxx where Xxxxxxx is the supported sequence format.
2687 **
2688 ** Some of the seqReadXxxxxx functions fail to reset the buffer correctly,
2689 ** which is a very serious problem when cycling through all of them to
2690 ** identify an unknown format. The extra ajFileBuffReset call at the end is
2691 ** intended to address this problem. The individual functions should still
2692 ** reset the buffer in case they are called from elsewhere.
2693 **
2694 ******************************************************************************/
2695 
seqReadFmt(AjPSeq thys,AjPSeqin seqin,ajuint format)2696 static ajuint seqReadFmt(AjPSeq thys, AjPSeqin seqin,
2697                          ajuint format)
2698 {
2699     ajDebug("++seqReadFmt format %d (%s) '%S' feat %B\n",
2700             format, seqinFormatDef[format].Name,
2701             seqin->Input->Qry, seqin->Features);
2702 
2703     ajTextinClearNewinput(seqin->Input);
2704 
2705     /* Calling funclist seqinFormatDef() */
2706     if((*seqinFormatDef[format].Read)(thys, seqin))
2707     {
2708         ajDebug("seqReadFmt success with format %d (%s)\n",
2709                 format, seqinFormatDef[format].Name);
2710         ajDebug("id: '%S' len: %d\n",
2711                 thys->Name, ajStrGetLen(thys->Seq));
2712         seqin->Input->Format = format;
2713         ajStrAssignC(&seqin->Input->Formatstr, seqinFormatDef[format].Name);
2714         ajStrAssignC(&thys->Formatstr, seqinFormatDef[format].Name);
2715         ajStrAssignEmptyS(&thys->Db, seqin->Input->Db);
2716         ajStrAssignS(&thys->Entryname, seqin->Entryname);
2717         ajStrAssignS(&thys->Filename, seqin->Input->Filename);
2718 
2719         if(seqQueryMatch(seqin->Input->Query, thys))
2720         {
2721             ajStrAssignEmptyS(&thys->Entryname, thys->Name);
2722 
2723             ajDebug("seqQueryMatch Features:%B FtTable: %x (%u)\n",
2724                     seqin->Features, thys->Fttable,
2725                     ajFeattableGetSize(thys->Fttable));
2726 
2727             if(seqin->Features && !thys->Fttable)
2728             {
2729                 ajStrAssignEmptyS(&seqin->Ftquery->Seqname, thys->Entryname);
2730                 seqin->Fttable = ajFeattableNewReadUfo(seqin->Ftquery,
2731                                                        seqin->Ufo);
2732                 if (!seqin->Fttable)
2733                 {
2734                     ajDebug("seqReadFmt features input failed UFO: '%S'\n",
2735                             seqin->Ufo);
2736                     /*
2737                     **  GWW 21 Aug 2000 - don't warn about missing feature
2738                     **  tables
2739                     **/
2740                 }
2741                 else
2742                 {
2743                     ajFeattableSetLength(seqin->Fttable,
2744                                          ajStrGetLen(thys->Seq));
2745                     ajFeattableDel(&thys->Fttable);
2746                     /* ajFeattableTrace(seqin->Fttable); */
2747                     thys->Fttable = seqin->Fttable;
2748                     seqin->Fttable = NULL;
2749                 }
2750             }
2751 
2752             if (!ajStrGetLen(thys->Seq))      /* empty sequence string! */
2753                 return FMT_EMPTY;
2754 
2755             if(ajSeqTypeCheckIn(thys, seqin))
2756             {
2757                 if (!ajStrGetLen(thys->Seq))  /* removed all remaining chars */
2758                     return FMT_EMPTY;
2759 
2760                 /* ajSeqinTrace(seqin); */
2761                 if(seqin->Upper)
2762                     ajSeqFmtUpper(thys);
2763 
2764                 if(seqin->Lower)
2765                     ajSeqFmtLower(thys);
2766 
2767                 if(seqin->Begin)
2768                     thys->Begin = seqin->Begin;
2769 
2770                 if(seqin->End)
2771                     thys->End = seqin->End;
2772 
2773                 if(seqin->Rev)
2774                     thys->Rev = seqin->Rev;
2775 
2776                 return FMT_OK;
2777             }
2778             else
2779                 return FMT_BADTYPE;
2780         }
2781 
2782         ajDebug("query match failed, continuing ...\n");
2783         ajSeqClear(thys);
2784 
2785         if(seqinFormatDef[format].Binary)
2786             return FMT_FAIL; /* do not reread - will read whole file again */
2787         else
2788             return FMT_NOMATCH;
2789     }
2790     else
2791     {
2792         ajDebug("Testing input buffer: IsBuff: %B Eof: %B\n",
2793                 ajFilebuffIsBuffered(seqin->Input->Filebuff),
2794                 ajFilebuffIsEof(seqin->Input->Filebuff));
2795 
2796         if (!ajFilebuffIsBuffered(seqin->Input->Filebuff) &&
2797             ajFilebuffIsEof(seqin->Input->Filebuff))
2798             return FMT_EOF;
2799 
2800         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
2801         ajDebug("Format %d (%s) failed, file buffer reset by seqReadFmt\n",
2802                 format, seqinFormatDef[format].Name);
2803         /* ajFilebuffTraceFull(seqin->Input->Filebuff, 10, 10);*/
2804     }
2805 
2806     ajDebug("++seqReadFmt failed - nothing read\n");
2807 
2808     return FMT_FAIL;
2809 }
2810 
2811 
2812 
2813 
2814 /* @funcstatic seqRead ********************************************************
2815 **
2816 ** Given data in a seqin structure, tries to read everything needed
2817 ** using the specified format or by trial and error.
2818 **
2819 ** @param [w] thys [AjPSeq] Sequence object
2820 ** @param [u] seqin [AjPSeqin] Sequence input object
2821 ** @return [AjBool] ajTrue on success
2822 **
2823 ** @release 1.0.0
2824 ** @@
2825 ******************************************************************************/
2826 
seqRead(AjPSeq thys,AjPSeqin seqin)2827 static AjBool seqRead(AjPSeq thys, AjPSeqin seqin)
2828 {
2829     ajuint i;
2830     ajuint istat = 0;
2831     ajuint jstat = 0;
2832 
2833     AjPTextin textin = seqin->Input;
2834     AjPFilebuff buff = textin->Filebuff;
2835     AjPQuery qry = textin->Query;
2836     AjBool regfile = ajFalse;
2837     AjBool ok;
2838     AjPTextAccess textaccess = NULL;
2839     AjPSeqAccess  seqaccess  = NULL;
2840 
2841     ajSeqClear(thys);
2842     ajDebug("seqRead: cleared Single:%B Count:%u SeqData:%p "
2843             "TextData:%p\n",
2844             textin->Single, ajTextinGetCount(textin),
2845             seqin->SeqData, textin->TextData);
2846 
2847     if(textin->Single && ajTextinGetCount(textin))
2848     {
2849         /*
2850         ** One sequence at a time is read.
2851         ** The first sequence was read by ACD
2852         ** for the following ones we need to reset the AjPSeqin
2853         **
2854         ** Single is set by the access method
2855         */
2856 
2857         ajDebug("seqRead: single access - count %u - lines %u (total %u) "
2858                 "call access routine again\n",
2859                 ajTextinGetCount(textin),
2860                 ajTextinGetRecords(textin), ajTextinGetTotrecords(textin));
2861         /* Calling funclist seqAccess() */
2862         textaccess = qry->TextAccess;
2863         seqaccess = qry->Access;
2864         if(textaccess && !(*textaccess->Access)(textin))
2865         {
2866             ajDebug("seqRead: (*textaccess->Access)(seqin->Input) "
2867                     "*failed*\n");
2868 
2869             return ajFalse;
2870         }
2871         if(seqaccess && !(*seqaccess->Access)(seqin))
2872         {
2873             ajDebug("seqRead: (*seqaccess->Access)(seqin) "
2874                     "*failed*\n");
2875 
2876             return ajFalse;
2877         }
2878         buff = textin->Filebuff;
2879     }
2880 
2881     ajDebug("seqRead: seqin format %d '%S'\n", textin->Format,
2882             textin->Formatstr);
2883 
2884     textin->Count++;
2885 
2886     if(!textin->Filebuff)
2887         return ajFalse;
2888 
2889     ok = ajFilebuffIsBuffered(textin->Filebuff);
2890 
2891     if(!seqinFormatDef[textin->Format].Binary)
2892     {
2893         while(ok)
2894         {                               /* skip blank lines */
2895             ok = ajBuffreadLine(textin->Filebuff, &seqReadLine);
2896 
2897             if(ok && !ajStrIsWhite(seqReadLine))
2898             {
2899                 ajFilebuffClear(textin->Filebuff,1);
2900                 break;
2901             }
2902         }
2903     }
2904 
2905     if(!textin->Format)
2906     {                      /* no format specified, try all defaults */
2907 
2908         regfile = ajFileIsFile(ajFilebuffGetFile(textin->Filebuff));
2909 
2910         for(i = 1; seqinFormatDef[i].Name; i++)
2911         {
2912             if(!seqinFormatDef[i].Try)  /* skip if Try is ajFalse */
2913                 continue;
2914 
2915             if(seqinFormatDef[i].Binary && !regfile)
2916             {
2917                 ajDebug("seqRead: binary stdin skip format %d (%s)\n",
2918                         i, seqinFormatDef[i].Name);
2919                 continue;
2920             }
2921 
2922             ajDebug("seqRead:try format %d (%s) records: %u (total %u) "
2923                     "seqdata: %p\n",
2924                     i, seqinFormatDef[i].Name,
2925                     ajTextinGetRecords(textin),
2926                     ajTextinGetTotrecords(textin),
2927                     seqin->SeqData);
2928 
2929             ajTextinClearNewinput(seqin->Input);
2930 
2931             istat = seqReadFmt(thys, seqin, i);
2932 
2933             switch(istat)
2934             {
2935                 case FMT_OK:
2936                     ajDebug("++seqRead OK (1), set format %d\n",
2937                             textin->Format);
2938                     seqDefine(thys, seqin);
2939 
2940                     return ajTrue;
2941                 case FMT_BADTYPE:
2942                     ajDebug("seqRead: (a1) seqReadFmt stat == BADTYPE *failed*\n");
2943 
2944                     return ajFalse;
2945                 case FMT_FAIL:
2946                     ajDebug("seqRead: (b1) seqReadFmt stat == FAIL *failed*\n");
2947                     break;                  /* we can try next format */
2948                 case FMT_NOMATCH:
2949                     ajDebug("seqRead: (c1) seqReadFmt stat==NOMATCH try again\n");
2950                     break;
2951                 case FMT_EOF:
2952                     ajDebug("seqRead: (d1) seqReadFmt stat == EOF *failed*\n");
2953                     return ajFalse;                 /* EOF and unbuffered */
2954                 case FMT_EMPTY:
2955                     ajWarn("Sequence '%S' has zero length, ignored",
2956                            ajSeqGetUsaS(thys));
2957                     ajDebug("seqRead: (e1) seqReadFmt stat==EMPTY try again\n");
2958                     break;
2959                 default:
2960                     ajDebug("unknown code %d from seqReadFmt\n", stat);
2961             }
2962 
2963             ajSeqClear(thys);
2964 
2965             if(textin->Format)
2966                 break;                  /* we read something */
2967 
2968             ajFilebuffTrace(textin->Filebuff);
2969         }
2970 
2971         if(!textin->Format)
2972         {                    /* all default formats failed, give up */
2973             ajDebug("seqRead:all default formats failed, give up\n");
2974 
2975             return ajFalse;
2976         }
2977 
2978         ajDebug("++seqRead set format %d\n", textin->Format);
2979     }
2980     else
2981     {                                   /* one format specified */
2982         ajDebug("seqRead: one format specified\n");
2983         ajFilebuffSetUnbuffered(textin->Filebuff);
2984 
2985         ajDebug("++seqRead known format %d\n", textin->Format);
2986         istat = seqReadFmt(thys, seqin, textin->Format);
2987 
2988         switch(istat)
2989         {
2990             case FMT_OK:
2991                 ajDebug("++seqRead OK (2), set format %d\n",
2992                         textin->Format);
2993                 seqDefine(thys, seqin);
2994 
2995                 return ajTrue;
2996             case FMT_BADTYPE:
2997                 ajDebug("seqRead: (a2) seqReadFmt stat == BADTYPE *failed*\n");
2998 
2999                 return ajFalse;
3000 
3001             case FMT_FAIL:
3002                 ajDebug("seqRead: (b2) seqReadFmt stat == FAIL *failed*\n");
3003 
3004                 return ajFalse;
3005 
3006             case FMT_NOMATCH:
3007                 ajDebug("seqRead: (c2) seqReadFmt stat == NOMATCH *try again*\n");
3008                 break;
3009             case FMT_EOF:
3010                 ajDebug("seqRead: (d2) seqReadFmt stat == EOF *try again*\n");
3011                 if(ajTextinGetRecords(textin))
3012                     ajErr("Error reading file '%F' with format '%s': "
3013                           "end-of-file before end of data "
3014                           "(read %u records, total %u)",
3015                           ajFilebuffGetFile(textin->Filebuff),
3016                           seqinFormatDef[textin->Format].Name,
3017                           ajTextinGetRecords(textin),
3018                           ajTextinGetTotrecords(textin));
3019                 break;                   /* simply end-of-file */
3020             case FMT_EMPTY:
3021                 ajWarn("Sequence '%S' has zero length, ignored",
3022                        ajSeqGetUsaS(thys));
3023                 ajDebug("seqRead: (e2) seqReadFmt stat == EMPTY *try again*\n");
3024                 break;
3025             default:
3026                 ajDebug("unknown code %d from seqReadFmt\n", stat);
3027         }
3028 
3029         ajSeqClear(thys); /* 1 : read, failed to match id/acc/query */
3030     }
3031 
3032     /* failed - probably entry/accession query failed. Can we try again? */
3033 
3034     ajDebug("seqRead failed - try again with format %d '%s' code %d\n",
3035             textin->Format, seqinFormatDef[textin->Format].Name, istat);
3036 
3037     ajDebug("Search:%B Chunk:%B Data:%x ajFileBuffEmpty:%B\n",
3038             textin->Search, textin->ChunkEntries,
3039             seqin->SeqData, ajFilebuffIsEmpty(buff));
3040 
3041     if(ajFilebuffIsEmpty(buff) && textin->ChunkEntries)
3042     {
3043         if(textaccess && !(*textaccess->Access)(textin))
3044             return ajFalse;
3045         else if(seqaccess && !(*seqaccess->Access)(seqin))
3046             return ajFalse;
3047         buff = textin->Filebuff;
3048     }
3049 
3050 
3051     /* need to check end-of-file to avoid repeats */
3052     while(textin->Search &&
3053           (textin->TextData || !ajFilebuffIsEmpty(buff)))
3054     {
3055         jstat = seqReadFmt(thys, seqin, textin->Format);
3056 
3057         switch(jstat)
3058         {
3059             case FMT_OK:
3060                 ajDebug("++seqRead OK (3), set format %d\n",
3061                         textin->Format);
3062                 seqDefine(thys, seqin);
3063 
3064                 return ajTrue;
3065 
3066             case FMT_BADTYPE:
3067                 ajDebug("seqRead: (a3) seqReadFmt stat == BADTYPE *failed*\n");
3068 
3069                 return ajFalse;
3070 
3071             case FMT_FAIL:
3072                 ajDebug("seqRead: (b3) seqReadFmt stat == FAIL *failed*\n");
3073 
3074                 return ajFalse;
3075 
3076             case FMT_NOMATCH:
3077                 ajDebug("seqRead: (c3) seqReadFmt stat == NOMATCH *try again*\n");
3078                 break;
3079             case FMT_EOF:
3080                 ajDebug("seqRead: (d3) seqReadFmt stat == EOF *failed*\n");
3081 
3082                 return ajFalse;                     /* we already tried again */
3083 
3084             case FMT_EMPTY:
3085                 if(istat != FMT_EMPTY)
3086                     ajWarn("Sequence '%S' has zero length, ignored",
3087                            ajSeqGetUsaS(thys));
3088                 ajDebug("seqRead: (e3) seqReadFmt stat == EMPTY *try again*\n");
3089                 break;
3090 
3091             default:
3092                 ajDebug("unknown code %d from seqReadFmt\n", stat);
3093         }
3094 
3095         ajSeqClear(thys); /* 1 : read, failed to match id/acc/query */
3096     }
3097 
3098     if(seqin->Input->Format)
3099         ajDebug("seqRead: *failed* to read sequence %S using format %s\n",
3100                 textin->Qry, seqinFormatDef[textin->Format].Name);
3101     else
3102         ajDebug("seqRead: *failed* to read sequence %S using any format\n",
3103                 textin->Qry);
3104 
3105     return ajFalse;
3106 }
3107 
3108 
3109 
3110 
3111 /* @funcstatic seqReadFasta ***************************************************
3112 **
3113 ** Given data in a sequence structure, tries to read everything needed
3114 ** using the FASTA format.
3115 **
3116 ** @param [w] thys [AjPSeq] Sequence object
3117 ** @param [u] seqin [AjPSeqin] Sequence input object
3118 ** @return [AjBool] ajTrue on success
3119 **
3120 ** @release 1.0.0
3121 ** @@
3122 ******************************************************************************/
3123 
seqReadFasta(AjPSeq thys,AjPSeqin seqin)3124 static AjBool seqReadFasta(AjPSeq thys, AjPSeqin seqin)
3125 {
3126     AjPFilebuff buff;
3127     AjPStr id   = NULL;
3128     AjPStr acc  = NULL;
3129     AjPStr sv   = NULL;
3130     AjPStr desc = NULL;
3131 
3132     const char *cp;
3133     ajlong fpos     = 0;
3134     ajlong fposb    = 0;
3135     AjBool ok       = ajTrue;
3136     AjPStr tmpline = NULL;
3137     const AjPStr badstr = NULL;
3138 
3139     ajDebug("seqReadFasta\n");
3140 
3141     buff = seqin->Input->Filebuff;
3142 
3143     /* ajFilebuffTrace(buff); */
3144 
3145     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3146     if(!ok)
3147         return ajFalse;
3148 
3149     fpos = ajTextinGetFpos(seqin->Input);
3150 
3151     ajDebug("First line: %S\n", seqReadLine);
3152 
3153     /* If ; then it is really PIR format */
3154     if(ajStrGetCharPos(seqReadLine, 3) == ';')
3155     {
3156         ajStrAssignSubS(&tmpline,seqReadLine, 4, -1);
3157         ajFmtPrintS(&seqReadLine, ">%S",tmpline);
3158         ajDebug("PIR format changed line to %S\n", seqReadLine);
3159         ajStrDel(&tmpline);
3160     }
3161 
3162     cp = ajStrGetPtr(seqReadLine);
3163 
3164     if(*cp != '>')
3165     {
3166         ajDebug("first line is not FASTA\n");
3167         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3168 
3169         return ajFalse;
3170     }
3171 
3172     if(!ajSeqParseFasta(seqReadLine, &id, &acc, &sv, &desc))
3173     {
3174         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3175 
3176         return ajFalse;
3177     }
3178 
3179     /* we know we will succeed from here ... no way to return ajFalse */
3180 
3181     ajFilebuffSetUnbuffered(buff);
3182 
3183     seqSetNameNospace(&thys->Name, id);
3184 
3185     if(ajStrGetLen(sv))
3186         seqSvSave(thys, sv);
3187 
3188     if(ajStrGetLen(acc))
3189         seqAccSave(thys, acc);
3190 
3191     ajStrAssignS(&thys->Desc, desc);
3192     ajStrDel(&id);
3193     ajStrDel(&acc);
3194     ajStrDel(&sv);
3195     ajStrDel(&desc);
3196 
3197     if(ajStrGetLen(seqin->Inseq))
3198     {                                  /* we have a sequence to use */
3199         ajDebug("++fasta use Inseq '%S'\n", seqin->Inseq);
3200         ajStrAssignS(&thys->Seq, seqin->Inseq);
3201         if(seqin->Input->Text)
3202             seqTextSeq(&thys->TextPtr, seqin->Inseq);
3203 
3204         ajFilebuffClear(buff, 0);
3205     }
3206     else
3207     {
3208         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3209         while(ok && !ajStrPrefixC(seqReadLine, ">"))
3210         {
3211             badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3212                                    seqin->Input->Format);
3213 
3214             if(badstr)
3215                 ajWarn("Sequence '%S' has bad character(s) '%S'",
3216                        thys->Name, badstr);
3217 
3218             ajDebug("++fasta append line '%S'\n", seqReadLine);
3219             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3220         }
3221 
3222         if(ok)
3223             ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3224         else
3225             ajFilebuffClear(buff, 0);
3226     }
3227 
3228     thys->Fpos = fpos;
3229 
3230     ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);
3231 
3232     return ajTrue;
3233 }
3234 
3235 
3236 
3237 
3238 /* @funcstatic seqReadFastq ***************************************************
3239 **
3240 ** Given data in a sequence structure, tries to read everything needed
3241 ** using the FASTQ format, but ignores quality values.
3242 **
3243 ** See the more specific fastq formats for parsers that read and process
3244 ** the quality scores.
3245 **
3246 ** @param [w] thys [AjPSeq] Sequence object
3247 ** @param [u] seqin [AjPSeqin] Sequence input object
3248 ** @return [AjBool] ajTrue on success
3249 **
3250 ** @release 6.1.0
3251 ** @@
3252 ******************************************************************************/
3253 
seqReadFastq(AjPSeq thys,AjPSeqin seqin)3254 static AjBool seqReadFastq(AjPSeq thys, AjPSeqin seqin)
3255 {
3256     AjPFilebuff buff;
3257     AjPStr id   = NULL;
3258     AjPStr acc  = NULL;
3259     AjPStr sv   = NULL;
3260     AjPStr desc = NULL;
3261 
3262     ajuint seqlen = 0;
3263     /*AjPStr qualstr = NULL;*/
3264     char minqual;
3265     char maxqual;
3266     char comqual;
3267 
3268     const char *cp;
3269     ajlong fpos     = 0;
3270     ajlong fposb    = 0;
3271     AjBool ok       = ajTrue;
3272     const AjPStr badstr = NULL;
3273 
3274     ajDebug("seqReadFastq\n");
3275 
3276     buff = seqin->Input->Filebuff;
3277 
3278     /* ajFilebuffTrace(buff); */
3279 
3280     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3281     if(!ok)
3282         return ajFalse;
3283 
3284     fpos = ajTextinGetFpos(seqin->Input);
3285 
3286     ajDebug("First line: %S\n", seqReadLine);
3287 
3288     cp = ajStrGetPtr(seqReadLine);
3289 
3290     if(*cp != '@')
3291     {
3292         ajDebug("first line is not FASTQ\n");
3293         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3294 
3295         return ajFalse;
3296     }
3297 
3298     if(!ajSeqParseFastq(seqReadLine, &id, &desc))
3299     {
3300         ajDebug("first line did not parse as FASTQ\n");
3301         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3302 
3303         return ajFalse;
3304     }
3305 
3306     seqSetNameNospace(&thys->Name, id);
3307 
3308     if(ajStrGetLen(sv))
3309         seqSvSave(thys, sv);
3310 
3311     if(ajStrGetLen(acc))
3312         seqAccSave(thys, acc);
3313 
3314     ajStrAssignS(&thys->Desc, desc);
3315     ajStrDel(&id);
3316     ajStrDel(&acc);
3317     ajStrDel(&sv);
3318     ajStrDel(&desc);
3319 
3320     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3321     while(ok &&
3322           !ajStrPrefixC(seqReadLine, "+"))
3323     {
3324         badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3325                                seqin->Input->Format);
3326 
3327         if(badstr)
3328             ajWarn("Sequence '%S' has bad character(s) '%S'",
3329                    thys->Name, badstr);
3330 
3331         ajDebug("++fastq append line '%S'\n", seqReadLine);
3332         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3333         ajDebug("++fastq sequence %4u '%S'\n",
3334                 ajStrGetLen(thys->Seq), thys->Seq);
3335     }
3336 
3337     if(!ok)
3338     {
3339         ajDebug("failed to find quality scores\n");
3340         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3341 
3342         return ajFalse;
3343     }
3344 
3345     seqlen = ajStrGetLen(thys->Seq);
3346 
3347     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3348 
3349     ajStrAssignClear(&seqQualStr);
3350 
3351     while(ok &&
3352           ((ajStrGetLen(seqQualStr) < seqlen) ||
3353            ajStrGetCharFirst(seqReadLine) !=  '@'))
3354     {
3355         seqqualAppendWarn(&seqQualStr, seqReadLine);
3356 
3357         ajDebug("++fastq append qualities '%S'\n", seqReadLine);
3358         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3359         ajDebug("++fastq qualities %3u '%S'\n",
3360                 ajStrGetLen(seqQualStr), seqQualStr);
3361     }
3362 
3363     minqual = ajStrGetAsciiLow(seqQualStr);
3364     maxqual = ajStrGetAsciiHigh(seqQualStr);
3365     comqual = ajStrGetAsciiCommon(seqQualStr);
3366 
3367     if(ajStrGetLen(seqQualStr) != seqlen)
3368     {
3369         ajDebug("length mismatch seq: %u quality: %u\n",
3370                 seqlen, ajStrGetLen(seqQualStr));
3371         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3372 
3373         return ajFalse;
3374     }
3375 
3376     if(ok)
3377         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3378     else
3379         ajFilebuffClear(buff, 0);
3380 
3381     thys->Fpos = fpos;
3382 
3383     ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);
3384 
3385     ajDebug("quality characters %d..%d (%d) '%c' '%c' (%c)\n",
3386             (int) minqual, (int) maxqual, (int) comqual,
3387             minqual, maxqual, comqual);
3388 
3389     return ajTrue;
3390 }
3391 
3392 
3393 
3394 
3395 /* @funcstatic seqReadFastqSanger *********************************************
3396 **
3397 ** Given data in a sequence structure, tries to read everything needed
3398 ** using the FASTQ format, and interprets Sanger (phred) scores.
3399 **
3400 ** @param [w] thys [AjPSeq] Sequence object
3401 ** @param [u] seqin [AjPSeqin] Sequence input object
3402 ** @return [AjBool] ajTrue on success
3403 **
3404 ** @release 6.1.0
3405 ** @@
3406 ******************************************************************************/
3407 
seqReadFastqSanger(AjPSeq thys,AjPSeqin seqin)3408 static AjBool seqReadFastqSanger(AjPSeq thys, AjPSeqin seqin)
3409 {
3410     AjPFilebuff buff;
3411     AjPStr id   = NULL;
3412     AjPStr acc  = NULL;
3413     AjPStr sv   = NULL;
3414     AjPStr desc = NULL;
3415 
3416     ajuint seqlen = 0;
3417 
3418 /*
3419 **    char minqual;
3420 **    char maxqual;
3421 **    char comqual;
3422 */
3423 
3424     const char *cp;
3425     ajint iqual;
3426     ajlong fpos     = 0;
3427     AjBool ok       = ajTrue;
3428     const AjPStr badstr = NULL;
3429 
3430     /*    ajint amin = 0; */
3431     ajint qmin = 33;
3432     ajint qmax = 126;
3433     ajuint i;
3434     ajuint cntseq = 0;
3435     ajuint cntqual = 0;
3436     ajuint cntnewline = 0;
3437 
3438     /* ajDebug("seqReadFastqSanger\n"); */
3439 
3440     buff = seqin->Input->Filebuff;
3441 
3442     /* ajFilebuffTrace(buff); */
3443 
3444     ok = ajTextinStoreReadline(seqin->Input, &seqSaveLine, &thys->TextPtr);
3445     if(!ok)
3446         return ajFalse;
3447 
3448     fpos = ajTextinGetFpos(seqin->Input);
3449 
3450     /* ajDebug("First line: %S\n", seqSaveLine); */
3451 
3452     cp = MAJSTRGETPTR(seqSaveLine);
3453 
3454     if(*cp != '@')
3455     {
3456         /* ajDebug("first line is not FASTQ\n"); */
3457         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3458 
3459         return ajFalse;
3460     }
3461 
3462     if(!ajSeqParseFastq(seqSaveLine, &id, &desc))
3463     {
3464         /* ajDebug("first line did not parse as FASTQ\n"); */
3465         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3466 
3467         return ajFalse;
3468     }
3469 
3470     seqSetNameNospace(&thys->Name, id);
3471 
3472     if(MAJSTRGETLEN(sv))
3473         seqSvSave(thys, sv);
3474 
3475     if(MAJSTRGETLEN(acc))
3476         seqAccSave(thys, acc);
3477 
3478     ajStrAssignS(&thys->Desc, desc);
3479     ajStrDel(&id);
3480     ajStrDel(&acc);
3481     ajStrDel(&sv);
3482     ajStrDel(&desc);
3483 
3484     i = MAJSTRGETLEN(seqSaveLine) - 1;
3485     while(ajStrGetCharPos(seqSaveLine, i) == '\n' ||
3486           ajStrGetCharPos(seqSaveLine, i) == '\r')
3487     {
3488         cntnewline++;
3489         i--;
3490     }
3491 
3492     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3493     while(ok &&
3494           ajStrGetCharFirst(seqReadLine) != '+')
3495     {
3496         cntseq += MAJSTRGETLEN(seqReadLine) - cntnewline;
3497         badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3498                                seqin->Input->Format);
3499 
3500         if(badstr)
3501             ajWarn("Sequence '%S' has bad character(s) '%S'",
3502                    thys->Name, badstr);
3503 
3504         /* ajDebug("++fastq append line '%S'\n", seqReadLine); */
3505         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3506         /* ajDebug("++fastq sequence %4u '%S'\n",
3507            ajStrGetLen(thys->Seq), thys->Seq); */
3508     }
3509 
3510     if(!ok)
3511     {
3512         /* ajDebug("failed to find quality scores\n"); */
3513         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3514 
3515         return ajFalse;
3516     }
3517     if(MAJSTRGETLEN(seqReadLine) > (cntnewline+1))
3518     {
3519         ajStrPasteCountK(&seqReadLine, 0,'@', 1);
3520         if(!ajStrMatchS(seqReadLine, seqSaveLine))
3521         {
3522             ajStrPasteCountK(&seqReadLine, 0,'+', 1);
3523             ajWarn("Mismatch in file '%F' + line "
3524                    "does not match first line '%.*S' '%.*S'",
3525                    ajFilebuffGetFile(buff),
3526                    (ajuint)(MAJSTRGETLEN(seqSaveLine) - cntnewline),
3527                    seqSaveLine,
3528                    (ajuint) (MAJSTRGETLEN(seqReadLine) - cntnewline),
3529                    seqReadLine);
3530         }
3531     }
3532 
3533     seqlen = MAJSTRGETLEN(thys->Seq);
3534 
3535     if(seqlen < cntseq)
3536     {
3537         ajWarn("FASTQ format '%F' sequence '%S' "
3538                "sequence skipped %u character(s)",
3539                ajFilebuffGetFile(buff), thys->Name, cntseq - seqlen);
3540     }
3541     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3542 
3543     ajStrAssignClear(&seqQualStr);
3544     ajStrAssignClear(&seqSaveLine2);
3545 
3546     while(ok &&
3547           ((MAJSTRGETLEN(seqQualStr) < seqlen) ||
3548            ajStrGetCharFirst(seqReadLine) != '@'))
3549     {
3550         if((ajStrGetCharFirst(seqReadLine) == '@') &&
3551            !MAJSTRGETLEN(seqSaveLine2))
3552             ajStrAssignS(&seqSaveLine2, seqReadLine);
3553 
3554         cntqual += MAJSTRGETLEN(seqReadLine) - cntnewline;
3555         seqqualAppendWarn(&seqQualStr, seqReadLine);
3556 
3557         /* ajDebug("++fastq append qualities '%S'\n", seqReadLine); */
3558         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3559         /* ajDebug("++fastq qualities %3u '%S'\n",
3560            ajStrGetLen(seqQualStr), seqQualStr); */
3561     }
3562 
3563 /*
3564 **    minqual = ajStrGetAsciiLow(seqQualStr);
3565 **    maxqual = ajStrGetAsciiHigh(seqQualStr);
3566 **    comqual = ajStrGetAsciiCommon(seqQualStr);
3567 */
3568 
3569     if(MAJSTRGETLEN(seqQualStr) != seqlen)
3570     {
3571         ajWarn("FASTQ quality length mismatch '%F' '%S' "
3572                "expected: %u found: %u",
3573                ajFilebuffGetFile(buff), thys->Name,
3574                seqlen, ajStrGetLen(seqQualStr));
3575         if((MAJSTRGETLEN(seqQualStr) > seqlen) &&
3576            MAJSTRGETLEN(seqSaveLine2))
3577         {
3578             ajStrTrimEndC(&seqSaveLine2, "\n\r");
3579             ajWarn("(Possible short quality record before '%S')",
3580                    seqSaveLine2);
3581         }
3582     }
3583     if(MAJSTRGETLEN(seqQualStr) < cntqual)
3584     {
3585         ajWarn("FASTQ format '%F' sequence '%S' "
3586                "quality skipped %u character(s)",
3587                ajFilebuffGetFile(buff), thys->Name,
3588                cntqual - MAJSTRGETLEN(seqQualStr));
3589     }
3590 
3591 
3592     if(ok)
3593         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3594     else
3595         ajFilebuffClear(buff, 0);
3596 
3597     thys->Fpos = fpos;
3598 
3599     /* ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb); */
3600 
3601     /* ajDebug("Sanger: %d..%d (%d)\n",
3602        (ajint) minqual, (ajint) maxqual, (ajint) comqual); */
3603 
3604     cp = MAJSTRGETPTR(seqQualStr);
3605     i=0;
3606 
3607     if(seqlen > thys->Qualsize)
3608         AJCRESIZE(thys->Accuracy, seqlen);
3609 
3610     thys->Qualsize = seqlen;
3611 
3612     if(MAJSTRGETLEN(seqQualStr) > thys->Qualsize)
3613         AJCRESIZE(thys->Accuracy, MAJSTRGETLEN(seqQualStr));
3614 
3615     /*
3616     ** Sanger uses Phred quality calculated from error probability p
3617     ** Qp = -10 log (p)
3618     **
3619     ** For Sanger (phred) p = 1 / 10**(Q/10)
3620     ** 10: p=0.1 20: p=0.01 etc.
3621     */
3622 
3623     while (*cp)
3624     {
3625         iqual = *cp++;
3626         if(iqual < qmin)
3627         {
3628             ajWarn("FASTQ-SANGER '%F' sequence '%S' "
3629                    "quality value '%c' too low",
3630                    ajFilebuffGetFile(buff), thys->Name,
3631                    (char) iqual);
3632             iqual = qmin;
3633         }
3634         if(iqual > qmax)
3635         {
3636             ajWarn("FASTQ-SANGER '%F' sequence '%S' "
3637                    "quality value '%c' too high",
3638                    ajFilebuffGetFile(buff), thys->Name,
3639                    (char) iqual);
3640             iqual = qmax;
3641         }
3642         thys->Accuracy[i++] = seqQualPhred[iqual];
3643     }
3644 
3645 
3646 /*
3647 **    ajDebug("quality characters %d..%d (%d) '%c' '%c' (%c) "
3648 **            "scores %d..%d (%d)\n",
3649 **            (int) minqual, (int) maxqual, (int) comqual,
3650 **            minqual, maxqual, comqual,
3651 **            (amin + minqual - qmin), (amin + maxqual - qmin),
3652 **            (amin + comqual - qmin));
3653 */
3654 
3655     ajStrAssignClear(&seqQualStr);
3656 
3657     return ajTrue;
3658 }
3659 
3660 
3661 
3662 
3663 /* #funcstatic seqReadFastqInt ************************************************
3664 **
3665 ** Given data in a sequence structure, tries to read everything needed
3666 ** using the FASTQ numeric format, and interprets integer Solexa scores.
3667 **
3668 ** #param [w] thys [AjPSeq] Sequence object
3669 ** #param [u] seqin [AjPSeqin] Sequence input object
3670 ** #return [AjBool] ajTrue on success
3671 ** ##
3672 ******************************************************************************/
3673 
3674 /*
3675 //static AjBool seqReadFastqInt(AjPSeq thys, AjPSeqin seqin)
3676 //{
3677 //    AjPFilebuff buff;
3678 //    AjPStr id   = NULL;
3679 //    AjPStr acc  = NULL;
3680 //    AjPStr sv   = NULL;
3681 //    AjPStr desc = NULL;
3682 //
3683 //    ajuint seqlen = 0;
3684 //    AjPStr qualstr = NULL;
3685 //
3686 //    const char *cp;
3687 //    ajlong fpos     = 0;
3688 //    ajlong fposb    = 0;
3689 //    AjBool ok       = ajTrue;
3690 //
3691 //    const AjPStr badstr = NULL;
3692 //
3693 //    ajuint i;
3694 //    AjBool badwarn = ajFalse;
3695 //    double sval;
3696 //    double pval;
3697 //    double qval;
3698 //
3699 //    ajDebug("seqReadFastqInt\n");
3700 //
3701 //    buff = seqin->Input->Filebuff;
3702 //
3703 //    ok = ajTextinStoreReadline(seqin->Input, &seqReadLine,  &thys->TextPtr);
3704 //    if(!ok)
3705 //      return ajFalse;
3706 //
3707 //    fpos = ajTextinGetFpos(seqin->Input);
3708 //
3709 //    ajDebug("First line: %S\n", seqReadLine);
3710 //
3711 //    cp = ajStrGetPtr(seqReadLine);
3712 //
3713 //    if(*cp != '@')
3714 //    {
3715 //      ajDebug("first line is not FASTQ\n");
3716 //      ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3717 //
3718 //      return ajFalse;
3719 //    }
3720 //
3721 //    if(!ajSeqParseFastq(seqReadLine, &id, &desc))
3722 //    {
3723 //      ajDebug("first line did not parse as FASTQ\n");
3724 //      ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3725 //
3726 //      return ajFalse;
3727 //    }
3728 //
3729 //    seqSetNameNospace(&thys->Name, id);
3730 //
3731 //    if(ajStrGetLen(sv))
3732 //      seqSvSave(thys, sv);
3733 //
3734 //    if(ajStrGetLen(acc))
3735 //      seqAccSave(thys, acc);
3736 //
3737 //    ajStrAssignS(&thys->Desc, desc);
3738 //    ajStrDel(&id);
3739 //    ajStrDel(&acc);
3740 //    ajStrDel(&sv);
3741 //    ajStrDel(&desc);
3742 //
3743 //    ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3744 //    while(ok &&
3745 //          !ajStrPrefixC(seqReadLine, "+"))
3746 //    {
3747 //        badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3748                                  seqin->Input->Format);
3749 //
3750 //        if(badstr)
3751 //            ajWarn("Sequence '%S' has bad character(s) '%S'",
3752 //                   thys->Name, badstr);
3753 //
3754 //        ajDebug("++fastq append line '%S'\n", seqReadLine);
3755 //        ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3756 //        ajDebug("++fastq sequence %4u '%S'\n",
3757 //                ajStrGetLen(thys->Seq), thys->Seq);
3758 //    }
3759 //
3760 //    if(!ok)
3761 //    {
3762 //      ajDebug("failed to find quality scores\n");
3763 //      ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3764 //
3765 //      return ajFalse;
3766 //    }
3767 //
3768 //    seqlen = ajStrGetLen(thys->Seq);
3769 //
3770 //    ok = ajTextinStoreReadline(seqin->Input, &seqReadLine,  &thys->TextPtr);
3771 //
3772 //    i=0;
3773 //    if(seqlen > thys->Qualsize)
3774 //    {
3775 //        AJCRESIZE(thys->Accuracy, seqlen);
3776 //        thys->Qualsize = seqlen;
3777 //    }
3778 //
3779 //    ajStrAssignClear(&seqQualStr);
3780 //    while(ok &&
3781 //          (!ajStrPrefixC(seqReadLine, "@")))
3782 //    {
3783 //      ajStrTokenAssignC(&handle, seqReadLine, " ,\n\r\t");
3784 //        while(ajStrTokenNextParse(seqHandle, &seqQualStr))
3785 //        {
3786 //            if(i >= seqlen){
3787 //                if(!badwarn)
3788 //                    ajWarn("Bad quality '%S' for base %d "
3789 //                       "in fastq-int format\n",
3790 //                       qualstr, i);
3791 //                badwarn = ajTrue;
3792 //            }
3793 //            else if(!ajStrToDouble(seqQualStr, &sval))
3794 //            {
3795 //                if(!badwarn)
3796 //                    ajWarn("Bad quality '%S' for base %d "
3797 //                         "in fastq-int format\n",
3798 //                         qualstr, i);
3799 //                badwarn = ajTrue;
3800 //                i++;
3801 //            }
3802 //            else
3803 //            {
3804 //                pval = pow(10.0, (sval / -10.0));
3805 //                qval = pval / (1.0 + pval);
3806 //                thys->Accuracy[i++] = -10.0 * log10(qval);
3807 //            }
3808 //        }
3809 //
3810 //        ajDebug("++fastq append qualities '%S'\n", seqReadLine);
3811 //        ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3812 //    }
3813 //
3814 //    if(i != seqlen)
3815 //    {
3816 //      ajWarn("length mismatch seq: %u quality: %u\n",
3817 //                seqlen, i);
3818 //      ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3819 //
3820 //      return ajFalse;
3821 //    }
3822 //
3823 //    if(ok)
3824 //        ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3825 //    else
3826 //        ajFilebuffClear(buff, 0);
3827 //
3828 //    thys->Fpos = fpos;
3829 //
3830 //    ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);
3831 //
3832 //    ajStrTokenDel(&handle);
3833 //
3834 //    return ajTrue;
3835 //}
3836 */
3837 
3838 
3839 
3840 
3841 /* @funcstatic seqReadFastqIllumina *******************************************
3842 **
3843 ** Given data in a sequence structure, tries to read everything needed
3844 ** using the FASTQ format, and processes phred quality scores
3845 ** with Illumina encoding.
3846 **
3847 ** @param [w] thys [AjPSeq] Sequence object
3848 ** @param [u] seqin [AjPSeqin] Sequence input object
3849 ** @return [AjBool] ajTrue on success
3850 **
3851 ** @release 6.1.0
3852 ** @@
3853 ******************************************************************************/
3854 
seqReadFastqIllumina(AjPSeq thys,AjPSeqin seqin)3855 static AjBool seqReadFastqIllumina(AjPSeq thys, AjPSeqin seqin)
3856 {
3857     AjPFilebuff buff;
3858     AjPStr id   = NULL;
3859     AjPStr acc  = NULL;
3860     AjPStr sv   = NULL;
3861     AjPStr desc = NULL;
3862 
3863     ajuint seqlen = 0;
3864     /*AjPStr qualstr = NULL;*/
3865 /*
3866 **    char minqual;
3867 **    char maxqual;
3868 **    char comqual;
3869 */
3870 
3871     const char *cp;
3872     ajint iqual;
3873     ajlong fpos     = 0;
3874     AjBool ok       = ajTrue;
3875     const AjPStr badstr = NULL;
3876 
3877     /*ajint amin = 0;*/
3878     ajint qmin = 64;
3879     ajint qmax = 126;
3880     ajuint i;
3881 
3882     ajDebug("seqReadFastqIllumina\n");
3883 
3884     buff = seqin->Input->Filebuff;
3885 
3886     /* ajFilebuffTrace(buff); */
3887 
3888     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3889     if(!ok)
3890         return ajFalse;
3891 
3892     fpos = ajTextinGetFpos(seqin->Input);
3893 
3894     ajDebug("First line: %S\n", seqReadLine);
3895 
3896     cp = ajStrGetPtr(seqReadLine);
3897 
3898     if(*cp != '@')
3899     {
3900         ajDebug("first line is not FASTQ\n");
3901         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3902 
3903         return ajFalse;
3904     }
3905 
3906     if(!ajSeqParseFastq(seqReadLine, &id, &desc))
3907     {
3908         ajDebug("first line did not parse as FASTQ\n");
3909         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3910 
3911         return ajFalse;
3912     }
3913 
3914     seqSetNameNospace(&thys->Name, id);
3915 
3916     if(ajStrGetLen(sv))
3917         seqSvSave(thys, sv);
3918 
3919     if(ajStrGetLen(acc))
3920         seqAccSave(thys, acc);
3921 
3922     ajStrAssignS(&thys->Desc, desc);
3923     ajStrDel(&id);
3924     ajStrDel(&acc);
3925     ajStrDel(&sv);
3926     ajStrDel(&desc);
3927 
3928     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3929     while(ok &&
3930           !ajStrPrefixC(seqReadLine, "+"))
3931     {
3932         badstr = seqAppendWarn(&thys->Seq, seqReadLine,
3933                                seqin->Input->Format);
3934 
3935         if(badstr)
3936             ajWarn("Sequence '%S' has bad character(s) '%S'",
3937                    thys->Name, badstr);
3938 
3939         ajDebug("++fastq append line '%S'\n", seqReadLine);
3940         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3941         ajDebug("++fastq sequence %4u '%S'\n",
3942                 ajStrGetLen(thys->Seq), thys->Seq);
3943     }
3944 
3945     if(!ok)
3946     {
3947         ajDebug("failed to find quality scores\n");
3948         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3949 
3950         return ajFalse;
3951     }
3952 
3953     seqlen = ajStrGetLen(thys->Seq);
3954 
3955     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3956 
3957     ajStrAssignClear(&seqQualStr);
3958 
3959     while(ok &&
3960           ((ajStrGetLen(seqQualStr) < seqlen) ||
3961            ajStrGetCharFirst(seqReadLine) != '@'))
3962     {
3963         seqqualAppendWarn(&seqQualStr, seqReadLine);
3964 
3965         ajDebug("++fastq append qualities '%S'\n", seqReadLine);
3966         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
3967         ajDebug("++fastq qualities %3u '%S'\n",
3968                 ajStrGetLen(seqQualStr), seqQualStr);
3969     }
3970 
3971 /*
3972 **    minqual = ajStrGetAsciiLow(seqQualStr);
3973 **    maxqual = ajStrGetAsciiHigh(seqQualStr);
3974 **    comqual = ajStrGetAsciiCommon(seqQualStr);
3975 */
3976 
3977     if(ajStrGetLen(seqQualStr) != seqlen)
3978     {
3979         ajDebug("length mismatch seq: %u quality: %u\n",
3980                 seqlen, ajStrGetLen(seqQualStr));
3981         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
3982 
3983         return ajFalse;
3984     }
3985 
3986     if(ok)
3987         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
3988     else
3989         ajFilebuffClear(buff, 0);
3990 
3991     thys->Fpos = fpos;
3992 
3993     /*ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);*/
3994 
3995     /*ajDebug("Illumina: %d..%d (%d)\n",
3996       (ajint) minqual, (ajint) maxqual, (ajint) comqual);*/
3997 
3998     cp = ajStrGetPtr(seqQualStr);
3999     i=0;
4000 
4001     if(seqlen > thys->Qualsize)
4002         AJCRESIZE(thys->Accuracy, seqlen);
4003 
4004     thys->Qualsize = seqlen;
4005 
4006     /*
4007     ** Illumina uses Phred quality calculated from error probability p
4008     ** Qp = -10 log (p)
4009     **
4010     ** For Sanger (phred) p = 1 / 10**(Q/10)
4011     ** 10: p=0.1 20: p=0.01 etc.
4012     */
4013 
4014     while (*cp)
4015     {
4016         iqual = *cp++;
4017         if(iqual < qmin)
4018         {
4019             ajWarn("FASTQ-ILLUMINA quality value too low '%F' '%S' '%c'",
4020                    ajFilebuffGetFile(buff), thys->Name,
4021                    (char) iqual);
4022             iqual = qmin;
4023         }
4024         if(iqual > qmax)
4025         {
4026             ajWarn("FASTQ-ILLUMINA quality value too high '%F' '%S' '%c'",
4027                    ajFilebuffGetFile(buff), thys->Name,
4028                    (char) iqual);
4029             iqual = qmax;
4030         }
4031         thys->Accuracy[i++] = seqQualIllumina[iqual];
4032     }
4033 
4034 /*
4035 **    ajDebug("quality characters %d..%d (%d) '%c' '%c' (%c) "
4036 **            "scores %d..%d (%d)\n",
4037 **            (int) minqual, (int) maxqual, (int) comqual,
4038 **            minqual, maxqual, comqual,
4039 **            (amin + minqual - qmin), (amin + maxqual - qmin),
4040 **            (amin + comqual - qmin));
4041 */
4042 
4043     return ajTrue;
4044 }
4045 
4046 
4047 
4048 
4049 
4050 /* @funcstatic seqReadFastqSolexa *********************************************
4051 **
4052 ** Given data in a sequence structure, tries to read everything needed
4053 ** using the FASTQ format, and processes Illumina/Solexa quality scores.
4054 **
4055 ** @param [w] thys [AjPSeq] Sequence object
4056 ** @param [u] seqin [AjPSeqin] Sequence input object
4057 ** @return [AjBool] ajTrue on success
4058 **
4059 ** @release 6.1.0
4060 ** @@
4061 ******************************************************************************/
4062 
seqReadFastqSolexa(AjPSeq thys,AjPSeqin seqin)4063 static AjBool seqReadFastqSolexa(AjPSeq thys, AjPSeqin seqin)
4064 {
4065     AjPFilebuff buff;
4066     AjPStr id   = NULL;
4067     AjPStr acc  = NULL;
4068     AjPStr sv   = NULL;
4069     AjPStr desc = NULL;
4070 
4071     ajuint seqlen = 0;
4072     /*AjPStr qualstr = NULL;*/
4073 
4074 /*
4075 **    char minqual;
4076 **    char maxqual;
4077 **    char comqual;
4078 */
4079 
4080     const char *cp;
4081     ajint iqual;
4082     ajlong fpos     = 0;
4083     AjBool ok       = ajTrue;
4084     const AjPStr badstr = NULL;
4085 
4086     /*ajint amin = 0;*/
4087     ajint qmin = 59;
4088     ajint qmax = 126;
4089     ajuint i;
4090 /*
4091 **    double sval;
4092 **    double pval;
4093 **    double qval;
4094 */
4095 
4096     /*ajDebug("seqReadFastqSolexa\n");*/
4097 
4098     buff = seqin->Input->Filebuff;
4099 
4100     /* ajFilebuffTrace(buff); */
4101 
4102     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4103     if(!ok)
4104         return ajFalse;
4105 
4106     fpos = ajTextinGetFpos(seqin->Input);
4107 
4108     /*ajDebug("First line: %S\n", seqReadLine);*/
4109 
4110     cp = ajStrGetPtr(seqReadLine);
4111 
4112     if(*cp != '@')
4113     {
4114         ajDebug("first line is not FASTQ\n");
4115         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4116 
4117         return ajFalse;
4118     }
4119 
4120     if(!ajSeqParseFastq(seqReadLine, &id, &desc))
4121     {
4122         ajDebug("first line did not parse as FASTQ\n");
4123         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4124 
4125         return ajFalse;
4126     }
4127 
4128     seqSetNameNospace(&thys->Name, id);
4129 
4130     if(ajStrGetLen(sv))
4131         seqSvSave(thys, sv);
4132 
4133     if(ajStrGetLen(acc))
4134         seqAccSave(thys, acc);
4135 
4136     ajStrAssignS(&thys->Desc, desc);
4137     ajStrDel(&id);
4138     ajStrDel(&acc);
4139     ajStrDel(&sv);
4140     ajStrDel(&desc);
4141 
4142     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4143     while(ok &&
4144           !ajStrPrefixC(seqReadLine, "+"))
4145     {
4146         badstr = seqAppendWarn(&thys->Seq, seqReadLine,
4147                                seqin->Input->Format);
4148 
4149         if(badstr)
4150             ajWarn("Sequence '%S' has bad character(s) '%S'",
4151                    thys->Name, badstr);
4152 
4153         ajDebug("++fastq append line '%S'\n", seqReadLine);
4154         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4155         ajDebug("++fastq sequence %4u '%S'\n",
4156                 ajStrGetLen(thys->Seq), thys->Seq);
4157     }
4158 
4159     if(!ok)
4160     {
4161         ajDebug("failed to find quality scores\n");
4162         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4163 
4164         return ajFalse;
4165     }
4166 
4167     seqlen = ajStrGetLen(thys->Seq);
4168 
4169     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4170 
4171     ajStrAssignClear(&seqQualStr);
4172 
4173     while(ok &&
4174           ((ajStrGetLen(seqQualStr) < seqlen) ||
4175            ajStrGetCharFirst(seqReadLine) != '@'))
4176     {
4177         seqqualAppendWarn(&seqQualStr, seqReadLine);
4178 
4179         ajDebug("++fastq append qualities '%S'\n", seqReadLine);
4180         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4181         ajDebug("++fastq qualities %3u '%S'\n",
4182                 ajStrGetLen(seqQualStr), seqQualStr);
4183     }
4184 
4185 /*
4186 **    minqual = ajStrGetAsciiLow(seqQualStr);
4187 **    maxqual = ajStrGetAsciiHigh(seqQualStr);
4188 **    comqual = ajStrGetAsciiCommon(seqQualStr);
4189 */
4190 
4191     if(ajStrGetLen(seqQualStr) != seqlen)
4192     {
4193         ajDebug("length mismatch seq: %u quality: %u\n",
4194                 seqlen, ajStrGetLen(seqQualStr));
4195         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4196 
4197         return ajFalse;
4198     }
4199 
4200     if(ok)
4201         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4202     else
4203         ajFilebuffClear(buff, 0);
4204 
4205     thys->Fpos = fpos;
4206 
4207     /*ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);*/
4208 
4209     /*ajDebug("Solexa: %d..%d (%d)\n",
4210       (ajint) minqual, (ajint) maxqual, (ajint) comqual);*/
4211 
4212     cp = ajStrGetPtr(seqQualStr);
4213     i=0;
4214 
4215     if(seqlen > thys->Qualsize)
4216         AJCRESIZE(thys->Accuracy, seqlen);
4217 
4218     thys->Qualsize = seqlen;
4219 
4220     /*
4221     ** Sanger uses Phred quality calculated from error probability p
4222     ** Qp = -10 log (p)
4223     ** Solexa adjusts for the probability of error
4224     ** Qs = -10 log ((p/(1-p))
4225     **
4226     ** For Sanger (phred) p = 1 / 10**(Q/10)
4227     ** 10: p=0.1 20: p=0.01 etc.
4228     **
4229     ** For Solexa (Illumina) ps = p / (1+p) where p is the phred probability
4230     ** calculation which we use as an intermediate value
4231     */
4232 
4233     while (*cp)
4234     {
4235         iqual = *cp++;
4236         if(iqual < qmin)
4237         {
4238             ajWarn("FASTQ-SOLEXA quality value too low '%F' '%S' '%c'",
4239                    ajFilebuffGetFile(buff), thys->Name,
4240                    (char) iqual);
4241             iqual = qmin;
4242         }
4243         if(iqual > qmax)
4244         {
4245             ajWarn("FASTQ-SOLEXA quality value too high '%F' '%S' '%c'",
4246                    ajFilebuffGetFile(buff), thys->Name,
4247                    (char) iqual);
4248             iqual = qmax;
4249         }
4250         thys->Accuracy[i++] = (float) seqQualSolexa[iqual];
4251     }
4252 /*
4253 **    ajDebug("quality characters %d..%d (%d) '%c' '%c' (%c) "
4254 **            "scores %d..%d (%d)\n",
4255 **            (int) minqual, (int) maxqual, (int) comqual,
4256 **            minqual, maxqual, comqual,
4257 **            (amin + minqual - qmin), (amin + maxqual - qmin),
4258 **            (amin + comqual - qmin));
4259 */
4260 
4261     return ajTrue;
4262 }
4263 
4264 
4265 
4266 
4267 /* @funcstatic seqReadDbId ****************************************************
4268 **
4269 ** Given data in a sequence structure, tries to read everything needed
4270 ** using the FASTA >db id format.
4271 **
4272 ** @param [w] thys [AjPSeq] Sequence object
4273 ** @param [u] seqin [AjPSeqin] Sequence input object
4274 ** @return [AjBool] ajTrue on success
4275 **
4276 ** @release 1.0.0
4277 ** @@
4278 ******************************************************************************/
4279 
seqReadDbId(AjPSeq thys,AjPSeqin seqin)4280 static AjBool seqReadDbId(AjPSeq thys, AjPSeqin seqin)
4281 {
4282     AjPFilebuff buff;
4283 
4284     const char *cp;
4285     const AjPStr vacc = NULL;
4286     ajlong fpos     = 0;
4287     ajlong fposb    = 0;
4288     AjBool ok       = ajTrue;
4289 
4290     ajDebug("seqReadDbId\n");
4291 
4292     buff = seqin->Input->Filebuff;
4293     /* ajFilebuffTrace(buff); */
4294 
4295     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4296     if(!ok)
4297         return ajFalse;
4298 
4299     fpos = ajTextinGetFpos(seqin->Input);
4300 
4301     /* If ; then it is really PIR format */
4302     if(ajStrGetCharPos(seqReadLine, 3) == ';')
4303         return ajFalse;
4304 
4305     cp = ajStrGetPtr(seqReadLine);
4306 
4307     if(*cp != '>')
4308     {
4309         ajDebug("first line is not FASTA\n");
4310         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4311 
4312         return ajFalse;
4313     }
4314 
4315     ajStrTokenAssignC(&seqHandle, seqReadLine, "> ");
4316     ajStrTokenStepC(seqHandle, " \t\n\r");
4317     ajStrTokenNextParseC(seqHandle, " \t\n\r", &seqToken);
4318     seqSetName(thys, seqToken);
4319 
4320     ajStrTokenNextParse(seqHandle, &seqToken);
4321 
4322     vacc = ajSeqtestIsSeqversion(seqToken);
4323     if(vacc)
4324     {
4325         seqSvSave(thys, seqToken);
4326         seqAccSave(thys, vacc);
4327         ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc);
4328     }
4329     else if(ajSeqtestIsAccession(seqToken))
4330     {
4331         seqAccSave(thys, seqToken);
4332         ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc);
4333     }
4334     else
4335     {
4336         ajStrAssignS(&thys->Desc, seqToken);
4337 
4338         if(ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken))
4339         {
4340             ajStrAppendC(&thys->Desc, " ");
4341             ajStrAppendS(&thys->Desc, seqToken);
4342         }
4343     }
4344 
4345     ajStrDelStatic(&seqToken);
4346     ajStrTokenReset(seqHandle);
4347 
4348     if(ajStrGetLen(seqin->Inseq))
4349     {                                  /* we have a sequence to use */
4350         ajStrAssignS(&thys->Seq, seqin->Inseq);
4351 
4352         if(seqin->Input->Text)
4353             seqTextSeq(&thys->TextPtr, seqin->Inseq);
4354 
4355         ajFilebuffClear(buff, 0);
4356     }
4357     else
4358     {
4359         /* we know we will succeed from here ... no way to return ajFalse */
4360 
4361         ajFilebuffSetUnbuffered(buff);
4362 
4363         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4364         while(ok && !ajStrPrefixC(seqReadLine, ">"))
4365         {
4366             seqAppend(&thys->Seq, seqReadLine);
4367 
4368             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4369         }
4370 
4371         if(ok)
4372             ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4373         else
4374             ajFilebuffClear(buff, 0);
4375     }
4376 
4377     thys->Fpos = fpos;
4378 
4379     ajDebug("started at fpos %Ld ok: %B fposb: %Ld\n", fpos, ok, fposb);
4380 
4381     return ajTrue;
4382 }
4383 
4384 
4385 
4386 
4387 /* @funcstatic seqReadGde *****************************************************
4388 **
4389 ** Given data in a sequence structure, tries to read everything needed
4390 ** using the GDE format
4391 **
4392 ** @param [w] thys [AjPSeq] Sequence object
4393 ** @param [u] seqin [AjPSeqin] Sequence input object
4394 ** @return [AjBool] ajTrue on success
4395 **
4396 ** @release 6.6.0
4397 ** @@
4398 ******************************************************************************/
4399 
seqReadGde(AjPSeq thys,AjPSeqin seqin)4400 static AjBool seqReadGde(AjPSeq thys, AjPSeqin seqin)
4401 {
4402     AjPFilebuff buff;
4403 
4404     const char *cp;
4405     AjBool ok       = ajTrue;
4406     const AjPStr badstr = NULL;
4407     ajlong fpos;
4408 
4409     ajDebug("seqReadGde\n");
4410 
4411     buff = seqin->Input->Filebuff;
4412 
4413     /* ajFilebuffTrace(buff); */
4414 
4415     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4416     if(!ok)
4417         return ajFalse;
4418 
4419     fpos = ajTextinGetFpos(seqin->Input);
4420 
4421     ajDebug("First line: %S\n", seqReadLine);
4422 
4423     cp = ajStrGetPtr(seqReadLine);
4424 
4425     if(*cp != '#')
4426     {
4427         ajDebug("first line is not GDE\n");
4428         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4429 
4430         return ajFalse;
4431     }
4432 
4433     ajStrCutStart(&seqReadLine, 1);
4434 
4435     ajStrTokenAssign(&seqHandle, seqReadLine);
4436     ajStrTokenNextParse(seqHandle, &seqToken);
4437 
4438     seqSetNameNospace(&thys->Name, seqToken);
4439 
4440     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4441     while(ok &&
4442           !ajStrPrefixC(seqReadLine, "#"))
4443     {
4444         badstr = seqAppendWarn(&thys->Seq, seqReadLine,
4445                                seqin->Input->Format);
4446 
4447         if(badstr)
4448             ajWarn("Sequence '%S' has bad character(s) '%S'",
4449                    thys->Name, badstr);
4450 
4451         ajDebug("++fastq append line '%S'\n", seqReadLine);
4452         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4453         ajDebug("++fastq sequence %4u '%S'\n",
4454                 ajStrGetLen(thys->Seq), thys->Seq);
4455     }
4456 
4457     if(ok)
4458         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4459     else
4460         ajFilebuffClear(buff, 0);
4461 
4462     thys->Fpos = fpos;
4463 
4464     return ajTrue;
4465 }
4466 
4467 
4468 
4469 
4470 /* @funcstatic seqReadNbrf ****************************************************
4471 **
4472 ** Given data in a sequence structure, tries to read everything needed
4473 ** using NBRF format.
4474 **
4475 ** @param [w] thys [AjPSeq] Sequence object
4476 ** @param [u] seqin [AjPSeqin] Sequence input object
4477 ** @return [AjBool] ajTrue on success
4478 **
4479 ** @release 1.0.0
4480 ** @@
4481 ******************************************************************************/
4482 
seqReadNbrf(AjPSeq thys,AjPSeqin seqin)4483 static AjBool seqReadNbrf(AjPSeq thys, AjPSeqin seqin)
4484 {
4485     AjPStr idline = NULL;
4486     AjPStr tmpline = NULL;
4487 
4488     AjBool dofeat  = ajFalse;
4489     AjBool tryfeat = ajFalse;
4490     AjPStr    seqReadLine2 = NULL;
4491 
4492     AjBool ok;
4493     AjPFilebuff buff;
4494     AjBool skipheader;
4495 
4496     ajDebug("seqReadNbrf\n");
4497 
4498     buff = seqin->Input->Filebuff;
4499 
4500     if(!seqToken2)
4501     {
4502         seqToken2 = ajStrNew();
4503         seqReadLine2 = ajStrNew();
4504     }
4505 
4506     if(!seqFtFmtPir)
4507         ajStrAssignC(&seqFtFmtPir, "pir");
4508 
4509     if(!seqRegNbrfId)
4510         seqRegNbrfId = ajRegCompC("^>(..)[>;]([^ \t\n]+)");
4511 
4512     skipheader = ajTrue;
4513     while(skipheader)
4514     {
4515         if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
4516             return ajFalse;
4517 
4518         if(!ajStrPrefixC(seqReadLine, "C;") && !ajStrIsWhite(seqReadLine))
4519             skipheader = ajFalse;
4520     }
4521 
4522     ajDebug("nbrf first line:\n%S", seqReadLine);
4523 
4524     if(!ajRegExec(seqRegNbrfId, seqReadLine))
4525     {
4526         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4527         return ajFalse;
4528     }
4529 
4530     ajRegSubI(seqRegNbrfId, 1, &seqToken);
4531     ajRegSubI(seqRegNbrfId, 2, &thys->Name);
4532     ajDebug("parsed line name '%S' token '%S' token(1) '%c'\n",
4533             thys->Name, seqToken, ajStrGetCharFirst(seqToken));
4534     ajStrAssignSubS(&idline, seqReadLine, 4, -1);
4535 
4536     /*
4537     ** token has the NBRF 2-char type. First char is the type
4538     ** and second char is Linear, Circular, or 1
4539     ** or, for GCG databases, this is just '>>'
4540     */
4541 
4542     switch(toupper((ajint) ajStrGetCharFirst(seqToken)))
4543     {
4544         case 'P':
4545         case 'F':
4546             ajSeqSetProt(thys);
4547         break;
4548         case 'B':                               /* used by DIANA */
4549         case 'D':                               /* DNA */
4550         case 'R':                               /* RNA */
4551             ajSeqSetNuc(thys);
4552         break;
4553         default:
4554             ajWarn("Unknown NBRF sequence type '%S'", seqToken);
4555     }
4556 
4557     /* next line is the description, with no prefix */
4558 
4559     if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
4560     {
4561         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4562 
4563         return ajFalse;
4564     }
4565 
4566     ajStrAssignS(&thys->Desc, seqReadLine);
4567 
4568     if(ajStrGetCharLast(thys->Desc) == '\n')
4569         ajStrCutEnd(&thys->Desc, 1);
4570 
4571     /* read on, looking for feature and sequence lines */
4572 
4573     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4574 
4575     dofeat = ajFalse;
4576     tryfeat = seqinUfoLocal(seqin);
4577 
4578     while(ok && !ajStrPrefixC(seqReadLine, ">"))
4579     {
4580         if(ajStrGetCharPos(seqReadLine, 1) != ';')
4581             seqAppend(&thys->Seq, seqReadLine);
4582         else
4583         {
4584             if(ajStrPrefixC(seqReadLine, "C;Accession:"))
4585             {
4586                 ajStrAssignC(&seqReadLine2,ajStrGetPtr(seqReadLine)+13);
4587                 ajStrTokenAssignC(&seqHandle2,seqReadLine2, " ;\n\r");
4588 
4589                 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
4590                     seqAccSave(thys, seqToken2);
4591             }
4592 
4593             if(ajStrPrefixC(seqReadLine, "C;Species:"))
4594             {
4595                 ajStrAssignC(&seqReadLine2,ajStrGetPtr(seqReadLine)+11);
4596                 ajStrTokenAssignC(&seqHandle2,seqReadLine2, ";.\n\r");
4597 
4598                 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
4599                     seqTaxSave(thys, seqToken2, 1);
4600             }
4601 
4602             if(ajStrGetCharFirst(seqReadLine) == 'R')
4603             {                /* skip reference lines with no prefix */
4604                 while((ok=ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr)))
4605                     if(ajStrGetCharPos(seqReadLine,1)==';' ||
4606                        ajStrGetCharFirst(seqReadLine)=='>')
4607                         break;          /* X; line or next sequence */
4608 
4609                 if(ok)
4610                     continue;
4611             }
4612             else if(tryfeat && ajStrGetCharFirst(seqReadLine) == 'F')
4613             {                           /* feature lines */
4614                 if(!dofeat)
4615                 {
4616                     dofeat = ajTrue;
4617                     ajFeattabinDel(&seqin->Ftquery);
4618                     seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtPir,
4619                                                            thys->Name, "N");
4620                     ajDebug("seqin->Ftquery Filebuff %x\n",
4621                             seqin->Ftquery->Input->Filebuff);
4622                 }
4623 
4624                 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff,
4625                                 seqReadLine);
4626                 /* ajDebug("NBRF FEAT saved line:\n%S", seqReadLine); */
4627             }
4628         }
4629 
4630         if(ok)
4631             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4632 
4633         /* SRS 7 and SRS 8.0 put an extra ID line in here */
4634 
4635         /* SRS 8.1 is even worse - it has a peculiar bug that repeats
4636            the ID line but with a few digits in front, and then repeats the
4637            description */
4638 
4639         /* just for another oddity ... the extra ID line always starts >P1;
4640            even if the protein is a fragment */
4641 
4642         if(ok && !ajStrGetLen(thys->Seq) &&
4643            (ajStrFindAnyK(seqReadLine, '>') != -1))
4644         {
4645             ajStrAssignS(&tmpline, seqReadLine);
4646             ajStrTrimStartC(&tmpline,"0123456789");
4647             ajStrCutStart(&tmpline, 4);
4648 
4649             if(ajStrMatchS(tmpline, idline))
4650             {
4651                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4652 
4653                 if(ok && !ajStrIsWhite(seqReadLine)) /* SRS 8.1 desc line */
4654                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4655             }
4656         }
4657 
4658     }
4659 
4660     if(ajStrGetCharLast(thys->Seq) == '*')
4661         ajStrCutEnd(&thys->Seq, 1);
4662 
4663     if(ok)
4664         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4665     else
4666         ajFilebuffClear(buff, 0);
4667 
4668     if(dofeat)
4669     {
4670         ajDebug("seqin->Ftquery Filebuff %x\n",
4671                 seqin->Ftquery->Input->Filebuff);
4672         ajFeattableDel(&seqin->Fttable);
4673         thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
4674         if(thys->Fttable)
4675             ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
4676         /* ajFeattableTrace(thys->Fttable); */
4677         ajFeattabinClear(seqin->Ftquery);
4678     }
4679 
4680     ajStrDel(&idline);
4681     ajStrDel(&tmpline);
4682     ajStrDel(&seqReadLine2);
4683     ajStrTokenReset(seqHandle2);
4684     ajStrDelStatic(&seqToken);
4685     ajStrDelStatic(&seqToken2);
4686 
4687     return ajTrue;
4688 }
4689 
4690 
4691 
4692 
4693 /* @funcstatic seqReadNibble **************************************************
4694 **
4695 ** Given data in a sequence structure, tries to read everything needed
4696 ** using the half-byte comressed nibble format
4697 **
4698 ** @param [w] thys [AjPSeq] Sequence object
4699 ** @param [u] seqin [AjPSeqin] Sequence input object
4700 ** @return [AjBool] ajTrue on success
4701 **
4702 ** @release 6.6.0
4703 ** @@
4704 ******************************************************************************/
4705 
seqReadNibble(AjPSeq thys,AjPSeqin seqin)4706 static AjBool seqReadNibble(AjPSeq thys, AjPSeqin seqin)
4707 {
4708     AjPFilebuff buff;
4709     AjPFile fp;
4710 
4711     ajulong filestat = 0L;
4712     AjBool ok       = ajTrue;
4713     union lbytes
4714     {
4715         char chars[4];
4716         ajuint i;
4717     } seqbyte;
4718 
4719     ajuint seqlen = 0;
4720     ajuint buflen;
4721     ajuint base1;
4722     ajuint base2;
4723     AjPStr buf = NULL;
4724     char *cbuf;
4725     ajuint i;
4726     ajuint j;
4727 
4728     AjBool doreverse = AJFALSE;
4729     const char *nibblechars = "TCAGNNNNTCAGNNNN";
4730 
4731     ajDebug("seqReadNibble\n");
4732 
4733     buff = seqin->Input->Filebuff;
4734     fp = ajFilebuffGetFile(buff);
4735 
4736     if(ajFilebuffIsEnded(buff))
4737     {
4738         ajDebug("seqReadNibble buffer already ended\n");
4739 
4740         return ajFalse;
4741     }
4742 
4743     filestat = ajFileSeek(fp, 0L, SEEK_SET);
4744 
4745     if(filestat)
4746     {
4747         ajDebug("seqReadNibble rewind failed errno %d: %s\n",
4748                 errno, strerror(errno));
4749         return ajFalse;
4750     }
4751     else
4752     {
4753         if(ajFilebuffIsEnded(buff))
4754         {
4755             ajFileSeek(fp, 0L, SEEK_END);
4756             return ajFalse;
4757         }
4758 
4759         ajReadbinUint4(fp, &seqbyte.i);
4760         if(seqbyte.i == 0x6BE9eD3A)
4761         {
4762             ajDebug("seqReadNibble: Magic number found\n");
4763         }
4764         else if(seqbyte.i == 0x3AEDE96B)
4765         {
4766             ajDebug("seqReadNibble: Magic number is reversed\n");
4767             doreverse = ajTrue;
4768         }
4769         else
4770         {
4771             ajDebug("seqReadNibble: Magic number not found (%x)\n", seqbyte.i);
4772             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4773             ajStrAssignClear(&thys->Seq);
4774 
4775             return ajFalse;
4776         }
4777 
4778         ajReadbinUint4(fp, &seqlen);
4779         if(doreverse)
4780             ajByteRevUint(&seqlen);
4781 
4782         buflen = (1+seqlen)/2;
4783         buf = ajStrNewRes(buflen);
4784         ajStrSetValidLen(&buf, buflen);
4785         cbuf = ajStrGetuniquePtr(&buf);
4786 
4787         ajReadbinBinary(fp, buflen, 1, cbuf);
4788 
4789         j = 0;
4790         for(i=0; i < buflen; i++)
4791         {
4792             seqbyte.chars[0] = cbuf[i];
4793             base2 = seqbyte.chars[0] & 0x0F;
4794             base1 = (seqbyte.chars[0] >> 4);
4795             seqAppendK(&thys->Seq, nibblechars[base1]);
4796             if(++j < seqlen)
4797                 seqAppendK(&thys->Seq, nibblechars[base2]);
4798             ++j;
4799         }
4800 
4801         if(!ok)
4802         {
4803             ajFileSeek(fp,(ajlong) filestat,0);
4804 
4805             if(seqin->Input->Text)
4806                 ajStrAssignC(&thys->TextPtr, "");
4807 
4808             ajFilebuffResetPos(buff);
4809 
4810             return ajFalse;
4811         }
4812     }
4813 
4814     ajFilebuffClear(buff, -1);
4815     buff->File->End = ajTrue;   /* set to avoid rereading */
4816 
4817     if(!ajTextinGetRecords(seqin->Input))
4818         return ajFalse;
4819 
4820     return ajTrue;
4821 }
4822 
4823 
4824 
4825 
4826 /* @funcstatic seqReadGcg *****************************************************
4827 **
4828 ** Given data in a sequence structure, tries to read everything needed
4829 ** using GCG format.
4830 **
4831 ** @param [w] thys [AjPSeq] Sequence object
4832 ** @param [u] seqin [AjPSeqin] Sequence input object
4833 ** @return [AjBool] ajTrue on success
4834 **
4835 ** @release 1.0.0
4836 ** @@
4837 ******************************************************************************/
4838 
seqReadGcg(AjPSeq thys,AjPSeqin seqin)4839 static AjBool seqReadGcg(AjPSeq thys, AjPSeqin seqin)
4840 {
4841     AjBool ok;
4842 
4843     ajuint len     = 0;
4844     AjBool incomment = ajFalse;
4845 
4846     AjPFilebuff buff;
4847 
4848     buff = seqin->Input->Filebuff;
4849 
4850     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4851 
4852     while (ok && ajStrIsWhite(seqReadLine))
4853     {
4854         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4855         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4856     }
4857 
4858     if(!ok)
4859         return ajFalse;
4860 
4861     ajDebug("seqReadGcg first line ok: %B\n'%S'\n", ok, seqReadLine);
4862 
4863     /* test GCG 9.x file types if available */
4864     /* any type on the .. line will override this */
4865 
4866     if(ajStrPrefixC(seqReadLine, "!!NA_SEQUENCE"))
4867         ajSeqSetNuc(thys);
4868     else if(ajStrPrefixC(seqReadLine, "!!AA_SEQUENCE"))
4869         ajSeqSetProt(thys);
4870 
4871     if(!seqGcgDots(thys, seqin, &seqReadLine, seqMaxGcglines, &len))
4872     {
4873         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4874         return ajFalse;
4875     }
4876 
4877     ajDebug("   Gcg dots read ok len: %d\n", len);
4878 
4879     while(ok &&  (ajSeqGetLen(thys) < len))
4880     {
4881         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4882         if(ok)
4883         {
4884             seqAppendCommented(&thys->Seq, &incomment, seqReadLine);
4885             ajDebug("line %d seqlen: %d ok: %B\n",
4886                     ajTextinGetRecords(seqin->Input), ajSeqGetLen(thys), ok);
4887         }
4888     }
4889 
4890     ajDebug("lines: %d ajSeqGetLen : %d len: %d ok: %B\n",
4891             ajTextinGetRecords(seqin->Input), ajSeqGetLen(thys), len, ok);
4892 
4893     ajFilebuffClear(buff, 0);
4894 
4895     return ok;
4896 }
4897 
4898 
4899 
4900 
4901 /* @funcstatic seqReadNcbi ****************************************************
4902 **
4903 ** Given data in a sequence structure, tries to read everything needed
4904 ** using NCBI format.
4905 **
4906 ** @param [w] thys [AjPSeq] Sequence object
4907 ** @param [u] seqin [AjPSeqin] Sequence input object
4908 ** @return [AjBool] ajTrue on success
4909 **
4910 ** @release 1.0.0
4911 ** @@
4912 ******************************************************************************/
4913 
seqReadNcbi(AjPSeq thys,AjPSeqin seqin)4914 static AjBool seqReadNcbi(AjPSeq thys, AjPSeqin seqin)
4915 {
4916     AjPStr id        = NULL;
4917     AjPStr acc       = NULL;
4918     AjPStr sv        = NULL;
4919     AjPStr gi        = NULL;
4920     AjPStr db        = NULL;
4921     AjPStr desc      = NULL;
4922 
4923     AjPFilebuff buff;
4924 
4925     AjBool ok;
4926     const AjPStr badstr = NULL;
4927 
4928     buff = seqin->Input->Filebuff;
4929 
4930     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4931     if(!ok)
4932         return ajFalse;
4933 
4934     ajStrAssignClear(&id);
4935     ajStrAssignClear(&acc);
4936     ajStrAssignClear(&sv);
4937     ajStrAssignClear(&gi);
4938     ajStrAssignClear(&desc);
4939 
4940 
4941     if(!ajSeqParseNcbi(seqReadLine,&id,&acc,&sv,&gi,&db,&desc))
4942     {
4943         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
4944         ajStrDel(&id);
4945         ajStrDel(&acc);
4946         ajStrDel(&sv);
4947         ajStrDel(&gi);
4948         ajStrDel(&db);
4949         ajStrDel(&desc);
4950 
4951         return ajFalse;
4952     }
4953 
4954     ajDebug("parsed id '%S' acc '%S' sv '%S' gi '%S' db '%S' (%S) desc '%S'\n",
4955             id, acc, sv, gi, db, thys->Setdb, desc);
4956 
4957     ajStrAssignS(&thys->Setdb, db);
4958     ajDebug("set setdb '%S' db '%S'\n", thys->Setdb, thys->Db);
4959 
4960     if(ajStrGetLen(gi))
4961         ajStrAssignS(&thys->Gi, gi);
4962 
4963     if(ajStrGetLen(sv))
4964         seqSvSave(thys, sv);
4965 
4966     if(ajStrGetLen(acc))
4967         seqAccSave(thys, acc);
4968 
4969     seqSetName(thys, id);
4970     ajStrAssignS(&thys->Desc, desc);
4971 
4972 
4973     if(ajStrGetLen(seqin->Inseq))
4974     {                                  /* we have a sequence to use */
4975         ajStrAssignS(&thys->Seq, seqin->Inseq);
4976 
4977         if(seqin->Input->Text)
4978             seqTextSeq(&thys->TextPtr, seqin->Inseq);
4979 
4980         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
4981     }
4982     else
4983     {
4984         /* we know we will succeed from here ... no way to return ajFalse */
4985 
4986         ajFilebuffSetUnbuffered(buff);
4987 
4988         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4989         while(ok && !ajStrPrefixC(seqReadLine, ">"))
4990         {
4991             badstr = seqAppendWarn(&thys->Seq, seqReadLine,
4992                                    seqin->Input->Format);
4993 
4994             if(badstr)
4995                 ajWarn("Sequence '%S' has bad character(s) '%S'",
4996                        thys->Name, badstr);
4997 
4998             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
4999         }
5000 
5001         if(ok)
5002             ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
5003         else
5004             ajFilebuffClear(buff, 0);
5005     }
5006 
5007     ajDebug("seqReadNcbi Setdb '%S' Db '%S'\n", thys->Setdb, thys->Db);
5008     ajStrDel(&id);
5009     ajStrDel(&acc);
5010     ajStrDel(&sv);
5011     ajStrDel(&gi);
5012     ajStrDel(&db);
5013     ajStrDel(&desc);
5014 
5015     return ajTrue;
5016 }
5017 
5018 
5019 
5020 
5021 /* @funcstatic seqReadGifasta *************************************************
5022 **
5023 ** Given data in a sequence structure, tries to read everything needed
5024 ** using NCBI format. However, unlike NCBI format it uses the GI number
5025 ** as the sequence ID
5026 **
5027 ** @param [w] thys [AjPSeq] Sequence object
5028 ** @param [u] seqin [AjPSeqin] Sequence input object
5029 ** @return [AjBool] ajTrue on success
5030 **
5031 ** @release 4.1.0
5032 ** @@
5033 ******************************************************************************/
5034 
seqReadGifasta(AjPSeq thys,AjPSeqin seqin)5035 static AjBool seqReadGifasta(AjPSeq thys, AjPSeqin seqin)
5036 {
5037     AjPStr id        = NULL;
5038     AjPStr acc       = NULL;
5039     AjPStr sv        = NULL;
5040     AjPStr gi        = NULL;
5041     AjPStr db        = NULL;
5042     AjPStr desc      = NULL;
5043 
5044     AjPFilebuff buff;
5045 
5046     AjBool ok;
5047 
5048 
5049     buff = seqin->Input->Filebuff;
5050 
5051     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
5052     if(!ok)
5053         return ajFalse;
5054 
5055     ajStrAssignClear(&id);
5056     ajStrAssignClear(&acc);
5057     ajStrAssignClear(&sv);
5058     ajStrAssignClear(&gi);
5059     ajStrAssignClear(&desc);
5060 
5061 
5062     if(!ajSeqParseNcbi(seqReadLine,&id,&acc,&sv,&gi,&db,&desc) ||
5063        !ajStrGetLen(gi))
5064     {
5065         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5066         ajStrDel(&id);
5067         ajStrDel(&db);
5068         ajStrDel(&acc);
5069         ajStrDel(&sv);
5070         ajStrDel(&gi);
5071         ajStrDel(&desc);
5072 
5073         return ajFalse;
5074     }
5075 
5076     ajDebug("parsed id '%S' acc '%S' sv '%S' gi '%S' db '%S' (%S) desc '%S'\n",
5077             id, acc, sv, gi, db, thys->Setdb, desc);
5078 
5079     ajStrAssignS(&thys->Gi, gi);
5080 
5081     ajStrAssignS(&thys->Setdb, db);
5082     ajDebug("set setdb '%S' db '%S'\n", thys->Setdb, thys->Db);
5083 
5084     if(ajStrGetLen(sv))
5085         seqSvSave(thys, sv);
5086 
5087     if(ajStrGetLen(acc))
5088         seqAccSave(thys, acc);
5089 
5090     seqSetName(thys, gi);
5091     ajStrAssignS(&thys->Desc, desc);
5092 
5093 
5094     if(ajStrGetLen(seqin->Inseq))
5095     {                                  /* we have a sequence to use */
5096         ajStrAssignS(&thys->Seq, seqin->Inseq);
5097 
5098         if(seqin->Input->Text)
5099             seqTextSeq(&thys->TextPtr, seqin->Inseq);
5100 
5101         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
5102     }
5103     else
5104     {
5105         /* we know we will succeed from here ... no way to return ajFalse */
5106 
5107         ajFilebuffSetUnbuffered(buff);
5108 
5109         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
5110 
5111 
5112 
5113         while(ok && !ajStrPrefixC(seqReadLine, ">"))
5114         {
5115             seqAppend(&thys->Seq, seqReadLine);
5116 
5117             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
5118         }
5119 
5120         if(ok)
5121             ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
5122         else
5123             ajFilebuffClear(buff, 0);
5124     }
5125 
5126     ajStrDel(&id);
5127     ajStrDel(&db);
5128     ajStrDel(&acc);
5129     ajStrDel(&sv);
5130     ajStrDel(&gi);
5131     ajStrDel(&desc);
5132 
5133     return ajTrue;
5134 }
5135 
5136 
5137 
5138 
5139 /* @funcstatic seqReadSelex ***************************************************
5140 **
5141 ** Read a Selex file. Assumed a comment on the first line but this may
5142 ** not be true.
5143 **
5144 ** This format can read anything that looks like a block of "name sequence"
5145 ** data. The names are even allowed to change in later blocks.
5146 **
5147 ** The format was used by HMMER, but that package now prefers the better
5148 ** annotated "Stockholm" format used by Pfam and Rfam.
5149 **
5150 ** @param [w] thys [AjPSeq] Sequence object
5151 ** @param [u] seqin [AjPSeqin] Sequence input object
5152 ** @return [AjBool] ajTrue on success
5153 **
5154 ** @release 2.3.0
5155 ** @@
5156 ******************************************************************************/
5157 
seqReadSelex(AjPSeq thys,AjPSeqin seqin)5158 static AjBool seqReadSelex(AjPSeq thys, AjPSeqin seqin)
5159 {
5160     AjPFilebuff buff  = seqin->Input->Filebuff;
5161     AjPStr      line  = NULL;
5162     SeqPSelex    selex;
5163     ajuint       n      = 0;
5164     const char  *p     = NULL;
5165     AjBool      ok     = ajFalse;
5166     AjBool      isseq  = ajFalse;
5167     AjBool      named  = ajFalse;
5168     AjBool      head   = ajTrue;
5169     ajuint       sqcnt  = 0;
5170     ajuint       i;
5171     char        c      = '\0';
5172     AjBool      first  = ajTrue;
5173 
5174     line = ajStrNew();
5175 
5176 
5177     if(seqin->SeqData)
5178         selex = seqin->SeqData;
5179     else
5180     {
5181         ajFilebuffSetBuffered(buff);    /* must buffer to test sequences */
5182 
5183         /* First count the sequences, and get any header information */
5184         while(!isseq && (ok=ajBuffreadLine(buff,&line)))
5185         {
5186             if(first)
5187             {
5188                 first=ajFalse;
5189 
5190                 if(!ajStrPrefixC(line,"#"))
5191                 {
5192                     ajStrDel(&line);
5193                     ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5194 
5195                     return ajFalse;
5196                 }
5197             }
5198 
5199             ajStrRemoveWhiteExcess(&line);
5200             p = ajStrGetPtr(line);
5201 
5202             if(!*p || *p=='#')
5203                 continue;
5204             else
5205                 isseq = ajTrue;
5206         }
5207 
5208         if(!ok && !isseq)
5209             return ajFalse;
5210         ++n;
5211 
5212         ok = ajTrue;
5213 
5214         while(ok && ajBuffreadLine(buff,&line))
5215         {
5216             ajStrRemoveWhiteExcess(&line);
5217             p = ajStrGetPtr(line);
5218 
5219             if(*p=='#')
5220                 continue;
5221 
5222             if(!*p)
5223                 ok = ajFalse;
5224             else
5225                 ++n;
5226         }
5227 
5228         ajFilebuffClear(buff,-1);
5229         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5230         buff->Fpos = 0;
5231         ajFileSeek(buff->File, 0L, 0);
5232         selex = selexNew(n);
5233 
5234         /* now read it for real */
5235 
5236         while(head && ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr))
5237         {
5238             if(ajStrPrefixC(line,"#=RF") ||ajStrPrefixC(line,"#=CS"))
5239                 break;
5240 
5241             if(ajStrPrefixC(line,"#="))
5242             {
5243                 head=seqSelexHeader(&selex,line,&named,&sqcnt);
5244                 continue;
5245             }
5246 
5247             c = *ajStrGetPtr(line);
5248 
5249             if(c>='0')
5250                 head = ajFalse;
5251         }
5252 
5253         /* Should now be at start of first block, whether RF or sequence */
5254         ajDebug("First Block Line: %S",line);
5255 
5256         ok = ajTrue;
5257 
5258         while(ok && !ajStrPrefixC(line, "# ID"))
5259         {
5260             seqSelexReadBlock(&selex,&named,n,&line,seqin, &thys->TextPtr);
5261             ok = ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5262             ajDebug("... in loop ok: %B\n", ok);
5263         }
5264 
5265         ajDebug(" Block done. More data (ok): %B\n", ok);
5266 
5267         if(ok)
5268             ajTextinStoreClear(seqin->Input, 1, line, &thys->TextPtr);
5269         else
5270             ajFilebuffClear(buff, 0);
5271 
5272         seqin->SeqData = selex;
5273     }
5274 
5275 
5276     /* At this point the Selex structure is fully loaded */
5277     if(selex->Count >= selex->n)
5278     {
5279         selexDel(&selex);
5280         seqin->SeqData = NULL;
5281         ajStrDel(&line);
5282 
5283         return ajFalse;
5284     }
5285 
5286     i = selex->Count;
5287 
5288     seqSelexCopy(&thys,selex,i);
5289 
5290     ++selex->Count;
5291 
5292     ajFilebuffClear(buff,0);
5293 
5294     ajStrDel(&line);
5295 
5296     return ajTrue;
5297 }
5298 
5299 
5300 
5301 
5302 /* @funcstatic seqReadStockholm ***********************************************
5303 **
5304 ** Read a Stockholm file.
5305 **
5306 ** @param [w] thys [AjPSeq] Stockholm input file
5307 ** @param [u] seqin [AjPSeqin] seqin object
5308 ** @return [AjBool] ajTrue if success
5309 **
5310 ** @release 2.3.0
5311 ** @@
5312 ******************************************************************************/
5313 
seqReadStockholm(AjPSeq thys,AjPSeqin seqin)5314 static AjBool seqReadStockholm(AjPSeq thys, AjPSeqin seqin)
5315 {
5316     AjPFilebuff buff  = seqin->Input->Filebuff;
5317     AjPStr      line  = NULL;
5318     AjPStr      word  = NULL;
5319     AjPStr      post  = NULL;
5320     AjPStr      namstr = NULL;
5321     AjPStr      seqstr = NULL;
5322     AjBool      ok    = ajFalse;
5323     AjBool      bmf   = ajTrue;
5324     AjBool      dcf   = ajTrue;
5325     AjBool      drf   = ajTrue;
5326     AjBool      ccf   = ajTrue;
5327     AjBool      gsf   = ajTrue;
5328     AjBool      reff  = ajTrue;
5329 
5330     SeqPStockholm stock = NULL;
5331 
5332     ajuint i     = 0;
5333     ajuint n     = 0;
5334     ajuint  scnt = INT_MAX;
5335 
5336     line = ajStrNew();
5337 
5338     ajDebug("seqReadStockholm EOF:%B Data:%x\n",
5339             ajFilebuffIsEof(buff), seqin->SeqData);
5340     if(seqin->SeqData)
5341         stock = seqin->SeqData;
5342     else
5343     {
5344         ajFilebuffSetBuffered(buff); /* must buffer to test sequences */
5345         ok=ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5346         ajStrTrimWhiteEnd(&line);
5347 
5348         if(!ok || !ajStrPrefixC(line,"# STOCKHOLM 1."))
5349         {
5350             if (ok)
5351                 ajDebug("Stockholm: bad first line: %S", line);
5352             else
5353                 ajDebug("Stockholm: no first line\n");
5354 
5355             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5356             ajStrDel(&line);
5357 
5358             return ajFalse;
5359         }
5360 
5361         ajDebug("Stockholm: good first line: %S", line);
5362 
5363         while(ok && (ajStrPrefixC(line, "#") || !ajStrGetLen(line)))
5364         {
5365             if(ajStrPrefixC(line,"#=GF SQ"))
5366             {
5367                 ajFmtScanS(line,"%*s%*s%d",&n);
5368                 ajDebug("Stockholm: parsed SQ line of %d sequences\n", n);
5369             }
5370 
5371             ok=ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5372             ajStrTrimWhiteEnd(&line);
5373             ajDebug("Stockholm: SQ search: %S\n", line);
5374         }
5375 
5376         if (!n)                         /* no SQ line, count first block */
5377         {
5378             while(ok && ajStrGetLen(line))
5379             {
5380                 if(!ajStrPrefixC(line, "#") &&
5381                    !ajStrMatchC(line, "//"))
5382                     n++;
5383 
5384                 ok=ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5385                 ajStrTrimWhiteEnd(&line);
5386                 ajDebug("Stockholm: block %d read: %S\n", n, line);
5387             }
5388 
5389             ajDebug("Stockholm: read block of %d sequences\n", n);
5390         }
5391 
5392         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
5393 
5394         ok=ajTextinStoreReadline(seqin->Input, &line, &thys->TextPtr);
5395         ajStrTrimWhiteEnd(&line);
5396         stock = stockholmNew(n);
5397 
5398         ajDebug("Created stockholm data object size: %d\n", n);
5399 
5400         word  = ajStrNew();
5401         post  = ajStrNew();
5402         ajStrAssignClear(&seqToken);
5403 
5404         if(!seqRegStockholmSeq)
5405             seqRegStockholmSeq = ajRegCompC("^([^ \t\n]+)[ \t]+"
5406                                             "([^ \t\n]+)[ \t]+");
5407         while(ok && !ajStrPrefixC(line,"//"))
5408         {
5409             if(ajRegExec(seqRegStockholmSeq,line))
5410             {
5411                 ajRegSubI(seqRegStockholmSeq,1,&word);
5412                 ajRegSubI(seqRegStockholmSeq,2,&seqToken);
5413                 ajRegPost(seqRegStockholmSeq,&post);
5414                 ajStrRemoveLastNewline(&post);
5415 
5416                 ajDebug("Stockholm: regex word '%S' token '%S' post '%S'\n",
5417                         word, seqToken, post);
5418                 if(!ajStrCmpC(word,"#=GF"))
5419                 {
5420                     if(!ajStrCmpC(seqToken,"ID"))
5421                         ajStrAssignS(&stock->id,post);
5422                     else if(!ajStrCmpC(seqToken,"AC"))
5423                         ajStrAssignS(&stock->ac,post);
5424                     else if(!ajStrCmpC(seqToken,"DE"))
5425                         ajStrAssignS(&stock->de,post);
5426                     else if(!ajStrCmpC(seqToken,"AU"))
5427                         ajStrAssignS(&stock->au,post);
5428                     else if(!ajStrCmpC(seqToken,"AL"))
5429                         ajStrAssignS(&stock->al,post);
5430                     else if(!ajStrCmpC(seqToken,"SE"))
5431                         ajStrAssignS(&stock->se,post);
5432                     else if(!ajStrCmpC(seqToken,"TP"))
5433                         ajStrAssignS(&stock->se,post);
5434                     else if(!ajStrCmpC(seqToken,"GA"))
5435                         ajFmtScanS(post,"%d%d",&stock->ga[0],
5436                                    &stock->ga[1]);
5437                     else if(!ajStrCmpC(seqToken,"TC"))
5438                         ajFmtScanS(post,"%f%f",&stock->tc[0],
5439                                    &stock->tc[1]);
5440                     else if(!ajStrCmpC(seqToken,"NC"))
5441                         ajFmtScanS(post,"%f%f",&stock->nc[0],
5442                                    &stock->nc[1]);
5443                     else if(!ajStrCmpC(seqToken,"BM"))
5444                     {
5445                         if(bmf)
5446                         {
5447                             bmf = ajFalse;
5448                             ajStrAssignS(&stock->bm,line);
5449                         }
5450                         else
5451                             ajStrAppendS(&stock->bm,line);
5452                     }
5453                     else if(!ajStrCmpC(seqToken,"DC"))
5454                     {
5455                         if(dcf)
5456                         {
5457                             dcf = ajFalse;
5458                             ajStrAssignS(&stock->dc,line);
5459                         }
5460                         else
5461                             ajStrAppendS(&stock->dc,line);
5462                     }
5463                     else if(!ajStrCmpC(seqToken,"DR"))
5464                     {
5465                         if(drf)
5466                         {
5467                             drf = ajFalse;
5468                             ajStrAssignS(&stock->dr,line);
5469                         }
5470                         else
5471                             ajStrAppendS(&stock->dr,line);
5472                     }
5473                     else if(!ajStrCmpC(seqToken,"CC"))
5474                     {
5475                         if(ccf)
5476                         {
5477                             ccf = ajFalse;
5478                             ajStrAssignS(&stock->cc,line);
5479                         }
5480                         else
5481                             ajStrAppendS(&stock->cc,line);
5482                     }
5483                     else if(*ajStrGetPtr(seqToken)=='R')
5484                     {
5485                         if(reff)
5486                         {
5487                             reff = ajFalse;
5488                             ajStrAssignS(&stock->ref,line);
5489                         }
5490                         else
5491                             ajStrAppendS(&stock->ref,line);
5492                     }
5493                 }
5494 
5495                 else if(!ajStrCmpC(word,"#=GS"))
5496                 {
5497                     if(gsf)
5498                     {
5499                         gsf = ajFalse;
5500                         ajStrAssignS(&stock->gs,line);
5501                     }
5502                     else
5503                         ajStrAppendS(&stock->gs,line);
5504                 }
5505 
5506                 else if(!ajStrCmpC(word,"#=GC"))
5507                 {
5508                     if(!ajStrCmpC(seqToken,"SS_cons"))
5509                         ajStrAssignS(&stock->sscons,post);
5510                     else if(!ajStrCmpC(seqToken,"SA_cons"))
5511                         ajStrAssignS(&stock->sacons,post);
5512                     else if(!ajStrCmpC(seqToken,"seq_cons"))
5513                         ajStrAssignS(&stock->sqcons,post);
5514                 }
5515             }
5516             else if (ajStrGetLen(line))
5517             {
5518                 if(ajStrParseCount(line) > 1)
5519                 {
5520                     ++scnt;
5521 
5522                     if(scnt >= n)
5523                         scnt = 0;
5524 
5525                     ajFmtScanS(line,"%S%S", &namstr,&seqstr);
5526                     ajDebug("Stockholm: scnt: %d namstr '%S' seqstr '%S'\n",
5527                             scnt,namstr,seqstr);
5528 
5529                     if(!ajStrGetLen(stock->name[scnt]))
5530                         ajStrAppendS(&stock->name[scnt], namstr);
5531                     else
5532                     {
5533                         if(!ajStrMatchS(namstr, stock->name[scnt]))
5534                             ajWarn("Bad stockholm format found id %d '%S' "
5535                                    "expect '%S'",
5536                                    scnt, namstr, stock->name[scnt]);
5537                     }
5538 
5539                     ajStrRemoveLastNewline(&seqstr);
5540                     ajStrAppendS(&stock->str[scnt], seqstr);
5541                 }
5542                 else
5543                 {
5544                     ajStrRemoveLastNewline(&line);
5545                     ajStrAppendS(&stock->str[scnt], line);
5546                 }
5547 
5548             }
5549 
5550             ok = ajTextinStoreReadline(seqin->Input,&line, &thys->TextPtr);
5551             ajStrTrimWhiteEnd(&line);
5552         }
5553 
5554         while(ok && !ajStrPrefixC(line, "# STOCKHOLM 1."))
5555             ok = ajTextinStoreReadline(seqin->Input,&line, &thys->TextPtr);
5556 
5557         if(ok)
5558             ajTextinStoreClear(seqin->Input, 1, line, &thys->TextPtr);
5559         else
5560             ajFilebuffClear(buff, 0);
5561 
5562         ajStrDel(&word);
5563         ajStrDel(&post);
5564         ajStrDel(&namstr);
5565         ajStrDel(&seqstr);
5566         ajStrDelStatic(&seqToken);
5567 
5568         seqin->SeqData = stock;
5569 
5570         ajFilebuffClear(buff,0);
5571     }
5572 
5573 
5574     /* At this point the Stockholm structure is fully loaded */
5575     if(stock->Count >= stock->n)
5576     {
5577         ajDebug("Stockholm count %d: All done\n", stock->Count);
5578         stockholmDel(&stock);
5579         seqin->SeqData = NULL;
5580         ajStrDel(&line);
5581 
5582         return ajFalse;
5583     }
5584 
5585     i = stock->Count;
5586 
5587     seqStockholmCopy(&thys,stock,i);
5588 
5589     ++stock->Count;
5590 
5591     ajDebug("Stockholm returning %d/%d '%S' len: %d\n",
5592             stock->Count, stock->n, ajSeqGetNameS(thys),ajSeqGetLen(thys));
5593 
5594     ajStrDel(&line);
5595 
5596     return ajTrue;
5597 }
5598 
5599 
5600 
5601 
5602 /* @funcstatic seqSelexCopy ***************************************************
5603 **
5604 ** Copy Selex data to sequence object.
5605 ** Pad with gaps to make lengths equal.
5606 **
5607 ** @param [w] thys [AjPSeq*] sequence object
5608 ** @param [u] selex [SeqPSelex] seqin containing selex info
5609 ** @param [r] n [ajuint] index into selex object
5610 ** @return [void]
5611 **
5612 ** @release 2.0.1
5613 ** @@
5614 ******************************************************************************/
5615 
seqSelexCopy(AjPSeq * thys,SeqPSelex selex,ajuint n)5616 static void seqSelexCopy(AjPSeq *thys, SeqPSelex selex, ajuint n)
5617 {
5618     AjPSeq pthis   = *thys;
5619 
5620     /*SeqPSelexdata sdata;*/
5621 
5622     ajStrAssignS(&pthis->Seq, selex->str[n]);
5623     ajStrAssignS(&pthis->Name, selex->name[n]);
5624     pthis->Weight = selex->sq[n]->wt;
5625 
5626 /*
5627   if(!(*thys)->Selexdata)
5628   (*thys)->Selexdata = selexdataNew();
5629 
5630   sdata = (*thys)->Selexdata;
5631 
5632   ajStrAssignS(&sdata->id,selex->id);
5633   ajStrAssignS(&sdata->ac,selex->ac);
5634   ajStrAssignS(&sdata->de,selex->de);
5635   ajStrAssignS(&sdata->au,selex->au);
5636   ajStrAssignS(&sdata->cs,selex->cs);
5637   ajStrAssignS(&sdata->rf,selex->rf);
5638   ajStrAssignS(&sdata->name,selex->name[n]);
5639   ajStrAssignS(&sdata->str,selex->str[n]);
5640   ajStrAssignS(&sdata->ss,selex->ss[n]);
5641 
5642   sdata->ga[0] = selex->ga[0];
5643   sdata->ga[1] = selex->ga[1];
5644   sdata->tc[0] = selex->tc[0];
5645   sdata->tc[1] = selex->tc[1];
5646   sdata->nc[0] = selex->nc[0];
5647   sdata->nc[1] = selex->nc[1];
5648 
5649   ajStrAssignS(&sdata->sq->name,selex->sq[n]->name);
5650 
5651   ajStrAssignS(&sdata->sq->ac,selex->sq[n]->ac);
5652   ajStrAssignS(&sdata->sq->source,selex->sq[n]->source);
5653   ajStrAssignS(&sdata->sq->de,selex->sq[n]->de);
5654 
5655   sdata->sq->wt    = selex->sq[n]->wt;
5656   sdata->sq->start = selex->sq[n]->start;
5657   sdata->sq->stop  = selex->sq[n]->stop;
5658   sdata->sq->len   = selex->sq[n]->len;
5659 */
5660     return;
5661 }
5662 
5663 
5664 
5665 
5666 /* @funcstatic seqStockholmCopy ***********************************************
5667 **
5668 ** Copy Stockholm data to sequence object.
5669 ** Pad with gaps to make lengths equal.
5670 **
5671 ** @param [w] thys [AjPSeq*] sequence object
5672 ** @param [u] stock [SeqPStockholm] seqin containing selex info
5673 ** @param [r] n [ajint] index into stockholm object
5674 ** @return [void]
5675 **
5676 ** @release 2.3.0
5677 ** @@
5678 ******************************************************************************/
5679 
seqStockholmCopy(AjPSeq * thys,SeqPStockholm stock,ajint n)5680 static void seqStockholmCopy(AjPSeq *thys, SeqPStockholm stock, ajint n)
5681 {
5682     AjPSeq pthis;
5683     /*SeqPStockholmdata sdata;*/
5684 
5685     pthis = *thys;
5686 
5687     ajStrAssignS(&pthis->Seq, stock->str[n]);
5688     ajStrAssignS(&pthis->Name, stock->name[n]);
5689 
5690 /*
5691   if(!(*thys)->Stock)
5692   (*thys)->Stock = stockholmdataNew();
5693 
5694   sdata = (*thys)->Stock;
5695 
5696   ajStrAssignS(&sdata->id,stock->id);
5697   ajStrAssignS(&sdata->ac,stock->ac);
5698   ajStrAssignS(&sdata->de,stock->de);
5699   ajStrAssignS(&sdata->au,stock->au);
5700   ajStrAssignS(&sdata->al,stock->al);
5701   ajStrAssignS(&sdata->tp,stock->tp);
5702   ajStrAssignS(&sdata->se,stock->se);
5703   ajStrAssignS(&sdata->gs,stock->gs);
5704   ajStrAssignS(&sdata->dc,stock->dc);
5705   ajStrAssignS(&sdata->dr,stock->dr);
5706   ajStrAssignS(&sdata->cc,stock->cc);
5707   ajStrAssignS(&sdata->ref,stock->ref);
5708   ajStrAssignS(&sdata->sacons,stock->sacons);
5709   ajStrAssignS(&sdata->sqcons,stock->sqcons);
5710   ajStrAssignS(&sdata->sscons,stock->sscons);
5711   sdata->ga[0] = stock->ga[0];
5712   sdata->ga[1] = stock->ga[1];
5713   sdata->tc[0] = stock->tc[0];
5714   sdata->tc[1] = stock->tc[1];
5715   sdata->nc[0] = stock->nc[0];
5716   sdata->nc[1] = stock->nc[1];
5717 */
5718     return;
5719 }
5720 
5721 
5722 
5723 
5724 /* @funcstatic seqSelexAppend *************************************************
5725 **
5726 ** Append sequence and related Selex info to selex object.
5727 ** Pad with gaps to make lengths equal.
5728 **
5729 ** @param [r] src [const AjPStr] source line from Selex file
5730 ** @param [w] dest [AjPStr*] Destination in Selex object
5731 ** @param [r] beg  [ajuint] start of info in src
5732 ** @param [r] end  [ajuint] end of info in src
5733 ** @return [void]
5734 **
5735 ** @release 2.0.1
5736 ** @@
5737 ******************************************************************************/
5738 
seqSelexAppend(const AjPStr src,AjPStr * dest,ajuint beg,ajuint end)5739 static void seqSelexAppend(const AjPStr src, AjPStr *dest,
5740                            ajuint beg, ajuint end)
5741 {
5742     const char *p = NULL;
5743     char c;
5744     ajuint len;
5745     ajuint i;
5746     ajuint pad = 0;
5747 
5748     len = end-beg+1;
5749     p   = ajStrGetPtr(src);
5750 
5751     ajDebug("seqSelexAppend srclen: %u beg: %u end: %u src '%S'\n",
5752             ajStrGetLen(src), beg, end, src);
5753 
5754     if(beg>=ajStrGetLen(src))
5755     {
5756         for(i=0;i<len;++i)
5757             ajStrAppendK(dest,'-');
5758 
5759         return;
5760     }
5761 
5762     p += beg;
5763     pad = end - ajStrGetLen(src) + 2;
5764 
5765     while((c=*p) && *p!='\n')
5766     {
5767         if(c=='.' || c=='_' || c==' ')
5768             c='-';
5769 
5770         ajStrAppendK(dest,c);
5771         ++p;
5772     }
5773 
5774     for(i=0;i<pad;++i)
5775         ajStrAppendK(dest,'-');
5776 
5777     return;
5778 }
5779 
5780 
5781 
5782 
5783 /* @funcstatic seqSelexHeader *************************************************
5784 **
5785 ** Load a Selex object with header information for a single line
5786 **
5787 ** @param [w] thys [SeqPSelex*] Selex object
5788 ** @param [r] line [const AjPStr] Selex header line
5789 ** @param [w] named  [AjBool*] Whether names of sequences have been read
5790 ** @param [w] sqcnt  [ajuint*] Number of SQ names read
5791 ** @return [AjBool] ajTrue if the line contained header information
5792 **
5793 ** @release 2.0.1
5794 ** @@
5795 ******************************************************************************/
5796 
seqSelexHeader(SeqPSelex * thys,const AjPStr line,AjBool * named,ajuint * sqcnt)5797 static AjBool seqSelexHeader(SeqPSelex *thys, const AjPStr line,
5798                              AjBool *named, ajuint *sqcnt)
5799 {
5800     SeqPSelex pthis;
5801 
5802     pthis = *thys;
5803 
5804 
5805     if(ajStrPrefixC(line,"#=ID"))
5806     {
5807         ajFmtScanS(line,"#=ID %S",&pthis->id);
5808 
5809         return ajTrue;
5810     }
5811     else if(ajStrPrefixC(line,"#=AC"))
5812     {
5813         ajFmtScanS(line,"#=AC %S",&pthis->ac);
5814 
5815         return ajTrue;
5816     }
5817     else if(ajStrPrefixC(line,"#=DE"))
5818     {
5819         ajStrAssignC(&pthis->de,ajStrGetPtr(line)+5);
5820         ajStrRemoveWhiteExcess(&pthis->de);
5821 
5822         return ajTrue;
5823     }
5824     else if(ajStrPrefixC(line,"#=AU"))
5825     {
5826         ajStrAssignC(&pthis->au,ajStrGetPtr(line)+5);
5827         ajStrRemoveWhiteExcess(&pthis->au);
5828 
5829         return ajTrue;
5830     }
5831     else if(ajStrPrefixC(line,"#=GA"))
5832     {
5833         ajFmtScanS(line,"%*s %f %f",&pthis->ga[0],&pthis->ga[1]);
5834 
5835         return ajTrue;
5836     }
5837     else if(ajStrPrefixC(line,"#=TC"))
5838     {
5839         ajFmtScanS(line,"%*s %f %f",&pthis->tc[0],&pthis->tc[1]);
5840 
5841         return ajTrue;
5842     }
5843     else if(ajStrPrefixC(line,"#=NC"))
5844     {
5845         ajFmtScanS(line,"%*s %f %f",&pthis->nc[0],&pthis->nc[1]);
5846 
5847         return ajTrue;
5848     }
5849     else if(ajStrPrefixC(line,"#=SQ"))
5850     {
5851         ajStrTokenAssignC(&seqHandle,line," \t\n");
5852         ajStrTokenStep(seqHandle);
5853 
5854         ajStrTokenNextParse(seqHandle,&pthis->sq[*sqcnt]->name);
5855         ajStrAssignS(&pthis->name[*sqcnt],pthis->sq[*sqcnt]->name);
5856 
5857         ajStrTokenNextParse(seqHandle, &seqToken);
5858         ajStrToFloat(seqToken,&pthis->sq[*sqcnt]->wt);
5859 
5860         ajStrTokenNextParse(seqHandle,&pthis->sq[*sqcnt]->source);
5861 
5862         ajStrTokenNextParse(seqHandle, &pthis->sq[*sqcnt]->ac);
5863 
5864         ajStrTokenNextParse(seqHandle, &seqToken);
5865         ajFmtScanS(seqToken,"%d..%d:%d",&pthis->sq[*sqcnt]->start,
5866                    &pthis->sq[*sqcnt]->stop,&pthis->sq[*sqcnt]->len);
5867 
5868         ajStrTokenNextParseC(seqHandle,"\n",&pthis->sq[*sqcnt]->de);
5869 
5870         ajStrDelStatic(&seqToken);
5871         *named = ajTrue;
5872         ++(*sqcnt);
5873 
5874         return ajTrue;
5875     }
5876 
5877 
5878     return ajFalse;
5879 }
5880 
5881 
5882 
5883 
5884 /* @funcstatic seqSelexPos ****************************************************
5885 **
5886 ** Find start and end positions of sequence & related Selex information
5887 **
5888 ** @param [r] line [const AjPStr] Selex sequence or related line
5889 ** @param [w] begin  [ajuint*] start pos
5890 ** @param [w] end  [ajuint*] end pos
5891 ** @return [void]
5892 **
5893 ** @release 2.0.1
5894 ** @@
5895 ******************************************************************************/
5896 
seqSelexPos(const AjPStr line,ajuint * begin,ajuint * end)5897 static void seqSelexPos(const AjPStr line, ajuint *begin, ajuint *end)
5898 {
5899     ajuint pos = 0;
5900     ajuint len = 0;
5901 
5902     const char  *p;
5903 
5904     /*
5905     **  Selex sequence info can start any number of spaces
5906     **  after the names so we need to find out where to
5907     **  start counting chars from and where to end
5908     */
5909 
5910     len  = ajStrGetLen(line) - 1;
5911 
5912     if(!len)
5913     {
5914         *begin=0;
5915         *end=0;
5916 
5917         return;
5918     }
5919 
5920     pos  = len -1;
5921     *end = (pos > *end) ? pos : *end;
5922     p = ajStrGetPtr(line);
5923 
5924     while(*p && *p!=' ')
5925         ++p;
5926 
5927     while(*p && *p==' ')
5928         ++p;
5929 
5930     if(p)
5931         pos = p - ajStrGetPtr(line);
5932 
5933     *begin = (pos < *begin) ? pos : *begin;
5934 
5935     ajDebug("seqSelexPos len:%u pos:%u begin:%u end:%u\n",
5936             len, pos, *begin, *end);
5937 
5938     return;
5939 }
5940 
5941 
5942 
5943 
5944 /* @funcstatic seqSelexReadBlock **********************************************
5945 **
5946 ** Read a block of sequence information from a selex file
5947 **
5948 ** @param [w] thys [SeqPSelex*] Selex object
5949 ** @param [w] named  [AjBool*] Whether names of sequences have been read
5950 ** @param [r] n  [ajuint] Number of sequences in Selex file
5951 ** @param [u] line [AjPStr*] Line from Selex file
5952 ** @param [u] seqin  [AjPSeqin] Sequence input objext
5953 ** @param [w] astr [AjPStr*] string to append to
5954 ** @return [AjBool] ajTrue if data was read.
5955 **
5956 ** @release 2.0.1
5957 ** @@
5958 ******************************************************************************/
5959 
seqSelexReadBlock(SeqPSelex * thys,AjBool * named,ajuint n,AjPStr * line,AjPSeqin seqin,AjPStr * astr)5960 static AjBool seqSelexReadBlock(SeqPSelex *thys, AjBool *named, ajuint n,
5961                                 AjPStr *line, AjPSeqin seqin, AjPStr *astr)
5962 {
5963     SeqPSelex pthis;
5964     AjPStr *seqs = NULL;
5965     AjPStr *ss   = NULL;
5966 
5967     AjPStr rf = NULL;
5968     AjPStr cs = NULL;
5969     ajuint  i;
5970     ajuint  begin=0;
5971     ajuint  end=0;
5972     AjBool ok;
5973     ajuint  cnt;
5974     AjPStr tmp    = NULL;
5975     AjBool haverf = ajFalse;
5976     AjBool havecs = ajFalse;
5977     AjBool havess = ajFalse;
5978 
5979     pthis = *thys;
5980 
5981     begin = INT_MAX;
5982     end   = 0;
5983 
5984     tmp = ajStrNew();
5985     rf = ajStrNew();
5986     cs = ajStrNew();
5987     AJCNEW(seqs,n);
5988     AJCNEW(ss,n);
5989 
5990     for(i=0;i<n;++i)
5991     {
5992         seqs[i] = ajStrNew();
5993         ss[i]  = ajStrNew();
5994     }
5995 
5996     ok = ajTrue;
5997     cnt = 0;
5998 
5999 
6000     while(ajStrPrefixC(*line,"\n"))
6001         ok = ajTextinStoreReadline(seqin->Input, line, astr);
6002 
6003     while(ok)
6004     {
6005         seqSelexPos(*line,&begin,&end);
6006 
6007         if(ajStrPrefixC(*line,"#=RF"))
6008         {
6009             haverf=ajTrue;
6010             ajStrAssignS(&rf,*line);
6011         }
6012 
6013         if(ajStrPrefixC(*line,"#=CS"))
6014         {
6015             havecs=ajTrue;
6016             ajStrAssignS(&cs,*line);
6017         }
6018 
6019         if(ajStrPrefixC(*line,"#=SS"))
6020         {
6021             havess=ajTrue;
6022             ajStrAssignS(&ss[--cnt],*line);
6023             ++cnt;
6024         }
6025 
6026         if(!ajStrPrefixC(*line,"#"))
6027         {
6028             if(!*named)
6029             {
6030                 ajFmtScanS(*line,"%S",&pthis->name[cnt]);
6031                 ajStrAssignS(&pthis->sq[cnt]->name,pthis->name[cnt]);
6032             }
6033             else
6034             {
6035                 ajFmtScanS(*line,"%S",&tmp);
6036 
6037                 if(!ajStrPrefixS(pthis->name[cnt],tmp))
6038                     ajWarn("Selex format sequence names do not match "
6039                            "['%S' '%S']",
6040                            pthis->name[cnt],tmp);
6041             }
6042 
6043             ajStrAssignS(&seqs[cnt],*line);
6044             ++cnt;
6045         }
6046 
6047         ok = ajTextinStoreReadline(seqin->Input,line, astr);
6048 
6049         if(ajStrPrefixC(*line,"\n"))
6050             ok = ajFalse;
6051     }
6052 
6053     ajDebug("selexReadBlock block done line '%S' n: %u rf:%B cs:%B ss:%B\n",
6054             *line, n, haverf, havecs, havess);
6055 
6056     if(cnt != n)
6057         ajWarn("Selex format expected %u sequences in block, found %u",
6058                n, cnt);
6059     if(cnt > n)
6060         cnt = n;
6061 
6062     if(haverf)
6063         seqSelexAppend(rf,&pthis->rf,begin,end);
6064 
6065     if(havecs)
6066         seqSelexAppend(cs,&pthis->cs,begin,end);
6067 
6068     for(i=0;i<cnt;++i)
6069     {
6070         seqSelexAppend(seqs[i],&pthis->str[i],begin,end);
6071         if(havess)
6072             seqSelexAppend(ss[i],&pthis->ss[i],begin,end);
6073     }
6074 
6075 
6076     for(i=0;i<n;++i)
6077     {
6078         ajStrDel(&seqs[i]);
6079         ajStrDel(&ss[i]);
6080     }
6081 
6082     AJFREE(seqs);
6083     AJFREE(ss);
6084 
6085     ajStrDel(&rf);
6086     ajStrDel(&cs);
6087     ajStrDel(&tmp);
6088 
6089     *named = ajTrue;
6090 
6091     return ajTrue;
6092 }
6093 
6094 
6095 
6096 
6097 /* @funcstatic seqReadStaden **************************************************
6098 **
6099 ** Given data in a sequence structure, tries to read everything needed
6100 ** using the old Staden package file format.
6101 **
6102 ** @param [w] thys [AjPSeq] Sequence object
6103 ** @param [u] seqin [AjPSeqin] Sequence input object
6104 ** @return [AjBool] ajTrue on success
6105 **
6106 ** @release 1.0.0
6107 ** @@
6108 ******************************************************************************/
6109 
seqReadStaden(AjPSeq thys,AjPSeqin seqin)6110 static AjBool seqReadStaden(AjPSeq thys, AjPSeqin seqin)
6111 {
6112     AjPStr tmpname  = NULL;
6113     AjPFilebuff buff;
6114     AjBool incomment = ajFalse;
6115 
6116     buff = seqin->Input->Filebuff;
6117 
6118     if(!seqRegStadenId)
6119         seqRegStadenId = ajRegCompC("^[<]([^>-]+)[-]*[>]");
6120 
6121     if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
6122         return ajFalse;
6123 
6124     if(ajRegExec(seqRegStadenId, seqReadLine))
6125     {
6126         ajRegSubI(seqRegStadenId, 1, &seqToken);
6127         seqSetName(thys, seqToken);
6128         ajDebug("seqReadStaden name '%S' token '%S'\n",
6129                 thys->Name, seqToken);
6130         ajRegPost(seqRegStadenId, &seqToken);
6131         seqAppendCommented(&thys->Seq, &incomment, seqToken);
6132         ajStrDelStatic(&seqToken);
6133     }
6134     else
6135     {
6136         tmpname = ajStrNewS(seqin->Input->Filename);
6137         ajFilenameTrimAll(&tmpname);
6138         seqSetName(thys, tmpname);
6139         seqAppendCommented(&thys->Seq, &incomment, seqReadLine);
6140         ajStrDel(&tmpname);
6141     }
6142 
6143     while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
6144     {
6145         seqAppendCommented(&thys->Seq, &incomment, seqReadLine);
6146     }
6147 
6148     if(!ajTextinGetRecords(seqin->Input)) /* but we have read at least 1 line */
6149         return ajFalse;
6150 
6151     ajFilebuffClear(buff, 0);
6152 
6153     return ajTrue;
6154 }
6155 
6156 
6157 
6158 
6159 /* @funcstatic seqReadText ****************************************************
6160 **
6161 ** Given data in a sequence structure, tries to read everything needed
6162 ** using plain text format.
6163 **
6164 ** @param [w] thys [AjPSeq] Sequence object
6165 ** @param [u] seqin [AjPSeqin] Sequence input object
6166 ** @return [AjBool] ajTrue on success
6167 **
6168 ** @release 1.0.0
6169 ** @@
6170 ******************************************************************************/
6171 
seqReadText(AjPSeq thys,AjPSeqin seqin)6172 static AjBool seqReadText(AjPSeq thys, AjPSeqin seqin)
6173 {
6174     AjPFilebuff buff;
6175 
6176     ajDebug("seqReadText\n");
6177 
6178     buff = seqin->Input->Filebuff;
6179 
6180     while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
6181     {
6182         ajDebug("read '%S'\n", seqReadLine);
6183         seqAppend(&thys->Seq, seqReadLine);
6184     }
6185 
6186     ajDebug("read %d lines\n", ajTextinGetRecords(seqin->Input));
6187     ajFilebuffClear(buff, 0);
6188 
6189     if(!ajTextinGetRecords(seqin->Input))
6190         return ajFalse;
6191 
6192     seqSetNameFile(thys, seqin);
6193 
6194     return ajTrue;
6195 }
6196 
6197 
6198 
6199 
6200 /* @funcstatic seqReadRaw *****************************************************
6201 **
6202 ** Given data in a sequence structure, tries to read everything needed
6203 ** using raw format, which accepts only alphanumeric and whitespace
6204 ** characters or '-' for gap or '*' for a protein stop
6205 ** and rejects anything else.
6206 **
6207 ** @param [w] thys [AjPSeq] Sequence object
6208 ** @param [u] seqin [AjPSeqin] Sequence input object
6209 ** @return [AjBool] ajTrue on success
6210 **
6211 ** @release 1.0.0
6212 ** @@
6213 ******************************************************************************/
6214 
seqReadRaw(AjPSeq thys,AjPSeqin seqin)6215 static AjBool seqReadRaw(AjPSeq thys, AjPSeqin seqin)
6216 {
6217     AjPFilebuff buff;
6218     const char* cp;
6219     AjPFile fp;
6220     AjBool ok = ajFalse;
6221     ajulong filestat = 0L;
6222     ajulong filesize;
6223     ajulong i;
6224     ajuint inc = 2048;
6225     AjPStr buf = NULL;
6226     char *cbuf;
6227     AjPStr tmpseq = NULL;
6228     size_t iread;
6229 
6230     ajDebug("seqReadRaw\n");
6231 
6232     buff = seqin->Input->Filebuff;
6233     fp = ajFilebuffGetFile(buff);
6234 
6235     if(!seqRegRawNonseq)
6236         seqRegRawNonseq = ajRegCompC("[^A-Za-z0-9 \t\n\r*-]");
6237 
6238     if(ajFilebuffIsEnded(buff))
6239     {
6240         ajDebug("seqReadRaw filebuff already ended\n");
6241         return ajFalse;
6242     }
6243 
6244     filestat = ajFileSeek(fp, 0L, SEEK_CUR);
6245 
6246     if(filestat)
6247     {
6248         ajDebug("filestat %Lu\n", filestat);
6249 
6250         /* not a file - cannot use binary, so we can only read the buffer */
6251         while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
6252         {
6253             ajDebug("read '%S'\n", seqReadLine);
6254 
6255             cp = ajStrGetPtr(seqReadLine);
6256             if(strlen(cp) != ajStrGetLen(seqReadLine))
6257             {
6258                 ajDebug("seqReadRaw: Null character found in line: %S\n",
6259                         seqReadLine);
6260                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6261                 ajStrAssignClear(&thys->Seq);
6262 
6263                 return ajFalse;
6264             }
6265 
6266             if(ajRegExec(seqRegRawNonseq, seqReadLine))
6267             {
6268                 ajDebug("seqReadRaw: Bad character found in line: %S\n",
6269                         seqReadLine);
6270                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6271                 ajStrAssignClear(&thys->Seq);
6272 
6273                 return ajFalse;
6274             }
6275             seqAppend(&thys->Seq, seqReadLine);
6276             ajDebug("read %d lines\n", ajTextinGetRecords(seqin->Input));
6277         }
6278     }
6279     else
6280     {
6281         if(ajFilebuffIsEnded(buff))
6282         {
6283             ajDebug("seqReadRaw filebuff ended\n");
6284             ajFileSeek(fp, 0L, SEEK_END);
6285             return ajFalse;
6286         }
6287 
6288         buf = ajStrNewRes(4096);
6289         ajStrSetValidLen(&buf, inc);
6290         cbuf = ajStrGetuniquePtr(&buf);
6291 
6292         filestat = ajFileSeek(fp, 0L, SEEK_END);
6293         filesize = ajFileResetPos(fp);
6294         filestat = ajFileSeek(fp, 0L, SEEK_SET);
6295 
6296         if(!filesize)
6297         {
6298             ajDebug("seqReadRaw filesize zero\n");
6299             ajFileSeek(fp,(ajlong) filesize, SEEK_SET);
6300             return ajFalse;
6301         }
6302 
6303         ok = ajTrue;
6304 
6305         for(i=0; i < filesize; i += inc)
6306         {
6307             if((i+inc) > filesize)
6308             {
6309                 inc = (ajuint) (filesize - i);
6310                 ajStrSetValidLen(&buf, inc);
6311             }
6312 
6313             iread = ajReadbinBinary(fp, inc, 1, cbuf);
6314             cbuf[inc] = '\0';
6315 
6316             if(strlen(cbuf) != iread)
6317             {
6318                 ajDebug("seqReadRaw: Null character found in line: %s\n",
6319                         cbuf);
6320                 ok = ajFalse;
6321                 break;
6322             }
6323 
6324             if(ajRegExec(seqRegRawNonseq, buf))
6325             {
6326                 ajDebug("seqReadRaw: Bad character found in line: %S\n",
6327                         seqReadLine);
6328                 ok = ajFalse;
6329                 break;
6330             }
6331 
6332             ajStrAssignC(&tmpseq, cbuf);
6333 
6334             if(seqin->Input->Text)
6335                 ajStrAppendS(&thys->TextPtr, tmpseq);
6336 
6337             seqAppend(&thys->Seq, tmpseq);
6338             seqin->Input->Records++;
6339 
6340             ajDebug("read %d lines\n", ajTextinGetRecords(seqin->Input));
6341         }
6342 
6343         ajStrDel(&buf);
6344         ajStrDel(&tmpseq);
6345 
6346         if(!ok)
6347         {
6348             ajDebug("seqReadRaw input OK failed\n");
6349 
6350             ajFileSeek(fp,(ajlong) filestat,0);
6351 
6352             if(seqin->Input->Text)
6353             {
6354                 ajStrAssignC(&thys->TextPtr, "");
6355                 seqin->Input->Records = 0;
6356             }
6357 
6358             ajFilebuffResetPos(buff);
6359 
6360             return ajFalse;
6361         }
6362 
6363     }
6364 
6365     buff->File->End = ajTrue;
6366 
6367     if(!ajTextinGetRecords(seqin->Input))
6368     {
6369         ajDebug("seqReadRaw no records read\n");
6370         ajTextinStoreClear(seqin->Input, -1, seqReadLine, &thys->TextPtr);
6371         return ajFalse;
6372     }
6373 
6374     ajFilebuffClear(buff, -1);
6375 
6376     ajDebug("seqReadRaw success\n");
6377 
6378     return ajTrue;
6379 }
6380 
6381 
6382 
6383 
6384 /* @funcstatic seqReadIgstrict ************************************************
6385 **
6386 ** Given data in a sequence structure, tries to read everything needed
6387 ** using IntelliGenetics format.
6388 **
6389 ** Requires a trailing number at the end of the sequence
6390 **
6391 ** @param [w] thys [AjPSeq] Sequence object
6392 ** @param [u] seqin [AjPSeqin] Sequence input object
6393 ** @return [AjBool] ajTrue on success
6394 **
6395 ** @release 6.1.0
6396 ** @@
6397 ******************************************************************************/
6398 
seqReadIgstrict(AjPSeq thys,AjPSeqin seqin)6399 static AjBool seqReadIgstrict(AjPSeq thys, AjPSeqin seqin)
6400 {
6401     AjPFilebuff buff;
6402     AjBool endnum = ajFalse;
6403     AjBool ok = ajTrue;
6404 
6405     buff = seqin->Input->Filebuff;
6406 
6407     do
6408     {
6409         if(ajTextinGetRecords(seqin->Input))
6410         {
6411             ajStrCutStart(&seqReadLine, 1); /* trim the semi colon */
6412             ajStrRemoveWhiteExcess(&seqReadLine);
6413             if(ajStrGetLen(thys->Desc))
6414                 ajStrAppendK(&thys->Desc, ' ');
6415             ajStrAppendS(&thys->Desc, seqReadLine);
6416         }
6417         /* skip comments with ';' prefix */
6418         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6419     } while(ok && ajStrPrefixC(seqReadLine, ";"));
6420 
6421     if(!ok)
6422     {
6423         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6424         return ajFalse;
6425     }
6426 
6427     seqSetName(thys, seqReadLine);
6428 
6429     while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr) &&
6430           !ajStrPrefixC(seqReadLine, ";"))
6431     {
6432         ajStrRemoveWhiteExcess(&seqReadLine);
6433         if(ajStrSuffixC(seqReadLine, "1"))
6434             endnum = ajTrue;
6435         else if(ajStrSuffixC(seqReadLine, "2"))
6436             endnum = ajTrue;
6437         else
6438             endnum = ajFalse;
6439         seqAppend(&thys->Seq, seqReadLine);
6440     }
6441 
6442     if(!endnum)
6443     {
6444         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6445         return ajFalse;
6446     }
6447 
6448     if(ajStrPrefixC(seqReadLine, ";"))
6449         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
6450     else
6451         ajFilebuffClear(buff, 0);
6452 
6453     return ajTrue;
6454 }
6455 
6456 
6457 
6458 
6459 /* @funcstatic seqReadIg ******************************************************
6460 **
6461 ** Given data in a sequence structure, tries to read everything needed
6462 ** using IntelliGenetics format.
6463 **
6464 ** @param [w] thys [AjPSeq] Sequence object
6465 ** @param [u] seqin [AjPSeqin] Sequence input object
6466 ** @return [AjBool] ajTrue on success
6467 **
6468 ** @release 1.0.0
6469 ** @@
6470 ******************************************************************************/
6471 
seqReadIg(AjPSeq thys,AjPSeqin seqin)6472 static AjBool seqReadIg(AjPSeq thys, AjPSeqin seqin)
6473 {
6474     AjPFilebuff buff;
6475     AjBool ok = ajTrue;
6476 
6477     buff = seqin->Input->Filebuff;
6478 
6479     do
6480     {
6481         if(ajTextinGetRecords(seqin->Input))
6482         {
6483             ajStrCutStart(&seqReadLine, 1); /* trim the semi colon */
6484             ajStrRemoveWhiteExcess(&seqReadLine);
6485             if(ajStrGetLen(thys->Desc))
6486                 ajStrAppendK(&thys->Desc, ' ');
6487             ajStrAppendS(&thys->Desc, seqReadLine);
6488         }
6489         /* skip comments with ';' prefix */
6490         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6491     } while(ok && ajStrPrefixC(seqReadLine, ";"));
6492 
6493     if(!ok)
6494     {
6495         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6496         return ajFalse;
6497     }
6498 
6499     /* we know we will succeed from here ... no way to return ajFalse */
6500 
6501     ajFilebuffSetUnbuffered(buff);
6502 
6503     seqSetName(thys, seqReadLine);
6504 
6505     while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr) &&
6506           !ajStrPrefixC(seqReadLine, ";"))
6507     {
6508         seqAppend(&thys->Seq, seqReadLine);
6509     }
6510 
6511     if(ajStrPrefixC(seqReadLine, ";"))
6512         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
6513     else
6514         ajFilebuffClear(buff, 0);
6515 
6516     return ajTrue;
6517 }
6518 
6519 
6520 
6521 
6522 /* @funcstatic seqReadIguspto *************************************************
6523 **
6524 ** Given data in a sequence structure, tries to read everything needed
6525 ** using the US patent office multi-line IntelliGenetics format.
6526 **
6527 ** Requires a trailing number at the end of the sequence
6528 ** and allows for a trailing control-L at the end of the entry.
6529 **
6530 ** @param [w] thys [AjPSeq] Sequence object
6531 ** @param [u] seqin [AjPSeqin] Sequence input object
6532 ** @return [AjBool] ajTrue on success
6533 **
6534 ** @release 6.6.0
6535 ** @@
6536 ******************************************************************************/
6537 
seqReadIguspto(AjPSeq thys,AjPSeqin seqin)6538 static AjBool seqReadIguspto(AjPSeq thys, AjPSeqin seqin)
6539 {
6540     AjPFilebuff buff;
6541     const AjPStr badstr = NULL;
6542     AjBool endnum = ajFalse;
6543     AjBool ok = ajTrue;
6544     AjBool seqok = ajFalse;
6545     AjBool isheader = ajTrue;
6546     AjBool firstline = ajTrue;
6547     AjBool firstgood = ajTrue;
6548     ajlong ipos;
6549 
6550     buff = seqin->Input->Filebuff;
6551 
6552     while(ok && !seqok)
6553     {
6554         if(!ajStrPrefixC(seqReadLine, ";"))
6555         {
6556             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6557 
6558             return ajFalse;
6559         }
6560 
6561         if(firstline)
6562         {
6563             firstline = ajFalse;
6564             firstgood = ajTrue;
6565 
6566             if(!ajStrPrefixC(seqReadLine, "; Sequence "))
6567             {
6568                 ajFmtPrintS(&seqToken,
6569                             "'; Sequence ' not found");
6570                 firstgood = ajFalse;
6571             }
6572 
6573             if(firstgood)
6574             {
6575                 ipos = ajStrFindC(seqReadLine, ", Application ");
6576                 if(ipos < 1)
6577                 {
6578                     ajFmtPrintS(&seqToken2,
6579                                 "', Application ' not found");
6580                     firstgood = ajFalse;
6581                 }
6582             }
6583 
6584             if(firstgood)
6585             {
6586                 ajStrAssignSubS(&seqToken, seqReadLine, 11, ipos-1);
6587                 if(!ajStrIsInt(seqToken))
6588                 {
6589                     ajFmtPrintS(&seqToken2,
6590                                 "Sequence number '%S' not an integer",
6591                                 seqToken);
6592                     firstgood = ajFalse;
6593                 }
6594             }
6595 
6596             if(firstgood)
6597             {
6598                 ajStrAssignSubS(&seqToken, seqReadLine, ipos+14, -2);
6599                 if(!ajStrIsWord(seqToken))
6600                 {
6601                     ajFmtPrintS(&seqToken2,
6602                                 "Application id '%S' not a word",
6603                                 seqToken);
6604                     firstgood = ajFalse;
6605                 }
6606             }
6607 
6608             if(!firstgood)
6609             {
6610                 if(seqin->Input->Format)
6611                 {
6612                     ajStrAssignS(&seqToken, seqReadLine);
6613                     ajStrTrimWhiteEnd(&seqToken);
6614                     ajWarn("Iguspto: bad first line (%S): %S",
6615                            seqToken2, seqToken);
6616 
6617 
6618                 }
6619                 else
6620                 {
6621                     return ajFalse;
6622                 }
6623             }
6624         }
6625 
6626         if(!thys->Fulldesc)
6627             thys->Fulldesc = ajSeqdescNew();
6628 
6629         do
6630         {
6631             if(ajTextinGetRecords(seqin->Input))
6632             {
6633                 ajStrRemoveLastNewline(&seqReadLine);
6634                 ajStrCutStart(&seqReadLine, 1); /* trim the semi colon */
6635                 if(ajStrGetCharFirst(seqReadLine) == ' ')
6636                     ajStrCutStart(&seqReadLine, 1);
6637                 ajListstrPushAppend(thys->Fulldesc->Multi,
6638                                     ajStrNewS(seqReadLine));
6639 
6640                 if(ajStrPrefixC(seqReadLine, "GENERAL INFORMATION"))
6641                     isheader = ajFalse;
6642 
6643                 if(isheader)
6644                 {
6645                     if(ajStrGetLen(thys->Desc))
6646                         ajStrAppendC(&thys->Desc, "; ");
6647                     ajStrAppendS(&thys->Desc, seqReadLine);
6648                 }
6649             }
6650             /* skip comments with ';' prefix */
6651             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6652         } while(ok && ajStrPrefixC(seqReadLine, ";"));
6653 
6654         if(!ok)
6655         {
6656             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6657             return ajFalse;
6658         }
6659 
6660         seqSetName(thys, seqReadLine);
6661 
6662         endnum = ajFalse;
6663         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6664 
6665         while(ok &&
6666               !ajStrPrefixC(seqReadLine, ";") &&
6667               !endnum)
6668         {
6669             ajStrRemoveWhiteExcess(&seqReadLine);
6670             if(ajStrSuffixC(seqReadLine, "1"))
6671                 endnum = ajTrue;
6672             else if(ajStrSuffixC(seqReadLine, "2"))
6673                 endnum = ajTrue;
6674             else
6675                 endnum = ajFalse;
6676 
6677             if(endnum)
6678                 ajStrCutEnd(&seqReadLine, 1);
6679 
6680             badstr = seqAppendWarn(&thys->Seq, seqReadLine,
6681                                    seqin->Input->Format);
6682             if(badstr)
6683                 ajWarn("Sequence '%S' has bad character(s) '%S'",
6684                        thys->Name, badstr);
6685 
6686             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6687         }
6688 
6689         if(endnum)
6690             seqok = ajTrue;
6691         else
6692         {
6693             if(seqin->Input->Format)
6694                 ajWarn("Sequence '%S' has bad iguspto sequence format",
6695                        thys->Name);
6696             ajSeqClear(thys);
6697         }
6698     }
6699 
6700     /* test for, but do not store, the trailing space and ^L character */
6701 
6702     if(ok)
6703     {
6704         ajStrRemoveWhiteExcess(&seqReadLine);
6705         while(ok &&
6706               (!ajStrGetLen(seqReadLine) || ajStrMatchC(seqReadLine, "\014")))
6707         {
6708             ok = ajBuffreadLine(buff, &seqReadLine);
6709             if(ok)
6710             {
6711                 ajStrRemoveWhiteExcess(&seqReadLine);
6712             }
6713         }
6714     }
6715 
6716     if(ajStrPrefixC(seqReadLine, ";"))
6717         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
6718     else
6719         ajFilebuffClear(buff, 0);
6720 
6721     return ajTrue;
6722 }
6723 
6724 
6725 
6726 
6727 /* @funcstatic seqReadPdb *****************************************************
6728 **
6729 ** Given data in a sequence structure, tries to read everything needed
6730 ** using PDB protein databank format using ATOM records.
6731 **
6732 ** See seqReadPdbseq for parsing the SEQRES records
6733 **
6734 ** @param [w] thys [AjPSeq] Sequence object
6735 ** @param [u] seqin [AjPSeqin] Sequence input object
6736 ** @return [AjBool] ajTrue on success
6737 **
6738 ** @release 6.0.0
6739 ** @@
6740 ******************************************************************************/
6741 
seqReadPdb(AjPSeq thys,AjPSeqin seqin)6742 static AjBool seqReadPdb(AjPSeq thys, AjPSeqin seqin)
6743 {
6744     AjPStr alnname  = NULL;
6745     AjPTable alntable    = NULL;
6746     SeqPMsfItem alnitem  = NULL;
6747     const SeqPMsfItem readalnitem  = NULL;
6748     AjPList alnlist      = NULL;
6749     SeqPMsfData alndata  = NULL;
6750     char aa;
6751     ajuint nseq = 0;
6752     ajuint i;
6753     AjBool ok = ajTrue;
6754     AjPStr aa3 = NULL;
6755     ajuint iaa = 0;
6756     ajuint lastaa = 0;
6757     AjPStr model = NULL;
6758 
6759     ajDebug("seqReadPdb seqin->SeqData %x\n", seqin->SeqData);
6760 
6761     if(!seqin->SeqData)
6762     {                                   /* start of file */
6763         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6764 
6765         ajDebug("first line:\n'%S'\n", seqReadLine);
6766 
6767         if(!ajStrPrefixC(seqReadLine, "HEADER    "))
6768         {
6769             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6770 
6771             return ajFalse;
6772         }
6773 
6774         ajStrAssignSubS(&seqName,seqReadLine, 62, 71);
6775         ajStrTrimWhite(&seqName);
6776 
6777         ajDebug("first line OK name '%S'\n", seqName);
6778 
6779         seqin->SeqData = AJNEW0(alndata);
6780         alndata->Table = alntable = ajTablestrNew(1000);
6781         alnlist = ajListstrNew();
6782         seqin->Input->Filecount = 0;
6783 
6784         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6785 
6786         while(ok && !ajStrMatchC(seqReadLine, "END"))
6787         {
6788             if(ajStrPrefixC(seqReadLine, "MODEL"))
6789             {
6790                 ajStrAssignSubS(&model, seqReadLine, 7, 14);
6791                 ajStrTrimWhite(&model);
6792             }
6793             else if(ajStrPrefixC(seqReadLine, "ATOM"))
6794             {
6795                 if(!alnitem)
6796                     AJNEW0(alnitem);
6797 
6798                 ajStrKeepRange(&seqReadLine, 0,71);
6799 
6800                 ajStrAssignSubS(&aa3, seqReadLine, 17, 19);
6801                 ajStrAssignSubS(&seqChain, seqReadLine, 21, 21);
6802                 ajStrAssignSubS(&seqToken, seqReadLine, 22, 25);
6803                 ajStrToUint(seqToken, &iaa);
6804 
6805                 if(iaa > lastaa)
6806                 {
6807                     if(ajResidueFromTriplet(aa3,&aa))
6808                         seqAppendK(&alnitem->Seq, aa);
6809                     lastaa = iaa;
6810                 }
6811 
6812             }
6813 
6814             else if(ajStrPrefixC(seqReadLine, "TER"))
6815             {
6816                 if(alnitem && !ajStrGetLen(alnitem->Seq))
6817                 {
6818                     ajTableRemoveKey(alntable, alnitem->Name,
6819                                      (void**) &alnname);
6820                     ajStrDel(&alnname);
6821                     seqMsfItemDel(&alnitem);
6822                 }
6823                 else
6824                 {
6825                     nseq++;
6826                     ajFmtPrintS(&seqToken, "%S_%S", seqName, seqChain);
6827 
6828                     if(ajStrGetLen(model))
6829                         ajStrAppendS(&seqToken, model);
6830 
6831                     seqitemSetName(alnitem, seqToken);
6832                     ajStrAssignS(&alnname, alnitem->Name);
6833                     alnitem->Weight = 1.0;
6834                     ajTablePut(alntable, alnname, alnitem);
6835                     alnname = NULL;
6836                     ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
6837                     alnitem = NULL;
6838                 }
6839 
6840                 lastaa = 0;
6841             }
6842 
6843             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6844         }
6845 
6846         ajStrDelStatic(&seqToken);
6847         ajStrDelStatic(&seqName);
6848         ajStrDelStatic(&seqChain);
6849         ajStrDel(&aa3);
6850         ajStrDel(&model);
6851 
6852         if(!nseq)
6853         {
6854             seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
6855             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6856 
6857             return ajFalse;
6858         }
6859 
6860         ajDebug("PDB Entry has %d sequences\n", nseq);
6861         ajListstrTrace(alnlist);
6862         ajTableTrace(alntable);
6863         ajTableMap(alntable, &seqMsfTabList, NULL);
6864 
6865         alndata->Names = AJCALLOC(nseq, sizeof(*alndata->Names));
6866 
6867         for(i=0; i < nseq; i++)
6868         {
6869             ajListstrPop(alnlist, &alndata->Names[i]);
6870             ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
6871         }
6872 
6873         ajListstrFreeData(&alnlist);
6874 
6875         ajTableMap(alntable, &seqMsfTabList, NULL);
6876         alndata->Nseq = nseq;
6877         alndata->Count = 0;
6878         alndata->Bufflines = ajTextinGetRecords(seqin->Input);
6879         ajDebug("PDB format read %d lines\n",
6880                 ajTextinGetRecords(seqin->Input));
6881     }
6882 
6883     alndata = seqin->SeqData;
6884     alntable = alndata->Table;
6885 
6886     if(alndata->Count >= alndata->Nseq)
6887     {                                   /* all done */
6888         ajFilebuffClear(seqin->Input->Filebuff, 0);
6889         seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
6890 
6891         return ajFalse;
6892     }
6893 
6894     i = alndata->Count;
6895     ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
6896     readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
6897     ajStrAssignS(&thys->Name, alndata->Names[i]);
6898 
6899     thys->Weight = readalnitem->Weight;
6900     ajStrAssignS(&thys->Seq, readalnitem->Seq);
6901 
6902     alndata->Count++;
6903 
6904     return ajTrue;
6905 }
6906 
6907 
6908 
6909 
6910 /* @funcstatic seqReadPdbseq **************************************************
6911 **
6912 ** Given data in a sequence structure, tries to read everything needed
6913 ** using PDB protein databank format using the SEQRES records.
6914 **
6915 ** This is the original sequence, see seqReadPdb for parsing the ATOM records
6916 ** which give the sequence observed in the structure.
6917 **
6918 ** @param [w] thys [AjPSeq] Sequence object
6919 ** @param [u] seqin [AjPSeqin] Sequence input object
6920 ** @return [AjBool] ajTrue on success
6921 **
6922 ** @release 6.0.0
6923 ** @@
6924 ******************************************************************************/
6925 
seqReadPdbseq(AjPSeq thys,AjPSeqin seqin)6926 static AjBool seqReadPdbseq(AjPSeq thys, AjPSeqin seqin)
6927 {
6928     AjPFilebuff buff;
6929     AjPStr name  = NULL;
6930     AjPStr alnname  = NULL;
6931     AjPStr chain = NULL;
6932     AjPTable alntable    = NULL;
6933     SeqPMsfItem alnitem  = NULL;
6934     const SeqPMsfItem readalnitem  = NULL;
6935     AjPList alnlist      = NULL;
6936     SeqPMsfData alndata  = NULL;
6937     char aa;
6938     ajuint iseq = 0;
6939     ajuint nseq = 0;
6940     ajuint i;
6941     AjBool ok = ajTrue;
6942 
6943     buff = seqin->Input->Filebuff;
6944 
6945     ajDebug("seqReadPdbseq seqin->SeqData %x\n", seqin->SeqData);
6946 
6947     if(seqin->SeqData)
6948     {
6949         alndata = seqin->SeqData;
6950         alntable = alndata->Table;
6951         if(alndata->Nseq && (alndata->Count >= alndata->Nseq))
6952         {                                       /* try next entry */
6953             ajFilebuffClear(buff, 0);
6954             seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
6955             seqin->SeqData = NULL;
6956         }
6957         alndata = NULL;
6958     }
6959 
6960     if(!seqin->SeqData)
6961     {                                   /* start of file */
6962         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6963         if(!ok)
6964         {
6965             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6966             return ajFalse;
6967         }
6968 
6969         ajDebug("first line:\n'%S'\n", seqReadLine);
6970 
6971         if(!ajStrPrefixC(seqReadLine, "HEADER    "))
6972         {
6973             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
6974 
6975             return ajFalse;
6976         }
6977 
6978         ajStrAssignSubS(&name,seqReadLine, 62, 71);
6979         ajStrTrimWhite(&name);
6980 
6981         ajDebug("first line OK name '%S'\n", name);
6982 
6983         seqin->SeqData = AJNEW0(alndata);
6984         alndata->Table = alntable = ajTablestrNew(1000);
6985         alnlist = ajListstrNew();
6986         seqin->Input->Filecount = 0;
6987 
6988         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
6989 
6990         while(ok && !ajStrMatchC(seqReadLine, "END"))
6991         {
6992             if(ajStrPrefixC(seqReadLine, "SEQRES"))
6993             {
6994                 ajStrKeepRange(&seqReadLine, 0,71);
6995                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
6996                 ajStrTokenStep(seqHandle);   /* 'SEQRES' */
6997 
6998                 ajStrTokenNextParse(seqHandle, &seqToken);   /* number */
6999                 ajStrToUint(seqToken, &iseq);
7000 
7001                 ajStrTokenNextParse(seqHandle, &chain);   /* chain letter */
7002 
7003                 if(iseq == 1)
7004                 {
7005                     if(alnitem && !ajStrGetLen(alnitem->Seq))
7006                     {
7007                         nseq--;
7008                         ajListstrPopLast(alnlist, &alnname);
7009                         ajTableRemoveKey(alntable, alnitem->Name,
7010                                          (void**) &alnname);
7011                         ajStrDel(&alnname);
7012                         seqMsfItemDel(&alnitem);
7013                     }
7014 
7015                     nseq++;
7016                     ajFmtPrintS(&seqToken, "%S_%S", name, chain);
7017                     AJNEW0(alnitem);
7018                     seqitemSetName(alnitem, seqToken);
7019                     ajStrAssignS(&alnname, alnitem->Name);
7020                     alnitem->Weight = 1.0;
7021                     ajTablePut(alntable, alnname, alnitem);
7022                     alnname = NULL;
7023                     ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
7024                 }
7025 
7026                 while(ajStrTokenNextParse(seqHandle, &seqToken))
7027                     if(ajResidueFromTriplet(seqToken,&aa))
7028                         seqAppendK(&alnitem->Seq, aa);
7029             }
7030 
7031             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7032         }
7033 
7034         if(alnitem && !ajStrGetLen(alnitem->Seq))
7035         {
7036             nseq--;
7037             ajListstrPopLast(alnlist, &alnname);
7038             ajTableRemoveKey(alntable, alnitem->Name,
7039                              (void**) &alnname);
7040             ajStrDel(&alnname);
7041             seqMsfItemDel(&alnitem);
7042         }
7043 
7044         if(!nseq)
7045         {
7046             ajStrDelStatic(&seqToken);
7047             ajStrDel(&name);
7048             ajStrDel(&chain);
7049             seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7050             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7051 
7052             return ajFalse;
7053         }
7054 
7055 
7056         ajDebug("PDB Entry has %d sequences\n", nseq);
7057         ajListstrTrace(alnlist);
7058         ajTableTrace(alntable);
7059         ajTableMap(alntable, &seqMsfTabList, NULL);
7060 
7061         alndata->Names = AJCALLOC(nseq, sizeof(*alndata->Names));
7062 
7063         for(i=0; i < nseq; i++)
7064         {
7065             ajListstrPop(alnlist, &alndata->Names[i]);
7066             ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
7067         }
7068 
7069         ajListstrFreeData(&alnlist);
7070 
7071         ajTableMap(alntable, &seqMsfTabList, NULL);
7072         alndata->Nseq = nseq;
7073         alndata->Count = 0;
7074         alndata->Bufflines = ajTextinGetRecords(seqin->Input);
7075         ajDebug("PDBSEQ format read %d lines\n",
7076                 ajTextinGetRecords(seqin->Input));
7077     }
7078 
7079     alndata = seqin->SeqData;
7080 
7081     i = alndata->Count;
7082     ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
7083     readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
7084     ajStrAssignS(&thys->Name, alndata->Names[i]);
7085 
7086     thys->Weight = readalnitem->Weight;
7087     ajStrAssignS(&thys->Seq, readalnitem->Seq);
7088 
7089     alndata->Count++;
7090 
7091     ajStrDelStatic(&seqToken);
7092     ajStrDel(&name);
7093     ajStrDel(&chain);
7094 
7095     return ajTrue;
7096 }
7097 
7098 
7099 
7100 
7101 /* @funcstatic seqReadPdbnuc **************************************************
7102 **
7103 ** Given nucleotide data in a sequence structure,
7104 ** tries to read everything needed using PDB protein databank format
7105 ** using the SEQRES records.
7106 **
7107 ** This is the sequence observed in the structure. See seqReadPdbnucseq
7108 ** for parsing the SEQRES records which give the original sequence.
7109 **
7110 ** @param [w] thys [AjPSeq] Sequence object
7111 ** @param [u] seqin [AjPSeqin] Sequence input object
7112 ** @return [AjBool] ajTrue on success
7113 **
7114 ** @release 6.1.0
7115 ** @@
7116 ******************************************************************************/
7117 
seqReadPdbnuc(AjPSeq thys,AjPSeqin seqin)7118 static AjBool seqReadPdbnuc(AjPSeq thys, AjPSeqin seqin)
7119 {
7120     AjPStr name  = NULL;
7121     AjPStr alnname  = NULL;
7122     AjPStr token = NULL;
7123     AjPStr chain = NULL;
7124     AjPTable alntable    = NULL;
7125     SeqPMsfItem alnitem  = NULL;
7126     const SeqPMsfItem readalnitem  = NULL;
7127     AjPList alnlist      = NULL;
7128     SeqPMsfData alndata  = NULL;
7129     char aa;
7130     ajuint nseq = 0;
7131     ajuint i;
7132     AjBool ok = ajTrue;
7133     AjPStr aa3 = NULL;
7134     ajuint iaa = 0;
7135     ajuint lastaa = 0;
7136     AjPStr model = NULL;
7137 
7138     ajDebug("seqReadPdbnuc seqin->SeqData %x\n", seqin->SeqData);
7139 
7140     if(!seqin->SeqData)
7141     {                                   /* start of file */
7142         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7143 
7144         ajDebug("first line:\n'%S'\n", seqReadLine);
7145 
7146         if(!ajStrPrefixC(seqReadLine, "HEADER    "))
7147         {
7148             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7149 
7150             return ajFalse;
7151         }
7152 
7153         ajStrAssignSubS(&name,seqReadLine, 62, 71);
7154         ajStrTrimWhite(&name);
7155 
7156         ajDebug("first line OK name '%S'\n", name);
7157 
7158         seqin->SeqData = AJNEW0(alndata);
7159         alndata->Table = alntable = ajTablestrNew(1000);
7160         alnlist = ajListstrNew();
7161         seqin->Input->Filecount = 0;
7162 
7163         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7164 
7165         while(ok && !ajStrMatchC(seqReadLine, "END"))
7166         {
7167             if(ajStrPrefixC(seqReadLine, "MODEL"))
7168             {
7169                 ajStrAssignSubS(&model, seqReadLine, 7, 14);
7170                 ajStrTrimWhite(&model);
7171             }
7172             else if(ajStrPrefixC(seqReadLine, "ATOM"))
7173             {
7174                 if(!alnitem)
7175                     AJNEW0(alnitem);
7176 
7177                 ajStrKeepRange(&seqReadLine, 0,71);
7178 
7179                 ajStrAssignSubS(&aa3, seqReadLine, 18, 19);
7180                 ajStrAssignSubS(&chain, seqReadLine, 21, 21);
7181                 ajStrAssignSubS(&token, seqReadLine, 22, 25);
7182                 ajStrToUint(token, &iaa);
7183 
7184                 if(iaa > lastaa)
7185                 {
7186                     if(ajBaseFromDoublet(aa3,&aa))
7187                         seqAppendK(&alnitem->Seq, aa);
7188 
7189                     lastaa = iaa;
7190                 }
7191 
7192             }
7193 
7194             else if(ajStrPrefixC(seqReadLine, "TER"))
7195             {
7196                 if(!ajStrGetLen(alnitem->Seq))
7197                 {
7198                     ajDebug("TER seqlen zero\n");
7199                     ajTableRemoveKey(alntable, alnitem->Name,
7200                                      (void**) &alnname);
7201                     ajStrDel(&alnname);
7202                     seqMsfItemDel(&alnitem);
7203                 }
7204                 else
7205                 {
7206                     nseq++;
7207                     ajFmtPrintS(&token, "%S_%S", name, chain);
7208 
7209                     if(ajStrGetLen(model))
7210                         ajStrAppendS(&token, model);
7211 
7212                     seqitemSetName(alnitem, token);
7213                     ajStrAssignS(&alnname, alnitem->Name);
7214                     alnitem->Weight = 1.0;
7215                     ajTablePut(alntable, alnname, alnitem);
7216                     alnname = NULL;
7217                     ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
7218                     alnitem = NULL;
7219                 }
7220                 lastaa = 0;
7221             }
7222 
7223             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7224         }
7225 
7226         ajStrDel(&aa3);
7227         ajStrDel(&token);
7228         ajStrDel(&name);
7229         ajStrDel(&chain);
7230         ajStrDel(&model);
7231 
7232         if(alnitem && !ajStrGetLen(alnitem->Seq))
7233         {
7234             ajListstrPopLast(alnlist, &alnname);
7235             ajTableRemoveKey(alntable, alnitem->Name,
7236                              (void**) &alnname);
7237             ajStrDel(&alnname);
7238             seqMsfItemDel(&alnitem);
7239         }
7240 
7241         if(!nseq)
7242         {
7243             seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7244             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7245 
7246             return ajFalse;
7247         }
7248 
7249         ajDebug("PDB Entry has %d sequences\n", nseq);
7250         ajListstrTrace(alnlist);
7251         ajTableTrace(alntable);
7252         ajTableMap(alntable, &seqMsfTabList, NULL);
7253 
7254         alndata->Names = AJCALLOC(nseq, sizeof(*alndata->Names));
7255 
7256         for(i=0; i < nseq; i++)
7257         {
7258             ajListstrPop(alnlist, &alndata->Names[i]);
7259             ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
7260         }
7261 
7262         ajListstrFreeData(&alnlist);
7263 
7264         ajTableMap(alntable, &seqMsfTabList, NULL);
7265         alndata->Nseq = nseq;
7266         alndata->Count = 0;
7267         alndata->Bufflines = ajTextinGetRecords(seqin->Input);
7268         ajDebug("PDB format read %d lines\n",
7269                 ajTextinGetRecords(seqin->Input));
7270     }
7271 
7272     alndata = seqin->SeqData;
7273     alntable = alndata->Table;
7274 
7275     if(alndata->Count >= alndata->Nseq)
7276     {                                   /* all done */
7277         ajFilebuffClear(seqin->Input->Filebuff, 0);
7278         seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7279 
7280         return ajFalse;
7281     }
7282 
7283     i = alndata->Count;
7284     ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
7285     readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
7286     ajStrAssignS(&thys->Name, alndata->Names[i]);
7287 
7288     thys->Weight = readalnitem->Weight;
7289     ajStrAssignS(&thys->Seq, readalnitem->Seq);
7290 
7291     alndata->Count++;
7292 
7293     return ajTrue;
7294 }
7295 
7296 
7297 
7298 
7299 /* @funcstatic seqReadPdbnucseq ***********************************************
7300 **
7301 ** Given nucleotide data in a sequence structure,
7302 ** tries to read everything needed using PDB protein databank format
7303 ** using the SEQRES records.
7304 **
7305 ** This is the original sequence, see seqReadPdbnuc for parsing the ATOM records
7306 ** which give the sequence observed in the structure.
7307 **
7308 ** @param [w] thys [AjPSeq] Sequence object
7309 ** @param [u] seqin [AjPSeqin] Sequence input object
7310 ** @return [AjBool] ajTrue on success
7311 **
7312 ** @release 6.1.0
7313 ** @@
7314 ******************************************************************************/
7315 
seqReadPdbnucseq(AjPSeq thys,AjPSeqin seqin)7316 static AjBool seqReadPdbnucseq(AjPSeq thys, AjPSeqin seqin)
7317 {
7318     AjPStr name  = NULL;
7319     AjPStr alnname  = NULL;
7320     AjPStr token = NULL;
7321     AjPStr chain = NULL;
7322     AjPTable alntable    = NULL;
7323     SeqPMsfItem alnitem  = NULL;
7324     const SeqPMsfItem readalnitem  = NULL;
7325     AjPList alnlist      = NULL;
7326     SeqPMsfData alndata  = NULL;
7327     char aa;
7328     ajuint iseq = 0;
7329     ajuint nseq = 0;
7330     ajuint i;
7331     AjBool ok = ajTrue;
7332 
7333     ajDebug("seqReadPdbnucseq seqin->SeqData %x\n", seqin->SeqData);
7334 
7335     if(!seqin->SeqData)
7336     {                                   /* start of file */
7337         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7338 
7339         ajDebug("first line:\n'%S'\n", seqReadLine);
7340 
7341         if(!ajStrPrefixC(seqReadLine, "HEADER    "))
7342         {
7343             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7344 
7345             return ajFalse;
7346         }
7347 
7348         ajStrAssignSubS(&name,seqReadLine, 62, 71);
7349         ajStrTrimWhite(&name);
7350 
7351         ajDebug("first line OK name '%S'\n", name);
7352 
7353         seqin->SeqData = AJNEW0(alndata);
7354         alndata->Table = alntable = ajTablestrNew(1000);
7355         alnlist = ajListstrNew();
7356         seqin->Input->Filecount = 0;
7357 
7358         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7359 
7360         while(ok && !ajStrMatchC(seqReadLine, "END"))
7361         {
7362             if(ajStrPrefixC(seqReadLine, "SEQRES"))
7363             {
7364                 ajStrKeepRange(&seqReadLine, 0,71);
7365                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
7366                 ajStrTokenStep(seqHandle);   /* 'SEQRES' */
7367 
7368                 ajStrTokenNextParse(seqHandle, &seqToken);   /* number */
7369                 ajStrToUint(seqToken, &iseq);
7370 
7371                 ajStrTokenNextParse(seqHandle, &chain);   /* chain letter */
7372 
7373                 if(iseq == 1)
7374                 {
7375                     if(alnitem && !ajStrGetLen(alnitem->Seq))
7376                     {
7377                         nseq--;
7378                         ajListstrPopLast(alnlist,&alnname);
7379                         ajTableRemoveKey(alntable, alnitem->Name,
7380                                          (void**) &alnname);
7381                         ajStrDel(&alnname);
7382                         seqMsfItemDel(&alnitem);
7383                     }
7384 
7385                     nseq++;
7386                     ajFmtPrintS(&token, "%S_%S", name, chain);
7387                     AJNEW0(alnitem);
7388                     seqitemSetName(alnitem, token);
7389                     ajStrAssignS(&alnname, alnitem->Name);
7390                     alnitem->Weight = 1.0;
7391                     ajTablePut(alntable, alnname, alnitem);
7392                     alnname = NULL;
7393                     ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
7394                 }
7395 
7396                 while(ajStrTokenNextParse(seqHandle, &seqToken))
7397                     if(ajBaseFromDoublet(seqToken,&aa))
7398                         seqAppendK(&alnitem->Seq, aa);
7399             }
7400 
7401             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7402         }
7403 
7404         if(alnitem && !ajStrGetLen(alnitem->Seq))
7405         {
7406             nseq--;
7407             ajListstrPopLast(alnlist,&alnname);
7408             ajTableRemoveKey(alntable, alnitem->Name,
7409                              (void**) &alnname);
7410             ajStrDel(&alnname);
7411             seqMsfItemDel(&alnitem);
7412         }
7413 
7414         if(!nseq)
7415         {
7416             ajStrDel(&token);
7417             ajStrDel(&name);
7418             ajStrDel(&chain);
7419             seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7420             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7421 
7422             return ajFalse;
7423         }
7424 
7425 
7426         ajDebug("PDB Entry has %d sequences\n", nseq);
7427         ajListstrTrace(alnlist);
7428         ajTableTrace(alntable);
7429         ajTableMap(alntable, &seqMsfTabList, NULL);
7430 
7431         alndata->Names = AJCALLOC(nseq, sizeof(*alndata->Names));
7432 
7433         for(i=0; i < nseq; i++)
7434         {
7435             ajListstrPop(alnlist, &alndata->Names[i]);
7436             ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
7437         }
7438 
7439         ajListstrFreeData(&alnlist);
7440 
7441         ajTableMap(alntable, &seqMsfTabList, NULL);
7442         alndata->Nseq = nseq;
7443         alndata->Count = 0;
7444         alndata->Bufflines = ajTextinGetRecords(seqin->Input);
7445         ajDebug("PDBNUCSEQ format read %d lines\n",
7446                 ajTextinGetRecords(seqin->Input));
7447     }
7448 
7449     alndata = seqin->SeqData;
7450     alntable = alndata->Table;
7451 
7452     if(alndata->Count >= alndata->Nseq)
7453     {                                   /* all done */
7454         ajFilebuffClear(seqin->Input->Filebuff, 0);
7455         seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7456 
7457         return ajFalse;
7458     }
7459 
7460     i = alndata->Count;
7461     ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
7462     readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
7463     ajStrAssignS(&thys->Name, alndata->Names[i]);
7464 
7465     thys->Weight = readalnitem->Weight;
7466     ajStrAssignS(&thys->Seq, readalnitem->Seq);
7467 
7468     alndata->Count++;
7469 
7470     ajStrDel(&token);
7471     ajStrDel(&name);
7472     ajStrDel(&chain);
7473 
7474     return ajTrue;
7475 }
7476 
7477 
7478 
7479 
7480 /* @funcstatic seqReadClustal *************************************************
7481 **
7482 ** Tries to read input in Clustal ALN format.
7483 **
7484 ** @param [w] thys [AjPSeq] Sequence object
7485 ** @param [u] seqin [AjPSeqin] Sequence input object
7486 ** @return [AjBool] ajTrue on success
7487 **
7488 ** @release 1.0.0
7489 ** @@
7490 ******************************************************************************/
7491 
seqReadClustal(AjPSeq thys,AjPSeqin seqin)7492 static AjBool seqReadClustal(AjPSeq thys, AjPSeqin seqin)
7493 {
7494     AjPStr seqstr        = NULL;
7495     AjPStr name          = NULL;
7496     AjBool ok            = ajFalse;
7497     ajuint iseq          = 0;
7498     AjPTable alntable    = NULL;
7499     SeqPMsfItem alnitem  = NULL;
7500     const SeqPMsfItem readalnitem  = NULL;
7501     AjPList alnlist      = NULL;
7502     SeqPMsfData alndata  = NULL;
7503 
7504     ajuint i;
7505 
7506     ajDebug("seqReadClustal seqin->SeqData %x\n", seqin->SeqData);
7507 
7508     if(!seqin->SeqData)
7509     {                                   /* start of file */
7510         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7511 
7512         if(!ok)
7513             return ajFalse;
7514 
7515         ajDebug("first line:\n'%S'\n", seqReadLine);
7516 
7517         if(!ajStrPrefixC(seqReadLine, "CLUSTAL"))
7518         {
7519             /* first line test */
7520             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7521 
7522             return ajFalse;
7523         }
7524 
7525         ajDebug("first line OK: '%S'\n", seqReadLine);
7526 
7527         while(ok)
7528         {                               /* skip blank lines */
7529             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7530             if(ok && !ajStrIsWhite(seqReadLine))
7531                 break;
7532         }
7533 
7534         if(!ok)
7535         {
7536             ajDebug("FAIL (blank lines only)\n");
7537             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7538 
7539             return ajFalse;
7540         }
7541 
7542         seqin->SeqData = AJNEW0(alndata);
7543         alndata->Table = alntable = ajTablestrNew(1000);
7544         alnlist = ajListstrNew();
7545         seqin->Input->Filecount = 0;
7546 
7547         /* first set - create table */
7548         ok = ajTrue;
7549 
7550         while(ok && ajStrExtractFirst(seqReadLine, &seqstr, &name))
7551         {
7552             AJNEW0(alnitem);
7553             ajStrAssignS(&alnitem->Name, name);
7554             alnitem->Weight = 1.0;
7555             seqAppend(&alnitem->Seq, seqstr);
7556 
7557             iseq++;
7558             ajDebug("first set %d: '%S'\n line: '%S'\n",
7559                     iseq, name, seqReadLine);
7560 
7561             ajTablePut(alntable, name, alnitem);
7562             name = NULL;
7563             ajListstrPushAppend(alnlist, ajStrNewS(alnitem->Name));
7564 
7565             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7566         }
7567 
7568         ajStrDel(&seqstr);
7569 
7570         ajDebug("Header has %d sequences\n", iseq);
7571         ajListstrTrace(alnlist);
7572         ajTableTrace(alntable);
7573         ajTableMap(alntable, &seqMsfTabList, NULL);
7574 
7575         alndata->Names = AJCALLOC(iseq, sizeof(*alndata->Names));
7576 
7577         for(i=0; i < iseq; i++)
7578         {
7579             ajListstrPop(alnlist, &alndata->Names[i]);
7580             ajDebug("list [%d] '%S'\n", i, alndata->Names[i]);
7581         }
7582 
7583         ajListstrFreeData(&alnlist);
7584 
7585         while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
7586         {                               /* now read the rest */
7587             seqClustalReadseq(seqReadLine, alntable);
7588         }
7589 
7590         ajTableMap(alntable, &seqMsfTabList, NULL);
7591         alndata->Nseq = iseq;
7592         alndata->Count = 0;
7593         alndata->Bufflines = ajTextinGetRecords(seqin->Input);
7594         ajDebug("ALN format read %d lines\n",
7595                 ajTextinGetRecords(seqin->Input));
7596     }
7597 
7598     alndata = seqin->SeqData;
7599     alntable = alndata->Table;
7600 
7601     if(alndata->Count >= alndata->Nseq)
7602     {                                   /* all done */
7603         ajFilebuffClear(seqin->Input->Filebuff, 0);
7604         seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7605 
7606         return ajFalse;
7607     }
7608 
7609     i = alndata->Count;
7610     ajDebug("returning [%d] '%S'\n", i, alndata->Names[i]);
7611     readalnitem = ajTableFetchS(alntable, alndata->Names[i]);
7612     ajStrAssignS(&thys->Name, alndata->Names[i]);
7613 
7614     thys->Weight = readalnitem->Weight;
7615     ajStrAssignS(&thys->Seq, readalnitem->Seq);
7616 
7617     alndata->Count++;
7618 
7619     return ajTrue;
7620 }
7621 
7622 
7623 
7624 
7625 /* @funcstatic seqClustalReadseq **********************************************
7626 **
7627 ** Reads sequence name from first token on the input line, and appends
7628 ** the sequence data to that sequence in the alntable structure.
7629 **
7630 ** @param [r] rdline [const AjPStr] Line from input file.
7631 ** @param [r] msftable [const AjPTable] MSF format sequence table.
7632 ** @return [AjBool] ajTrue on success
7633 **
7634 ** @release 1.0.0
7635 ** @@
7636 ******************************************************************************/
7637 
seqClustalReadseq(const AjPStr rdline,const AjPTable msftable)7638 static AjBool seqClustalReadseq(const AjPStr rdline, const AjPTable msftable)
7639 {
7640     SeqPMsfItem msfitem;
7641     AjPStr token     = NULL;
7642     AjPStr seqstr    = NULL;
7643 
7644     if(!ajStrExtractFirst(rdline, &seqstr, &token))
7645         return ajFalse;
7646 
7647     msfitem = ajTableFetchmodS(msftable, token);
7648     ajStrDel(&token);
7649 
7650     if(!msfitem)
7651     {
7652         ajStrDel(&seqstr);
7653 
7654         return ajFalse;
7655     }
7656 
7657     seqAppend(&msfitem->Seq, seqstr);
7658     ajStrDel(&seqstr);
7659 
7660     return ajTrue;
7661 }
7662 
7663 
7664 
7665 
7666 /* @funcstatic seqReadPhylipnon ***********************************************
7667 **
7668 ** Tries to read input in Phylip non-interleaved format.
7669 **
7670 ** @param [w] thys [AjPSeq] Sequence object
7671 ** @param [u] seqin [AjPSeqin] Sequence input object
7672 ** @return [AjBool] ajTrue on success
7673 **
7674 ** @release 3.0.0
7675 ** @@
7676 ******************************************************************************/
7677 
seqReadPhylipnon(AjPSeq thys,AjPSeqin seqin)7678 static AjBool seqReadPhylipnon(AjPSeq thys, AjPSeqin seqin)
7679 {
7680     AjPStr seqstr = NULL;
7681     AjPStr tmpstr = NULL;
7682     AjBool ok       = ajFalse;
7683     ajuint iseq      = 0;
7684     ajuint jseq      = 0;
7685     ajuint len       = 0;
7686     ajuint ilen      = 0;
7687 
7688     AjPTable phytable        = NULL;
7689     SeqPMsfItem phyitem      = NULL;
7690     const SeqPMsfItem readphyitem = NULL;
7691     SeqPMsfData phydata      = NULL;
7692     ajuint i;
7693     AjBool done = ajFalse;
7694 
7695     ajDebug("seqReadPhylipnon seqin->SeqData %x\n", seqin->SeqData);
7696 
7697     if(!seqRegPhylipTop)
7698         seqRegPhylipTop = ajRegCompC("^ *([0-9]+) +([0-9]+)");
7699 
7700     if(!seqRegPhylipHead)
7701         seqRegPhylipHead = ajRegCompC("^(..........) ?"); /* 10 chars */
7702 
7703     if(!seqin->SeqData)
7704     {                                   /* start of file */
7705         seqin->Multidone = ajFalse;
7706         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7707         if(!ok)
7708             return ajFalse;
7709 
7710         ajDebug("first line:\n'%-20.20S'\n", seqReadLine);
7711 
7712         if(!ajRegExec(seqRegPhylipTop, seqReadLine))
7713         {                               /* first line test */
7714             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7715 
7716             return ajFalse;
7717         }
7718 
7719         ajRegSubI(seqRegPhylipTop, 1, &tmpstr);
7720         ajStrToUint(tmpstr, &iseq);
7721         ajDebug("seqRegPhylipTop1 '%S' %d\n", tmpstr, iseq);
7722         ajRegSubI(seqRegPhylipTop, 2, &tmpstr);
7723         ajStrToUint(tmpstr, &len);
7724         ajDebug("seqRegPhylipTop2 '%S' %d\n", tmpstr,len);
7725         ajDebug("first line OK: '%S' iseq: %d len: %d\n",
7726                 seqReadLine, iseq, len);
7727         ajStrDel(&tmpstr);
7728 
7729         seqin->SeqData = AJNEW0(phydata);
7730         phydata->Table = phytable = ajTablestrNew(1000);
7731         phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
7732         seqin->Input->Filecount = 0;
7733 
7734         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7735         ilen = 0;
7736 
7737         while(ok && (jseq < iseq))
7738         {
7739             /* first set - create table */
7740             if(!ajRegExec(seqRegPhylipHead, seqReadLine))
7741             {
7742                 ajDebug("FAIL (not seqRegPhylipHead): '%S'\n", seqReadLine);
7743                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7744                 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7745 
7746                 return ajFalse;
7747             }
7748 
7749             ajDebug("line: '%S'\n", seqReadLine);
7750             ajRegSubI(seqRegPhylipHead, 1, &tmpstr);
7751 
7752             if(!ajStrIsWhite(tmpstr))
7753             {
7754                 /* check previous sequence */
7755                 if(jseq)
7756                 {
7757                     if(ilen != len)
7758                     {
7759                         ajDebug("phylipnon format length mismatch at %d "
7760                                 "(length %d)\n",
7761                                 len, ilen);
7762                         seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7763                         ajStrDel(&tmpstr);
7764                         return ajFalse;
7765                     }
7766                 }
7767 
7768                 /* new sequence */
7769                 AJNEW0(phyitem);
7770                 seqitemSetName(phyitem, tmpstr);
7771                 ajStrAssignS(&phydata->Names[jseq], phyitem->Name);
7772                 ajDebug("name: '%S' => '%S'\n", tmpstr, phyitem->Name);
7773                 phyitem->Weight = 1.0;
7774                 ajRegPost(seqRegPhylipHead, &seqstr);
7775                 seqAppend(&phyitem->Seq, seqstr);
7776                 ajStrDel(&seqstr);
7777                 ilen = ajStrGetLen(phyitem->Seq);
7778 
7779                 if(ilen == len)
7780                     done = ajTrue;
7781                 else if(ilen > len)
7782                 {
7783                     ajDebug("Phylipnon format: sequence %S "
7784                             "header size %d exceeded\n",
7785                             phyitem->Name, len);
7786                     seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7787                     seqMsfItemDel(&phyitem);
7788                     ajStrDel(&tmpstr);
7789 
7790                     return ajFalse;
7791                 }
7792 
7793                 ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
7794                 ajDebug("seq %d: (%d) '%-20.20S'\n", jseq, ilen, seqReadLine);
7795             }
7796             else
7797             {
7798                 /* more sequence to append */
7799                 if(seqPhylipReadseq(seqReadLine, phytable, phyitem->Name,
7800                                     len, &ilen, &done))
7801                 {
7802                     ajDebug("read to len %d\n", ilen);
7803 
7804                     if (done)
7805                         jseq++;
7806                 }
7807 
7808             }
7809             ajStrDel(&tmpstr);
7810 
7811             if(jseq < iseq)
7812             {
7813                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7814             }
7815         }
7816 
7817         if(ilen != len)
7818         {
7819             ajDebug("phylipnon format final length mismatch at %d "
7820                     "(length %d)\n",
7821                     len, ilen);
7822             seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7823 
7824             return ajFalse;
7825         }
7826 
7827         ajDebug("Header has %d sequences\n", jseq);
7828         ajTableTrace(phytable);
7829         ajTableMap(phytable, &seqMsfTabList, NULL);
7830 
7831         phydata->Nseq = iseq;
7832         phydata->Count = 0;
7833         phydata->Bufflines = ajTextinGetRecords(seqin->Input);
7834         ajDebug("PHYLIP format read %d lines\n",
7835                 ajTextinGetRecords(seqin->Input));
7836     }
7837 
7838     phydata = seqin->SeqData;
7839     phytable = phydata->Table;
7840 
7841     i = phydata->Count;
7842     ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
7843     readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
7844     ajStrAssignS(&thys->Name, phydata->Names[i]);
7845     ajStrDel(&phydata->Names[i]);
7846 
7847     thys->Weight = readphyitem->Weight;
7848     ajStrAssignS(&thys->Seq, readphyitem->Seq);
7849 
7850     phydata->Count++;
7851 
7852     if(phydata->Count >= phydata->Nseq)
7853     {
7854         seqin->Multidone = ajTrue;
7855         ajFilebuffClear(seqin->Input->Filebuff, 0);
7856         ajDebug("seqReadPhylip multidone\n");
7857         seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7858     }
7859 
7860     ajStrDel(&seqstr);
7861     ajStrDel(&tmpstr);
7862     seqMsfDataTrace(seqin->SeqData);
7863 
7864     return ajTrue;
7865 }
7866 
7867 
7868 
7869 
7870 /* @funcstatic seqReadPhylip **************************************************
7871 **
7872 ** Tries to read input in Phylip interleaved format.
7873 **
7874 ** @param [w] thys [AjPSeq] Sequence object
7875 ** @param [u] seqin [AjPSeqin] Sequence input object
7876 ** @return [AjBool] ajTrue on success
7877 **
7878 ** @release 1.0.0
7879 ** @@
7880 ******************************************************************************/
7881 
seqReadPhylip(AjPSeq thys,AjPSeqin seqin)7882 static AjBool seqReadPhylip(AjPSeq thys, AjPSeqin seqin)
7883 {
7884     AjPStr seqstr = NULL;
7885     AjPStr tmpstr = NULL;
7886     AjBool ok       = ajFalse;
7887     ajuint iseq      = 0;
7888     ajuint jseq      = 0;
7889     ajuint len       = 0;
7890     ajuint ilen      = 0;
7891     ajuint maxlen    = 0;
7892     AjPFilebuff buff;
7893 
7894     AjPTable phytable        = NULL;
7895     SeqPMsfItem phyitem      = NULL;
7896     const SeqPMsfItem readphyitem = NULL;
7897     AjPList phylist          = NULL;
7898     SeqPMsfData phydata      = NULL;
7899     ajuint i;
7900     AjBool done = ajFalse;
7901 
7902     ajDebug("seqReadPhylip seqin->SeqData %x\n", seqin->SeqData);
7903 
7904     buff = seqin->Input->Filebuff;
7905     ajFilebuffSetBuffered(buff);    /* must buffer to test non-interleaved */
7906 
7907     if(!seqRegPhylipTop)
7908         seqRegPhylipTop = ajRegCompC("^ *([0-9]+) +([0-9]+)");
7909 
7910     if(!seqRegPhylipHead)
7911         seqRegPhylipHead = ajRegCompC("^(..........) ?"); /* 10 chars */
7912 
7913     if(!seqRegPhylipSeq)
7914         seqRegPhylipSeq = ajRegCompC("^[ \t\n\r]*$");
7915 
7916     if(!seqin->SeqData)
7917     {                                   /* start of file */
7918         seqin->Multidone = ajFalse;
7919         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7920         while (ok && ajStrIsWhite(seqReadLine))
7921             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7922 
7923         if(!ok)
7924             return ajFalse;
7925 
7926         /* ajDebug("first line:\n'%-20.20S'\n", seqReadLine);*/
7927 
7928         if(!ajRegExec(seqRegPhylipTop, seqReadLine))
7929         {                               /* first line test */
7930             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7931 
7932             return ajFalse;
7933         }
7934 
7935         ajRegSubI(seqRegPhylipTop, 1, &tmpstr);
7936         ajStrToUint(tmpstr, &iseq);
7937         ajRegSubI(seqRegPhylipTop, 2, &tmpstr);
7938         ajStrToUint(tmpstr, &len);
7939         ajStrDel(&tmpstr);
7940         /*ajDebug("first line OK: '%S' iseq: %d len: %d\n",
7941           seqReadLine, iseq, len);*/
7942 
7943         seqin->SeqData = AJNEW0(phydata);
7944         phydata->Table = phytable = ajTablestrNew(1000);
7945         phylist = ajListstrNew();
7946         seqin->Input->Filecount = 0;
7947 
7948         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
7949         ilen = 0;
7950 
7951         while(ok && (jseq < iseq))
7952         {
7953             /* first set - create table */
7954             if(!ajRegExec(seqRegPhylipHead, seqReadLine))
7955             {
7956                 ajDebug("FAIL (not seqRegPhylipHead): '%S'\n", seqReadLine);
7957                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7958                 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7959 
7960                 return ajFalse;
7961             }
7962 
7963             /* ajDebug("line: '%S'\n", seqReadLine); */
7964             AJNEW0(phyitem);
7965             ajRegSubI(seqRegPhylipHead, 1, &tmpstr);
7966             seqitemSetName(phyitem, tmpstr);
7967             ajStrDel(&tmpstr);
7968             /* ajDebug("name: '%S' => '%S'\n", tmpstr, phyitem->Name); */
7969             phyitem->Weight = 1.0;
7970             ajRegPost(seqRegPhylipHead, &seqstr);
7971             seqAppend(&phyitem->Seq, seqstr);
7972             ajStrDel(&seqstr);
7973             ilen = ajStrGetLen(phyitem->Seq);
7974 
7975             if(ilen == len)
7976                 done = ajTrue;
7977             else if(ilen > len)
7978             {
7979                 ajDebug("Phylip format: sequence %S header size %d exceeded\n",
7980                         phyitem->Name, len);
7981                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7982 
7983                 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
7984                 seqMsfItemDel(&phyitem);
7985 
7986                 ajListstrFreeData(&phylist);
7987 
7988                 return ajFalse;
7989             }
7990 
7991             if(ajStrIsWhite(phyitem->Name) ||
7992                ajTableFetchS(phytable, phyitem->Name))
7993             {
7994                 ajFilebuffSetBuffered(buff);
7995                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
7996                 ajDebug("phytable repeated name '%S'\n",
7997                         phyitem->Name);
7998 
7999                 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8000                 seqMsfItemDel(&phyitem);
8001 
8002                 ajListstrFreeData(&phylist);
8003 
8004                 return seqReadPhylipnon(thys, seqin);
8005             }
8006 
8007             ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
8008             ajListstrPushAppend(phylist, ajStrNewS(phyitem->Name));
8009             ajDebug("added '%S' list:%Lu table:%Lu\n",
8010                     phyitem->Name, ajListGetLength(phylist),
8011                     ajTableGetLength(phytable));
8012 
8013             if(!jseq)
8014                 maxlen = ilen;
8015             else
8016             {
8017                 if(ilen != maxlen)
8018                 {
8019                     ajDebug("phylip format length mismatch in header "
8020                             "iseq: %d jseq: %d ilen: %d maxlen: %d\n",
8021                             iseq, jseq, ilen, maxlen);
8022                     ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8023                     ajDebug("phytable deleted size:%Lu\n",
8024                             ajTableGetLength(phytable));
8025                     seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8026                     ajListstrFreeData(&phylist);
8027 
8028                     if(seqReadPhylipnon(thys, seqin))
8029                         return ajTrue;
8030                     else
8031                     {
8032                         ajWarn("phylip format length mismatch in header");
8033 
8034                         return ajFalse;
8035                     }
8036                 }
8037             }
8038 
8039             jseq++;
8040             /* ajDebug("first set %d: (%d) '%-20.20S'\n",
8041                jseq, ilen, seqReadLine); */
8042 
8043             if(jseq < iseq)
8044             {
8045                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8046             }
8047         }
8048 
8049         /* ajDebug("Header has %d sequences\n", jseq);*/
8050         ajListstrTrace(phylist);
8051         ajTableTrace(phytable);
8052         ajTableMap(phytable, &seqMsfTabList, NULL);
8053 
8054         phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
8055 
8056         for(i=0; i < iseq; i++)
8057         {
8058             ajListstrPop(phylist, &phydata->Names[i]);
8059             /* ajDebug("list [%d] '%S'\n", i, phydata->Names[i]); */
8060         }
8061 
8062         ajListstrFreeData(&phylist);
8063 
8064         if(ilen < len)
8065         {
8066             jseq=0;
8067 
8068             while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
8069             {                           /* now read the rest */
8070                 /* ajDebug("seqReadPhylip line '%S\n", seqReadLine); */
8071 
8072                 if(seqPhylipReadseq(seqReadLine, phytable,
8073                                     phydata->Names[jseq],
8074                                     len, &ilen, &done))
8075                 {
8076                     if(!jseq)
8077                         maxlen = ilen;
8078                     else
8079                     {
8080                         if(ilen != maxlen)
8081                         {
8082                             ajDebug("phylip format length mismatch at %d "
8083                                     "(length %d)\n",
8084                                     maxlen, ilen);
8085                             ajFilebuffSetBuffered(buff);
8086                             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8087                             seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8088                             ajDebug("File reset, try seqReadPhylipnon\n");
8089 
8090                             return seqReadPhylipnon(thys, seqin);
8091                         }
8092                     }
8093 
8094                     jseq++;
8095 
8096                     if(jseq == iseq)
8097                         jseq = 0;
8098 
8099                     if(!jseq && done)
8100                     {
8101                         /* ajDebug("seqReadPhylip set done\n"); */
8102                         break;
8103                     }
8104                     done = ajTrue;      /* for end-of-file */
8105                 }
8106             }
8107 
8108             if(!done)
8109             {
8110                 ajDebug("seqReadPhylip read failed, try seqReadPhylipnon\n");
8111                 ajFilebuffSetBuffered(buff);
8112                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8113                 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8114 
8115                 return seqReadPhylipnon(thys, seqin);
8116             }
8117 
8118             if(jseq)
8119             {
8120                 ajDebug("Phylip format %d sequences partly read at end\n",
8121                         iseq-jseq);
8122                 ajFilebuffSetBuffered(buff);
8123                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8124                 seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8125 
8126                 return seqReadPhylipnon(thys, seqin);
8127             }
8128         }
8129 
8130         ajTableMap(phytable, &seqMsfTabList, NULL);
8131         phydata->Nseq = iseq;
8132         phydata->Count = 0;
8133         phydata->Bufflines = ajTextinGetRecords(seqin->Input);
8134         /* ajDebug("PHYLIP format read %d lines\n",
8135                    ajTextinGetRecords(seqin->Input));*/
8136     }
8137 
8138     phydata = seqin->SeqData;
8139     phytable = phydata->Table;
8140 
8141     i = phydata->Count;
8142     /* ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]); */
8143     readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
8144     ajStrAssignS(&thys->Name, phydata->Names[i]);
8145 
8146     thys->Weight = readphyitem->Weight;
8147     ajStrAssignS(&thys->Seq, readphyitem->Seq);
8148 
8149     phydata->Count++;
8150 
8151     if(phydata->Count >= phydata->Nseq)
8152     {
8153         seqin->Multidone = ajTrue;
8154         ajDebug("seqReadPhylip multidone\n");
8155         ajFilebuffClear(seqin->Input->Filebuff, 0);
8156         seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
8157     }
8158 
8159     seqMsfDataTrace(seqin->SeqData);
8160 
8161     return ajTrue;
8162 }
8163 
8164 
8165 
8166 
8167 /* @funcstatic seqPhylipReadseq ***********************************************
8168 **
8169 ** Reads sequence from the input line, and appends the sequence data
8170 ** to the named sequence in the phytable structure.
8171 **
8172 ** @param [r] rdline [const AjPStr] Line from input file.
8173 ** @param [r] phytable [const AjPTable] MSF format sequence table.
8174 ** @param [r] token [const AjPStr] Name of sequence so it can append
8175 ** @param [r] len [ajuint] Final length of each sequence (from file header)
8176 ** @param [w] ilen [ajuint*] Length of each sequence so far
8177 ** @param [w] done [AjBool*] ajTrue if sequence was completed
8178 ** @return [AjBool] ajTrue on success
8179 **
8180 ** @release 1.0.0
8181 ** @@
8182 ******************************************************************************/
8183 
seqPhylipReadseq(const AjPStr rdline,const AjPTable phytable,const AjPStr token,ajuint len,ajuint * ilen,AjBool * done)8184 static AjBool seqPhylipReadseq(const AjPStr rdline, const AjPTable phytable,
8185                                const AjPStr token,
8186                                ajuint len, ajuint* ilen, AjBool* done)
8187 {
8188     SeqPMsfItem phyitem;
8189 
8190     *done = ajFalse;
8191 
8192     if(!seqRegPhylipSeq2)
8193         seqRegPhylipSeq2 = ajRegCompC("[^ \t\n\r]");
8194 
8195     if(!ajRegExec(seqRegPhylipSeq2, rdline))
8196         return ajFalse;
8197 
8198     phyitem = ajTableFetchmodS(phytable, token);
8199 
8200     if(!phyitem)
8201     {
8202         ajDebug("seqPhylipReadseq failed to find '%S' in phytable\n",
8203                 token);
8204 
8205         return ajFalse;
8206     }
8207 
8208     seqAppend(&phyitem->Seq, rdline);
8209     *ilen = ajStrGetLen(phyitem->Seq);
8210 
8211     if(*ilen == len)
8212         *done = ajTrue;
8213     else if(*ilen > len)
8214     {
8215         ajDebug("Phylip format error, sequence %S length %d exceeded\n",
8216                 token, len);
8217 
8218         return ajFalse;
8219     }
8220 
8221     ajDebug("seqPhylipReadSeq '%S' len: %d ilen: %d done: %B\n",
8222             token, len, *ilen, *done);
8223 
8224     return ajTrue;
8225 }
8226 
8227 
8228 
8229 
8230 /* @funcstatic seqReadHennig86 ************************************************
8231 **
8232 ** Tries to read input in Hennig86 format.
8233 **
8234 ** @param [w] thys [AjPSeq] Sequence object
8235 ** @param [u] seqin [AjPSeqin] Sequence input object
8236 ** @return [AjBool] ajTrue on success
8237 **
8238 ** @release 1.0.0
8239 ** @@
8240 ******************************************************************************/
8241 
seqReadHennig86(AjPSeq thys,AjPSeqin seqin)8242 static AjBool seqReadHennig86(AjPSeq thys, AjPSeqin seqin)
8243 {
8244     AjPStr seqstr = NULL;
8245     AjPStr tmpstr = NULL;
8246     AjBool ok       = ajFalse;
8247     ajuint iseq      = 0;
8248     ajuint len       = 0;
8249     AjPTable fmttable   = NULL;
8250     SeqPMsfItem fmtitem = NULL;
8251     const SeqPMsfItem readfmtitem = NULL;
8252     AjPList fmtlist     = NULL;
8253     SeqPMsfData fmtdata = NULL;
8254     char *cp;
8255 
8256     ajuint i;
8257     ajuint jseq = 0;
8258 
8259     ajDebug("seqReadHennig86 seqin->SeqData %x\n", seqin->SeqData);
8260 
8261     if(!seqRegHennigHead)
8262         seqRegHennigHead = ajRegCompC("[^1-4? \t]");
8263 
8264     if(!seqRegHennigTop)
8265         seqRegHennigTop = ajRegCompC("^ *([0-9]+) +([0-9]+)");
8266 
8267     if(!seqRegHennigBlank)
8268         seqRegHennigBlank = ajRegCompC("^[ \t\n\r]*$");
8269 
8270     if(!seqRegHennigSeq)
8271         seqRegHennigSeq = ajRegCompC("^([^ \t\n\r]+)");
8272 
8273     if(!seqin->SeqData)
8274     {
8275         /* start: load in file */
8276         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8277         if(!ok)
8278             return ajFalse;
8279 
8280         ajDebug("first line:\n'%S'\n", seqReadLine);
8281 
8282         if(!ajStrPrefixC(seqReadLine, "xread"))
8283         {
8284             /* first line test */
8285             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8286 
8287             return ajFalse;
8288         }
8289 
8290         ajDebug("first line OK: '%S'\n", seqReadLine);
8291 
8292         /* skip title line */
8293         for(i=0; i<2; i++)
8294         {
8295             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8296             if(!ok)
8297             {
8298                 ajDebug("FAIL (bad header)\n");
8299                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8300 
8301                 return ajFalse;
8302             }
8303         }
8304 
8305         if(!ajRegExec(seqRegHennigTop, seqReadLine))    /* first line test */
8306             return ajFalse;
8307 
8308         ajRegSubI(seqRegHennigTop, 1, &tmpstr);
8309         ajStrToUint(tmpstr, &iseq);
8310         ajRegSubI(seqRegHennigTop, 2, &tmpstr);
8311         ajStrToUint(tmpstr, &len);
8312         ajDebug("first line OK: '%S' iseq: %d len: %d\n",
8313                 seqReadLine, iseq, len);
8314         ajStrDel(&tmpstr);
8315 
8316         seqin->SeqData = AJNEW0(fmtdata);
8317         fmtdata->Table = fmttable = ajTablestrNew(1000);
8318         fmtlist = ajListstrNew();
8319         seqin->Input->Filecount = 0;
8320 
8321         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8322 
8323         while(ok && (jseq < iseq))
8324         {                               /* first set - create table */
8325             if(!ajRegExec(seqRegHennigHead, seqReadLine))
8326             {
8327                 ajDebug("FAIL (not seqRegHennigHead): '%S'\n", seqReadLine);
8328 
8329                 return ajFalse;
8330             }
8331 
8332             AJNEW0(fmtitem);
8333             ajStrAssignS(&fmtitem->Name, seqReadLine);
8334             fmtitem->Weight = 1.0;
8335             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8336             while(ok && ajRegExec(seqRegHennigSeq, seqReadLine))
8337             {
8338                 ajRegPost(seqRegHennigSeq, &seqstr);
8339 
8340                 for(cp = ajStrGetuniquePtr(&seqstr); cp; cp++)
8341                     switch(*cp)
8342                     {
8343                         case 0: *cp = 'A';break;
8344                         case 1: *cp = 'T';break;
8345                         case 2: *cp = 'G';break;
8346                         case 3: *cp = 'C';break;
8347                         default: *cp = '.';break;
8348                     }
8349 
8350                 seqAppend(&fmtitem->Seq, seqstr);
8351             }
8352 
8353             ajStrDel(&seqstr);
8354 
8355             ajTablePut(fmttable, ajStrNewS(fmtitem->Name), fmtitem);
8356             ajListstrPushAppend(fmtlist, ajStrNewS(fmtitem->Name));
8357             jseq++;
8358             ajDebug("first set %d: '%S'\n", jseq, seqReadLine);
8359 
8360             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8361         }
8362 
8363         ajDebug("Header has %d sequences\n", iseq);
8364         ajListstrTrace(fmtlist);
8365         ajTableTrace(fmttable);
8366         ajTableMap(fmttable, &seqMsfTabList, NULL);
8367 
8368         fmtdata->Names = AJCALLOC(iseq, sizeof(*fmtdata->Names));
8369 
8370         for(i=0; i < iseq; i++)
8371         {
8372             ajListstrPop(fmtlist, &fmtdata->Names[i]);
8373             ajDebug("list [%d] '%S'\n", i, fmtdata->Names[i]);
8374         }
8375 
8376         ajListstrFreeData(&fmtlist);
8377 
8378         while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
8379         {                               /* now read the rest */
8380             seqHennig86Readseq(seqReadLine, fmttable);
8381         }
8382 
8383         ajTableMap(fmttable, &seqMsfTabList, NULL);
8384         fmtdata->Nseq = iseq;
8385         fmtdata->Count = 0;
8386         fmtdata->Bufflines = ajTextinGetRecords(seqin->Input);
8387         ajDebug("... format read %d lines\n",
8388                 ajTextinGetRecords(seqin->Input));
8389     }
8390 
8391     /* processing entries */
8392 
8393     fmtdata = seqin->SeqData;
8394     fmttable = fmtdata->Table;
8395 
8396     if(fmtdata->Count >=fmtdata->Nseq)
8397     {                                   /* all done */
8398         ajFilebuffClear(seqin->Input->Filebuff, 0);
8399         ajTableMapDel(fmttable, &seqMsfTabDel, NULL);
8400         ajTableFree(&fmttable);
8401         AJFREE(fmtdata->Names);
8402         AJFREE(fmtdata);
8403         seqin->SeqData = NULL;
8404 
8405         return ajFalse;
8406     }
8407 
8408     i = fmtdata->Count;
8409     ajDebug("returning [%d] '%S'\n", i, fmtdata->Names[i]);
8410     readfmtitem = ajTableFetchS(fmttable, fmtdata->Names[i]);
8411     ajStrAssignS(&thys->Name, fmtdata->Names[i]);
8412 
8413     thys->Weight = readfmtitem->Weight;
8414     ajStrAssignS(&thys->Seq, readfmtitem->Seq);
8415 
8416     fmtdata->Count++;
8417 
8418     return ajTrue;
8419 }
8420 
8421 
8422 
8423 
8424 /* @funcstatic seqHennig86Readseq *********************************************
8425 **
8426 ** Reads sequence name from first token on the input line, and appends
8427 ** the sequence data to that sequence in the fmttable structure.
8428 **
8429 ** @param [r] rdline [const AjPStr] Line from input file.
8430 ** @param [r] msftable [const AjPTable] MSF format sequence table.
8431 ** @return [AjBool] ajTrue on success
8432 **
8433 ** @release 1.0.0
8434 ** @@
8435 ******************************************************************************/
8436 
seqHennig86Readseq(const AjPStr rdline,const AjPTable msftable)8437 static AjBool seqHennig86Readseq(const AjPStr rdline, const AjPTable msftable)
8438 {
8439     SeqPMsfItem msfitem;
8440     AjPStr token  = NULL;
8441     AjPStr seqstr = NULL;
8442 
8443     if(!seqRegHennigSeq)
8444         seqRegHennigSeq = ajRegCompC("^[^ \t\n\r]+"); /* must be line start */
8445 
8446     if(!ajRegExec(seqRegHennigSeq, rdline))
8447         return ajFalse;
8448 
8449     ajRegSubI(seqRegHennigSeq, 0, &token);
8450     msfitem = ajTableFetchmodS(msftable, token);
8451     ajStrDel(&token);
8452 
8453     if(!msfitem)
8454         return ajFalse;
8455 
8456     ajRegPost(seqRegHennigSeq, &seqstr);
8457     seqAppend(&msfitem->Seq, seqstr);
8458 
8459     ajStrDel(&seqstr);
8460 
8461     return ajTrue;
8462 }
8463 
8464 
8465 
8466 
8467 /* @funcstatic seqReadTreecon *************************************************
8468 **
8469 ** Tries to read input in Treecon format.
8470 **
8471 ** Treecon is a windows program for tree drawing.
8472 **
8473 ** Van de Peer, Y., De Wachter, R. (1994)
8474 ** TREECON for Windows: a software package for the construction and
8475 ** drawing of evolutionary trees for the Microsoft Windows environment.
8476 ** Comput. Applic. Biosci. 10, 569-570.
8477 **
8478 ** @param [w] thys [AjPSeq] Sequence object
8479 ** @param [u] seqin [AjPSeqin] Sequence input object
8480 ** @return [AjBool] ajTrue on success
8481 **
8482 ** @release 2.0.0
8483 ** @@
8484 ******************************************************************************/
8485 
seqReadTreecon(AjPSeq thys,AjPSeqin seqin)8486 static AjBool seqReadTreecon(AjPSeq thys, AjPSeqin seqin)
8487 {
8488     AjPStr tmpstr = NULL;
8489     AjBool ok       = ajFalse;
8490     ajint len       = 0;
8491     ajint ilen      = 0;
8492     ajuint iseq;
8493     ajuint i;
8494 
8495     AjPTable phytable        = NULL;
8496     SeqPMsfItem phyitem      = NULL;
8497     const SeqPMsfItem readphyitem = NULL;
8498     AjPList phylist          = NULL;
8499     SeqPMsfData phydata      = NULL;
8500 
8501     if(!seqRegTreeconTop)
8502         seqRegTreeconTop = ajRegCompC("^ *([0-9]+)");
8503 
8504     if(!seqin->SeqData)                 /* first time - read the data */
8505     {
8506         iseq = 0;
8507         seqin->Multidone = ajFalse;
8508         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8509         if(!ok)
8510             return ajFalse;
8511 
8512         if(!ajRegExec(seqRegTreeconTop, seqReadLine))
8513         {                               /* first line test */
8514             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8515 
8516             return ajFalse;
8517         }
8518 
8519         ajRegSubI(seqRegTreeconTop, 1, &tmpstr);
8520         ajStrToInt(tmpstr, &len);
8521         ajDebug("first line OK: len: %d\n",
8522                 len);
8523         ajStrDel(&tmpstr);
8524 
8525         seqin->SeqData = AJNEW0(phydata);
8526         phydata->Table = phytable = ajTablestrNew(1000);
8527         phylist = ajListstrNew();
8528         seqin->Input->Filecount = 0;
8529 
8530         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8531         ilen = UINT_MAX;
8532 
8533         if(!ok)
8534         {
8535             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8536             seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8537 
8538             return ajFalse;
8539         }
8540 
8541         while (ok)
8542         {
8543             if (ilen < 0)
8544             {
8545                 ajStrRemoveWhiteExcess(&seqReadLine);
8546 
8547                 if (!ajStrGetLen(seqReadLine))   /* empty line after sequence */
8548                 {
8549                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8550                     continue;
8551                 }
8552 
8553                 AJNEW0(phyitem);
8554                 phyitem->Weight = 1.0;
8555                 seqitemSetName(phyitem, seqReadLine);
8556                 ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
8557                 ajListstrPushAppend(phylist, ajStrNewS(phyitem->Name));
8558                 iseq++;
8559                 ilen = 0;
8560             }
8561             else
8562             {
8563                 ajStrRemoveWhite(&seqReadLine);
8564                 ilen += ajStrGetLen(seqReadLine);
8565                 seqAppend(&phyitem->Seq, seqReadLine);
8566 
8567                 if (ilen > len)
8568                 {
8569                     ajDebug("Treecon format: '%S' too long, read %d/%d\n",
8570                             phyitem->Name, ilen, len);
8571                     ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8572                     seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8573 
8574                     return ajFalse;
8575                 }
8576 
8577                 if (ilen == len)
8578                     ilen = -1;
8579             }
8580 
8581             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8582         }
8583 
8584         if (ilen >= 0)
8585         {
8586             ajDebug("Treecon format: unfinished sequence '%S' read %d/%d\n",
8587                     phyitem->Name, ilen, len);
8588             seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8589 
8590             return ajFalse;
8591         }
8592 
8593         phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
8594 
8595         for(i=0; i < iseq; i++)
8596         {
8597             ajListstrPop(phylist, &phydata->Names[i]);
8598             ajDebug("list [%d] '%S'\n", i, phydata->Names[i]);
8599         }
8600 
8601         ajListstrFreeData(&phylist);
8602         phydata->Nseq = iseq;
8603         phydata->Count = 0;
8604         phydata->Bufflines = ajTextinGetRecords(seqin->Input);
8605         ajDebug("Treecon format read %d lines\n",
8606                 ajTextinGetRecords(seqin->Input));
8607     }
8608 
8609     phydata = seqin->SeqData;
8610     phytable = phydata->Table;
8611 
8612     i = phydata->Count;
8613     ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
8614     readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
8615     ajStrAssignS(&thys->Name, phydata->Names[i]);
8616 
8617     thys->Weight = readphyitem->Weight;
8618     ajStrAssignS(&thys->Seq, readphyitem->Seq);
8619 
8620     phydata->Count++;
8621 
8622     if(phydata->Count >=phydata->Nseq)
8623     {
8624         seqin->Multidone = ajTrue;
8625         ajDebug("seqReadTreecon multidone\n");
8626         ajFilebuffClear(seqin->Input->Filebuff, 0);
8627         seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8628     }
8629 
8630     return ajTrue;
8631 }
8632 
8633 
8634 
8635 
8636 /* @funcstatic seqReadJackknifer **********************************************
8637 **
8638 ** Tries to read input in Jackknifer format.
8639 **
8640 ** The Jackknifer program by Farris is a parsimony program that also
8641 ** implements the jackknife method to test the reliability of branches.
8642 ** The format is similar to the MEGA format.
8643 **
8644 ** On the first line a title/description is placed in between single quotes.
8645 ** The alignment can be written in sequential or interleaved format,
8646 ** but the sequence names have to be placed between brackets.
8647 ** Also no blanks are allowed in the names.
8648 ** They should be replaced by underscores ( _ ).
8649 ** The file is ended by a semicolon.
8650 **
8651 ** @param [w] thys [AjPSeq] Sequence object
8652 ** @param [u] seqin [AjPSeqin] Sequence input object
8653 ** @return [AjBool] ajTrue on success
8654 **
8655 ** @release 2.0.0
8656 ** @@
8657 ******************************************************************************/
8658 
seqReadJackknifer(AjPSeq thys,AjPSeqin seqin)8659 static AjBool seqReadJackknifer(AjPSeq thys, AjPSeqin seqin)
8660 {
8661     AjPStr tmpstr = NULL;
8662     AjPStr tmpname = NULL;
8663     AjBool ok       = ajFalse;
8664     ajuint iseq;
8665     ajuint i;
8666 
8667     AjPTable phytable        = NULL;
8668     SeqPMsfItem phyitem      = NULL;
8669     const SeqPMsfItem readphyitem = NULL;
8670     AjPList phylist          = NULL;
8671     SeqPMsfData phydata      = NULL;
8672 
8673     if(!seqRegJackTop)
8674         seqRegJackTop = ajRegCompC("^'(.*)'\\s*$");
8675 
8676     if(!seqRegJackSeq)
8677         seqRegJackSeq = ajRegCompC("^[(]([^)]+)(.*)$");
8678 
8679     if(!seqin->SeqData)                 /* first time - read the data */
8680     {
8681         iseq = 0;
8682         seqin->Multidone = ajFalse;
8683         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8684         if(!ok)
8685             return ajFalse;
8686 
8687         if(!ajRegExec(seqRegJackTop, seqReadLine))
8688         {                               /* first line test */
8689             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8690 
8691             return ajFalse;
8692         }
8693 
8694         ajDebug("JackKnifer format: First line ok '%S'\n", seqReadLine);
8695 
8696         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8697 
8698         seqin->SeqData = AJNEW0(phydata);
8699         phydata->Table = phytable = ajTablestrNew(1000);
8700         phylist = ajListstrNew();
8701         seqin->Input->Filecount = 0;
8702 
8703         while (ok)
8704         {
8705             if (!ajStrGetLen(seqReadLine))      /* empty line after sequence */
8706             {
8707                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8708                 continue;
8709             }
8710 
8711             if (ajStrPrefixC(seqReadLine, ";"))
8712                 break;                  /* done */
8713 
8714             if (ajStrPrefixC(seqReadLine, "("))
8715             {
8716                 if (!ajRegExec(seqRegJackSeq, seqReadLine))
8717                 {
8718                     ajDebug("JackKnifer format: bad (id) line\n");
8719                     seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8720 
8721                     return ajFalse;
8722                 }
8723 
8724                 ajRegSubI(seqRegJackSeq, 1, &tmpstr);
8725                 seqnameSetName(&tmpname, tmpstr);
8726                 phyitem = ajTableFetchmodS(phytable, tmpname);
8727 
8728                 if (!phyitem)
8729                 {
8730                     ajDebug("JackKnifer format: new (id) '%S'\n", tmpname);
8731                     AJNEW0(phyitem);
8732                     phyitem->Weight = 1.0;
8733                     ajStrAssignS(&phyitem->Name,tmpname);
8734                     ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
8735                     ajListstrPushAppend(phylist, ajStrNewS(phyitem->Name));
8736                     iseq++;
8737                 }
8738                 else
8739                     ajDebug("JackKnifer format: More for (id) '%S'\n",
8740                             tmpname);
8741 
8742                 ajRegSubI(seqRegJackSeq, 2, &tmpstr);
8743                 ajStrAssignS(&seqReadLine, tmpstr);
8744             }
8745 
8746             seqAppend(&phyitem->Seq, seqReadLine);
8747 
8748             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8749         }
8750 
8751         phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
8752 
8753         for(i=0; i < iseq; i++)
8754         {
8755             ajListstrPop(phylist, &phydata->Names[i]);
8756             ajDebug("list [%d] '%S'\n", i, phydata->Names[i]);
8757         }
8758 
8759         ajListstrFreeData(&phylist);
8760         phydata->Nseq = iseq;
8761         phydata->Count = 0;
8762         phydata->Bufflines = ajTextinGetRecords(seqin->Input);
8763         ajDebug("JackKnifer format read %d lines\n",
8764                 ajTextinGetRecords(seqin->Input));
8765     }
8766 
8767     ajStrDel(&tmpstr);
8768     ajStrDel(&tmpname);
8769 
8770     phydata = seqin->SeqData;
8771     phytable = phydata->Table;
8772 
8773     i = phydata->Count;
8774     ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
8775     readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
8776     ajStrAssignS(&thys->Name, phydata->Names[i]);
8777     ajStrDel(&phydata->Names[i]);
8778 
8779     thys->Weight = readphyitem->Weight;
8780     ajStrAssignS(&thys->Seq, readphyitem->Seq);
8781 
8782     phydata->Count++;
8783 
8784     if(phydata->Count >=phydata->Nseq)
8785     {
8786         seqin->Multidone = ajTrue;
8787         ajDebug("seqReadJackKnifer multidone\n");
8788         ajFilebuffClear(seqin->Input->Filebuff, 0);
8789         seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8790     }
8791 
8792     return ajTrue;
8793 }
8794 
8795 
8796 
8797 
8798 /* @funcstatic seqReadNexus ***************************************************
8799 **
8800 ** Tries to read input in Nexus format.
8801 **
8802 ** Nexus files contain many things.
8803 ** All Nexus files begin with a #NEXUS line
8804 ** Data is in begin ... end blocks
8805 ** Sequence data is in a "begin character" block
8806 **
8807 ** @param [w] thys [AjPSeq] Sequence object
8808 ** @param [u] seqin [AjPSeqin] Sequence input object
8809 ** @return [AjBool] ajTrue on success
8810 **
8811 ** @release 2.0.0
8812 ** @@
8813 ******************************************************************************/
8814 
seqReadNexus(AjPSeq thys,AjPSeqin seqin)8815 static AjBool seqReadNexus(AjPSeq thys, AjPSeqin seqin)
8816 {
8817     AjBool ok       = ajFalse;
8818     ajuint i;
8819     ajuint j;
8820     AjPFilebuff buff;
8821     AjPStr* seqs = NULL;
8822     AjPStr* names = NULL;
8823     AjPNexus nexus = NULL;
8824 
8825     SeqPMsfData phydata      = NULL;
8826 
8827     buff = seqin->Input->Filebuff;
8828 
8829     if(!seqin->SeqData)                 /* first time - read the data */
8830     {
8831         seqin->Multidone = ajFalse;
8832 
8833         ajFilebuffSetBuffered(buff);
8834 
8835         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8836         ajDebug("Nexus format: Testing first line '%S'\n", seqReadLine);
8837 
8838         if(!ok)
8839             return ajFalse;
8840 
8841         if(!ajStrPrefixCaseC(seqReadLine, "#NEXUS"))
8842         {                               /* first line test */
8843             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8844             return ajFalse;
8845         }
8846 
8847         ajDebug("Nexus format: First line ok '%S'\n", seqReadLine);
8848 
8849         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8850 
8851         while(ok && !ajStrPrefixCaseC(seqReadLine, "#NEXUS"))
8852             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
8853 
8854         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8855 
8856         AJNEW0(phydata);
8857         phydata->Nexus = ajNexusParse(buff);
8858 
8859         if (!phydata->Nexus)
8860         {
8861             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
8862             ajDebug("Failed to parse in nexus format\n");
8863 
8864             return ajFalse;
8865         }
8866 
8867         phydata->Count = 0;
8868         phydata->Nseq = ajNexusGetNtaxa(phydata->Nexus);
8869         /* GetTaxa may fail if names are only defined in the sequences */
8870         seqs = ajNexusGetTaxa(phydata->Nexus);
8871         phydata->Names = AJCALLOC(phydata->Nseq, sizeof(*phydata->Names));
8872 
8873         if(seqs)
8874         {
8875             for(j=0;j<phydata->Nseq;j++)
8876                 ajStrAssignS(&phydata->Names[j], seqs[j]);
8877         }
8878 
8879         seqin->SeqData = phydata;
8880         ajDebug("Nexus parsed %d sequences\n", phydata->Nseq);
8881     }
8882 
8883     phydata = seqin->SeqData;
8884     nexus = phydata->Nexus;
8885 
8886     i = phydata->Count;
8887 
8888     seqs = ajNexusGetSequences(nexus);
8889     if (!seqs)
8890     {
8891         seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8892 
8893         return ajFalse;
8894     }
8895 
8896     thys->Weight = 1.0;
8897     ajStrAssignS(&thys->Seq, seqs[i]);
8898 
8899     if (!phydata->Names)
8900         phydata->Names = AJCALLOC(phydata->Nseq, sizeof(*phydata->Names));
8901 
8902     if (!phydata->Names[0])             /* finally set from the sequences */
8903     {
8904         names = ajNexusGetTaxa(phydata->Nexus);
8905 
8906         for(j=0;j<phydata->Nseq;j++)
8907             ajStrAssignS(&phydata->Names[j], names[j]);
8908     }
8909 
8910     ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
8911 
8912     ajStrAssignS(&thys->Name, phydata->Names[i]);
8913 
8914     phydata->Count++;
8915 
8916     if(phydata->Count >= phydata->Nseq)
8917     {
8918         seqin->Multidone = ajTrue;
8919         ajDebug("seqReadNexus multidone\n");
8920         ajFilebuffClear(seqin->Input->Filebuff, 0);
8921         seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
8922     }
8923 
8924     return ajTrue;
8925 }
8926 
8927 
8928 
8929 
8930 /* @funcstatic seqReadMega ****************************************************
8931 **
8932 ** Tries to read input in Mega interleaved or non-interleaved format.
8933 **
8934 ** The Molecular Evolutionary Genetic Analysis program by
8935 ** Kumar, Tamura & Nei is a tree construction program
8936 ** based on distance- and parsimony methods.
8937 **
8938 ** http://evolgen.biol.metro-u.ac.jp/MEGA/manual/DataFormat.html
8939 **
8940 ** @param [w] thys [AjPSeq] Sequence object
8941 ** @param [u] seqin [AjPSeqin] Sequence input object
8942 ** @return [AjBool] ajTrue on success
8943 **
8944 ** @release 2.0.0
8945 ** @@
8946 ******************************************************************************/
8947 
seqReadMega(AjPSeq thys,AjPSeqin seqin)8948 static AjBool seqReadMega(AjPSeq thys, AjPSeqin seqin)
8949 {
8950     AjPStr tmpstr = NULL;
8951     AjPStr tmpdesc = NULL;
8952     AjPStr tmpname = NULL;
8953     AjPStr prestr = NULL;
8954     AjPStr poststr = NULL;
8955     AjBool ok       = ajFalse;
8956     ajuint iseq = 0;
8957     ajuint i;
8958     AjBool istitle = ajFalse;
8959     AjBool isformat = ajFalse;
8960     AjBool iscommand = ajFalse;
8961     AjBool resume = ajFalse;
8962     AjPStr genestr = NULL;
8963     AjPStr domainstr = NULL;
8964     AjPStr nextgenestr = NULL;
8965     AjPStr nextdomainstr = NULL;
8966 
8967     ajlong ipos;
8968     ajlong istart;
8969     ajlong ilast;
8970     char ichar;
8971 
8972     AjPStr formatType = NULL;
8973     AjPStr formatValue = NULL;
8974 
8975     char identchar = '.';
8976     char indelchar = '-';
8977     char misschar = '?';
8978     char seqtype = ' ';
8979 
8980     char* cp;
8981     const char *cq;
8982 
8983     AjPTable phytable        = NULL;
8984     SeqPMsfItem phyitem      = NULL;
8985     const SeqPMsfItem readphyitem = NULL;
8986     const SeqPMsfItem firstitem    = NULL;
8987     AjPList phylist          = NULL;
8988     SeqPMsfData phydata      = NULL;
8989 
8990     AjPSeqGene seqgene = NULL;
8991 
8992     if(!seqRegMegaCommand)
8993         seqRegMegaCommand = ajRegCompC("([^ =!]+)=([^ ;]+)");
8994 
8995     if(!seqRegMegaFeat)
8996         seqRegMegaFeat = ajRegCompC("^(.*)\"[^\"]*\"(.*)$");
8997 
8998     if(!seqRegMegaSeq)
8999         seqRegMegaSeq = ajRegCompC("^#([^ \t\n\r]+)(.*)$");
9000 
9001     if(seqin->SeqData)
9002     {
9003         phydata = seqin->SeqData;
9004         if(seqin->Multidone)
9005             resume = phydata->Resume;
9006     }
9007 
9008     if(!seqin->SeqData ||          /* first time - read the data */
9009        (seqin->Multidone && resume)) /* resuming gene/domain block */
9010     {
9011         iseq = 0;
9012         seqin->Multidone = ajFalse;
9013 
9014         if(!seqin->SeqData)
9015         {
9016             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9017             ajDebug("Mega format: Testing first line '%S'\n", seqReadLine);
9018 
9019             if(!ok)
9020                 return ajFalse;
9021 
9022             if(!ajStrPrefixCaseC(seqReadLine, "#MEGA"))
9023             {                           /* first line test */
9024                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9025 
9026                 return ajFalse;
9027             }
9028 
9029             ajDebug("Mega format: First line ok '%S'\n", seqReadLine);
9030 
9031             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9032             if(!ok)
9033                 return ajFalse;
9034 
9035             if(!ajStrPrefixCaseC(seqReadLine, "TITLE") &&
9036                !ajStrPrefixCaseC(seqReadLine, "!TITLE"))
9037             {                           /* first line test */
9038                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9039 
9040                 return ajFalse;
9041             }
9042             ajStrAssignSubS(&tmpdesc, seqReadLine, 6, -1);
9043             ajStrTrimStartC(&tmpdesc, ": \t");
9044             ajStrTrimEndC(&tmpdesc, "; \t\n\r");
9045 
9046             if(ajStrGetCharFirst(seqReadLine) == '!')
9047             {
9048                 istitle = ajTrue;
9049                 if(ajStrFindAnyK(seqReadLine, ';') != -1)
9050                     istitle = ajFalse;
9051             }
9052 
9053             ajDebug("Mega format: Second line ok '%S'\n", seqReadLine);
9054 
9055             isformat = ajFalse;
9056 
9057             while(ok && !ajStrPrefixC(seqReadLine, "#"))
9058 
9059             {                           /* skip comments in header */
9060                 if(iscommand)
9061                 {
9062                     if(ajStrFindAnyK(seqReadLine, ';') != -1)
9063                         iscommand = ajFalse;
9064                 }
9065 
9066                 else if(istitle)
9067                 {
9068                     ajStrAssignS(&tmpstr, seqReadLine);
9069                     ajStrTrimStartC(&tmpstr, ": \t");
9070                     ajStrTrimEndC(&tmpstr, "; \t\n\r");
9071                     ajStrAppendK(&tmpdesc, ' ');
9072                     ajStrAppendS(&tmpdesc, tmpstr);
9073                     if(ajStrFindAnyK(seqReadLine, ';') != -1)
9074                         istitle = ajFalse;
9075                 }
9076 
9077                 else
9078                 {
9079                     if(ajStrPrefixCaseC(seqReadLine, "!FORMAT"))
9080                         isformat = ajTrue;
9081 
9082                     if(isformat)
9083                     {
9084                         ajDebug("Format line: %S", seqReadLine);
9085                         ajStrAssignS(&tmpstr, seqReadLine);
9086 
9087                         while(ajRegExec(seqRegMegaCommand, tmpstr))
9088                         {
9089                             ajRegSubI(seqRegMegaCommand, 1, &formatType);
9090                             ajRegSubI(seqRegMegaCommand, 2, &formatValue);
9091                             if(ajStrPrefixCaseC(formatType, "indel"))
9092                                 indelchar = ajStrGetCharFirst(formatValue);
9093                             if(ajStrPrefixCaseC(formatType, "ident"))
9094                                 identchar = ajStrGetCharFirst(formatValue);
9095                             if(ajStrPrefixCaseC(formatType, "match"))
9096                                 identchar = ajStrGetCharFirst(formatValue);
9097                             if(ajStrPrefixCaseC(formatType, "miss"))
9098                                 misschar = ajStrGetCharFirst(formatValue);
9099                             if(ajStrPrefixCaseC(formatType, "DataType"))
9100                                 seqtype = ajStrGetCharFirst(formatValue);
9101                             ajRegPost(seqRegMegaCommand, &tmpstr);
9102                             ajDebug("'%S' = '%S' (%S) indel '%c' ident '%c' "
9103                                     "missing  '%c'\n",
9104                                     formatType, formatValue, tmpstr,
9105                                     indelchar, identchar, misschar);
9106                         }
9107 
9108                         if(ajStrFindAnyK(seqReadLine, ';') == -1)
9109                             isformat = ajFalse;
9110                     }
9111 
9112                     else
9113                     {
9114                         if(ajStrGetCharFirst(seqReadLine) == '!')
9115                         {
9116                             ajStrAssignS(&tmpstr, seqReadLine);
9117                             while(ajRegExec(seqRegMegaCommand, tmpstr))
9118                             {
9119                                 ajRegSubI(seqRegMegaCommand, 1, &formatType);
9120                                 ajRegSubI(seqRegMegaCommand, 2, &formatValue);
9121                                 if(ajStrMatchCaseC(formatType, "gene"))
9122                                 {
9123                                     ajStrAssignS(&genestr, formatValue);
9124                                     ajDebug("command: Gene='%S'\n",
9125                                             formatValue);
9126                                 }
9127 
9128                                 if(ajStrMatchCaseC(formatType, "domain"))
9129                                 {
9130                                     ajStrAssignS(&domainstr, formatValue);
9131                                     ajDebug("command: Domain='%S'\n",
9132                                             formatValue);
9133                                 }
9134                                 ajRegPost(seqRegMegaCommand, &tmpstr);
9135                             }
9136 
9137                             if(ajStrFindAnyK(seqReadLine, ';') == -1)
9138                                 iscommand = ajTrue;
9139                         }
9140                     }
9141                 }
9142 
9143                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9144 
9145             }
9146             ajStrDel(&tmpstr);
9147 
9148             if(isformat || istitle || iscommand)
9149             {
9150                 ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9151 
9152                 return ajFalse;
9153             }
9154 
9155             /*
9156             ** read through looking for #id
9157             ** Some day we could stop at #mega and read multiple files
9158             */
9159 
9160             seqin->SeqData = AJNEW0(phydata);
9161             phydata->Table = phytable = ajTablestrNew(1000);
9162             phylist = ajListstrNew();
9163             seqin->Input->Filecount = 0;
9164 
9165             phydata->Identchar = identchar;
9166             phydata->Indelchar = indelchar;
9167             phydata->Misschar = misschar;
9168             phydata->Seqtype = seqtype;
9169         }
9170 
9171         /*
9172         ** Resume from here
9173         */
9174 
9175         if(resume)
9176         {
9177             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9178 
9179             resume = ajFalse;
9180             phydata->Resume = ajFalse;
9181             ajTableMapDel(phydata->Table, &seqMsfTabDel, NULL);
9182             phylist = ajListstrNew();
9183             phytable = phydata->Table;
9184             ajStrAssignS(&phydata->Gene, phydata->NextGene);
9185             ajStrAssignS(&phydata->Domain, phydata->NextDomain);
9186             ajStrAssignClear(&phydata->NextGene);
9187             ajStrAssignClear(&phydata->NextDomain);
9188         }
9189 
9190         while (ok)
9191         {
9192             ipos = ajStrFindAnyC(seqReadLine, "[]");
9193             istart = 0;
9194             ichar = ' ';
9195 
9196             while((ipos != -1) ||
9197                   (phydata->CommentDepth &&
9198                    (istart < (ajint) ajStrGetLen(seqReadLine))))
9199             {
9200                 ilast = ipos;
9201 
9202                 if(ipos > -1)
9203                     ichar = ajStrGetCharPos(seqReadLine, ipos);
9204 
9205                 if(!phydata->CommentDepth)
9206                 {
9207                     istart = ipos;
9208                 }
9209 
9210                 if(ichar == '[')
9211                     phydata->CommentDepth++;
9212                 else if((ichar == ']') && phydata->CommentDepth)
9213                     phydata->CommentDepth--;
9214 
9215                 ajStrCutRange(&seqReadLine, istart, ilast);
9216                 ipos = ajStrFindAnyC(seqReadLine, "[]");
9217                 ichar = ' ';
9218             }
9219 
9220             /* empty line after a sequence */
9221             if (!ajStrGetLen(seqReadLine))
9222             {
9223                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9224                 continue;
9225             }
9226 
9227             if (ajStrPrefixC(seqReadLine, "!"))
9228             {
9229                 iscommand = ajTrue;
9230             }
9231 
9232             if(!iscommand)
9233             {
9234                 if(ajStrPrefixC(seqReadLine, "#"))
9235                 {
9236                     if (!ajRegExec(seqRegMegaSeq, seqReadLine))
9237                     {
9238                         ajDebug("Mega format: bad #id line\n");
9239                         seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
9240 
9241                         return ajFalse;
9242                     }
9243 
9244                     ajRegSubI(seqRegMegaSeq, 1, &tmpstr);
9245                     seqnameSetName(&tmpname, tmpstr);
9246                     phyitem = ajTableFetchmodS(phytable, tmpname);
9247 
9248                     if (!phyitem)
9249                     {
9250                         AJNEW0(phyitem);
9251                         phyitem->Weight = 1.0;
9252                         ajStrAssignS(&phyitem->Name,tmpname);
9253                         ajStrAssignS(&phyitem->Desc, tmpdesc);
9254                         ajTablePut(phytable, ajStrNewS(phyitem->Name), phyitem);
9255                         ajListstrPushAppend(phylist, ajStrNewS(phyitem->Name));
9256                         iseq++;
9257                     }
9258                     else
9259                         ajDebug("Mega format: More for #id '%S'\n", tmpname);
9260 
9261                     ajRegSubI(seqRegMegaSeq, 2, &tmpstr);
9262                     ajStrAssignS(&seqReadLine, tmpstr);
9263                 }
9264 
9265                 while (ajRegExec(seqRegMegaFeat, seqReadLine))
9266                 {
9267                     ajDebug("Quotes found: '%S'\n", seqReadLine);
9268                     ajRegSubI(seqRegMegaFeat, 1, &prestr);
9269                     ajRegSubI(seqRegMegaFeat, 2, &poststr);
9270                     ajStrAssignS(&seqReadLine, prestr);
9271                     ajStrAppendS(&seqReadLine, poststr);
9272                     ajDebug("Quotes removed: '%S'\n", seqReadLine);
9273                 }
9274 
9275                 seqAppend(&phyitem->Seq, seqReadLine);
9276                 ajDebug("Append '%S' len %u\n",
9277                         phyitem->Name, ajStrGetLen(phyitem->Seq));
9278             }
9279 
9280             else
9281             {
9282                 ajStrAssignS(&tmpstr, seqReadLine);
9283 
9284                 while(ajRegExec(seqRegMegaCommand, tmpstr))
9285                 {
9286                     ajRegSubI(seqRegMegaCommand, 1, &formatType);
9287                     ajRegSubI(seqRegMegaCommand, 2, &formatValue);
9288                     if(ajStrMatchCaseC(formatType, "gene"))
9289                     {
9290                         if(iseq)
9291                             resume = ajTrue;
9292                         ajStrAssignS(&nextgenestr, formatValue);
9293                         ajDebug("command: Gene='%S'\n",
9294                                 formatValue);
9295                     }
9296 
9297                     if(ajStrMatchCaseC(formatType, "domain"))
9298                     {
9299                         if(iseq)
9300                             resume = ajTrue;
9301                         ajStrAssignS(&nextdomainstr, formatValue);
9302                         ajDebug("command: Domain='%S'\n",
9303                                 formatValue);
9304                     }
9305                     ajRegPost(seqRegMegaCommand, &tmpstr);
9306                 }
9307                 if(ajStrFindAnyK(seqReadLine, ';') != -1)
9308                     iscommand = ajFalse;
9309             }
9310 
9311             if(resume)
9312                 break;
9313 
9314             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9315         }
9316 
9317         if(phydata->Names)
9318             AJCRESIZE0(phydata->Names, phydata->Nseq, iseq);
9319         else
9320             phydata->Names = AJCALLOC(iseq, sizeof(*phydata->Names));
9321 
9322         for(i=0; i < iseq; i++)
9323         {
9324             ajListstrPop(phylist, &phydata->Names[i]);
9325         }
9326 
9327         ajListstrFreeData(&phylist);
9328         phydata->Nseq = iseq;
9329         phydata->Count = 0;
9330         phydata->Bufflines = ajTextinGetRecords(seqin->Input);
9331     }
9332 
9333     ajStrDel(&formatType);
9334     ajStrDel(&formatValue);
9335     ajStrDel(&tmpstr);
9336     ajStrDel(&tmpname);
9337     ajStrDel(&tmpdesc);
9338     ajStrDel(&prestr);
9339     ajStrDel(&poststr);
9340 
9341     phydata = seqin->SeqData;
9342     phytable = phydata->Table;
9343 
9344     firstitem = ajTableFetchS(phytable, phydata->Names[0]);
9345     i = phydata->Count;
9346     ajDebug("returning [%d] '%S'\n", i, phydata->Names[i]);
9347     readphyitem = ajTableFetchS(phytable, phydata->Names[i]);
9348     ajStrAssignS(&thys->Name, phydata->Names[i]);
9349     if(i)
9350         ajStrDel(&phydata->Names[i]);
9351 
9352     if(ajStrGetLen(genestr))
9353         ajStrAssignS(&phydata->Gene, genestr);
9354 
9355     if(ajStrGetLen(domainstr))
9356         ajStrAssignS(&phydata->Domain, domainstr);
9357 
9358     if(resume)
9359     {
9360         phydata->Resume = ajTrue;
9361         if(ajStrGetLen(nextgenestr))
9362             ajStrAssignS(&phydata->NextGene, nextgenestr);
9363         else
9364             ajStrAssignClear(&phydata->NextGene);
9365         if(ajStrGetLen(nextdomainstr))
9366             ajStrAssignS(&phydata->NextDomain, nextdomainstr);
9367         else
9368             ajStrAssignClear(&phydata->NextDomain);
9369     }
9370 
9371     thys->Weight = readphyitem->Weight;
9372     ajStrAssignS(&thys->Desc, readphyitem->Desc);
9373     ajStrAssignS(&thys->Seq, readphyitem->Seq);
9374     if(ajStrGetLen(phydata->Gene))
9375     {
9376         seqgene = ajSeqgeneNewName(phydata->Gene);
9377         ajSeqAddGene(thys, seqgene);
9378         seqgene = NULL;
9379     }
9380 
9381     if(strchr("nNrRdD", phydata->Seqtype))
9382         ajSeqSetNuc(thys);
9383     else if(strchr("pP", phydata->Seqtype))
9384         ajSeqSetProt(thys);
9385 
9386     cp = ajStrGetuniquePtr(&thys->Seq);
9387     cq = ajStrGetPtr(firstitem->Seq);
9388 
9389     while(*cp)
9390     {
9391         if(*cp == phydata->Indelchar)
9392             *cp = '-';
9393         else if (*cp == phydata->Identchar)
9394             *cp = *cq;
9395 
9396         cp++;
9397         cq++;
9398     }
9399 
9400     phydata->Count++;
9401 
9402     if(phydata->Count >= phydata->Nseq)
9403     {
9404         seqin->Multidone = ajTrue;
9405 
9406         ajStrDel(&phydata->Names[0]);
9407         if(!phydata->Resume)
9408         {
9409             ajFilebuffClear(seqin->Input->Filebuff, 0);
9410             seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
9411         }
9412     }
9413 
9414     ajStrDel(&genestr);
9415     ajStrDel(&nextgenestr);
9416     ajStrDel(&domainstr);
9417     ajStrDel(&nextdomainstr);
9418     ajStrDel(&formatType);
9419     ajStrDel(&formatValue);
9420 
9421     return ajTrue;
9422 }
9423 
9424 
9425 
9426 
9427 /* @funcstatic seqReadCodata **************************************************
9428 **
9429 ** Given data in a sequence structure, tries to read everything needed
9430 ** using CODATA format.
9431 **
9432 ** @param [w] thys [AjPSeq] Sequence object
9433 ** @param [u] seqin [AjPSeqin] Sequence input object
9434 ** @return [AjBool] ajTrue on success
9435 **
9436 ** @release 1.0.0
9437 ** @@
9438 ******************************************************************************/
9439 
seqReadCodata(AjPSeq thys,AjPSeqin seqin)9440 static AjBool seqReadCodata(AjPSeq thys, AjPSeqin seqin)
9441 {
9442     AjPFilebuff buff;
9443     AjBool ok = ajTrue;
9444     AjBool done = ajFalse;
9445 
9446     buff = seqin->Input->Filebuff;
9447 
9448     if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
9449         return ajFalse;
9450 
9451     ajDebug("first line '%S'\n", seqReadLine);
9452 
9453     if(!ajStrPrefixC(seqReadLine, "ENTRY "))
9454     {
9455         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9456 
9457         return ajFalse;
9458     }
9459 
9460     ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
9461     ajStrTokenStep(seqHandle);       /* 'ENTRY' */
9462     ajStrTokenNextParse(seqHandle, &seqToken);       /* entry name */
9463 
9464     seqSetName(thys, seqToken);
9465 
9466     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9467 
9468     while(ok && !ajStrPrefixC(seqReadLine, "SEQUENCE"))
9469     {
9470         done = ajFalse;
9471 
9472         if(ajStrPrefixC(seqReadLine, "ACCESSION "))
9473         {
9474             ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
9475             ajStrTokenStep(seqHandle); /* 'ACCESSION' */
9476             ajStrTokenNextParse(seqHandle, &seqToken); /* accnum */
9477             seqAccSave(thys, seqToken);
9478         }
9479 
9480         if(ajStrPrefixC(seqReadLine, "TITLE "))
9481         {
9482             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
9483             ajStrTokenStep(seqHandle); /* 'TITLE' */
9484             ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc); /* desc */
9485 
9486             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9487             done = ajTrue;
9488 
9489             while(ok && ajStrPrefixC(seqReadLine, " "))
9490             {
9491                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
9492                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
9493                 ajStrAppendC(&thys->Desc, " ");
9494                 ajStrAppendS(&thys->Desc, seqToken);
9495                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9496             }
9497         }
9498 
9499         if(!done)
9500             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9501     }
9502 
9503     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9504 
9505     while(ok && !ajStrPrefixC(seqReadLine, "///"))
9506     {
9507         seqAppend(&thys->Seq, seqReadLine);
9508         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9509     }
9510 
9511     ajFilebuffClear(buff, 0);
9512 
9513     ajStrTokenReset(seqHandle);
9514     ajStrDelStatic(&seqToken);
9515 
9516     return ajTrue;
9517 }
9518 
9519 
9520 
9521 
9522 /* @funcstatic seqReadAce *****************************************************
9523 **
9524 ** Given data in a sequence structure, tries to read everything needed
9525 ** using ACE format as defined by the consed assembly editor.
9526 **
9527 ** @param [w] thys [AjPSeq] Sequence object
9528 ** @param [u] seqin [AjPSeqin] Sequence input object
9529 ** @return [AjBool] ajTrue on success
9530 **
9531 ** @release 6.2.0
9532 ** @@
9533 ******************************************************************************/
9534 
seqReadAce(AjPSeq thys,AjPSeqin seqin)9535 static AjBool seqReadAce(AjPSeq thys, AjPSeqin seqin)
9536 {
9537     AjPFilebuff buff;
9538     AjBool ok = ajTrue;
9539     ajuint icontig;
9540     ajuint iseq;
9541     AjPTable acetable        = NULL;
9542     const SeqPMsfItem aceitem = NULL;
9543     SeqPMsfData acedata      = NULL;
9544     ajuint i;
9545 
9546     ajDebug("seqReadAcedb\n");
9547 
9548     buff = seqin->Input->Filebuff;
9549 
9550     if(!seqin->SeqData)
9551     {                                   /* start of file */
9552         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9553         if(!ok)
9554             return ajFalse;
9555 
9556         ajDebug("first line:\n'%S'\n", seqReadLine);
9557 
9558         ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
9559         ajStrTokenNextParseC(seqHandle, " \t", &seqToken); /* 'AS ncontig nseq' */
9560         ajDebug("Token 1 '%S'\n", seqToken);
9561 
9562         if(!ajStrMatchCaseC(seqToken, "AS"))
9563         {
9564             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9565             ajStrDelStatic(&seqToken);
9566             ajStrTokenDel(&seqHandle);
9567             return ajFalse;
9568         }
9569 
9570         ajStrTokenNextParseC(seqHandle, " \t", &seqToken); /* number of contigs */
9571         ajStrToUint(seqToken, &icontig);
9572         ajStrTokenNextParseC(seqHandle, " \t", &seqToken); /* number of reads */
9573         ajStrToUint(seqToken, &iseq);
9574 
9575         seqin->SeqData = AJNEW0(acedata);
9576         acedata->Table = acetable = ajTablestrNew(1000);
9577         seqin->Input->Filecount = 0;
9578 
9579         /*
9580         ** read sequence from CO (* for gap)
9581         ** read accuracy from BQ (no quality for gaps)
9582         **
9583         ** Read with gaps
9584         */
9585     }
9586 
9587     acedata = seqin->SeqData;
9588     acetable = acedata->Table;
9589 
9590     i = acedata->Count;
9591     /* ajDebug("returning [%d] '%S'\n", i, acedata->Names[i]); */
9592     aceitem = ajTableFetchS(acetable, acedata->Names[i]);
9593     ajStrAssignS(&thys->Name, acedata->Names[i]);
9594 
9595     thys->Weight = aceitem->Weight;
9596     ajStrAssignS(&thys->Seq, aceitem->Seq);
9597 
9598     acedata->Count++;
9599 
9600     if(acedata->Count >= acedata->Nseq)
9601     {
9602         seqin->Multidone = ajTrue;
9603         ajDebug("seqReadAce Multidone\n");
9604         ajFilebuffClear(seqin->Input->Filebuff, 0);
9605         seqMsfDataDel((SeqPMsfData*) &seqin->SeqData);
9606     }
9607 
9608     ajSeqSetNuc(thys);
9609 
9610     ajFilebuffClear(buff, 0);
9611 
9612     ajStrTokenReset(seqHandle);
9613     ajStrDelStatic(&seqToken);
9614 
9615     return ajTrue;
9616 }
9617 
9618 
9619 
9620 
9621 /* @funcstatic seqReadAcedb ***************************************************
9622 **
9623 ** Given data in a sequence structure, tries to read everything needed
9624 ** using ACEDB format.
9625 **
9626 ** @param [w] thys [AjPSeq] Sequence object
9627 ** @param [u] seqin [AjPSeqin] Sequence input object
9628 ** @return [AjBool] ajTrue on success
9629 **
9630 ** @release 1.0.0
9631 ** @@
9632 ******************************************************************************/
9633 
seqReadAcedb(AjPSeq thys,AjPSeqin seqin)9634 static AjBool seqReadAcedb(AjPSeq thys, AjPSeqin seqin)
9635 {
9636     AjPFilebuff buff;
9637     AjBool ok = ajTrue;
9638 
9639     ajDebug("seqReadAcedb\n");
9640 
9641     buff = seqin->Input->Filebuff;
9642 
9643     do
9644     {
9645         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9646     } while(ok &&
9647             (ajStrPrefixC(seqReadLine, "//") ||
9648              ajStrPrefixC(seqReadLine, "\n")));
9649 
9650     if(!ok)
9651     {
9652         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9653 
9654         return ajFalse;
9655     }
9656 
9657     ajDebug("first line:\n'%S'\n", seqReadLine);
9658 
9659 
9660     ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
9661     ajStrTokenNextParseC(seqHandle, " \t", &seqToken); /* 'DNA' or 'Peptide'*/
9662     ajDebug("Token 1 '%S'\n", seqToken);
9663 
9664     if(ajStrMatchCaseC(seqToken, "Peptide"))
9665     {
9666         ajDebug("Protein\n");
9667         ajSeqSetProt(thys);
9668     }
9669     else if(ajStrMatchCaseC(seqToken, "DNA"))
9670     {
9671         ajDebug("DNA\n");
9672         ajSeqSetNuc(thys);
9673     }
9674     else
9675     {
9676         ajDebug("unknown - failed\n");
9677         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9678         ajStrTokenReset(seqHandle);
9679         ajStrDelStatic(&seqToken);
9680 
9681         return ajFalse;
9682     }
9683 
9684     ajStrTokenNextParseC(seqHandle, " \t\"", &seqToken); /* : */
9685 
9686     if(!ajStrMatchC(seqToken, ":"))
9687     {
9688         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9689         ajStrTokenReset(seqHandle);
9690         ajStrDelStatic(&seqToken);
9691 
9692         return ajFalse;
9693     }
9694 
9695     ajStrTokenNextParseC(seqHandle, "\"", &seqToken);        /* name */
9696 
9697     if(!ajStrGetLen(seqToken))
9698     {
9699         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9700         ajStrTokenReset(seqHandle);
9701         ajStrDelStatic(&seqToken);
9702 
9703         return ajFalse;
9704     }
9705 
9706     /* we know we will succeed from here ... no way to return ajFalse */
9707 
9708     ajFilebuffSetUnbuffered(buff);
9709 
9710     seqSetName(thys, seqToken);
9711 
9712     /* OK, we have the name. Now look for the sequence */
9713 
9714     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9715     while(ok && !ajStrPrefixC(seqReadLine,"\n"))
9716     {
9717         seqAppend(&thys->Seq, seqReadLine);
9718         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9719     }
9720 
9721     ajFilebuffClear(buff, 0);
9722 
9723     ajStrTokenReset(seqHandle);
9724     ajStrDelStatic(&seqToken);
9725 
9726     return ajTrue;
9727 }
9728 
9729 
9730 
9731 
9732 /* @funcstatic seqReadBiomart *************************************************
9733 **
9734 ** Given data in a sequence structure, tries to read everything needed
9735 ** using BioMart tab-delimited format.
9736 **
9737 ** @param [w] thys [AjPSeq] Sequence object
9738 ** @param [u] seqin [AjPSeqin] Sequence input object
9739 ** @return [AjBool] ajTrue on success
9740 **
9741 ** @release 6.3.0
9742 ** @@
9743 ******************************************************************************/
9744 
seqReadBiomart(AjPSeq thys,AjPSeqin seqin)9745 static AjBool seqReadBiomart(AjPSeq thys, AjPSeqin seqin)
9746 {
9747     AjPFilebuff buff;
9748     AjBool ok = ajTrue;
9749     ajulong ifields = 0;
9750     ajuint i;
9751 
9752     buff = seqin->Input->Filebuff;
9753 
9754     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9755     if(!ok)
9756         return ajFalse;
9757 
9758     ajDebug("seqReadBiomart record '%S'%u\n",
9759             seqReadLine);
9760 
9761     ifields = ajStrCalcCountK(seqReadLine, '\t');
9762     ++ifields;
9763 
9764     ajDebug("fields: %u\n", ifields);
9765 
9766     if(ifields < 2)
9767         return ajFalse;
9768 
9769     ajStrTokenAssignC(&seqHandle, seqReadLine, "\t\n");
9770 
9771     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* sequence */
9772     seqAppend(&thys->Seq, seqToken);
9773 
9774     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* identifier*/
9775     seqSetName(thys, seqToken);
9776 
9777     for(i = 2; i < ifields; i++)
9778     {
9779         ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* non-sequence*/
9780 
9781         if(ajStrGetLen(seqToken))
9782         {
9783             if(i > 2)
9784                 ajStrAppendK(&thys->Desc, ' ');
9785 
9786             ajStrAppendS(&thys->Desc, seqToken);
9787         }
9788     }
9789 
9790     ajFilebuffClear(buff, 0);
9791 
9792     ajStrTokenReset(seqHandle);
9793     ajStrDelStatic(&seqToken);
9794 
9795     return ajTrue;
9796 }
9797 
9798 
9799 
9800 
9801 /* @funcstatic seqReadDAS *****************************************************
9802 **
9803 ** Reads sequences from given DAS XML buffer.
9804 **
9805 ** @param [w] thys [AjPSeq] Sequence object
9806 ** @param [u] seqin [AjPSeqin] Sequence input object
9807 ** @return [AjBool] ajTrue on success
9808 **
9809 ** @release 6.4.0
9810 ** @@
9811 ******************************************************************************/
9812 
seqReadDAS(AjPSeq thys,AjPSeqin seqin)9813 static AjBool seqReadDAS(AjPSeq thys, AjPSeqin seqin)
9814 {
9815     AjPDomDocument doc      = NULL;
9816     AjPDomNodeList segments = NULL;
9817     AjPDomNode segment      = NULL;
9818 
9819     AjPFilebuff buff = NULL;
9820     AjPStr attval    = NULL;
9821     AjPStr elmtxt    = NULL;
9822     AjPStr seqname   = NULL;
9823 
9824     AjBool ret = AJTRUE;
9825 
9826     ajint r = 0;
9827 
9828     buff = seqin->Input->Filebuff;
9829 
9830     r = ajTextinGetCount(seqin->Input);
9831 
9832     if(r==1)
9833     {
9834         doc = ajDomImplementationCreateDocument(NULL,NULL,NULL);
9835 
9836         if (ajDomReadFilebuff(doc,buff) == -1)
9837         {
9838             ajDomDocumentDestroyNode(doc,&doc);
9839             return AJFALSE;
9840         }
9841 
9842         ajFilebuffClear(buff, 0);
9843         seqin->SeqData = doc;
9844     }
9845     else
9846         doc = seqin->SeqData;
9847 
9848     segments = ajDomDocumentGetElementsByTagNameC(doc, "SEQUENCE");
9849 
9850 
9851     if(segments==NULL || ajDomNodeListGetLen(segments) < r)
9852     {
9853         ajDomDocumentDestroyNodeList(doc,&segments,AJDOMKEEP);
9854         ajDomDocumentDestroyNode(doc,&doc);
9855         return AJFALSE;
9856     }
9857 
9858     segment = ajDomNodeListItem(segments, r-1);
9859 
9860     elmtxt = ajDomElementGetText(segment);
9861     seqAppend(&thys->Seq, elmtxt);
9862 
9863     attval = ajDomElementGetAttributeC(segment,"id");
9864 
9865     if(ajStrGetLen(attval) == 0)
9866     {
9867         ajStrDel(&attval);
9868         ajDomDocumentDestroyNodeList(doc,&segments,AJDOMKEEP);
9869         ajDomDocumentDestroyNode(doc,&doc);
9870         return AJFALSE;
9871     }
9872 
9873 
9874     if(seqin->Begin && seqin->End)
9875     {
9876 
9877         ajFmtPrintS(&seqname,"%S %u,%u",
9878                     attval, seqin->Begin, seqin->End);
9879 
9880         seqSetName(thys, seqname);
9881 
9882         ajStrDel(&seqname);
9883     }
9884     else ajSeqSetName(thys, attval);
9885 
9886     ajStrDel(&attval);
9887 
9888     /*
9889      * TODO: modifying seqin obj doesn't sound correct
9890      *       but I was unable to stop calling function modifying sequence
9891      *       Begin and End attributes apparently in a wrong way  -- mahmut
9892      */
9893     seqin->Begin = thys->Begin;
9894     seqin->End = thys->End;
9895 
9896     /* TODO: how to read features in parallel to reading sequences
9897      *  - get sequence query url
9898      *  - construct features query url based on sequence query url
9899      */
9900     /*
9901       if(seqin->Features)
9902       {
9903       AjPStr ftq, host, port, fqpath;
9904       ajFeattabInDel(&seqin->Ftquery);
9905       ajFilebuffClear(seqin->Input->Filebuff, -1);
9906 
9907       // get sequence query url
9908 
9909       // construct features query url based on sequence query url
9910 
9911       ajDasdbQueryGet(seqin, host, port, fqpath);
9912 
9913       seqin->Ftquery = ajFeattabInNewCSF("das", thys->Name,
9914       ajStrGetPtr(seqin->Type),
9915       seqin->Input->Filebuff);
9916       ajDebug("GFF FEAT TabIn %x\n", seqin->Ftquery);
9917       //ftfile = NULL;                  // now copied to seqin->FeattabIn
9918       ajFeattableDel(&seqin->Fttable);
9919       seqin->Fttable = ajFeattableNewRead(seqin->Ftquery);
9920       if(seqin->Fttable)
9921       ajFeattableSetLength(seqin->Fttable, ajStrGetLen(thys->Seq));
9922       ajFeattableTrace(seqin->Fttable);
9923       ajFeattableDel(&thys->Fttable);
9924       thys->Fttable = seqin->Fttable;
9925       seqin->Fttable = NULL;
9926       }
9927     */
9928 
9929     ajDomDocumentDestroyNodeList(doc,&segments,AJDOMKEEP);
9930 
9931 
9932     return ret;
9933 }
9934 
9935 
9936 
9937 
9938 /* @funcstatic seqReadFitch ***************************************************
9939 **
9940 ** Given data in a sequence structure, tries to read everything needed
9941 ** using fitch format.
9942 **
9943 ** @param [w] thys [AjPSeq] Sequence object
9944 ** @param [u] seqin [AjPSeqin] Sequence input object
9945 ** @return [AjBool] ajTrue on success
9946 **
9947 ** @release 2.8.0
9948 ** @@
9949 ******************************************************************************/
9950 
seqReadFitch(AjPSeq thys,AjPSeqin seqin)9951 static AjBool seqReadFitch(AjPSeq thys, AjPSeqin seqin)
9952 {
9953     AjPStr token     = NULL;
9954     AjPFilebuff buff;
9955     AjBool ok = ajTrue;
9956     ajuint ilen = 0;
9957 
9958     if (!seqRegFitchHead)
9959         seqRegFitchHead = ajRegCompC("^(\\S+),\\s+(\\d+)\\s+bases\n");
9960 
9961     buff = seqin->Input->Filebuff;
9962 
9963     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9964     ajDebug("seqReadFitch first line '%S'%u\n",
9965             seqReadLine);
9966 
9967     if (!ajRegExec(seqRegFitchHead, seqReadLine))
9968     {
9969         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
9970 
9971         return ajFalse;
9972     }
9973 
9974     ajRegSubI(seqRegFitchHead, 1, &token);
9975     seqSetName(thys, token);
9976 
9977     ajRegSubI(seqRegFitchHead, 2, &token);
9978     ajStrToUint(token, &ilen);
9979 
9980     ajDebug("seqReadFitch header name '%S' bases %u\n",
9981             thys->Name, ilen);
9982 
9983     /* we know we will succeed from here ... no way to return ajFalse */
9984 
9985     ajFilebuffSetUnbuffered(buff);
9986 
9987     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9988 
9989     while (ok && (ajStrGetLen(thys->Seq) < ilen))
9990     {
9991         seqAppend(&thys->Seq, seqReadLine);
9992         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
9993         ajDebug("seqReadFitch new length %u '%S'\n",
9994                 ajStrGetLen(thys->Seq), seqReadLine);
9995     }
9996 
9997     ajStrDel(&token);
9998 
9999     if(ok)
10000         ajFilebuffClear(buff, 1);
10001     else
10002         ajFilebuffClear(buff, 0);
10003 
10004     return ajTrue;
10005 }
10006 
10007 
10008 
10009 
10010 /* @funcstatic seqReadMase ****************************************************
10011 **
10012 ** Given data in a sequence structure, tries to read everything needed
10013 ** using mase format.
10014 **
10015 ** @param [w] thys [AjPSeq] Sequence object
10016 ** @param [u] seqin [AjPSeqin] Sequence input object
10017 ** @return [AjBool] ajTrue on success
10018 **
10019 ** @release 2.8.0
10020 ** @@
10021 ******************************************************************************/
10022 
seqReadMase(AjPSeq thys,AjPSeqin seqin)10023 static AjBool seqReadMase(AjPSeq thys, AjPSeqin seqin)
10024 {
10025     AjPStr token     = NULL;
10026     AjPStr des     = NULL;
10027     AjPFilebuff buff;
10028     AjBool ok = ajTrue;
10029 
10030     if (!seqRegMaseHead)
10031         seqRegMaseHead = ajRegCompC("^(;+)");
10032 
10033     buff = seqin->Input->Filebuff;
10034 
10035     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10036     if(!ajRegExec(seqRegMaseHead, seqReadLine))
10037     {
10038         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
10039 
10040         return ajFalse;
10041     }
10042 
10043     /* we know we will succeed from here ... no way to return ajFalse */
10044 
10045     ajFilebuffSetUnbuffered(buff);
10046 
10047     while(ok && ajRegExec(seqRegMaseHead, seqReadLine))
10048     {
10049         if(ajRegLenI(seqRegMaseHead, 1) == 1)
10050         {
10051             ajRegPost(seqRegMaseHead, &token);
10052 
10053             if(des)
10054                 ajStrAppendK(&des, ' ');
10055 
10056             ajStrAppendS(&des, token);
10057         }
10058 
10059         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10060     }
10061 
10062     ajStrRemoveWhiteExcess(&seqReadLine);
10063     seqSetName(thys, seqReadLine);
10064     ajStrRemoveWhiteExcess(&des);
10065     ajSeqAssignDescS(thys, des);
10066 
10067     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10068     while(ok && !ajRegExec(seqRegMaseHead, seqReadLine))
10069     {
10070         seqAppend(&thys->Seq, seqReadLine);
10071         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10072     }
10073 
10074     ajStrDel(&token);
10075     ajStrDel(&des);
10076 
10077     if(ok)
10078         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
10079     else
10080         ajFilebuffClear(buff, 0);
10081 
10082     return ajTrue;
10083 }
10084 
10085 
10086 
10087 
10088 /* @funcstatic seqReadBam *****************************************************
10089 **
10090 ** Given data in a sequence structure, tries to read everything needed
10091 ** using binary alignment/map (BAM) format.
10092 **
10093 ** @param [w] thys [AjPSeq] Sequence object
10094 ** @param [u] seqin [AjPSeqin] Sequence input object
10095 ** @return [AjBool] ajTrue on success
10096 **
10097 ** @release 6.3.0
10098 ** @@
10099 ******************************************************************************/
10100 
seqReadBam(AjPSeq thys,AjPSeqin seqin)10101 static AjBool seqReadBam(AjPSeq thys, AjPSeqin seqin)
10102 {
10103     AjPFilebuff buff;
10104     AjPFile infile;
10105     ajuint i;
10106     AjPSeqBam b = NULL;
10107     AjPSeqBamCore c;
10108     ajint ret = 0;
10109     struct bamdata
10110     {
10111         ajuint Count;
10112         ajuint Nref;
10113         AjPSeqBamBgzf gzfile;
10114         AjPSeqBam bam;
10115     } *bamdata = NULL;
10116     static AjBool called = ajFalse;
10117     static AjBool bigendian = ajFalse;
10118     unsigned char* d;
10119     ajuint dpos;
10120     int cigop;
10121     ajuint cigend;
10122     ajuint cigint;
10123     AjPStr cigarstr = NULL;
10124     AjPStr namestr = NULL;
10125     AjPStr seqstr = NULL;
10126     AjPStr qualstr = NULL;
10127     AjPStr tagstr = NULL;
10128     unsigned char dp;
10129     AjPSeqBamHeader header = NULL;
10130     ajint filestat;
10131 
10132 
10133     if(!called)
10134     {
10135         called = ajTrue;
10136         bigendian = ajUtilGetBigendian();
10137         ajDebug("seqReadBam bam bigendian: %B\n", bigendian);
10138     }
10139 
10140     buff = seqin->Input->Filebuff;
10141     infile = ajFilebuffGetFile(buff);
10142 
10143     if(!seqin->SeqData)
10144     {
10145         ajFileTrace(infile);
10146         ajFilebuffTrace(buff);
10147 
10148         /* reset to beginning of file -
10149         ** has at least been tested for blank lines */
10150         filestat = ajFileSeek(infile, 0L, SEEK_SET);
10151         if(filestat != 0)
10152         {
10153             ajDebug("seqReadBam rewind failed errno %d: %s\n",
10154                     errno, strerror(errno));
10155             return ajFalse;
10156         }
10157 
10158         AJNEW0(bamdata);
10159 
10160         bamdata->gzfile = ajSeqBamBgzfNew(ajFilebuffGetFileptr(buff),"r");
10161 
10162         ajDebug("gzfile %x  fd:%d file:%x ubs:%d cbs:%d blen:%d boff:%d "
10163                 "cache:%d open:'%c'\n",
10164                 bamdata->gzfile, bamdata->gzfile->file_descriptor,
10165                 bamdata->gzfile->file,
10166                 bamdata->gzfile->uncompressed_block_size,
10167                 bamdata->gzfile->compressed_block_size,
10168                 bamdata->gzfile->block_length, bamdata->gzfile->block_offset,
10169                 bamdata->gzfile->cache_size,
10170                 bamdata->gzfile->open_mode);
10171 
10172 
10173         /* BAM header */
10174 
10175         /* read plain text and the number of reference sequences */
10176         header = ajSeqBamHeaderRead(bamdata->gzfile);
10177         if (!header)
10178         {
10179             ajDebug("failed ajSeqBamHeaderRead, seqReadBam returns ajFalse\n");
10180             ajSeqBamBgzfClose(bamdata->gzfile);
10181             AJFREE(bamdata);
10182             ajFileSeek(infile,filestat,0);
10183             ajFilebuffResetPos(buff);
10184             ajFileTrace(infile);
10185             ajFilebuffTrace(buff);
10186             return ajFalse;
10187         }
10188 
10189         ajSeqBamHeaderDel(&header);
10190 
10191         bamdata->bam = (AjPSeqBam)calloc(1, sizeof(AjOSeqBam));
10192         seqin->SeqData = bamdata;
10193     }
10194 
10195     /* next BAM record */
10196 
10197     bamdata = seqin->SeqData;
10198     b = bamdata->bam;
10199     ret = ajSeqBamRead(bamdata->gzfile, b);
10200     if(ret < -1)
10201         ajErr("seqReadBam truncated file return %d\n", ret);
10202 
10203     if(ret == -1)
10204     {
10205         ajSeqBamBgzfClose(bamdata->gzfile);
10206         ajFilebuffClear(seqin->Input->Filebuff, 0);
10207         /*seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);*/
10208         free(bamdata->bam->data); free(bamdata->bam);
10209 
10210         AJFREE(seqin->SeqData);
10211         return ajFalse;
10212     }
10213 
10214     c = &b->core;
10215     ajDebug("rID: %d pos: %d bin: %hd mapQual: %d read_name_len: %d"
10216             " flag_nc: %hd cigar_len: %hd read_len: %d"
10217             " mate_rID: %d mate_pos: %d ins_size: %d\n",
10218             c->tid, c->pos, c->bin, c->qual, c->l_qname,
10219             c->flag, c->n_cigar, c->l_qseq,
10220             c->mtid, c->mpos, c->isize);
10221     ajDebug("l_aux: %d data_len:%d m_data:%d\n",
10222             b->l_aux, b->data_len, b->m_data);
10223     d = b->data;
10224     dpos = 0;
10225     ajStrAssignC(&namestr, (const char*) &d[dpos]);
10226     ajSeqSetName(thys, namestr);
10227     ajStrDel(&namestr);
10228     ajDebug("read name: %p '%s'\n", dpos, &d[dpos]);
10229     dpos += (c->l_qname); /* l_qname includes trailing null */
10230     ajStrAssignC(&cigarstr, "");
10231     ajDebug("start of cigar %p\n", dpos);
10232 
10233     for(i=0; i < c->n_cigar; i++)
10234     {
10235         memcpy(&cigint, &d[dpos], 4);
10236         cigop = cigint & BAM_CIGAR_MASK;
10237         cigend = cigint >> BAM_CIGAR_SHIFT;
10238 
10239         ajFmtPrintAppS(&cigarstr, " %u%c",
10240                        cigend, cigarcode[cigop]);
10241         dpos += 4;
10242     }
10243 
10244     ajDebug("cigar: %p %S\n", dpos, cigarstr);
10245     ajStrDel(&cigarstr);
10246 
10247     ajStrAssignC(&seqstr, "");
10248     for(i=0; i < (ajuint) c->l_qseq; i++)
10249     {
10250         ajStrAppendK(&seqstr,
10251                      bam_nt16_rev_table[MAJSEQBAMSEQI(&d[dpos], i)]);
10252     }
10253     dpos += (c->l_qseq+1)/2;
10254     ajDebug("seq: %p '%S'\n", dpos, seqstr);
10255 
10256     ajStrAssignRef(&thys->Seq, seqstr);
10257     ajStrDel(&seqstr);
10258 
10259     if(d[dpos] == 0xFF)
10260     {
10261         AJFREE(thys->Accuracy);
10262         thys->Qualsize = 0;
10263         ajDebug("qual: MISSING\n");
10264         dpos += c->l_qseq;
10265     }
10266     else
10267     {
10268         ajStrAssignC(&qualstr, "");
10269 
10270         if(thys->Qualsize < (ajuint) c->l_qseq)
10271         {
10272             AJCRESIZE(thys->Accuracy, c->l_qseq);
10273             thys->Qualsize = c->l_qseq;
10274         }
10275 
10276         for(i=0; i < (ajuint) c->l_qseq; i++)
10277         {
10278             ajFmtPrintAppS(&qualstr, " %02x", 33 + d[dpos]);
10279             thys->Accuracy[i] = (float) d[dpos++];
10280         }
10281 
10282         ajDebug("qual: %p %S\n", dpos, qualstr);
10283         ajStrDel(&qualstr);
10284     }
10285 
10286     ajStrAssignC(&tagstr, "");
10287 
10288     while (dpos < (ajuint) b->data_len)
10289     {
10290         ajStrAppendK(&tagstr, ' ');
10291         ajStrAppendK(&tagstr, d[dpos++]);
10292         ajStrAppendK(&tagstr, d[dpos++]);
10293         ajStrAppendK(&tagstr, ':');
10294         dp = d[dpos++];
10295         ajStrAppendK(&tagstr, dp);
10296         ajStrAppendK(&tagstr, ':');
10297 
10298         ajDebug("tag type: '%c\n",dp);
10299 
10300         if (dp == 'Z' || dp == 'H')
10301         {
10302             ajFmtPrintAppS(&tagstr,"%s", &d[dpos]);
10303             while(d[dpos])
10304                 dpos++;
10305             dpos++;
10306         }
10307         else if (dp == 'f')
10308         {
10309             ajFmtPrintAppS(&tagstr,"%f", (float) *(&d[dpos]));
10310             dpos += 4;
10311         }
10312         else if (dp == 'd')
10313         {
10314             ajFmtPrintAppS(&tagstr,"%lf", (double) *(&d[dpos]));
10315             dpos += 8;
10316         }
10317         else if (dp == 'A')
10318         {
10319             ajFmtPrintAppS(&tagstr,"%c", &d[dpos++]);
10320         }
10321         else if (dp == 'c')
10322         {
10323             ajFmtPrintAppS(&tagstr,"%d",
10324                            (ajint) (signed char) d[dpos++]);
10325         }
10326         else if (dp == 's')
10327         {
10328             ajFmtPrintAppS(&tagstr,"%hd",
10329                            (ajshort) *(&d[dpos]));
10330             dpos += 2;
10331         }
10332         else if (dp == 'i')
10333         {
10334             ajFmtPrintAppS(&tagstr,"%d",
10335                            (ajint) *(&d[dpos]));
10336             dpos += 4;
10337         }
10338         else if (dp == 'C')
10339         {
10340             ajFmtPrintAppS(&tagstr,"%u",
10341                            (ajuint) d[dpos++]);
10342         }
10343         else if (dp == 'S')
10344         {
10345             ajFmtPrintAppS(&tagstr,"%hu",
10346                            (ajushort) *(&d[dpos]));
10347             dpos += 2;
10348         }
10349         else if (dp == 'I')
10350         {
10351             ajFmtPrintAppS(&tagstr,"%u",
10352                            (ajuint) d[dpos]);
10353             dpos += 4;
10354         }
10355         else {
10356             ajWarn("Unknown BAM aux type char(%d) '%c'", (ajint) dp, dp);
10357             ajFmtPrintAppS(&tagstr,"???");
10358         }
10359     }
10360 
10361     ajDebug("tags: %p '%S'\n", dpos, tagstr);
10362     ajStrDel(&tagstr);
10363 
10364     bamdata->Count++;
10365 
10366     return ajTrue;
10367 }
10368 
10369 
10370 
10371 
10372 
10373 /* @funcstatic seqReadSam *****************************************************
10374 **
10375 ** Given data in a sequence structure, tries to read everything needed
10376 ** using sequence alignment/map (SAM) format.
10377 **
10378 ** @param [w] thys [AjPSeq] Sequence object
10379 ** @param [u] seqin [AjPSeqin] Sequence input object
10380 ** @return [AjBool] ajTrue on success
10381 **
10382 ** @release 6.2.0
10383 ** @@
10384 ******************************************************************************/
10385 
seqReadSam(AjPSeq thys,AjPSeqin seqin)10386 static AjBool seqReadSam(AjPSeq thys, AjPSeqin seqin)
10387 {
10388     AjBool ok = ajTrue;
10389     AjPFilebuff buff;
10390     AjPFile infile = NULL;
10391     AjBool badformat = ajFalse;
10392     ajuint seqlen = 0;
10393     const char *cp;
10394     ajuint i;
10395     ajint iqual;
10396     ajint qmin = 33;
10397     ajint qmax = 126;
10398     ajuint flags;
10399     ajint iflags;
10400 
10401     buff = seqin->Input->Filebuff;
10402     infile = ajFilebuffGetFile(buff);
10403 
10404     /* === header section === */
10405 
10406     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10407 
10408     if(ajTextinGetCount(seqin->Input) == 1)
10409     {
10410         while(ok && ajStrGetCharFirst(seqReadLine) == '@')
10411         {
10412             ajStrTokenAssignC(&seqHandle, seqReadLine, "\t");
10413             ajStrTokenNextParse(seqHandle,&seqToken);
10414             switch(ajStrGetCharPos(seqToken, 1))
10415             {
10416                 case 'H':
10417                     /* @HD header VN:
10418                     **
10419                     */
10420                     if(!ajStrMatchC(seqToken, "@HD"))
10421                         badformat = ajTrue;
10422                     break;
10423                 case 'S':
10424                     /* @SQ sequence dictionary SN: LN:
10425                     **
10426                     */
10427                     if(!ajStrMatchC(seqToken, "@SQ"))
10428                         badformat = ajTrue;
10429                     break;
10430                 case 'R':
10431                     /* @RG read group ID: SM:
10432                     **
10433                     */
10434                     if(!ajStrMatchC(seqToken, "@RG"))
10435                         badformat = ajTrue;
10436                     break;
10437                 case 'P':
10438                     /* @PG program name ID:
10439                     **
10440                     */
10441                     if(!ajStrMatchC(seqToken, "@PG"))
10442                         badformat = ajTrue;
10443                     break;
10444                 case 'C':
10445                     /* @CO comment
10446                     **
10447                     */
10448                     if(!ajStrMatchC(seqToken, "@CO"))
10449                         badformat = ajTrue;
10450                     break;
10451                 default:
10452                     badformat = ajTrue;
10453                     break;
10454             }
10455             if(badformat)
10456             {
10457                 ajErr("bad sam format header record '%S'", seqReadLine);
10458                 return ajFalse;
10459             }
10460             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10461         }
10462     }
10463 
10464     if(!ok)
10465         return ajFalse;
10466 
10467     /* === alignment section === */
10468 
10469     if(ajStrParseCountC(seqReadLine, "\t") < 11)
10470         return ajFalse;
10471 
10472     ajStrTokenAssignC(&seqHandle, seqReadLine, "\t\n");
10473 
10474     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* QNAME */
10475     seqSetNameNospace(&thys->Name, seqToken);
10476     ajDebug("QNAME '%S' '%S'\n", seqToken, thys->Name);
10477 
10478     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* FLAG */
10479     ajDebug("FLAG  '%S'\n", seqToken);
10480 
10481     if(ajStrGetLen(seqToken))
10482     {
10483         if(!ajStrToUint(seqToken, &flags))
10484         {
10485             ajErr("SAM %F '%S' invalid FLAG value %S\n",
10486                   infile, thys->Name, seqToken);
10487             return ajFalse;
10488         }
10489     }
10490 
10491     ajDebug("flags %x\n", flags);
10492 
10493     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* RNAME */
10494     ajDebug("RNAME '%S'\n", seqToken);
10495 
10496     /*
10497       if(ajStrGetLen(seqToken))
10498       seqAccSave(thys, seqToken);
10499     */
10500 
10501     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* POS */
10502     ajDebug("POS   '%S'\n", seqToken);
10503 
10504     if(ajStrGetLen(seqToken))
10505     {
10506         if(!ajStrToUint(seqToken, &flags))
10507         {
10508             ajErr("SAM %F '%S' invalid POS value %S\n",
10509                   infile, thys->Name, seqToken);
10510             return ajFalse;
10511         }
10512     }
10513 
10514     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* MAPQ */
10515     ajDebug("MAPQ  '%S'\n", seqToken);
10516 
10517     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* CIGAR */
10518     ajDebug("CIGAR '%S'\n", seqToken);
10519 
10520     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* MRNM */
10521     ajDebug("MRNM  '%S'\n", seqToken);
10522 
10523     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* MPOS */
10524     ajDebug("MPOS  '%S'\n", seqToken);
10525 
10526     if(ajStrGetLen(seqToken))
10527     {
10528         if(!ajStrToUint(seqToken, &flags))
10529         {
10530             ajErr("SAM %F '%S' invalid MPOS value %S\n",
10531                   infile, thys->Name, seqToken);
10532             return ajFalse;
10533         }
10534     }
10535 
10536     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* ISIZE */
10537     ajDebug("ISIZE '%S'\n", seqToken);
10538 
10539     if(ajStrGetLen(seqToken))
10540     {
10541         if(!ajStrToInt(seqToken, &iflags))
10542         {
10543             ajErr("SAM %F '%S' invalid ISIZE value %S\n",
10544                   infile, thys->Name, seqToken);
10545             return ajFalse;
10546         }
10547     }
10548 
10549 
10550     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* SEQ */
10551     ajDebug("SEQ   '%S'\n", seqToken);
10552     seqAppend(&thys->Seq, seqToken);
10553     seqlen = MAJSTRGETLEN(seqToken);
10554 
10555     ajStrTokenNextParseNoskip(seqHandle,&seqToken); /* QUAL */
10556     ajDebug("QUAL  '%S'", seqToken);
10557 
10558     if(ajStrCmpC(seqToken,"*")!=0)
10559     {
10560         if(MAJSTRGETLEN(seqToken) != seqlen)
10561         {
10562             ajErr("SAM quality length mismatch '%F' '%S' "
10563                   "expected: %u found: %u '%S' '%S'",
10564                   infile, thys->Name,
10565                   seqlen, ajStrGetLen(seqQualStr), thys->Seq, seqToken);
10566             return ajFalse;
10567         }
10568 
10569         cp = MAJSTRGETPTR(seqToken);
10570         i=0;
10571 
10572         if(seqlen > thys->Qualsize)
10573             AJCRESIZE(thys->Accuracy, seqlen);
10574 
10575         thys->Qualsize = seqlen;
10576 
10577         if(MAJSTRGETLEN(seqToken) > thys->Qualsize)
10578             AJCRESIZE(thys->Accuracy, MAJSTRGETLEN(seqQualStr));
10579 
10580         while (*cp)
10581         {
10582             iqual = *cp++;
10583             if(iqual < qmin)
10584             {
10585                 ajWarn("SAM '%F' sequence '%S' "
10586                        "quality value %d '%c' too low",
10587                        infile, thys->Name,
10588                        (ajint) (cp - MAJSTRGETPTR(seqToken)), (char) iqual);
10589                 iqual = qmin;
10590             }
10591             if(iqual > qmax)
10592             {
10593                 ajWarn("SAM '%F' sequence '%S' "
10594                        "quality value '%c' too high",
10595                        infile, thys->Name,
10596                        (char) iqual);
10597                 iqual = qmax;
10598             }
10599             thys->Accuracy[i++] = seqQualPhred[iqual];
10600         }
10601     }
10602 
10603     /* 11 fields then (tag:vtype:value)... */
10604 
10605     ajStrDelStatic(&seqToken);
10606     ajStrTokenReset(seqHandle);
10607 
10608     return ajTrue;
10609 }
10610 
10611 
10612 
10613 
10614 /* @funcstatic seqReadScf *****************************************************
10615 **
10616 ** Given data in a sequence structure, tries to read everything needed
10617 ** using stored chromatogram format (SCF)
10618 **
10619 ** @param [w] thys [AjPSeq] Sequence object
10620 ** @param [u] seqin [AjPSeqin] Sequence input object
10621 ** @return [AjBool] ajTrue on success
10622 **
10623 ** @release 6.2.0
10624 ** @@
10625 ******************************************************************************/
10626 
seqReadScf(AjPSeq thys,AjPSeqin seqin)10627 static AjBool seqReadScf(AjPSeq thys, AjPSeqin seqin)
10628 {
10629     AjPFilebuff buff;
10630     AjPFile infile = NULL;
10631     ajuint i;
10632     ajulong filestat = 0L;
10633     size_t status;
10634     SeqOScfData scfdata;
10635     ajuint magicnum = SCF_MAGIC;
10636     ajuint seqlen;
10637     ajuint iqual;
10638     AjBool revint = ajFalse;
10639     AjBool hasqual = ajFalse;
10640     SeqOScfBase scfbase;
10641     AjPStr tmpstr = NULL;
10642     ajuint scfversion = 0;
10643     ajuint *iprob = NULL;
10644     ajuint *peakoffset = NULL;
10645     unsigned char *probA = NULL;
10646     unsigned char *probC = NULL;
10647     unsigned char *probG = NULL;
10648     unsigned char *probT = NULL;
10649     char *bases = NULL;
10650 
10651     buff = seqin->Input->Filebuff;
10652     infile = ajFilebuffGetFile(buff);
10653 
10654     if(ajFilebuffIsEnded(buff))
10655         return ajFalse;
10656 
10657     filestat = ajFileSeek(infile, 0L, SEEK_SET);
10658     if(filestat != 0)
10659     {
10660         ajDebug("seqReadScf rewind failed errno %d: %s\n",
10661                 errno, strerror(errno));
10662         return ajFalse;
10663     }
10664 
10665     /* === header section === */
10666 
10667     status = ajReadbinBinary(infile, 1, 128, &scfdata.header);
10668     if(!status)
10669         return ajFalse;
10670 
10671     if(scfdata.header.magic_number != magicnum)
10672     {
10673         ajByteRevLen4u(&scfdata.header.magic_number);
10674         if(scfdata.header.magic_number != magicnum)
10675         {
10676             ajDebug("SCF magic number expected '%x' reversed '%x'\n",
10677                     magicnum, scfdata.header.magic_number);
10678             return ajFalse;
10679         }
10680 
10681         ajDebug("SCF magic number '%x' reversed Bigendian: %B\n",
10682                 scfdata.header.magic_number,
10683                 ajUtilGetBigendian());
10684 
10685         revint = ajTrue;
10686 
10687         ajByteRevLen4u(&scfdata.header.samples);
10688         ajByteRevLen4u(&scfdata.header.samples_offset);
10689         ajByteRevLen4u(&scfdata.header.bases);
10690         ajByteRevLen4u(&scfdata.header.bases_left_clip);
10691         ajByteRevLen4u(&scfdata.header.bases_right_clip);
10692         ajByteRevLen4u(&scfdata.header.bases_offset);
10693         ajByteRevLen4u(&scfdata.header.comments_size);
10694         ajByteRevLen4u(&scfdata.header.comments_offset);
10695         ajByteRevLen4u(&scfdata.header.sample_size);
10696         ajByteRevLen4u(&scfdata.header.code_set);
10697         ajByteRevLen4u(&scfdata.header.private_size);
10698         ajByteRevLen4u(&scfdata.header.private_offset);
10699         for(i=0; i < 18; i++)
10700             ajByteRevLen4u(&scfdata.header.spare[i]);
10701     }
10702 
10703     scfversion = scfdata.header.version[0] - '0';
10704 
10705     ajDebug("version %u '%c%c%c%c' uncertainty %u '%s' precision %u %ubit\n",
10706             scfversion, scfdata.header.version[0],
10707             scfdata.header.version[1],
10708             scfdata.header.version[2],
10709             scfdata.header.version[3],
10710             scfdata.header.code_set,
10711             SeqScfUncertainCodes[scfdata.header.code_set].name,
10712             scfdata.header.sample_size,
10713             (scfdata.header.sample_size - 1) ? 8 : 16
10714             );
10715 
10716     ajDebug("%u samples at %u\n",
10717             scfdata.header.samples,
10718             scfdata.header.samples_offset);
10719 
10720     ajDebug("%u bases at %u\n",
10721             scfdata.header.bases,
10722             scfdata.header.bases_offset);
10723 
10724     ajDebug("%u char comment at %u\n",
10725             scfdata.header.comments_size,
10726             scfdata.header.comments_offset);
10727 
10728     ajDebug("%u private records at %u\n",
10729             scfdata.header.private_size,
10730             scfdata.header.private_offset);
10731 
10732 
10733     filestat = ajFileSeek(infile, scfdata.header.bases_offset, SEEK_SET);
10734     if(filestat != 0)
10735     {
10736         ajDebug("seqReadScf seek failed errno %d: %s\n",
10737                 errno, strerror(errno));
10738         return ajFalse;
10739     }
10740 
10741     seqlen = scfdata.header.bases;
10742     AJCNEW(bases, seqlen+1);
10743     AJCNEW(iprob, seqlen);
10744     bases[seqlen] = '\0';
10745 
10746     if(scfversion < 3)
10747     {
10748         for(i=0; i < seqlen; i++)
10749         {
10750             ajReadbinBinary(infile, 1, 12, &scfbase);
10751             if(revint)
10752             {
10753                 if(revint)
10754                     ajByteRevLen4u(&scfbase.peak_index);
10755                 bases[i] = scfbase.base;
10756                 switch(scfbase.base)
10757                 {
10758                     case 'A':
10759                     case 'a':
10760                         iqual = scfbase.prob_A;
10761                         break;
10762                     case 'C':
10763                     case 'c':
10764                         iqual = scfbase.prob_C;
10765                         break;
10766                     case 'G':
10767                     case 'g':
10768                         iqual = scfbase.prob_G;
10769                         break;
10770                     case 'T':
10771                     case 't':
10772                         iqual = scfbase.prob_T;
10773                         break;
10774                     default:
10775                         bases[i] = 'N';
10776                         iqual = 0;
10777                 }
10778                 if(iqual)
10779                     hasqual = ajTrue;
10780                 iprob[i] = iqual;
10781             }
10782         }
10783     }
10784     else if (scfversion == 3)
10785     {
10786         AJCNEW(peakoffset, seqlen);
10787         AJCNEW(probA, seqlen);
10788         AJCNEW(probC, seqlen);
10789         AJCNEW(probG, seqlen);
10790         AJCNEW(probT, seqlen);
10791         ajReadbinBinary(infile, seqlen, 4, peakoffset);
10792         ajReadbinBinary(infile, seqlen, 1, probA);
10793         ajReadbinBinary(infile, seqlen, 1, probC);
10794         ajReadbinBinary(infile, seqlen, 1, probG);
10795         ajReadbinBinary(infile, seqlen, 1, probT);
10796         ajReadbinBinary(infile, seqlen, 1, bases);
10797 
10798         for(i=0; i < seqlen; i++)
10799         {
10800             if(revint)
10801                 ajByteRevLen4u(&peakoffset[i]);
10802             switch(bases[i])
10803             {
10804                 case 'A':
10805                 case 'a':
10806                     iqual = probA[i];
10807                     break;
10808                 case 'C':
10809                 case 'c':
10810                     iqual = probC[i];
10811                     break;
10812                 case 'G':
10813                 case 'g':
10814                     iqual = probG[i];
10815                     break;
10816                 case 'T':
10817                 case 't':
10818                     iqual = probT[i];
10819                     break;
10820                 default:
10821                     bases[i] = 'N';
10822                     iqual = 0;
10823             }
10824             if(iqual)
10825                 hasqual = ajTrue;
10826             iprob[i] = iqual;
10827         }
10828     }
10829     else
10830     {
10831         ajDebug("Unknown SCF version '%c%c%c%c'",
10832                scfdata.header.version[0],
10833                scfdata.header.version[1],
10834                scfdata.header.version[2],
10835                scfdata.header.version[3]);
10836         return ajFalse;
10837     }
10838 
10839     filestat = ajFileSeek(infile, scfdata.header.comments_offset, SEEK_SET);
10840     if(filestat != 0)
10841     {
10842         ajDebug("seqReadScf seek failed errno %d: %s\n",
10843                 errno, strerror(errno));
10844         return ajFalse;
10845     }
10846     ajReadbinStr(infile, scfdata.header.comments_size, &tmpstr);
10847 
10848     ajStrExchangeCC(&tmpstr, "\r", "\n");
10849     ajStrExchangeCC(&tmpstr, "\n\n", "\n");
10850     ajStrExchangeCC(&tmpstr, "\n", "; ");
10851     ajStrExchangeCC(&tmpstr, " ;", ";");
10852     ajStrTrimWhiteEnd(&tmpstr);
10853     ajStrAssignS(&thys->Desc, tmpstr);
10854 
10855     ajStrAssignC(&tmpstr, bases);
10856     seqAppendWarn(&thys->Seq, tmpstr,
10857                   seqin->Input->Format);
10858     if(hasqual)
10859     {
10860         AJCRESIZE(thys->Accuracy, seqlen);
10861         thys->Qualsize = seqlen;
10862         for(i=0; i < seqlen; i++)
10863             thys->Accuracy[i] = seqQualPhred[iprob[i]];
10864     }
10865 
10866     ajStrDel(&tmpstr);
10867 
10868     seqSetNameFile(thys, seqin);
10869 
10870     ajFilebuffClear(buff, 0);
10871     buff->File->End = ajTrue;
10872 
10873     return ajTrue;
10874 }
10875 
10876 
10877 
10878 
10879 /* @funcstatic seqReadStrider *************************************************
10880 **
10881 ** Given data in a sequence structure, tries to read everything needed
10882 ** using DNA strider format.
10883 **
10884 ** @param [w] thys [AjPSeq] Sequence object
10885 ** @param [u] seqin [AjPSeqin] Sequence input object
10886 ** @return [AjBool] ajTrue on success
10887 **
10888 ** @release 1.0.0
10889 ** @@
10890 ******************************************************************************/
10891 
seqReadStrider(AjPSeq thys,AjPSeqin seqin)10892 static AjBool seqReadStrider(AjPSeq thys, AjPSeqin seqin)
10893 {
10894     AjPFilebuff buff;
10895     AjBool ok = ajTrue;
10896 
10897     buff = seqin->Input->Filebuff;
10898 
10899     do
10900     {
10901         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10902 
10903         if(ok)
10904         {
10905             if(ajStrPrefixC(seqReadLine, "; DNA sequence"))
10906             {
10907                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\t,\n");
10908                 ajStrTokenStep(seqHandle); /* 'DNA' */
10909                 ajStrTokenStep(seqHandle); /* sequence */
10910                 ajStrTokenNextParse(seqHandle, &seqToken); /* entry name */
10911             }
10912         }
10913 
10914     } while(ok && ajStrPrefixC(seqReadLine, ";"));
10915 
10916     ajStrTokenReset(seqHandle);
10917 
10918     if(!ok || !ajStrGetLen(seqToken))
10919     {
10920         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
10921         ajStrDelStatic(&seqToken);
10922 
10923         return ajFalse;
10924     }
10925 
10926     /* we know we will succeed from here ... no way to return ajFalse */
10927 
10928     ajFilebuffSetUnbuffered(buff);
10929 
10930     seqSetName(thys, seqToken);
10931 
10932     /* OK, we have the name. Now look for the sequence */
10933 
10934     while(ok && !ajStrPrefixC(seqReadLine, "//"))
10935     {
10936         seqAppend(&thys->Seq, seqReadLine);
10937         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10938     }
10939 
10940     ajFilebuffClear(buff, 0);
10941     ajStrDelStatic(&seqToken);
10942 
10943     return ajTrue;
10944 }
10945 
10946 
10947 
10948 
10949 /* @funcstatic seqReadMsf *****************************************************
10950 **
10951 ** Tries to read input in MSF format. If successful, can repeat for the
10952 ** next call to return the second, third, ... sequence from the same file.
10953 **
10954 ** @param [w] thys [AjPSeq] Sequence object
10955 ** @param [u] seqin [AjPSeqin] Sequence input object
10956 ** @return [AjBool] ajTrue on success
10957 **
10958 ** @release 1.0.0
10959 ** @@
10960 ******************************************************************************/
10961 
seqReadMsf(AjPSeq thys,AjPSeqin seqin)10962 static AjBool seqReadMsf(AjPSeq thys, AjPSeqin seqin)
10963 {
10964     ajuint len;
10965     AjBool ok  = ajFalse;
10966     ajuint iseq = 0;
10967 
10968     AjPFilebuff buff;
10969     AjPTable msftable   = NULL;
10970     SeqPMsfItem msfitem = NULL;
10971     const SeqPMsfItem readmsfitem = NULL;
10972     AjPList msflist     = NULL;
10973     SeqPMsfData msfdata = NULL;
10974 
10975     ajuint i;
10976 
10977     ajDebug("seqReadMsf seqin->SeqData %x\n", seqin->SeqData);
10978 
10979     buff = seqin->Input->Filebuff;
10980 
10981     if(!seqin->SeqData)
10982     {
10983         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
10984         if(!ok)
10985             return ajFalse;
10986 
10987         if(ajStrPrefixC(seqReadLine, "!!"))
10988         {
10989             if(ajStrPrefixC(seqReadLine, "!!AA_MULTIPLE_ALIGNMENT"))
10990                 ajSeqSetProt(thys);
10991 
10992             if(ajStrPrefixC(seqReadLine, "!!NA_MULTIPLE_ALIGNMENT"))
10993                 ajSeqSetNuc(thys);
10994         }
10995 
10996         if(!seqGcgMsfDots(thys, seqin, &seqReadLine, seqMaxGcglines, &len))
10997         {
10998             ajDebug("seqGcgMsfDots failed\n");
10999             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
11000 
11001             return ajFalse;
11002         }
11003 
11004         /* we know we will succeed from here ... no way to return ajFalse */
11005 
11006         ajFilebuffSetUnbuffered(buff);
11007 
11008         seqin->SeqData = AJNEW0(msfdata);
11009         msfdata->Table = msftable = ajTablestrNew(1000);
11010         msflist = ajListstrNew();
11011         seqin->Input->Filecount = 0;
11012         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
11013 
11014         while(ok && !ajStrPrefixC(seqReadLine, "//"))
11015         {
11016             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
11017             if(seqGcgMsfHeader(seqReadLine, &msfitem))
11018             {
11019                 ajTablePut(msftable, ajStrNewS(msfitem->Name), msfitem);
11020                 ajListstrPushAppend(msflist, ajStrNewS(msfitem->Name));
11021                 iseq++;
11022             }
11023         }
11024 
11025         ajDebug("Header has %d sequences\n", iseq);
11026         ajListstrTrace(msflist);
11027         ajTableTrace(msftable);
11028         ajTableMap(msftable, &seqMsfTabList, NULL);
11029 
11030         msfdata->Names = AJCALLOC(iseq, sizeof(*msfdata->Names));
11031 
11032         for(i=0; i < iseq; i++)
11033         {
11034             ajListstrPop(msflist, &msfdata->Names[i]);
11035             ajDebug("list [%d] '%S'\n", i, msfdata->Names[i]);
11036         }
11037 
11038         ajListstrFreeData(&msflist);
11039 
11040         while(ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
11041         {
11042             seqGcgMsfReadseq(seqReadLine, msftable);
11043         }
11044 
11045         ajTableMap(msftable, &seqMsfTabList, NULL);
11046         msfdata->Nseq = iseq;
11047         msfdata->Count = 0;
11048         msfdata->Bufflines = ajTextinGetRecords(seqin->Input);
11049         ajDebug("MSF format read %d lines\n",
11050                 ajTextinGetRecords(seqin->Input));
11051     }
11052 
11053     msfdata = seqin->SeqData;
11054     msftable = msfdata->Table;
11055 
11056     if(msfdata->Count >= msfdata->Nseq)
11057     {
11058         ajFilebuffClear(seqin->Input->Filebuff, 0);
11059         seqMsfDataDel((SeqPMsfData*)&seqin->SeqData);
11060 
11061         return ajFalse;
11062     }
11063 
11064     i = msfdata->Count;
11065     ajDebug("returning [%d] '%S'\n", i, msfdata->Names[i]);
11066     readmsfitem = ajTableFetchS(msftable, msfdata->Names[i]);
11067     ajStrAssignS(&thys->Name, msfdata->Names[i]);
11068 
11069     thys->Weight = readmsfitem->Weight;
11070     ajStrAssignS(&thys->Seq, readmsfitem->Seq);
11071 
11072     msfdata->Count++;
11073 
11074     return ajTrue;
11075 }
11076 
11077 
11078 
11079 
11080 /* @funcstatic seqGcgMsfReadseq ***********************************************
11081 **
11082 ** Reads sequence name from first token on the input line, and appends
11083 ** the sequence data to that sequence in the msftable structure.
11084 **
11085 ** @param [r] rdline [const AjPStr] Line from input file.
11086 ** @param [r] msftable [const AjPTable] MSF format sequence table.
11087 ** @return [AjBool] ajTrue on success
11088 **
11089 ** @release 1.0.0
11090 ** @@
11091 ******************************************************************************/
11092 
seqGcgMsfReadseq(const AjPStr rdline,const AjPTable msftable)11093 static AjBool seqGcgMsfReadseq(const AjPStr rdline, const AjPTable msftable)
11094 {
11095     SeqPMsfItem msfitem;
11096     AjPStr token     = NULL;
11097     AjPStr seqstr    = NULL;
11098     AjBool status;
11099 
11100     status = ajStrExtractWord(rdline, &seqstr, &token);
11101 
11102     if(!status)
11103     {
11104         ajStrDel(&token);
11105         ajStrDel(&seqstr);
11106 
11107         return ajFalse;
11108     }
11109 
11110     ajDebug("seqGcgMsfReadseq '%S' '%S'\n", token, seqstr);
11111 
11112     msfitem = ajTableFetchmodS(msftable, token);
11113 
11114     if(!msfitem)
11115     {
11116         ajStrDel(&token);
11117         ajStrDel(&seqstr);
11118 
11119         return ajFalse;
11120     }
11121 
11122     seqAppend(&msfitem->Seq, seqstr);
11123 
11124     ajStrDel(&token);
11125     ajStrDel(&seqstr);
11126 
11127     return ajTrue;
11128 }
11129 
11130 
11131 
11132 
11133 /* @funcstatic seqMsfDataDel **************************************************
11134 **
11135 ** Destructor for SeqPMsfData objects
11136 **
11137 ** @param [d] pthys [SeqPMsfData*] MSF data object
11138 ** @return [void]
11139 **
11140 ** @release 4.1.0
11141 ** @@
11142 ******************************************************************************/
11143 
seqMsfDataDel(SeqPMsfData * pthys)11144 static void seqMsfDataDel(SeqPMsfData* pthys)
11145 {
11146     SeqPMsfData thys;
11147     ajuint i;
11148 
11149     if(!pthys)
11150         return;
11151 
11152     if(!*pthys)
11153         return;
11154 
11155     thys = *pthys;
11156 
11157     ajDebug("seqMsfDataDel Nseq:%u Count:%u Table:%u Nexus:%Lu\n",
11158             thys->Nseq, thys->Count, ajTableGetLength(thys->Table),
11159             ajNexusGetNtaxa(thys->Nexus));
11160 
11161     for(i=0; i < thys->Nseq; i++)
11162     {
11163         ajStrDel(&thys->Names[i]);
11164     }
11165 
11166 
11167     AJFREE(thys->Names);
11168 
11169     ajNexusDel(&thys->Nexus);
11170     ajStrDel(&thys->Gene);
11171     ajStrDel(&thys->Domain);
11172     ajStrDel(&thys->NextGene);
11173     ajStrDel(&thys->NextDomain);
11174     ajTableMapDel(thys->Table, &seqMsfTabDel, NULL);
11175     ajTableFree(&thys->Table);
11176 
11177     AJFREE(*pthys);
11178 
11179     return;
11180 }
11181 
11182 
11183 
11184 
11185 /* @funcstatic seqMsfItemDel **************************************************
11186 **
11187 ** Destructor for SeqPMsfItem objects
11188 **
11189 ** @param [d] pthys [SeqPMsfItem*] MSF item object
11190 ** @return [void]
11191 **
11192 ** @release 4.1.0
11193 ** @@
11194 ******************************************************************************/
11195 
seqMsfItemDel(SeqPMsfItem * pthys)11196 static void seqMsfItemDel(SeqPMsfItem* pthys)
11197 {
11198     SeqPMsfItem thys;
11199 
11200     if(!pthys)
11201         return;
11202 
11203     if(!*pthys)
11204         return;
11205 
11206     thys = *pthys;
11207 
11208     ajStrDel(&thys->Name);
11209     ajStrDel(&thys->Desc);
11210     ajStrDel(&thys->Seq);
11211 
11212     AJFREE(*pthys);
11213 
11214     return;
11215 }
11216 
11217 
11218 
11219 
11220 /* @funcstatic seqMsfTabList **************************************************
11221 **
11222 ** Writes a debug report of the contents of an MSF table.
11223 **
11224 ** @param [r] key [const void*] Standard argument, key from current table item
11225 **                              which is a string for MSF internal tables.
11226 ** @param [r] value [void**] Standard argument, data from current table item,
11227 **                           converted to an MSF internal table item.
11228 ** @param [r] cl [void*] Standard argument, usually NULL.
11229 ** @return [void]
11230 **
11231 ** @release 1.0.0
11232 ** @@
11233 ******************************************************************************/
11234 
seqMsfTabList(const void * key,void ** value,void * cl)11235 static void seqMsfTabList(const void* key, void** value, void* cl)
11236 {
11237     SeqPMsfItem msfitem;
11238 
11239     (void) cl;
11240 
11241     msfitem = (SeqPMsfItem) *value;
11242 
11243     ajDebug("key '%S' Name '%S' Seqlen %d\n",
11244             key, msfitem->Name, ajStrGetLen(msfitem->Seq));
11245 
11246     return;
11247 }
11248 
11249 
11250 
11251 
11252 /* @funcstatic seqMsfDataTrace ************************************************
11253 **
11254 ** Debug trace report for SeqPMsfData objects
11255 **
11256 ** @param [r] thys [const SeqPMsfData] MSF data object
11257 ** @return [void]
11258 **
11259 ** @release 4.1.0
11260 ** @@
11261 ******************************************************************************/
11262 
seqMsfDataTrace(const SeqPMsfData thys)11263 static void seqMsfDataTrace(const SeqPMsfData thys)
11264 {
11265     ajuint i;
11266 
11267     if(!thys)
11268     {
11269         ajDebug("seqMsfDataTrace <null>\n");
11270         return;
11271     }
11272 
11273     ajDebug("seqMsfDataTrace Nseq:%u Count:%u Table:%u Nexus:%Lu\n",
11274             thys->Nseq, thys->Count, ajTableGetLength(thys->Table),
11275             ajNexusGetNtaxa(thys->Nexus));
11276 
11277     for(i=0; i < thys->Nseq; i++)
11278         if(i < thys->Count)
11279             ajDebug("* [%u] '%S'\n", i, thys->Names[i]);
11280         else
11281             ajDebug("  [%u] '%S'\n", i, thys->Names[i]);
11282 
11283     ajTableMap(thys->Table, &seqMsfTabList, NULL);
11284 
11285     return;
11286 }
11287 
11288 
11289 
11290 
11291 /* @funcstatic seqMsfTabDel ***************************************************
11292 **
11293 ** Deletes entries from the MSF internal table. Called for each entry in turn.
11294 **
11295 ** @param [d] key [void**] Standard argument, table key.
11296 ** @param [d] value [void**] Standard argument, table data item.
11297 ** @param [r] cl [void*] Standard argument, usually NULL
11298 ** @return [void]
11299 **
11300 ** @release 1.0.0
11301 ** @@
11302 ******************************************************************************/
11303 
seqMsfTabDel(void ** key,void ** value,void * cl)11304 static void seqMsfTabDel(void** key, void** value, void* cl)
11305 {
11306     SeqPMsfItem msfitem;
11307     AjPStr keystr;
11308 
11309     (void) cl;
11310 
11311     keystr = (AjPStr) *key;
11312     msfitem = (SeqPMsfItem) *value;
11313 
11314     ajStrDel(&keystr);
11315 
11316     seqMsfItemDel(&msfitem);
11317 
11318     *key = NULL;
11319     *value = NULL;
11320 
11321     return;
11322 }
11323 
11324 
11325 
11326 
11327 /* @funcstatic seqReadSwiss ***************************************************
11328 **
11329 ** Given data in a sequence structure, tries to read everything needed
11330 ** using SWISS format.
11331 **
11332 ** @param [w] thys [AjPSeq] Sequence object
11333 ** @param [u] seqin [AjPSeqin] Sequence input object
11334 ** @return [AjBool] ajTrue on success
11335 **
11336 ** @release 1.0.0
11337 ** @@
11338 ******************************************************************************/
11339 
seqReadSwiss(AjPSeq thys,AjPSeqin seqin)11340 static AjBool seqReadSwiss(AjPSeq thys, AjPSeqin seqin)
11341 {
11342     AjBool ok;
11343     AjPFilebuff buff;
11344     AjBool dodes = ajFalse;
11345     AjBool dofeat  = ajFalse;
11346     AjBool tryfeat = ajFalse;
11347     AjPStr liststr;                     /* for lists, do not delete */
11348     AjPStr datestr = NULL;
11349     AjPStr datetype = NULL;
11350     AjPStr relstr = NULL;
11351     AjPStr taxstr = NULL;
11352     AjPStr cmtstr = NULL;               /* stored in AjPSeq - do not delete */
11353     ajuint icount = 0;
11354     AjPSeqRef  seqref  = NULL;
11355     AjPSeqXref xref    = NULL;
11356     AjPSeqGene seqgene = NULL;
11357     AjPSeqDesc desctop = NULL;
11358     AjPSeqDesc descmaster = NULL;
11359     AjPSeqSubdesc subdesc = NULL;
11360     AjBool descistop = ajTrue;
11361     AjBool isdescflag = ajFalse;
11362     AjPStr *Pdescstr = NULL;
11363     AjPStr newdescstr = NULL;
11364     AjPStr genetoken = NULL;
11365     const AjPStr tmpstr = NULL;
11366     ajuint refnum;
11367     ajuint itaxtype = 0;
11368     AjBool isnewgene = ajFalse;
11369     AjBool isgenetoken = ajFalse;
11370     AjIList iter;
11371     AjIList itb;
11372     AjIList itc;
11373     SeqEPrefixSwiss lineprefix = SWISS_UNK;
11374 
11375 /*
11376 ** To be done: 12-Feb-09
11377 ** input line wrapping test GN,
11378 ** continue lines for OS
11379 **
11380 ** New line types:
11381 **    OH organism host: list of tax ids
11382 **
11383 ** CC line blocks -!- TOPIC:
11384 ** can do this by parsing the stored comment block
11385 **
11386 ** DR lines - can parse out the details
11387 */
11388 
11389     buff = seqin->Input->Filebuff;
11390 
11391     if(!seqFtFmtSwiss)
11392         ajStrAssignC(&seqFtFmtSwiss, "swissprot");
11393 
11394     if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
11395         return ajFalse;
11396 
11397     lineprefix = seqPrefixSwiss(seqReadLine);
11398 
11399     /* for GCG formatted databases */
11400 
11401     while(lineprefix == SWISS_WP) /* "WP" */
11402     {
11403         if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
11404             return ajFalse;
11405 
11406         lineprefix = seqPrefixSwiss(seqReadLine);
11407     }
11408 
11409     /* extra blank lines */
11410 
11411     while(ajStrIsWhite(seqReadLine))
11412     {
11413         if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
11414             return ajFalse;
11415 
11416         lineprefix = seqPrefixSwiss(seqReadLine);
11417     }
11418 
11419     ajDebug("seqReadSwiss first line '%S'\n", seqReadLine);
11420 
11421     if(lineprefix != SWISS_ID)  /* "ID" */
11422     {
11423         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
11424 
11425         return ajFalse;
11426     }
11427 
11428     ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11429     ajStrTokenStep(seqHandle);       /* 'ID' */
11430     ajStrTokenNextParse(seqHandle, &seqToken);       /* entry name */
11431 
11432     seqSetName(thys, seqToken);
11433 
11434     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
11435     lineprefix = seqPrefixSwiss(seqReadLine);
11436 
11437     if(!seqin->Minimal && !thys->Fulldesc)
11438         thys->Fulldesc = ajSeqdescNew();
11439 
11440     if(seqin->Minimal)
11441         dodes = ajTrue;
11442 
11443     dofeat = ajFalse;
11444     tryfeat = seqinUfoLocal(seqin);
11445 
11446     while(ok && lineprefix != SWISS_SQ) /* read until "SQ" */
11447     {
11448         /* check for Staden Experiment format instead */
11449 
11450         if(lineprefix == SWISS_EX) /* EN/EX/TN */
11451         {
11452             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
11453             ajStrTokenReset(seqHandle);
11454             ajStrDelStatic(&seqToken);
11455 
11456             return ajFalse;;
11457         }
11458 
11459         else if(lineprefix == SWISS_AC) /* AC */
11460         {
11461             ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
11462             ajStrTokenStep(seqHandle); /* 'AC' */
11463 
11464             while(ajStrTokenNextParse(seqHandle, &seqToken))
11465                 seqAccSave(thys, seqToken);
11466         }
11467 
11468         if(tryfeat && lineprefix == SWISS_FT) /* FT */
11469         {
11470             if(!dofeat)         /* set up feature buffer */
11471             {
11472                 dofeat = ajTrue;
11473                 ajFeattabinDel(&seqin->Ftquery);
11474                 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtSwiss,
11475                                                        thys->Name, "N");
11476                 ajDebug("seqin->Ftquery ftfile %x\n",
11477                         seqin->Ftquery->Input->Filebuff);
11478             }
11479 
11480             ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
11481             /* ajDebug("SWISS FEAT saved line:\n%S", seqReadLine); */
11482         }
11483 
11484         if(seqin->Minimal)
11485         {
11486             /*
11487             ** only simple description needed
11488             ** test DE line, extract basic text if any
11489             ** then go to next record
11490            */
11491 
11492             if(lineprefix == SWISS_DE) /* DE minimal processing */
11493             {
11494                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
11495                 ajStrTokenStep(seqHandle); /* 'DE' */
11496 
11497                 while(ajStrTokenNextParseC(seqHandle, " ;\n\r", &seqToken))
11498                 {
11499                     if(MAJSTRGETCHARLAST(seqToken) == ':')
11500                     {
11501                         switch(seqDesSwiss(seqToken))
11502                         {
11503                             case SWISS_DES_REC:
11504                                 dodes = ajTrue;
11505                                 break;
11506 
11507                             case SWISS_DES_UNK:
11508                                 if(dodes)
11509                                 {
11510                                     if(MAJSTRGETLEN(thys->Desc))
11511                                         ajStrAppendK(&thys->Desc, ' ');
11512                                     ajStrAppendS(&thys->Desc, seqToken);
11513                                 }
11514                                 break;
11515 
11516                             default:
11517                                 if(MAJSTRGETLEN(thys->Desc))
11518                                     dodes = ajFalse;
11519                                 break;
11520                         }
11521                     }
11522                     else if(ajStrFindK(seqToken, '=') > 0)
11523                     {
11524                         switch(seqDessubSwiss(&seqToken))
11525                         {
11526                             case SWISS_SUB_FULL:
11527                                 if(!MAJSTRGETLEN(thys->Desc))
11528                                     dodes = ajTrue;
11529                                 break;
11530                             case SWISS_SUB_UNK:
11531                                 break;
11532                             default:
11533                                 dodes = ajFalse;
11534                                 break;
11535                         }
11536                         if(dodes)
11537                         {
11538                             if(MAJSTRGETLEN(thys->Desc))
11539                                 ajStrAppendK(&thys->Desc, ' ');
11540                             ajStrAppendS(&thys->Desc, seqToken);
11541                         }
11542                     }
11543                     else
11544                     {
11545                         if(dodes)
11546                         {
11547                             if(MAJSTRGETLEN(thys->Desc))
11548                                 ajStrAppendK(&thys->Desc, ' ');
11549                             ajStrAppendS(&thys->Desc, seqToken);
11550                         }
11551                     }
11552                 }
11553             }
11554 
11555             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
11556             lineprefix = seqPrefixSwiss(seqReadLine);
11557             continue;
11558         }
11559 
11560         switch(lineprefix)      /* all other line types */
11561         {
11562             case SWISS_DE:
11563                 if(!desctop)
11564                 {
11565                     desctop = thys->Fulldesc;
11566                     descmaster = thys->Fulldesc;
11567                     Pdescstr = &thys->Desc;
11568                 }
11569 
11570                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
11571                 ajStrTokenStep(seqHandle); /* 'DE' */
11572 
11573                 while(ajStrTokenNextParseC(seqHandle, " ;\n\r", &seqToken))
11574                 {
11575                     if(ajStrGetCharLast(seqToken) == ':')
11576                     {
11577                         isdescflag = ajFalse;
11578 
11579                         switch(seqDesSwiss(seqToken))
11580                         {
11581                             case SWISS_DES_REC:
11582                                 Pdescstr = &descmaster->Name;
11583                                 descistop = ajTrue;
11584                                 break;
11585 
11586                             case SWISS_DES_ALT:
11587                                 subdesc = ajSeqsubdescNew();
11588                                 descistop = ajFalse;
11589                                 Pdescstr = &subdesc->Name;
11590                                 ajListPushAppend(descmaster->AltNames, subdesc);
11591                                 break;
11592 
11593                             case SWISS_DES_SUB:
11594                                 subdesc = ajSeqsubdescNew();
11595                                 descistop = ajFalse;
11596                                 Pdescstr = &subdesc->Name;
11597                                 ajListPushAppend(descmaster->SubNames, subdesc);
11598                                 break;
11599 
11600                             case SWISS_DES_INC:
11601                                 descmaster = ajSeqdescNew();
11602                                 descistop = ajTrue;
11603                                 ajListPushAppend(thys->Fulldesc->Includes,
11604                                                  descmaster);
11605                                 Pdescstr = &descmaster->Name;
11606                                 break;
11607 
11608                             case SWISS_DES_CONT:
11609                                 descmaster = ajSeqdescNew();
11610                                 descistop = ajTrue;
11611                                 ajListPushAppend(thys->Fulldesc->Contains,
11612                                                  descmaster);
11613                                 Pdescstr = &descmaster->Name;
11614                                 break;
11615 
11616                             case SWISS_DES_FLG:
11617                                 isdescflag = ajTrue;
11618                                 break;
11619 
11620                             default:
11621                                 ajDebug("Swissprot DE line"
11622                                         "UNKNOWN token '%S'\n",
11623                                         seqToken);
11624 
11625                                 if(ajStrGetLen(*Pdescstr))
11626                                     ajStrAppendK(Pdescstr, ' ');
11627 
11628                                 ajStrAppendS(Pdescstr, seqToken);
11629                         }
11630                     }
11631 
11632                     else if(ajStrFindK(seqToken, '=') > 0)
11633                     {
11634                         switch(seqDessubSwiss(&seqToken))
11635                         {
11636                             case SWISS_SUB_FULL:
11637                                 if(descistop)
11638                                 {
11639                                     Pdescstr = &descmaster->Name;
11640                                 }
11641                                 else
11642                                 {
11643                                     Pdescstr = &subdesc->Name;
11644                                 }
11645 
11646                                 ajStrAssignS(Pdescstr, seqToken);
11647                                 break;
11648 
11649                             case SWISS_SUB_SHORT:
11650                                 newdescstr = ajStrNewC("");
11651                                 Pdescstr = &newdescstr;
11652 
11653                                 if(descistop)
11654                                     ajListstrPushAppend(descmaster->Short,
11655                                                         newdescstr);
11656                                 else
11657                                     ajListstrPushAppend(subdesc->Short,
11658                                                         newdescstr);
11659 
11660                                 ajStrAssignS(Pdescstr, seqToken);
11661                                 break;
11662 
11663                             case SWISS_SUB_EC:
11664                                 newdescstr = ajStrNewC("");
11665                                 Pdescstr = &newdescstr;
11666 
11667                                 if(descistop)
11668                                     ajListstrPushAppend(descmaster->EC,
11669                                                         newdescstr);
11670                                 else
11671                                     ajListstrPushAppend(subdesc->EC,
11672                                                         newdescstr);
11673 
11674                                 ajStrAssignS(Pdescstr, seqToken);
11675                                 xref = ajSeqxrefNewDbC(*Pdescstr, "ENZYME",
11676                                                        XREF_EC);
11677                                 ajSeqAddXref(thys, xref);
11678                                 xref = NULL;
11679                                 break;
11680 
11681                             case SWISS_SUB_ALLER:
11682                                 newdescstr = ajStrNewC("");
11683                                 Pdescstr = &newdescstr;
11684                                 ajListstrPushAppend(subdesc->Allergen,
11685                                                     newdescstr);
11686                                 ajStrAssignS(Pdescstr, seqToken);
11687                                 xref = ajSeqxrefNewDbC(*Pdescstr, "Allergen",
11688                                                        XREF_DESC);
11689                                 ajSeqAddXref(thys, xref);
11690                                 xref = NULL;
11691                                 break;
11692 
11693                             case SWISS_SUB_BIOTECH:
11694                                     newdescstr = ajStrNewC("");
11695                                 Pdescstr = &newdescstr;
11696                                 ajListstrPushAppend(subdesc->Biotech,
11697                                                     newdescstr);
11698                                 ajStrAssignS(Pdescstr, seqToken);
11699                                 break;
11700 
11701                             case SWISS_SUB_CDA:
11702                                 newdescstr = ajStrNewC("");
11703                                 Pdescstr = &newdescstr;
11704                                 ajListstrPushAppend(subdesc->Cdantigen,
11705                                                     newdescstr);
11706                                 ajStrAssignS(Pdescstr, seqToken);
11707                                 xref = ajSeqxrefNewDbC(*Pdescstr, "CD_Antigen",
11708                                                        XREF_DESC);
11709                                 ajSeqAddXref(thys, xref);
11710                                 xref = NULL;
11711                                 break;
11712 
11713                             case SWISS_SUB_INN:
11714                                 newdescstr = ajStrNewC("");
11715                                 Pdescstr = &newdescstr;
11716                                 ajListstrPushAppend(subdesc->Inn, newdescstr);
11717                                 ajStrAssignSubS(Pdescstr, seqToken, 4, -1);
11718                                 break;
11719 
11720                             default:
11721                                 ajDebug("Swissprot DE line "
11722                                         "UNKNOWN subtoken '%S'\n",
11723                                         seqToken);
11724 
11725                                 if(ajStrGetLen(*Pdescstr))
11726                                     ajStrAppendK(Pdescstr, ' ');
11727 
11728                                 ajStrAppendS(Pdescstr, seqToken);
11729                                 break;
11730                         }
11731                     }
11732                     else
11733                     {
11734                         if(isdescflag)
11735                         {
11736                             if(ajStrMatchC(seqToken,"Precursor"))
11737                                 thys->Fulldesc->Precursor = ajTrue;
11738                             else if(ajStrMatchC(seqToken,"Fragments"))
11739                                 thys->Fulldesc->Fragments = 2;
11740                             else if(ajStrMatchC(seqToken,"Fragment"))
11741                                 thys->Fulldesc->Fragments = 1;
11742                             else
11743                             {
11744                                 ajDebug("unknown description flag text '%S'\n",
11745                                         seqToken);
11746                                 if(ajStrGetLen(*Pdescstr))
11747                                     ajStrAppendK(Pdescstr, ' ');
11748 
11749                                 ajStrAppendC(Pdescstr, "Flags: ");
11750                                 ajStrAppendS(Pdescstr, seqToken);
11751                             }
11752 
11753                         }
11754                         else
11755                         {
11756                             if(ajStrGetLen(*Pdescstr))
11757                                 ajStrAppendK(Pdescstr, ' ');
11758 
11759                             ajStrAppendS(Pdescstr, seqToken);
11760                         }
11761                     }
11762                 }
11763                 break;
11764 
11765                 /* needs a little work for wrapped lines - save token and
11766                 ** append rather than set at the current level
11767                 */
11768 
11769             case SWISS_GN:
11770                 if(!seqgene)
11771                 {
11772                     isnewgene = ajTrue;
11773                     seqgene = ajSeqgeneNew();
11774                 }
11775 
11776                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
11777                 ajStrTokenStep(seqHandle); /* 'GN' */
11778                 ajStrTokenNextParseC(seqHandle, ";=\n\r", &seqToken);
11779 
11780                 if(ajStrMatchC(seqToken, "and")) /* test 'and' between genes */
11781                 {
11782                     isnewgene = ajTrue;
11783                     seqgene = ajSeqgeneNew();
11784                 }
11785                 else
11786                 {
11787                     while(ajStrGetLen(seqToken))
11788                     {
11789                         isgenetoken = ajTrue;
11790                         ajStrTrimWhite(&seqToken);
11791 
11792                         if(ajStrMatchC(seqToken, "Name"))
11793                         {
11794                             ajStrTokenNextParseC(seqHandle, ";\n\r",
11795                                                  &seqToken2);
11796                             ajSeqgeneSetName(seqgene, seqToken2);
11797                         }
11798                         else if (ajStrMatchC(seqToken, "Synonyms"))
11799                         {
11800                             ajStrTokenNextParseC(seqHandle, ";\n\r",
11801                                                  &seqToken2);
11802                             ajSeqgeneSetSynonyms(seqgene, seqToken2);
11803                         }
11804                         else if (ajStrMatchC(seqToken, "OrderedLocusNames"))
11805                         {
11806                             ajStrTokenNextParseC(seqHandle, ";\n\r",
11807                                                  &seqToken2);
11808                             ajSeqgeneSetOln(seqgene, seqToken2);
11809                         }
11810                         else if (ajStrMatchC(seqToken, "ORFNames"))
11811                         {
11812                             ajStrTokenNextParseC(seqHandle, ";\n\r",
11813                                                  &seqToken2);
11814                             ajSeqgeneSetOrf(seqgene, seqToken2);
11815                         }
11816                         else
11817                         {
11818                             isgenetoken = ajFalse;
11819                             ajDebug("Swissnew GN line unexpected '%S' (%S)",
11820                                     seqToken, genetoken);
11821 
11822                             if(ajStrMatchC(genetoken, "Name"))
11823                                 ajSeqgeneAppendName(seqgene, seqToken);
11824                             else if (ajStrMatchC(genetoken, "Synonyms"))
11825                                 ajSeqgeneAppendSynonyms(seqgene, seqToken);
11826                             else if (ajStrMatchC(genetoken,
11827                                                  "OrderedLocusNames"))
11828                                 ajSeqgeneAppendOln(seqgene, seqToken);
11829                             else if (ajStrMatchC(genetoken, "ORFNames"))
11830                                 ajSeqgeneAppendOrf(seqgene, seqToken);
11831                         }
11832 
11833                         ajStrTokenNextParseC(seqHandle, "=;\n\r", &seqToken);
11834 
11835                         if(isgenetoken)
11836                             ajStrAssignS(&genetoken, seqToken);
11837                     }
11838 
11839                     if(isnewgene)
11840                     {
11841                         isnewgene = ajFalse;
11842                         ajSeqAddGene(thys, seqgene);
11843                     }
11844                     /* keep seqgene so we can add to it if the line wraps */
11845                 }
11846                 break;
11847 
11848             case SWISS_PE:
11849                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11850                 ajStrTokenStep(seqHandle); /* PE */
11851                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
11852 
11853                 if(ajStrGetLen(seqToken))
11854                     ajStrAssignS(&thys->Evidence, seqToken);
11855                 break;
11856 
11857             case SWISS_KW:
11858                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11859                 ajStrTokenStep(seqHandle); /* 'KW' */
11860 
11861                 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
11862                 {
11863                     liststr = ajStrNewS(seqToken);
11864                     ajStrTrimWhite(&liststr);
11865                     ajSeqAddKey(thys, liststr);
11866                     liststr = NULL;
11867                 }
11868                 break;
11869 
11870             case SWISS_OS:
11871                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11872                 ajStrTokenStep(seqHandle); /* 'OS' */
11873 
11874                 while(ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken))
11875                 {
11876                     if(ajStrGetLen(taxstr))
11877                         ajStrAppendK(&taxstr, ' ');
11878 
11879                     ajStrAppendS(&taxstr, seqToken);
11880                 }
11881                 break;
11882 
11883             case SWISS_OC:
11884                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11885                 ajStrTokenStep(seqHandle); /* 'OC' */
11886 
11887                 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
11888                 {
11889                     ajStrTrimWhite(&seqToken);
11890                     seqTaxSave(thys, seqToken, 0);
11891                 }
11892                 break;
11893 
11894             case SWISS_OG:
11895                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
11896                 ajStrTokenStep(seqHandle); /* 'OG' */
11897                 ajStrTokenNextParse(seqHandle, &seqToken2);
11898 
11899                 while(ajStrTokenNextParse(seqHandle, &seqToken))
11900                 {
11901                     ajStrAppendK(&seqToken2, ' ');
11902                     ajStrAppendS(&seqToken2, seqToken);
11903                 }
11904 
11905                 if(ajStrGetCharLast(seqToken2) == '.')
11906                     ajStrCutEnd(&seqToken2, 1);
11907 
11908                 seqTaxSave(thys, seqToken2, 2);
11909                 break;
11910 
11911             case SWISS_OH:
11912                 ajStrTokenAssignC(&seqHandle, seqReadLine, " =;\n\r");
11913                 ajStrTokenStep(seqHandle); /* 'OH' */
11914                 ajStrTokenNextParse(seqHandle, &seqToken);
11915 
11916                 if(ajStrMatchC(seqToken, "NCBI_TaxID"))
11917                 {
11918                     ajStrTokenNextParse(seqHandle, &seqToken2);
11919                     seqTaxidSaveS(thys, seqToken2);
11920                     xref = ajSeqxrefNewDbC(seqToken2, "taxon", XREF_TAX);
11921                     ajSeqAddXref(thys, xref);
11922                     xref = NULL;
11923                 }
11924                 break;
11925 
11926             case SWISS_OX:
11927                 ajStrTokenAssignC(&seqHandle, seqReadLine, " =;\n\r");
11928                 ajStrTokenStep(seqHandle); /* 'OX' */
11929                 ajStrTokenNextParse(seqHandle, &seqToken);
11930 
11931                 if(ajStrMatchC(seqToken, "NCBI_TaxID"))
11932                 {
11933                     ajStrTokenNextParse(seqHandle, &seqToken2);
11934                     seqTaxidSaveS(thys, seqToken2);
11935                     xref = ajSeqxrefNewDbC(seqToken2, "taxon", XREF_TAX);
11936                     ajSeqAddXref(thys, xref);
11937                     xref = NULL;
11938                 }
11939                 break;
11940 
11941             case SWISS_CC:
11942                 ajStrAssignSubS(&seqToken, seqReadLine, 5, -1);
11943 
11944                 if(ajStrGetLen(cmtstr))
11945                 {
11946                     ajStrAppendC(&cmtstr, "\n");
11947 
11948                     if(ajStrPrefixC(seqToken, "-!- ") ||
11949                        (ajStrPrefixC(seqToken, "--------") &&
11950                         ajStrPrefixC(cmtstr, "-!- ")))
11951                     {
11952                         ajSeqAddCmt(thys, cmtstr);
11953                         cmtstr = NULL;
11954                     }
11955                 }
11956 
11957                 ajStrAppendS(&cmtstr, seqToken);
11958                 break;
11959 
11960             case SWISS_DR:
11961                 AJNEW0(xref);
11962                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
11963                 ajStrTokenStep(seqHandle); /* 'DR' */
11964                 ajStrTokenNextParseC(seqHandle, ";\n\r",
11965                                      &xref->Db); /* dbname */
11966                 ajStrTrimWhite(&xref->Db);
11967                 ajStrTokenNextParse(seqHandle, &xref->Id); /* primary */
11968                 ajStrTrimWhite(&xref->Id);
11969                 ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
11970 
11971                 if(!ajStrGetLen(seqToken))
11972                 {
11973                     if(ajStrGetCharLast(xref->Id) == '.')
11974                         ajStrCutEnd(&xref->Id, 1);
11975                 }
11976                 else
11977                 {
11978                     if(ajStrGetCharLast(seqToken) == '.')
11979                         ajStrCutEnd(&seqToken, 1);
11980                     ajStrTrimWhite(&seqToken);
11981                     ajStrAssignS(&xref->Secid, seqToken);
11982 
11983                     ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
11984 
11985                     if(!ajStrGetLen(seqToken))
11986                     {
11987                         if(ajStrGetCharLast(xref->Secid) == '.')
11988                             ajStrCutEnd(&xref->Secid, 1);
11989                     }
11990                     else
11991                     {
11992                         if(ajStrGetCharLast(seqToken) == '.')
11993                             ajStrCutEnd(&seqToken, 1);
11994                         ajStrTrimWhite(&seqToken);
11995                         ajStrAssignS(&xref->Terid, seqToken);
11996 
11997                         ajStrTokenNextParse(seqHandle, &seqToken);/* secondary*/
11998 
11999                         if(!ajStrGetLen(seqToken))
12000                         {
12001                             if(ajStrGetCharLast(xref->Terid) == '.')
12002                                 ajStrCutEnd(&xref->Terid, 1);
12003                         }
12004                         else
12005                         {
12006                             if(ajStrGetCharLast(seqToken) == '.')
12007                                 ajStrCutEnd(&seqToken, 1);
12008                             ajStrTrimWhite(&seqToken);
12009                             ajStrAssignS(&xref->Quatid, seqToken);
12010                         }
12011                     }
12012                 }
12013                 xref->Type = XREF_DR;
12014                 ajSeqAddXref(thys, xref);
12015                 xref = NULL;
12016                 break;
12017 
12018             case SWISS_RN:
12019                 if(seqref)
12020                 {
12021                     ajSeqrefStandard(seqref);
12022                     ajSeqAddRef(thys, seqref);
12023                     seqref = NULL;
12024                 }
12025 
12026                 seqref = ajSeqrefNew();
12027                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12028                 ajStrTokenStep(seqHandle); /* 'RN' */
12029                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* [num] */
12030                 ajStrAssignSubS(&seqToken2, seqToken, 1, -2);
12031                 ajStrToUint(seqToken2, &refnum);
12032                 ajSeqrefSetnumNumber(seqref, refnum);
12033                 break;
12034 
12035             case SWISS_RG:
12036                 if(!seqref)
12037                     seqref = ajSeqrefNew();
12038 
12039                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12040                 ajStrTokenStep(seqHandle); /* 'RG' */
12041                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* groupname */
12042                 ajSeqrefAppendGroupname(seqref, seqToken);
12043                 break;
12044 
12045             case SWISS_RX:
12046                 if(!seqref)
12047                     seqref = ajSeqrefNew();
12048 
12049                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12050                 ajStrTokenStep(seqHandle); /* 'RX' */
12051                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* xref */
12052                 ajSeqrefAppendXref(seqref, seqToken);
12053                 break;
12054 
12055             case SWISS_RP:
12056                 if(!seqref)
12057                     seqref = ajSeqrefNew();
12058 
12059                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12060                 ajStrTokenStep(seqHandle); /* 'RP' */
12061                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* position */
12062                 ajSeqrefAppendPosition(seqref, seqToken);
12063                 break;
12064 
12065             case SWISS_RA:
12066                 if(!seqref)
12067                     seqref = ajSeqrefNew();
12068 
12069                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12070                 ajStrTokenStep(seqHandle); /* 'RA' */
12071                 ajStrTokenNextParseC(seqHandle, "\n\r;", &seqToken); /* authors */
12072                 ajSeqrefAppendAuthors(seqref, seqToken);
12073                 break;
12074 
12075             case SWISS_RT:
12076                 if(!seqref)
12077                     seqref = ajSeqrefNew();
12078 
12079                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12080                 ajStrTokenStep(seqHandle); /* 'RT' */
12081                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
12082 
12083                 if(!ajStrMatchC(seqToken, ";"))
12084                     ajSeqrefAppendTitle(seqref, seqToken);
12085                 break;
12086 
12087             case SWISS_RL:
12088                 if(!seqref)
12089                     seqref = ajSeqrefNew();
12090 
12091                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12092                 ajStrTokenStep(seqHandle); /* 'RL' */
12093                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* location */
12094                 ajSeqrefAppendLocation(seqref, seqToken);
12095                 break;
12096 
12097             case SWISS_RC:
12098                 if(!seqref)
12099                     seqref = ajSeqrefNew();
12100 
12101                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12102                 ajStrTokenStep(seqHandle); /* 'RC' */
12103                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
12104                 ajSeqrefAppendComment(seqref, seqToken);
12105                 break;
12106 
12107             case SWISS_DT:
12108                 if(!thys->Date)
12109                     thys->Date = ajSeqdateNew();
12110 
12111                 ajStrTokenAssignC(&seqHandle, seqReadLine, " (),.\n\r");
12112                 icount = 0;
12113 
12114                 while(ajStrTokenNextParse(seqHandle, &seqToken))
12115                 {
12116                     icount++;
12117 
12118                     if(icount==2)
12119                         ajStrAssignS(&datestr, seqToken);
12120                     else if(icount == 3)
12121                         ajStrAssignS(&datetype, seqToken);
12122                     else if(icount == 5)
12123                         ajStrAssignS(&relstr, seqToken);
12124                 }
12125 
12126                 if(ajStrMatchC(datetype, "integrated"))
12127                 {
12128                     ajSeqdateSetCreateS(thys->Date, datestr);
12129                     ajStrAssignS(&thys->Date->CreVer, relstr);
12130                 }
12131                 else if (ajStrMatchC(datetype, "sequence"))
12132                 {
12133                     ajSeqdateSetModseqS(thys->Date, datestr);
12134                     ajStrAssignS(&thys->Date->SeqVer, relstr);
12135                 }
12136                 else if (ajStrMatchC(datetype, "entry"))
12137                 {
12138                     ajSeqdateSetModifyS(thys->Date, datestr);
12139                     ajStrAssignS(&thys->Date->ModVer, relstr);
12140                 }
12141                 else
12142                 {
12143                     ajDebug("unknown datetype '%S' '%S'",
12144                             datetype, seqReadLine);
12145                 }
12146                 break;
12147 
12148             case SWISS_UNK:
12149             case SWISS_END:
12150             case SWISS_MORE:
12151             case SWISS_XX:
12152             case SWISS_SV:
12153             case SWISS_MAX:
12154                 ajWarn("Unknown swissprot line type '%2.2S'", seqReadLine);
12155                 break;
12156 
12157             default:
12158                 break;
12159         }
12160 
12161         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
12162         lineprefix = seqPrefixSwiss(seqReadLine);
12163     }
12164 
12165     if(MAJSTRGETLEN(taxstr))
12166     {
12167         ajStrTrimWhite(&taxstr);
12168 
12169         if(MAJSTRGETCHARLAST(taxstr) == '.')
12170             ajStrCutEnd(&taxstr, 1);
12171 
12172         ajStrTokenAssignC(&seqHandle, taxstr, "()");
12173         itaxtype=1;
12174 
12175         while(ajStrTokenNextParse(seqHandle, &seqToken))
12176         {
12177             ajStrTrimWhite(&seqToken);
12178             seqTaxSave(thys, seqToken, itaxtype);
12179             itaxtype = 3;
12180         }
12181     }
12182 
12183     if(seqref)                  /* clean up the last reference */
12184     {
12185         ajSeqrefStandard(seqref);
12186         ajSeqAddRef(thys, seqref);
12187         seqref = NULL;
12188     }
12189 
12190     if(MAJSTRGETLEN(cmtstr))
12191     {
12192         ajSeqAddCmt(thys, cmtstr);
12193         cmtstr = NULL;
12194     }
12195 
12196     if(dofeat)
12197     {
12198         ajDebug("EMBL FEAT TabIn %x\n", seqin->Ftquery);
12199         ajFeattableDel(&thys->Fttable);
12200         thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
12201         /* ajFeattableTrace(thys->Fttable); */
12202         ajFeattabinClear(seqin->Ftquery);
12203     }
12204 
12205     if(MAJSTRGETLEN(seqin->Inseq))
12206     {
12207         /* we have a sequence to use */
12208         ajStrAssignS(&thys->Seq, seqin->Inseq);
12209 
12210         if(seqin->Input->Text)
12211         {
12212             seqTextSeq(&thys->TextPtr, seqin->Inseq);
12213             ajFmtPrintAppS(&thys->TextPtr, "//\n");
12214         }
12215     }
12216     else
12217     {
12218         /* read the sequence and terminator */
12219         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
12220         lineprefix = seqPrefixSwiss(seqReadLine);
12221 
12222         while(ok && lineprefix != SWISS_END)
12223         {
12224             seqAppend(&thys->Seq, seqReadLine);
12225             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
12226             lineprefix = seqPrefixSwiss(seqReadLine);
12227         }
12228     }
12229 
12230     if(thys->Fttable)
12231         ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
12232 
12233     if(!MAJSTRGETLEN(thys->Desc) && thys->Fulldesc)
12234     {
12235         ajStrAssignS(&thys->Desc, thys->Fulldesc->Name);
12236 
12237         iter = ajListIterNewread(thys->Fulldesc->Short);
12238 
12239         while((tmpstr = (const AjPStr) ajListIterGet(iter)))
12240         {
12241             if(MAJSTRGETLEN(tmpstr))
12242                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12243         }
12244 
12245         ajListIterDel(&iter);
12246 
12247         iter = ajListIterNewread(thys->Fulldesc->EC);
12248 
12249         while((tmpstr = (const AjPStr) ajListIterGet(iter)))
12250         {
12251             if(MAJSTRGETLEN(tmpstr))
12252                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12253         }
12254 
12255         ajListIterDel(&iter);
12256 
12257         iter = ajListIterNewread(thys->Fulldesc->AltNames);
12258 
12259         while((subdesc = (AjPSeqSubdesc) ajListIterGet(iter)))
12260         {
12261             if(MAJSTRGETLEN(subdesc->Name))
12262             {
12263                 ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12264             }
12265 
12266             itb = ajListIterNewread(subdesc->Inn);
12267             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12268             {
12269                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12270             }
12271 
12272             ajListIterDel(&itb);
12273 
12274             itb = ajListIterNewread(subdesc->Short);
12275 
12276             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12277                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12278 
12279             ajListIterDel(&itb);
12280 
12281             itb = ajListIterNewread(subdesc->EC);
12282 
12283             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12284                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12285             ajListIterDel(&itb);
12286 
12287             itb = ajListIterNewread(subdesc->Allergen);
12288 
12289             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12290                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12291             ajListIterDel(&itb);
12292 
12293             itb = ajListIterNewread(subdesc->Biotech);
12294 
12295             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12296                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12297             ajListIterDel(&itb);
12298 
12299             itb = ajListIterNewread(subdesc->Cdantigen);
12300 
12301             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12302                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12303 
12304             ajListIterDel(&itb);
12305         }
12306 
12307         ajListIterDel(&iter);
12308 
12309         iter = ajListIterNewread(thys->Fulldesc->SubNames);
12310 
12311         while((subdesc = (AjPSeqSubdesc) ajListIterGet(iter)))
12312         {
12313             ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12314 
12315             itb = ajListIterNewread(subdesc->Short);
12316 
12317             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12318                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12319 
12320             ajListIterDel(&itb);
12321 
12322             itb = ajListIterNewread(subdesc->EC);
12323 
12324             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12325                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12326 
12327             ajListIterDel(&itb);
12328         }
12329 
12330         ajListIterDel(&iter);
12331 
12332         iter = ajListIterNewread(thys->Fulldesc->Includes);
12333 
12334         while((desctop = (AjPSeqDesc) ajListIterGet(iter)))
12335         {
12336             ajFmtPrintAppS(&thys->Desc, " (%S)", desctop->Name);
12337             itb = ajListIterNewread(desctop->Short);
12338 
12339             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12340                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12341 
12342             ajListIterDel(&itb);
12343 
12344             itb = ajListIterNewread(desctop->EC);
12345 
12346             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12347                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12348 
12349             ajListIterDel(&itb);
12350 
12351             itb = ajListIterNewread(desctop->AltNames);
12352 
12353             while((subdesc = (AjPSeqSubdesc) ajListIterGet(itb)))
12354             {
12355                 if(ajStrGetLen(subdesc->Name))
12356                     ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12357 
12358                 itc = ajListIterNewread(subdesc->Inn);
12359 
12360                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12361                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12362 
12363                 ajListIterDel(&itc);
12364 
12365                 itc = ajListIterNewread(subdesc->Short);
12366 
12367                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12368                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12369 
12370                 ajListIterDel(&itc);
12371 
12372                 itc = ajListIterNewread(subdesc->EC);
12373 
12374                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12375                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12376 
12377                 ajListIterDel(&itc);
12378 
12379                 itc = ajListIterNewread(subdesc->Allergen);
12380 
12381                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12382                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12383 
12384                 ajListIterDel(&itc);
12385 
12386                 itc = ajListIterNewread(subdesc->Biotech);
12387 
12388                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12389                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12390 
12391                 ajListIterDel(&itc);
12392 
12393                 itc = ajListIterNewread(subdesc->Cdantigen);
12394 
12395                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12396                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12397 
12398                 ajListIterDel(&itc);
12399             }
12400 
12401             ajListIterDel(&itb);
12402 
12403             itb = ajListIterNewread(desctop->SubNames);
12404 
12405             while((subdesc = (AjPSeqSubdesc) ajListIterGet(itb)))
12406             {
12407                 ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12408 
12409                 itc = ajListIterNewread(subdesc->Short);
12410 
12411                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12412                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12413 
12414                 ajListIterDel(&itc);
12415 
12416                 itc = ajListIterNewread(subdesc->EC);
12417 
12418                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12419                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12420 
12421                 ajListIterDel(&itc);
12422 
12423             }
12424 
12425             ajListIterDel(&itb);
12426 
12427         }
12428 
12429         ajListIterDel(&iter);
12430 
12431         iter = ajListIterNewread(thys->Fulldesc->Contains);
12432 
12433         while((desctop = (AjPSeqDesc) ajListIterGet(iter)))
12434         {
12435             ajFmtPrintAppS(&thys->Desc, " (%S)", desctop->Name);
12436 
12437             itb = ajListIterNewread(desctop->Short);
12438 
12439             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12440                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12441 
12442             ajListIterDel(&itb);
12443 
12444             itb = ajListIterNewread(desctop->EC);
12445 
12446             while((tmpstr = (AjPStr) ajListIterGet(itb)))
12447                 ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12448 
12449             ajListIterDel(&itb);
12450 
12451             itb = ajListIterNewread(desctop->AltNames);
12452 
12453             while((subdesc = (AjPSeqSubdesc) ajListIterGet(itb)))
12454             {
12455                 if(ajStrGetLen(subdesc->Name))
12456                     ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12457 
12458                 itc = ajListIterNewread(subdesc->Inn);
12459 
12460                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12461                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12462 
12463                 ajListIterDel(&itc);
12464 
12465                 itc = ajListIterNewread(subdesc->Short);
12466 
12467                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12468                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12469 
12470                 ajListIterDel(&itc);
12471 
12472                 itc = ajListIterNewread(subdesc->EC);
12473 
12474                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12475                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12476 
12477                 ajListIterDel(&itc);
12478 
12479                 itc = ajListIterNewread(subdesc->Allergen);
12480 
12481                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12482                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12483 
12484                 ajListIterDel(&itc);
12485 
12486                 itc = ajListIterNewread(subdesc->Biotech);
12487 
12488                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12489                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12490 
12491                 ajListIterDel(&itc);
12492 
12493                 itc = ajListIterNewread(subdesc->Cdantigen);
12494 
12495                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12496                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12497 
12498                 ajListIterDel(&itc);
12499 
12500             }
12501 
12502             ajListIterDel(&itb);
12503 
12504             itb = ajListIterNewread(desctop->SubNames);
12505 
12506             while((subdesc = (AjPSeqSubdesc) ajListIterGet(itb)))
12507             {
12508                 ajFmtPrintAppS(&thys->Desc, " (%S)", subdesc->Name);
12509                 itc = ajListIterNewread(subdesc->Short);
12510 
12511                 itc = ajListIterNewread(subdesc->Cdantigen);
12512 
12513                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12514                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12515 
12516                 ajListIterDel(&itc);
12517 
12518                 itc = ajListIterNewread(subdesc->EC);
12519 
12520                 itc = ajListIterNewread(subdesc->Cdantigen);
12521 
12522                 while((tmpstr = (AjPStr) ajListIterGet(itc)))
12523                     ajFmtPrintAppS(&thys->Desc, " (%S)", tmpstr);
12524 
12525                 ajListIterDel(&itc);
12526 
12527             }
12528 
12529             ajListIterDel(&itb);
12530         }
12531 
12532         ajListIterDel(&iter);
12533 
12534         if(thys->Fulldesc->Fragments || thys->Fulldesc->Precursor)
12535         {
12536             if(thys->Fulldesc->Fragments == 1)
12537                 ajFmtPrintAppS(&thys->Desc, " (Fragment)");
12538 
12539             if(thys->Fulldesc->Fragments == 2)
12540                 ajFmtPrintAppS(&thys->Desc, " (Fragments)");
12541 
12542             if(thys->Fulldesc->Precursor)
12543                 ajFmtPrintAppS(&thys->Desc, " (Precursor)");
12544         }
12545         if(MAJSTRGETCHARFIRST(thys->Desc) == ' ')
12546             ajStrCutStart(&thys->Desc, 1);
12547 
12548         tmpstr = NULL;
12549     }
12550 
12551     ajSeqSetProt(thys);
12552 
12553     if(thys->Reflist)
12554         ajSeqreflistGetXrefs(thys->Reflist, &thys->Xreflist);
12555 
12556     ajFilebuffClear(buff, 0);
12557 
12558     ajStrDel(&datestr);
12559     ajStrDel(&datetype);
12560     ajStrDel(&relstr);
12561     ajStrDel(&taxstr);
12562     ajStrDel(&genetoken);
12563 
12564     ajStrDelStatic(&seqToken);
12565     ajStrDelStatic(&seqToken2);
12566     ajStrTokenReset(seqHandle);
12567 
12568     return ajTrue;
12569 }
12570 
12571 
12572 
12573 
12574 /* @funcstatic seqReadEmbl ****************************************************
12575 **
12576 ** Given data in a sequence structure, tries to read everything needed
12577 ** using EMBL format.
12578 **
12579 ** @param [w] thys [AjPSeq] Sequence object
12580 ** @param [u] seqin [AjPSeqin] Sequence input object
12581 ** @return [AjBool] ajTrue on success
12582 **
12583 ** @release 1.0.0
12584 ** @@
12585 ******************************************************************************/
12586 
seqReadEmbl(AjPSeq thys,AjPSeqin seqin)12587 static AjBool seqReadEmbl(AjPSeq thys, AjPSeqin seqin)
12588 {
12589     AjBool ok;
12590 /*    AjBool okdate; */
12591     AjPFilebuff buff;
12592     AjBool dofeat  = ajFalse;
12593     AjBool tryfeat = ajFalse;
12594     AjPStr liststr;                     /* for lists, do not delete */
12595     AjPStr datestr = NULL;
12596     AjPStr relstr = NULL;
12597     AjPStr cmtstr = NULL;               /* stored in AjPSeq - do not delete */
12598     ajuint icount;
12599     AjPSeqRef seqref = NULL;
12600     AjPSeqXref xref  = NULL;
12601     ajuint refnum;
12602     ajuint seqlen=1024;
12603     ajuint tmplen;
12604     ajuint itmp;
12605     ajuint i;
12606     ajuint taxid = 0;
12607     ajuint itaxtype = 0;
12608     SeqEPrefixSwiss lineprefix = SWISS_UNK;
12609     AjPStrTok handle = NULL;
12610     AjPSeqin conseqin = NULL;
12611     AjPSeq conseq = NULL;
12612     AjPStr conqry = NULL;
12613     AjPStr condb = NULL;
12614     AjPStr confield = NULL;
12615     AjPStr constr = NULL;
12616     AjPStr numstr = NULL;
12617     AjPStr token     = NULL;
12618     ajuint gaplen = 0;
12619     ajuint start = 0;
12620     ajuint end = 0;
12621     ajint dotpos;
12622     ajint colonpos;
12623     ajint istat = 0;
12624     AjBool conrev = ajFalse;
12625 
12626     buff = seqin->Input->Filebuff;
12627 
12628     if(!seqFtFmtEmbl)
12629         ajStrAssignC(&seqFtFmtEmbl, "embl");
12630 
12631     if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
12632         return ajFalse;
12633     lineprefix = seqPrefixSwiss(seqReadLine);
12634 
12635     /* for GCG formatted databases */
12636 
12637     while(lineprefix == SWISS_WP)
12638     {
12639         if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
12640             return ajFalse;
12641 
12642         lineprefix = seqPrefixSwiss(seqReadLine);
12643     }
12644 
12645     /* extra blank lines */
12646 
12647     while(ajStrIsWhite(seqReadLine))
12648     {
12649         if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
12650             return ajFalse;
12651 
12652         lineprefix = seqPrefixSwiss(seqReadLine);
12653     }
12654 
12655     ajDebug("seqReadEmbl first line '%S'\n", seqReadLine);
12656 
12657     if(lineprefix != SWISS_ID)
12658     {
12659         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
12660 
12661         return ajFalse;
12662     }
12663 
12664     if(seqin->Input->Text)
12665         ajStrAssignS(&thys->TextPtr, seqReadLine);
12666 
12667     ajDebug("seqReadEmbl ID line found\n");
12668     ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\t\n\r");
12669     ajStrTokenStep(seqHandle);       /* 'ID' */
12670     ajStrTokenNextParse(seqHandle, &seqToken);       /* entry name */
12671 
12672     seqSetName(thys, seqToken);
12673 
12674     ajStrTokenNextParse(seqHandle, &seqToken);       /* SV for new syntax */
12675 
12676     if(ajStrMatchC(seqToken, "SV"))        /* new post-2006 EMBL line */
12677     {
12678         ajStrTokenNextParse(seqHandle, &seqToken);   /* SV */
12679         ajStrInsertK(&seqToken, 0, '.');
12680         ajStrInsertS(&seqToken, 0, thys->Name);
12681         seqSvSave(thys, seqToken);
12682 
12683         ajStrTokenNextParse(seqHandle, &seqToken); /* linear or circular */
12684 
12685         if(ajStrMatchC(seqToken, "circular"))
12686             thys->Circular = ajTrue;
12687 
12688         ajStrTokenNextParseC(seqHandle, ";\t\n\r", &seqToken);
12689         ajStrTrimWhite(&seqToken);
12690         ajSeqmolSetEmbl(&thys->Molecule, seqToken);
12691 
12692         ajStrTokenNextParse(seqHandle, &seqToken);
12693         ajStrTrimWhite(&seqToken);
12694         ajStrAssignS(&thys->Class, seqToken);
12695 
12696         ajStrTokenNextParse(seqHandle, &seqToken);
12697         ajStrTrimWhite(&seqToken);
12698         ajStrAssignS(&thys->Division, seqToken);
12699 
12700         ajStrTokenNextParse(seqHandle, &seqToken);
12701         ajStrTrimEndC(&seqToken, "BP.");
12702         ajStrTrimWhite(&seqToken);
12703         ajStrToUint(seqToken, &seqlen);
12704     }
12705     else                     /* test for a SwissProt/SpTrEMBL entry */
12706     {
12707         if(ajStrFindC(seqReadLine, " PRT; ")>= 0  ||
12708            ajStrFindC(seqReadLine, " Unreviewed; ") >= 0 ||
12709            ajStrFindC(seqReadLine, " Reviewed; ") >= 0 ||
12710            ajStrFindC(seqReadLine, " Preliminary; ") >= 0
12711            )
12712         {
12713             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
12714             ajStrTokenReset(seqHandle);
12715             ajStrDelStatic(&seqToken);
12716 
12717             return ajFalse;
12718         }
12719     }
12720 
12721     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
12722     lineprefix = seqPrefixSwiss(seqReadLine);
12723 
12724     dofeat = ajFalse;
12725     tryfeat = seqinUfoLocal(seqin);
12726 
12727     while(ok &&
12728           lineprefix != SWISS_SQ &&
12729           lineprefix != SWISS_END)
12730     {
12731         /* check for Staden Experiment format instead */
12732         if(lineprefix == SWISS_EX)
12733         {
12734             ajTextinStoreReset(seqin->Input, &thys->TextPtr);
12735             ajStrDelStatic(&seqToken);
12736 
12737             return ajFalse;;
12738         }
12739 
12740         else if(lineprefix == SWISS_FH)
12741             ok = ajTrue;                /* ignore these lines */
12742 
12743         else if(lineprefix == SWISS_AC) /* emblcds database format */
12744         {
12745             ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
12746             ajStrTokenStep(seqHandle); /* 'AC' */
12747 
12748             while(ajStrTokenNextParse(seqHandle, &seqToken))
12749                 seqAccSave(thys, seqToken);
12750         }
12751 
12752         else if(lineprefix==SWISS_SV)
12753         {
12754             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12755             ajStrTokenStep(seqHandle); /* 'SV' */
12756             ajStrTokenNextParse(seqHandle, &seqToken); /* version */
12757             seqSvSave(thys, seqToken);
12758         }
12759 
12760         else if(lineprefix == SWISS_DE)
12761         {
12762             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12763             ajStrTokenStep(seqHandle); /* 'DE' */
12764             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* desc */
12765 
12766             if(ajStrGetLen(thys->Desc))
12767             {
12768                 ajStrAppendC(&thys->Desc, " ");
12769                 ajStrAppendS(&thys->Desc, seqToken);
12770             }
12771             else
12772                 ajStrAssignS(&thys->Desc, seqToken);
12773         }
12774 
12775         else if(lineprefix == SWISS_KW)
12776         {
12777             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12778             ajStrTokenStep(seqHandle); /* 'KW' */
12779 
12780             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
12781             {
12782                 liststr = ajStrNewS(seqToken);
12783                 ajStrTrimWhite(&liststr);
12784                 ajSeqAddKey(thys, liststr);
12785                 liststr = NULL;
12786             }
12787         }
12788 
12789         else if(lineprefix == SWISS_OS)
12790         {
12791             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12792             ajStrTokenStep(seqHandle); /* 'OS' */
12793 
12794             /* maybe better remove . from this, and trim from end */
12795             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
12796             {
12797                 ajStrTrimWhite(&seqToken);
12798                 ajStrTokenAssignC(&seqHandle2, seqToken, "()");
12799                 itaxtype=1;
12800 
12801                 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
12802                 {
12803                     ajStrTrimWhite(&seqToken2);
12804                     seqTaxSave(thys, seqToken2, itaxtype);
12805                     itaxtype = 3;
12806                 }
12807             }
12808         }
12809 
12810         else if(lineprefix == SWISS_OC)
12811         {
12812             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12813             ajStrTokenStep(seqHandle); /* 'OC' */
12814 
12815             /* maybe better remove . from this, and trim from end */
12816             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
12817             {
12818                 ajStrTrimWhite(&seqToken);
12819                 seqTaxSave(thys, seqToken, 0);
12820             }
12821         }
12822 
12823         else if(lineprefix == SWISS_OG)
12824         {
12825             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
12826             ajStrTokenStep(seqHandle); /* 'OG' */
12827 
12828             /* maybe better remove . from this, and trim from end */
12829             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
12830             {
12831                 ajStrTrimWhite(&seqToken);
12832                 seqTaxSave(thys, seqToken, 2);
12833             }
12834         }
12835 
12836         else if(lineprefix == SWISS_CC)
12837         {
12838             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12839             ajStrTokenStep(seqHandle); /* 'CC' */
12840             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
12841 
12842             if(ajStrGetLen(cmtstr))
12843                 ajStrAppendC(&cmtstr, "\n");
12844             ajStrAppendS(&cmtstr, seqToken);
12845 
12846 /* trying to keep comments in one long string with embedded returns
12847 ** probably fails for long comments - and also fails for contact details
12848 ** which have very short comment lines
12849 ** switch to just keeping original lines */
12850 
12851 /*
12852   if(ajStrGetLen(cmtstr))
12853   {
12854   if(ajStrGetLen(seqToken))
12855   {
12856   if(ajStrGetCharLast(cmtstr) != '\n')
12857   ajStrAppendK(&cmtstr, ' ');
12858   ajStrAppendS(&cmtstr, seqToken);
12859   }
12860   else
12861   {
12862   if(ajStrGetCharLast(cmtstr) != '\n')
12863   ajStrAppendK(&cmtstr, '\n');
12864   ajStrAppendC(&cmtstr, " \n");
12865   }
12866   }
12867   else
12868   ajStrAssignS(&cmtstr, seqToken);
12869   if(ajStrGetCharLast(seqToken) == '.')
12870   ajStrAppendK(&cmtstr, '\n');
12871 */
12872         }
12873 
12874         else if(lineprefix == SWISS_DR)
12875         {
12876             AJNEW0(xref);
12877             ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
12878             ajStrTokenStep(seqHandle); /* 'DR' */
12879 
12880             ajStrTokenNextParseC(seqHandle, ";\n\r", &seqToken); /* dbname */
12881             ajStrTrimWhite(&seqToken);
12882             ajStrAssignS(&xref->Db, seqToken);
12883 
12884             ajStrTokenNextParse(seqHandle, &seqToken); /* primary */
12885             ajStrTrimWhite(&seqToken);
12886             ajStrAssignS(&xref->Id, seqToken);
12887 
12888             ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
12889 
12890             if(!ajStrGetLen(seqToken))
12891             {
12892                 if(ajStrGetCharLast(xref->Id) == '.')
12893                     ajStrCutEnd(&xref->Id, 1);
12894             }
12895             else
12896             {
12897                 if(ajStrGetCharLast(seqToken) == '.')
12898                     ajStrCutEnd(&seqToken, 1);
12899                 ajStrTrimWhite(&seqToken);
12900                 ajStrAssignS(&xref->Secid, seqToken);
12901 
12902                 ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
12903 
12904                 if(!ajStrGetLen(seqToken))
12905                 {
12906                     if(ajStrGetCharLast(xref->Secid) == '.')
12907                         ajStrCutEnd(&xref->Secid, 1);
12908                 }
12909                 else
12910                 {
12911                     if(ajStrGetCharLast(seqToken) == '.')
12912                         ajStrCutEnd(&seqToken, 1);
12913                     ajStrTrimWhite(&seqToken);
12914                     ajStrAssignS(&xref->Terid, seqToken);
12915 
12916                     ajStrTokenNextParse(seqHandle, &seqToken); /* secondary*/
12917 
12918                     if(!ajStrGetLen(seqToken))
12919                     {
12920                         if(ajStrGetCharLast(xref->Terid) == '.')
12921                             ajStrCutEnd(&xref->Terid, 1);
12922                     }
12923                     else
12924                     {
12925                         if(ajStrGetCharLast(seqToken) == '.')
12926                             ajStrCutEnd(&seqToken, 1);
12927                         ajStrTrimWhite(&seqToken);
12928                         ajStrAssignS(&xref->Quatid, seqToken);
12929                     }
12930                 }
12931             }
12932             xref->Type = XREF_DR;
12933             ajSeqAddXref(thys, xref);
12934             xref = NULL;
12935         }
12936 
12937         else if(lineprefix == SWISS_RN)
12938         {
12939             if(seqref)
12940             {
12941                 ajSeqrefStandard(seqref);
12942                 ajSeqAddRef(thys, seqref);
12943                 seqref = NULL;
12944             }
12945 
12946             seqref = ajSeqrefNew();
12947             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12948             ajStrTokenStep(seqHandle); /* 'RN' */
12949             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* [num] */
12950             ajStrAssignSubS(&seqToken2, seqToken, 1, -2);
12951             ajStrToUint(seqToken2, &refnum);
12952             ajSeqrefSetnumNumber(seqref, refnum);
12953         }
12954 
12955         else if(lineprefix == SWISS_RG)
12956         {
12957             if(!seqref)
12958                 seqref = ajSeqrefNew();
12959 
12960             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12961             ajStrTokenStep(seqHandle); /* 'RG' */
12962             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* groupname */
12963             ajSeqrefAppendGroupname(seqref, seqToken);
12964         }
12965 
12966         else if(lineprefix == SWISS_RX)
12967         {
12968             if(!seqref)
12969                 seqref = ajSeqrefNew();
12970 
12971             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12972             ajStrTokenStep(seqHandle); /* 'RX' */
12973             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* xref */
12974             ajSeqrefAppendXref(seqref, seqToken);
12975         }
12976 
12977         else if(lineprefix == SWISS_RP)
12978         {
12979             if(!seqref)
12980                 seqref = ajSeqrefNew();
12981 
12982             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12983             ajStrTokenStep(seqHandle); /* 'RP' */
12984             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* position */
12985             ajSeqrefAppendPosition(seqref, seqToken);
12986         }
12987 
12988         else if(lineprefix == SWISS_RA)
12989         {
12990             if(!seqref)
12991                 seqref = ajSeqrefNew();
12992 
12993             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
12994             ajStrTokenStep(seqHandle); /* 'RA' */
12995             ajStrTokenNextParseC(seqHandle, "\n\r;", &seqToken); /* authors */
12996             ajSeqrefAppendAuthors(seqref, seqToken);
12997         }
12998 
12999         else if(lineprefix == SWISS_RT)
13000         {
13001             if(!seqref)
13002                 seqref = ajSeqrefNew();
13003 
13004             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13005             ajStrTokenStep(seqHandle); /* 'RT' */
13006             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
13007 
13008             if(!ajStrMatchC(seqToken, ";"))
13009                 ajSeqrefAppendTitle(seqref, seqToken);
13010         }
13011 
13012         else if(lineprefix == SWISS_RL)
13013         {
13014             if(!seqref)
13015                 seqref = ajSeqrefNew();
13016 
13017             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13018             ajStrTokenStep(seqHandle); /* 'RL' */
13019             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* authors */
13020             ajSeqrefAppendLocation(seqref, seqToken);
13021         }
13022 
13023         else if(lineprefix == SWISS_RC)
13024         {
13025             if(!seqref)
13026                 seqref = ajSeqrefNew();
13027 
13028             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13029             ajStrTokenStep(seqHandle); /* 'RC' */
13030             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
13031             ajSeqrefAppendComment(seqref, seqToken);
13032         }
13033 
13034         else if(tryfeat && lineprefix == SWISS_FT)
13035         {
13036             if(!dofeat)
13037             {
13038                 dofeat = ajTrue;
13039                 ajFeattabinDel(&seqin->Ftquery);
13040                 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtEmbl,
13041                                                        thys->Name, "N");
13042                 ajDebug("seqReadEmbl: seqin->Ftquery Filebuff %x\n",
13043                         seqin->Ftquery->Input->Filebuff);
13044             }
13045 
13046             ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
13047             /* ajDebug("EMBL FEAT saved line:\n%S", seqReadLine); */
13048         }
13049 
13050         else if(lineprefix == SWISS_DT)
13051         {
13052             if(!thys->Date)
13053                 thys->Date = ajSeqdateNew();
13054 
13055             ajStrTokenAssignC(&seqHandle, seqReadLine, " (),");
13056             icount = 0;
13057 
13058             while(ajStrTokenNextParse(seqHandle, &seqToken))
13059             {
13060                 icount++;
13061 
13062                 if(icount==2)
13063                     ajStrAssignS(&datestr, seqToken);
13064                 else if(icount==4)
13065                     ajStrAssignS(&relstr, seqToken);
13066                 else if(icount==5)
13067                 {
13068                     if(ajStrMatchC(
13069                            seqToken, "Created"))
13070                     {
13071                         ajSeqdateSetCreateS(thys->Date, datestr);
13072                         ajStrAssignS(&thys->Date->CreRel, relstr);
13073                     }
13074                 }
13075                 else if(icount==8)
13076                 {
13077                     ajSeqdateSetModifyS(thys->Date, datestr);
13078                     ajStrAssignS(&thys->Date->ModRel, relstr);
13079                     ajStrAssignS(&thys->Date->ModVer, seqToken);
13080                 }
13081             }
13082         }
13083 
13084 
13085         else if(lineprefix == SWISS_XX)
13086         {
13087             if(seqref)
13088             {
13089                 ajSeqrefStandard(seqref);
13090                 ajSeqAddRef(thys, seqref);
13091                 seqref = NULL;
13092             }
13093 
13094             if(ajStrGetLen(cmtstr))
13095             {
13096                 ajSeqAddCmt(thys, cmtstr);
13097                 cmtstr = NULL;
13098             }
13099 
13100         }
13101 
13102         else if(lineprefix == SWISS_CO)
13103         {
13104             if(!constr)
13105                 constr = ajStrNewRes(4096);
13106 
13107             ajStrTrimWhiteEnd(&seqReadLine);
13108             ajStrAppendSubS(&constr, seqReadLine, 5, -1);
13109         }
13110 
13111         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13112         lineprefix = seqPrefixSwiss(seqReadLine);
13113     }
13114 
13115     if(lineprefix == SWISS_END && ajStrGetLen(constr))
13116     {
13117         conseq = ajSeqNew();
13118 
13119         if(ajStrPrefixC(constr, "join(") && ajStrSuffixC(constr, ")"))
13120         {
13121             ajStrCutEnd(&constr, 1);
13122             ajStrCutStart(&constr, 5);
13123         }
13124 
13125         ajStrTokenAssignC(&handle, constr, ",");
13126 
13127         while(ajStrTokenNextParse(handle, &token))
13128         {
13129             ajDebug("CO parsing token '%S'\n", token);
13130 
13131             if(ajStrPrefixC(token, "gap("))
13132             {
13133                 ajDebug("CO gap: '%S'\n", token);
13134                 ajStrCutEnd(&token, 1);
13135                 ajStrCutStart(&token, 4);
13136                 if(ajStrToUint(token, &gaplen))
13137                 {
13138                     ajDebug("gap %u bases total %u\n",
13139                            gaplen, ajSeqGetLen(thys));
13140                 }
13141                 else
13142                     ajWarn("Unknown gap length in '%S'", constr);
13143 
13144                 ajStrAppendCountK(&thys->Seq, 'N', gaplen);
13145             }
13146             else
13147             {
13148                 if(ajStrPrefixC(token, "complement("))
13149                 {
13150                     ajStrCutEnd(&token, 1);
13151                     ajStrCutStart(&token, 11);
13152                     conrev = ajTrue;
13153                 }
13154 
13155                 if(!condb)
13156                 {
13157                     if(!ajNamDbGetAttrSpecialC(seqin->Input->Db, "ConDatabase",
13158                                                &condb))
13159                         ajStrAssignS(&condb, seqin->Input->Db);
13160                     if(!ajNamDbGetAttrSpecialC(seqin->Input->Db, "ConField",
13161                                                &confield))
13162                         ajStrAssignC(&confield, "acc");
13163                 }
13164 
13165                 dotpos   = (ajint) ajStrFindAnyK(token, '.');
13166                 colonpos = (ajint) ajStrFindAnyK(token, ':');
13167                 ajStrAssignSubS(&numstr, token, colonpos+1, -1);
13168                 istat = ajFmtScanS(numstr, "%u..%u", &start, &end);
13169                 if(istat != 2)
13170                 {
13171                     ajWarn("EMBLCON badly formed fragment '%S'", token);
13172                     start = 1;
13173                     end = 0;
13174                 }
13175 
13176                 if(ajStrMatchC(confield, "sv"))
13177                 {
13178                     ajFmtPrintS(&conqry, "%S-sv:%S", seqin->Input->Db, token);
13179                     if(conrev)
13180                         ajStrAppendC(&conqry, ":r");
13181                 }
13182                 else
13183                 {
13184                     if((dotpos > 0) && (dotpos < colonpos))
13185                     {
13186                         ajStrCutRange(&token, dotpos, colonpos-1);
13187                         ajFmtPrintS(&conqry, "%S-%S:%S",
13188                                     condb, confield, token);
13189                         if(conrev)
13190                             ajStrAppendC(&conqry, ":r");
13191                     }
13192                 }
13193                 ajDebug("CO done: '%S' '%S' rev:%B\n",
13194                         token, conqry, conrev);
13195 
13196                 ajSeqinUsa(&conseqin, conqry);
13197 
13198                 if(!ajSeqRead(conseq, conseqin))
13199                     ajErr("EMBLCON entry '%S' failed to read '%S'",
13200                           thys->Name, conqry);
13201                 else
13202                 {
13203                     ajSeqTrim(conseq);
13204                     if(conrev)
13205                         ajSeqReverseDo(conseq);
13206                     seqAppend(&thys->Seq, ajSeqGetSeqS(conseq));
13207                     ajDebug("Read %u bases total %u\n",
13208                             ajSeqGetLen(conseq), ajSeqGetLen(thys));
13209                 }
13210             }
13211         }
13212 
13213         ajDebug("CO processed seqlen: %u\n", ajSeqGetLen(thys));
13214 
13215         ajStrTokenDel(&handle);
13216 
13217         while(ok && lineprefix != SWISS_END)
13218         {
13219             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13220             lineprefix = seqPrefixSwiss(seqReadLine);
13221         }
13222 
13223         ajSeqinDel(&conseqin);
13224         ajSeqDel(&conseq);
13225 
13226     }
13227 
13228     if(ok && lineprefix == SWISS_SQ)
13229     {
13230         /* now we are on the SQ line - or there was nothing */
13231 
13232         ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13233         ajStrTokenStep(seqHandle); /* 'SQ' */
13234         ajStrTokenStep(seqHandle); /* 'Sequence' */
13235         ajStrTokenNextParse(seqHandle, &seqToken); /* len */
13236         ajStrToUint(seqToken, &tmplen);
13237 
13238         if(tmplen > seqlen)
13239             seqlen = tmplen;
13240 
13241         ajStrTokenStep(seqHandle); /* BP; */
13242         tmplen = 0;
13243 
13244         for(i=0;i<4;i++)
13245         {
13246             ajStrTokenNextParse(seqHandle, &seqToken); /* count */
13247             ajStrToUint(seqToken, &itmp);
13248             ajStrTokenNextParse(seqHandle,
13249                                 &seqToken); /* 'A' 'C' 'G' 'T' 'other' */
13250             tmplen += itmp;
13251         }
13252 
13253         if(tmplen > seqlen)
13254             seqlen = tmplen;
13255 
13256         if(dofeat)
13257         {
13258             ajFeattableDel(&thys->Fttable);
13259             thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
13260             /* ajFeattableTrace(thys->Fttable); */
13261             ajDebug("EMBL FEAT SQ TabIn filebuff: %x features: %u\n",
13262                     seqin->Ftquery->Input->Filebuff,
13263                     ajFeattableGetSize(thys->Fttable));
13264             ajFeattabinClear(seqin->Ftquery);
13265         }
13266         else if(tryfeat) /* but no features in entry */
13267         {
13268             ajDebug("EMBL FEAT SQ empty filebuff: %x\n",
13269                     seqin->Ftquery->Input->Filebuff);
13270             thys->Fttable = ajFeattableNewSeq(thys);
13271         }
13272 
13273         if(ajStrGetLen(seqin->Inseq))
13274         {
13275             /* we have a sequence to use ...perhaps from GCG/NBRF format */
13276             ajStrAssignS(&thys->Seq, seqin->Inseq);
13277 
13278             if(seqin->Input->Text)
13279             {
13280                 seqTextSeq(&thys->TextPtr, seqin->Inseq);
13281                 ajFmtPrintAppS(&thys->TextPtr, "//\n");
13282             }
13283         }
13284         else
13285         {
13286             /* read the sequence and terminator */
13287             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13288             lineprefix = seqPrefixSwiss(seqReadLine);
13289             ajStrSetRes(&thys->Seq, seqlen+1);
13290 
13291             while(ok && lineprefix != SWISS_END)
13292             {
13293                 seqAppend(&thys->Seq, seqReadLine);
13294                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13295                 lineprefix = seqPrefixSwiss(seqReadLine);
13296             }
13297 
13298         }
13299     }
13300 
13301     if(!ajSeqIsNuc(thys))
13302     {
13303         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13304         ajStrDel(&datestr);
13305         ajStrDel(&relstr);
13306         ajStrDelStatic(&seqToken);
13307         ajStrTokenReset(seqHandle);
13308 
13309         return ajFalse;
13310     }
13311 
13312     ajSeqSetNuc(thys);
13313 
13314     if(thys->Fttable)
13315         ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
13316 
13317     if(ajFeattableGetSize(thys->Fttable))
13318     {
13319         ajFeattableGetXrefs(thys->Fttable, &thys->Xreflist, &taxid);
13320         if(taxid)
13321             seqTaxidSaveI(thys, taxid);
13322     }
13323 
13324     ajSeqreflistGetXrefs(thys->Reflist, &thys->Xreflist);
13325 
13326     if(!taxid)
13327         taxid = ajSeqGetTaxid(thys);
13328 
13329     ajFilebuffClear(buff, 0);
13330 
13331     ajStrDel(&datestr);
13332     ajStrDel(&relstr);
13333     ajStrDel(&condb);
13334     ajStrDel(&confield);
13335     ajStrDel(&constr);
13336     ajStrDel(&numstr);
13337     ajStrDel(&conqry);
13338     ajStrDel(&token);
13339 
13340     ajStrDelStatic(&seqToken);
13341     ajStrDelStatic(&seqToken2);
13342 
13343     ajStrTokenReset(seqHandle);
13344     ajStrTokenReset(seqHandle2);
13345 
13346     /* ajSeqTrace(thys); */
13347 
13348     return ajTrue;
13349 }
13350 
13351 
13352 
13353 
13354 /* @funcstatic seqReadExperiment **********************************************
13355 **
13356 ** Given data in a sequence structure, tries to read everything needed
13357 ** using Staden experiment format.
13358 **
13359 ** @param [w] thys [AjPSeq] Sequence object
13360 ** @param [u] seqin [AjPSeqin] Sequence input object
13361 ** @return [AjBool] ajTrue on success
13362 **
13363 ** @release 3.0.0
13364 ** @@
13365 ******************************************************************************/
13366 
seqReadExperiment(AjPSeq thys,AjPSeqin seqin)13367 static AjBool seqReadExperiment(AjPSeq thys, AjPSeqin seqin)
13368 {
13369 
13370     AjBool ok;
13371     AjPFilebuff buff;
13372     AjBool dofeat  = ajFalse;
13373     AjBool tryfeat = ajFalse;
13374     AjPStr liststr;                     /* for lists, do not delete */
13375     AjPStr accvalstr = NULL;
13376     ajuint i;
13377     ajint  ja;
13378     ajuint ilen;
13379     ajuint itaxtype;
13380     SeqEPrefixSwiss lineprefix = SWISS_UNK;
13381 
13382     buff = seqin->Input->Filebuff;
13383 
13384     if(!seqFtFmtEmbl)
13385         ajStrAssignC(&seqFtFmtEmbl, "embl");
13386 
13387     if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
13388         return ajFalse;
13389 
13390     lineprefix = seqPrefixSwiss(seqReadLine);
13391 
13392     ajDebug("seqReadExperiment first line '%S'\n", seqReadLine);
13393 
13394     if(lineprefix != SWISS_ID)
13395     {
13396         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13397 
13398         return ajFalse;
13399     }
13400 
13401     if(seqin->Input->Text)
13402         ajStrAssignS(&thys->TextPtr, seqReadLine);
13403 
13404     ajDebug("seqReadExperiment ID line found\n");
13405     ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r\t");
13406     ajStrTokenStep(seqHandle);       /* 'ID' */
13407     ajStrTokenNextParse(seqHandle, &seqToken);       /* entry name */
13408 
13409     seqSetName(thys, seqToken);
13410 
13411     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13412     lineprefix = seqPrefixSwiss(seqReadLine);
13413 
13414     while(ok && lineprefix != SWISS_SQ)
13415     {
13416         if(lineprefix == SWISS_EX)
13417         {
13418             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13419             ajStrTokenStep(seqHandle); /* 'EX'*/
13420             ajStrTokenNextParseC(seqHandle, "\n\r",
13421                                  &seqToken); /*  expt. desc. */
13422 
13423             if(ajStrGetLen(thys->Desc))
13424             {
13425                 ajStrAppendC(&thys->Desc, " ");
13426                 ajStrAppendS(&thys->Desc, seqToken);
13427             }
13428             else
13429                 ajStrAssignS(&thys->Desc, seqToken);
13430         }
13431 
13432         if(lineprefix == SWISS_AV)
13433         {
13434             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13435             ajStrTokenStep(seqHandle); /* 'AV' */
13436             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* desc */
13437 
13438             if(ajStrGetLen(accvalstr))
13439             {
13440                 ajStrAppendC(&accvalstr, " ");
13441                 ajStrAppendS(&accvalstr, seqToken);
13442             }
13443             else
13444                 ajStrAssignS(&accvalstr, seqToken);
13445         }
13446 
13447         /* standard EMBL records are allowed */
13448 
13449         if(lineprefix == SWISS_AC)
13450         {
13451             ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
13452             ajStrTokenStep(seqHandle); /* 'AC' */
13453 
13454             while(ajStrTokenNextParse(seqHandle, &seqToken))
13455                 seqAccSave(thys, seqToken);
13456         }
13457 
13458         if(lineprefix == SWISS_SV)
13459         {
13460             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13461             ajStrTokenStep(seqHandle); /* 'SV' */
13462             ajStrTokenNextParse(seqHandle, &seqToken); /* version */
13463             seqSvSave(thys, seqToken);
13464         }
13465 
13466         if(lineprefix == SWISS_DE)
13467         {
13468             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13469             ajStrTokenStep(seqHandle); /* 'DE' */
13470             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* desc */
13471 
13472             if(ajStrGetLen(thys->Desc))
13473             {
13474                 ajStrAppendC(&thys->Desc, " ");
13475                 ajStrAppendS(&thys->Desc, seqToken);
13476             }
13477             else
13478                 ajStrAssignS(&thys->Desc, seqToken);
13479         }
13480 
13481         if(lineprefix == SWISS_KW)
13482         {
13483             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13484             ajStrTokenStep(seqHandle); /* 'KW' */
13485 
13486             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
13487             {
13488                 liststr = ajStrNewS(seqToken);
13489                 ajStrTrimWhite(&liststr);
13490                 ajSeqAddKey(thys, liststr);
13491                 liststr = NULL;
13492             }
13493         }
13494 
13495         if(lineprefix == SWISS_OS)
13496         {
13497             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13498             ajStrTokenStep(seqHandle); /* 'OS' */
13499 
13500             /* maybe better remove . from this, and trim from end */
13501             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
13502             {
13503                 ajStrTrimWhite(&seqToken);
13504                 ajStrTokenAssignC(&seqHandle2, seqToken, "()");
13505                 itaxtype=1;
13506 
13507                 while(ajStrTokenNextParse(seqHandle2, &seqToken2))
13508                 {
13509                     ajStrTrimWhite(&seqToken2);
13510                     seqTaxSave(thys, seqToken2, itaxtype);
13511                     itaxtype = 3;
13512                 }
13513 
13514                 ajStrTokenReset(seqHandle2);
13515             }
13516         }
13517 
13518         if(lineprefix == SWISS_OC)
13519         {
13520             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13521             ajStrTokenStep(seqHandle); /* 'OC' */
13522 
13523             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
13524             {
13525                 ajStrTrimWhite(&seqToken);
13526                 seqTaxSave(thys, seqToken, 0);
13527             }
13528         }
13529 
13530         if(tryfeat && lineprefix == SWISS_FT)
13531         {
13532             if(!dofeat)
13533             {
13534                 dofeat = ajTrue;
13535                 ajFeattabinDel(&seqin->Ftquery);
13536                 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtEmbl,
13537                                                        thys->Name, "N");
13538                 /* ajDebug("seqin->Ftquery Filebuff %x\n",
13539                    seqin->Ftquery->Input->Filebuff); */
13540 
13541             }
13542 
13543             ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
13544             /* ajDebug("EMBL FEAT saved line:\n%S", seqReadLine); */
13545         }
13546 
13547         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13548         lineprefix = seqPrefixSwiss(seqReadLine);
13549     }
13550 
13551     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13552     lineprefix = seqPrefixSwiss(seqReadLine);
13553 
13554     while(ok && lineprefix != SWISS_END)
13555     {
13556         seqAppend(&thys->Seq, seqReadLine);
13557         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13558         lineprefix = seqPrefixSwiss(seqReadLine);
13559     }
13560     ajDebug("Sequence read %d bases\n", ajStrGetLen(thys->Seq));
13561 
13562     if(thys->Fttable)
13563         ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
13564 
13565     while(ok && lineprefix != SWISS_ID)
13566     {
13567         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13568         lineprefix = seqPrefixSwiss(seqReadLine);
13569     }
13570 
13571     if(ok)
13572         ajTextinStoreClear(seqin->Input, 1, seqReadLine, &thys->TextPtr);
13573     else
13574         ajFilebuffClear(buff, 0);
13575 
13576     if(dofeat)
13577     {
13578         /* ajDebug("EMBL FEAT TabIn %x\n", seqin->Ftquery); */
13579         ajFeattableDel(&thys->Fttable);
13580         thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
13581         /* ajFeattableTrace(thys->Fttable); */
13582         ajFeattabinClear(seqin->Ftquery);
13583     }
13584 
13585     if(ajStrGetLen(accvalstr))
13586     {
13587         ilen = ajStrGetLen(thys->Seq);
13588         if(ilen > thys->Qualsize)
13589         {
13590             AJCRESIZE(thys->Accuracy, ilen);
13591             thys->Qualsize = ilen;
13592         }
13593 
13594         ajStrTokenAssignC(&seqHandle, accvalstr, " ");
13595 
13596         for(i=0;i<ilen;i++)
13597         {
13598             thys->Accuracy[i] = INT_MIN;
13599             if(!ajStrTokenNextParse(seqHandle, &seqToken))
13600             {
13601                 ajWarn("Missing accuracy for base %d in experiment format\n",
13602                        i+1);
13603                 break;
13604             }
13605 
13606             ajStrTokenAssignC(&seqHandle2, seqToken, ",");
13607 
13608             while(ajStrTokenNextParse(seqHandle2, &seqToken2))
13609             {
13610                 if(ajStrToInt(seqToken2, &ja))
13611                 {
13612                     if(ja > thys->Accuracy[i])
13613                         thys->Accuracy[i] = (float) ja;
13614                 }
13615                 else
13616                 {
13617                     ajWarn("Bad accuracy '%S' for base %d "
13618                            "in experiment format\n",
13619                            seqToken, i+1);
13620                     break;
13621                 }
13622             }
13623             ajDebug("Accval[%u] %3f '%S'\n", i+1, thys->Accuracy[i], seqToken);
13624         }
13625     }
13626 
13627     ajStrDelStatic(&seqToken);
13628     ajStrDelStatic(&seqToken2);
13629     ajStrDel(&accvalstr);
13630 
13631     ajStrTokenReset(seqHandle);
13632     ajStrTokenReset(seqHandle2);
13633 
13634 
13635     /* ajSeqTrace(thys); */
13636 
13637     return ajTrue;
13638 }
13639 
13640 
13641 
13642 
13643 /* @funcstatic seqReadGenbank *************************************************
13644 **
13645 ** Given data in a sequence structure, tries to read everything needed
13646 ** using Genbank format.
13647 **
13648 ** @param [w] thys [AjPSeq] Sequence object
13649 ** @param [u] seqin [AjPSeqin] Sequence input object
13650 ** @return [AjBool] ajTrue on success
13651 **
13652 ** @release 1.0.0
13653 ** @@
13654 ******************************************************************************/
13655 
seqReadGenbank(AjPSeq thys,AjPSeqin seqin)13656 static AjBool seqReadGenbank(AjPSeq thys, AjPSeqin seqin)
13657 {
13658     AjBool ok;
13659     AjBool done = ajFalse;
13660     AjPFilebuff buff;
13661     AjPStr cmtstr = NULL;
13662     AjBool dofeat  = ajFalse;
13663     AjBool tryfeat = ajFalse;
13664     AjPQuery qry;
13665     AjPStr liststr;                     /* for lists, do not delete */
13666     AjPSeqRef seqref = NULL;
13667     ajuint refnum;
13668     ajuint seqlen = 1024;
13669     ajint i;
13670     ajint nfields;
13671     ajuint taxid = 0;
13672     ajuint itaxtype = 0;
13673     SeqEPrefixGenbank lineprefix = GB_UNK;
13674     SeqEPrefixGenbankMore moreprefix = GB_MORE_UNK;
13675 
13676     ajDebug("seqReadGenbank\n");
13677 
13678     buff = seqin->Input->Filebuff;
13679     qry  = seqin->Input->Query;
13680 
13681     if(!seqFtFmtGenbank)
13682         ajStrAssignC(&seqFtFmtGenbank, "genbank");
13683 
13684     if(!ajBuffreadLine(buff, &seqReadLine))
13685         return ajFalse;
13686 
13687     lineprefix = seqPrefixGenbank(seqReadLine);
13688 
13689     ajDebug("++seqReadGenbank first line '%S'\n", seqReadLine);
13690 
13691     ok = ajTrue;
13692 
13693     /* extra blank lines */
13694 
13695     while(ajStrIsWhite(seqReadLine))
13696     {
13697         if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
13698             return ajFalse;
13699         lineprefix = seqPrefixGenbank(seqReadLine);
13700     }
13701 
13702     /* for GCG formatted databases */
13703 
13704     if(lineprefix == GB_WP)
13705     {
13706         ok = ajBuffreadLine(buff, &seqReadLine);
13707         lineprefix = seqPrefixGenbank(seqReadLine);
13708 
13709         while(ok && lineprefix == GB_MORE)
13710         {
13711             ok = ajBuffreadLine(buff, &seqReadLine);
13712             lineprefix = seqPrefixGenbank(seqReadLine);
13713         }
13714     }
13715 
13716     /* This loop necessary owing to headers on GB distro files */
13717     if(ajStrFindC(seqReadLine,"Genetic Sequence Data Bank") >= 0)
13718         while(ok && lineprefix != GB_ID) /* LOCUS */
13719         {
13720             ok = ajBuffreadLine(buff, &seqReadLine);
13721             lineprefix = seqPrefixGenbank(seqReadLine);
13722         }
13723 
13724     if(!ok)
13725     {
13726         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13727 
13728         return ajFalse;
13729     }
13730 
13731     if(lineprefix != GB_ID)     /* LOCUS */
13732     {
13733         ajDebug("failed - LOCUS not found - first line was\n%S\n",
13734                 seqReadLine);
13735         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13736 
13737         return ajFalse;
13738     }
13739 
13740     nfields = ajStrParseCountC(seqReadLine, " \n\r");
13741 
13742     if(nfields == 9)
13743     {
13744         ajFilebuffSetBuffered(buff);
13745         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13746 
13747         return seqReadGenpept(thys,seqin);
13748     }
13749 
13750     ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13751     i=0;
13752 
13753     while(ajStrTokenNextParse(seqHandle, &seqToken))
13754     {
13755         switch(++i)
13756         {
13757             case 1:
13758                 break;
13759             case 2:
13760                 seqSetName(thys, seqToken);
13761                 break;
13762             case 3:
13763                 ajStrToUint(seqToken, &seqlen);
13764                 break;
13765             case 4:
13766                 if(ajStrMatchC(seqToken, "aa"))
13767                 {
13768                     ajFilebuffSetBuffered(buff);
13769                     ajTextinStoreReset(seqin->Input, &thys->TextPtr);
13770                     ajStrDelStatic(&seqToken);
13771                     ajStrTokenReset(seqHandle);
13772 
13773                     ajDebug("first line %d aa pass to refseqp '%S'\n",
13774                             buff->Pos, seqReadLine);
13775                     return seqReadRefseqp(thys,seqin);
13776                 }
13777                 if(!ajStrMatchC(seqToken, "bp"))
13778                     ajWarn("bad Genbank LOCUS line '%S'", seqReadLine);
13779                 break;
13780             case 5:
13781                 ajSeqmolSetGb(&thys->Molecule, seqToken);
13782                 break;
13783             case 6:
13784                 if(ajStrMatchC(seqToken, "circular"))
13785                     thys->Circular = ajTrue;
13786                 break;
13787             case 7:
13788                 ajSeqdivSetGb(&thys->Division, seqToken);
13789                 ajSeqclsSetGb(&thys->Class, seqToken);
13790                 break;
13791             case 8:
13792                 if(!thys->Date)
13793                     thys->Date = ajSeqdateNew();
13794                 ajSeqdateSetModifyS(thys->Date, seqToken);
13795                 break;
13796             default:
13797                 break;
13798         }
13799     }
13800 
13801     if(seqin->Input->Text)
13802         ajStrAssignS(&thys->TextPtr, seqReadLine);
13803 
13804     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13805     lineprefix = seqPrefixGenbank(seqReadLine);
13806 
13807     dofeat = ajFalse;
13808     tryfeat = seqinUfoLocal(seqin);
13809 
13810     while(ok &&
13811           lineprefix != GB_END &&
13812           lineprefix != GB_ORI &&
13813           lineprefix != GB_BASE)
13814     {
13815         done = ajFalse;
13816 
13817         if(lineprefix == GB_DEF)
13818         {
13819             ajDebug("definition found\n");
13820             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13821             ajStrTokenStep(seqHandle); /* 'DEFINITION' */
13822             ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc); /* desc */
13823             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13824             lineprefix = seqPrefixGenbank(seqReadLine);
13825             done = ajTrue;
13826 
13827             while(ok && lineprefix == GB_MORE)
13828             {
13829                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13830                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
13831                 ajStrAppendC(&thys->Desc, " ");
13832                 ajStrAppendS(&thys->Desc, seqToken);
13833                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13834                 lineprefix = seqPrefixGenbank(seqReadLine);
13835             }
13836         }
13837 
13838         else if(lineprefix == GB_AC)
13839         {
13840             ajDebug("accession found\n");
13841 
13842             ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
13843             ajStrTokenStep(seqHandle); /* 'ACCESSION' */
13844 
13845             while(ajStrTokenNextParse(seqHandle, &seqToken))
13846                 seqAccSave(thys, seqToken);
13847         }
13848 
13849         else if(lineprefix == GB_VER)
13850         {
13851             ajDebug("seqversion found\n");
13852 
13853             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13854             ajStrTokenStep(seqHandle); /* 'VERSION' */
13855             ajStrTokenNextParse(seqHandle, &seqToken);
13856             seqSvSave(thys, seqToken);
13857 
13858             if(ajStrTokenStepC(seqHandle, ": \n\r")) /* GI: */
13859             {
13860                 ajStrTokenNextParse(seqHandle, &thys->Gi);
13861             }
13862         }
13863 
13864         else if(lineprefix == GB_SRC)
13865         {
13866             ajDebug("source found\n");
13867             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13868             ajStrTokenStep(seqHandle); /* 'SOURCE' */
13869             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* source */
13870             ajStrTokenAssignC(&seqHandle2, seqToken, "()");
13871             itaxtype=1;
13872 
13873             while(ajStrTokenNextParse(seqHandle2, &seqToken2))
13874             {
13875                 ajStrTrimWhite(&seqToken2);
13876                 seqTaxSave(thys, seqToken2, itaxtype);
13877                 itaxtype = 3;
13878             }
13879 
13880             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13881             lineprefix = seqPrefixGenbank(seqReadLine);
13882             done = ajTrue;
13883 
13884             while(ok && lineprefix == GB_MORE)
13885             {
13886                 done = ajFalse;
13887 /* process organism lines */
13888 
13889                 moreprefix = seqPrefixGenbankMore(seqReadLine);
13890 
13891                 if(moreprefix == GB_MORE_ORG)
13892                 {
13893                     ajDebug("organism found\n");
13894                     ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13895                     ajStrTokenNextParse(seqHandle, &seqToken); /* 'ORGANISM' */
13896 
13897                     while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
13898                     {
13899                         ajStrTrimWhite(&seqToken);
13900                         seqTaxSave(thys, seqToken, 1);
13901                     }
13902 
13903                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13904 
13905                     moreprefix = seqPrefixGenbankMore(seqReadLine);
13906                     done = ajTrue;
13907 
13908                     while(ok && moreprefix == GB_MORE_MORE)
13909                     {
13910                         ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
13911 
13912                         while(ajStrTokenNextParseC(seqHandle, ".;\n\r",
13913                                                    &seqToken))
13914                         {
13915                             ajStrTrimWhite(&seqToken);
13916                             seqTaxSave(thys, seqToken, 0);
13917                         }
13918 
13919                         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13920                         moreprefix = seqPrefixGenbankMore(seqReadLine);
13921                     }
13922                 }
13923 
13924                 if(!done)
13925                 {
13926                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13927                 }
13928                 lineprefix = seqPrefixGenbank(seqReadLine);
13929             }
13930         }
13931 
13932         else if(tryfeat && lineprefix == GB_FEAT)
13933         {
13934             ajDebug("features found\n");
13935 
13936             if(!dofeat)
13937             {
13938                 dofeat = ajTrue;
13939                 ajFeattabinDel(&seqin->Ftquery);
13940                 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtGenbank,
13941                                                        thys->Name, "N");
13942                 ajDebug("seqin->Ftquery Filebuff %x\n",
13943                         seqin->Ftquery->Input->Filebuff);
13944                 /* ajDebug("GENBANK FEAT first line:\n%S", seqReadLine); */
13945             }
13946 
13947             ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
13948             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13949             lineprefix = seqPrefixGenbank(seqReadLine);
13950             done = ajTrue;
13951 
13952             while(ok && lineprefix == GB_MORE)
13953             {
13954                 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff,
13955                                 seqReadLine);
13956                 /* ajDebug("GENBANK FEAT saved line:\n%S", seqReadLine); */
13957                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13958                 lineprefix = seqPrefixGenbank(seqReadLine);
13959             }
13960         }
13961 
13962         else if(lineprefix == GB_REF)
13963         {
13964             ajDebug("reference found\n");
13965             seqref = ajSeqrefNew();
13966             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
13967             ajStrTokenStep(seqHandle); /* 'REFERENCE' */
13968             ajStrTokenNextParse(seqHandle, &seqToken); /* number */
13969             ajStrToUint(seqToken, &refnum);
13970             ajSeqrefSetnumNumber(seqref, refnum);
13971             ajStrAssignClear(&seqToken2);
13972 
13973             while (ajStrTokenNextParse(seqHandle, &seqToken))
13974             {
13975                 if(ajStrMatchC(seqToken, "(bases"))
13976                     continue;
13977 
13978                 if(ajStrMatchC(seqToken, "(residues"))
13979                     continue;
13980 
13981                 if(ajStrMatchC(seqToken, "to"))
13982                     continue;
13983 
13984                 if(!ajStrGetLen(seqToken2))
13985                     ajStrAssignS(&seqToken2, seqToken);
13986 
13987                 if(ajStrSuffixC(seqToken, ")"))
13988                 {
13989                     ajStrTrimEndC(&seqToken, ")");
13990                     ajStrAppendK(&seqToken2, '-');
13991                     ajStrAppendS(&seqToken2, seqToken);
13992                 }
13993             }
13994 
13995             ajSeqrefSetPosition(seqref, seqToken2);
13996 
13997             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
13998             moreprefix = seqPrefixGenbankMore(seqReadLine);
13999             done = ajTrue;
14000 
14001             ajSeqrefStandard(seqref);
14002             ajSeqAddRef(thys, seqref);
14003 
14004             if(ok && moreprefix == GB_MORE_AUT)
14005             {
14006                 ajDebug("authors found\n");
14007                 if(!seqref)
14008                     seqref = ajSeqrefNew();
14009                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14010                 ajStrTokenStep(seqHandle); /* 'AUTHORS' */
14011                 ajStrTokenNextParseC(seqHandle, "\n\r",
14012                                      &seqToken2); /* authors */
14013 
14014                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14015                 moreprefix = seqPrefixGenbankMore(seqReadLine);
14016 
14017                 while(ok && moreprefix == GB_MORE_MORE)
14018                 {
14019                     ajStrAssignS(&seqToken, seqReadLine);
14020                     ajStrTrimWhite(&seqToken);
14021                     if(ajStrSuffixC(seqToken2, ".,") ||
14022                        ajStrPrefixC(seqToken, "and "))
14023                         ajStrAppendC(&seqToken2, " ");
14024                     ajStrAppendS(&seqToken2, seqToken);
14025                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14026                     moreprefix = seqPrefixGenbankMore(seqReadLine);
14027                 }
14028 
14029                 /* append here - genbank splits author names across lines */
14030                 ajSeqrefAppendAuthors(seqref, seqToken2);
14031             }
14032 
14033             if(ok && moreprefix == GB_MORE_TIT)
14034             {
14035                 ajDebug("title found\n");
14036                 if(!seqref)
14037                     seqref = ajSeqrefNew();
14038 
14039                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14040                 ajStrTokenStep(seqHandle); /* 'TITLE' */
14041                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
14042 
14043                 ajSeqrefAppendTitle(seqref, seqToken);
14044 
14045                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14046                 moreprefix = seqPrefixGenbankMore(seqReadLine);
14047 
14048                 while(ok && moreprefix == GB_MORE_MORE)
14049                 {
14050                     ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14051                     ajStrTokenStepC(seqHandle, "\n\r"); /* title */
14052                     ajSeqrefAppendTitle(seqref, seqToken);
14053 
14054                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14055                     moreprefix = seqPrefixGenbankMore(seqReadLine);
14056                 }
14057             }
14058 
14059             if(ok && moreprefix == GB_MORE_JNL)
14060             {
14061                 ajDebug("journal location found\n");
14062                 if(!seqref)
14063                     seqref = ajSeqrefNew();
14064 
14065                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14066                 ajStrTokenStep(seqHandle); /* 'JOURNAL' */
14067                 ajStrTokenNextParseC(seqHandle, "\n\r",
14068                                      &seqToken); /* location */
14069 
14070                 ajSeqrefAppendLocation(seqref, seqToken);
14071 
14072                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14073                 moreprefix = seqPrefixGenbankMore(seqReadLine);
14074             }
14075 
14076             while(ok && moreprefix == GB_MORE_MORE)
14077             {
14078                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14079                 moreprefix = seqPrefixGenbankMore(seqReadLine);
14080             }
14081 
14082             seqref = NULL;
14083             lineprefix = seqPrefixGenbank(seqReadLine);
14084         }
14085 
14086         else if(ok && lineprefix == GB_CC)
14087         {
14088             ajDebug("comment found\n");
14089 
14090             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14091             ajStrTokenStep(seqHandle); /* 'COMMENT' */
14092             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
14093 
14094             if(ajStrGetLen(cmtstr))
14095                 ajStrAppendC(&cmtstr, "\n");
14096             ajStrAppendS(&cmtstr, seqToken);
14097 
14098             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14099             moreprefix = seqPrefixGenbankMore(seqReadLine);
14100             done = ajTrue;
14101 
14102             while(ok && moreprefix == GB_MORE_MORE)
14103             {
14104                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14105                 ajStrTokenNextParseC(seqHandle, "\n\r",
14106                                      &seqToken); /* comment */
14107 
14108                 if(ajStrGetLen(seqToken))
14109                 {
14110                     if(ajStrGetLen(cmtstr))
14111                         ajStrAppendC(&cmtstr, "\n");
14112                     ajStrAppendS(&cmtstr, seqToken);
14113                 }
14114                 else
14115                 {
14116                     ajSeqAddCmt(thys, cmtstr);
14117                     cmtstr = NULL;
14118                 }
14119 
14120                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14121                 moreprefix = seqPrefixGenbankMore(seqReadLine);
14122             }
14123 
14124             if(ajStrGetLen(cmtstr))
14125                 ajSeqAddCmt(thys, cmtstr);
14126 
14127             lineprefix = seqPrefixGenbank(seqReadLine);
14128             cmtstr = NULL;
14129         }
14130 
14131         else if(lineprefix == GB_KEY)
14132         {
14133             ajDebug("keywords found\n");
14134             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14135             ajStrTokenStep(seqHandle); /* 'KEYWORDS' */
14136 
14137             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14138             {
14139                 liststr = ajStrNewS(seqToken);
14140                 ajStrTrimWhite(&liststr);
14141                 ajSeqAddKey(thys, liststr);
14142                 liststr = NULL;
14143             }
14144 
14145             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14146             lineprefix = seqPrefixGenbank(seqReadLine);
14147             done = ajTrue;
14148 
14149             while(ok && lineprefix == GB_MORE)
14150             {
14151                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14152 
14153                 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14154                 {
14155                     liststr = ajStrNewS(seqToken);
14156                     ajStrTrimWhite(&liststr);
14157                     ajSeqAddKey(thys, liststr);
14158                     liststr = NULL;
14159                 }
14160 
14161                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14162                 lineprefix = seqPrefixGenbank(seqReadLine);
14163             }
14164         }
14165 
14166         if(!done)
14167         {
14168             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14169             lineprefix = seqPrefixGenbank(seqReadLine);
14170         }
14171     }
14172 
14173     if(dofeat)
14174     {
14175         ajDebug("GENBANK FEAT TabIn %x\n", seqin->Ftquery);
14176         ajFeattableDel(&thys->Fttable);
14177         thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
14178         /* ajFeattableTrace(thys->Fttable); */
14179         ajFeattabinClear(seqin->Ftquery);
14180     }
14181 
14182     if(ajStrGetLen(seqin->Inseq))
14183     {
14184         /* we have a sequence to use */
14185         ajDebug("Got an Inseq sequence\n");
14186 
14187         if(ajStrMatchC(qry->Method,"gcg"))
14188         {
14189             while(ok && lineprefix != GB_ORI)
14190             {
14191                 ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14192                 lineprefix = seqPrefixGenbank(seqReadLine);
14193             }
14194         }
14195 
14196         ajStrAssignS(&thys->Seq, seqin->Inseq);
14197 
14198         if(seqin->Input->Text)
14199         {
14200             seqTextSeq(&thys->TextPtr, seqin->Inseq);
14201             ajFmtPrintAppS(&thys->TextPtr, "//\n");
14202         }
14203     }
14204     else
14205     {
14206         /* read the sequence and terminator */
14207         ajDebug("sequence start at '%S'\n", seqReadLine);
14208 
14209         while(ok &&
14210               lineprefix != GB_END &&
14211               lineprefix != GB_ORI &&
14212               lineprefix != GB_BASE)
14213         {
14214             ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14215             lineprefix = seqPrefixGenbank(seqReadLine);
14216 
14217             if(!ok)
14218                 break;
14219         }
14220 
14221         if(ok && lineprefix != GB_END)
14222         {
14223             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14224             lineprefix = seqPrefixGenbank(seqReadLine);
14225         }
14226 
14227         ajStrSetRes(&thys->Seq, seqlen+1);
14228 
14229         while(ok && lineprefix != GB_END)
14230         {
14231             if(lineprefix != GB_ORI &&
14232                lineprefix != GB_BASE)
14233                 seqAppend(&thys->Seq, seqReadLine);
14234 
14235             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14236             lineprefix = seqPrefixGenbank(seqReadLine);
14237         }
14238     }
14239 
14240     if(!ajStrMatchC(qry->Method,"gcg"))
14241     {
14242         while(ok && lineprefix != GB_END)
14243         {
14244             ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14245             lineprefix = seqPrefixGenbank(seqReadLine);
14246         }
14247     }
14248 
14249     if(thys->Fttable)
14250         ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
14251 
14252     if(ajFeattableGetSize(thys->Fttable))
14253     {
14254         ajFeattableGetXrefs(thys->Fttable, &thys->Xreflist, &taxid);
14255         if(taxid)
14256             seqTaxidSaveI(thys, taxid);
14257     }
14258 
14259     if(!taxid)
14260         taxid = ajSeqGetTaxid(thys);
14261 
14262     ajFilebuffClear(buff, 0);
14263 
14264     ajStrTokenReset(seqHandle);
14265     ajStrTokenReset(seqHandle2);
14266     ajStrDelStatic(&seqToken);
14267     ajStrDelStatic(&seqToken2);
14268 
14269     return ajTrue;
14270 }
14271 
14272 
14273 
14274 
14275 /* @funcstatic seqReadRefseq **************************************************
14276 **
14277 ** Given data in a sequence structure, tries to read everything needed
14278 ** using Refseq format.
14279 **
14280 ** @param [w] thys [AjPSeq] Sequence object
14281 ** @param [u] seqin [AjPSeqin] Sequence input object
14282 ** @return [AjBool] ajTrue on success
14283 **
14284 ** @release 6.1.0
14285 ** @@
14286 ******************************************************************************/
14287 
seqReadRefseq(AjPSeq thys,AjPSeqin seqin)14288 static AjBool seqReadRefseq(AjPSeq thys, AjPSeqin seqin)
14289 {
14290     return seqReadGenbank(thys, seqin);
14291 }
14292 
14293 
14294 
14295 
14296 /* @funcstatic seqReadGenpept *************************************************
14297 **
14298 ** Given data in a sequence structure, tries to read everything needed
14299 ** using Genpept format.
14300 **
14301 ** @param [w] thys [AjPSeq] Sequence object
14302 ** @param [u] seqin [AjPSeqin] Sequence input object
14303 ** @return [AjBool] ajTrue on success
14304 **
14305 ** @release 6.1.0
14306 ** @@
14307 ******************************************************************************/
14308 
seqReadGenpept(AjPSeq thys,AjPSeqin seqin)14309 static AjBool seqReadGenpept(AjPSeq thys, AjPSeqin seqin)
14310 {
14311     AjBool ok;
14312     AjBool done = ajFalse;
14313     AjPFilebuff buff;
14314     AjPStr cmtstr = NULL;
14315 /*
14316 //    AjBool dofeat  = ajFalse;
14317 //    AjBool tryfeat = ajFalse;
14318 */
14319     AjPQuery qry;
14320     AjPStr liststr;                     /* for lists, do not delete */
14321     AjPSeqRef seqref = NULL;
14322     ajuint refnum;
14323     ajuint seqlen = 1024;
14324     ajint i;
14325     ajint nfields;
14326     ajuint itaxtype = 0;
14327     SeqEPrefixGenbank lineprefix = GB_UNK;
14328 
14329     ajDebug("seqReadGenpept\n");
14330 
14331     buff = seqin->Input->Filebuff;
14332     qry  = seqin->Input->Query;
14333 
14334     if(!ajBuffreadLine(buff, &seqReadLine))
14335         return ajFalse;
14336 
14337     lineprefix = seqPrefixGenbank(seqReadLine);
14338 
14339     ok = ajTrue;
14340 
14341     /* extra blank lines */
14342 
14343     while(ajStrIsWhite(seqReadLine))
14344     {
14345         if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
14346             return ajFalse;
14347 
14348         lineprefix = seqPrefixGenbank(seqReadLine);
14349     }
14350 
14351     /* for GCG formatted databases */
14352 
14353     if(lineprefix == GB_WP)
14354     {
14355         ok = ajBuffreadLine(buff, &seqReadLine);
14356         lineprefix = seqPrefixGenbank(seqReadLine);
14357 
14358         while(ok && lineprefix == GB_MORE)
14359         {
14360             ok = ajBuffreadLine(buff, &seqReadLine);
14361             lineprefix = seqPrefixGenbank(seqReadLine);
14362         }
14363     }
14364 
14365     /* This loop necessary owing to headers on GB distro files */
14366     if(ajStrFindC(seqReadLine,"Genetic Sequence Data Bank") >= 0)
14367     {
14368         while(ok && lineprefix != GB_ID) /* LOCUS */
14369         {
14370             ok = ajBuffreadLine(buff, &seqReadLine);
14371             lineprefix = seqPrefixGenbank(seqReadLine);
14372         }
14373     }
14374 
14375     if(!ok)
14376     {
14377         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14378 
14379         return ajFalse;
14380     }
14381 
14382     if(lineprefix != GB_ID)     /* LOCUS */
14383     {
14384         ajDebug("failed - LOCUS not found - first line was\n%S\n",
14385                 seqReadLine);
14386         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14387 
14388         return ajFalse;
14389     }
14390 
14391     nfields = ajStrParseCountC(seqReadLine, " \n\r");
14392 
14393     if(nfields == 8)
14394     {
14395         ajFilebuffSetBuffered(buff);
14396         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14397 
14398         return seqReadRefseqp(thys,seqin);
14399     }
14400 
14401     ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14402     i=0;
14403 
14404     while(ajStrTokenNextParse(seqHandle, &seqToken))
14405     {
14406         switch(++i)
14407         {
14408             case 1:
14409                 break;
14410             case 2:
14411                 seqSetName(thys, seqToken);
14412                 break;
14413             case 3:
14414                 ajStrToUint(seqToken, &seqlen);
14415                 break;
14416             case 4:
14417                 if(!ajStrMatchC(seqToken, "aa"))
14418                     ajWarn("bad Genpept LOCUS line '%S'", seqReadLine);
14419                 break;
14420             case 5:
14421                 break;
14422             case 6:
14423                 ajSeqdivSetGb(&thys->Division, seqToken);
14424                 ajSeqclsSetGb(&thys->Class, seqToken);
14425                 break;
14426             case 7:
14427                 if(!thys->Date)
14428                     thys->Date = ajSeqdateNew();
14429                 ajSeqdateSetModifyS(thys->Date, seqToken);
14430                 break;
14431             default:
14432                 break;
14433         }
14434     }
14435 
14436     if(seqin->Input->Text)
14437         ajStrAssignS(&thys->TextPtr, seqReadLine);
14438 
14439     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14440     lineprefix = seqPrefixGenbank(seqReadLine);
14441 
14442 /*
14443 //    dofeat = ajFalse;
14444 //    tryfeat = seqinUfoLocal(seqin);
14445 */
14446 
14447     while(ok &&
14448           !ajStrPrefixC(seqReadLine, "ORIGIN") &&
14449           !ajStrPrefixC(seqReadLine, "BASE COUNT"))
14450     {
14451         done = ajFalse;
14452 
14453         if(ajStrPrefixC(seqReadLine, "DEFINITION"))
14454         {
14455             ajDebug("definition found\n");
14456             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14457             ajStrTokenStep(seqHandle); /* 'DEFINITION' */
14458             ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc); /* desc */
14459             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14460             lineprefix = seqPrefixGenbank(seqReadLine);
14461             done = ajTrue;
14462 
14463             while(ok && ajStrPrefixC(seqReadLine, " "))
14464             {
14465                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14466                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
14467                 ajStrAppendC(&thys->Desc, " ");
14468                 ajStrAppendS(&thys->Desc, seqToken);
14469                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14470                 lineprefix = seqPrefixGenbank(seqReadLine);
14471             }
14472         }
14473 
14474         else if(ajStrPrefixC(seqReadLine, "ACCESSION"))
14475         {
14476             ajDebug("accession found\n");
14477 
14478             ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
14479             ajStrTokenStep(seqHandle); /* 'ACCESSION' */
14480 
14481             while(ajStrTokenNextParse(seqHandle, &seqToken))
14482                 seqAccSave(thys, seqToken);
14483         }
14484 
14485         else if(ajStrPrefixC(seqReadLine, "VERSION"))
14486         {
14487             ajDebug("seqversion found\n");
14488 
14489             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14490             ajStrTokenStep(seqHandle); /* 'VERSION' */
14491             ajStrTokenNextParse(seqHandle, &seqToken);
14492             seqSvSave(thys, seqToken);
14493 
14494             if(ajStrTokenStepC(seqHandle, ": \n\r")) /* GI: */
14495             {
14496                 ajStrTokenNextParse(seqHandle, &thys->Gi);
14497             }
14498         }
14499 
14500         else if(ajStrPrefixC(seqReadLine, "SOURCE"))
14501         {
14502             ajDebug("source found\n");
14503             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14504             ajStrTokenStep(seqHandle); /* 'SOURCE' */
14505             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* source */
14506             ajStrTokenAssignC(&seqHandle2, seqToken, "()");
14507             itaxtype=1;
14508 
14509             while(ajStrTokenNextParse(seqHandle2, &seqToken2))
14510             {
14511                 ajStrTrimWhite(&seqToken2);
14512                 seqTaxSave(thys, seqToken2, itaxtype);
14513                 itaxtype = 3;
14514             }
14515 
14516             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14517             lineprefix = seqPrefixGenbank(seqReadLine);
14518             done = ajTrue;
14519 
14520             while(ok && ajStrPrefixC(seqReadLine, " "))
14521             {
14522                 done = ajFalse;
14523 /* process organism lines */
14524 
14525                 if(ajStrPrefixC(seqReadLine, "  ORGANISM"))
14526                 {
14527                     ajDebug("organism found\n");
14528                     ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14529                     ajStrTokenStep(seqHandle); /* 'ORGANISM' */
14530 
14531                     while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14532                     {
14533                         ajStrTrimWhite(&seqToken);
14534                         seqTaxSave(thys, seqToken, 1);
14535                     }
14536 
14537                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14538                     lineprefix = seqPrefixGenbank(seqReadLine);
14539                     done = ajTrue;
14540 
14541                     while(ok && ajStrPrefixC(seqReadLine, "    "))
14542                     {
14543                         ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14544 
14545                         while(ajStrTokenNextParseC(seqHandle, ".;\n\r",
14546                                                    &seqToken))
14547                         {
14548                             ajStrAssignS(&seqToken2, seqToken);
14549                             ajStrTrimWhite(&seqToken2);
14550                             seqTaxSave(thys, seqToken2, 0);
14551                         }
14552 
14553                         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14554                         lineprefix = seqPrefixGenbank(seqReadLine);
14555                     }
14556                 }
14557 
14558                 if(!done)
14559                 {
14560                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14561                     lineprefix = seqPrefixGenbank(seqReadLine);
14562                 }
14563             }
14564         }
14565 
14566         else if(ajStrPrefixC(seqReadLine, "REFERENCE"))
14567         {
14568             ajDebug("reference found\n");
14569             seqref = ajSeqrefNew();
14570             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14571             ajStrTokenStep(seqHandle); /* 'REFERENCE' */
14572             ajStrTokenNextParse(seqHandle, &seqToken); /* number */
14573             ajStrToUint(seqToken, &refnum);
14574             ajSeqrefSetnumNumber(seqref, refnum);
14575             ajStrAssignClear(&seqToken2);
14576 
14577             while (ajStrTokenNextParse(seqHandle, &seqToken))
14578             {
14579                 if(ajStrMatchC(seqToken, "(bases"))
14580                     continue;
14581 
14582                 if(ajStrMatchC(seqToken, "(residues"))
14583                     continue;
14584 
14585                 if(ajStrMatchC(seqToken, "to"))
14586                     continue;
14587 
14588                 if(!ajStrGetLen(seqToken2))
14589                     ajStrAssignS(&seqToken2, seqToken);
14590 
14591                 if(ajStrSuffixC(seqToken, ")"))
14592                 {
14593                     ajStrTrimEndC(&seqToken, ")");
14594                     ajStrAppendK(&seqToken2, '-');
14595                     ajStrAppendS(&seqToken2, seqToken);
14596                 }
14597             }
14598 
14599             ajSeqrefSetPosition(seqref, seqToken2);
14600 
14601             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine,&thys->TextPtr);
14602             lineprefix = seqPrefixGenbank(seqReadLine);
14603             done = ajTrue;
14604 
14605             ajSeqrefStandard(seqref);
14606             ajSeqAddRef(thys, seqref);
14607 
14608             if(ok && ajStrPrefixC(seqReadLine, "  AUTHORS"))
14609             {
14610                 ajDebug("authors found\n");
14611                 if(!seqref)
14612                     seqref = ajSeqrefNew();
14613                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14614                 ajStrTokenStep(seqHandle); /* 'AUTHORS' */
14615                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken2); /* authors */
14616 
14617                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14618                 lineprefix = seqPrefixGenbank(seqReadLine);
14619 
14620                 while(ok && ajStrPrefixC(seqReadLine, "          "))
14621                 {
14622                     ajStrAssignS(&seqToken, seqReadLine);
14623                     ajStrTrimWhite(&seqToken);
14624                     if(ajStrSuffixC(seqToken2, ".,") ||
14625                        ajStrPrefixC(seqToken2, "and "))
14626                         ajStrAppendC(&seqToken2, " ");
14627                     ajStrAppendS(&seqToken2, seqToken);
14628                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14629                     lineprefix = seqPrefixGenbank(seqReadLine);
14630                 }
14631 
14632                 /* append here - genbank splits author names across lines */
14633                 ajSeqrefAppendAuthors(seqref, seqToken2);
14634             }
14635 
14636             if(ok && ajStrPrefixC(seqReadLine, "  TITLE"))
14637             {
14638                 ajDebug("title found\n");
14639                 if(!seqref)
14640                     seqref = ajSeqrefNew();
14641 
14642                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14643                 ajStrTokenStep(seqHandle); /* 'TITLE' */
14644                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
14645 
14646                 ajSeqrefAppendTitle(seqref, seqToken);
14647 
14648                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14649                 lineprefix = seqPrefixGenbank(seqReadLine);
14650 
14651                 while(ok && ajStrPrefixC(seqReadLine, "          "))
14652                 {
14653                     ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14654                     ajStrTokenNextParseC(seqHandle, "\n\r",
14655                                          &seqToken); /* title */
14656                     ajSeqrefAppendTitle(seqref, seqToken);
14657 
14658                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14659                     lineprefix = seqPrefixGenbank(seqReadLine);
14660                 }
14661             }
14662 
14663             if(ok && ajStrPrefixC(seqReadLine, "  JOURNAL"))
14664             {
14665                 ajDebug("journal location found\n");
14666                 if(!seqref)
14667                     seqref = ajSeqrefNew();
14668 
14669                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14670                 ajStrTokenStep(seqHandle); /* 'JOURNAL' */
14671                 ajStrTokenNextParseC(seqHandle, "\n\r",
14672                                      &seqToken); /* location */
14673 
14674                 ajSeqrefAppendLocation(seqref, seqToken);
14675 
14676                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14677                 lineprefix = seqPrefixGenbank(seqReadLine);
14678             }
14679 
14680             while(ok && ajStrPrefixC(seqReadLine, "  "))
14681             {
14682                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14683                 lineprefix = seqPrefixGenbank(seqReadLine);
14684             }
14685 
14686             seqref = NULL;
14687         }
14688 
14689         else if(ok && ajStrPrefixC(seqReadLine, "COMMENT"))
14690         {
14691             ajDebug("comment found\n");
14692 
14693             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14694             ajStrTokenStep(seqHandle); /* 'COMMENT' */
14695             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
14696 
14697             if(ajStrGetLen(cmtstr))
14698                 ajStrAppendC(&cmtstr, "\n");
14699             ajStrAppendS(&cmtstr, seqToken);
14700 
14701             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14702             lineprefix = seqPrefixGenbank(seqReadLine);
14703             done = ajTrue;
14704 
14705             while(ok && ajStrPrefixC(seqReadLine, "          "))
14706             {
14707                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14708                 ajStrTokenNextParseC(seqHandle, "\n\r",
14709                                      &seqToken); /* comment */
14710 
14711                 if(ajStrGetLen(seqToken))
14712                 {
14713                     if(ajStrGetLen(cmtstr))
14714                         ajStrAppendC(&cmtstr, "\n");
14715                     ajStrAppendS(&cmtstr, seqToken);
14716                 }
14717                 else
14718                 {
14719                     ajSeqAddCmt(thys, cmtstr);
14720                     cmtstr = NULL;
14721                 }
14722 
14723                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14724                 lineprefix = seqPrefixGenbank(seqReadLine);
14725             }
14726 
14727             if(ajStrGetLen(cmtstr))
14728                 ajSeqAddCmt(thys, cmtstr);
14729 
14730             cmtstr = NULL;
14731         }
14732 
14733         else if(ajStrPrefixC(seqReadLine, "KEYWORDS"))
14734         {
14735             ajDebug("keywords found\n");
14736             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14737             ajStrTokenStep(seqHandle); /* 'KEYWORDS' */
14738 
14739             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14740             {
14741                 liststr = ajStrNewS(seqToken);
14742                 ajStrTrimWhite(&liststr);
14743                 ajSeqAddKey(thys, liststr);
14744                 liststr = NULL;
14745             }
14746 
14747             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14748             lineprefix = seqPrefixGenbank(seqReadLine);
14749             done = ajTrue;
14750 
14751             while(ok && ajStrPrefixC(seqReadLine, " "))
14752             {
14753                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14754 
14755                 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14756                 {
14757                     liststr = ajStrNewS(seqToken);
14758                     ajStrTrimWhite(&liststr);
14759                     ajSeqAddKey(thys, liststr);
14760                     liststr = NULL;
14761                 }
14762 
14763                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14764                 lineprefix = seqPrefixGenbank(seqReadLine);
14765             }
14766         }
14767 
14768         else if(ajStrPrefixC(seqReadLine, "  ORGANISM"))
14769         {
14770             ajDebug("organism found\n");
14771             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14772             ajStrTokenStep(seqHandle); /* 'ORGANISM' */
14773 
14774             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14775             {
14776                 ajStrTrimWhite(&seqToken);
14777                 seqTaxSave(thys, seqToken, 0);
14778             }
14779 
14780             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14781             lineprefix = seqPrefixGenbank(seqReadLine);
14782             done = ajTrue;
14783 
14784             while(ok && ajStrPrefixC(seqReadLine, "    "))
14785             {
14786                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
14787 
14788                 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
14789                 {
14790                     ajStrTrimWhite(&seqToken);
14791                     seqTaxSave(thys, seqToken, 0);
14792                 }
14793 
14794                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14795                 lineprefix = seqPrefixGenbank(seqReadLine);
14796             }
14797         }
14798 
14799         if(!done)
14800         {
14801             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14802             lineprefix = seqPrefixGenbank(seqReadLine);
14803         }
14804 
14805     }
14806 /*
14807 //    if(dofeat)
14808 //    {
14809 //        ajDebug("GENPEPT FEAT TabIn %x\n", seqin->Ftquery);
14810 //        ajFeattableDel(&thys->Fttable);
14811 //        thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
14812 //        /# ajFeattableTrace(thys->Fttable); #/
14813 //        ajFeattabinClear(seqin->Ftquery);
14814 //    }
14815 */
14816 
14817     if(ajStrGetLen(seqin->Inseq))
14818     {
14819         /* we have a sequence to use */
14820         ajDebug("Got an Inseq sequence\n");
14821 
14822         if(ajStrMatchC(qry->Method,"gcg"))
14823             while(ok && !ajStrPrefixC(seqReadLine,"ORIGIN"))
14824                 ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14825 
14826         ajStrAssignS(&thys->Seq, seqin->Inseq);
14827 
14828         if(seqin->Input->Text)
14829         {
14830             seqTextSeq(&thys->TextPtr, seqin->Inseq);
14831             ajFmtPrintAppS(&thys->TextPtr, "//\n");
14832         }
14833     }
14834     else
14835     {
14836         /* read the sequence and terminator */
14837         ajDebug("sequence start at '%S'\n", seqReadLine);
14838 
14839         while(!ajStrPrefixC(seqReadLine,"ORIGIN") &&
14840               !ajStrPrefixC(seqReadLine,"BASE COUNT"))
14841             if(!ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr))
14842                 break;
14843 
14844         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14845         ajStrSetRes(&thys->Seq, seqlen+1);
14846 
14847         while(ok && !ajStrPrefixC(seqReadLine, "//"))
14848         {
14849             if(!ajStrPrefixC(seqReadLine, "ORIGIN") &&
14850                !ajStrPrefixC(seqReadLine,"BASE COUNT"))
14851                 seqAppend(&thys->Seq, seqReadLine);
14852             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14853         }
14854     }
14855 
14856     if(!ajStrMatchC(qry->Method,"gcg"))
14857         while(ok && !ajStrPrefixC(seqReadLine,"//"))
14858             ok = ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr);
14859 
14860 
14861     if(thys->Fttable)
14862         ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
14863 
14864     ajFilebuffClear(buff, 0);
14865 
14866     ajStrTokenReset(seqHandle);
14867     ajStrTokenReset(seqHandle2);
14868     ajStrDelStatic(&seqToken);
14869     ajStrDelStatic(&seqToken2);
14870 
14871     return ajTrue;
14872 }
14873 
14874 
14875 
14876 
14877 /* @funcstatic seqReadRefseqp *************************************************
14878 **
14879 ** Given a sequence input object, tries to read an entry in Refseq protein
14880 ** format (which must start with a LOCUS line) into the sequence object.
14881 **
14882 ** @param [w] thys [AjPSeq] Sequence object
14883 ** @param [u] seqin [AjPSeqin] Sequence input object
14884 ** @return [AjBool] ajTrue on success
14885 **
14886 ** @release 6.1.0
14887 ** @@
14888 ******************************************************************************/
14889 
seqReadRefseqp(AjPSeq thys,AjPSeqin seqin)14890 static AjBool seqReadRefseqp(AjPSeq thys, AjPSeqin seqin)
14891 {
14892     AjBool ok;
14893     AjBool done = ajFalse;
14894     AjPFilebuff buff;
14895     AjPStr cmtstr = NULL;
14896     AjBool dofeat  = ajFalse;
14897     AjBool tryfeat = ajFalse;
14898     AjPQuery qry;
14899     AjPStr liststr;                     /* for lists, do not delete */
14900     AjPSeqRef seqref = NULL;
14901     ajuint refnum;
14902     ajuint seqlen = 1024;
14903     ajint i;
14904     ajuint itaxtype = 0;
14905 
14906     ajDebug("seqReadRefseqp\n");
14907 
14908     buff = seqin->Input->Filebuff;
14909     qry  = seqin->Input->Query;
14910 
14911     if(!seqFtFmtRefseqp)
14912         ajStrAssignC(&seqFtFmtRefseqp, "refseqp");
14913 
14914     if(!ajBuffreadLine(buff, &seqReadLine))
14915         return ajFalse;
14916 
14917     ajDebug("++seqReadRefseqp  %d first line '%S'\n", buff->Pos, seqReadLine);
14918 
14919     ok = ajTrue;
14920 
14921     /* extra blank lines */
14922 
14923     while(ajStrIsWhite(seqReadLine))
14924     {
14925         if(!ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr))
14926             return ajFalse;
14927     }
14928 
14929     /* for GCG formatted databases */
14930 
14931     if(ajStrPrefixC(seqReadLine, "WPCOMMENT"))
14932     {
14933         ok = ajBuffreadLine(buff, &seqReadLine);
14934 
14935         while(ok && ajStrPrefixC(seqReadLine, " "))
14936         {
14937             ok = ajBuffreadLine(buff, &seqReadLine);
14938         }
14939     }
14940 
14941     if(!ok)
14942     {
14943         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14944 
14945         return ajFalse;
14946     }
14947 
14948     if(!ajStrPrefixC(seqReadLine, "LOCUS"))
14949     {
14950         ajDebug("failed - LOCUS not found - first line was\n%S\n",
14951                 seqReadLine);
14952         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
14953         return ajFalse;
14954     }
14955 
14956     if(seqin->Input->Text)
14957         ajStrAssignS(&thys->TextPtr,seqReadLine);
14958 
14959     ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
14960     i=0;
14961 
14962     while(ajStrTokenNextParse(seqHandle, &seqToken))
14963     {
14964         switch(++i)
14965         {
14966             case 1:             /* 'LOCUS' */
14967                 break;
14968             case 2:             /* locus name */
14969                 seqSetName(thys, seqToken);
14970                 break;
14971             case 3:             /* length */
14972                 ajStrToUint(seqToken, &seqlen);
14973                 break;
14974             case 4:             /* 'aa' */
14975                 if(!ajStrMatchC(seqToken, "aa"))
14976                     ajWarn("bad RefseqP LOCUS line '%S'", seqReadLine);
14977                 break;
14978             case 5:             /* linear etc. */
14979                 break;
14980             case 6:
14981                 ajSeqdivSetGb(&thys->Division, seqToken);
14982                 ajSeqclsSetGb(&thys->Class, seqToken);
14983                 break;
14984             case 7:
14985                 if(!thys->Date)
14986                     thys->Date = ajSeqdateNew();
14987                 ajSeqdateSetModifyS(thys->Date, seqToken);
14988                 break;
14989             default:
14990                 break;
14991         }
14992     }
14993 
14994     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
14995 
14996     dofeat = ajFalse;
14997     tryfeat = seqinUfoLocal(seqin);
14998 
14999     while(ok &&
15000           !ajStrPrefixC(seqReadLine, "ORIGIN") &&
15001           !ajStrPrefixC(seqReadLine, "BASE COUNT"))
15002     {
15003         done = ajFalse;
15004 
15005         if(ajStrPrefixC(seqReadLine, "DEFINITION"))
15006         {
15007             ajDebug("definition found\n");
15008             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15009             ajStrTokenStep(seqHandle); /* 'DEFINITION' */
15010             ajStrTokenNextParseC(seqHandle, "\n\r", &thys->Desc); /* desc */
15011             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15012             done = ajTrue;
15013 
15014             while(ok && ajStrPrefixC(seqReadLine, " "))
15015             {
15016                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15017                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken);
15018                 ajStrAppendC(&thys->Desc, " ");
15019                 ajStrAppendS(&thys->Desc, seqToken);
15020                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15021             }
15022         }
15023 
15024         else if(ajStrPrefixC(seqReadLine, "ACCESSION"))
15025         {
15026             ajDebug("accession found\n");
15027 
15028             ajStrTokenAssignC(&seqHandle, seqReadLine, " ;\n\r");
15029             ajStrTokenStep(seqHandle); /* 'ACCESSION' */
15030 
15031             while(ajStrTokenNextParse(seqHandle, &seqToken))
15032                 seqAccSave(thys, seqToken);
15033         }
15034 
15035         else if(ajStrPrefixC(seqReadLine, "VERSION"))
15036         {
15037             ajDebug("seqversion found\n");
15038 
15039             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
15040             ajStrTokenStep(seqHandle); /* 'VERSION' */
15041             ajStrTokenNextParse(seqHandle, &seqToken);
15042             seqSvSave(thys, seqToken);
15043 
15044             if(ajStrTokenStepC(seqHandle, ": \n\r")) /* GI: */
15045             {
15046                 ajStrTokenNextParse(seqHandle, &thys->Gi);
15047             }
15048         }
15049 
15050         else if(ajStrPrefixC(seqReadLine, "SOURCE"))
15051         {
15052             ajDebug("source found\n");
15053             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15054             ajStrTokenStep(seqHandle); /* 'SOURCE' */
15055             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* source */
15056             ajStrTokenAssignC(&seqHandle2, seqToken, "()");
15057             itaxtype=1;
15058 
15059             while(ajStrTokenNextParse(seqHandle2, &seqToken2))
15060             {
15061                 ajStrTrimWhite(&seqToken2);
15062                 seqTaxSave(thys, seqToken2, itaxtype);
15063                 itaxtype = 3;
15064             }
15065 
15066             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15067             done = ajTrue;
15068 
15069             while(ok && ajStrPrefixC(seqReadLine, " "))
15070             {
15071                 done = ajFalse;
15072 /* process organism lines */
15073 
15074                 if(ajStrPrefixC(seqReadLine, "  ORGANISM"))
15075                 {
15076                     ajDebug("organism found\n");
15077                     ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15078                     ajStrTokenStep(seqHandle); /* 'ORGANISM' */
15079 
15080                     while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15081                     {
15082                         ajStrTrimWhite(&seqToken);
15083                         seqTaxSave(thys, seqToken, 1);
15084                     }
15085 
15086                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15087 
15088                     done = ajTrue;
15089 
15090                     while(ok && ajStrPrefixC(seqReadLine, "    "))
15091                     {
15092                         ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15093 
15094                         while(ajStrTokenNextParseC(seqHandle, ".;\n\r",
15095                                                    &seqToken))
15096                         {
15097                             ajStrTrimWhite(&seqToken);
15098                             seqTaxSave(thys, seqToken, 0);
15099                         }
15100 
15101                         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15102                     }
15103                 }
15104 
15105                 if(!done)
15106                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15107             }
15108         }
15109 
15110         else if(tryfeat && ajStrPrefixC(seqReadLine, "FEATURES"))
15111         {
15112             ajDebug("features found\n");
15113 
15114             if(!dofeat)
15115             {
15116                 dofeat = ajTrue;
15117                 ajFeattabinDel(&seqin->Ftquery);
15118                 seqin->Ftquery = ajFeattabinNewSeqinSS(seqin, seqFtFmtRefseqp,
15119                                                        thys->Name, "N");
15120                 ajDebug("seqin->Ftquery Filebuff %x\n",
15121                         seqin->Ftquery->Input->Filebuff);
15122                 /* ajDebug("REFSEQP FEAT first line:\n%S", seqReadLine); */
15123             }
15124 
15125             ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff, seqReadLine);
15126             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15127             done = ajTrue;
15128 
15129             while(ok && ajStrPrefixC(seqReadLine, " "))
15130             {
15131                 ajFilebuffLoadS(seqin->Ftquery->Input->Filebuff,
15132                                 seqReadLine);
15133                 /* ajDebug("REFSEQP FEAT saved line:\n%S", seqReadLine); */
15134                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15135             }
15136         }
15137 
15138         else if(ajStrPrefixC(seqReadLine, "REFERENCE"))
15139         {
15140             ajDebug("reference found\n");
15141             seqref = ajSeqrefNew();
15142             ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
15143             ajStrTokenStep(seqHandle); /* 'REFERENCE' */
15144             ajStrTokenNextParse(seqHandle, &seqToken); /* number */
15145             ajStrToUint(seqToken, &refnum);
15146             ajSeqrefSetnumNumber(seqref, refnum);
15147             ajStrAssignClear(&seqToken2);
15148 
15149             while (ajStrTokenNextParse(seqHandle, &seqToken))
15150             {
15151                 if(ajStrMatchC(seqToken, "(bases"))
15152                     continue;
15153 
15154                 if(ajStrMatchC(seqToken, "(residues"))
15155                     continue;
15156 
15157                 if(ajStrMatchC(seqToken, "to"))
15158                     continue;
15159 
15160                 if(!ajStrGetLen(seqToken2))
15161                     ajStrAssignS(&seqToken2, seqToken);
15162 
15163                 if(ajStrSuffixC(seqToken, ")"))
15164                 {
15165                     ajStrTrimEndC(&seqToken, ")");
15166                     ajStrAppendK(&seqToken2, '-');
15167                     ajStrAppendS(&seqToken2, seqToken);
15168                 }
15169             }
15170 
15171             ajSeqrefSetPosition(seqref, seqToken2);
15172 
15173             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15174             done = ajTrue;
15175 
15176             ajSeqrefStandard(seqref);
15177             ajSeqAddRef(thys, seqref);
15178 
15179             if(ok && ajStrPrefixC(seqReadLine, "  AUTHORS"))
15180             {
15181                 ajDebug("authors found\n");
15182                 if(!seqref)
15183                     seqref = ajSeqrefNew();
15184                 ajStrTokenAssignC(&seqHandle, seqReadLine, " \n\r");
15185                 ajStrTokenStep(seqHandle); /* 'AUTHORS' */
15186                 ajStrTokenNextParseC(seqHandle, "\n\r",
15187                                      &seqToken2); /* authors */
15188 
15189                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15190 
15191                 while(ok && ajStrPrefixC(seqReadLine, "          "))
15192                 {
15193                     ajStrAssignS(&seqToken, seqReadLine);
15194                     ajStrTrimWhite(&seqToken);
15195                     if(ajStrSuffixC(seqToken2, ".,") ||
15196                        ajStrPrefixC(seqToken, "and "))
15197                         ajStrAppendC(&seqToken2, " ");
15198                     ajStrAppendS(&seqToken2, seqToken);
15199                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15200                 }
15201 
15202                 /* append here - genbank splits author names across lines */
15203                 ajSeqrefAppendAuthors(seqref, seqToken2);
15204             }
15205 
15206             if(ok && ajStrPrefixC(seqReadLine, "  TITLE"))
15207             {
15208                 ajDebug("title found\n");
15209                 if(!seqref)
15210                     seqref = ajSeqrefNew();
15211 
15212                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15213                 ajStrTokenStep(seqHandle); /* 'TITLE' */
15214                 ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* title */
15215 
15216                 ajSeqrefAppendTitle(seqref, seqToken);
15217 
15218                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15219 
15220                 while(ok && ajStrPrefixC(seqReadLine, "          "))
15221                 {
15222                     ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15223                     ajStrTokenNextParseC(seqHandle, "\n\r",
15224                                          &seqToken); /* title */
15225                     ajSeqrefAppendTitle(seqref, seqToken);
15226 
15227                     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15228                 }
15229             }
15230 
15231             if(ok && ajStrPrefixC(seqReadLine, "  JOURNAL"))
15232             {
15233                 ajDebug("journal location found\n");
15234                 if(!seqref)
15235                     seqref = ajSeqrefNew();
15236 
15237                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15238                 ajStrTokenStep(seqHandle); /* 'JOURNAL' */
15239                 ajStrTokenNextParseC(seqHandle, "\n\r",
15240                                      &seqToken); /* location */
15241 
15242                 ajSeqrefAppendLocation(seqref, seqToken);
15243 
15244                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15245             }
15246 
15247             while(ok && ajStrPrefixC(seqReadLine, "  "))
15248             {
15249                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15250             }
15251 
15252             seqref = NULL;
15253         }
15254 
15255         else if(ok && ajStrPrefixC(seqReadLine, "COMMENT"))
15256         {
15257             ajDebug("comment found\n");
15258 
15259             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15260             ajStrTokenStep(seqHandle); /* 'COMMENT' */
15261             ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken); /* comment */
15262 
15263             if(ajStrGetLen(cmtstr))
15264                 ajStrAppendC(&cmtstr, "\n");
15265             ajStrAppendS(&cmtstr, seqToken);
15266 
15267             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15268             done = ajTrue;
15269 
15270             while(ok && ajStrPrefixC(seqReadLine, "          "))
15271             {
15272                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15273                 ajStrTokenNextParseC(seqHandle, "\n\r",
15274                                      &seqToken); /* comment */
15275 
15276                 if(ajStrGetLen(seqToken))
15277                 {
15278                     if(ajStrGetLen(cmtstr))
15279                         ajStrAppendC(&cmtstr, "\n");
15280                     ajStrAppendS(&cmtstr, seqToken);
15281                 }
15282                 else
15283                 {
15284                     ajSeqAddCmt(thys, cmtstr);
15285                     cmtstr = NULL;
15286                 }
15287 
15288                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15289             }
15290 
15291             if(ajStrGetLen(cmtstr))
15292                  ajSeqAddCmt(thys, cmtstr);
15293 
15294             cmtstr = NULL;
15295         }
15296 
15297         else if(ajStrPrefixC(seqReadLine, "KEYWORDS"))
15298         {
15299             ajDebug("keywords found\n");
15300             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15301             ajStrTokenStep(seqHandle); /* 'KEYWORDS' */
15302 
15303             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15304             {
15305                 liststr = ajStrNewS(seqToken);
15306                 ajStrTrimWhite(&liststr);
15307                 ajSeqAddKey(thys, liststr);
15308                 liststr = NULL;
15309             }
15310 
15311             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15312             done = ajTrue;
15313 
15314             while(ok && ajStrPrefixC(seqReadLine, " "))
15315             {
15316                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15317 
15318                 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15319                 {
15320                     liststr = ajStrNewS(seqToken);
15321                     ajStrTrimWhite(&liststr);
15322                     ajSeqAddKey(thys, liststr);
15323                     liststr = NULL;
15324                 }
15325 
15326                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15327             }
15328         }
15329 
15330         else if(ajStrPrefixC(seqReadLine, "  ORGANISM"))
15331         {
15332             ajDebug("organism found\n");
15333             ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15334             ajStrTokenStep(seqHandle); /* 'ORGANISM' */
15335 
15336             while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15337             {
15338                 ajStrTrimWhite(&seqToken);
15339                 seqTaxSave(thys, seqToken, 0);
15340             }
15341 
15342             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15343             done = ajTrue;
15344 
15345             while(ok && ajStrPrefixC(seqReadLine, "    "))
15346             {
15347                 ajStrTokenAssignC(&seqHandle, seqReadLine, " ");
15348 
15349                 while(ajStrTokenNextParseC(seqHandle, ".;\n\r", &seqToken))
15350                 {
15351                     ajStrTrimWhite(&seqToken);
15352                     seqTaxSave(thys, seqToken, 0);
15353                 }
15354 
15355                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15356             }
15357         }
15358 
15359         if(!done)
15360             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15361     }
15362 
15363     if(dofeat)
15364     {
15365         ajDebug("REFSEQP FEAT TabIn %x\n", seqin->Ftquery);
15366         ajFeattableDel(&thys->Fttable);
15367         thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
15368         /* ajFeattableTrace(thys->Fttable); */
15369         ajFeattabinClear(seqin->Ftquery);
15370     }
15371 
15372     if(ajStrGetLen(seqin->Inseq))
15373     {
15374         /* we have a sequence to use */
15375         ajDebug("Got an Inseq sequence\n");
15376 
15377         if(ajStrMatchC(qry->Method,"gcg"))
15378             while(ok && !ajStrPrefixC(seqReadLine,"ORIGIN"))
15379                 ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15380 
15381         ajStrAssignS(&thys->Seq, seqin->Inseq);
15382 
15383         if(seqin->Input->Text)
15384         {
15385             seqTextSeq(&thys->TextPtr, seqin->Inseq);
15386             ajFmtPrintAppS(&thys->TextPtr, "//\n");
15387         }
15388     }
15389     else
15390     {
15391         /* read the sequence and terminator */
15392         ajDebug("sequence start at '%S'\n", seqReadLine);
15393 
15394         while(!ajStrPrefixC(seqReadLine,"ORIGIN") &&
15395               !ajStrPrefixC(seqReadLine,"BASE COUNT"))
15396             if(!ajTextinStoreReadline(seqin->Input,&seqReadLine, &thys->TextPtr))
15397                 break;
15398 
15399         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15400         ajStrSetRes(&thys->Seq, seqlen+1);
15401 
15402         while(ok && !ajStrPrefixC(seqReadLine, "//"))
15403         {
15404             if(!ajStrPrefixC(seqReadLine, "ORIGIN") &&
15405                !ajStrPrefixC(seqReadLine,"BASE COUNT"))
15406                 seqAppend(&thys->Seq, seqReadLine);
15407 
15408             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15409         }
15410     }
15411 
15412     if(!ajStrMatchC(qry->Method,"gcg"))
15413         while(ok && !ajStrPrefixC(seqReadLine,"//"))
15414             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15415 
15416     if(thys->Fttable)
15417         ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
15418 
15419     ajFilebuffClear(buff, 0);
15420     ajDebug("++last line %d '%S'\n", buff->Pos, seqReadLine);
15421 
15422     ajStrTokenReset(seqHandle);
15423     ajStrDelStatic(&seqToken);
15424     ajStrDelStatic(&seqToken2);
15425 
15426     return ajTrue;
15427 }
15428 
15429 
15430 
15431 
15432 /* @funcstatic seqReadGff2 ****************************************************
15433 **
15434 ** Given data in a sequence structure, tries to read everything needed
15435 ** using GFF2 format.
15436 **
15437 ** GFF1 only offers the sequence, and the type, with the DNA, RNA and
15438 ** Protein and End-xxx headers. GFF2 allows other header lines to be defined,
15439 ** so EMBOSS can add more lines for accession number and description
15440 **
15441 ** GFF2 also defines Type and sequence-region headers, but they only
15442 ** provide information that is also in the DNA, RNA or Protein header
15443 ** and these are required for sequence storage so we ignore the alternatives.
15444 **
15445 ** @param [w] thys [AjPSeq] Sequence object
15446 ** @param [u] seqin [AjPSeqin] Sequence input object
15447 ** @return [AjBool] ajTrue on success
15448 **
15449 ** @release 6.4.0
15450 ** @@
15451 ******************************************************************************/
15452 
seqReadGff2(AjPSeq thys,AjPSeqin seqin)15453 static AjBool seqReadGff2(AjPSeq thys, AjPSeqin seqin)
15454 {
15455     AjBool ok;
15456     AjBool isseq            = ajFalse;
15457     AjPFilebuff buff;
15458     AjPFilebuff ftfile   = NULL;
15459     AjBool dofeat        = ajFalse;
15460     AjPStr typstr = NULL;
15461     AjPStr verstr = NULL;       /* copy of version line */
15462     AjPStr outstr = NULL;       /* generated Type line */
15463 
15464     buff = seqin->Input->Filebuff;
15465 
15466     if(!seqRegGffTyp)
15467         seqRegGffTyp = ajRegCompC("^##([DR]NA|Protein) +([^ \t\r\n]+)");
15468 
15469     if(!seqFtFmtGff)
15470         ajStrAssignC(&seqFtFmtGff, "gff");
15471 
15472     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15473     if(!ok)
15474         return ajFalse;
15475 
15476     ajDebug("seqReadGff2 first line '%S'\n", seqReadLine);
15477 
15478     if(!ajStrPrefixC(seqReadLine, "##gff-version "))
15479     {
15480         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15481 
15482         return ajFalse;
15483     }
15484 
15485     ajStrAssignS(&verstr, seqReadLine);
15486 
15487     if(seqin->Input->Text)
15488         ajStrAssignS(&thys->TextPtr,seqReadLine);
15489 
15490     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15491 
15492     /* read the main header */
15493     while(ok && ajStrPrefixC(seqReadLine, "##"))
15494     {
15495         if(ajRegExec(seqRegGffTyp, seqReadLine))
15496         {
15497             isseq = ajTrue;
15498             ajRegSubI(seqRegGffTyp, 1, &typstr);
15499             ajRegSubI(seqRegGffTyp, 2, &thys->Name);
15500             ajFmtPrintS(&outstr, "##Type %S %S", typstr, thys->Name);
15501         }
15502         else if(ajStrPrefixC(seqReadLine, "##end-"))
15503             isseq = ajFalse;
15504         else if(isseq)
15505             seqAppend(&thys->Seq, seqReadLine);
15506 
15507         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15508     }
15509 
15510     if(!ajSeqGetLen(thys))
15511     {
15512         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15513         return ajFalse;
15514     }
15515 
15516     /* do we want the features now? */
15517 
15518     if(ok & seqinUfoLocal(seqin))
15519     {
15520         dofeat = ajTrue;
15521         ftfile = ajFilebuffNewNofile();
15522         ajFilebuffLoadS(ftfile, verstr);
15523         ajFilebuffLoadS(ftfile, outstr);
15524 
15525         while(ok && !ajStrPrefixC(seqReadLine, "##"))
15526         {
15527             ajFilebuffLoadS(ftfile, seqReadLine);
15528             /* ajDebug("GFF FEAT saved line:\n%S", seqReadLine); */
15529             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15530         }
15531     }
15532 
15533     if(dofeat)
15534     {
15535         ajFeattabinDel(&seqin->Ftquery);
15536         seqin->Ftquery = ajFeattabinNewSeqinSSF(seqin, seqFtFmtGff,
15537                                                 thys->Name,
15538                                                 ajStrGetPtr(seqin->Type),
15539                                                 ftfile);
15540         ajDebug("GFF FEAT TabIn %x type: '%S'\n",
15541                 seqin->Ftquery, seqin->Type);
15542         ftfile = NULL;            /* now copied to seqin->Feattabin */
15543         ajFeattableDel(&seqin->Fttable);
15544         seqin->Fttable = ajFeattableNewRead(seqin->Ftquery);
15545         /* ajFeattableTrace(seqin->Fttable); */
15546         ajFeattableDel(&thys->Fttable);
15547         thys->Fttable = seqin->Fttable;
15548         seqin->Fttable = NULL;
15549     }
15550 
15551     if(ajStrMatchC(typstr, "Protein"))
15552         ajSeqSetProt(thys);
15553     else if(ajSeqIsNuc(thys))
15554         ajSeqSetNuc(thys);
15555     else
15556         ajSeqSetProt(thys);
15557 
15558     if(thys->Fttable)
15559         ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
15560 
15561     ajFilebuffClear(buff, 0);
15562 
15563     ajStrDel(&typstr);
15564     ajStrDel(&verstr);
15565     ajStrDel(&outstr);
15566 
15567     return ajTrue;
15568 }
15569 
15570 
15571 
15572 
15573 /* @funcstatic seqReadGff3 ****************************************************
15574 **
15575 ** Given data in a sequence structure, tries to read everything needed
15576 ** using GFF3 format.
15577 **
15578 ** GFF3 is far stricter than GFF2 but does include a sequence in FASTA format
15579 **
15580 ** GFF also defines Type and sequence-region headers, but they only
15581 ** provide information that is also in the DNA, RNA or Protein header
15582 ** and these are required for sequence storage so we ignore the alternatives.
15583 **
15584 ** @param [w] thys [AjPSeq] Sequence object
15585 ** @param [u] seqin [AjPSeqin] Sequence input object
15586 ** @return [AjBool] ajTrue on success
15587 **
15588 ** @release 6.0.0
15589 ** @@
15590 ******************************************************************************/
15591 
seqReadGff3(AjPSeq thys,AjPSeqin seqin)15592 static AjBool seqReadGff3(AjPSeq thys, AjPSeqin seqin)
15593 {
15594     AjBool ok;
15595     AjPFilebuff buff;
15596     AjPFilebuff ftfile   = NULL;
15597     AjBool dofeat        = ajFalse;
15598     AjPStr verstr = NULL;       /* copy of version line */
15599     AjPStr outstr = NULL;       /* generated Type line */
15600     AjPStr typstr = NULL;
15601     AjPStr rest = NULL;
15602     AjBool wantseq = ajFalse;
15603 
15604     buff = seqin->Input->Filebuff;
15605 
15606     if(!seqFtFmtGff)
15607         ajStrAssignC(&seqFtFmtGff, "gff3");
15608 
15609     if(!seqRegGff3Typ)
15610         seqRegGff3Typ = ajRegCompC("^#!Type (.*)");
15611 
15612     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15613     if(!ok)
15614         return ajFalse;
15615 
15616     ajDebug("seqReadGff3 first line '%S'\n", seqReadLine);
15617 
15618     ajStrRemoveWhiteExcess(&seqReadLine);
15619 
15620     if(!ajStrMatchC(seqReadLine, "##gff-version 3"))
15621     {
15622         ajDebug("bad gff3 version line '%S'\n", seqReadLine);
15623         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15624 
15625         return ajFalse;
15626     }
15627 
15628     ajStrAssignS(&verstr, seqReadLine);
15629 
15630     if(seqin->Input->Text)
15631         ajStrAssignS(&thys->TextPtr,seqReadLine);
15632 
15633     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15634 
15635     while(ok && ajStrPrefixC(seqReadLine, "#"))
15636     {
15637         if(ajStrPrefixC(seqReadLine, "##sequence-region"))
15638         {
15639             ajStrTokenAssignC(&seqHandle, seqReadLine, " \t");
15640             ajStrTokenStep(seqHandle);
15641             ajStrTokenNextParse(seqHandle, &thys->Name);
15642             ajStrTokenReset(seqHandle);
15643         }
15644         else if(ajStrPrefixC(seqReadLine, "##feature-ontology"))
15645         {
15646         }
15647         else if(ajStrPrefixC(seqReadLine, "##attribute-ontology"))
15648         {
15649         }
15650         else if(ajStrPrefixC(seqReadLine, "##source-ontology"))
15651         {
15652         }
15653         else if(ajStrPrefixC(seqReadLine, "###"))
15654         {
15655         }
15656         else if(ajStrPrefixC(seqReadLine, "##FASTA"))
15657         {
15658             break;
15659         }
15660         else if(ajStrPrefixC(seqReadLine, "##"))
15661         {
15662             ajDebug("GFF3: Unrecognized header directive '%S'\n",
15663                     seqReadLine);
15664         }
15665 
15666         if(ajRegExec(seqRegGff3Typ, seqReadLine))
15667         {
15668             ajRegSubI(seqRegGff3Typ, 1, &typstr);
15669             ajFmtPrintS(&outstr, "#!Type %S", typstr);
15670         }
15671 
15672         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15673     }
15674 
15675     /* do we want the features now? */
15676 
15677     if(ok & seqinUfoLocal(seqin))
15678     {
15679         dofeat = ajTrue;
15680 
15681         ftfile = ajFilebuffNewNofile();
15682         ajFilebuffLoadS(ftfile, verstr);
15683         ajFilebuffLoadS(ftfile, outstr);
15684     }
15685 
15686     while(ok)
15687     {
15688         if(ajStrPrefixC(seqReadLine, "##"))
15689         {
15690             if(ajStrPrefixCaseC(seqReadLine, "##FASTA"))
15691             {
15692                 break;
15693             }
15694             else if(ajStrPrefixC(seqReadLine, "##gff-version "))
15695             {
15696                 return ajFalse;break;
15697             }
15698         }
15699 
15700         if(dofeat)
15701             ajFilebuffLoadS(ftfile, seqReadLine);
15702         else if(!ajStrGetLen(thys->Name))
15703         {
15704             if(ajStrExtractFirst(seqReadLine, &rest, &seqToken))
15705                 ajStrAssignS(&thys->Name, seqToken);
15706         }
15707 
15708         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15709     }
15710 
15711     if(!ajStrPrefixCaseC(seqReadLine, "##FASTA")) /* no sequence at end */
15712     {
15713         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15714 
15715         ajDebug("No GFF3 ##FASTA line\n");
15716         return ajFalse;
15717     }
15718 
15719     if(dofeat)
15720     {
15721         ajFeattabinDel(&seqin->Ftquery);
15722         seqin->Ftquery = ajFeattabinNewSeqinSSF(seqin, seqFtFmtGff,
15723                                                 thys->Name,
15724                                                 ajStrGetPtr(seqin->Type),
15725                                                 ftfile);
15726         ajDebug("GFF3 FEAT TabIn %x\n", seqin->Ftquery);
15727         ftfile = NULL;
15728         ajFeattableDel(&seqin->Fttable);
15729         ajFeattableDel(&thys->Fttable);
15730         thys->Fttable = ajFeattableNewRead(seqin->Ftquery);
15731         if(thys->Fttable)
15732             ajFeattableSetLength(thys->Fttable, ajStrGetLen(thys->Seq));
15733         if(ajFeattableIsCircular(thys->Fttable))
15734             ajSeqSetCircular(thys);
15735     }
15736 
15737     ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15738     wantseq = ajFalse;
15739 
15740     while(ok && !ajStrPrefixC(seqReadLine, "##"))
15741     {
15742         while(ok && ajStrPrefixC(seqReadLine, ">"))
15743         {
15744             ajStrCutStart(&seqReadLine, 1);
15745             if(wantseq)
15746             {
15747                 wantseq = ajFalse;
15748             }
15749             else
15750             {
15751                 ajStrExtractFirst(seqReadLine, &rest, &seqToken);
15752 
15753                 if(dofeat)
15754                 {
15755                     if(ajStrMatchS(seqToken, ajFeattableGetName(thys->Fttable)))
15756                     {
15757                         wantseq = ajTrue;
15758                         ajStrAssignS(&thys->Name, seqToken);
15759                     }
15760                 }
15761                 else
15762                 {
15763                     if(ajStrMatchS(seqToken, thys->Name))
15764                     {
15765                         wantseq = ajTrue;
15766                     }
15767                 }
15768 
15769                 if(wantseq)
15770                 {
15771                     ajStrRemoveWhiteExcess(&rest);
15772                     ajStrAssignS(&thys->Desc, rest);
15773                 }
15774             }
15775             ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15776         }
15777 
15778         if(wantseq)
15779             seqAppend(&thys->Seq, seqReadLine);
15780 
15781         ok = ajTextinStoreReadline(seqin->Input, &seqReadLine, &thys->TextPtr);
15782     }
15783 
15784     if(!ajSeqGetLen(thys))
15785     {
15786         ajTextinStoreReset(seqin->Input, &thys->TextPtr);
15787 
15788         ajDebug("No sequence data\n");
15789         return ajFalse;
15790     }
15791 
15792     if(ajStrMatchC(typstr, "Protein"))
15793         ajSeqSetProt(thys);
15794     else if(ajSeqIsNuc(thys))
15795         ajSeqSetNuc(thys);
15796     else
15797         ajSeqSetProt(thys);
15798 
15799     ajFilebuffClear(buff, 0);
15800 
15801     ajStrDel(&typstr);
15802     ajStrDel(&verstr);
15803     ajStrDel(&outstr);
15804     ajStrDelStatic(&seqToken);
15805     ajStrDel(&rest);
15806 
15807     return ajTrue;
15808 }
15809 
15810 
15811 
15812 
15813 /* @funcstatic seqReadAbi *****************************************************
15814 **
15815 ** Given data in a sequence structure, tries to read everything needed
15816 ** using ABI format.
15817 **
15818 ** @param [w] thys [AjPSeq] Sequence object
15819 ** @param [u] seqin [AjPSeqin] Sequence input object
15820 ** @return [AjBool] ajTrue on success
15821 **
15822 ** @release 1.8.0
15823 ** @@
15824 ******************************************************************************/
15825 
seqReadAbi(AjPSeq thys,AjPSeqin seqin)15826 static AjBool seqReadAbi(AjPSeq thys, AjPSeqin seqin)
15827 {
15828     AjPFilebuff buff;
15829     AjBool  ok      = ajFalse;
15830     ajlong baseO    = 0L;
15831     ajlong pconO    = 0L;
15832     ajlong numBases = 0L;
15833     AjPStr sample   = NULL;
15834     AjPStr smpl     = NULL;
15835     AjPFile fp;
15836     ajint filestat;
15837 
15838     buff = seqin->Input->Filebuff;
15839     fp = ajFilebuffGetFile(buff);
15840 
15841     ajDebug("seqReadAbi file %F\n", fp);
15842 
15843     /* ajFilebuffTraceFull(buff, 10, 10); */
15844 
15845     if(ajFilebuffIsEnded(buff))
15846         return ajFalse;
15847 
15848     if(!ajSeqABITest(fp))
15849     {
15850         ajDebug("seqReadAbi ajSeqABITest failed on %F\n", fp);
15851         ajFilebuffResetPos(buff);
15852 
15853         return ajFalse;
15854     }
15855 
15856     if(seqin->Input->Text)
15857         ajWarn("Failed to read text from binary ABI file %F", fp);
15858 
15859     filestat = ajFileSeek(fp,0L,0);
15860     ajDebug("filestat %d\n", filestat);
15861 
15862     numBases = ajSeqABIGetNBase(fp);
15863 
15864     ok = ajFalse;
15865 
15866     /* Find BASE tag & get offset                    */
15867     baseO = ajSeqABIGetBaseOffset(fp);
15868     /* Read in sequence         */
15869     if(baseO)
15870         ok = ajSeqABIReadSeq(fp,baseO,numBases,&thys->Seq);
15871 
15872     if(!ok)
15873     {
15874         ajFileSeek(fp,filestat,0);
15875         ajFilebuffResetPos(buff);
15876 
15877         return ajFalse;
15878     }
15879 
15880     ok = ajFalse;
15881 
15882     pconO = ajSeqABIGetConfidOffset(fp);
15883     if(numBases > (ajlong) thys->Qualsize)
15884     {
15885         AJCRESIZE(thys->Accuracy, (size_t) numBases);
15886         thys->Qualsize = (ajuint) numBases; /* Possibly lossy */
15887     }
15888     if(pconO)
15889         ok = ajSeqABIReadConfid(fp, pconO, numBases, thys->Accuracy);
15890 
15891     sample = ajStrNew();
15892     ajSeqABISampleName(fp, &sample);
15893 
15894     /* replace dots in the sample name with underscore */
15895     if(!seqRegAbiDots)
15896         seqRegAbiDots = ajRegCompC("^(.*)[.](.*)$");
15897 
15898     smpl = ajStrNew();
15899 
15900     while(ajRegExec(seqRegAbiDots,sample))
15901     {
15902         ajStrSetClear(&sample);
15903         ajRegSubI(seqRegAbiDots,1,&smpl);
15904         ajStrAppendC(&smpl,"_");
15905         ajStrAppendS(&sample,smpl);
15906         ajRegSubI(seqRegAbiDots,2,&smpl);
15907         ajStrAppendS(&sample,smpl);
15908     }
15909 
15910     ajStrAssignS(&thys->Name,sample);
15911     ajFilenameTrimAll(&thys->Name);
15912 
15913     ajDebug("seqReadAbi name '%S' sample '%S'\n", thys->Name, sample);
15914 
15915     ajSeqSetNuc(thys);
15916 
15917     ajFilebuffClear(buff, -1);
15918     buff->File->End=ajTrue;
15919 
15920     ajStrDel(&smpl);
15921     ajStrDel(&sample);
15922 
15923     return ajTrue;
15924 }
15925 
15926 
15927 
15928 
15929 /* @funcstatic seqReadEnsembl *************************************************
15930 **
15931 ** Given data in a sequence structure, tries to read everything needed
15932 ** using Ensembl SQL access.
15933 **
15934 ** @param [w] thys [AjPSeq] Sequence object
15935 ** @param [u] seqin [AjPSeqin] Sequence input object
15936 ** @return [AjBool] ajTrue on success
15937 **
15938 ** @release 6.3.0
15939 ** @@
15940 ******************************************************************************/
15941 
seqReadEnsembl(AjPSeq thys,AjPSeqin seqin)15942 static AjBool seqReadEnsembl(AjPSeq thys, AjPSeqin seqin)
15943 {
15944     AjBool debug = AJFALSE;
15945 
15946     AjPSeqAccess seqaccess = NULL;
15947 
15948     debug = ajDebugTest("seqReadEnsembl");
15949 
15950     if(debug)
15951         ajDebug("seqReadEnsembl\n"
15952                 "  thys %p\n"
15953                 "  seqin %p\n",
15954                 thys,
15955                 seqin);
15956 
15957     /*
15958     ** Use the SeqData member of the AJAX Sequence Input structure
15959     ** to pass the AJAX Sequence object between the AJAX Sequence Reading
15960     ** (seqReadEnsembl) and AJAX Sequence Database (seqAccessEnsembl) modules.
15961     */
15962 
15963     seqin->SeqData = (void*) thys;
15964 
15965     seqaccess = seqin->Input->Query->Access;
15966 
15967     if(((*seqaccess->Access)(seqin)) == ajFalse)
15968         return ajFalse;
15969 
15970     return ajTrue;
15971 }
15972 
15973 
15974 
15975 
15976 /* @funcstatic seqPrefixGenbank ***********************************************
15977 **
15978 ** Returns an enumerated prefix for a record in genbank format
15979 **
15980 ** @param [r] str [const AjPStr] Input record
15981 **
15982 ** @return [SeqEPrefixGenbank] Enumerated record prefix
15983 ** @@
15984 ******************************************************************************/
15985 
seqPrefixGenbank(const AjPStr str)15986 static SeqEPrefixGenbank seqPrefixGenbank(const AjPStr str)
15987 {
15988     SeqEPrefixGenbank ipref = GB_UNK;
15989     const char* cp = MAJSTRGETPTR(str);
15990     const char* cq = (cp+1);
15991 
15992     switch (*cp)
15993     {
15994         case 'A':
15995             if(*cq == 'C' && !strncmp(cp, "ACCESSION",9)) ipref = GB_AC;
15996             break;
15997         case 'B':
15998             if(*cq == 'A' && !strncmp(cp, "BASE COUNT",10)) ipref = GB_BASE;
15999             break;
16000         case 'C':
16001             if(*cq == 'O' && !strncmp(cp, "COMMENT",7)) ipref = GB_CC;
16002             break;
16003         case 'D':
16004             if(*cq == 'E' && !strncmp(cp, "DEFINITION",10)) ipref = GB_DEF;
16005             break;
16006         case 'F':
16007             if(*cq == 'E' && !strncmp(cp, "FEATURES",8)) ipref = GB_FEAT;
16008             break;
16009         case 'K':
16010             if(*cq == 'E' && !strncmp(cp, "KEYWORDS",8)) ipref = GB_KEY;
16011             break;
16012         case 'L':
16013             if(*cq == 'O' && !strncmp(cp, "LOCUS",5)) ipref = GB_ID;
16014             break;
16015         case 'O':
16016             if(*cq == 'R' && !strncmp(cp, "ORIGIN",6)) ipref = GB_ORI;
16017             break;
16018         case 'R':
16019             if(*cq == 'E' && !strncmp(cp, "REFERENCE",9)) ipref = GB_REF;
16020             break;
16021         case 'S':
16022             if(*cq == 'E' && !strncmp(cp, "SEQVERSION",10)) ipref = GB_VER;
16023             if(*cq == 'O' && !strncmp(cp, "SOURCE",6)) ipref = GB_SRC;
16024             break;
16025         case 'W':
16026             if(*cq == 'P' && !strncmp(cp, "WPCOMMENT",9)) ipref = GB_WP;
16027             break;
16028         case '/':
16029             if(*cq == '/' && !strncmp(cp, "//",2)) ipref = GB_END;
16030             break;
16031         case ' ':
16032             if(*cq == ' ' && !strncmp(cp, "  ",2)) ipref = GB_MORE;
16033             break;
16034         default:
16035             ipref = GB_UNK;
16036             break;
16037     }
16038 
16039     return ipref;
16040 }
16041 
16042 
16043 
16044 
16045 /* @funcstatic seqPrefixGenbankMore *******************************************
16046 **
16047 ** Returns an enumerated prefix for a subrecord in genbank format
16048 **
16049 ** @param [r] str [const AjPStr] Input record
16050 **
16051 ** @return [SeqEPrefixGenbankMore] Enumerated record prefix
16052 ** @@
16053 ******************************************************************************/
16054 
seqPrefixGenbankMore(const AjPStr str)16055 static SeqEPrefixGenbankMore seqPrefixGenbankMore(const AjPStr str)
16056 {
16057     SeqEPrefixGenbankMore imore = GB_MORE_UNK;
16058     const char* cp = MAJSTRGETPTR(str);
16059     const char* cq = (cp+1);
16060     const char* cr = (cp+1);
16061 
16062     if(*cp != ' ' || *cq != ' ')
16063         return GB_MORE_STD;
16064 
16065     switch (*cr)
16066     {
16067         case 'A':
16068             if(!strncmp(cr, "AUTHORS",7)) imore = GB_MORE_AUT;
16069             break;
16070         case 'B':
16071             break;
16072         case 'C':
16073             break;
16074         case 'D':
16075             break;
16076         case 'F':
16077             break;
16078         case 'J':
16079             if(!strncmp(cr, "JOURNAL",7)) imore = GB_MORE_JNL;
16080             break;
16081         case 'O':
16082             if(!strncmp(cr, "ORGANISM",8)) imore = GB_MORE_ORG;
16083             break;
16084         case 'T':
16085             break;
16086             if(!strncmp(cr, "TITLE",5)) imore = GB_MORE_TIT;
16087         case 'W':
16088             break;
16089         case '/':
16090             break;
16091         case ' ':
16092             if(!strncmp(cr, "        ",8)) imore = GB_MORE_MORE;
16093             break;
16094         default:
16095             imore = GB_MORE_UNK;
16096             break;
16097     }
16098 
16099     return imore;
16100 }
16101 
16102 
16103 
16104 
16105 /* @funcstatic seqPrefixSwiss *************************************************
16106 **
16107 ** Returns an enumerated prefix for a record in swissprot format
16108 **
16109 ** @param [r] str [const AjPStr] Input record
16110 **
16111 ** @return [SeqEPrefixSwiss] Enumerated record prefix
16112 ** @@
16113 ******************************************************************************/
16114 
seqPrefixSwiss(const AjPStr str)16115 static SeqEPrefixSwiss seqPrefixSwiss(const AjPStr str)
16116 {
16117     SeqEPrefixSwiss ipref = SWISS_UNK;
16118     const char* cp = MAJSTRGETPTR(str);
16119     const char* cq = (cp+1);
16120 
16121     switch (*cp)
16122     {
16123         case 'A':
16124             switch(*cq)
16125             {
16126                 case 'C':
16127                     ipref = SWISS_AC;
16128                     break;
16129                 case 'H':
16130                     ipref = SWISS_FH; /* Align header ignored with FH */
16131                     break;
16132                 case 'S':
16133                     ipref = SWISS_AS;
16134                     break;
16135                 case 'V':
16136                     ipref = SWISS_AV; /* staden experiment */
16137                     break;
16138             }
16139             break;
16140         case 'C':
16141             switch(*cq)
16142             {
16143                 case 'C':
16144                     ipref = SWISS_CC;
16145                     break;
16146                 case 'O':
16147                     ipref = SWISS_CO;
16148                     break;
16149             }
16150             break;
16151         case 'D':
16152             switch(*cq)
16153             {
16154                 case 'E':
16155                     ipref = SWISS_DE;
16156                     break;
16157                 case 'R':
16158                     ipref = SWISS_DR;
16159                     break;
16160                 case 'T':
16161                     ipref = SWISS_DT;
16162                     break;
16163             }
16164             break;
16165         case 'E':
16166             switch(*cq)
16167             {
16168                 case 'N':
16169                 case 'X':
16170                     ipref = SWISS_EX;
16171                     break;
16172             }
16173             break;
16174         case 'F':
16175             switch(*cq)
16176             {
16177                 case 'H':
16178                     ipref = SWISS_FH;
16179                     break;
16180                 case 'T':
16181                     ipref = SWISS_FT;
16182                     break;
16183             }
16184             break;
16185         case 'G':
16186             if(*cq == 'N') ipref = SWISS_GN;
16187             break;
16188         case 'I':
16189             switch(*cq)
16190             {
16191                 case 'D':
16192                     ipref = SWISS_ID;
16193                     break;
16194                 case 'V':
16195                     ipref = SWISS_SV; /* EMBLCDS Sv equivalent */
16196                     break;
16197             }
16198             break;
16199         case 'K':
16200             if(*cq == 'W') ipref = SWISS_KW;
16201             break;
16202         case 'O':
16203             switch (*cq)
16204             {
16205                 case 'C':
16206                     ipref = SWISS_OC;
16207                     break;
16208                 case 'G':
16209                     ipref = SWISS_OG;
16210                     break;
16211                 case 'H':
16212                     ipref = SWISS_OH;
16213                     break;
16214                 case 'S':
16215                     ipref = SWISS_OS;
16216                     break;
16217                 case 'X':
16218                     ipref = SWISS_OX;
16219                     break;
16220             }
16221             break;
16222         case 'P':
16223             switch(*cq)
16224             {
16225                 case 'A':
16226                     ipref = SWISS_AC; /* PA records in EMBLCDS */
16227                     break;
16228                 case 'E':
16229                     ipref = SWISS_PE;
16230                     break;
16231             }
16232             break;
16233         case 'R':
16234             switch(*cq)
16235             {
16236                 case 'A':
16237                     ipref = SWISS_RA;
16238                     break;
16239                 case 'C':
16240                     ipref = SWISS_RC;
16241                     break;
16242                 case 'G':
16243                     ipref = SWISS_RG;
16244                     break;
16245                 case 'L':
16246                     ipref = SWISS_RL;
16247                     break;
16248                 case 'N':
16249                     ipref = SWISS_RN;
16250                     break;
16251                 case 'P':
16252                     ipref = SWISS_RP;
16253                     break;
16254                 case 'T':
16255                     ipref = SWISS_RT;
16256                     break;
16257                 case 'X':
16258                     ipref = SWISS_RX;
16259                     break;
16260             }
16261             break;
16262         case 'S':
16263             switch(*cq)
16264             {
16265                 case 'Q':
16266                     ipref = SWISS_SQ;
16267                     break;
16268                 case 'V':
16269                     ipref = SWISS_SV;
16270                     break;
16271             }
16272             break;
16273         case 'T':
16274             if(*cq == 'N') ipref = SWISS_EX;
16275             break;
16276         case 'W':
16277             if(*cq == 'P') ipref = SWISS_WP;
16278             break;
16279         case 'X':
16280             if(*cq == 'X') ipref = SWISS_XX;
16281             break;
16282         case '/':
16283             if(*cq == '/') ipref = SWISS_END;
16284             break;
16285         case ' ':
16286             if(*cq == ' ') ipref = SWISS_MORE;
16287             break;
16288         default:
16289             ipref = SWISS_UNK;
16290             break;
16291     }
16292 
16293     return ipref;
16294 }
16295 
16296 
16297 
16298 
16299 /* @funcstatic seqDesSwiss ****************************************************
16300 **
16301 ** Returns an enumerated code for a description record token
16302 **
16303 ** @param [r] str [const AjPStr] Input record
16304 **
16305 ** @return [SeqEDesSwiss] Enumerated record prefix
16306 ** @@
16307 ******************************************************************************/
16308 
seqDesSwiss(const AjPStr str)16309 static SeqEDesSwiss seqDesSwiss(const AjPStr str)
16310 {
16311     SeqEDesSwiss ides = SWISS_DES_UNK;
16312     const char* cp = MAJSTRGETPTR(str);
16313 
16314     switch (*cp)
16315     {
16316         case 'A':
16317             if(!strcmp(cp, "AltName:")) ides = SWISS_DES_ALT;
16318             break;
16319         case 'C':
16320             if(!strcmp(cp, "Contains:")) ides = SWISS_DES_CONT;
16321             break;
16322         case 'F':
16323             if(!strcmp(cp, "Flags:")) ides = SWISS_DES_FLG;
16324             break;
16325         case 'I':
16326             if(!strcmp(cp, "Includes:")) ides = SWISS_DES_INC;
16327             break;
16328         case 'R':
16329             if(!strcmp(cp, "RecName:")) ides = SWISS_DES_REC;
16330             break;
16331         case 'S':
16332             if(!strcmp(cp, "SubName:")) ides = SWISS_DES_SUB;
16333             break;
16334         default:
16335             ides = SWISS_DES_UNK;
16336             break;
16337     }
16338 
16339     return ides;
16340 }
16341 
16342 
16343 
16344 
16345 /* @funcstatic seqDessubSwiss *************************************************
16346 **
16347 ** Returns an enumerated subcode for a description record token
16348 **
16349 ** @param [u] Pstr [AjPStr*] Input record
16350 **
16351 ** @return [SeqESubSwiss] Enumerated record prefix
16352 ** @@
16353 ******************************************************************************/
16354 
seqDessubSwiss(AjPStr * Pstr)16355 static SeqESubSwiss seqDessubSwiss(AjPStr *Pstr)
16356 {
16357     SeqESubSwiss isub = SWISS_SUB_UNK;
16358     const char* cp = MAJSTRGETPTR(*Pstr);
16359 
16360     switch (*cp)
16361     {
16362         case 'A':
16363             if(!strncmp(cp, "Allergen=", 9))
16364             {
16365                 isub = SWISS_SUB_ALLER;
16366                 ajStrCutStart(Pstr, 9);
16367             }
16368             break;
16369         case 'B':
16370             if(!strncmp(cp, "Biotech=", 8))
16371             {
16372                 isub = SWISS_SUB_BIOTECH;
16373                 ajStrCutStart(Pstr, 8);
16374             }
16375             break;
16376         case 'C':
16377             if(!strncmp(cp, "CD_antigen=", 11))
16378             {
16379                 isub = SWISS_SUB_CDA;
16380                 ajStrCutStart(Pstr, 11);
16381             }
16382             break;
16383         case 'E':
16384             if(!strncmp(cp, "EC=", 3))
16385             {
16386                 isub = SWISS_SUB_EC;
16387                 ajStrCutStart(Pstr, 3);
16388             }
16389             break;
16390         case 'F':
16391             if(!strncmp(cp, "Full=", 5))
16392             {
16393                 isub = SWISS_SUB_FULL;
16394                 ajStrCutStart(Pstr, 5);
16395             }
16396             break;
16397         case 'I':
16398             if(!strncmp(cp, "INN=", 4))
16399             {
16400                 isub = SWISS_SUB_INN;
16401                 ajStrCutStart(Pstr, 4);
16402             }
16403             break;
16404         case 'S':
16405             if(!strncmp(cp, "Short=", 6))
16406             {
16407                 isub = SWISS_SUB_SHORT;
16408                 ajStrCutStart(Pstr, 6);
16409             }
16410             break;
16411         default:
16412             isub = SWISS_DES_UNK;
16413             break;
16414     }
16415 
16416     return isub;
16417 }
16418 
16419 
16420 
16421 
16422 /* @func ajSeqPrintInFormat ***************************************************
16423 **
16424 ** Reports the internal data structures
16425 **
16426 ** @param [u] outf [AjPFile] Output file
16427 ** @param [r] full [AjBool] Full report (usually ajFalse)
16428 ** @return [void]
16429 **
16430 ** @release 1.0.0
16431 ** @@
16432 ******************************************************************************/
16433 
ajSeqPrintInFormat(AjPFile outf,AjBool full)16434 void ajSeqPrintInFormat(AjPFile outf, AjBool full)
16435 {
16436     ajuint i = 0;
16437 
16438     ajFmtPrintF(outf, "\n");
16439     ajFmtPrintF(outf, "# Sequence input formats\n");
16440     ajFmtPrintF(outf, "# Name  Format name (or alias)\n");
16441     ajFmtPrintF(outf, "# Alias Alias name\n");
16442     ajFmtPrintF(outf, "# Try   Test for unknown input files\n");
16443     ajFmtPrintF(outf, "# Nuc   Can read nucleotide input\n");
16444     ajFmtPrintF(outf, "# Pro   Can read protein input\n");
16445     ajFmtPrintF(outf, "# Feat  Can read feature annotation\n");
16446     ajFmtPrintF(outf, "# Gap   Can read gap characters\n");
16447     ajFmtPrintF(outf, "# Mset  Can read seqsetall (multiple seqsets)\n");
16448     ajFmtPrintF(outf, "# Name         Alias Try  Nuc  Pro Feat  Gap MSet "
16449                 "Description");
16450     ajFmtPrintF(outf, "\n");
16451     ajFmtPrintF(outf, "InFormat {\n");
16452 
16453     for(i=0; seqinFormatDef[i].Name; i++)
16454         if(full || !seqinFormatDef[i].Alias)
16455             ajFmtPrintF(outf,
16456                         "  %-12s %5B %3B  %3B  %3B  %3B  %3B  %3B \"%s\"\n",
16457                         seqinFormatDef[i].Name,
16458                         seqinFormatDef[i].Alias,
16459                         seqinFormatDef[i].Try,
16460                         seqinFormatDef[i].Nucleotide,
16461                         seqinFormatDef[i].Protein,
16462                         seqinFormatDef[i].Feature,
16463                         seqinFormatDef[i].Gap,
16464                         seqinFormatDef[i].Multiset,
16465                         seqinFormatDef[i].Desc);
16466 
16467     ajFmtPrintF(outf, "}\n\n");
16468 
16469     return;
16470 }
16471 
16472 
16473 
16474 
16475 /* @func ajSeqPrintbookInFormat ***********************************************
16476 **
16477 ** Reports the internal data structures as a Docbook table
16478 **
16479 ** @param [u] outf [AjPFile] Output file
16480 ** @return [void]
16481 **
16482 ** @release 6.2.0
16483 ** @@
16484 ******************************************************************************/
16485 
ajSeqPrintbookInFormat(AjPFile outf)16486 void ajSeqPrintbookInFormat(AjPFile outf)
16487 {
16488     ajuint i = 0;
16489     ajuint j = 0;
16490     AjPStr namestr = NULL;
16491     AjPList fmtlist;
16492     AjPStr* names;
16493 
16494     fmtlist = ajListstrNew();
16495 
16496     ajFmtPrintF(outf, "<para>The supported sequence formats are summarised "
16497                 "in the table below. "
16498                 "The columns are as follows: "
16499                 "<emphasis>Input format</emphasis> (format name), "
16500                 "<emphasis>Output format</emphasis> (format name), "
16501                 "<emphasis>Sngl</emphasis> "
16502                 "(indicates whether each sequence is written to a new file. "
16503                 "This behaviour is the default and can be set by the "
16504                 "<option>-ossingle</option> command line qualifier.  "
16505                 "<emphasis>Save</emphasis> (indicates that sequence data is "
16506                 "stored internally and written when the output is closed. "
16507                 "This is needed for 'interleaved' formats such as Phylip "
16508                 "and MSF), <emphasis>Try</emphasis> (indicates whether the "
16509                 "format can be detected automatically on input), "
16510                 "<emphasis>Nuc</emphasis> (\"true\" indicates nucleotide "
16511                 "sequence data may be represented), <emphasis>Pro</emphasis> "
16512                 "(\"true\" indicates protein sequence data may be represented, "
16513                 "<emphasis>Feat</emphasis> (whether the format includes "
16514                 "feature annotation data. "
16515                 "EMBOSS can also read feature data from a separate "
16516                 "feature file).  "
16517                 "<emphasis>Gap</emphasis> (whether the format supports "
16518                 "sequence data with gap characters, for example the results "
16519                 "of an alignment), "
16520                 "<emphasis>Mset</emphasis> (\"true\" indicates that more "
16521                 "than one set of sequences can be stored in a single file. "
16522                 "This is used by, for example, phylogenetic analysis "
16523                 "applications to store many versions of a multiple alignment "
16524                 "for statistical analysis) and "
16525                 "<emphasis>Description</emphasis> (short description of "
16526                 "the format).</para>\n\n");
16527 
16528     ajFmtPrintF(outf, "<table frame=\"box\" rules=\"cols\">\n");
16529     ajFmtPrintF(outf, "  <caption>Input sequence formats</caption>\n");
16530     ajFmtPrintF(outf, "  <thead>\n");
16531     ajFmtPrintF(outf, "    <tr align=\"center\">\n");
16532     ajFmtPrintF(outf, "      <th>Input Format</th>\n");
16533     ajFmtPrintF(outf, "      <th>Try</th>\n");
16534     ajFmtPrintF(outf, "      <th>Nuc</th>\n");
16535     ajFmtPrintF(outf, "      <th>Pro</th>\n");
16536     ajFmtPrintF(outf, "      <th>Feat</th>\n");
16537     ajFmtPrintF(outf, "      <th>Gap</th>\n");
16538     ajFmtPrintF(outf, "      <th>Mset</th>\n");
16539     ajFmtPrintF(outf, "      <th>Description</th>\n");
16540     ajFmtPrintF(outf, "    </tr>\n");
16541     ajFmtPrintF(outf, "  </thead>\n");
16542     ajFmtPrintF(outf, "  <tbody>\n");
16543 
16544     for(i=1; seqinFormatDef[i].Name; i++)
16545     {
16546         if(!seqinFormatDef[i].Alias)
16547         {
16548             namestr = ajStrNewC(seqinFormatDef[i].Name);
16549             ajListPush(fmtlist, namestr);
16550             namestr = NULL;
16551         }
16552     }
16553 
16554     ajListSort(fmtlist, &ajStrVcmp);
16555     ajListstrToarray(fmtlist, &names);
16556 
16557     for(i=0; names[i]; i++)
16558     {
16559         for(j=0; seqinFormatDef[j].Name; j++)
16560         {
16561             if(ajStrMatchC(names[i],seqinFormatDef[j].Name))
16562             {
16563                 ajFmtPrintF(outf, "    <tr>\n");
16564                 ajFmtPrintF(outf, "      <td>%s</td>\n",
16565                             seqinFormatDef[j].Name);
16566                 ajFmtPrintF(outf, "      <td>%B</td>\n",
16567                             seqinFormatDef[j].Try);
16568                 ajFmtPrintF(outf, "      <td>%B</td>\n",
16569                             seqinFormatDef[j].Nucleotide);
16570                 ajFmtPrintF(outf, "      <td>%B</td>\n",
16571                             seqinFormatDef[j].Protein);
16572                 ajFmtPrintF(outf, "      <td>%B</td>\n",
16573                             seqinFormatDef[j].Feature);
16574                 ajFmtPrintF(outf, "      <td>%B</td>\n",
16575                             seqinFormatDef[j].Gap);
16576                 ajFmtPrintF(outf, "      <td>%B</td>\n",
16577                             seqinFormatDef[j].Multiset);
16578                 ajFmtPrintF(outf, "      <td>%s</td>\n",
16579                             seqinFormatDef[j].Desc);
16580                 ajFmtPrintF(outf, "    </tr>\n");
16581             }
16582         }
16583     }
16584 
16585 
16586     ajFmtPrintF(outf, "  </tbody>\n");
16587     ajFmtPrintF(outf, "</table>\n");
16588     ajStrDel(&namestr);
16589 
16590     names = NULL;
16591     ajListstrFreeData(&fmtlist);
16592 
16593     return;
16594 }
16595 
16596 
16597 
16598 
16599 /* @func ajSeqPrinthtmlInFormat ***********************************************
16600 **
16601 ** Reports the internal data structures as an HTML table
16602 **
16603 ** @param [u] outf [AjPFile] Output file
16604 ** @return [void]
16605 **
16606 ** @release 6.2.0
16607 ** @@
16608 ******************************************************************************/
16609 
ajSeqPrinthtmlInFormat(AjPFile outf)16610 void ajSeqPrinthtmlInFormat(AjPFile outf)
16611 {
16612     ajuint i = 0;
16613     ajuint j = 0;
16614 
16615     AjPStr namestr = NULL;
16616 
16617     ajFmtPrintF(outf, "<table border=3>");
16618     ajFmtPrintF(outf, "<tr><th>Input Format</th><th>Auto</th>\n");
16619     ajFmtPrintF(outf, "<th>Nuc</th><th>Pro</th><th>Feat</th><th>Gap</th>\n");
16620     ajFmtPrintF(outf, "<th>Multi</th><th>Description</th></tr>\n");
16621 
16622     for(i=1; seqinFormatDef[i].Name; i++)
16623     {
16624         ajStrAssignC(&namestr, seqinFormatDef[i].Name);
16625 
16626         if(!seqinFormatDef[i].Alias)
16627         {
16628             for(j=i+1; seqinFormatDef[j].Name; j++)
16629             {
16630                 if(seqinFormatDef[j].Read == seqinFormatDef[i].Read)
16631                 {
16632                     ajFmtPrintAppS(&namestr, " %s", seqinFormatDef[j].Name);
16633                     if(!seqinFormatDef[j].Alias)
16634                     {
16635                         ajWarn("Input format '%s' same as '%s' but not alias",
16636                                seqinFormatDef[j].Name, seqinFormatDef[i].Name);
16637                     }
16638                 }
16639             }
16640 
16641             ajFmtPrintF(outf, "<tr><td>\n%S\n</td><td>%B</td>\n",
16642                         namestr,
16643                         seqinFormatDef[i].Try);
16644             ajFmtPrintF(outf, "<td>%B</td><td>%B</td><td>%B</td><td>%B</td>\n",
16645                         seqinFormatDef[i].Nucleotide,
16646                         seqinFormatDef[i].Protein,
16647                         seqinFormatDef[i].Feature,
16648                         seqinFormatDef[i].Gap);
16649             ajFmtPrintF(outf, "<td>%B</td><td>\n%s\n</td></tr>\n",
16650                         seqinFormatDef[i].Multiset,
16651                         seqinFormatDef[i].Desc);
16652         }
16653 
16654     }
16655 
16656     ajFmtPrintF(outf, "</table>\n");
16657     ajStrDel(&namestr);
16658 
16659     return;
16660 }
16661 
16662 
16663 
16664 
16665 /* @func ajSeqPrintwikiInFormat ***********************************************
16666 **
16667 ** Reports the internal data structures as a wiki table
16668 **
16669 ** @param [u] outf [AjPFile] Output file
16670 ** @return [void]
16671 **
16672 ** @release 6.2.0
16673 ** @@
16674 ******************************************************************************/
16675 
ajSeqPrintwikiInFormat(AjPFile outf)16676 void ajSeqPrintwikiInFormat(AjPFile outf)
16677 {
16678     ajuint i = 0;
16679     ajuint j = 0;
16680 
16681     AjPStr namestr = NULL;
16682 
16683     ajFmtPrintF(outf, "{| class=\"wikitable sortable\" border=\"2\"\n");
16684     ajFmtPrintF(outf, "|-\n");
16685     ajFmtPrintF(outf, "!Format!!Try!!Nuc!!Pro!!Feat!!Gap!!MSet!!"
16686                 "class=\"unsortable\"|Description\n");
16687 
16688     for(i=1; seqinFormatDef[i].Name; i++)
16689     {
16690         ajStrAssignC(&namestr, seqinFormatDef[i].Name);
16691 
16692         if(!seqinFormatDef[i].Alias)
16693         {
16694             for(j=i+1; seqinFormatDef[j].Name; j++)
16695             {
16696                 if(seqinFormatDef[j].Read == seqinFormatDef[i].Read)
16697                 {
16698                     ajFmtPrintAppS(&namestr, "<br>%s", seqinFormatDef[j].Name);
16699                     if(!seqinFormatDef[j].Alias)
16700                     {
16701                         ajWarn("Input format '%s' same as '%s' but not alias",
16702                                seqinFormatDef[j].Name, seqinFormatDef[i].Name);
16703                     }
16704                 }
16705             }
16706 
16707             ajFmtPrintF(outf, "|-\n");
16708             ajFmtPrintF(outf,
16709                         "|%S||%B||%B||%B||%B||%B||%B||%s\n",
16710                         namestr,
16711                         seqinFormatDef[i].Try,
16712                         seqinFormatDef[i].Nucleotide,
16713                         seqinFormatDef[i].Protein,
16714                         seqinFormatDef[i].Feature,
16715                         seqinFormatDef[i].Gap,
16716                         seqinFormatDef[i].Multiset,
16717                         seqinFormatDef[i].Desc);
16718         }
16719 
16720     }
16721 
16722     ajFmtPrintF(outf, "|}\n\n");
16723     ajStrDel(&namestr);
16724 
16725     return;
16726 }
16727 
16728 
16729 
16730 
16731 /* @funcstatic seqinFormatFind ************************************************
16732 **
16733 ** Looks for the specified format(s) in the internal definitions and
16734 ** returns the index.
16735 **
16736 ** Sets iformat as the recognised format, and returns ajTrue.
16737 **
16738 ** @param [r] format [const AjPStr] Format required.
16739 ** @param [w] iformat [ajint*] Index
16740 ** @return [AjBool] ajTrue on success.
16741 **
16742 ** @release 6.4.0
16743 ** @@
16744 ******************************************************************************/
16745 
seqinFormatFind(const AjPStr format,ajint * iformat)16746 static AjBool seqinFormatFind(const AjPStr format, ajint* iformat)
16747 {
16748     AjPStr tmpformat = NULL;
16749     ajuint i = 0;
16750 
16751      ajDebug("seqinFormatFind '%S'\n", format);
16752     if(!ajStrGetLen(format))
16753         return ajFalse;
16754 
16755     ajStrAssignS(&tmpformat, format);
16756     ajStrFmtLower(&tmpformat);
16757 
16758     for(i=0; seqinFormatDef[i].Name; i++)
16759     {
16760         /*ajDebug("test %d '%s' '%s' '%s' \n",
16761                 i, seqinFormatDef[i].Name,
16762                 seqinFormatDef[i].Obo,
16763                 seqinFormatDef[i].Desc);*/
16764         if(ajStrMatchC(tmpformat, seqinFormatDef[i].Name) ||
16765            ajStrMatchC(format, seqinFormatDef[i].Obo))
16766         {
16767             *iformat = i;
16768             ajStrDel(&tmpformat);
16769             /*ajDebug("found '%s' at %d\n", seqinFormatDef[i].Name, i);*/
16770             return ajTrue;
16771         }
16772     }
16773 
16774     ajStrDel(&tmpformat);
16775 
16776     return ajFalse;
16777 }
16778 
16779 
16780 
16781 
16782 /* @func ajSeqFormatTest ******************************************************
16783 **
16784 ** tests whether a named format is known
16785 **
16786 ** @param [r] format [const AjPStr] Format
16787 ** @return [AjBool] ajTrue if formats was accepted
16788 **
16789 ** @release 2.7.0
16790 ** @@
16791 ******************************************************************************/
16792 
ajSeqFormatTest(const AjPStr format)16793 AjBool ajSeqFormatTest(const AjPStr format)
16794 {
16795     ajuint i;
16796 
16797     for(i=0; seqinFormatDef[i].Name; i++)
16798     {
16799         if(ajStrMatchCaseC(format, seqinFormatDef[i].Name))
16800             return ajTrue;
16801         if(ajStrMatchC(format, seqinFormatDef[i].Obo))
16802             return ajTrue;
16803     }
16804 
16805     return ajFalse;
16806 }
16807 
16808 
16809 
16810 
16811 /* @funcstatic seqSetInFormat *************************************************
16812 **
16813 ** Steps through a list of default formats, setting the Try value for
16814 ** each known format to ajTrue if it is in the list, and ajFalse
16815 ** if not.
16816 **
16817 ** @param [r] format [const AjPStr] Format list, punctuated by whitespace
16818 **                                  or commas
16819 ** @return [AjBool] ajTrue if all formats were accepted
16820 **
16821 ** @release 1.0.0
16822 ** @@
16823 ******************************************************************************/
16824 
seqSetInFormat(const AjPStr format)16825 static AjBool seqSetInFormat(const AjPStr format)
16826 {
16827     ajuint i;
16828     ajuint ifound;
16829     AjBool ret        = ajTrue;
16830 
16831     for(i=0; seqinFormatDef[i].Name; i++)
16832         seqinFormatDef[i].Try = ajFalse;
16833 
16834     ajDebug("seqSetInformat '%S'\n", format);
16835 
16836     ajStrTokenAssignC(&seqHandle, format, " \t\n\r,;:");
16837 
16838     while(ajStrTokenNextParseC(seqHandle, " \t\n\r,;:", &seqToken))
16839     {
16840         ifound = 0;
16841 
16842         for(i=0; seqinFormatDef[i].Name; i++)
16843             if(ajStrMatchCaseC(seqToken, seqinFormatDef[i].Name))
16844             {
16845                 /* ajDebug("found '%S' %d\n", fmtstr, i); */
16846                 seqinFormatDef[i].Try = ajTrue;
16847                 ifound = 1;
16848                 break;
16849             }
16850 
16851         if(!ifound)
16852         {
16853             /* ajDebug("not found '%S'\n", fmtstr); */
16854 
16855             ajErr("Input format '%S' not known", seqToken);
16856             ret = ajFalse;
16857         }
16858     }
16859 
16860     ajStrTokenReset(seqHandle);
16861 
16862     return ret;
16863 }
16864 
16865 
16866 
16867 
16868 /* @funcstatic seqAppend ******************************************************
16869 **
16870 ** Appends sequence characters in the input line to a growing sequence.
16871 ** Non sequence characters are simply ignored.
16872 **
16873 ** @param [u] pseq [AjPStr*] Sequence as a string
16874 ** @param [r] line [const AjPStr] Input line.
16875 ** @return [ajuint] Sequence length to date.
16876 **
16877 ** @release 1.0.0
16878 ** @@
16879 ******************************************************************************/
16880 
seqAppend(AjPStr * pseq,const AjPStr line)16881 static ajuint seqAppend(AjPStr* pseq, const AjPStr line)
16882 {
16883     ajuint ret = 0;
16884 
16885     ajStrAssignS(&seqAppendTmpstr, line);
16886     ajStrKeepSetAlphaC(&seqAppendTmpstr, "*.~?#+-");
16887     ajStrAppendS(pseq, seqAppendTmpstr);
16888 
16889     ret = ajStrGetLen(*pseq);
16890 
16891     ajStrDelStatic(&seqAppendTmpstr);
16892 
16893     return ret;
16894 }
16895 
16896 
16897 
16898 
16899 /* @funcstatic seqAppendK *****************************************************
16900 **
16901 ** Appends single sequence character in the input line to a growing sequence.
16902 ** Non sequence characters are simply ignored.
16903 **
16904 ** @param [u] pseq [AjPStr*] Sequence as a string
16905 ** @param [r] ch [char] Input character.
16906 ** @return [ajuint] Sequence length to date.
16907 **
16908 ** @release 6.0.0
16909 ** @@
16910 ******************************************************************************/
16911 
seqAppendK(AjPStr * pseq,char ch)16912 static ajuint seqAppendK(AjPStr* pseq, char ch)
16913 {
16914     AjPStr tmpstr = NULL;
16915     ajuint ret = 0;
16916 
16917     ajStrAssignK(&tmpstr, ch);
16918     ajStrKeepSetAlphaC(&tmpstr, "*.~?#+-");
16919     ajStrAppendS(pseq, tmpstr);
16920 
16921     ret = ajStrGetLen(*pseq);
16922     ajStrDel(&tmpstr);
16923 
16924     return ret;
16925 }
16926 
16927 
16928 
16929 
16930 /* @funcstatic seqAppendCommented *********************************************
16931 **
16932 ** Appends sequence characters in the input line to a growing sequence.
16933 ** Non sequence characters are simply ignored.
16934 **
16935 ** This version of seqAppend removes comments in the angle brackets style
16936 ** used first by Staden and then later by GCG.
16937 **
16938 ** @param [u] pseq [AjPStr*] Sequence as a string
16939 ** @param [u] incomment [AjBool*] Currently processing a comment
16940 ** @param [r] line [const AjPStr] Input line.
16941 ** @return [ajuint] Sequence length to date.
16942 **
16943 ** @release 3.0.0
16944 ** @@
16945 ******************************************************************************/
16946 
seqAppendCommented(AjPStr * pseq,AjBool * incomment,const AjPStr line)16947 static ajuint seqAppendCommented(AjPStr* pseq, AjBool* incomment,
16948                                  const AjPStr line)
16949 {
16950     AjPStr tmpstr = NULL;
16951     ajlong i;
16952     ajuint ret = 0;
16953 
16954     ajStrAssignS(&tmpstr, line);
16955     ajStrKeepSetAlphaC(&tmpstr, "*.~?#+-<>");
16956 
16957     ajDebug("seqAppendCommented %B '%S'\n", *incomment, tmpstr);
16958 
16959     while(ajStrGetLen(tmpstr))
16960     {
16961         /* if we are in a comment, look for the end of it */
16962         /* Staden comments are <comment> */
16963         /* GCG comments are <comment< or >comment> */
16964 
16965         /* there should be no case of >comment<
16966            but in a broken file we can't tell */
16967 
16968         /* so we test for both kinds of angle brackets at both ends */
16969 
16970         if(*incomment)
16971         {
16972             i = ajStrFindAnyC(tmpstr, "<>");
16973 
16974             if(i >= 0)                  /* comment ends in this line */
16975             {
16976                 ajStrCutStart(&tmpstr, (size_t) i+1);
16977                 *incomment = ajFalse;
16978             }
16979             else
16980             {
16981                 ajStrAssignClear(&tmpstr);      /* all comment */
16982             }
16983         }
16984         else
16985         {
16986             i = ajStrFindAnyC(tmpstr, "<>");
16987 
16988             if(i >= 0)                  /* comment starts in this line */
16989             {
16990                 if(i)
16991                     ajStrAppendSubS(pseq, tmpstr, 0, i-1);
16992 
16993                 ajDebug("before comment saved '%S'\n", *pseq);
16994                 ajStrCutStart(&tmpstr, (size_t) (i+1));
16995                 *incomment = ajTrue;
16996             }
16997             else
16998             {
16999                 ajStrAppendS(pseq, tmpstr);
17000                 ajDebug("all saved '%S'\n", *pseq);
17001                 ajStrAssignClear(&tmpstr);
17002             }
17003         }
17004 
17005         if(ajStrGetLen(tmpstr))
17006             ajDebug("continuing %B '%S'\n", *incomment, tmpstr);
17007         else
17008             ajDebug("done %B '%S'\n", *incomment, tmpstr);
17009     }
17010 
17011     ret = ajStrGetLen(*pseq);
17012     ajStrDel(&tmpstr);
17013 
17014     return ret;
17015 }
17016 
17017 
17018 
17019 
17020 /* @funcstatic seqAppendWarn **************************************************
17021 **
17022 ** Appends sequence characters in the input line to a growing sequence.
17023 **
17024 ** Non sequence characters are reported in the return value
17025 ** if EMBOSS_SEQWARN is set
17026 **
17027 ** @param [u] pseq [AjPStr*] Sequence as a string
17028 ** @param [r] line [const AjPStr] Input line.
17029 ** @param [r] informat [ajuint] Input format, zero for unknown
17030 ** @return [const AjPStr] Any rejected non-space characters
17031 **
17032 ** @release 5.0.0
17033 ** @@
17034 ******************************************************************************/
17035 
seqAppendWarn(AjPStr * pseq,const AjPStr line,ajuint informat)17036 static const AjPStr seqAppendWarn(AjPStr* pseq, const AjPStr line,
17037                                   ajuint informat)
17038 {
17039     AjPStr tmpstr = NULL;
17040 
17041     if(!seqAppendRestStr)
17042     {
17043         if(ajNamGetValueC("seqwarn", &tmpstr))
17044             ajStrToBool(tmpstr, &seqDoWarnAppend);
17045         seqAppendRestStr = ajStrNew();
17046     }
17047 
17048     ajStrAssignS(&seqAppendTmpSeq, line);
17049 
17050     if(seqDoWarnAppend || informat)
17051     {
17052         ajStrKeepSetAlphaRestC(&seqAppendTmpSeq, "*.~?#+-", &seqAppendRestStr);
17053         ajStrAppendS(pseq, seqAppendTmpSeq);
17054 
17055         ajStrDelStatic(&seqAppendTmpSeq);
17056 
17057         if(!ajStrGetLen(seqAppendRestStr))
17058             return NULL;
17059 
17060         return seqAppendRestStr;
17061     }
17062 
17063     if(!seqAppendFilter)
17064         seqAppendFilter = ajCharGetfilter( "*.~?#+-"
17065                                            "abcdefghijklmnopqrstuvwxyz"
17066                                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
17067 
17068     ajStrKeepSetFilter(&seqAppendTmpSeq, seqAppendFilter);
17069 
17070     ajStrAppendS(pseq, seqAppendTmpSeq);
17071 
17072     ajStrDelStatic(&seqAppendTmpSeq);
17073 
17074     return NULL;
17075 }
17076 
17077 
17078 
17079 
17080 /* @funcstatic seqqualAppendWarn **********************************************
17081 **
17082 ** Appends sequence quality characters in the input line to a growing string.
17083 **
17084 ** Non sequence characters are reported in the return value
17085 ** if EMBOSS_SEQWARN is set
17086 **
17087 ** @param [u] Pqual [AjPStr*] Quality values as a string
17088 ** @param [r] line [const AjPStr] Input line.
17089 ** @return [void]
17090 **
17091 ** @release 6.1.0
17092 ** @@
17093 ******************************************************************************/
17094 
static void seqqualAppendWarn(AjPStr* Pqual, const AjPStr line)
{
    /* work on a copy held in the shared temporary string */
    ajStrAssignS(&seqAppendTmpSeq, line);

    /* restrict the copy to the ASCII code range 33..126 */
    ajStrKeepSetAscii(&seqAppendTmpSeq, 33, 126);

    /* add what remains to the growing quality string */
    ajStrAppendS(Pqual, seqAppendTmpSeq);

    /* leave the shared temporary ready for the next caller */
    ajStrDelStatic(&seqAppendTmpSeq);
}
17106 
17107 
17108 
17109 
17110 /* @funcstatic seqGcgRegInit **************************************************
17111 **
17112 ** Initialises regular expressions for GCG and MSF format parsing
17113 **
17114 **
17115 ** @return [void]
17116 **
17117 ** @release 4.0.0
17118 ******************************************************************************/
17119 
seqGcgRegInit(void)17120 static void seqGcgRegInit(void)
17121 {
17122     if(!seqRegGcgDot)
17123         seqRegGcgDot = ajRegCompC("[.][.]");
17124 
17125     if(!seqRegGcgChk)
17126         seqRegGcgChk = ajRegCompC("[Cc][Hh][Ee][Cc][Kk]:[ \t]*([0-9]+)");
17127 
17128     if(!seqRegGcgLen)
17129         seqRegGcgLen = ajRegCompC("[Ll][Ee][Nn][Gg][Tt][Hh]:[ \t]*([0-9]+)");
17130 
17131     if(!seqRegGcgTyp)
17132         seqRegGcgTyp = ajRegCompC("[Tt][Yy][Pp][Ee]:[ \t]*([NP])");
17133 
17134     if(!seqRegGcgNam)
17135         seqRegGcgNam = ajRegCompC("[^ \t>]+");
17136 
17137     if(!seqRegGcgMsf)
17138         seqRegGcgMsf = ajRegCompC("[Mm][Ss][Ff]:[ \t]*([0-9]+)");
17139 
17140     if(!seqRegGcgMsflen)
17141         seqRegGcgMsflen = ajRegCompC("[Ll][Ee][Nn]:[ \t]*([0-9]+)");
17142 
17143     if(!seqRegGcgWgt)
17144         seqRegGcgWgt = ajRegCompC("[Ww][Ee][Ii][Gg][Hh][Tt]:[ \t]*([0-9.]+)");
17145 
17146     if(!seqRegGcgMsfnam)
17147         seqRegGcgMsfnam = ajRegCompC("[Nn][Aa][Mm][Ee]:[ \t]*([^ \t]+)");
17148 
17149     return;
17150 }
17151 
17152 
17153 
17154 
17155 /* @funcstatic seqGcgDots *****************************************************
17156 **
17157 ** Looks for the ".." line in the header of a GCG format sequence.
17158 ** Care is needed to make sure this is not an MSF header which
17159 ** has a very similar format.
17160 **
17161 ** Data found on the header line is extracted and returned.
17162 **
17163 ** The number of lines searched is limited to avoid parsing large data
17164 ** files that are not in GCG format. The user should set this limit to
17165 ** be large enough to handle large EMBL/Genbank annotations
17166 **
17167 ** @param [u] thys [AjPSeq] Sequence.
17168 ** @param [r] seqin [const AjPSeqin] Sequence input.
17169 ** @param [u] Pline [AjPStr*] Input buffer.
17170 ** @param [r] maxlines [ajuint] Maximum number of lines to read
17171 **                              before giving up
17172 ** @param [w] len [ajuint*] Length of sequence read.
17173 ** @return [AjBool] ajTrue on success. ajFalse on failure or aborting.
17174 **
17175 ** @release 1.0.0
17176 ** @@
17177 ******************************************************************************/
17178 
seqGcgDots(AjPSeq thys,const AjPSeqin seqin,AjPStr * Pline,ajuint maxlines,ajuint * len)17179 static AjBool seqGcgDots(AjPSeq thys, const  AjPSeqin seqin,
17180                          AjPStr* Pline,
17181                          ajuint maxlines, ajuint* len)
17182 {
17183     AjPStr token  = NULL;
17184     ajuint check  = 0;
17185     ajuint nlines = 0;
17186 
17187     seqGcgRegInit();
17188 
17189     while(nlines < maxlines)
17190     {
17191         if(nlines++)
17192             if(!ajTextinStoreReadline(seqin->Input, Pline, &thys->TextPtr))
17193                 return ajFalse;
17194 
17195         if(nlines > maxlines)
17196             return ajFalse;
17197 
17198         if(!ajRegExec(seqRegGcgDot, *Pline))
17199             continue;
17200 
17201         ajDebug("seqGcgDots   .. found\n'%S'\n", *Pline);
17202 
17203         if(!ajRegExec(seqRegGcgChk, *Pline))    /* checksum required */
17204             return ajFalse;
17205 
17206         if(ajRegExec(seqRegGcgMsf, *Pline))     /* oops - it's an MSF file */
17207             return ajFalse;
17208 
17209         ajRegSubI(seqRegGcgChk, 1, &token);
17210         ajStrToUint(token, &check);
17211 
17212         ajDebug("   checksum %d\n", check);
17213 
17214         if(ajRegExec(seqRegGcgLen, *Pline))
17215         {
17216             ajRegSubI(seqRegGcgLen, 1, &token);
17217             ajStrToUint(token, len);
17218             ajDebug("   length %d\n", *len);
17219         }
17220 
17221         if(ajRegExec(seqRegGcgNam, *Pline))
17222         {
17223             ajRegSubI(seqRegGcgNam, 0, &thys->Name);
17224             ajDebug("   name '%S'\n", thys->Name);
17225         }
17226 
17227         if(ajRegExec(seqRegGcgTyp, *Pline))
17228         {
17229             ajRegSubI(seqRegGcgTyp, 1, &thys->Type);
17230             ajDebug("   type '%S'\n", thys->Type);
17231         }
17232 
17233         ajStrDel(&token);
17234 
17235         return ajTrue;
17236     }
17237 
17238     return ajFalse;
17239 }
17240 
17241 
17242 
17243 
17244 /* @funcstatic seqGcgMsfDots **************************************************
17245 **
17246 ** Looks for the ".." line in the header of an MSF format sequence.
17247 ** Care is needed to make sure this is not a simple GCG header which
17248 ** has a very similar format.
17249 **
17250 ** Data found on the header line is extracted and returned.
17251 **
17252 ** The number of lines searched is limited to avoid parsing large data
17253 ** files that are not in GCG format. The user should set this limit to
17254 ** be large enough to handle large EMBL/Genbank annotations
17255 **
17256 ** @param [u] thys [AjPSeq] Sequence.
17257 ** @param [r] seqin [const AjPSeqin] Sequence input.
17258 ** @param [u] Pline [AjPStr*] Input buffer.
17259 ** @param [r] maxlines [ajuint] Maximum number of lines to read
17260 **                              before giving up
17261 ** @param [w] len [ajuint*] Length of sequence read.
17262 ** @return [AjBool] ajTrue on success. ajFalse on failure or aborting.
17263 **
17264 ** @release 1.0.0
17265 ** @@
17266 ******************************************************************************/
17267 
seqGcgMsfDots(AjPSeq thys,const AjPSeqin seqin,AjPStr * Pline,ajuint maxlines,ajuint * len)17268 static AjBool seqGcgMsfDots(AjPSeq thys, const AjPSeqin seqin, AjPStr* Pline,
17269                             ajuint maxlines, ajuint* len)
17270 {
17271     AjPStr token = NULL;
17272     ajuint check  = 0;
17273     ajuint nlines = 0;
17274 
17275     ajDebug("seqGcgMsfDots maxlines: %d\nline: '%S'\n", maxlines,*Pline);
17276 
17277     seqGcgRegInit();
17278 
17279     while(nlines < maxlines)
17280     {
17281         if(nlines++)
17282             if(!ajTextinStoreReadline(seqin->Input, Pline, &thys->TextPtr))
17283                 return ajFalse;
17284 
17285         ajDebug("testing line %d\n'%S'\n", nlines,*Pline);
17286 
17287         if(nlines > maxlines)
17288             return ajFalse;
17289 
17290         if(!ajRegExec(seqRegGcgDot, *Pline))
17291             continue;
17292 
17293         /* dots found. This must be the line if this is MSF format */
17294 
17295         if(!ajRegExec(seqRegGcgChk, *Pline))    /* check: is required */
17296             return ajFalse;
17297 
17298         if(!ajRegExec(seqRegGcgMsf, *Pline)) /* MSF: len required for GCG*/
17299             return ajFalse;
17300 
17301 
17302         ajRegSubI(seqRegGcgMsf, 1, &token);
17303         ajStrToUint(token, len);
17304 
17305         ajRegSubI(seqRegGcgChk, 1, &token);
17306         ajStrToUint(token, &check);
17307 
17308         if(ajRegExec(seqRegGcgNam, *Pline))
17309             ajRegSubI(seqRegGcgNam, 0, &thys->Name);
17310 
17311         if(ajRegExec(seqRegGcgTyp, *Pline))
17312             ajRegSubI(seqRegGcgTyp, 1, &thys->Type);
17313 
17314         ajStrDel(&token);
17315         ajDebug("seqGcgMsfDots '%S' '%S' len: %d check: %d\n",
17316                 thys->Name, thys->Type, *len, check);
17317 
17318         return ajTrue;
17319     }
17320 
17321     return ajFalse;
17322 }
17323 
17324 
17325 
17326 
17327 /* @funcstatic seqGcgMsfHeader ************************************************
17328 **
17329 ** Parses data from a line of an MSF file header. The header stores
** names and other data for all sequences in the file. Each sequence
** is defined on a separate line. The results are stored
17332 ** in the MSF internal table. The sequence data is read later in the
17333 ** input file and added to the table.
17334 **
17335 ** @param [r] line [const AjPStr] Input line.
17336 ** @param [u] Pmsfitem [SeqPMsfItem*] MSF internal table item.
17337 ** @return [AjBool] ajTrue on success.
17338 **
17339 ** @release 1.0.0
17340 ** @@
17341 ******************************************************************************/
17342 
seqGcgMsfHeader(const AjPStr line,SeqPMsfItem * Pmsfitem)17343 static AjBool seqGcgMsfHeader(const AjPStr line, SeqPMsfItem* Pmsfitem)
17344 {
17345     AjPStr name         = NULL; /* NOTE: not static. New each time for list */
17346     AjPStr token = NULL;
17347     SeqPMsfItem msfitem = NULL;
17348 
17349     ajDebug("seqGcgMsfHeader '%S'\n", line);
17350 
17351     if(!ajRegExec(seqRegGcgMsfnam, line))
17352         return ajFalse;
17353 
17354     ajRegSubI(seqRegGcgMsfnam, 1, &name);
17355     /*ajDebug("Name found\n");*/
17356 
17357     if(!ajRegExec(seqRegGcgChk, line))
17358         return ajFalse;
17359 
17360     /*ajDebug("Check found\n");*/
17361 
17362     *Pmsfitem = AJNEW0(msfitem);
17363     msfitem->Name = name;
17364 
17365     ajRegSubI(seqRegGcgChk, 1, &token);
17366     ajStrToUint(token, &msfitem->Check);
17367 
17368     if(ajRegExec(seqRegGcgMsflen, line))
17369     {
17370         ajRegSubI(seqRegGcgMsflen, 1, &token);
17371         ajStrToUint(token, &msfitem->Len);
17372     }
17373     else
17374         msfitem->Len = 0;
17375 
17376     msfitem->Seq = ajStrNewRes(msfitem->Len+1);
17377 
17378     if(ajRegExec(seqRegGcgWgt, line))
17379     {
17380         ajRegSubI(seqRegGcgWgt, 1, &token);
17381         ajStrToFloat(token, &msfitem->Weight);
17382     }
17383     else
17384         msfitem->Weight = 1.0;
17385 
17386     ajDebug("MSF header name '%S' check %d len %d weight %.3f\n",
17387             msfitem->Name, msfitem->Check, msfitem->Len, msfitem->Weight);
17388 
17389     ajStrDel(&token);
17390 
17391     return ajTrue;
17392 }
17393 
17394 
17395 
17396 
17397 /* @funcstatic seqUsaRegInit **************************************************
17398 **
** Initialises regular expressions for parsing USAs
17400 **
17401 ** @return [void]
17402 **
17403 ** @release 6.1.0
17404 ******************************************************************************/
17405 
seqUsaRegInit(void)17406 static void seqUsaRegInit(void)
17407 {
17408     if(seqRegUsaInitDone)
17409         return;
17410 
17411     if(!seqRegUsaFmt)
17412         seqRegUsaFmt = ajRegCompC("^([A-Za-z0-9-]*)::(.*)$");
17413     /* \1 format letters and numbers only */
17414     /* \2 remainder (filename, etc.)*/
17415 
17416     if(!seqRegUsaDb)
17417         seqRegUsaDb = ajRegCompC("^([A-Za-z][A-Za-z0-9_]+)([-]([A-Za-z]+))?"
17418                                  "([:{]([^}]*)}?)?$");
17419 
17420     /* \1 dbname (start with a letter, then alphanumeric) */
17421     /* \2 -id or -acc etc. */
17422     /* \3 qry->SingleField (id or acc etc.) */
17423     /* \4 :qry->QryString */
17424     /* \5 qry->QryString */
17425 
17426     if(!seqRegUsaId)
17427 #ifndef WIN32
17428         /* \1 is filename \5 is the qry->SingleField \6 is the qry->QryString */
17429         seqRegUsaId = ajRegCompC("^([^|]+[|]|[^:{%]+)"
17430                                  "(([:{%])(([^:}]+):)?([^:}]*)}?)?$");
17431 #else /* WIN32 */
17432     /* Windows file names can start with e.g.: 'C:\' */
17433     /* But allow e.g. 'C:/...', for Staden spin */
17434 
17435     /* \1 is filename \6 is the qry->SingleField \7 is the qry->QryString */
17436     seqRegUsaId = ajRegCompC ("^(([a-zA-Z]:[\\\\/])?[^:{%]+)"
17437                               "(([:{%])(([^:}]+):)?([^:}]*)}?)?$");
17438 #endif /* !WIN32 */
17439 
17440 
17441     if(!seqRegUsaList)   /* \1 is filename \3 is the qry->QryString */
17442         seqRegUsaList = ajRegCompC("^(@|[Ll][Ii][Ss][Tt]:+)(.+)$");
17443 
17444     if(!seqRegUsaAsis)   /* \1 is filename \3 is the qry->QryString */
17445         seqRegUsaAsis = ajRegCompC("^[Aa][Ss][Ii][Ss]:+(.+)$");
17446 
17447     if(!seqRegUsaWild)
17448         seqRegUsaWild = ajRegCompC("(.*[*].*)");
17449     /* \1 wildcard query */
17450 
17451     if(!seqRegUsaRange)    /* \1 is rest of USA \2 start \3 end \5 reverse*/
17452         seqRegUsaRange = ajRegCompC("(.*)[[](-?[0-9]*):(-?[0-9]*)(:([Rr])?)?[]]$");
17453 
17454     seqRegUsaInitDone = ajTrue;
17455 
17456     return;
17457 }
17458 
17459 
17460 
17461 
17462 /* @func ajSeqUsaGetBase ******************************************************
17463 **
** Extracts the base part from a USA, suitable for use in fetching other
** sequences from the same source
17466 **
17467 ** @param [r] usa [const AjPStr] Original USA
17468 ** @param [u] Pbaseusa [AjPStr*] Base part of USA
17469 ** @return [AjBool] True on success
17470 **
17471 ** @release 6.1.0
17472 ** @@
17473 ******************************************************************************/
17474 
ajSeqUsaGetBase(const AjPStr usa,AjPStr * Pbaseusa)17475 AjBool ajSeqUsaGetBase(const AjPStr usa, AjPStr* Pbaseusa)
17476 {
17477     AjPStr tmpstr  = NULL;
17478 
17479     AjBool regstat   = ajFalse;
17480 #ifdef __CYGWIN__
17481     AjPStr usatmp    = NULL;
17482 #endif /* __CYGWIN__ */
17483 
17484     seqUsaRegInit();
17485 
17486     ajStrAssignC(Pbaseusa, "");
17487 
17488     ajStrAssignS(&seqUsaTest, usa);
17489 
17490     /* Strip any leading spaces */
17491     ajStrTrimC(&seqUsaTest," \t\n");
17492 
17493 #ifdef __CYGWIN__
17494     if(*(ajStrGetPtr(seqUsaTest)+1)==':')
17495     {
17496         usatmp = ajStrNew();
17497         ajFmtPrintS(&usatmp,"/cygdrive/%c/%s",*ajStrGetPtr(seqUsaTest),
17498                     ajStrGetPtr(seqUsaTest)+2);
17499         ajStrAssignRef(&seqUsaTest,usatmp);
17500         ajStrDel(&usatmp);
17501     }
17502 #endif /* __CYGWIN__ */
17503 
17504     ajDebug("USA to test: '%S'\n\n", seqUsaTest);
17505 
17506     /* trim any range */
17507 
17508     if(ajRegExec(seqRegUsaRange, seqUsaTest))
17509     {
17510         ajRegPre(seqRegUsaRange, &tmpstr);
17511         ajStrAssignS(&seqUsaTest, tmpstr);
17512     }
17513 
17514     /* no base for an ASIS:: USA */
17515 
17516     if(ajRegExec(seqRegUsaAsis, seqUsaTest))
17517         return ajFalse;
17518 
17519     /* no base for a listfile USA */
17520 
17521     if(ajRegExec(seqRegUsaList, seqUsaTest))
17522         return ajFalse;
17523 
17524     if(ajRegExec(seqRegUsaFmt, seqUsaTest))
17525     {
17526         ajRegSubI(seqRegUsaFmt, 1, &tmpstr);
17527         ajStrAppendS(Pbaseusa, tmpstr);
17528         ajStrAppendC(Pbaseusa, "::");
17529         ajRegSubI(seqRegUsaFmt, 2,&tmpstr);
17530         ajStrAssignS(&seqUsaTest, tmpstr);
17531     }
17532 
17533     regstat = ajRegExec(seqRegUsaDb, seqUsaTest);
17534 
17535     if(regstat)
17536     {
17537         ajRegSubI(seqRegUsaDb, 1, &tmpstr);
17538         if(!ajNamDatabase(tmpstr))
17539             regstat = ajFalse;
17540     }
17541 
17542     if(regstat)
17543         ajStrAppendS(Pbaseusa, tmpstr);
17544     else
17545     {
17546         if(ajRegExec(seqRegUsaId, seqUsaTest))
17547         {
17548 #ifndef WIN32
17549             ajRegSubI(seqRegUsaId, 1, &tmpstr);
17550 #else /* WIN32 */
17551             ajRegSubI(seqRegUsaId, 1, &tmpstr);
17552 #endif /* !WIN32 */
17553             ajDebug("found filename %S\n", tmpstr);
17554             ajStrAppendS(Pbaseusa, tmpstr);
17555         }
17556 
17557     }
17558     ajStrDel(&tmpstr);
17559 
17560     if(!ajStrGetLen(*Pbaseusa))
17561         return ajFalse;
17562 
17563     return ajTrue;
17564 }
17565 
17566 
17567 
17568 
17569 /* @funcstatic seqinUsaProcess ************************************************
17570 **
17571 ** Converts a USA Universal Sequence Address into an open file.
17572 **
17573 ** First tests for "[n:n:r]" range and sets this if it is found
17574 **
17575 ** Then tests for asis:: in which the "filename" is really the sequence
17576 ** and no format is needed.
17577 **
17578 ** Then tests for "format::" and sets this if it is found
17579 **
17580 ** Then tests for "list:" or "@" and processes as a list file
17581 ** using seqinListProcess which in turn invokes seqinUsaProcess
17582 ** until a valid USA is found.
17583 **
17584 ** Then tests for dbname:query and opens the file (at the correct position
17585 ** if the database definition defines it)
17586 **
17587 ** If there is no database, looks for file:query and opens the file.
17588 ** In this case the file position is not known and sequence reading
17589 ** will have to scan for the entry/entries we need.
17590 **
17591 ** @param [u] seqin [AjPSeqin] Sequence input structure.
17592 ** @param [u] thys [AjPSeq] Sequence to be read.
17593 ** @return [AjBool] ajTrue on success.
17594 **
17595 ** @release 6.4.0
17596 ** @@
17597 ******************************************************************************/
17598 
seqinUsaProcess(AjPSeqin seqin,AjPSeq thys)17599 static AjBool seqinUsaProcess(AjPSeqin seqin, AjPSeq thys)
17600 {
17601     AjBool ret = ajTrue;
17602     AjPStr qrystr = NULL;
17603     AjBool seqmethod = ajFalse;
17604     const AjPStr fmtstr = NULL;
17605     AjPTextin textin;
17606     AjPQuery qry;
17607     AjPSeqAccess seqaccess = NULL;
17608 
17609     textin = seqin->Input;
17610     qry = textin->Query;
17611 
17612     /* pick up the original query string */
17613     qrystr = ajStrNewS(textin->Qry);
17614 
17615     ajDebug("seqinUsaProcess '%S'\n", qrystr);
17616 
17617     /* look for a format:: prefix */
17618     fmtstr = ajQuerystrParseFormat(&qrystr, textin, seqinFormatFind);
17619     ajDebug("seqinUsaProcess ... fmtstr '%S' '%S'\n", fmtstr, qrystr);
17620 
17621     /* (seq/feat) look for a [range] suffix */
17622     ajQuerystrParseRange(&qrystr, &seqin->Begin, &seqin->End, &seqin->Rev);
17623     ajDebug("seqinUsaProcess ... range %d..%d rev:%B '%S'\n",
17624             seqin->Begin, seqin->End, seqin->Rev, qrystr);
17625 
17626     /* look for a list:: or @:: listfile of queries  - process and return */
17627     if(ajQuerystrParseListfile(&qrystr))
17628     {
17629         ajDebug("seqinUsaProcess ... listfile '%S'\n", qrystr);
17630         ret = seqinListProcess(seqin, thys, qrystr);
17631         ajStrDel(&qrystr);
17632         return ret;
17633     }
17634 
17635     /* try general text access methods (file, asis, text database access */
17636     ajDebug("seqinUsaProcess ... no listfile '%S'\n", qrystr);
17637     if(!ajQuerystrParseRead(&qrystr, textin, seqinFormatFind, &seqmethod))
17638     {
17639         ajStrDel(&qrystr);
17640         return ajFalse;
17641     }
17642 
17643     seqinFormatSet(seqin, thys);
17644 
17645     ajDebug("seqinUsaProcess ... read nontext: %B '%S'\n",
17646             seqmethod, qrystr);
17647     ajStrDel(&qrystr);
17648 
17649     /* we found a non-text method */
17650     if(seqmethod)
17651     {
17652         ajDebug("seqinUsaProcess ... call method '%S'\n", qry->Method);
17653         ajDebug("seqinUsaProcess ... textin format %d '%S'\n",
17654                 textin->Format, textin->Formatstr);
17655         ajDebug("seqinUsaProcess ...  query format  '%S'\n",
17656                 qry->Formatstr);
17657         qry->Access = ajCallTableGetS(seqDbMethods,qry->Method);
17658         seqaccess = qry->Access;
17659 
17660         if(!seqaccess)
17661         {
17662             ajErr("sequence access method '%S' not found", qry->Method);
17663             return ajFalse;
17664         }
17665 
17666         return (*seqaccess->Access)(seqin);
17667     }
17668 
17669     ajDebug("seqinUsaProcess text method '%S' success\n", qry->Method);
17670 
17671     return ajTrue;
17672 }
17673 
17674 
17675 
17676 
17677 /* @funcstatic seqUsaRestore **************************************************
17678 **
17679 ** Restores a sequence input specification from a SeqPListUsa node
17680 **
17681 ** @param [w] seqin [AjPSeqin] Sequence input object
17682 ** @param [r] node [const SeqPListUsa] Usa list node
17683 ** @return [void]
17684 **
17685 ** @release 2.1.0
17686 ******************************************************************************/
17687 
static void seqUsaRestore(AjPSeqin seqin, const SeqPListUsa node)
{
    AjPTextin textin = seqin->Input;

    ajDebug("seqUsaRestore node %d..%d rev:%B '%S' (%u) feat %B '%S'\n",
            node->Begin, node->End, node->Rev,
            node->Formatstr, node->Format, node->Features, node->Usa);

    /* settings held on the sequence input object itself */
    seqin->Begin    = node->Begin;
    seqin->End      = node->End;
    seqin->Rev      = node->Rev;
    seqin->Features = node->Features;

    /* settings held on its text input object */
    textin->Format = node->Format;
    textin->Fpos   = node->Fpos;
    ajStrAssignS(&textin->Formatstr, node->Formatstr);

    /* node->Usa is traced above but not copied here */

    return;
}
17703 
17704 
17705 
17706 
17707 /* @funcstatic seqUsaSave *****************************************************
17708 **
17709 ** Saves a sequence input specification in a SeqPListUsa node
17710 **
17711 ** @param [w] node [SeqPListUsa] Usa list node
17712 ** @param [r] seqin [const AjPSeqin] Sequence input object
17713 ** @return [void]
17714 **
17715 ** @release 2.1.0
17716 ******************************************************************************/
17717 
static void seqUsaSave(SeqPListUsa node, const AjPSeqin seqin)
{
    const AjPTextin textin = seqin->Input;

    ajDebug("seqUsaSave seqin %d..%d rev:%B '%S' (%u) feat %B '%S'\n",
            seqin->Begin, seqin->End, seqin->Rev,
            textin->Formatstr, textin->Format,
            seqin->Features, textin->Qry);

    /* settings taken from the sequence input object itself */
    node->Begin    = seqin->Begin;
    node->End      = seqin->End;
    node->Rev      = seqin->Rev;
    node->Features = seqin->Features;

    /* settings taken from its text input object */
    node->Format = textin->Format;
    node->Fpos   = textin->Fpos;
    ajStrAssignS(&node->Formatstr, textin->Formatstr);

    /* the query string is traced above but not stored in the node */

    return;
}
17734 
17735 
17736 
17737 
17738 /* @funcstatic seqUsaListTrace ************************************************
17739 **
17740 ** Traces the nodes in a USA list
17741 **
17742 ** @param [r] list [const AjPList] The USA list
17743 ** @return [void]
17744 **
17745 ** @release 2.1.0
17746 ******************************************************************************/
17747 
seqUsaListTrace(const AjPList list)17748 static void seqUsaListTrace(const AjPList list)
17749 {
17750     AjIList iter;
17751     SeqPListUsa node;
17752     ajuint i = 0;
17753 
17754     iter = ajListIterNewread(list);
17755 
17756     ajDebug("SeqUsaListTrace %Lu nodes\n", ajListGetLength(list));
17757 
17758     while(!ajListIterDone(iter))
17759     {
17760         node = (SeqPListUsa) ajListIterGet(iter);
17761         ajDebug("%3d: '%S' %4d..%d (%b) '%S' %d\n",
17762                 ++i, node->Usa, node->Begin, node->End, node->Rev,
17763                 node->Formatstr, node->Format);
17764     }
17765 
17766     ajListIterDel(&iter);
17767     ajDebug("...Done...\n");
17768 
17769     return;
17770 }
17771 
17772 
17773 
17774 
17775 /* @funcstatic seqinListProcess ***********************************************
17776 **
17777 ** Processes a file of USAs.
17778 ** This function is called by, and calls, seqinUsaProcess. There is
17779 ** a depth check to avoid infinite loops, for example where a list file
17780 ** refers to itself.
17781 **
17782 ** This function produces a list (AjPList) of USAs with all list references
17783 ** expanded into lists of USAs.
17784 **
17785 ** Because USAs in a list can have their own begin, end and reverse settings
17786 ** the prior settings are stored with each USA in the list node so that they
17787 ** can be restored after.
17788 **
17789 ** @param [u] seqin [AjPSeqin] Sequence input
17790 ** @param [u] seq [AjPSeq] Sequence
** @param [r] listfile [const AjPStr] Name of list file.
17792 ** @return [AjBool] ajTrue on success.
17793 **
17794 ** @release 6.4.0
17795 ** @@
17796 ******************************************************************************/
17797 
seqinListProcess(AjPSeqin seqin,AjPSeq seq,const AjPStr listfile)17798 static AjBool seqinListProcess(AjPSeqin seqin, AjPSeq seq,
17799                                const AjPStr listfile)
17800 {
17801     AjPList list  = NULL;
17802     AjPFile file  = NULL;
17803     AjPStr token  = NULL;
17804     AjPStr rest  = NULL;
17805     AjBool ret       = ajFalse;
17806     SeqPListUsa node = NULL;
17807 
17808     ajuint recnum = 0;
17809     static ajint depth    = 0;
17810     static ajint MAXDEPTH = 16;
17811 
17812     depth++;
17813     ajDebug("++seqinListProcess %S depth %d Rev: %B\n",
17814             listfile, depth, seqin->Rev);
17815 
17816     if(depth > MAXDEPTH)
17817         ajFatal("USA List too deep");
17818 
17819     if(!seqin->Usalist)
17820         seqin->Usalist = ajListNew();
17821 
17822     list = ajListNew();
17823 
17824     file = ajFileNewInNameS(listfile);
17825 
17826     if(!file)
17827     {
17828         ajErr("Failed to open list file '%S'", listfile);
17829         depth--;
17830 
17831         return ret;
17832     }
17833 
17834     while(ajReadlineTrim(file, &seqReadLine))
17835     {
17836         ++recnum;
17837         seqListNoComment(&seqReadLine);
17838         if(ajStrExtractWord(seqReadLine, &rest, &token))
17839         {
17840             if(ajStrGetLen(rest))
17841             {
17842                 ajErr("Bad record %u in list file '%S'\n'%S'",
17843                       recnum, listfile, seqReadLine);
17844             }
17845             else if(ajStrGetLen(token))
17846             {
17847                 ajDebug("++Add to list: '%S'\n", token);
17848                 AJNEW0(node);
17849                 ajStrAssignS(&node->Usa, token);
17850                 seqUsaSave(node, seqin);
17851                 ajListPushAppend(list, node);
17852             }
17853         }
17854     }
17855 
17856     ajFileClose(&file);
17857     ajStrDel(&token);
17858     ajStrDel(&rest);
17859 
17860     ajDebug("Trace seqin->Usalist\n");
17861     seqUsaListTrace(seqin->Usalist);
17862     ajDebug("Trace new list\n");
17863     seqUsaListTrace(list);
17864     ajListPushlist(seqin->Usalist, &list);
17865 
17866     ajDebug("Trace combined seqin->Usalist\n");
17867     seqUsaListTrace(seqin->Usalist);
17868 
17869     /*
17870     ** now try the first item on the list
17871     ** this can descend recursively if it is also a list
17872     ** which is why we check the depth above
17873     */
17874 
17875     if(ajListPop(seqin->Usalist, (void**) &node))
17876     {
17877         ajDebug("++pop first item '%S'\n", node->Usa);
17878         ajSeqinUsa(&seqin, node->Usa);
17879         seqUsaRestore(seqin, node);
17880         ajStrDel(&node->Usa);
17881         ajStrDel(&node->Formatstr);
17882         AJFREE(node);
17883         ajDebug("descending with usa '%S'\n", seqin->Input->Qry);
17884         ret = seqinUsaProcess(seqin, seq);
17885     }
17886 
17887     depth--;
17888     ajDebug("++seqinListProcess depth: %d returns: %B\n", depth, ret);
17889 
17890     return ret;
17891 }
17892 
17893 
17894 
17895 
17896 /* @funcstatic seqListNoComment ***********************************************
17897 **
17898 ** Strips comments from a character string (a line from a list file).
17899 ** Comments are blank lines or any text following a "#" character.
17900 **
17901 ** @param [u] text [AjPStr*] Line of text from input file.
17902 ** @return [void]
17903 **
17904 ** @release 1.0.0
17905 ** @@
17906 ******************************************************************************/
17907 
static void seqListNoComment(AjPStr* text)
{
    char *hash = NULL;

    /* empty string: nothing to strip */
    if(ajStrGetLen(*text) == 0)
        return;

    /* the string is edited in place below */
    MAJSTRGETUNIQUESTR(text);

    hash = strchr(ajStrGetPtr(*text), '#');

    /* no comment on this line */
    if(!hash)
        return;

    /* end the text at the '#' and update the string to match */
    *hash = '\0';
    ajStrSetValid(text);

    return;
}
17930 
17931 
17932 
17933 
17934 /* @funcstatic seqinFormatSet *************************************************
17935 **
17936 ** Sets the input format for a sequence using the sequence input object's
17937 ** defined format, or a default from variable 'EMBOSS_FORMAT'.
17938 **
17939 ** @param [u] seqin [AjPSeqin] Sequence input.
17940 ** @param [u] thys [AjPSeq] Sequence.
17941 ** @return [AjBool] ajTrue on success.
17942 **
17943 ** @release 6.4.0
17944 ** @@
17945 ******************************************************************************/
17946 
seqinFormatSet(AjPSeqin seqin,AjPSeq thys)17947 static AjBool seqinFormatSet(AjPSeqin seqin, AjPSeq thys)
17948 {
17949     AjPTextin textin = seqin->Input;
17950 
17951     if(ajStrGetLen(textin->Formatstr))
17952     {
17953         ajDebug("... input format value '%S'\n", textin->Formatstr);
17954 
17955         if(seqinFormatFind(textin->Formatstr, &textin->Format))
17956         {
17957             ajStrAssignS(&thys->Formatstr, textin->Formatstr);
17958             thys->Format = textin->Format;
17959             ajDebug("...format OK '%S' = %d\n", textin->Formatstr,
17960                     textin->Format);
17961         }
17962         else
17963         {
17964             ajDebug("...format unknown '%S'\n", textin->Formatstr);
17965             ajErr("Unknown input format '%S'", textin->Formatstr);
17966         }
17967 
17968         return ajTrue;
17969     }
17970     else
17971         ajDebug("...input format not set\n");
17972 
17973 
17974     return ajFalse;
17975 }
17976 
17977 
17978 
17979 
17980 /* @funcstatic seqinUfoLocal **************************************************
17981 **
17982 ** Tests whether a sequence input object will read features from the
17983 ** sequence input file. The alternative is to use a separate UFO.
17984 **
** @param [r] thys [const AjPSeqin] Sequence input object.
17986 ** @return [AjBool] ajTrue if the features will be read from the sequence
17987 **
17988 ** @release 1.13.0
17989 ** @@
17990 ******************************************************************************/
17991 
seqinUfoLocal(const AjPSeqin thys)17992 static AjBool seqinUfoLocal(const AjPSeqin thys)
17993 {
17994     if(thys->Features && ! ajStrGetLen(thys->Ufo))
17995         return ajTrue;
17996 
17997     return ajFalse;
17998 }
17999 
18000 
18001 
18002 
18003 /* @funcstatic seqSetName *****************************************************
18004 **
18005 ** Sets the name for a sequence object by applying simple conversion
18006 ** rules to the input which could be, for example, the name from a
18007 ** FASTA format file.
18008 **
18009 ** @param [u] thys [AjPSeq] Sequence object
18010 ** @param [r] str [const AjPStr] User supplied name.
18011 ** @return [void]
18012 **
18013 ** @release 1.0.0
18014 ** @@
18015 ******************************************************************************/
18016 
static void seqSetName(AjPSeq thys, const AjPStr str)
{
    if(ajStrGetLen(str) && ajStrIsWord(str))
    {
        /* single word: the last non-empty ':'-separated part is the name */
        ajDebug("seqSetName word '%S'\n", str);
        ajStrTokenAssignC(&seqHandleSplit, str, ":");

        while(ajStrTokenNextParse(seqHandleSplit, &seqTokenSplit))
        {
            if(ajStrGetLen(seqTokenSplit))
            {
                ajStrAssignS(&thys->Name, seqTokenSplit);
            }
        }

        /* replace the characters , / \ by underscores */
        ajStrExchangeSetCC(&thys->Name, ",/\\", "___");

        ajStrTokenReset(seqHandleSplit);
    }
    else if(ajStrGetLen(str))
    {
        /* several words: clean the whole text to make one name */
        ajDebug("seqSetName non-word '%S'\n", str);
        ajStrAssignS(&thys->Name, str);
        ajStrRemoveWhiteExcess(&thys->Name);
        ajStrExchangeSetCC(&thys->Name, " ,;:/\\", "______");
        ajDebug("seqSetName cleaned '%S'\n", thys->Name);
    }
    else
    {
        /* no name supplied */
        ajSeqSetNameMulti(thys, NULL);
    }

    ajDebug("seqSetName '%S' result: '%S'\n", str, thys->Name);

    ajStrDelStatic(&seqTokenSplit);

    return;
}
18051 
18052 
18053 
18054 
18055 /* @funcstatic seqitemSetName *************************************************
18056 **
18057 ** Sets the name for a multiple sequence item object by applying simple
18058 ** conversion rules to the input which could be, for example, the name from a
18059 ** FASTA format file.
18060 **
18061 ** @param [u] thys [SeqPMsfItem] Sequence item object
18062 ** @param [r] str [const AjPStr] User supplied name.
18063 ** @return [void]
18064 **
18065 ** @release 6.2.0
18066 ** @@
18067 ******************************************************************************/
18068 
seqitemSetName(SeqPMsfItem thys,const AjPStr str)18069 static void seqitemSetName(SeqPMsfItem thys, const AjPStr str)
18070 {
18071     if(ajStrIsWord(str))
18072     {
18073         ajDebug("seqitemSetName word '%S'\n", str);
18074         ajStrTokenAssignC(&seqHandleSplit, str, ":");
18075 
18076         while(ajStrTokenNextParse(seqHandleSplit, &seqTokenSplit))
18077             if(ajStrGetLen(seqTokenSplit))
18078                 ajStrAssignS(&thys->Name, seqTokenSplit);
18079 
18080         ajStrTokenReset(seqHandleSplit);
18081     }
18082     else
18083     {
18084         ajDebug("seqitemSetName non-word '%S'\n", str);
18085         ajStrAssignS(&thys->Name, str);
18086         ajStrRemoveWhiteExcess(&thys->Name);
18087         ajStrExchangeKK(&thys->Name, ' ', '_');
18088         ajDebug("seqitemSetName cleaned '%S'\n", thys->Name);
18089     }
18090 
18091     ajDebug("seqitemSetName '%S' result: '%S'\n", str, thys->Name);
18092     ajStrDelStatic(&seqToken);
18093 
18094     return;
18095 }
18096 
18097 
18098 
18099 
18100 /* @funcstatic seqnameSetName *************************************************
18101 **
18102 ** Sets the name usable by a sequence object by applying simple conversion
18103 ** rules to the input which could be, for example, the name from a
18104 ** FASTA format file.
18105 **
18106 ** @param [u] name [AjPStr*] Sequence name derived.
18107 ** @param [r] str [const AjPStr] User supplied name.
18108 ** @return [void]
18109 **
18110 ** @release 6.2.0
18111 ** @@
18112 ******************************************************************************/
18113 
seqnameSetName(AjPStr * name,const AjPStr str)18114 static void seqnameSetName(AjPStr *name, const AjPStr str)
18115 {
18116     if(ajStrIsWord(str))
18117     {
18118         ajDebug("seqnameSetName word '%S'\n", str);
18119         ajStrTokenAssignC(&seqHandleSplit, str, ":");
18120 
18121         while(ajStrTokenNextParse(seqHandleSplit, &seqTokenSplit))
18122             if(ajStrGetLen(seqTokenSplit))
18123                 ajStrAssignS(name, seqTokenSplit);
18124 
18125         ajStrTokenReset(seqHandleSplit);
18126     }
18127     else
18128     {
18129         ajDebug("seqnameSetName non-word '%S'\n", str);
18130         ajStrAssignS(name, str);
18131         ajStrRemoveWhiteExcess(name);
18132         ajStrExchangeKK(name, ' ', '_');
18133         ajDebug("seqnameSetName cleaned '%S'\n", *name);
18134     }
18135 
18136     ajDebug("seqnameSetName '%S' result: '%S'\n", str, *name);
18137 
18138     ajStrDelStatic(&seqTokenSplit);
18139 
18140     return;
18141 }
18142 
18143 
18144 
18145 
18146 /* @funcstatic seqSetNameNospace **********************************************
18147 **
18148 ** Sets the name for a sequence object by applying simple conversion
18149 ** rules to the input which could be, for example, the name from a
18150 ** FASTA format file.
18151 **
18152 ** @param [u] name [AjPStr*] Sequence name derived.
18153 ** @param [r] str [const AjPStr] User supplied name.
18154 ** @return [void]
18155 **
18156 ** @release 4.1.0
18157 ** @@
18158 ******************************************************************************/
18159 
seqSetNameNospace(AjPStr * name,const AjPStr str)18160 static void seqSetNameNospace(AjPStr* name, const AjPStr str)
18161 {
18162     ajStrAssignS(name, str);
18163 
18164     if(!ajStrIsWord(str))
18165     {
18166         ajDebug("seqSetNameNospace non-word '%S'\n", str);
18167         ajStrRemoveWhiteExcess(name);
18168         ajStrExchangeKK(name, ' ', '_');
18169         ajDebug("seqSetNameNospace cleaned '%S'\n", *name);
18170     }
18171 
18172     ajDebug("seqSetNameNospace '%S' result: '%S'\n", str, *name);
18173 
18174     return;
18175 }
18176 
18177 
18178 
18179 
18180 /* @funcstatic seqSetNameFile *************************************************
18181 **
18182 ** Sets the name for a sequence object by applying simple conversion
18183 ** rules to the input source file..
18184 **
18185 ** @param [u] thys [AjPSeq] Sequence object
18186 ** @param [r] seqin [const AjPSeqin] Sequence input object
18187 ** @return [void]
18188 **
18189 ** @release 2.8.0
18190 ** @@
18191 ******************************************************************************/
18192 
seqSetNameFile(AjPSeq thys,const AjPSeqin seqin)18193 static void seqSetNameFile(AjPSeq thys, const AjPSeqin seqin)
18194 {
18195     AjPStr tmpname = NULL;
18196 
18197     ajStrAssignS(&tmpname, seqin->Input->Filename);
18198 
18199     seqSetName(thys, tmpname);
18200 
18201     if(ajTextinGetCount(seqin->Input) > 1)
18202         ajFmtPrintAppS(&thys->Name, "_%3d", ajTextinGetCount(seqin->Input));
18203 
18204     ajDebug("seqSetNameFile '%S' result: '%S'\n", tmpname, thys->Name);
18205     ajStrDel(&tmpname);
18206 
18207     return;
18208 }
18209 
18210 
18211 
18212 
18213 /* @funcstatic seqAccSave *****************************************************
18214 **
18215 ** Adds an accession number to the stored list for a sequence.
18216 ** The first accession number is also saved as the primary number.
18217 **
18218 ** @param [u] thys [AjPSeq] Sequence object
18219 ** @param [r] acc [const AjPStr] Accession number
18220 ** @return [void]
18221 **
18222 ** @release 1.0.0
18223 ** @@
18224 ******************************************************************************/
18225 
seqAccSave(AjPSeq thys,const AjPStr acc)18226 static void seqAccSave(AjPSeq thys, const AjPStr acc)
18227 {
18228     if(!thys->Acclist)
18229         thys->Acclist = ajListstrNew();
18230 
18231     ajListstrPushAppend(thys->Acclist, ajStrNewS(acc));
18232 
18233     if(!ajStrGetLen(thys->Acc))
18234         ajStrAssignS(&thys->Acc, acc);
18235 
18236     return;
18237 }
18238 
18239 
18240 
18241 
18242 /* @funcstatic seqTaxSave *****************************************************
18243 **
18244 ** Adds an organism taxonomy level to the stored list for a sequence.
18245 ** The first is also saved as the primary 'Tax' (should be the species).
18246 **
18247 ** @param [u] thys [AjPSeq] Sequence object
18248 ** @param [r] tax [const AjPStr] Organism taxonomy
18249 ** @param [r] level [ajuint] 0: taxon level 1: species
18250 **                           2: organelle   3: common name
18251 ** @return [void]
18252 **
18253 ** @release 2.4.0
18254 ** @@
18255 ******************************************************************************/
18256 
seqTaxSave(AjPSeq thys,const AjPStr tax,ajuint level)18257 static void seqTaxSave(AjPSeq thys, const AjPStr tax, ajuint level)
18258 {
18259     AjPStr newstr = NULL;
18260     AjBool done = ajFalse;
18261 
18262     switch(level)
18263     {
18264         case 1:
18265             if(!ajStrGetLen(thys->Tax))
18266                 ajStrAssignS(&thys->Tax, tax);
18267             done = ajTrue;
18268             break;
18269         case 2:
18270             if(!ajStrGetLen(thys->Organelle))
18271                 ajStrAssignS(&thys->Organelle, tax);
18272             done = ajTrue;
18273             break;
18274         case 3:
18275             if(!ajStrGetLen(thys->Taxcommon))
18276                 ajStrAssignS(&thys->Taxcommon, tax);
18277             done = ajTrue;
18278             break;
18279         default:
18280             done = ajFalse;
18281             break;
18282     }
18283 
18284     if(!done)
18285     {
18286         if(!thys->Taxlist)
18287             thys->Taxlist = ajListstrNew();
18288         newstr = ajStrNewS(tax);
18289         ajListstrPushAppend(thys->Taxlist, newstr);
18290     }
18291 
18292     return;
18293 }
18294 
18295 
18296 
18297 
18298 /* @funcstatic seqTaxidSaveI **************************************************
18299 **
18300 ** Adds an organism NCBI taxonomy id to the stored list for a sequence.
18301 **
18302 ** @param [u] thys [AjPSeq] Sequence object
18303 ** @param [r] tax [ajuint] Organism NCBI taxonomy id
18304 ** @return [void]
18305 **
18306 ** @release 6.1.0
18307 ** @@
18308 ******************************************************************************/
18309 
seqTaxidSaveI(AjPSeq thys,ajuint tax)18310 static void seqTaxidSaveI(AjPSeq thys, ajuint tax)
18311 {
18312     if(tax && !ajStrGetLen(thys->Taxid))
18313         ajStrFromUint(&thys->Taxid, tax);
18314 
18315     return;
18316 }
18317 
18318 
18319 
18320 
18321 /* @funcstatic seqTaxidSaveS **************************************************
18322 **
18323 ** Adds an organism NCBI taxonomy id to the stored list for a sequence.
18324 **
18325 ** @param [u] thys [AjPSeq] Sequence object
18326 ** @param [r] tax [const AjPStr] Organism NCBI taxonomy id
18327 ** @return [void]
18328 **
18329 ** @release 6.1.0
18330 ** @@
18331 ******************************************************************************/
18332 
seqTaxidSaveS(AjPSeq thys,const AjPStr tax)18333 static void seqTaxidSaveS(AjPSeq thys, const AjPStr tax)
18334 {
18335     if(!ajStrGetLen(thys->Taxid))
18336         ajStrAssignS(&thys->Taxid, tax);
18337 
18338     return;
18339 }
18340 
18341 
18342 
18343 
18344 /* @funcstatic seqSvSave ******************************************************
18345 **
18346 ** Adds a sequence version number to the stored data for a sequence.
18347 **
18348 ** @param [u] thys [AjPSeq] Sequence object
18349 ** @param [r] sv [const AjPStr] SeqVersion number
18350 ** @return [void]
18351 **
18352 ** @release 2.4.0
18353 ** @@
18354 ******************************************************************************/
18355 
seqSvSave(AjPSeq thys,const AjPStr sv)18356 static void seqSvSave(AjPSeq thys, const AjPStr sv)
18357 {
18358     if(!ajStrGetLen(thys->Sv))
18359         ajStrAssignS(&thys->Sv, sv);
18360 
18361     return;
18362 }
18363 
18364 
18365 
18366 
18367 /* ==================================================================== */
18368 /* ========================= constructors ============================= */
18369 /* ==================================================================== */
18370 
18371 
18372 
18373 
18374 
18375 
18376 
18377 /* ==================================================================== */
18378 /* ======================== Operators ==================================*/
18379 /* ==================================================================== */
18380 
18381 
18382 
18383 
18384 /* @section Sequence Query Operators ******************************************
18385 **
18386 ** These functions use the contents of a sequence query object but do
18387 ** not make any changes.
18388 **
18389 ******************************************************************************/
18390 
18391 
18392 
18393 
18394 /* @funcstatic seqQueryMatch **************************************************
18395 **
18396 ** Compares a sequence to a query and returns true if they match.
18397 **
18398 ** @param [r] thys [const AjPQuery] Sequence query.
18399 ** @param [r] seq [const AjPSeq] Sequence.
18400 ** @return [AjBool] ajTrue if the sequence matches the query.
18401 **
18402 ** @release 1.0.0
18403 ** @@
18404 ******************************************************************************/
18405 
seqQueryMatch(const AjPQuery thys,const AjPSeq seq)18406 static AjBool seqQueryMatch(const AjPQuery thys, const AjPSeq seq)
18407 {
18408     AjBool tested = ajFalse;
18409     AjIList iter  = NULL;
18410     AjIList iterfield  = NULL;
18411     AjPStr accstr;                      /* from list, do not delete */
18412     AjPStr keystr;                      /* from list, do not delete */
18413     AjPStr taxstr;                      /* from list, do not delete */
18414     AjPQueryField field = NULL;
18415     AjBool ok = ajFalse;
18416 
18417     ajDebug("seqQueryMatch '%S' fields: %Lu Case %B Done %B\n",
18418             seq->Name, ajListGetLength(thys->QueryFields),
18419             thys->CaseId, thys->QryDone);
18420 
18421     if(!thys)                      /* no query to test, that's fine */
18422         return ajTrue;
18423 
18424     if(thys->QryDone)                   /* do we need to test here? */
18425         return ajTrue;
18426 
18427     /* test the query field(s) */
18428 
18429     iterfield = ajListIterNewread(thys->QueryFields);
18430     while(!ajListIterDone(iterfield))
18431     {
18432         field = ajListIterGet(iterfield);
18433 
18434         ajDebug("  field: '%S' Query: '%S'\n",
18435                 field->Field, field->Wildquery);
18436         if(ajStrMatchC(field->Field, "id"))
18437         {
18438             ajDebug("  id test: '%S'\n",
18439                     seq->Name);
18440             if(thys->CaseId)
18441             {
18442                 if(ajStrMatchWildS(seq->Name, field->Wildquery))
18443                 {
18444                     ajListIterDel(&iterfield);
18445                     return ajTrue;
18446                 }
18447             }
18448             else
18449             {
18450                 if(ajStrMatchWildCaseS(seq->Name, field->Wildquery))
18451                 {
18452                     ajListIterDel(&iterfield);
18453                     return ajTrue;
18454                 }
18455             }
18456 
18457             ajDebug("id test failed\n");
18458             tested = ajTrue;
18459         }
18460 
18461         else if(ajStrMatchC(field->Field, "sv")) /* test Sv and Gi */
18462         {
18463             ajDebug("  sv test: '%S'\n",
18464                     seq->Sv);
18465             if(ajStrMatchWildCaseS(seq->Sv, field->Wildquery))
18466             {
18467                 ajListIterDel(&iterfield);
18468                 return ajTrue;
18469             }
18470 
18471             ajDebug("sv test failed\n");
18472             tested = ajTrue;
18473         }
18474 
18475         else if(ajStrMatchC(field->Field, "gi")) /* test Sv and Gi */
18476         {
18477             ajDebug("  gi test: '%S'\n",
18478                     seq->Gi);
18479             if(ajStrMatchWildCaseS(seq->Gi, field->Wildquery))
18480             {
18481                 ajListIterDel(&iterfield);
18482                 return ajTrue;
18483             }
18484 
18485             ajDebug("gi test failed\n");
18486             tested = ajTrue;
18487         }
18488 
18489         else if(ajStrMatchC(field->Field, "acc"))
18490         {
18491             ajDebug("  acc test:%Lu\n",
18492                     ajListGetLength(seq->Acclist));
18493             if(ajListGetLength(seq->Acclist))
18494             {              /* accession number test - check the entire list */
18495                 iter = ajListIterNewread(seq->Acclist);
18496 
18497                 while(!ajListIterDone(iter))
18498                 {
18499                     accstr = ajListIterGet(iter);
18500                     ajDebug("... try accession '%S' '%S'\n", accstr,
18501                             field->Wildquery);
18502 
18503                     if(ajStrMatchWildCaseS(accstr, field->Wildquery))
18504                     {
18505                         ajListIterDel(&iterfield);
18506                         ajListIterDel(&iter);
18507 
18508                         return ajTrue;
18509                     }
18510                 }
18511             }
18512 
18513             tested = ajTrue;
18514             ajDebug("acc test failed\n");
18515             ajListIterDel(&iter);
18516         }
18517 
18518         else if(ajStrMatchC(field->Field, "org"))
18519         {
18520             ajDebug("  org test:%Lu\n",
18521                     ajListGetLength(seq->Taxlist));
18522             if(ajListGetLength(seq->Taxlist))
18523             {                      /* taxonomy test - check the entire list */
18524                 iter = ajListIterNewread(seq->Taxlist);
18525 
18526                 while(!ajListIterDone(iter))
18527                 {
18528                     taxstr = ajListIterGet(iter);
18529                     ajDebug("... try organism '%S' '%S'\n", taxstr,
18530                             field->Wildquery);
18531 
18532                     if(ajStrMatchWildCaseS(taxstr, field->Wildquery))
18533                     {
18534                         ajListIterDel(&iterfield);
18535                         ajListIterDel(&iter);
18536 
18537                         return ajTrue;
18538                     }
18539                 }
18540 
18541                 tested = ajTrue;
18542                 ajDebug("org test failed\n");
18543                 ajListIterDel(&iter);
18544             }
18545             else
18546             {
18547                 ajDebug("org test failed - nothing to test\n");
18548 
18549                 return ajFalse;
18550             }
18551         }
18552 
18553         else if(ajStrMatchC(field->Field, "key"))
18554         {
18555             ajDebug("  key test:%Lu\n",
18556                     ajListGetLength(seq->Keylist));
18557             if(ajListGetLength(seq->Keylist))
18558             {           /* keyword test - check the entire list */
18559                 iter = ajListIterNewread(seq->Keylist);
18560 
18561                 while(!ajListIterDone(iter))
18562                 {
18563                     keystr = ajListIterGet(iter);
18564                     ajDebug("... try keyword '%S' '%S'\n", keystr,
18565                             field->Wildquery);
18566 
18567                     if(ajStrMatchWildCaseS(keystr, field->Wildquery))
18568                     {
18569                         ajListIterDel(&iterfield);
18570                         ajListIterDel(&iter);
18571 
18572                         return ajTrue;
18573                     }
18574                 }
18575 
18576                 tested = ajTrue;
18577                 ajDebug("key test failed\n");
18578                 ajListIterDel(&iter);
18579             }
18580             else
18581             {
18582                 ajDebug("key test failed - nothing to test\n");
18583                 ajListIterDel(&iterfield);
18584 
18585                 return ajFalse;
18586             }
18587         }
18588 
18589         else if(ajStrMatchC(field->Field, "des"))
18590         {
18591             ajDebug("  des test: '%S'\n",
18592                     seq->Desc);
18593             if(ajStrGetLen(seq->Desc))
18594             {            /* description test - check the string */
18595                 ajDebug("... try description '%S' '%S'\n", seq->Desc,
18596                         field->Wildquery);
18597 
18598                 if(ajStrMatchWildWordCaseS(seq->Desc, field->Wildquery))
18599                 {
18600                     ajListIterDel(&iterfield);
18601                     return ajTrue;
18602                 }
18603 
18604                 tested = ajTrue;
18605                 ajDebug("des test failed\n");
18606                 ajListIterDel(&iter);
18607             }
18608             else
18609             {
18610                 ajDebug("des test failed - nothing to test\n");
18611                 ajListIterDel(&iterfield);
18612                 return ajFalse;
18613             }
18614         }
18615         else
18616         {
18617             ajErr("Unknown query field '%S' in query '%S'",
18618                   thys->SingleField, thys->QryString);
18619             tested = ajTrue;
18620         }
18621 
18622     }
18623 
18624     ajListIterDel(&iterfield);
18625 
18626     if(!tested)             /* nothing to test, so accept it anyway */
18627     {
18628         if(ajListGetLength(thys->QueryFields))
18629         {
18630             ajErr("");
18631             return ajFalse;
18632         }
18633 
18634         ajDebug("  no tests: assume OK\n");
18635         return ajTrue;
18636     }
18637 
18638     ajDebug("result: %B\n", ok);
18639 
18640     return ok;
18641 }
18642 
18643 
18644 
18645 
18646 
18647 /* @func ajSeqParseFasta ******************************************************
18648 **
18649 ** Parse an NCBI format fasta line. Return id acc sv and description
18650 **
18651 ** @param [r] instr [const AjPStr]   fasta line.
18652 ** @param [w] id [AjPStr*]   id.
18653 ** @param [w] acc [AjPStr*]  accession number.
18654 ** @param [w] sv [AjPStr*]  sequence version number.
18655 ** @param [w] desc [AjPStr*] description.
18656 ** @return [AjBool] ajTrue if fasta format
18657 **
18658 ** @release 2.0.0
18659 ** @@
18660 ******************************************************************************/
18661 
ajSeqParseFasta(const AjPStr instr,AjPStr * id,AjPStr * acc,AjPStr * sv,AjPStr * desc)18662 AjBool ajSeqParseFasta(const AjPStr instr, AjPStr* id, AjPStr* acc,
18663                        AjPStr* sv, AjPStr* desc)
18664 {
18665     AjBool ok = ajFalse;
18666 
18667     ajDebug("ajSeqParseFasta '%S'\n", instr);
18668 
18669     if(!ajStrPrefixC(instr, ">"))
18670         return ajFalse;
18671 
18672     ajStrTokenAssignC(&seqHandle, instr, "> ");
18673     ajStrTokenNextParseC(seqHandle, " \t\n\r", id);
18674 
18675     ok = ajStrTokenNextParse(seqHandle, &seqToken);
18676     ajStrAssignS(&seqToken2, seqToken);
18677     ajStrRemoveSetC(&seqToken2, "()");
18678 
18679     if(ok && ajSeqtestIsSeqversion(seqToken2))
18680     {
18681         ajStrAssignS(acc, ajSeqtestIsSeqversion(seqToken2));
18682         ajStrAssignS(sv, seqToken2);
18683         ajStrTokenNextParseC(seqHandle, "\n\r", desc);
18684     }
18685     else if(ok && ajSeqtestIsAccession(seqToken2))
18686     {
18687         ajStrAssignS(acc, seqToken2);
18688         ajStrAssignClear(sv);
18689         ajStrTokenNextParseC(seqHandle, "\n\r", desc);
18690     }
18691     else if(ok)
18692     {
18693         ajStrAssignClear(acc);
18694         ajStrAssignClear(sv);
18695         ajStrAssignS(desc, seqToken);
18696 
18697         if(ajStrTokenNextParseC(seqHandle, "\n\r", &seqToken))
18698         {
18699             ajStrAppendC(desc, " ");
18700             ajStrAppendS(desc, seqToken);
18701         }
18702     }
18703 
18704     ajStrDelStatic(&seqToken); /* duplicate of accession or description */
18705     ajStrDelStatic(&seqToken2);
18706     ajStrTokenReset(seqHandle);
18707 
18708     ajDebug("result id: '%S' acc: '%S' desc: '%S'\n", *id, *acc, *desc);
18709 
18710     return ajTrue;
18711 }
18712 
18713 
18714 
18715 
18716 /* @func ajSeqParseNcbi *******************************************************
18717 **
18718 ** Parse an NCBI format fasta line. Return id acc and description.
18719 **
18720 ** Tries to cope with the amazing variety of identifiers NCBI inflicts
18721 ** on us all - see the BLAST document README.formatdb from NCBI for
18722 ** some of the gory detail, and look at some real files for clues
18723 ** to what can really happen. Sadly,'real files' also includes
18724 ** internal IDs in blast databases reformatted by formatdb.
18725 **
18726 ** @param [r] instr [const AjPStr]   fasta line.
18727 ** @param [w] id [AjPStr*]   id.
18728 ** @param [w] acc [AjPStr*]  accession number.
18729 ** @param [w] sv [AjPStr*]  sequence version number.
18730 ** @param [w] gi [AjPStr*]  GI version number.
18731 ** @param [w] db [AjPStr*]  NCBI database name
18732 ** @param [w] desc [AjPStr*] description.
18733 ** @return [AjBool] ajTrue if ncbi format
18734 **
18735 ** @release 1.0.0
18736 ** @@
18737 ******************************************************************************/
18738 
ajSeqParseNcbi(const AjPStr instr,AjPStr * id,AjPStr * acc,AjPStr * sv,AjPStr * gi,AjPStr * db,AjPStr * desc)18739 AjBool ajSeqParseNcbi(const AjPStr instr, AjPStr* id, AjPStr* acc,
18740                       AjPStr* sv, AjPStr* gi, AjPStr* db, AjPStr* desc)
18741 {
18742     AjPStr idstr       = NULL;
18743     AjPStr reststr     = NULL;
18744     AjPStr prefix      = NULL;
18745     AjPStr numtoken    = NULL;
18746     AjPStr str         = NULL;
18747     const AjPStr vacc  = NULL;
18748     const char *q;
18749     ajuint  i;
18750     ajuint  nt;
18751     AjBool ret = ajFalse;
18752 
18753     ajStrAssignClear(db);
18754 
18755     /* NCBI's list of standard identifiers June 2001
18756     ** ftp://ncbi.nlm.nih.gov/blast/db/README.formatdb
18757     **
18758     ** Database Name                         Identifier Syntax
18759     **
18760     ** GenBank                               gb|accession|locus
18761     ** EMBL Data Library                     emb|accession|locus
18762     ** DDBJ, DNA Database of Japan           dbj|accession|locus
18763     ** SWISS-PROT                            sp|accession|entry name
18764     ** NCBI Reference Sequence               ref|accession|locus
18765     **
18766     ** General database identifier           gnl|database|identifier
18767     ** BLAST formatdb                        gnl|BL_ORD_ID|number
18768     **   (prefix for normal FASTA header - remove)
18769     **
18770     ** NBRF PIR                              pir||entry
18771     ** Protein Research Foundation           prf||name
18772     **   (Japanese SEQDB protein DB)
18773     **
18774     ** Brookhaven Protein Data Bank          pdb|entry|chain
18775     **
18776     ** Patents                               pat|country|number
18777     **
18778     ** GenInfo Backbone Id                   bbs|number
18779     ** Local Sequence identifier             lcl|identifier
18780     **
18781     ** GenInfo identifier prefix             gi|gi_identifier
18782     **   (prefix - remove)
18783     */
18784 
18785 /*    ajDebug("ajSeqParseNcbi '%S'\n", instr);*/
18786 
18787     if(ajStrGetCharPos(instr, 3) == ';')  /* then it is really PIR format */
18788     {
18789         ajDebug("ajSeqParseNcbi failed: this is PIR format\n");
18790 
18791         return ajFalse;
18792     }
18793 
18794     ajStrAssignS(&str, instr);
18795 
18796     /* ajDebug("id test %B %B\n",
18797        !strchr(MAJSTRGETPTR(str), (ajint)'|'),
18798        (*MAJSTRGETPTR(str)!='>')); */
18799 
18800     /* Line must start with '>', and include '|' bar, hopefully in the ID */
18801 
18802     if(*MAJSTRGETPTR(str)!='>')
18803     {
18804         ajDebug("ajSeqParseNcbi failed: no '>' at start\n");
18805         ajStrDel(&str);
18806 
18807         return ajFalse;
18808     }
18809 
18810     /* pick out the ID */
18811 
18812     ajStrTokenAssignC(&seqHandle2,str,"> \t\r\n");
18813     ajStrTokenNextParse(seqHandle2, &idstr);
18814     ajStrTokenNextParseC(seqHandle2, "\r\n", &reststr);
18815     ajStrTokenReset(seqHandle2);
18816 
18817     /* check we have an ID */
18818 
18819     if(!ajStrGetLen(idstr))
18820     {
18821         ajDebug("No ID string found - but try FASTA\n");
18822         ret = ajSeqParseFasta(str, id, acc, sv, desc);
18823         ajStrDel(&str);
18824         ajStrDel(&idstr);
18825         ajStrDel(&reststr);
18826 
18827         return ret;
18828     }
18829 
18830     /* NCBI ids always have | somewhere. Else we try a simple FASTA format */
18831 
18832     if(!strchr(MAJSTRGETPTR(idstr),(ajint)'|'))
18833     {
18834         ajDebug("trying ajSeqParseFasta\n");
18835         ret = ajSeqParseFasta(str, id, acc, sv, desc);
18836         ajStrDel(&str);
18837         ajStrDel(&idstr);
18838         ajStrDel(&reststr);
18839 
18840         return ret;
18841     }
18842 
18843     ajStrAssignClear(id);
18844     ajStrTokenAssignC(&seqHandle,idstr,"|");
18845 
18846     ajStrTokenNextParse(seqHandle, &prefix);
18847     q = MAJSTRGETPTR(prefix);
18848 
18849 /*
18850 //  ajDebug(" idstr: '%S'\n", idstr);
18851 //    ajDebug("prefix: '%S'\n", prefix);
18852 */
18853 
18854     if(!strncmp(q,"gi",2))
18855     {
18856         /* ajDebug("gi prefix\n"); */
18857         ajStrTokenNextParse(seqHandle, gi);
18858 
18859         if(! ajStrTokenNextParse(seqHandle, &prefix))
18860         {
18861             /* we only have a gi prefix */
18862             ajDebug("*only* gi prefix\n");
18863             ajStrAssignS(id, *gi);
18864             ajStrAssignClear(acc);
18865             ajStrAssignS(desc, reststr);
18866             ajDebug("found pref: '%S' id: '%S', acc: '%S' "
18867                     "desc: '%S'\n",
18868                     prefix, *id, *acc, *desc);
18869             ajStrDel(&str);
18870             ajStrDel(&idstr);
18871             ajStrDel(&reststr);
18872             ajStrDel(&prefix);
18873             ajStrDelStatic(&seqToken);
18874             ajStrTokenReset(seqHandle);
18875 
18876             return ajTrue;
18877         }
18878 
18879         /* otherwise we continue to parse the rest */
18880         q = MAJSTRGETPTR(prefix);
18881         ajDebug("continue with '%S'\n", prefix);
18882     }
18883 
18884 
18885     /*
18886      * This next routine and associated function could be used if
18887      * whatever is appended to gnl lines is consistent
18888      */
18889 
18890     if(!strncmp(MAJSTRGETPTR(idstr),"gnl|BL_ORD_ID|",14))
18891     {
18892         /* ajDebug("gnl|BL_ORD_ID stripping\n"); */
18893         ajStrTokenStep(seqHandle); /* BL_ORD_ID */
18894         ajStrTokenStep(seqHandle); /* number */
18895         ajStrInsertC(&reststr, 0, ">");
18896         ajStrTokenReset(seqHandle);
18897 
18898         if(ajSeqParseNcbi(reststr,id,acc,sv,gi,db,desc))
18899         {
18900             ajStrAssignEmptyC(db, "BL_ORD_ID");
18901             /* recursive ... */
18902             ajDebug("ajSeqParseNcbi recursive success '%S'\n", reststr);
18903             /* ajDebug("found pref: '%S' id: '%S', acc: '%S' "
18904                "sv: '%S' desc: '%S'\n",
18905                prefix, *id, *acc, *sv, *desc); */
18906             ajStrDel(&str);
18907             ajStrDel(&idstr);
18908             ajStrDel(&reststr);
18909             ajStrDel(&prefix);
18910             ajStrDel(&numtoken);
18911             ajStrDelStatic(&seqToken);
18912             ajStrTokenReset(seqHandle);
18913 
18914             return ajTrue;
18915         }
18916         ajDebug("ajSeqParseNcbi recursive failed '%S' - use gnl id\n",
18917                 reststr);
18918         ajStrAssignS(id,numtoken);
18919         ajStrAssignClear(acc);
18920         /* ajDebug("found pref: '%S' id: '%S', acc: '%S' "
18921            "sv: '%S' desc: '%S'\n",
18922            prefix, *id, *acc, *sv, *desc); */
18923         ajStrDel(&str);
18924         ajStrDel(&idstr);
18925         ajStrDel(&reststr);
18926         ajStrDel(&prefix);
18927         ajStrDel(&numtoken);
18928         ajStrDelStatic(&seqToken);
18929         ajStrTokenDel(&seqHandle);
18930 
18931         return ajTrue;
18932     }
18933 
18934     /* works for NCBI formatdb reformatted blast databases
18935     ** still checking for any mis-formatted databases elsewhere */
18936 
18937     if(!strcmp(q,"bbs") || !strcmp(q,"lcl"))
18938     {
18939         if(!strcmp(q, "lcl"))
18940             ajStrAssignS(db, prefix);
18941 
18942         /* ajDebug("bbs or lcl prefix\n"); */
18943         ajStrTokenNextParse(seqHandle, id);
18944         ajStrAssignClear(acc);
18945         ajStrAssignS(desc, reststr);
18946         /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
18947            prefix, *id, *acc, *desc); */
18948         ajStrDel(&str);
18949         ajStrDel(&idstr);
18950         ajStrDel(&reststr);
18951         ajStrDel(&prefix);
18952         ajStrDel(&numtoken);
18953         ajStrDelStatic(&seqToken);
18954         ajStrTokenReset(seqHandle);
18955 
18956         return ajTrue;
18957     }
18958 
18959     if(!strcmp(q,"gnl") || !strcmp(q,"pat"))
18960     {
18961         /* ajDebug("gnl or pat prefix\n"); */
18962         if(!strcmp(q,"gnl"))
18963             ajStrTokenNextParse(seqHandle, db);
18964         else
18965             ajStrTokenStep(seqHandle);
18966 
18967         ajStrTokenNextParse(seqHandle, id);
18968         ajStrAssignClear(acc);          /* no accession number */
18969         ajStrAssignS(desc, reststr);
18970         /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
18971            prefix, *id, *acc, *desc); */
18972         ajStrDel(&str);
18973         ajStrDel(&idstr);
18974         ajStrDel(&reststr);
18975         ajStrDel(&prefix);
18976         ajStrDel(&numtoken);
18977         ajStrDel(&seqToken);
18978         ajStrTokenReset(seqHandle);
18979 
18980         return ajTrue;
18981     }
18982 
18983 
18984     if(!strcmp(q,"pdb"))
18985     {
18986         ajStrAssignS(db, prefix);
18987         /* ajDebug("gnl or pat or pdb prefix\n"); */
18988         ajStrTokenNextParse(seqHandle, id);
18989 
18990         if(ajStrTokenNextParse(seqHandle, &seqToken))
18991         {
18992             /* chain identifier to append */
18993             ajStrAppendS(id, seqToken);
18994         }
18995 
18996         ajStrAssignClear(acc);          /* no accession number */
18997         ajStrAssignS(desc, reststr);
18998         /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
18999            prefix, *id, *acc, *desc); */
19000         ajStrDel(&str);
19001         ajStrDel(&idstr);
19002         ajStrDel(&reststr);
19003         ajStrDel(&prefix);
19004         ajStrDel(&numtoken);
19005         ajStrDelStatic(&seqToken);
19006         ajStrTokenReset(seqHandle);
19007 
19008         return ajTrue;
19009     }
19010 
19011 
19012     if(!strcmp(q,"gb") || !strcmp(q,"emb") || !strcmp(q,"dbj")
19013        || !strcmp(q,"tpd") || !strcmp(q,"tpd") || !strcmp(q,"tpg")
19014        || !strcmp(q,"sp") || !strcmp(q,"ref"))
19015     {
19016         /* ajDebug("gb,emb,dbj,sp,ref prefix\n"); */
19017         ajStrAssignS(db, prefix);
19018         ajStrTokenNextParse(seqHandle, &seqToken);
19019         vacc = ajSeqtestIsSeqversion(seqToken);
19020 
19021         if(vacc)
19022         {
19023             ajStrAssignS(sv,seqToken);
19024             ajStrAssignS(acc,vacc);
19025         }
19026         else if(ajSeqtestIsAccession(seqToken))
19027             ajStrAssignS(acc,seqToken);
19028 
19029         if(!ajStrTokenNextParse(seqHandle, id))
19030         {
19031             /* no ID, reuse accession token */
19032             ajStrAssignS(id, seqToken);
19033         }
19034 
19035         ajStrAssignS(desc, reststr);
19036         /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
19037            prefix, *id, *acc, *desc); */
19038         ajStrDel(&str);
19039         ajStrDel(&idstr);
19040         ajStrDel(&reststr);
19041         ajStrDel(&prefix);
19042         ajStrDel(&numtoken);
19043         ajStrDelStatic(&seqToken);
19044         ajStrTokenReset(seqHandle);
19045 
19046         return ajTrue;
19047     }
19048 
19049 
19050     if(!strcmp(q,"pir") || !strcmp(q,"prf"))
19051     {
19052         ajStrAssignS(db, prefix);
19053         /* ajDebug("pir,prf prefix\n"); */
19054         ajStrTokenNextParse(seqHandle, id);
19055         ajStrAssignS(desc, reststr);
19056         ajStrAssignClear(acc);
19057         /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
19058            prefix, *id, *acc, *desc); */
19059         ajStrDel(&str);
19060         ajStrDel(&idstr);
19061         ajStrDel(&reststr);
19062         ajStrDel(&prefix);
19063         ajStrDel(&numtoken);
19064         ajStrDelStatic(&seqToken);
19065         ajStrTokenReset(seqHandle);
19066 
19067         return ajTrue;
19068     }
19069 
19070 
19071     /* else assume that the last two barred tokens contain [acc]|id */
19072 
19073     ajDebug("No prefix accepted - try the last 2 fields\n");
19074 
19075     nt = ajStrParseCountC(idstr,"|");
19076 
19077     if(ajStrGetCharLast(idstr) == '|')
19078         nt++;
19079 
19080     ajDebug("Barred tokens - %d found\n", nt);
19081 
19082     if(nt < 2)
19083     {
19084         ajStrDel(&str);
19085         ajStrDel(&idstr);
19086         ajStrDel(&reststr);
19087         ajStrDel(&prefix);
19088         ajStrDel(&numtoken);
19089         ajStrDelStatic(&seqToken);
19090         ajStrTokenReset(seqHandle);
19091 
19092         return ajFalse;
19093     }
19094 
19095     /* restart parsing with only bars */
19096 
19097     ajStrTokenAssignC(&seqHandle,idstr,"|");
19098 
19099     for(i=0;i<nt-3;++i)
19100         ajStrTokenStep(seqHandle);
19101 
19102     ajStrTokenNextParse(seqHandle, &seqToken);
19103 
19104     ajStrAssignS(db, seqToken);
19105     ajStrTokenNextParse(seqHandle, &seqToken);
19106     ajDebug("token acc: '%S'\n", seqToken);
19107     vacc = ajSeqtestIsSeqversion(seqToken);
19108 
19109     if(vacc)
19110     {
19111         ajStrAssignS(sv,seqToken);
19112         ajStrAssignS(acc,vacc);
19113         ajStrAssignS(id,vacc);
19114     }
19115     else if(ajSeqtestIsAccession(seqToken))
19116     {
19117         ajStrAssignS(acc,seqToken);
19118         ajStrAssignS(id,seqToken);
19119     }
19120     else
19121     {
19122         ajStrAssignS(id,seqToken);
19123     }
19124 
19125 
19126     if(ajStrTokenNextParseC(seqHandle, " \n\t\r", &seqToken))
19127     {
19128         ajDebug("token id: '%S'\n", seqToken);
19129 
19130         if(ajStrGetLen(seqToken))
19131             ajStrAssignS(id,seqToken);
19132     }
19133 
19134     ajStrTokenStepC(seqHandle, "\n\r");
19135     ajStrAssignS(desc, reststr);
19136     ajStrTokenReset(seqHandle);
19137     ajStrDelStatic(&seqToken);
19138     /* ajDebug("found pref: '%S' id: '%S', acc: '%S' desc: '%S'\n",
19139        prefix, *id, *acc, *desc); */
19140 
19141     ajStrDel(&str);
19142     ajStrDel(&idstr);
19143     ajStrDel(&reststr);
19144     ajStrDel(&prefix);
19145     ajStrDel(&numtoken);
19146 
19147     return ajTrue;
19148 }
19149 
19150 
19151 
19152 
19153 /* @func ajSeqParseFastq ******************************************************
19154 **
19155 ** Parse a fastq id line. Return id acc sv and description
19156 **
19157 ** @param [r] instr [const AjPStr]   fastq line.
19158 ** @param [w] id [AjPStr*]   id.
19159 ** @param [w] desc [AjPStr*] description.
19160 ** @return [AjBool] ajTrue if fastq format
19161 **
19162 ** @release 6.1.0
19163 ** @@
19164 ******************************************************************************/
19165 
ajSeqParseFastq(const AjPStr instr,AjPStr * id,AjPStr * desc)19166 AjBool ajSeqParseFastq(const AjPStr instr, AjPStr* id, AjPStr* desc)
19167 {
19168     AjPStr str       = NULL;
19169 
19170     /*ajDebug("ajSeqParseFastq '%S'\n", instr);*/
19171 
19172     if(!ajStrPrefixC(instr, "@"))
19173         return ajFalse;
19174 
19175     ajStrExtractWord(instr, desc, &str);
19176     ajStrTrimC(desc, "\n");
19177     ajStrAssignSubS(id, str, 1, -1);
19178 
19179     ajStrDel(&str);
19180     return ajTrue;
19181 }
19182 
19183 
19184 
19185 
19186 /* @func ajSeqGetFromUsaRange *************************************************
19187 **
19188 ** Returns a sequence given a USA
19189 **
19190 ** @param [r] thys [const AjPStr] USA
19191 ** @param [r] protein [AjBool] True if protein
19192 ** @param [r] ibegin [ajint] sequence start position
19193 ** @param [r] iend [ajint] sequence end position
19194 ** @param [u] seq [AjPSeq] sequence
19195 ** @return [AjBool] ajTrue on success
19196 **
19197 ** @release 6.4.0
19198 ** @@
19199 ******************************************************************************/
19200 
ajSeqGetFromUsaRange(const AjPStr thys,AjBool protein,ajint ibegin,ajint iend,AjPSeq seq)19201 AjBool ajSeqGetFromUsaRange(const AjPStr thys, AjBool protein,
19202                             ajint ibegin, ajint iend, AjPSeq seq)
19203 {
19204     AjPSeqin seqin;
19205     AjBool ok;
19206 
19207     seqin = NULL;
19208 
19209     ajSeqinUsa(&seqin, thys);
19210 
19211     if(ibegin!=0 || iend!=0)
19212         ajSeqinSetRange(seqin, ibegin, iend);
19213 
19214     seqin->Input->Multi = ajFalse;
19215     seqin->Input->Text  = ajFalse;
19216 
19217     if(!protein)
19218         ajSeqinSetNuc(seqin);
19219     else
19220         ajSeqinSetProt(seqin);
19221 
19222     ok = ajSeqRead(seq, seqin);
19223     ajSeqinDel(&seqin);
19224 
19225     if(!ok)
19226         return ajFalse;
19227 
19228     return ajTrue;
19229 }
19230 
19231 
19232 
19233 
19234 /* @func ajSeqGetFromUsa ******************************************************
19235 **
19236 ** Returns a sequence given a USA
19237 **
19238 ** @param [r] thys [const AjPStr] USA
19239 ** @param [r] protein [AjBool] True if protein
19240 ** @param [u] seq [AjPSeq] sequence
19241 ** @return [AjBool] ajTrue on success
19242 **
19243 ** @release 1.8.0
19244 ** @@
19245 ******************************************************************************/
19246 
ajSeqGetFromUsa(const AjPStr thys,AjBool protein,AjPSeq seq)19247 AjBool ajSeqGetFromUsa(const AjPStr thys, AjBool protein, AjPSeq seq)
19248 {
19249 
19250     return ajSeqGetFromUsaRange(thys, protein, 0, 0, seq);
19251 }
19252 
19253 
19254 
19255 
19256 /* @func ajSeqsetGetFromUsa ***************************************************
19257 **
19258 ** Return a seqset given a usa
19259 **
19260 ** @param [r] thys [const AjPStr] usa
19261 ** @param [w] seq [AjPSeqset*] seqset
19262 ** @return [AjBool] ajTrue on success
19263 **
19264 ** @release 2.7.0
19265 ******************************************************************************/
19266 
ajSeqsetGetFromUsa(const AjPStr thys,AjPSeqset * seq)19267 AjBool ajSeqsetGetFromUsa(const AjPStr thys, AjPSeqset *seq)
19268 {
19269     AjPSeqin seqin;
19270     AjBool ok;
19271 
19272     seqin        = ajSeqinNew();
19273     seqin->Input->Multi = ajTrue;
19274     seqin->Input->Text  = ajFalse;
19275 
19276     ajSeqinUsa(&seqin, thys);
19277     ok = ajSeqsetRead(*seq, seqin);
19278     ajSeqinDel(&seqin);
19279 
19280     if(!ok)
19281         return ajFalse;
19282 
19283     return ajTrue;
19284 }
19285 
19286 
19287 
19288 
19289 /* @funcstatic seqTextSeq *****************************************************
19290 **
19291 ** Saves a sequence from a string into the text output pointer
19292 **
19293 ** Could do some extra formatting here (left margin, numbering)
19294 ** but as the EMBOSS formats are not too fussy that can wait.
19295 **
19296 ** @param [w] textptr [AjPStr*] Text output
19297 ** @param [r] seq [const AjPStr] sequence as a string
19298 ** @return [void]
19299 **
19300 ** @release 2.4.0
19301 ******************************************************************************/
19302 
seqTextSeq(AjPStr * textptr,const AjPStr seq)19303 static void seqTextSeq(AjPStr* textptr, const AjPStr seq)
19304 {
19305     ajuint i;
19306     ajuint istart;
19307     ajuint iend;
19308     ajuint ilen;
19309     ajuint iwidth;
19310     AjPStr tmpstr = NULL;
19311 
19312     ilen = ajStrGetLen(seq);
19313     iwidth = 60;
19314 
19315     for(i=0; i < ilen; i += iwidth)
19316     {
19317         istart = i;
19318         iend = AJMIN(ilen-1, istart+iwidth-1);
19319         ajStrAssignSubS(&tmpstr, seq, istart, iend);
19320         ajFmtPrintAppS(textptr, "%S\n", tmpstr);
19321     }
19322 
19323     ajStrDel(&tmpstr);
19324 
19325     return;
19326 }
19327 
19328 
19329 
19330 
19331 /* @func ajSeqReadExit ********************************************************
19332 **
19333 ** Cleans up sequence reading internal memory
19334 **
19335 ** @return [void]
19336 **
19337 ** @release 4.0.0
19338 ** @@
19339 ******************************************************************************/
19340 
ajSeqReadExit(void)19341 void ajSeqReadExit(void)
19342 {
19343     /* USA processing regular expressions */
19344 
19345     ajRegFree(&seqRegUsaAsis);
19346     ajRegFree(&seqRegUsaDb);
19347     ajRegFree(&seqRegUsaFmt);
19348     ajRegFree(&seqRegUsaId);
19349     ajRegFree(&seqRegUsaList);
19350     ajRegFree(&seqRegUsaRange);
19351     ajRegFree(&seqRegUsaWild);
19352 
19353     /* sequence reading regular expressions */
19354 
19355     ajRegFree(&seqRegTreeconTop);
19356     ajRegFree(&seqRegMegaCommand);
19357     ajRegFree(&seqRegMegaFeat);
19358     ajRegFree(&seqRegMegaSeq);
19359     ajRegFree(&seqRegJackTop);
19360     ajRegFree(&seqRegJackSeq);
19361     ajRegFree(&seqRegGffTyp);
19362     ajRegFree(&seqRegGff3Typ);
19363     ajRegFree(&seqRegGcgDot);
19364     ajRegFree(&seqRegGcgChk);
19365     ajRegFree(&seqRegGcgLen);
19366     ajRegFree(&seqRegGcgNam);
19367     ajRegFree(&seqRegGcgTyp);
19368     ajRegFree(&seqRegGcgMsf);
19369     ajRegFree(&seqRegGcgMsflen);
19370     ajRegFree(&seqRegGcgMsfnam);
19371     ajRegFree(&seqRegGcgWgt);
19372     ajRegFree(&seqRegNbrfId);
19373     ajRegFree(&seqRegStadenId);
19374     ajRegFree(&seqRegHennigBlank);
19375     ajRegFree(&seqRegHennigSeq);
19376     ajRegFree(&seqRegHennigTop);
19377     ajRegFree(&seqRegHennigHead);
19378     ajRegFree(&seqRegFitchHead);
19379     ajRegFree(&seqRegStockholmSeq);
19380     ajRegFree(&seqRegAbiDots);
19381     ajRegFree(&seqRegRawNonseq);
19382     ajRegFree(&seqRegMaseHead);
19383     ajRegFree(&seqRegPhylipTop);
19384     ajRegFree(&seqRegPhylipHead);
19385     ajRegFree(&seqRegPhylipSeq);
19386     ajRegFree(&seqRegPhylipSeq2);
19387 
19388     /* sequence reading strings */
19389     ajStrDel(&seqFtFmtEmbl);
19390     ajStrDel(&seqFtFmtGenbank);
19391     ajStrDel(&seqFtFmtRefseq);
19392     ajStrDel(&seqFtFmtRefseqp);
19393     ajStrDel(&seqFtFmtGff);
19394     ajStrDel(&seqFtFmtPir);
19395     ajStrDel(&seqFtFmtSwiss);
19396     ajStrDel(&seqUsaTest);
19397     ajStrDel(&seqQryChr);
19398     ajStrDel(&seqQryDb);
19399     ajStrDel(&seqQryList);
19400     ajStrDel(&seqAppendRestStr);
19401     ajStrDel(&seqAppendTmpSeq);
19402     ajStrDel(&seqQualStr);
19403 
19404     ajStrDel(&seqReadLine);
19405     ajStrDel(&seqSaveLine);
19406     ajStrDel(&seqSaveLine2);
19407 
19408     ajTableDel(&seqDbMethods);
19409 
19410     AJFREE(seqAppendFilter);
19411 
19412     ajStrTokenDel(&seqHandle);
19413     ajStrTokenDel(&seqHandle2);
19414     ajStrTokenDel(&seqHandleSplit);
19415 
19416     ajStrDel(&seqName);
19417     ajStrDel(&seqChain);
19418     ajStrDel(&seqToken);
19419     ajStrDel(&seqToken2);
19420     ajStrDel(&seqTokenSplit);
19421     ajStrDel(&seqAppendTmpstr);
19422 
19423     return;
19424 }
19425 
19426 
19427 
19428 
19429 /* @section Internals *********************************************************
19430 **
19431 ** Functions to return internal values
19432 **
19433 ** @nam3rule Type Internals for sequence datatype
19434 ** @nam4rule Get  Return a value
19435 ** @nam5rule Fields  Known query fields for ajSeqRead
19436 ** @nam5rule Qlinks  Known query link operators for ajSeqRead
19437 **
19438 ** @valrule * [const char*] Internal value
19439 **
19440 ** @fcategory misc
19441 **
19442 ******************************************************************************/
19443 
19444 
19445 
19446 
/* @func ajSeqinTypeGetFields *************************************************
**
** Returns the list of known field names for ajSeqinRead
**
** The names are returned as one constant string, separated by
** single spaces.
**
** @return [const char*] List of field names
**
** @release 6.4.0
** @@
******************************************************************************/

const char* ajSeqinTypeGetFields(void)
{
    static const char seqinFields[] = "id acc sv gi des key org";

    return seqinFields;
}
19461 
19462 
19463 
19464 
/* @func ajSeqinTypeGetQlinks *************************************************
**
** Returns the list of known query link operators for ajSeqRead
**
** The operators are returned as one constant string with one
** character per operator.
**
** @return [const char*] List of query link operator characters
**
** @release 6.4.0
** @@
******************************************************************************/

const char* ajSeqinTypeGetQlinks(void)
{
    static const char seqinQlinks[] = "|&!^=";

    return seqinQlinks;
}
19479 
19480 
19481 
19482 
19483 /* @func ajSeqinTrace *********************************************************
19484 **
19485 ** Debug calls to trace the data in a sequence input object.
19486 **
19487 ** @param [r] thys [const AjPSeqin] Sequence input object.
19488 ** @return [void]
19489 **
19490 ** @release 1.0.0
19491 ** @@
19492 ******************************************************************************/
19493 
ajSeqinTrace(const AjPSeqin thys)19494 void ajSeqinTrace(const AjPSeqin thys)
19495 {
19496     ajDebug("Sequence input trace\n");
19497     ajDebug( "====================\n\n");
19498     ajDebug( "  Name: '%S'\n", thys->Name);
19499 
19500     ajTextinTrace(thys->Input);
19501 
19502     if(ajStrGetLen(thys->Acc))
19503         ajDebug( "  Accession: '%S'\n", thys->Acc);
19504 
19505     if(ajStrGetLen(thys->Inputtype))
19506         ajDebug( "  Inputtype: '%S'\n", thys->Inputtype);
19507 
19508     if(ajStrGetLen(thys->Desc))
19509         ajDebug( "  Description: '%S'\n", thys->Desc);
19510 
19511     if(ajStrGetLen(thys->Inseq))
19512         ajDebug( "  Inseq len: %d\n", ajStrGetLen(thys->Inseq));
19513 
19514     if(thys->Rev)
19515         ajDebug( "     Rev: %B\n", thys->Rev);
19516 
19517     if(thys->Begin)
19518         ajDebug( "   Begin: %d\n", thys->Begin);
19519 
19520     if(thys->End)
19521         ajDebug( "     End: %d\n", thys->End);
19522 
19523     if(ajStrGetLen(thys->Full))
19524         ajDebug( "  Full name: '%S'\n", thys->Full);
19525 
19526     if(ajStrGetLen(thys->Date))
19527         ajDebug( "  Date: '%S'\n", thys->Date);
19528 
19529     if(ajStrGetLen(thys->Ufo))
19530         ajDebug( "  Ufo: '%S'\n", thys->Ufo);
19531 
19532     if(thys->Fttable)
19533         ajDebug( "  Fttable: exists\n");
19534 
19535     if(thys->Ftquery)
19536         ajDebug( "  Ftquery: exists\n");
19537 
19538     if(ajStrGetLen(thys->Entryname))
19539         ajDebug( "  Entryname: '%S'\n", thys->Entryname);
19540 
19541     if(ajStrGetLen(thys->DbSequence))
19542         ajDebug( "  DbSequence: '%S'\n", thys->DbSequence);
19543 
19544     if(thys->Features)
19545         ajDebug( "  Features: %B\n", thys->Features);
19546 
19547     if(thys->IsNuc)
19548         ajDebug( "  IsNuc: %B\n", thys->IsNuc);
19549 
19550     if(thys->IsProt)
19551         ajDebug( "  IsProt: %B\n", thys->IsProt);
19552 
19553     if(thys->SeqData)
19554         ajDebug( "  SeqData: exists\n");
19555 
19556     if(ajStrGetLen(thys->Doc))
19557         ajDebug( "  Documentation:...\n%S\n", thys->Doc);
19558 
19559     return;
19560 }
19561 
19562 
19563 
19564 
19565 /* @funcstatic stockholmNew ***************************************************
19566 **
19567 ** Creates and initialises a Stockholm object.
19568 **
19569 ** @param [r] i [ajuint] Number of sequences
19570 ** @return [SeqPStockholm] New sequence object.
19571 **
19572 ** @release 4.0.0
19573 ** @@
19574 ******************************************************************************/
19575 
stockholmNew(ajuint i)19576 static SeqPStockholm stockholmNew(ajuint i)
19577 {
19578     SeqPStockholm thys = NULL;
19579 
19580     AJNEW0(thys);
19581 
19582     thys->id  = ajStrNew();
19583     thys->ac  = ajStrNew();
19584     thys->de  = ajStrNew();
19585     thys->au  = ajStrNew();
19586     thys->al  = ajStrNew();
19587     thys->tp  = ajStrNew();
19588     thys->se  = ajStrNew();
19589     thys->bm  = ajStrNew();
19590     thys->dc  = ajStrNew();
19591     thys->dr  = ajStrNew();
19592     thys->cc  = ajStrNew();
19593     thys->gs  = ajStrNew();
19594     thys->ref = ajStrNew();
19595     thys->sacons  = ajStrNew();
19596     thys->sqcons  = ajStrNew();
19597     thys->sscons  = ajStrNew();
19598 
19599     thys->n = i;
19600 
19601     AJCNEW0(thys->name,i);
19602     AJCNEW0(thys->str,i);
19603 
19604     for(i=0;i<thys->n;++i)
19605     {
19606         thys->name[i] = ajStrNew();
19607         thys->str[i]  = ajStrNew();
19608     }
19609 
19610     return thys;
19611 }
19612 
19613 
19614 
19615 
19616 /* #funcstatic stockholmdataNew ***********************************************
19617 **
19618 ** Creates and initialises a Stockholm data object.
19619 **
19620 ** #return [SeqPStockholmdata] New sequence object.
19621 ** ##
19622 ******************************************************************************/
19623 
19624 /*static SeqPStockholmdata stockholmdataNew(void)
19625   {
19626   SeqPStockholmdata thys = NULL;
19627 
19628   AJNEW0(thys);
19629 
19630   thys->id  = ajStrNew();
19631   thys->ac  = ajStrNew();
19632   thys->de  = ajStrNew();
19633   thys->au  = ajStrNew();
19634   thys->al  = ajStrNew();
19635   thys->tp  = ajStrNew();
19636   thys->se  = ajStrNew();
19637   thys->bm  = ajStrNew();
19638   thys->dc  = ajStrNew();
19639   thys->dr  = ajStrNew();
19640   thys->cc  = ajStrNew();
19641   thys->gs  = ajStrNew();
19642   thys->ref = ajStrNew();
19643   thys->sacons  = ajStrNew();
19644   thys->sqcons  = ajStrNew();
19645   thys->sscons  = ajStrNew();
19646 
19647   return thys;
19648   }*/
19649 
19650 
19651 
19652 
19653 /* @funcstatic stockholmDel ***************************************************
19654 **
19655 ** Deletes a Stockholm object.
19656 **
19657 ** @param [d] Pseq [SeqPStockholm*] Stockholm object
19658 ** @return [void]
19659 **
19660 ** @release 4.0.0
19661 ** @@
19662 ******************************************************************************/
19663 
stockholmDel(SeqPStockholm * Pseq)19664 static void stockholmDel(SeqPStockholm *Pseq)
19665 {
19666     SeqPStockholm pthis = NULL;
19667     ajuint i;
19668 
19669     if(!Pseq)
19670         return;
19671 
19672     pthis = *Pseq;
19673 
19674     if(!pthis)
19675         return;
19676 
19677     ajStrDel(&pthis->id);
19678     ajStrDel(&pthis->ac);
19679     ajStrDel(&pthis->de);
19680     ajStrDel(&pthis->au);
19681     ajStrDel(&pthis->al);
19682     ajStrDel(&pthis->tp);
19683     ajStrDel(&pthis->se);
19684     ajStrDel(&pthis->bm);
19685     ajStrDel(&pthis->dc);
19686     ajStrDel(&pthis->dr);
19687     ajStrDel(&pthis->cc);
19688     ajStrDel(&pthis->gs);
19689     ajStrDel(&pthis->ref);
19690     ajStrDel(&pthis->sacons);
19691     ajStrDel(&pthis->sqcons);
19692     ajStrDel(&pthis->sscons);
19693 
19694     for(i=0;i<pthis->n;++i)
19695     {
19696         ajStrDel(&pthis->name[i]);
19697         ajStrDel(&pthis->str[i]);
19698     }
19699 
19700     AJFREE(pthis->name);
19701     AJFREE(pthis->str);
19702     AJFREE(*Pseq);
19703 
19704     return;
19705 }
19706 
19707 
19708 
19709 
19710 /* #funcstatic stockholmdataDel ***********************************************
19711 **
19712 ** Deletes a Stockholm data object.
19713 **
19714 ** #param [d] Pseq [SeqPStockholmdata*] Stockholm object
19715 ** #return [void]
19716 ** ##
19717 ******************************************************************************/
19718 
19719 /*static void stockholmdataDel(SeqPStockholmdata *Pseq)
19720   {
19721   SeqPStockholmdata pthis = NULL;
19722 
19723   if(!Pseq)
19724   return;
19725   pthis = *Pseq;
19726   if(!pthis)
19727   return;
19728 
19729   ajStrDel(&pthis->id);
19730   ajStrDel(&pthis->ac);
19731   ajStrDel(&pthis->de);
19732   ajStrDel(&pthis->au);
19733   ajStrDel(&pthis->al);
19734   ajStrDel(&pthis->tp);
19735   ajStrDel(&pthis->se);
19736   ajStrDel(&pthis->bm);
19737   ajStrDel(&pthis->dc);
19738   ajStrDel(&pthis->dr);
19739   ajStrDel(&pthis->cc);
19740   ajStrDel(&pthis->gs);
19741   ajStrDel(&pthis->ref);
19742   ajStrDel(&pthis->sacons);
19743   ajStrDel(&pthis->sqcons);
19744   ajStrDel(&pthis->sscons);
19745 
19746   AJFREE(*Pseq);
19747 
19748   return;
19749   }*/
19750 
19751 
19752 
19753 
19754 /* @funcstatic selexNew *******************************************************
19755 **
19756 ** Creates and initialises a selex #=SQ line object.
19757 **
19758 ** @param [r] n [ajuint] Number of sequences
19759 ** @return [SeqPSelex] New sequence object.
19760 **
19761 ** @release 4.0.0
19762 ** @@
19763 ******************************************************************************/
19764 
selexNew(ajuint n)19765 static SeqPSelex selexNew(ajuint n)
19766 {
19767     SeqPSelex thys = NULL;
19768     ajuint    i;
19769 
19770     AJNEW0(thys);
19771     thys->id = ajStrNew();
19772     thys->ac = ajStrNew();
19773     thys->de = ajStrNew();
19774     thys->au = ajStrNew();
19775     thys->cs = ajStrNew();
19776     thys->rf = ajStrNew();
19777     thys->n  = n;
19778 
19779     AJCNEW(thys->name,n);
19780     AJCNEW(thys->str,n);
19781     AJCNEW(thys->ss,n);
19782     AJCNEW(thys->sq,n);
19783 
19784     for(i=0;i<n;++i)
19785     {
19786         thys->name[i] = ajStrNew();
19787         thys->str[i]  = ajStrNew();
19788         thys->ss[i]   = ajStrNew();
19789         thys->sq[i]   = selexseqNew();
19790     }
19791 
19792     return thys;
19793 }
19794 
19795 
19796 
19797 
19798 /* @funcstatic selexseqNew ****************************************************
19799 **
19800 ** Creates and initialises a selex #=SQ line object.
19801 **
19802 ** @return [SeqPSelexseq] New sequence object.
19803 **
19804 ** @release 4.0.0
19805 ** @@
19806 ******************************************************************************/
19807 
selexseqNew(void)19808 static SeqPSelexseq selexseqNew(void)
19809 {
19810     SeqPSelexseq thys = NULL;
19811 
19812     AJNEW0(thys);
19813 
19814     thys->name   = ajStrNew();
19815     thys->source = ajStrNew();
19816     thys->ac     = ajStrNew();
19817     thys->de     = ajStrNew();
19818 
19819     return thys;
19820 }
19821 
19822 
19823 
19824 
19825 /* #funcstatic selexdataNew ***************************************************
19826 **
19827 ** Creates and initialises a selex #=SQ line object.
19828 **
19829 ** #return [SeqPSelexdata] New sequence object.
19830 ** ##
19831 ******************************************************************************/
19832 
19833 /*static SeqPSelexdata selexdataNew(void)
19834   {
19835   SeqPSelexdata thys = NULL;
19836 
19837   AJNEW0(thys);
19838   thys->id = ajStrNew();
19839   thys->ac = ajStrNew();
19840   thys->de = ajStrNew();
19841   thys->au = ajStrNew();
19842   thys->cs = ajStrNew();
19843   thys->rf = ajStrNew();
19844 
19845   thys->name = ajStrNew();
19846   thys->str  = ajStrNew();
19847   thys->ss   = ajStrNew();
19848   thys->sq   = selexseqNew();
19849 
19850   return thys;
19851   }*/
19852 
19853 
19854 
19855 
19856 /* @funcstatic selexseqDel ****************************************************
19857 **
19858 ** Deletes a Selex object.
19859 **
19860 ** @param [d] Pseq [SeqPSelexseq*] Selex #=SQ object
19861 ** @return [void]
19862 ** **
19863 **
19864 ** @release 4.1.0
19865 ******************************************************************************/
19866 
selexseqDel(SeqPSelexseq * Pseq)19867 static void selexseqDel(SeqPSelexseq *Pseq)
19868 {
19869     SeqPSelexseq pthis;
19870 
19871     pthis = *Pseq;
19872 
19873     if(!Pseq || !pthis)
19874         return;
19875 
19876     ajStrDel(&pthis->name);
19877     ajStrDel(&pthis->source);
19878     ajStrDel(&pthis->ac);
19879     ajStrDel(&pthis->de);
19880 
19881     AJFREE(pthis);
19882     *Pseq = NULL;
19883 
19884     return;
19885 }
19886 
19887 
19888 
19889 
19890 /* @funcstatic selexDel *******************************************************
19891 **
19892 ** Deletes a Selex object.
19893 **
19894 ** @param [d] Pseq [SeqPSelex*] Selex object
19895 ** @return [void]
19896 **
19897 ** @release 4.1.0
19898 ** @@
19899 ******************************************************************************/
19900 
selexDel(SeqPSelex * Pseq)19901 static void selexDel(SeqPSelex *Pseq)
19902 {
19903     SeqPSelex pthis;
19904     ajuint    i;
19905     ajuint    n;
19906 
19907     pthis = *Pseq;
19908 
19909     if(!Pseq || !pthis)
19910         return;
19911 
19912     n = pthis->n;
19913 
19914     for(i=0;i<n;++i)
19915     {
19916         ajStrDel(&pthis->name[i]);
19917         ajStrDel(&pthis->str[i]);
19918         ajStrDel(&pthis->ss[i]);
19919         selexseqDel(&pthis->sq[i]);
19920     }
19921 
19922     if(n)
19923     {
19924         AJFREE(pthis->name);
19925         AJFREE(pthis->str);
19926         AJFREE(pthis->ss);
19927         AJFREE(pthis->sq);
19928     }
19929 
19930     ajStrDel(&pthis->id);
19931     ajStrDel(&pthis->ac);
19932     ajStrDel(&pthis->de);
19933     ajStrDel(&pthis->au);
19934     ajStrDel(&pthis->cs);
19935     ajStrDel(&pthis->rf);
19936 
19937     AJFREE(pthis);
19938     *Pseq = NULL;
19939 
19940     return;
19941 }
19942 
19943 
19944 
19945 
19946 /* #funcstatic selexdataDel ***************************************************
19947 **
19948 ** Deletes a Selex data object.
19949 **
19950 ** #param [d] Pseq [SeqPSelexdata*] Selex data object
19951 ** #return [void]
19952 ** ##
19953 ******************************************************************************/
19954 
19955 /*static void selexdataDel(SeqPSelexdata *Pseq)
19956   {
19957   SeqPSelexdata pthis;
19958 
19959   pthis = *Pseq;
19960 
19961   if(!Pseq || !pthis)
19962   return;
19963 
19964 
19965   ajStrDel(&pthis->name);
19966   ajStrDel(&pthis->str);
19967   ajStrDel(&pthis->ss);
19968   selexseqDel(&pthis->sq);
19969 
19970   ajStrDel(&pthis->id);
19971   ajStrDel(&pthis->ac);
19972   ajStrDel(&pthis->de);
19973   ajStrDel(&pthis->au);
19974   ajStrDel(&pthis->cs);
19975   ajStrDel(&pthis->rf);
19976 
19977   AJFREE(pthis);
19978   *Pseq = NULL;
19979 
19980   return;
19981   }*/
19982 
19983 
19984 
19985 
19986 /* #funcstatic seqSelexClone *************************************************
19987 **
19988 ** Clone a Selexdata object
19989 **
19990 ** #param [r] thys [const SeqPSelexdata] selex data object
19991 **
19992 ** #return [SeqPSelexdata] New selex data object.
19993 ** ##
19994 ******************************************************************************/
19995 
19996 /*static SeqPSelexdata seqSelexClone(const SeqPSelexdata thys)
19997   {
19998   SeqPSelexdata pthis;
19999 
20000   pthis = selexdataNew();
20001 
20002   ajStrAssignS(&pthis->id, thys->id);
20003   ajStrAssignS(&pthis->ac, thys->ac);
20004   ajStrAssignS(&pthis->de, thys->de);
20005   ajStrAssignS(&pthis->au, thys->au);
20006   ajStrAssignS(&pthis->cs, thys->cs);
20007   ajStrAssignS(&pthis->rf, thys->rf);
20008   ajStrAssignS(&pthis->name, thys->name);
20009   ajStrAssignS(&pthis->str, thys->str);
20010   ajStrAssignS(&pthis->ss, thys->ss);
20011 
20012   pthis->ga[0] = thys->ga[0];
20013   pthis->ga[1] = thys->ga[1];
20014   pthis->tc[0] = thys->tc[0];
20015   pthis->tc[1] = thys->tc[1];
20016   pthis->nc[0] = thys->nc[0];
20017   pthis->nc[1] = thys->nc[1];
20018 
20019   ajStrAssignS(&pthis->sq->name, thys->sq->name);
20020   ajStrAssignS(&pthis->sq->source, thys->sq->source);
20021   ajStrAssignS(&pthis->sq->ac, thys->sq->ac);
20022   ajStrAssignS(&pthis->sq->de, thys->sq->de);
20023 
20024   pthis->sq->wt    = thys->sq->wt;
20025   pthis->sq->start = thys->sq->start;
20026   pthis->sq->stop  = thys->sq->stop;
20027   pthis->sq->len   = thys->sq->len;
20028 
20029 
20030   return pthis;
20031   }*/
20032 
20033 
20034 
20035 
20036 /* @funcstatic seqDefine ******************************************************
20037 **
20038 ** Make sure all sequence object attributes are defined
20039 ** using values from the sequence input object if needed
20040 **
20041 ** @param [w] thys [AjPSeq] Sequence returned.
20042 ** @param [u] seqin [AjPSeqin] Sequence input definitions
20043 ** @return [AjBool] ajTrue on success.
20044 **
20045 ** @release 4.1.0
20046 ** @@
20047 ******************************************************************************/
20048 
seqDefine(AjPSeq thys,AjPSeqin seqin)20049 static AjBool seqDefine(AjPSeq thys, AjPSeqin seqin)
20050 {
20051 
20052     /* if values are missing in the sequence object, we can use defaults
20053        from seqin or calculate where possible */
20054 
20055     /*ajDebug("seqDefine: thys->Db '%S', seqin->Db '%S'\n",
20056       thys->Db, seqin->Db);*/
20057     /*ajDebug("seqDefine: thys->Name '%S' type: %S\n",
20058       thys->Name, thys->Type);*/
20059     /*ajDebug("seqDefine: thys->Entryname '%S', seqin->Entryname '%S'\n",
20060       thys->Entryname, seqin->Entryname);*/
20061 
20062     /* assign the dbname and entryname if defined in the seqin object */
20063     if(ajStrGetLen(seqin->Input->Db))
20064         ajStrAssignS(&thys->Db, seqin->Input->Db);
20065 
20066     if(ajStrGetLen(seqin->Entryname))
20067         ajStrAssignEmptyS(&thys->Entryname, seqin->Entryname);
20068 
20069     if(ajStrGetLen(thys->Entryname))
20070         ajStrAssignS(&thys->Name, thys->Entryname);
20071 
20072     /*ajDebug("seqDefine: returns thys->Name '%S' type: %S\n",
20073       thys->Name, thys->Type);*/
20074 
20075     if(!ajStrGetLen(thys->Type))
20076     {
20077         if(thys->Format)
20078         {
20079             if(seqinFormatDef[thys->Format].Nucleotide &&
20080                !seqinFormatDef[thys->Format].Protein)
20081                 ajSeqSetNuc(thys);
20082 
20083             if(!seqinFormatDef[thys->Format].Nucleotide &&
20084                seqinFormatDef[thys->Format].Protein)
20085                 ajSeqSetProt(thys);
20086         }
20087     }
20088 
20089     if(!ajStrGetLen(thys->Type))
20090         ajSeqType(thys);
20091 
20092     if(seqin->Circular)
20093         thys->Circular = ajTrue;
20094 
20095     if(thys->Fttable)
20096     {
20097         if(thys->Circular)
20098             ajFeattableSetCircular(thys->Fttable);
20099         else if(ajFeattableIsCircular(thys->Fttable))
20100             thys->Circular = ajTrue;
20101     }
20102 
20103     return ajTrue;
20104 }
20105 
20106 
20107 
20108 
20109 /* @func ajSeqaccessGetDb *****************************************************
20110 **
20111 ** returns the table in which sequence database access details are registered
20112 **
20113 ** @return [AjPTable] Access functions hash table
20114 **
20115 ** @release 6.4.0
20116 ** @@
20117 ******************************************************************************/
20118 
ajSeqaccessGetDb(void)20119 AjPTable ajSeqaccessGetDb(void)
20120 {
20121     if(!seqDbMethods)
20122         seqDbMethods = ajCallTableNew();
20123     return seqDbMethods;
20124 }
20125 
20126 
20127 
20128 
20129 
20130 /* @func ajSeqaccessMethodGetQlinks *******************************************
20131 **
20132 ** Tests for a named method for sequence reading and returns the
20133 ** known query link operators
20134 **
20135 ** @param [r] method [const AjPStr] Method required.
20136 ** @return [const char*] Known link operators
20137 **
20138 ** @release 6.4.0
20139 ** @@
20140 ******************************************************************************/
20141 
ajSeqaccessMethodGetQlinks(const AjPStr method)20142 const char* ajSeqaccessMethodGetQlinks(const AjPStr method)
20143 {
20144     AjPSeqAccess methoddata;
20145 
20146     methoddata = ajCallTableGetS(seqDbMethods, method);
20147     if(!methoddata)
20148         return NULL;
20149 
20150     return methoddata->Qlink;
20151 }
20152 
20153 
20154 
20155 
20156 /* @func ajSeqaccessMethodGetScope ********************************************
20157 **
20158 ** Tests for a named method for sequence reading and returns the scope
20159 ** (entry, query or all).
20160 *
20161 ** @param [r] method [const AjPStr] Method required.
20162 ** @return [ajuint] Scope flags
20163 **
20164 ** @release 6.4.0
20165 ** @@
20166 ******************************************************************************/
20167 
ajSeqaccessMethodGetScope(const AjPStr method)20168 ajuint ajSeqaccessMethodGetScope(const AjPStr method)
20169 {
20170     AjPSeqAccess methoddata;
20171     ajuint ret = 0;
20172 
20173     methoddata = ajCallTableGetS(seqDbMethods, method);
20174     if(!methoddata)
20175         return 0;
20176 
20177     if(methoddata->Entry)
20178         ret |= AJMETHOD_ENTRY;
20179     if(methoddata->Query)
20180         ret |= AJMETHOD_QUERY;
20181     if(methoddata->All)
20182         ret |= AJMETHOD_ALL;
20183 
20184     return ret;
20185 }
20186 
20187 
20188 
20189 
20190 /* @func ajSeqaccessMethodTest ************************************************
20191 **
20192 ** Tests for a named method for sequence reading.
20193 **
20194 ** @param [r] method [const AjPStr] Method required.
20195 ** @return [AjBool] ajTrue on success.
20196 **
20197 ** @release 6.4.0
20198 ** @@
20199 ******************************************************************************/
20200 
ajSeqaccessMethodTest(const AjPStr method)20201 AjBool ajSeqaccessMethodTest(const AjPStr method)
20202 {
20203     if(ajCallTableGetS(seqDbMethods, method))
20204         return ajTrue;
20205 
20206     return ajFalse;
20207 }
20208 
20209 
20210 
20211 
20212 /* @func ajSeqinformatTerm ****************************************************
20213 **
20214 ** Tests whether a data input format term is known
20215 **
20216 ** @param [r] term [const AjPStr] Format term EDAM ID
20217 ** @return [AjBool] ajTrue if term was accepted
20218 **
20219 ** @release 6.4.0
20220 ** @@
20221 ******************************************************************************/
20222 
ajSeqinformatTerm(const AjPStr term)20223 AjBool ajSeqinformatTerm(const AjPStr term)
20224 {
20225     ajuint i;
20226 
20227     for(i=0; seqinFormatDef[i].Name; i++)
20228         if(ajStrMatchC(term, seqinFormatDef[i].Obo))
20229             return ajTrue;
20230 
20231     return ajFalse;
20232 }
20233 
20234 
20235 
20236 
20237 /* @func ajSeqinformatTest ****************************************************
20238 **
20239 ** Tests whether a named sequence data input format is known
20240 **
20241 ** @param [r] format [const AjPStr] Format
20242 ** @return [AjBool] ajTrue if formats was accepted
20243 **
20244 ** @release 6.4.0
20245 ** @@
20246 ******************************************************************************/
20247 
ajSeqinformatTest(const AjPStr format)20248 AjBool ajSeqinformatTest(const AjPStr format)
20249 {
20250     ajuint i;
20251 
20252     for(i=0; seqinFormatDef[i].Name; i++)
20253         if(ajStrMatchCaseC(format, seqinFormatDef[i].Name))
20254             return ajTrue;
20255 
20256     return ajFalse;
20257 }
20258 
20259 
20260 
20261 
20262 #ifdef AJ_COMPILE_DEPRECATED_BOOK
20263 #endif /* AJ_COMPILE_DEPRECATED_BOOK */
20264 
20265 
20266 
20267 
20268 #ifdef AJ_COMPILE_DEPRECATED
20269 /* @obsolete ajSeqMethodGetScope
20270 ** @rename ajSeqaccessMethodGetScope
20271 */
20272 
ajSeqMethodGetScope(const AjPStr method)20273 __deprecated ajuint ajSeqMethodGetScope(const AjPStr method)
20274 {
20275     return ajSeqaccessMethodGetScope(method);
20276 }
20277 
20278 
20279 
20280 
20281 /* @obsolete ajSeqMethodTest
20282 ** @rename ajSeqaccessMethodTest
20283 */
20284 
ajSeqMethodTest(const AjPStr method)20285 __deprecated AjBool ajSeqMethodTest(const AjPStr method)
20286 {
20287     return ajSeqaccessMethodTest(method);
20288 }
20289 
20290 #endif /* AJ_COMPILE_DEPRECATED */
20291