1 /* @source enssequence ********************************************************
2 **
3 ** Ensembl Sequence functions
4 **
5 ** @author Copyright (C) 1999 Ensembl Developers
6 ** @author Copyright (C) 2006 Michael K. Schuster
7 ** @version $Revision: 1.46 $
8 ** @modified 2009 by Alan Bleasby for incorporation into EMBOSS core
9 ** @modified $Date: 2013/02/17 13:02:40 $ by $Author: mks $
10 ** @@
11 **
12 ** This library is free software; you can redistribute it and/or
13 ** modify it under the terms of the GNU Lesser General Public
14 ** License as published by the Free Software Foundation; either
15 ** version 2.1 of the License, or (at your option) any later version.
16 **
17 ** This library is distributed in the hope that it will be useful,
18 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 ** Lesser General Public License for more details.
21 **
22 ** You should have received a copy of the GNU Lesser General Public
23 ** License along with this library; if not, write to the Free Software
24 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
25 ** MA 02110-1301, USA.
26 **
27 ******************************************************************************/
28
29 /* ========================================================================= */
30 /* ============================= include files ============================= */
31 /* ========================================================================= */
32
33 #include "enscache.h"
34 #include "ensprojectionsegment.h"
35 #include "enssequence.h"
36 #include "enssequenceedit.h"
37
38
39
40
41 /* ========================================================================= */
42 /* =============================== constants =============================== */
43 /* ========================================================================= */
44
45
46
47
48 /* ========================================================================= */
49 /* =========================== global variables ============================ */
50 /* ========================================================================= */
51
52
53
54
55 /* ========================================================================= */
56 /* ============================= private data ============================== */
57 /* ========================================================================= */
58
59
60
61
62 /* ========================================================================= */
63 /* =========================== private constants =========================== */
64 /* ========================================================================= */
65
66 /* @conststatic sequenceKChunkPower *******************************************
67 **
68 ** The Ensembl Sequence Adaptor handles sequences in larger chunks internally.
69 ** Both, memory allocation and SQL database operations become more efficient,
70 ** since requests for neighbouring regions are likely to be returned from the
71 ** same block.
72 **
73 ** 1 << 18 = 256 KiB, about the size of a BAC clone
74 **
75 ******************************************************************************/
76
77 static const ajuint sequenceKChunkPower = 18U;
78
79
80
81
82 /* @conststatic sequenceadaptorKCacheMaxBytes *********************************
83 **
84 ** The maximum memory size in bytes the Ensembl Sequence Adaptor-internal
85 ** Ensembl Cache can use.
86 **
87 ** 1 << 26 = 64 MiB
88 **
89 ******************************************************************************/
90
91 static const size_t sequenceadaptorKCacheMaxBytes = 1U << 26U;
92
93
94
95
96 /* @conststatic sequenceadaptorKCacheMaxCount *********************************
97 **
98 ** The mamximum number of sequence chunks the Ensembl Sequence Adaptor-internal
99 ** Ensembl Cache can hold.
100 **
101 ** 1 << 16 = 64 Ki
102 **
103 ******************************************************************************/
104
105 static const ajuint sequenceadaptorKCacheMaxCount = 1U << 16U;
106
107
108
109
110 /* @conststatic sequenceadaptorKCacheMaxSize **********************************
111 **
112 ** Maximum size of a sequence chunk to be allowed into the
113 ** Ensembl Sequence Adaptor-internal Ensembl Cache.
114 **
115 ******************************************************************************/
116
117 static const size_t sequenceadaptorKCacheMaxSize = 0U;
118
119
120
121
122 /* @conststatic sequenceadaptorKCacheMaxLength ********************************
123 **
124 ** Maximum length of a sequence request up to which it gets chunked and cached.
125 ** Larger request are returned directly.
126 **
127 ** 1 << 21 = 2 Mi, or about 8 BAC clones
128 **
129 ******************************************************************************/
130
131 static const ajuint sequenceadaptorKCacheMaxLength = 1U << 21U;
132
133
134
135
136 /* ========================================================================= */
137 /* =========================== private variables =========================== */
138 /* ========================================================================= */
139
140
141
142
143 /* ========================================================================= */
144 /* =========================== private functions =========================== */
145 /* ========================================================================= */
146
147 static size_t sequenceadaptorCacheSize(const void *value);
148
149 static AjBool sequenceadaptorFetchCircularsliceSubStr(
150 EnsPSequenceadaptor sqa,
151 EnsPSlice slice,
152 ajint start,
153 ajint end,
154 ajint strand,
155 AjPStr *Psequence);
156
157
158
159
160 /* ========================================================================= */
161 /* ======================= All functions by section ======================== */
162 /* ========================================================================= */
163
164
165
166
167 /* @filesection enssequence ***************************************************
168 **
169 ** @nam1rule ens Function belongs to the Ensembl library
170 **
171 ******************************************************************************/
172
173
174
175
176 /* @datasection [EnsPSequenceadaptor] Ensembl Sequence Adaptor ****************
177 **
178 ** @nam2rule Sequenceadaptor Functions for manipulating
179 ** Ensembl Sequence Adaptor objects
180 **
181 ** @cc Bio::EnsEMBL::DBSQL::Sequenceadaptor
182 ** @cc CVS Revision: 1.54
183 ** @cc CVS Tag: branch-ensembl-68
184 **
185 ******************************************************************************/
186
187
188
189
190 /* @funcstatic sequenceadaptorCacheSize ***************************************
191 **
192 ** Wrapper function to calculate the memory size of an Ensembl Sequence
193 ** (i.e. an AJAX String) from an Ensembl Cache.
194 **
195 ** @param [r] value [const void*] AJAX String
196 **
197 ** @return [size_t] Memory size in bytes or 0
198 **
199 ** @release 6.3.0
200 ** @@
201 ******************************************************************************/
202
sequenceadaptorCacheSize(const void * value)203 static size_t sequenceadaptorCacheSize(const void *value)
204 {
205 size_t size = 0;
206
207 if (!value)
208 return 0;
209
210 size += sizeof (AjOStr);
211
212 size += ajStrGetRes((const AjPStr) value);
213
214 return size;
215 }
216
217
218
219
220 /* @section constructors ******************************************************
221 **
222 ** All constructors return a new Ensembl Sequence Adaptor by pointer.
223 ** It is the responsibility of the user to first destroy any previous
224 ** Sequence Adaptor. The target pointer does not need to be initialised to
225 ** NULL, but it is good programming practice to do so anyway.
226 **
227 ** @fdata [EnsPSequenceadaptor]
228 **
229 ** @nam3rule New Constructor
230 **
231 ** @argrule New dba [EnsPDatabaseadaptor] Ensembl Database Adaptor
232 ** @argrule Obj object [EnsPSequenceadaptor] Ensembl Sequence Adaptor
233 ** @argrule Ref object [EnsPSequenceadaptor] Ensembl Sequence Adaptor
234 **
235 ** @valrule * [EnsPSequenceadaptor] Ensembl Sequence Adaptor or NULL
236 **
237 ** @fcategory new
238 ******************************************************************************/
239
240
241
242
243 /* @func ensSequenceadaptorNew ************************************************
244 **
245 ** Default constructor for an Ensembl Sequence Adaptor.
246 **
247 ** Ensembl Object Adaptors are singleton objects in the sense that a single
248 ** instance of an Ensembl Object Adaptor connected to a particular database is
249 ** sufficient to instantiate any number of Ensembl Objects from the database.
250 ** Each Ensembl Object will have a weak reference to the Object Adaptor that
251 ** instantiated it. Therefore, Ensembl Object Adaptors should not be
252 ** instantiated directly, but rather obtained from the Ensembl Registry,
253 ** which will in turn call this function if neccessary.
254 **
255 ** @see ensRegistryGetDatabaseadaptor
256 ** @see ensRegistryGetSequenceadaptor
257 **
258 ** @cc Bio::EnsEMBL::DBSQL::Sequenceadaptor::new
259 ** @param [u] dba [EnsPDatabaseadaptor] Ensembl Database Adaptor
260 **
261 ** @return [EnsPSequenceadaptor] Ensembl Sequence Adaptor or NULL
262 **
263 ** @release 6.2.0
264 ** @@
265 ******************************************************************************/
266
ensSequenceadaptorNew(EnsPDatabaseadaptor dba)267 EnsPSequenceadaptor ensSequenceadaptorNew(
268 EnsPDatabaseadaptor dba)
269 {
270 EnsPSequenceadaptor sqa = NULL;
271
272 if (!dba)
273 return NULL;
274
275 if (ajDebugTest("ensSequenceadaptorNew"))
276 ajDebug("ensSequenceadaptorNew\n"
277 " dba %p\n",
278 dba);
279
280 AJNEW0(sqa);
281
282 sqa->Adaptor = dba;
283
284 sqa->Cache = ensCacheNew(
285 ensECacheTypeAlphaNumeric,
286 sequenceadaptorKCacheMaxBytes,
287 sequenceadaptorKCacheMaxCount,
288 sequenceadaptorKCacheMaxSize,
289 (void *(*)(void *)) &ajStrNewRef,
290 (void (*)(void **)) &ajStrDel,
291 &sequenceadaptorCacheSize,
292 (void *(*)(const void *)) NULL,
293 (AjBool (*)(const void *)) NULL,
294 ajFalse,
295 "Sequence");
296
297 return sqa;
298 }
299
300
301
302
303 /* @section destructors *******************************************************
304 **
305 ** Destruction destroys all internal data structures and frees the memory
306 ** allocated for an Ensembl Sequence Adaptor object.
307 **
308 ** @fdata [EnsPSequenceadaptor]
309 **
310 ** @nam3rule Del Destroy (free) an Ensembl Sequence Adaptor
311 **
312 ** @argrule * Psqa [EnsPSequenceadaptor*] Ensembl Sequence Adaptor address
313 **
314 ** @valrule * [void]
315 **
316 ** @fcategory delete
317 ******************************************************************************/
318
319
320
321
322 /* @func ensSequenceadaptorDel ************************************************
323 **
324 ** Default destructor for an Ensembl Sequence Adaptor.
325 **
326 ** Ensembl Object Adaptors are singleton objects that are registered in the
327 ** Ensembl Registry and weakly referenced by Ensembl Objects that have been
328 ** instantiated by it. Therefore, Ensembl Object Adaptors should never be
329 ** destroyed directly. Upon exit, the Ensembl Registry will call this function
330 ** if required.
331 **
332 ** @param [d] Psqa [EnsPSequenceadaptor*] Ensembl Sequence Adaptor address
333 **
334 ** @return [void]
335 **
336 ** @release 6.2.0
337 ** @@
338 ******************************************************************************/
339
ensSequenceadaptorDel(EnsPSequenceadaptor * Psqa)340 void ensSequenceadaptorDel(EnsPSequenceadaptor *Psqa)
341 {
342 EnsPSequenceadaptor pthis = NULL;
343
344 if (!Psqa)
345 return;
346
347 #if defined(AJ_DEBUG) && AJ_DEBUG >= 1
348 if (ajDebugTest("ensSequenceadaptorDel"))
349 ajDebug("ensSequenceadaptorDel\n"
350 " *Psqa %p\n",
351 *Psqa);
352 #endif /* defined(AJ_DEBUG) && AJ_DEBUG >= 1 */
353
354 if (!(pthis = *Psqa))
355 return;
356
357 ensCacheDel(&pthis->Cache);
358
359 ajMemFree((void **) Psqa);
360
361 return;
362 }
363
364
365
366
367 /* @section cache *************************************************************
368 **
369 ** Functions for maintaining the Ensembl Sequence Adaptor-internal cache.
370 **
371 ** @fdata [EnsPSequenceadaptor]
372 **
373 ** @nam3rule Cache
374 ** @nam4rule Clear Clear the Ensembl Sequence Adaptor-internal cache
375 **
376 ** @argrule * sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
377 **
378 ** @valrule * [AjBool] ajTrue upon success, ajFalse otherwise
379 **
380 ** @fcategory use
381 ******************************************************************************/
382
383
384
385
386 /* @func ensSequenceadaptorCacheClear *****************************************
387 **
388 ** Clear the internal cache of an Ensembl Sequence Adaptor.
389 **
390 ** @cc Bio::EnsEMBL::DBSQL::SequenceAdaptor::clear_cache
391 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
392 **
393 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
394 **
395 ** @release 6.5.0
396 ** @@
397 ******************************************************************************/
398
ensSequenceadaptorCacheClear(EnsPSequenceadaptor sqa)399 AjBool ensSequenceadaptorCacheClear(EnsPSequenceadaptor sqa)
400 {
401 if (!sqa)
402 return ajFalse;
403
404 return ensCacheClear(sqa->Cache);
405 }
406
407
408
409
410 /* @section member retrieval **************************************************
411 **
412 ** Functions for returning members of an
413 ** Ensembl Sequence Adaptor object.
414 **
415 ** @fdata [EnsPSequenceadaptor]
416 **
417 ** @nam3rule Get Return Ensembl Sequence Adaptor attribute(s)
418 ** @nam4rule Databaseadaptor Return the Ensembl Database Adaptor
419 **
420 ** @argrule * sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
421 **
422 ** @valrule Databaseadaptor [EnsPDatabaseadaptor]
423 ** Ensembl Database Adaptor or NULL
424 **
425 ** @fcategory use
426 ******************************************************************************/
427
428
429
430
431 /* @func ensSequenceadaptorGetDatabaseadaptor ***************************
432 **
433 ** Get the Ensembl Database Adaptor of an Ensembl Sequence Adaptor.
434 **
435 ** @param [u] sqa [EnsPSequenceadaptor]
436 ** Ensembl Sequence Adaptor
437 **
438 ** @return [EnsPDatabaseadaptor] Ensembl Database Adaptor or NULL
439 **
440 ** @release 6.5.0
441 ** @@
442 ******************************************************************************/
443
ensSequenceadaptorGetDatabaseadaptor(EnsPSequenceadaptor sqa)444 EnsPDatabaseadaptor ensSequenceadaptorGetDatabaseadaptor(
445 EnsPSequenceadaptor sqa)
446 {
447 return (sqa) ? sqa->Adaptor : NULL;
448 }
449
450
451
452
453 /* @section object fetching ***************************************************
454 **
455 ** Functions for retrieving Sequence objects from an Ensembl Core database.
456 **
457 ** @fdata [EnsPSequenceadaptor]
458 **
459 ** @nam3rule Fetch Fetch Ensembl Sequence object(s)
460 ** @nam4rule Seqregion Fetch by an Ensembl Sequence Region
461 ** @nam4rule Slice Fetch by an Ensembl Slice
462 ** @nam5rule All Fetch the complete sequence
463 ** @nam5rule Sub Fetch a sub-sequence
464 ** @nam6rule Str Fetch as an AJAX String
465 ** @nam6rule Seq Fetch as an AJAX Sequence
466 **
467 ** @argrule * sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
468 ** @argrule Seqregion sr [const EnsPSeqregion] Ensembl Sequence Region
469 ** @argrule SeqregionSub start [ajuint] Start coordinate
470 ** @argrule SeqregionSub length [ajuint] Sequence length
471 ** @argrule Slice slice [EnsPSlice] Ensembl Slice
472 ** @argrule SliceSub start [ajint] Start
473 ** @argrule SliceSub end [ajint] End
474 ** @argrule SliceSub strand [ajint] Strand
475 ** @argrule Str Psequence [AjPStr*] AJAX String address
476 ** @argrule Seq Psequence [AjPSeq*] AJAX Sequence address
477 **
478 ** @valrule * [AjBool] ajTrue upon success, ajFalse otherwise
479 **
480 ** @fcategory use
481 ******************************************************************************/
482
483
484
485
486 /* @func ensSequenceadaptorFetchSeqregionAllSeq *******************************
487 **
488 ** Fetch the sequence of an Ensembl Sequence Region as an AJAX Sequence.
489 **
490 ** The caller is responsible for deleting the AJAX String.
491 **
492 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
493 ** @param [r] sr [const EnsPSeqregion] Ensembl Sequence Region
494 ** @param [wP] Psequence [AjPSeq*] AJAX Sequence address
495 **
496 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
497 **
498 ** @release 6.4.0
499 ** @@
500 ** This function will only return biological sequence information for
501 ** Ensembl Sequence Region objects, which are in the sequence-level
502 ** Ensembl Coordinate System. All other Ensembl Sequence Region objects do not
503 ** have sequence attached so that their sequence can only be fetched in the
504 ** context of an Ensembl Slice, which is subsequently mapped to the
505 ** sequence-level Ensembl Coordinate System. See the description of the
506 ** ensSequenceadaptorFetchSliceAllSeq function for further details.
507 ******************************************************************************/
508
ensSequenceadaptorFetchSeqregionAllSeq(EnsPSequenceadaptor sqa,const EnsPSeqregion sr,AjPSeq * Psequence)509 AjBool ensSequenceadaptorFetchSeqregionAllSeq(EnsPSequenceadaptor sqa,
510 const EnsPSeqregion sr,
511 AjPSeq *Psequence)
512 {
513 return ensSequenceadaptorFetchSeqregionSubSeq(
514 sqa,
515 sr,
516 1,
517 (ajuint) ensSeqregionGetLength(sr),
518 Psequence);
519 }
520
521
522
523
524 /* @func ensSequenceadaptorFetchSeqregionAllStr *******************************
525 **
526 ** Fetch the sequence of an Ensembl Sequence Region as an AJAX String.
527 **
528 ** The caller is responsible for deleting the AJAX String.
529 **
530 ** @cc Bio::EnsEMBL::DBSQL::Sequenceadaptor::_fetch_seq
531 ** @see ensSequenceadaptorFetchSliceAllStr
532 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
533 ** @param [r] sr [const EnsPSeqregion] Ensembl Sequence Region
534 ** @param [u] Psequence [AjPStr*] Sequence address
535 **
536 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
537 **
538 ** @release 6.4.0
539 ** @@
540 ** This function will only return biological sequence information for
541 ** Ensembl Sequence Region objects, which are in the sequence-level
542 ** Ensembl Coordinate System. All other Ensembl Sequence Region objects do not
543 ** have sequence attached so that their sequence can only be fetched in the
544 ** context of an Ensembl Slice, which is subsequently mapped to the
545 ** sequence-level Ensembl Coordinate System. See the description of the
546 ** ensSequenceadaptorFetchSliceAllStr function for further details.
547 ******************************************************************************/
548
ensSequenceadaptorFetchSeqregionAllStr(EnsPSequenceadaptor sqa,const EnsPSeqregion sr,AjPStr * Psequence)549 AjBool ensSequenceadaptorFetchSeqregionAllStr(EnsPSequenceadaptor sqa,
550 const EnsPSeqregion sr,
551 AjPStr *Psequence)
552 {
553 return ensSequenceadaptorFetchSeqregionSubStr(
554 sqa,
555 sr,
556 1,
557 (ajuint) ensSeqregionGetLength(sr),
558 Psequence);
559 }
560
561
562
563
564 /* @func ensSequenceadaptorFetchSeqregionSubSeq *******************************
565 **
566 ** Fetch a sub-sequence of an Ensembl Sequence Region as an AJAX Sequence.
567 ** The start coordinate is one-based, as in the SQL SUBSTRING function.
568 ** A start of 1 and a length equal the Sequence Region length covers the whole
569 ** Sequence Region.
570 **
571 ** The caller is responsible for deleting the AJAX Sequence.
572 **
573 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
574 ** @param [r] sr [const EnsPSeqregion] Ensembl Sequence Region
575 ** @param [r] start [ajuint] Start coordinate
576 ** @param [r] length [ajuint] Sequence length
577 ** @param [wP] Psequence [AjPSeq*] AJAX Sequence address
578 **
579 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
580 **
581 ** @release 6.4.0
582 ** @@
583 ** This function will only return biological sequence information for
584 ** Ensembl Sequence Region objects, which are in the sequence-level
585 ** Ensembl Coordinate System. All other Ensembl Sequence Region objects do not
586 ** have sequence attached so that their sequence can only be fetched in the
587 ** context of an Ensembl Slice, which is subsequently mapped to the
588 ** sequence-level Ensembl Coordinate System. See the description of the
589 ** ensSequenceadaptorFetchSliceAllSeq function for further details.
590 ******************************************************************************/
591
ensSequenceadaptorFetchSeqregionSubSeq(EnsPSequenceadaptor sqa,const EnsPSeqregion sr,ajuint start,ajuint length,AjPSeq * Psequence)592 AjBool ensSequenceadaptorFetchSeqregionSubSeq(EnsPSequenceadaptor sqa,
593 const EnsPSeqregion sr,
594 ajuint start,
595 ajuint length,
596 AjPSeq *Psequence)
597 {
598 AjBool result = AJFALSE;
599
600 AjPStr name = NULL;
601 AjPStr sequence = NULL;
602
603 if (ajDebugTest("ensSequenceadaptorFetchSeqregionSubSeq"))
604 {
605 ajDebug("ensSequenceadaptorFetchSeqregionSubSeq\n"
606 " sqa %p\n"
607 " sr %p\n"
608 " start %u\n"
609 " length %u\n"
610 " Psequence %p\n",
611 sqa,
612 sr,
613 start,
614 length,
615 Psequence);
616
617 ensSeqregionTrace(sr, 1);
618 }
619
620 if (!sqa)
621 return ajFalse;
622
623 if (!sr)
624 return ajFalse;
625
626 if (!Psequence)
627 return ajFalse;
628
629 /*
630 ** It is sligtly more efficient, if undefined AJAX String objects are
631 ** directly allocated by the following functions to their final size.
632 */
633
634 name = ajFmtStr(
635 "%S:%S:%S:%u:%u:1",
636 ensCoordsystemGetName(ensSeqregionGetCoordsystem(sr)),
637 ensCoordsystemGetVersion(ensSeqregionGetCoordsystem(sr)),
638 ensSeqregionGetName(sr),
639 start,
640 start + length - 1);
641
642 result = ensSequenceadaptorFetchSeqregionSubStr(
643 sqa,
644 sr,
645 start,
646 length,
647 &sequence);
648
649 if (*Psequence)
650 {
651 ajSeqClear(*Psequence);
652
653 ajSeqAssignNameS(*Psequence, name);
654 ajSeqAssignSeqS(*Psequence, sequence);
655 }
656 else
657 *Psequence = ajSeqNewNameS(sequence, name);
658
659 ajSeqSetNuc(*Psequence);
660
661 ajStrDel(&name);
662 ajStrDel(&sequence);
663
664 return result;
665 }
666
667
668
669
670 /* @func ensSequenceadaptorFetchSeqregionSubStr *******************************
671 **
672 ** Fetch a sub-sequence of an Ensembl Sequence Region as an AJAX String.
673 ** The start coordinate is one-based, as in the SQL SUBSTRING function.
674 ** A start of 1 and a length equal the Sequence Region length covers the whole
675 ** Sequence Region.
676 **
677 ** The caller is responsible for deleting the AJAX String.
678 **
679 ** @cc Bio::EnsEMBL::DBSQL::Sequenceadaptor::_fetch_seq
680 ** @see ensSequenceadaptorFetchSliceSubStr
681 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
682 ** @param [r] sr [const EnsPSeqregion] Ensembl Sequence Region
683 ** @param [r] start [ajuint] Start coordinate
684 ** @param [r] length [ajuint] Sequence length
685 ** @param [u] Psequence [AjPStr*] Sequence address
686 **
687 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
688 **
689 ** @release 6.4.0
690 ** @@
691 ** This function will only return biological sequence information for
692 ** Ensembl Sequence Region objects, which are in the sequence-level
693 ** Ensembl Coordinate System. All other Ensembl Sequence Region objects do not
694 ** have sequence attached so that their sequence can only be fetched in the
695 ** context of an Ensembl Slice, which is subsequently mapped to the
696 ** sequence-level Ensembl Coordinate System. See the description of the
697 ** ensSequenceadaptorFetchSliceSubStr function for further details.
698 ******************************************************************************/
699
ensSequenceadaptorFetchSeqregionSubStr(EnsPSequenceadaptor sqa,const EnsPSeqregion sr,ajuint start,ajuint length,AjPStr * Psequence)700 AjBool ensSequenceadaptorFetchSeqregionSubStr(EnsPSequenceadaptor sqa,
701 const EnsPSeqregion sr,
702 ajuint start,
703 ajuint length,
704 AjPStr *Psequence)
705 {
706 register ajuint i = 0U;
707
708 ajuint chkmin = 0U;
709 ajuint chkmax = 0U;
710 ajuint posmin = 0U;
711
712 AjPSqlstatement sqls = NULL;
713 AjISqlrow sqli = NULL;
714 AjPSqlrow sqlr = NULL;
715
716 AjPStr chkstr = NULL;
717 AjPStr tmpstr = NULL;
718 AjPStr key = NULL;
719 AjPStr statement = NULL;
720
721 EnsPDatabaseadaptor dba = NULL;
722
723 if (ajDebugTest("ensSequenceadaptorFetchSeqregionSubStr"))
724 {
725 ajDebug("ensSequenceadaptorFetchSeqregionSubStr\n"
726 " sqa %p\n"
727 " sr %p\n"
728 " start %d\n"
729 " length %d\n",
730 sqa,
731 sr,
732 start,
733 length);
734
735 ensSeqregionTrace(sr, 1);
736 }
737
738 if (!sqa)
739 return ajFalse;
740
741 if (!sr)
742 return ajFalse;
743
744 if (!Psequence)
745 return ajFalse;
746
747 /*
748 ** Reserve sequence space in larger blocks based on the requested length
749 ** plus one position for the 'nul' string terminator.
750 */
751
752 if (*Psequence)
753 ajStrAssignClear(Psequence);
754 else
755 *Psequence = ajStrNewRes((((length + 1) >> sequenceKChunkPower) + 1)
756 << sequenceKChunkPower);
757
758 dba = ensSequenceadaptorGetDatabaseadaptor(sqa);
759
760 if (length < sequenceadaptorKCacheMaxLength)
761 {
762 chkmin = (start - 1) >> sequenceKChunkPower;
763
764 chkmax = (start + length - 1) >> sequenceKChunkPower;
765
766 /*
767 ** Allocate an AJAX String and reserve space for the number of
768 ** sequence chunks plus one for the 'nul' string terminator.
769 */
770
771 tmpstr = ajStrNewRes(((1 << sequenceKChunkPower) + 1) *
772 (chkmax - chkmin + 1));
773
774 /* Piece together sequence from cached sequence chunks. */
775
776 for (i = chkmin; i <= chkmax; i++)
777 {
778 key = ajFmtStr("%u:%u", ensSeqregionGetIdentifier(sr), i);
779
780 chkstr = NULL;
781
782 ensCacheFetch(sqa->Cache, (void *) key, (void **) &chkstr);
783
784 if (chkstr)
785 {
786 ajStrAppendS(&tmpstr, chkstr);
787
788 ajStrDel(&chkstr);
789 }
790 else
791 {
792 /* Fetch uncached sequence chunks. */
793
794 posmin = (i << sequenceKChunkPower) + 1;
795
796 statement = ajFmtStr(
797 "SELECT "
798 "SUBSTRING(dna.sequence FROM %u FOR %u) "
799 "FROM "
800 "dna "
801 "WHERE "
802 "dna.seq_region_id = %u",
803 posmin,
804 1 << sequenceKChunkPower,
805 ensSeqregionGetIdentifier(sr));
806
807 sqls = ensDatabaseadaptorSqlstatementNew(dba, statement);
808
809 sqli = ajSqlrowiterNew(sqls);
810
811 while (!ajSqlrowiterDone(sqli))
812 {
813 sqlr = ajSqlrowiterGet(sqli);
814
815 /*
816 ** Allocate an AJAX String and reserve space for the
817 ** maximum sequence chunk length plus the 'nul' string
818 ** terminator.
819 */
820
821 chkstr = ajStrNewRes((1 << sequenceKChunkPower) + 1);
822
823 ajSqlcolumnToStr(sqlr, &chkstr);
824
825 /*
826 ** Always store upper case sequence
827 ** so that it can be properly soft-masked.
828 */
829
830 ajStrFmtUpper(&chkstr);
831
832 ensCacheStore(sqa->Cache, (void *) key, (void **) &chkstr);
833
834 ajStrAppendS(&tmpstr, chkstr);
835
836 ajStrDel(&chkstr);
837 }
838
839 ajSqlrowiterDel(&sqli);
840
841 ensDatabaseadaptorSqlstatementDel(dba, &sqls);
842
843 ajStrDel(&statement);
844 }
845
846 ajStrDel(&key);
847 }
848
849 /* Return only the requested portion of the entire sequence. */
850
851 posmin = (chkmin << sequenceKChunkPower) + 1;
852
853 ajStrAssignSubS(Psequence,
854 tmpstr,
855 start - posmin,
856 start - posmin + length - 1);
857
858 ajStrDel(&tmpstr);
859 }
860 else
861 {
862 /* Do not attempt any caching for requests of very large sequences. */
863
864 statement = ajFmtStr(
865 "SELECT "
866 "SUBSTRING(dna.sequence FROM %u FOR %u) "
867 "FROM dna "
868 "WHERE "
869 "dna.seq_region_id = %u",
870 start,
871 length,
872 ensSeqregionGetIdentifier(sr));
873
874 sqls = ensDatabaseadaptorSqlstatementNew(dba, statement);
875
876 sqli = ajSqlrowiterNew(sqls);
877
878 while (!ajSqlrowiterDone(sqli))
879 {
880 sqlr = ajSqlrowiterGet(sqli);
881
882 /*
883 ** Allocate an AJAX String and reserve space of the length plus
884 ** the 'nul' string terminator.
885 */
886
887 chkstr = ajStrNewRes((length + 1));
888
889 ajSqlcolumnToStr(sqlr, &chkstr);
890
891 /*
892 ** Always return upper case sequence
893 ** so that it can be properly soft-masked.
894 */
895
896 ajStrFmtUpper(&chkstr);
897
898 ajStrAssignS(Psequence, chkstr);
899
900 ajStrDel(&chkstr);
901 }
902
903 ajSqlrowiterDel(&sqli);
904
905 ensDatabaseadaptorSqlstatementDel(dba, &sqls);
906
907 ajStrDel(&statement);
908 }
909
910 return ajTrue;
911 }
912
913
914
915
916 /* @func ensSequenceadaptorFetchSliceAllSeq ***********************************
917 **
918 ** Fetch the sequence of an Ensembl Slice as an AJAX Sequence.
919 **
920 ** The caller is responsible for deleting the AJAX Sequence.
921 **
922 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
923 ** @param [u] slice [EnsPSlice] Ensembl Slice
924 ** @param [wP] Psequence [AjPSeq*] AJAX Sequence address
925 **
926 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
927 **
928 ** @release 6.4.0
929 ** @@
930 ******************************************************************************/
931
ensSequenceadaptorFetchSliceAllSeq(EnsPSequenceadaptor sqa,EnsPSlice slice,AjPSeq * Psequence)932 AjBool ensSequenceadaptorFetchSliceAllSeq(EnsPSequenceadaptor sqa,
933 EnsPSlice slice,
934 AjPSeq *Psequence)
935 {
936 return ensSequenceadaptorFetchSliceSubSeq(
937 sqa,
938 slice,
939 1,
940 ensSliceCalculateLength(slice),
941 1,
942 Psequence);
943 }
944
945
946
947
948 /* @func ensSequenceadaptorFetchSliceAllStr ***********************************
949 **
950 ** Fetch the sequence of an Ensembl Slice as an AJAX String.
951 **
952 ** The caller is responsible for deleting the AJAX String.
953 **
954 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
955 ** @param [u] slice [EnsPSlice] Ensembl Slice
956 ** @param [u] Psequence [AjPStr*] Sequence address
957 **
958 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
959 **
960 ** @release 6.4.0
961 ** @@
962 ******************************************************************************/
963
ensSequenceadaptorFetchSliceAllStr(EnsPSequenceadaptor sqa,EnsPSlice slice,AjPStr * Psequence)964 AjBool ensSequenceadaptorFetchSliceAllStr(EnsPSequenceadaptor sqa,
965 EnsPSlice slice,
966 AjPStr *Psequence)
967 {
968 return ensSequenceadaptorFetchSliceSubStr(
969 sqa,
970 slice,
971 1,
972 ensSliceCalculateLength(slice),
973 1,
974 Psequence);
975 }
976
977
978
979
980 /* @func ensSequenceadaptorFetchSliceSubSeq ***********************************
981 **
982 ** Fetch a sub-sequence of an Ensembl Slice as an AJAX Sequence.
983 ** Coordinates are relative to the Slice and one-based.
984 ** A start of 1 and an end equal the Slice length covers the whole Slice.
985 **
986 ** The caller is responsible for deleting the AJAX Sequence.
987 **
988 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
989 ** @param [u] slice [EnsPSlice] Ensembl Slice
990 ** @param [r] start [ajint] Start coordinate
991 ** @param [r] end [ajint] End coordinate
992 ** @param [r] strand [ajint] Strand information
993 ** @param [wP] Psequence [AjPSeq*] AJAX Sequence address
994 **
995 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
996 **
997 ** @release 6.4.0
998 ** @@
999 ******************************************************************************/
1000
ensSequenceadaptorFetchSliceSubSeq(EnsPSequenceadaptor sqa,EnsPSlice slice,ajint start,ajint end,ajint strand,AjPSeq * Psequence)1001 AjBool ensSequenceadaptorFetchSliceSubSeq(EnsPSequenceadaptor sqa,
1002 EnsPSlice slice,
1003 ajint start,
1004 ajint end,
1005 ajint strand,
1006 AjPSeq *Psequence)
1007 {
1008 ajint srstart = 0;
1009 ajint srend = 0;
1010 ajint srstrand = 0;
1011
1012 AjBool result = AJFALSE;
1013
1014 AjPStr name = NULL;
1015 AjPStr sequence = NULL;
1016
1017 if (!sqa)
1018 return ajFalse;
1019
1020 if (!slice)
1021 return ajFalse;
1022
1023 if (!strand)
1024 strand = 1;
1025
1026 if (!Psequence)
1027 return ajFalse;
1028
1029 /* Transform relative into absolute coordinates for the Slice name. */
1030
1031 if (ensSliceGetStrand(slice) > 0)
1032 {
1033 srstart = ensSliceGetStart(slice) + start - 1;
1034
1035 srend = ensSliceGetStart(slice) + end - 1;
1036
1037 srstrand = +strand;
1038 }
1039 else
1040 {
1041 srstart = ensSliceGetEnd(slice) - end + 1;
1042
1043 srend = ensSliceGetEnd(slice) - start + 1;
1044
1045 srstrand = -strand;
1046 }
1047
1048 /*
1049 ** It is sligtly more efficient, if undefined AJAX String objects are
1050 ** directly allocated by the following functions to their final size.
1051 */
1052
1053 name = ajFmtStr(
1054 "%S:%S:%S:%d:%d:%d",
1055 ensSliceGetCoordsystemName(slice),
1056 ensSliceGetCoordsystemVersion(slice),
1057 ensSliceGetSeqregionName(slice),
1058 srstart,
1059 srend,
1060 srstrand);
1061
1062 result = ensSequenceadaptorFetchSliceSubStr(
1063 sqa,
1064 slice,
1065 start,
1066 end,
1067 strand,
1068 &sequence);
1069
1070 if (*Psequence)
1071 {
1072 ajSeqClear(*Psequence);
1073
1074 ajSeqAssignNameS(*Psequence, name);
1075 ajSeqAssignSeqS(*Psequence, sequence);
1076 }
1077 else
1078 *Psequence = ajSeqNewNameS(sequence, name);
1079
1080 ajSeqSetNuc(*Psequence);
1081
1082 ajStrDel(&name);
1083 ajStrDel(&sequence);
1084
1085 return result;
1086 }
1087
1088
1089
1090
1091 /* @func ensSequenceadaptorFetchSliceSubStr ***********************************
1092 **
1093 ** Fetch a sub-sequence of an Ensembl Slice as an AJAX String.
1094 ** Coordinates are relative to the Slice and one-based.
1095 ** A start of 1 and an end equal the Slice length covers the whole Slice.
1096 **
1097 ** The caller is responsible for deleting the AJAX String.
1098 **
1099 ** @cc Bio::EnsEMBL::DBSQL::Sequenceadaptor::fetch_by_Slice_start_end_strand
1100 ** @see ensSequenceadaptorFetchSliceAllStr
1101 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
1102 ** @param [u] slice [EnsPSlice] Ensembl Slice
1103 ** @param [r] start [ajint] Start coordinate
1104 ** @param [r] end [ajint] End coordinate
1105 ** @param [r] strand [ajint] Strand information
1106 ** @param [u] Psequence [AjPStr*] Sequence address
1107 **
1108 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
1109 **
1110 ** @release 6.4.0
1111 ** @@
1112 ******************************************************************************/
1113
ensSequenceadaptorFetchSliceSubStr(EnsPSequenceadaptor sqa,EnsPSlice slice,ajint start,ajint end,ajint strand,AjPStr * Psequence)1114 AjBool ensSequenceadaptorFetchSliceSubStr(EnsPSequenceadaptor sqa,
1115 EnsPSlice slice,
1116 ajint start,
1117 ajint end,
1118 ajint strand,
1119 AjPStr *Psequence)
1120 {
1121 const char *Ptr = NULL;
1122
1123 register ajuint i = 0U;
1124
1125 ajint five = 0;
1126 ajint three = 0;
1127 ajint fshift = 0;
1128 ajint tshift = 0;
1129
1130 ajuint padding = 0U;
1131 ajuint total = 0U;
1132
1133 AjBool circular = AJFALSE;
1134 AjBool debug = AJFALSE;
1135
1136 AjPList pslist = NULL;
1137 AjPList ses = NULL;
1138
1139 AjPStr tmpstr = NULL;
1140
1141 EnsPCoordsystem seqlvlcs = NULL;
1142 EnsPCoordsystemadaptor csa = NULL;
1143
1144 EnsPDatabaseadaptor dba = NULL;
1145
1146 EnsPProjectionsegment ps = NULL;
1147
1148 EnsPSequenceedit se = NULL;
1149
1150 EnsPSlice eslice = NULL;
1151 EnsPSlice nslice = NULL;
1152 EnsPSlice sslice = NULL;
1153 EnsPSliceadaptor sla = NULL;
1154
1155 debug = ajDebugTest("ensSequenceadaptorFetchSliceSubStr");
1156
1157 if (debug)
1158 {
1159 ajDebug("ensSequenceadaptorFetchSliceSubStr\n"
1160 " sqa %p\n"
1161 " slice %p\n"
1162 " start %d\n"
1163 " end %d\n"
1164 " strand %d\n"
1165 " Psequence %p\n",
1166 sqa,
1167 slice,
1168 start,
1169 end,
1170 strand,
1171 Psequence);
1172
1173 ensSliceTrace(slice, 1);
1174 }
1175
1176 if (!sqa)
1177 return ajFalse;
1178
1179 if (!slice)
1180 return ajFalse;
1181
1182 if (!Psequence)
1183 return ajFalse;
1184
1185 if (!ensSliceIsCircular(slice, &circular))
1186 return ajFalse;
1187
1188 if (circular == ajTrue)
1189 {
1190 if (start > end)
1191 return sequenceadaptorFetchCircularsliceSubStr(
1192 sqa,
1193 slice,
1194 start,
1195 end,
1196 strand,
1197 Psequence);
1198
1199 if (start < 0)
1200 start += ensSliceGetSeqregionLength(slice);
1201
1202 if (end < 0)
1203 end += ensSliceGetSeqregionLength(slice);
1204
1205 if (ensSliceGetStart(slice) > ensSliceGetEnd(slice))
1206 return sequenceadaptorFetchCircularsliceSubStr(
1207 sqa,
1208 slice,
1209 ensSliceGetStart(slice),
1210 ensSliceGetEnd(slice),
1211 strand,
1212 Psequence);
1213 }
1214
1215 if (start > end)
1216 {
1217 ajDebug("ensSequenceadaptorFetchSliceSubStr requires the start %d "
1218 "to be less than or equal to the end %d coordinate.\n",
1219 start, end);
1220
1221 return ajFalse;
1222 }
1223
1224 if (!strand)
1225 strand = 1;
1226
1227 /*
1228 ** Reserve sequence space in larger blocks based on the requested length
1229 ** plus one position for the 'nul' string terminator.
1230 */
1231
1232 if (*Psequence)
1233 ajStrAssignClear(Psequence);
1234 else
1235 *Psequence = ajStrNewRes(
1236 (((ensSliceCalculateRegion(slice, start, end) + 1)
1237 >> sequenceKChunkPower) + 1) << sequenceKChunkPower);
1238
1239 dba = ensSequenceadaptorGetDatabaseadaptor(sqa);
1240
1241 csa = ensRegistryGetCoordsystemadaptor(dba);
1242 sla = ensRegistryGetSliceadaptor(dba);
1243
1244 /*
1245 ** Get a new Slice that spans the exact region to retrieve DNA from.
1246 ** Only this short region of the Slice needs mapping into the
1247 ** sequence-level coordinate system.
1248 */
1249
1250 /* Relative Slice coordinates range from 1 to length. */
1251
1252 five = 1 - start;
1253
1254 three = end - ensSliceCalculateLength(slice);
1255
1256 if (five || three)
1257 ensSliceFetchSliceexpanded(slice,
1258 five,
1259 three,
1260 ajFalse,
1261 &fshift,
1262 &tshift,
1263 &eslice);
1264 else
1265 eslice = ensSliceNewRef(slice);
1266
1267 /*
1268 ** Retrieve normalised, non-symlinked Slice objects, which allows fetching
1269 ** of haplotypes (HAPs) and pseudo-autosomal regions (PARs).
1270 */
1271
1272 pslist = ajListNew();
1273
1274 ensSliceadaptorRetrieveNormalisedprojection(sla, eslice, pslist);
1275
1276 if (!ajListGetLength(pslist))
1277 ajFatal("ensSequenceadaptorFetchSliceSubStr could not "
1278 "retrieve normalised Slices. Database contains incorrect "
1279 "information in the 'assembly_exception' table.\n");
1280
1281 /*
1282 ** Call this method again with any Slice that was sym-linked to by this
1283 ** Slice.
1284 */
1285
1286 ajListPeekFirst(pslist, (void **) &ps);
1287
1288 if ((ajListGetLength(pslist) != 1) ||
1289 (!ensSliceMatch(ensProjectionsegmentGetTargetSlice(ps), slice)))
1290 {
1291 tmpstr = ajStrNew();
1292
1293 while (ajListPop(pslist, (void **) &ps))
1294 {
1295 nslice = ensProjectionsegmentGetTargetSlice(ps);
1296
1297 ensSequenceadaptorFetchSliceAllStr(sqa, nslice, &tmpstr);
1298
1299 ajStrAppendS(Psequence, tmpstr);
1300
1301 ensProjectionsegmentDel(&ps);
1302 }
1303
1304 ajStrDel(&tmpstr);
1305
1306 ajListFree(&pslist);
1307
1308 if (strand < 0)
1309 ajSeqstrReverse(Psequence);
1310
1311 ensSliceDel(&eslice);
1312
1313 return ajTrue;
1314 }
1315
1316 /*
1317 ** Clear the AJAX List of Ensembl Projection Segment objects resulting
1318 ** from the projection of the expanded Slice object to normalised
1319 ** Slice objects.
1320 */
1321
1322 while (ajListPop(pslist, (void **) &ps))
1323 ensProjectionsegmentDel(&ps);
1324
1325 /*
1326 ** Now, this Slice needs projecting onto the sequence-level Coordinate
1327 ** System even if it is already in this Coordinate System, because
1328 ** flanking gaps need trimming out the Slice is past the edges of
1329 ** the Sequence Region.
1330 */
1331
1332 ensCoordsystemadaptorFetchSeqlevel(csa, &seqlvlcs);
1333
1334 ensSliceProject(slice,
1335 ensCoordsystemGetName(seqlvlcs),
1336 ensCoordsystemGetVersion(seqlvlcs),
1337 pslist);
1338
1339 ensCoordsystemDel(&seqlvlcs);
1340
1341 /*
1342 ** Fetch the sequence for each of the Ensembl Sequence Region objects
1343 ** projected onto. Allocate space for 512 KiB (1 << 19) that should fit
1344 ** one BAC clone.
1345 */
1346
1347 tmpstr = ajStrNewRes((1 << 19) + 1);
1348
1349 while (ajListPop(pslist, (void **) &ps))
1350 {
1351 /*
1352 ** Check for gaps between Ensembl Projection Segment objects and
1353 ** pad them with Ns.
1354 */
1355
1356 padding = ensProjectionsegmentGetSourceStart(ps) - total - 1;
1357
1358 if (padding)
1359 {
1360 ajStrAppendCountK(Psequence, 'N', padding);
1361
1362 if (debug)
1363 ajDebug("ensSequenceadaptorFetchSliceSubStr got total %u "
1364 "and Projection Segment source start %u, "
1365 "therefore added %u N padding between.\n",
1366 total,
1367 ensProjectionsegmentGetSourceStart(ps),
1368 padding);
1369 }
1370
1371 sslice = ensProjectionsegmentGetTargetSlice(ps);
1372
1373 ensSequenceadaptorFetchSeqregionSubStr(
1374 sqa,
1375 ensSliceGetSeqregion(sslice),
1376 ensSliceGetStart(sslice),
1377 ensSliceCalculateLength(sslice),
1378 &tmpstr);
1379
1380 if (ensSliceGetStrand(sslice) < 0)
1381 ajSeqstrReverse(&tmpstr);
1382
1383 ajStrAppendS(Psequence, tmpstr);
1384
1385 total = ensProjectionsegmentGetSourceEnd(ps);
1386
1387 ensProjectionsegmentDel(&ps);
1388 }
1389
1390 ajStrDel(&tmpstr);
1391
1392 ajListFree(&pslist);
1393
1394 /* Check for any remaining gaps at the end. */
1395
1396 padding = ensSliceCalculateLength(slice) - total;
1397
1398 if (padding)
1399 {
1400 ajStrAppendCountK(Psequence, 'N', padding);
1401
1402 if (debug)
1403 ajDebug("ensSequenceadaptorFetchSliceSubStr got total %u "
1404 "and Ensembl Slice length %u, "
1405 "therefore added %u N padding.\n",
1406 total,
1407 ensSliceCalculateLength(slice),
1408 padding);
1409 }
1410
1411 /*
1412 ** If the sequence is too short, because we came in with a sequence-level
1413 ** Slice that was partially off the Sequence Region, pad the end with Ns
1414 ** to make it long enough.
1415 ** NOTE: Since ajStrGetLen returns size_t, which exceeds ajint,
1416 ** the sequence length needs to be determined here.
1417 ** padding = ensSliceCalculateLength(slice) - ajStrGetLen(*Psequence);
1418 */
1419
1420 for (i = 0U, Ptr = ajStrGetPtr(*Psequence); (Ptr && *Ptr); i++, Ptr++)
1421 if (i == UINT_MAX)
1422 ajFatal("ensSequenceadaptorFetchSliceSubStr exeeded UINT_MAX.");
1423
1424 padding = ensSliceCalculateLength(slice) - i;
1425
1426 if (padding)
1427 {
1428 ajStrAppendCountK(Psequence, 'N', padding);
1429
1430 if (debug)
1431 ajDebug("ensSequenceadaptorFetchSliceSubStr got "
1432 "sequence length %u, but Slice length %u, "
1433 "therefore added %u N final padding.\n",
1434 ajStrGetLen(*Psequence),
1435 ensSliceCalculateLength(slice),
1436 padding);
1437 }
1438
1439 /* Apply Sequence Edits. */
1440
1441 ses = ajListNew();
1442
1443 ensSliceFetchAllSequenceedits(slice, ses);
1444
1445 /*
1446 ** Sort Sequence Edits in reverse order to avoid the complication of
1447 ** adjusting down-stream Sequence Edit coordinates.
1448 */
1449
1450 ensListSequenceeditSortStartDescending(ses);
1451
1452 while (ajListPop(ses, (void **) &se))
1453 {
1454 /* Adjust Sequence Edit coordinates to the Sub-Slice. */
1455
1456 ensSequenceeditApplyString(se,
1457 ensSliceGetStart(eslice) - 1,
1458 Psequence);
1459
1460 ensSequenceeditDel(&se);
1461 }
1462
1463 ajListFree(&ses);
1464
1465 /* Reverse sequence if requested. */
1466
1467 if (strand < 0)
1468 ajSeqstrReverse(Psequence);
1469
1470 ensSliceDel(&eslice);
1471
1472 return ajTrue;
1473 }
1474
1475
1476
1477
1478 /* @funcstatic sequenceadaptorFetchCircularsliceSubStr ************************
1479 **
1480 ** Fetch a sub-sequence of a (circular) Ensembl Slice as an AJAX String.
1481 ** Coordinates are relative to the Slice and one-based.
1482 ** A start of 1 and an end equal the Slice length covers the whole Slice.
1483 **
1484 ** The caller is responsible for deleting the AJAX String.
1485 **
1486 ** @cc Bio::EnsEMBL::DBSQL::Sequenceadaptor::
1487 ** _fetch_by_Slice_start_end_strand_circular
1488 ** @param [u] sqa [EnsPSequenceadaptor] Ensembl Sequence Adaptor
1489 ** @param [u] slice [EnsPSlice] Ensembl Slice
1490 ** @param [r] start [ajint] Start coordinate
1491 ** @param [r] end [ajint] End coordinate
1492 ** @param [r] strand [ajint] Strand information
1493 ** @param [u] Psequence [AjPStr*] Sequence address
1494 **
1495 ** @return [AjBool] ajTrue upon success, ajFalse otherwise
1496 **
1497 ** @release 6.4.0
1498 ** @@
1499 ******************************************************************************/
1500
sequenceadaptorFetchCircularsliceSubStr(EnsPSequenceadaptor sqa,EnsPSlice slice,ajint start,ajint end,ajint strand,AjPStr * Psequence)1501 static AjBool sequenceadaptorFetchCircularsliceSubStr(EnsPSequenceadaptor sqa,
1502 EnsPSlice slice,
1503 ajint start,
1504 ajint end,
1505 ajint strand,
1506 AjPStr *Psequence)
1507 {
1508 const char *Ptr = NULL;
1509
1510 register ajuint i = 0U;
1511
1512 ajint mpoint = 0;
1513
1514 ajint five = 0;
1515 ajint three = 0;
1516 ajint fshift = 0;
1517 ajint tshift = 0;
1518
1519 ajuint padding = 0U;
1520 ajuint total = 0U;
1521
1522 AjBool circular = AJFALSE;
1523 AjBool debug = AJFALSE;
1524
1525 AjPList pslist = NULL;
1526 AjPList ses = NULL;
1527
1528 AjPStr sequence1 = NULL;
1529 AjPStr sequence2 = NULL;
1530 AjPStr tmpstr = NULL;
1531
1532 EnsPCoordsystem seqlvlcs = NULL;
1533 EnsPCoordsystemadaptor csa = NULL;
1534
1535 EnsPDatabaseadaptor dba = NULL;
1536
1537 EnsPProjectionsegment ps = NULL;
1538
1539 EnsPSequenceedit se = NULL;
1540
1541 EnsPSlice eslice = NULL;
1542 EnsPSlice nslice = NULL;
1543 EnsPSlice sslice = NULL;
1544 EnsPSliceadaptor sla = NULL;
1545
1546 debug = ajDebugTest("sequenceadaptorFetchCircularsliceSubStr");
1547
1548 if (debug)
1549 {
1550 ajDebug("sequenceadaptorFetchCircularsliceSubStr\n"
1551 " sqa %p\n"
1552 " slice %p\n"
1553 " start %d\n"
1554 " end %d\n"
1555 " strand %d\n"
1556 " Psequence %p\n",
1557 sqa,
1558 slice,
1559 start,
1560 end,
1561 strand,
1562 Psequence);
1563
1564 ensSliceTrace(slice, 1);
1565 }
1566
1567 if (!sqa)
1568 return ajFalse;
1569
1570 if (!slice)
1571 return ajFalse;
1572
1573 if (!Psequence)
1574 return ajFalse;
1575
1576 if (*Psequence)
1577 ajStrAssignClear(Psequence);
1578 else
1579 *Psequence = ajStrNewRes(
1580 (((ensSliceCalculateRegion(slice, start, end) + 1)
1581 >> sequenceKChunkPower) + 1) << sequenceKChunkPower);
1582
1583 if (!ensSliceIsCircular(slice, &circular))
1584 return ajFalse;
1585
1586 dba = ensSequenceadaptorGetDatabaseadaptor(sqa);
1587
1588 csa = ensRegistryGetCoordsystemadaptor(dba);
1589 sla = ensRegistryGetSliceadaptor(dba);
1590
1591 if ((start > end) && (circular == ajTrue))
1592 {
1593 mpoint
1594 = ensSliceGetSeqregionLength(slice)
1595 - ensSliceGetStart(slice)
1596 + 1;
1597
1598 sequence1 = ajStrNew();
1599 sequence2 = ajStrNew();
1600
1601 sequenceadaptorFetchCircularsliceSubStr(
1602 sqa,
1603 slice,
1604 1,
1605 mpoint,
1606 1,
1607 &sequence1);
1608
1609 sequenceadaptorFetchCircularsliceSubStr(
1610 sqa,
1611 slice,
1612 mpoint + 1,
1613 ensSliceCalculateLength(slice),
1614 1,
1615 &sequence1);
1616
1617 if (ensSliceGetStrand(slice) >= 0)
1618 {
1619 ajStrAppendS(Psequence, sequence1);
1620 ajStrAppendS(Psequence, sequence2);
1621 }
1622 else
1623 {
1624 ajStrAppendS(Psequence, sequence2);
1625 ajStrAppendS(Psequence, sequence1);
1626
1627 ajSeqstrReverse(Psequence);
1628 }
1629
1630 ajStrDel(&sequence1);
1631 ajStrDel(&sequence2);
1632
1633 return ajTrue;
1634 }
1635
1636 /* Get a new Slice that spans the exact region to retrieve DNA from. */
1637
1638 /* Relative Slice coordinates range from 1 to length. */
1639
1640 five = 1 - start;
1641
1642 three = end - ensSliceCalculateLength(slice);
1643
1644 if (five || three)
1645 {
1646 if (ensSliceGetStrand(slice) >= 0)
1647 ensSliceFetchSliceexpanded(slice,
1648 five,
1649 three,
1650 ajFalse,
1651 &fshift,
1652 &tshift,
1653 &eslice);
1654 else
1655 ensSliceFetchSliceexpanded(slice,
1656 three,
1657 five,
1658 ajFalse,
1659 &tshift,
1660 &fshift,
1661 &eslice);
1662 }
1663
1664 /*
1665 ** Retrieve normalized non-symlinked Slice objects, which allows fetching
1666 ** of haplotypes (HAPs) and pseudo-autosomal regions (PARs).
1667 */
1668
1669 pslist = ajListNew();
1670
1671 ensSliceadaptorRetrieveNormalisedprojection(sla, eslice, pslist);
1672
1673 if (!ajListGetLength(pslist))
1674 ajFatal("sequenceadaptorFetchCircularsliceSubStr could not "
1675 "retrieve normalised Slices. Database contains incorrect "
1676 "information in the 'assembly_exception' table.\n");
1677
1678
1679 /*
1680 ** Call this method again with any Slice that was sym-linked to by this
1681 ** Slice.
1682 */
1683
1684 ajListPeekFirst(pslist, (void **) &ps);
1685
1686 if ((ajListGetLength(pslist) != 1) ||
1687 (!ensSliceMatch(ensProjectionsegmentGetTargetSlice(ps), slice)))
1688 {
1689 tmpstr = ajStrNew();
1690
1691 while (ajListPop(pslist, (void **) &ps))
1692 {
1693 nslice = ensProjectionsegmentGetTargetSlice(ps);
1694
1695 ensSequenceadaptorFetchSliceAllStr(sqa, nslice, &tmpstr);
1696
1697 ajStrAppendS(Psequence, tmpstr);
1698
1699 ensProjectionsegmentDel(&ps);
1700 }
1701
1702 ajStrDel(&tmpstr);
1703
1704 ajListFree(&pslist);
1705
1706 if (strand < 0)
1707 ajSeqstrReverse(Psequence);
1708
1709 ensSliceDel(&eslice);
1710
1711 return ajTrue;
1712 }
1713
1714 /*
1715 ** Clear the AJAX List of Ensembl Projection Segment objects resulting
1716 ** from the projection of the expanded Slice object to normalised
1717 ** Slice objects.
1718 */
1719
1720 while (ajListPop(pslist, (void **) &ps))
1721 ensProjectionsegmentDel(&ps);
1722
1723 /*
1724 ** Now, this Slice needs projecting onto the sequence-level Coordinate
1725 ** System even if it is already in this Coordinate System, because
1726 ** flanking gaps need trimming out the Slice is past the edges of
1727 ** the Sequence Region.
1728 */
1729
1730 ensCoordsystemadaptorFetchSeqlevel(csa, &seqlvlcs);
1731
1732 ensSliceProject(slice,
1733 ensCoordsystemGetName(seqlvlcs),
1734 ensCoordsystemGetVersion(seqlvlcs),
1735 pslist);
1736
1737 ensCoordsystemDel(&seqlvlcs);
1738
1739 /*
1740 ** Fetch the sequence for each of the Ensembl Sequence Region objects
1741 ** projected onto. Allocate space for 512 KiB (1 << 19) that should fit
1742 ** one BAC clone.
1743 */
1744
1745 tmpstr = ajStrNewRes((1 << 19) + 1);
1746
1747 while (ajListPop(pslist, (void **) &ps))
1748 {
1749 /*
1750 ** Check for gaps between Ensembl Projection Segment objects and
1751 ** pad them with Ns.
1752 */
1753
1754 padding = ensProjectionsegmentGetSourceStart(ps) - total - 1;
1755
1756 if (padding)
1757 {
1758 ajStrAppendCountK(Psequence, 'N', padding);
1759
1760 if (debug)
1761 ajDebug("sequenceadaptorFetchCircularsliceSubStr got total %u "
1762 "and Projection Segment source start %u, "
1763 "therefore added %u N padding between.\n",
1764 total,
1765 ensProjectionsegmentGetSourceStart(ps),
1766 padding);
1767 }
1768
1769 sslice = ensProjectionsegmentGetTargetSlice(ps);
1770
1771 ensSequenceadaptorFetchSeqregionSubStr(
1772 sqa,
1773 ensSliceGetSeqregion(sslice),
1774 ensSliceGetStart(sslice),
1775 ensSliceCalculateLength(sslice),
1776 &tmpstr);
1777
1778 if (ensSliceGetStrand(sslice) < 0)
1779 ajSeqstrReverse(&tmpstr);
1780
1781 ajStrAppendS(Psequence, tmpstr);
1782
1783 total = ensProjectionsegmentGetSourceEnd(ps);
1784
1785 ensProjectionsegmentDel(&ps);
1786 }
1787
1788 /* Check for any remaining gaps at the end. */
1789
1790 padding = ensSliceCalculateLength(slice) - total;
1791
1792 if (padding)
1793 {
1794 ajStrAppendCountK(Psequence, 'N', padding);
1795
1796 if (debug)
1797 ajDebug("sequenceadaptorFetchCircularsliceSubStr got total %u "
1798 "and Ensembl Slice length %u, "
1799 "therefore added %u N padding.\n",
1800 total,
1801 ensSliceCalculateLength(slice),
1802 padding);
1803 }
1804
1805 /*
1806 ** If the sequence is too short, because we came in with a sequence-level
1807 ** Slice that was partially off the Sequence Region, pad the end with Ns
1808 ** to make it long enough.
1809 ** NOTE: Since ajStrGetLen returns size_t, which exceeds ajint,
1810 ** the sequence length needs to be determined here.
1811 ** padding = ensSliceCalculateLength(slice) - ajStrGetLen(*Psequence);
1812 */
1813
1814 for (i = 0U, Ptr = ajStrGetPtr(*Psequence); (Ptr && *Ptr); i++, Ptr++)
1815 if (i == UINT_MAX)
1816 ajFatal("sequenceadaptorFetchCircularsliceSubStr exeeded "
1817 "UINT_MAX.");
1818
1819 padding = ensSliceCalculateLength(slice) - i;
1820
1821 if (padding)
1822 {
1823 ajStrAppendCountK(Psequence, 'N', padding);
1824
1825 if (debug)
1826 ajDebug("sequenceadaptorFetchCircularsliceSubStr got "
1827 "sequence length %u, but Slice length %u, "
1828 "therefore added %u N final padding.\n",
1829 ajStrGetLen(*Psequence),
1830 ensSliceCalculateLength(slice),
1831 padding);
1832 }
1833
1834 /* Apply Sequence Edits. */
1835
1836 ses = ajListNew();
1837
1838 ensSliceFetchAllSequenceedits(slice, ses);
1839
1840 /*
1841 ** Sort Sequence Edits in reverse order to avoid the complication of
1842 ** adjusting down-stream Sequence Edit coordinates.
1843 */
1844
1845 ensListSequenceeditSortStartDescending(ses);
1846
1847 while (ajListPop(ses, (void **) &se))
1848 {
1849 /* Adjust Sequence Edit coordinates to the Sub-Slice. */
1850
1851 ensSequenceeditApplyString(se,
1852 ensSliceGetStart(eslice) - 1,
1853 Psequence);
1854
1855 ensSequenceeditDel(&se);
1856 }
1857
1858 ajListFree(&ses);
1859
1860 /* Reverse sequence if requested. */
1861
1862 if (strand < 0)
1863 ajSeqstrReverse(Psequence);
1864
1865 ensSliceDel(&eslice);
1866
1867 return ajTrue;
1868 }
1869