1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34
35 #ifndef SEQAN_INCLUDE_STORE_STORE_IO_UCSC_H
36 #define SEQAN_INCLUDE_STORE_STORE_IO_UCSC_H
37
38 namespace seqan {
39
40 //////////////////////////////////////////////////////////////////////////////
41 // add a UCSC record to FragmentStore
42
43 template <typename TSpec, typename TConfig>
44 inline void
_storeAnnotationRecord(FragmentStore<TSpec,TConfig> & fragStore,UcscRecord const & record,UcscKnownGene)45 _storeAnnotationRecord(
46 FragmentStore<TSpec, TConfig> & fragStore,
47 UcscRecord const & record,
48 UcscKnownGene)
49 {
50 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
51 typedef typename TFragmentStore::TAnnotationStore TAnnotationStore;
52 typedef typename Value<TAnnotationStore>::Type TAnnotation;
53 typedef typename TAnnotation::TId TId;
54
55 SEQAN_ASSERT_EQ(length(fragStore.annotationStore), length(fragStore.annotationNameStore));
56
57 // add transcript and CDS
58 TId transId = TAnnotation::INVALID_ID;
59 _storeAppendAnnotationName(fragStore, transId, record.transName, (TId) TFragmentStore::ANNO_MRNA);
60 TId cdsId = length(fragStore.annotationStore);
61 appendName(fragStore.annotationNameStoreCache, record.proteinName);
62
63 resize(fragStore.annotationStore, cdsId + 1 + length(record.exonBegin), Generous());
64 resize(fragStore.annotationNameStore, cdsId + 1 + length(record.exonBegin), Generous());
65
66 // add contig name
67 TId contigId = TAnnotation::INVALID_ID;
68 _storeAppendContig(fragStore, contigId, record.contigName);
69
70 // update transcript data
71 TAnnotation & transcript = fragStore.annotationStore[transId];
72 TId geneId = transcript.parentId;
73 if (geneId == TAnnotation::INVALID_ID)
74 geneId = 0;
75 transcript.parentId = geneId;
76 transcript.contigId = contigId;
77 transcript.typeId = TFragmentStore::ANNO_MRNA;
78
79 // add CDS entry
80 TAnnotation & cds = fragStore.annotationStore[cdsId];
81 cds.parentId = transId;
82 cds.contigId = contigId;
83 cds.typeId = TFragmentStore::ANNO_CDS;
84 cds.beginPos = record.cdsBegin;
85 cds.endPos = record.cdsEnd;
86 _adjustParent(transcript, cds);
87
88 // add exons
89 for (unsigned i = 0; i < length(record.exonBegin); ++i)
90 {
91 TAnnotation & exon = fragStore.annotationStore[cdsId + 1 + i];
92 exon.parentId = transId;
93 exon.contigId = contigId;
94 exon.typeId = TFragmentStore::ANNO_EXON;
95 exon.beginPos = record.exonBegin[i];
96 exon.endPos = record.exonEnds[i];
97 _adjustParent(transcript, exon);
98 }
99 if (geneId != 0)
100 _adjustParent(fragStore.annotationStore[geneId], transcript);
101 }
102
103 template <typename TSpec, typename TConfig>
104 inline void
_storeAnnotationRecord(FragmentStore<TSpec,TConfig> & fragStore,UcscRecord const & record,UcscKnownIsoforms const &)105 _storeAnnotationRecord(
106 FragmentStore<TSpec, TConfig> & fragStore,
107 UcscRecord const & record,
108 UcscKnownIsoforms const &)
109 {
110 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
111 typedef typename TFragmentStore::TAnnotationStore TAnnotationStore;
112 typedef typename Value<TAnnotationStore>::Type TAnnotation;
113 typedef typename TAnnotation::TId TId;
114
115 SEQAN_ASSERT_EQ(length(fragStore.annotationStore), length(fragStore.annotationNameStore));
116
117 TId geneId = TAnnotation::INVALID_ID;
118 TId transId = TAnnotation::INVALID_ID;
119
120 // add transcript and CDS
121 _storeAppendAnnotationName(fragStore, geneId, record.transName, (TId) TFragmentStore::ANNO_GENE);
122 _storeAppendAnnotationName(fragStore, transId, record.contigName, (TId) TFragmentStore::ANNO_MRNA);
123
124 // set parent link locus->root
125 TAnnotation & locus = fragStore.annotationStore[geneId];
126 locus.parentId = 0;
127 locus.typeId = TFragmentStore::ANNO_GENE;
128
129 // set parent link transcript->locus
130 TAnnotation & transcript = fragStore.annotationStore[transId];
131 transcript.parentId = geneId;
132 transcript.typeId = TFragmentStore::ANNO_MRNA;
133
134 _adjustParent(locus, transcript);
135 }
136
137 // support for dynamically chosen file formats
138 template <typename TSpec, typename TConfig>
139 inline void
_storeAnnotationRecord(FragmentStore<TSpec,TConfig> &,UcscRecord const &,TagSelector<> const &)140 _storeAnnotationRecord(
141 FragmentStore<TSpec, TConfig> & /*store*/,
142 UcscRecord const & /*record*/,
143 TagSelector<> const & /*format*/)
144 {
145 SEQAN_FAIL("AnnotationStore: File format not specified.");
146 }
147
148 template <typename TSpec, typename TConfig, typename TTagList>
149 inline void
_storeAnnotationRecord(FragmentStore<TSpec,TConfig> & store,UcscRecord const & record,TagSelector<TTagList> const & format)150 _storeAnnotationRecord(
151 FragmentStore<TSpec, TConfig> & store,
152 UcscRecord const & record,
153 TagSelector<TTagList> const & format)
154 {
155 typedef typename TTagList::Type TFormat;
156
157 if (isEqual(format, TFormat()))
158 _storeAnnotationRecord(store, record, TFormat());
159 else
160 _storeAnnotationRecord(store, record, static_cast<typename TagSelector<TTagList>::Base const &>(format));
161 }
162
163
164
165 //////////////////////////////////////////////////////////////////////////////
166 // read a whole UCSC stream into FragmentStore
167
168 template <typename TFSSpec, typename TConfig, typename TSpec>
169 inline void
readRecords(FragmentStore<TFSSpec,TConfig> & fragStore,FormattedFile<Ucsc,Input,TSpec> & file)170 readRecords(FragmentStore<TFSSpec, TConfig> & fragStore,
171 FormattedFile<Ucsc, Input, TSpec> & file)
172 {
173 typedef FormattedFile<Ucsc, Input, TSpec> TFile;
174 typename DirectionIterator<TFile, Input>::Type iter = directionIterator(file, Input());
175
176 if (atEnd(iter))
177 return;
178
179 // get first character from the stream
180 UcscRecord record;
181 UcscIOContext ctx;
182
183 refresh(fragStore.contigNameStoreCache);
184 refresh(fragStore.annotationNameStoreCache);
185 refresh(fragStore.annotationTypeStoreCache);
186
187 while (!atEnd(iter))
188 {
189 readRecord(record, ctx, iter, format(file));
190 _storeAnnotationRecord(fragStore, record, format(file));
191 }
192 _storeClearAnnoBackLinks(fragStore.annotationStore);
193 _storeCreateAnnoBackLinks(fragStore.annotationStore);
194 _storeRemoveTempAnnoNames(fragStore);
195 }
196
197 //////////////////////////////////////////////////////////////////////////////
198 // extract FragmentStore annotation into a UCSC record
199
200 template <typename TRecord, typename TSpec, typename TConfig, typename TAnnotation, typename TId>
201 inline bool
_fillAnnotationRecord(TRecord & record,FragmentStore<TSpec,TConfig> & fragStore,TAnnotation & annotation,TId id,UcscKnownGene)202 _fillAnnotationRecord(
203 TRecord & record,
204 FragmentStore<TSpec, TConfig> & fragStore,
205 TAnnotation & annotation,
206 TId id,
207 UcscKnownGene)
208 {
209 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
210
211 if (annotation.typeId != TFragmentStore::ANNO_MRNA)
212 return false;
213
214 record.transName = getAnnoUniqueName(fragStore, id);
215 if (annotation.contigId < length(fragStore.contigNameStore))
216 record.contigName = fragStore.contigNameStore[annotation.contigId];
217 else
218 clear(record.contigName);
219
220 clear(record.proteinName);
221 clear(record.exonBegin);
222 clear(record.exonEnds);
223
224 record.annotationBeginPos = annotation.beginPos;
225 record.annotationEndPos = annotation.endPos;
226
227 TId lastChildId = annotation.lastChildId;
228 TId i = lastChildId;
229 do
230 {
231 i = fragStore.annotationStore[i].nextSiblingId;
232 TAnnotation & anno = fragStore.annotationStore[i];
233 if (anno.typeId == TFragmentStore::ANNO_CDS)
234 {
235 if (i < length(fragStore.annotationNameStore))
236 record.proteinName = fragStore.annotationNameStore[i];
237 record.cdsBegin = anno.beginPos;
238 record.cdsEnd = anno.endPos;
239 }
240 if (anno.typeId == TFragmentStore::ANNO_EXON)
241 {
242 appendValue(record.exonBegin, anno.beginPos, Generous());
243 appendValue(record.exonEnds, anno.endPos, Generous());
244 }
245 }
246 while (i != lastChildId);
247 return true;
248 }
249
250 template <typename TRecord, typename TSpec, typename TConfig, typename TAnnotation, typename TId>
251 inline bool
_fillAnnotationRecord(TRecord & record,FragmentStore<TSpec,TConfig> & fragStore,TAnnotation & annotation,TId id,UcscKnownIsoforms)252 _fillAnnotationRecord(
253 TRecord & record,
254 FragmentStore<TSpec, TConfig> & fragStore,
255 TAnnotation & annotation,
256 TId id,
257 UcscKnownIsoforms)
258 {
259 typedef FragmentStore<TSpec, TConfig> TFragmentStore;
260
261 if (annotation.typeId != TFragmentStore::ANNO_MRNA)
262 return false;
263
264 if (annotation.parentId == TAnnotation::INVALID_ID || annotation.parentId == 0)
265 return false;
266
267 record.transName = getAnnoUniqueName(fragStore, annotation.parentId);
268 record.contigName = getAnnoUniqueName(fragStore, id);
269 return true;
270 }
271
272 //////////////////////////////////////////////////////////////////////////////
273 // write FragmentStore to a stream in UCSC format
274
275 template <typename TSpec, typename TFSSpec, typename TFSConfig, typename TFormat>
276 inline void
writeRecords(FormattedFile<Ucsc,Output,TSpec> & ucscFile,FragmentStore<TFSSpec,TFSConfig> & store,TFormat const & format)277 writeRecords(FormattedFile<Ucsc, Output, TSpec> & ucscFile,
278 FragmentStore<TFSSpec, TFSConfig> & store,
279 TFormat const & format)
280 {
281 typedef FragmentStore<TFSSpec, TFSConfig> TFragmentStore;
282 typedef typename TFragmentStore::TAnnotationStore TAnnotationStore;
283 typedef typename Value<TAnnotationStore>::Type TAnnotation;
284 typedef typename Iterator<TAnnotationStore, Standard>::Type TAnnoIter;
285 typedef typename Id<TAnnotation>::Type TId;
286
287 typename DirectionIterator<FormattedFile<Ucsc, Output, TSpec>, Output>::Type iter = directionIterator(ucscFile, Output());
288 UcscRecord record;
289
290 TAnnoIter it = begin(store.annotationStore, Standard());
291 TAnnoIter itEnd = end(store.annotationStore, Standard());
292
293 for (TId id = 0; it != itEnd; ++it, ++id)
294 if (_fillAnnotationRecord(record, store, *it, id, format))
295 writeRecord(iter, record, format);
296 }
297
298 template <typename TSpec, typename TFSSpec, typename TFSConfig>
299 inline void
writeRecords(FormattedFile<Ucsc,Output,TSpec> & ucscFile,FragmentStore<TFSSpec,TFSConfig> & store)300 writeRecords(FormattedFile<Ucsc, Output, TSpec> & ucscFile,
301 FragmentStore<TFSSpec, TFSConfig> & store)
302 {
303 writeRecords(ucscFile, store, format(ucscFile));
304 }
305
306 } // namespace seqan
307
308 #endif //#ifndef SEQAN_INCLUDE_STORE_STORE_IO_UCSC_H
309