1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // ==========================================================================
34 
35 #ifndef SEQAN_INCLUDE_STORE_STORE_IO_UCSC_H
36 #define SEQAN_INCLUDE_STORE_STORE_IO_UCSC_H
37 
38 namespace seqan {
39 
40 //////////////////////////////////////////////////////////////////////////////
41 // add a UCSC record to FragmentStore
42 
43 template <typename TSpec, typename TConfig>
44 inline void
_storeAnnotationRecord(FragmentStore<TSpec,TConfig> & fragStore,UcscRecord const & record,UcscKnownGene)45 _storeAnnotationRecord(
46     FragmentStore<TSpec, TConfig> & fragStore,
47     UcscRecord const & record,
48     UcscKnownGene)
49 {
50     typedef FragmentStore<TSpec, TConfig>               TFragmentStore;
51     typedef typename TFragmentStore::TAnnotationStore   TAnnotationStore;
52     typedef typename Value<TAnnotationStore>::Type      TAnnotation;
53     typedef typename TAnnotation::TId                   TId;
54 
55     SEQAN_ASSERT_EQ(length(fragStore.annotationStore), length(fragStore.annotationNameStore));
56 
57     // add transcript and CDS
58     TId transId = TAnnotation::INVALID_ID;
59     _storeAppendAnnotationName(fragStore, transId, record.transName, (TId) TFragmentStore::ANNO_MRNA);
60     TId cdsId = length(fragStore.annotationStore);
61     appendName(fragStore.annotationNameStoreCache, record.proteinName);
62 
63     resize(fragStore.annotationStore, cdsId + 1 + length(record.exonBegin), Generous());
64     resize(fragStore.annotationNameStore, cdsId + 1 + length(record.exonBegin), Generous());
65 
66     // add contig name
67     TId contigId = TAnnotation::INVALID_ID;
68     _storeAppendContig(fragStore, contigId, record.contigName);
69 
70     // update transcript data
71     TAnnotation & transcript = fragStore.annotationStore[transId];
72     TId geneId = transcript.parentId;
73     if (geneId == TAnnotation::INVALID_ID)
74         geneId = 0;
75     transcript.parentId = geneId;
76     transcript.contigId = contigId;
77     transcript.typeId = TFragmentStore::ANNO_MRNA;
78 
79     // add CDS entry
80     TAnnotation & cds = fragStore.annotationStore[cdsId];
81     cds.parentId = transId;
82     cds.contigId = contigId;
83     cds.typeId = TFragmentStore::ANNO_CDS;
84     cds.beginPos = record.cdsBegin;
85     cds.endPos = record.cdsEnd;
86     _adjustParent(transcript, cds);
87 
88     // add exons
89     for (unsigned i = 0; i < length(record.exonBegin); ++i)
90     {
91         TAnnotation & exon = fragStore.annotationStore[cdsId + 1 + i];
92         exon.parentId = transId;
93         exon.contigId = contigId;
94         exon.typeId = TFragmentStore::ANNO_EXON;
95         exon.beginPos = record.exonBegin[i];
96         exon.endPos = record.exonEnds[i];
97         _adjustParent(transcript, exon);
98     }
99     if (geneId != 0)
100         _adjustParent(fragStore.annotationStore[geneId], transcript);
101 }
102 
103 template <typename TSpec, typename TConfig>
104 inline void
_storeAnnotationRecord(FragmentStore<TSpec,TConfig> & fragStore,UcscRecord const & record,UcscKnownIsoforms const &)105 _storeAnnotationRecord(
106     FragmentStore<TSpec, TConfig> & fragStore,
107     UcscRecord const & record,
108     UcscKnownIsoforms const &)
109 {
110     typedef FragmentStore<TSpec, TConfig>               TFragmentStore;
111     typedef typename TFragmentStore::TAnnotationStore   TAnnotationStore;
112     typedef typename Value<TAnnotationStore>::Type      TAnnotation;
113     typedef typename TAnnotation::TId                   TId;
114 
115     SEQAN_ASSERT_EQ(length(fragStore.annotationStore), length(fragStore.annotationNameStore));
116 
117     TId geneId = TAnnotation::INVALID_ID;
118     TId transId = TAnnotation::INVALID_ID;
119 
120     // add transcript and CDS
121     _storeAppendAnnotationName(fragStore, geneId, record.transName, (TId) TFragmentStore::ANNO_GENE);
122     _storeAppendAnnotationName(fragStore, transId, record.contigName, (TId) TFragmentStore::ANNO_MRNA);
123 
124     // set parent link locus->root
125     TAnnotation & locus = fragStore.annotationStore[geneId];
126     locus.parentId = 0;
127     locus.typeId = TFragmentStore::ANNO_GENE;
128 
129     // set parent link transcript->locus
130     TAnnotation & transcript = fragStore.annotationStore[transId];
131     transcript.parentId = geneId;
132     transcript.typeId = TFragmentStore::ANNO_MRNA;
133 
134     _adjustParent(locus, transcript);
135 }
136 
137 // support for dynamically chosen file formats
138 template <typename TSpec, typename TConfig>
139 inline void
_storeAnnotationRecord(FragmentStore<TSpec,TConfig> &,UcscRecord const &,TagSelector<> const &)140 _storeAnnotationRecord(
141     FragmentStore<TSpec, TConfig> & /*store*/,
142     UcscRecord const & /*record*/,
143     TagSelector<> const & /*format*/)
144 {
145     SEQAN_FAIL("AnnotationStore: File format not specified.");
146 }
147 
148 template <typename TSpec, typename TConfig, typename TTagList>
149 inline void
_storeAnnotationRecord(FragmentStore<TSpec,TConfig> & store,UcscRecord const & record,TagSelector<TTagList> const & format)150 _storeAnnotationRecord(
151     FragmentStore<TSpec, TConfig> & store,
152     UcscRecord const & record,
153     TagSelector<TTagList> const & format)
154 {
155     typedef typename TTagList::Type TFormat;
156 
157     if (isEqual(format, TFormat()))
158         _storeAnnotationRecord(store, record, TFormat());
159     else
160         _storeAnnotationRecord(store, record, static_cast<typename TagSelector<TTagList>::Base const &>(format));
161 }
162 
163 
164 
165 //////////////////////////////////////////////////////////////////////////////
166 // read a whole UCSC stream into FragmentStore
167 
168 template <typename TFSSpec, typename TConfig, typename TSpec>
169 inline void
readRecords(FragmentStore<TFSSpec,TConfig> & fragStore,FormattedFile<Ucsc,Input,TSpec> & file)170 readRecords(FragmentStore<TFSSpec, TConfig> & fragStore,
171             FormattedFile<Ucsc, Input, TSpec> & file)
172 {
173     typedef FormattedFile<Ucsc, Input, TSpec> TFile;
174     typename DirectionIterator<TFile, Input>::Type iter = directionIterator(file, Input());
175 
176     if (atEnd(iter))
177         return;
178 
179     // get first character from the stream
180     UcscRecord record;
181     UcscIOContext ctx;
182 
183     refresh(fragStore.contigNameStoreCache);
184     refresh(fragStore.annotationNameStoreCache);
185     refresh(fragStore.annotationTypeStoreCache);
186 
187     while (!atEnd(iter))
188     {
189         readRecord(record, ctx, iter, format(file));
190         _storeAnnotationRecord(fragStore, record, format(file));
191     }
192     _storeClearAnnoBackLinks(fragStore.annotationStore);
193     _storeCreateAnnoBackLinks(fragStore.annotationStore);
194     _storeRemoveTempAnnoNames(fragStore);
195 }
196 
197 //////////////////////////////////////////////////////////////////////////////
198 // extract FragmentStore annotation into a UCSC record
199 
200 template <typename TRecord, typename TSpec, typename TConfig, typename TAnnotation, typename TId>
201 inline bool
_fillAnnotationRecord(TRecord & record,FragmentStore<TSpec,TConfig> & fragStore,TAnnotation & annotation,TId id,UcscKnownGene)202 _fillAnnotationRecord(
203     TRecord & record,
204     FragmentStore<TSpec, TConfig> & fragStore,
205     TAnnotation & annotation,
206     TId id,
207     UcscKnownGene)
208 {
209     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
210 
211     if (annotation.typeId != TFragmentStore::ANNO_MRNA)
212         return false;
213 
214     record.transName = getAnnoUniqueName(fragStore, id);
215     if (annotation.contigId < length(fragStore.contigNameStore))
216         record.contigName = fragStore.contigNameStore[annotation.contigId];
217     else
218         clear(record.contigName);
219 
220     clear(record.proteinName);
221     clear(record.exonBegin);
222     clear(record.exonEnds);
223 
224     record.annotationBeginPos = annotation.beginPos;
225     record.annotationEndPos = annotation.endPos;
226 
227     TId lastChildId = annotation.lastChildId;
228     TId i = lastChildId;
229     do
230     {
231         i = fragStore.annotationStore[i].nextSiblingId;
232         TAnnotation & anno = fragStore.annotationStore[i];
233         if (anno.typeId == TFragmentStore::ANNO_CDS)
234         {
235             if (i < length(fragStore.annotationNameStore))
236                 record.proteinName = fragStore.annotationNameStore[i];
237             record.cdsBegin = anno.beginPos;
238             record.cdsEnd = anno.endPos;
239         }
240         if (anno.typeId == TFragmentStore::ANNO_EXON)
241         {
242             appendValue(record.exonBegin, anno.beginPos, Generous());
243             appendValue(record.exonEnds, anno.endPos, Generous());
244         }
245     }
246     while (i != lastChildId);
247     return true;
248 }
249 
250 template <typename TRecord, typename TSpec, typename TConfig, typename TAnnotation, typename TId>
251 inline bool
_fillAnnotationRecord(TRecord & record,FragmentStore<TSpec,TConfig> & fragStore,TAnnotation & annotation,TId id,UcscKnownIsoforms)252 _fillAnnotationRecord(
253     TRecord & record,
254     FragmentStore<TSpec, TConfig> & fragStore,
255     TAnnotation & annotation,
256     TId id,
257     UcscKnownIsoforms)
258 {
259     typedef FragmentStore<TSpec, TConfig> TFragmentStore;
260 
261     if (annotation.typeId != TFragmentStore::ANNO_MRNA)
262         return false;
263 
264     if (annotation.parentId == TAnnotation::INVALID_ID || annotation.parentId == 0)
265         return false;
266 
267     record.transName = getAnnoUniqueName(fragStore, annotation.parentId);
268     record.contigName = getAnnoUniqueName(fragStore, id);
269     return true;
270 }
271 
272 //////////////////////////////////////////////////////////////////////////////
273 // write FragmentStore to a stream in UCSC format
274 
275 template <typename TSpec, typename TFSSpec, typename TFSConfig, typename TFormat>
276 inline void
writeRecords(FormattedFile<Ucsc,Output,TSpec> & ucscFile,FragmentStore<TFSSpec,TFSConfig> & store,TFormat const & format)277 writeRecords(FormattedFile<Ucsc, Output, TSpec> & ucscFile,
278              FragmentStore<TFSSpec, TFSConfig> & store,
279              TFormat const & format)
280 {
281     typedef FragmentStore<TFSSpec, TFSConfig>                       TFragmentStore;
282     typedef typename TFragmentStore::TAnnotationStore               TAnnotationStore;
283     typedef typename Value<TAnnotationStore>::Type                  TAnnotation;
284     typedef typename Iterator<TAnnotationStore, Standard>::Type     TAnnoIter;
285     typedef typename Id<TAnnotation>::Type                          TId;
286 
287     typename DirectionIterator<FormattedFile<Ucsc, Output, TSpec>, Output>::Type iter = directionIterator(ucscFile, Output());
288     UcscRecord record;
289 
290     TAnnoIter it = begin(store.annotationStore, Standard());
291     TAnnoIter itEnd = end(store.annotationStore, Standard());
292 
293     for (TId id = 0; it != itEnd; ++it, ++id)
294         if (_fillAnnotationRecord(record, store, *it, id, format))
295             writeRecord(iter, record, format);
296 }
297 
298 template <typename TSpec, typename TFSSpec, typename TFSConfig>
299 inline void
writeRecords(FormattedFile<Ucsc,Output,TSpec> & ucscFile,FragmentStore<TFSSpec,TFSConfig> & store)300 writeRecords(FormattedFile<Ucsc, Output, TSpec> & ucscFile,
301              FragmentStore<TFSSpec, TFSConfig> & store)
302 {
303     writeRecords(ucscFile, store, format(ucscFile));
304 }
305 
306 } // namespace seqan
307 
308 #endif //#ifndef SEQAN_INCLUDE_STORE_STORE_IO_UCSC_H
309