1 /*===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26
27 #include <klib/rc.h>
28 #include <klib/printf.h>
29 #include <klib/vector.h>
30
31 #include <kfs/directory.h>
32 #include <kfs/file.h>
33
34 #include <string.h>
35
36 #include "rna_splice_log.h"
37
38 typedef struct rna_splice_dict rna_splice_dict;
39 struct rna_splice_dict
40 {
41 KVector * v;
42 };
43
44
make_rna_splice_dict(void)45 struct rna_splice_dict * make_rna_splice_dict( void )
46 {
47 struct rna_splice_dict * res = NULL;
48 KVector * v;
49 rc_t rc = KVectorMake ( &v );
50 if ( rc == 0 )
51 {
52 res = calloc( 1, sizeof * res );
53 if ( res != NULL )
54 {
55 res->v = v;
56 }
57 else
58 {
59 KVectorRelease ( v );
60 }
61 }
62 return res;
63 }
64
65
free_rna_splice_dict(struct rna_splice_dict * dict)66 void free_rna_splice_dict( struct rna_splice_dict * dict )
67 {
68 if ( dict != NULL )
69 {
70 KVectorRelease ( dict->v );
71 free( dict );
72 }
73 }
74
75
76 typedef struct splice_dict_key splice_dict_key;
77 struct splice_dict_key
78 {
79 uint32_t len;
80 uint32_t pos;
81 };
82
83 union dict_key_union
84 {
85 uint64_t key;
86 splice_dict_key key_struct;
87 };
88
89 union dict_value_union
90 {
91 uint64_t value;
92 splice_dict_entry entry;
93 };
94
95
/* Looks up the entry stored for ( pos, len ).
 *
 * dict  : dictionary to search; NULL yields false
 * pos   : position half of the key
 * len   : length half of the key
 * entry : optional output; when not NULL and the key exists, receives the
 *         stored count and intron_type
 *
 * Returns true if the key is present, false otherwise.
 */
bool rna_splice_dict_get( struct rna_splice_dict * dict,
                          uint32_t pos, uint32_t len, splice_dict_entry * entry )
{
    union dict_key_union lookup;
    union dict_value_union found;

    if ( dict == NULL )
        return false;

    lookup.key_struct.pos = pos;
    lookup.key_struct.len = len;
    if ( KVectorGetU64 ( dict->v, lookup.key, &( found.value ) ) != 0 )
        return false;

    if ( entry != NULL )
    {
        entry->count = found.entry.count;
        entry->intron_type = found.entry.intron_type;
    }
    return true;
}
118
119
rna_splice_dict_set(struct rna_splice_dict * dict,uint32_t pos,uint32_t len,const splice_dict_entry * entry)120 void rna_splice_dict_set( struct rna_splice_dict * dict,
121 uint32_t pos, uint32_t len, const splice_dict_entry * entry )
122 {
123 if ( dict != NULL && entry != NULL )
124 {
125 union dict_key_union ku;
126 union dict_value_union vu;
127
128 ku.key_struct.pos = pos;
129 ku.key_struct.len = len;
130 vu.entry.count = entry->count;
131 vu.entry.intron_type = entry->intron_type;
132 KVectorSetU64 ( dict->v, ku.key, vu.value );
133 }
134 }
135
136
137 /* --------------------------------------------------------------------------- */
138
139
140 typedef struct rna_splice_log rna_splice_log;
141 struct rna_splice_log
142 {
143 KFile * log_file;
144 const char * tool_name;
145 struct ReferenceObj const * ref_obj;
146
147 char ref_name[ 1024 ];
148 uint64_t log_file_pos;
149 };
150
151
make_rna_splice_log(const char * filename,const char * toolname)152 struct rna_splice_log * make_rna_splice_log( const char * filename, const char * toolname )
153 {
154 struct rna_splice_log * res = NULL;
155 KDirectory * dir;
156 rc_t rc = KDirectoryNativeDir ( &dir );
157 if ( rc == 0 )
158 {
159 KFile * f;
160 rc = KDirectoryCreateFile ( dir, &f, false, 0664, kcmInit, "%s", filename );
161 if ( rc == 0 )
162 {
163 res = calloc( 1, sizeof * res );
164 if ( res != NULL )
165 {
166 res->log_file = f;
167 if ( toolname != NULL )
168 res->tool_name = string_dup_measure ( toolname, NULL );
169 }
170 else
171 KFileRelease ( f );
172 }
173 KDirectoryRelease ( dir );
174 }
175 return res;
176 }
177
178
free_rna_splice_log(struct rna_splice_log * sl)179 void free_rna_splice_log( struct rna_splice_log * sl )
180 {
181 if ( sl != NULL )
182 {
183 KFileRelease ( sl->log_file );
184 if ( sl->tool_name != NULL ) free( ( void * )sl->tool_name );
185 free( ( void * ) sl );
186 }
187 }
188
189
rna_splice_log_enter_ref(struct rna_splice_log * sl,const char * ref_name,struct ReferenceObj const * ref_obj)190 void rna_splice_log_enter_ref( struct rna_splice_log * sl,
191 const char * ref_name,
192 struct ReferenceObj const * ref_obj )
193 {
194 if ( sl != NULL )
195 {
196 if ( ref_name != NULL )
197 string_copy_measure ( sl->ref_name, sizeof( sl->ref_name ), ref_name );
198 else
199 sl->ref_name[ 0 ] = 0;
200
201 sl->ref_obj = ref_obj;
202 }
203 }
204
205
/* Writes the reverse complement of src[ 0 .. count ) to dst[ 0 .. count ).
 *
 * The lookup table is indexed by the source byte. IUPAC base letters map to
 * the upper-case complement for both upper- and lower-case input; '.' and
 * the digits '0'..'3' map to themselves; every other byte maps to 0.
 * dst and src must not overlap.
 */
static void copy_read_and_reverse_complement( uint8_t * dst, const uint8_t * const src, INSDC_coord_len const count )
{
    static char const complement_of[] = {
        /* 0x00 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x08 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x10 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x18 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x20 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x28 */ 0 , 0 , 0 , 0 , 0 , 0 , '.', 0 ,
        /* 0x30 */ '0', '1', '2', '3', 0 , 0 , 0 , 0 ,
        /* 0x38 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x40 */ 0 , 'T', 'V', 'G', 'H', 0 , 0 , 'C',
        /* 0x48 */ 'D', 0 , 0 , 'M', 0 , 'K', 'N', 0 ,
        /* 0x50 */ 0 , 0 , 'Y', 'S', 'A', 'A', 'B', 'W',
        /* 0x58 */ 0 , 'R', 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x60 */ 0 , 'T', 'V', 'G', 'H', 0 , 0 , 'C',
        /* 0x68 */ 'D', 0 , 0 , 'M', 0 , 'K', 'N', 0 ,
        /* 0x70 */ 0 , 0 , 'Y', 'S', 'A', 'A', 'B', 'W',
        /* 0x78 */ 0 , 'R', 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x80 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x88 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x90 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0x98 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xA0 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xA8 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xB0 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xB8 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xC0 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xC8 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xD0 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xD8 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xE0 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xE8 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xF0 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ,
        /* 0xF8 */ 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0
    };

    /* walk the source backwards while filling the destination forwards */
    const uint8_t * from = src + count;
    uint8_t * to = dst;
    INSDC_coord_len remaining;

    for ( remaining = count; remaining != 0; --remaining )
    {
        *to++ = complement_of[ *--from ];
    }
}
250
251
/* number of reference bases shown on each side of a 2-base edge */
#define PRE_POST_LEN 10
/* full width of one edge window: both flanks plus the 2 edge bases */
#define EDGE_LEN ( 2 + ( 2 * PRE_POST_LEN ) )
254
255
write_to_file(struct rna_splice_log * sl,const uint8_t * src,size_t len)256 static rc_t write_to_file( struct rna_splice_log * sl, const uint8_t * src, size_t len )
257 {
258 size_t num_writ;
259 rc_t rc = KFileWriteAll( sl->log_file, sl->log_file_pos, src, len, &num_writ );
260 if ( rc == 0 )
261 sl->log_file_pos += num_writ;
262 return rc;
263 }
264
print_edge(struct rna_splice_log * sl,INSDC_coord_zero pos,bool const reverse_complement,bool const add_newline)265 static rc_t print_edge( struct rna_splice_log * sl,
266 INSDC_coord_zero pos,
267 bool const reverse_complement,
268 bool const add_newline )
269 {
270 rc_t rc;
271 INSDC_coord_len from_ref_obj, to_read;
272 uint8_t buffer[ EDGE_LEN + 1 ];
273 INSDC_coord_zero rd_pos = 0;
274 uint32_t pre_len = PRE_POST_LEN;
275 uint32_t post_len = PRE_POST_LEN;
276
277 if ( pos >= PRE_POST_LEN )
278 rd_pos = ( pos - PRE_POST_LEN ); /* in the rare case the delete is at the very beginning of the alignment */
279 else
280 pre_len = pos; /* rd_pos is still 0, what we want*/
281
282 to_read = pre_len + post_len + 2;
283 rc = ReferenceObj_Read( sl->ref_obj, rd_pos, to_read, buffer, &from_ref_obj );
284 if ( rc == 0 )
285 {
286 uint8_t complement[ EDGE_LEN + 1 ];
287 uint8_t to_write[ EDGE_LEN + 5 ];
288 uint8_t * ref_bytes = buffer;
289
290 if ( from_ref_obj < to_read )
291 post_len -= ( to_read - from_ref_obj );
292
293 if ( reverse_complement )
294 {
295 copy_read_and_reverse_complement( complement, buffer, from_ref_obj );
296 ref_bytes = complement;
297 }
298 memmove( to_write, ref_bytes, pre_len );
299 to_write[ pre_len ] = '\t';
300 to_write[ pre_len + 1 ] = ref_bytes[ pre_len ];
301 to_write[ pre_len + 2 ] = ref_bytes[ pre_len + 1 ];
302 to_write[ pre_len + 3 ] = '\t';
303 memmove( &( to_write[ pre_len + 4 ] ), &( ref_bytes[ pre_len + 2 ] ), post_len );
304
305 if ( add_newline )
306 to_write[ pre_len + post_len + 4 ] = '\n';
307 else
308 to_write[ pre_len + post_len + 4 ] = '\t';
309
310 rc = write_to_file( sl, to_write, pre_len + post_len + 5 );
311 }
312 return rc;
313 }
314
315
316 /*
317 #define INTRON_UNKNOWN 0
318 #define INTRON_FWD 1
319 #define INTRON_REV 2
320 */
321
/* log character per intron type, indexed by ( intron_type & 0x03 ):
   0 -> 'u', 1 -> '+', 2 -> '-', 3 -> 'u' */
static const char intron_type_to_ascii[ 4 ] =
{
    'u',
    '+',
    '-',
    'u'
};
323
on_dict_key_value(uint64_t key,uint64_t value,void * user_data)324 static rc_t CC on_dict_key_value( uint64_t key, uint64_t value, void * user_data )
325 {
326 rc_t rc = 0;
327 struct rna_splice_log * sl = ( struct rna_splice_log * )user_data;
328 if ( sl != NULL )
329 {
330 char tmp[ 512 ];
331 size_t num_writ;
332 union dict_key_union ku;
333 union dict_value_union vu;
334 char intron;
335 bool reverse_complement;
336
337 ku.key = key;
338 vu.value = value;
339 intron = intron_type_to_ascii[ vu.entry.intron_type & 0x03 ];
340 reverse_complement = ( ( vu.entry.intron_type & 0x03 ) == INTRON_REV );
341
342 rc = string_printf ( tmp, sizeof tmp, &num_writ,
343 "%s\t%u\t%u\t%u\t%c\t",
344 sl->ref_name, ku.key_struct.pos + 1, ku.key_struct.len, vu.entry.count, intron );
345 if ( rc == 0 )
346 rc = write_to_file( sl, ( uint8_t * )tmp, num_writ );
347
348 if ( reverse_complement )
349 {
350 if ( rc == 0 )
351 rc = print_edge( sl, ku.key_struct.pos + ku.key_struct.len - 2, true, false );
352 if ( rc == 0 )
353 rc = print_edge( sl, ku.key_struct.pos, true, true );
354 }
355 else
356 {
357 if ( rc == 0 )
358 rc = print_edge( sl, ku.key_struct.pos, false, false );
359 if ( rc == 0 )
360 rc = print_edge( sl, ku.key_struct.pos + ku.key_struct.len - 2, false, true );
361 }
362 }
363 return rc;
364 }
365
366
rna_splice_log_exit_ref(struct rna_splice_log * sl,struct rna_splice_dict * dict)367 void rna_splice_log_exit_ref( struct rna_splice_log * sl, struct rna_splice_dict * dict )
368 {
369 if ( sl != NULL && dict != NULL )
370 {
371 KVectorVisitU64 ( dict->v, false, on_dict_key_value, sl );
372 }
373 }
374
375