1 /*===========================================================================
2 *
3 *                            PUBLIC DOMAIN NOTICE
4 *               National Center for Biotechnology Information
5 *
6 *  This software/database is a "United States Government Work" under the
7 *  terms of the United States Copyright Act.  It was written as part of
8 *  the author's official duties as a United States Government employee and
9 *  thus cannot be copyrighted.  This software/database is freely available
10 *  to the public for use. The National Library of Medicine and the U.S.
11 *  Government have not placed any restriction on its use or reproduction.
12 *
13 *  Although all reasonable efforts have been taken to ensure the accuracy
14 *  and reliability of the software and data, the NLM and the U.S.
15 *  Government do not and cannot warrant the performance or results that
16 *  may be obtained by using this software or data. The NLM and the U.S.
17 *  Government disclaim all warranties, express or implied, including
18 *  warranties of performance, merchantability or fitness for any particular
19 *  purpose.
20 *
21 *  Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 */
26 
27 #include <klib/rc.h>
28 #include <klib/printf.h>
29 #include <klib/vector.h>
30 
31 #include <kfs/directory.h>
32 #include <kfs/file.h>
33 
34 #include <string.h>
35 
36 #include "rna_splice_log.h"
37 
38 typedef struct rna_splice_dict rna_splice_dict;
39 struct rna_splice_dict
40 {
41     KVector * v;
42 };
43 
44 
make_rna_splice_dict(void)45 struct rna_splice_dict * make_rna_splice_dict( void )
46 {
47     struct rna_splice_dict * res = NULL;
48     KVector * v;
49     rc_t rc = KVectorMake ( &v );
50     if ( rc == 0 )
51     {
52         res = calloc( 1, sizeof * res );
53         if ( res != NULL )
54         {
55             res->v = v;
56         }
57         else
58         {
59             KVectorRelease ( v );
60         }
61     }
62     return res;
63 }
64 
65 
free_rna_splice_dict(struct rna_splice_dict * dict)66 void free_rna_splice_dict( struct rna_splice_dict * dict )
67 {
68     if ( dict != NULL )
69     {
70         KVectorRelease ( dict->v );
71         free( dict );
72     }
73 }
74 
75 
/* The two 32-bit halves of a dictionary key: the length and the position that
   rna_splice_dict_get() / rna_splice_dict_set() receive as 'len' and 'pos'. */
typedef struct splice_dict_key
{
    uint32_t len;
    uint32_t pos;
} splice_dict_key;
82 
83 union dict_key_union
84 {
85     uint64_t key;
86     splice_dict_key key_struct;
87 };
88 
89 union dict_value_union
90 {
91     uint64_t value;
92     splice_dict_entry entry;
93 };
94 
95 
/* Looks up the entry stored for ( pos, len ).
 *
 * dict  ... dictionary to search; NULL yields false
 * pos   ... position half of the key
 * len   ... length half of the key
 * entry ... optional; receives count and intron_type when the key is found
 *
 * Returns true if KVectorGetU64 reports success for the key, false otherwise.
 */
bool rna_splice_dict_get( struct rna_splice_dict * dict,
                          uint32_t pos, uint32_t len, splice_dict_entry * entry )
{
    union dict_key_union packed_key;
    union dict_value_union packed_value;

    if ( dict == NULL )
        return false;

    /* fold both 32-bit values into the 64-bit key used by the vector */
    packed_key.key_struct.pos = pos;
    packed_key.key_struct.len = len;

    if ( KVectorGetU64 ( dict->v, packed_key.key, &packed_value.value ) != 0 )
        return false;

    if ( entry != NULL )
    {
        entry->count = packed_value.entry.count;
        entry->intron_type = packed_value.entry.intron_type;
    }
    return true;
}
118 
119 
rna_splice_dict_set(struct rna_splice_dict * dict,uint32_t pos,uint32_t len,const splice_dict_entry * entry)120 void rna_splice_dict_set( struct rna_splice_dict * dict,
121                           uint32_t pos, uint32_t len, const splice_dict_entry * entry )
122 {
123     if ( dict != NULL && entry != NULL )
124     {
125         union dict_key_union ku;
126         union dict_value_union vu;
127 
128         ku.key_struct.pos = pos;
129         ku.key_struct.len = len;
130         vu.entry.count = entry->count;
131         vu.entry.intron_type = entry->intron_type;
132         KVectorSetU64 ( dict->v, ku.key, vu.value );
133     }
134 }
135 
136 
137 /* --------------------------------------------------------------------------- */
138 
139 
140 typedef struct rna_splice_log rna_splice_log;
141 struct rna_splice_log
142 {
143     KFile * log_file;
144     const char * tool_name;
145     struct ReferenceObj const * ref_obj;
146 
147     char ref_name[ 1024 ];
148     uint64_t log_file_pos;
149 };
150 
151 
make_rna_splice_log(const char * filename,const char * toolname)152 struct rna_splice_log * make_rna_splice_log( const char * filename, const char * toolname )
153 {
154     struct rna_splice_log * res = NULL;
155     KDirectory * dir;
156     rc_t rc = KDirectoryNativeDir ( &dir );
157     if ( rc == 0 )
158     {
159         KFile * f;
160         rc = KDirectoryCreateFile ( dir, &f, false, 0664, kcmInit, "%s", filename );
161         if ( rc == 0 )
162         {
163             res = calloc( 1, sizeof * res );
164             if ( res != NULL )
165             {
166                 res->log_file = f;
167                 if ( toolname != NULL )
168                     res->tool_name = string_dup_measure ( toolname, NULL );
169             }
170             else
171                 KFileRelease ( f );
172         }
173         KDirectoryRelease ( dir );
174     }
175     return res;
176 }
177 
178 
free_rna_splice_log(struct rna_splice_log * sl)179 void free_rna_splice_log( struct rna_splice_log * sl )
180 {
181     if ( sl != NULL )
182     {
183         KFileRelease ( sl->log_file );
184         if ( sl->tool_name != NULL ) free( ( void * )sl->tool_name );
185         free( ( void * ) sl );
186     }
187 }
188 
189 
rna_splice_log_enter_ref(struct rna_splice_log * sl,const char * ref_name,struct ReferenceObj const * ref_obj)190 void rna_splice_log_enter_ref( struct rna_splice_log * sl,
191                                const char * ref_name,
192                                struct ReferenceObj const * ref_obj )
193 {
194     if ( sl != NULL )
195     {
196         if ( ref_name != NULL )
197             string_copy_measure ( sl->ref_name, sizeof( sl->ref_name ), ref_name );
198         else
199             sl->ref_name[ 0 ] = 0;
200 
201         sl->ref_obj = ref_obj;
202     }
203 }
204 
205 
/* Writes the reverse complement of src[ 0 .. count-1 ] to dst[ 0 .. count-1 ].
 *
 * dst[ i ] receives the complement of src[ count - 1 - i ]. Letters that have
 * an entry in the table ( upper or lower case ) are written as upper case,
 * '.' and '0' .. '3' map to themselves, every other byte becomes 0.
 * Nothing is touched when count is 0. dst and src must not be the same
 * buffer, because src is read back to front while dst is filled front to back.
 */
static void copy_read_and_reverse_complement( uint8_t * dst, const uint8_t * const src, INSDC_coord_len const count )
{
    /* indexed by the input byte, 16 entries per row */
    static char const complement_of[] = {
        /* 0x00 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0x10 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0x20 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 , '.',  0 ,
        /* 0x30 */ '0', '1', '2', '3',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0x40 */  0 , 'T', 'V', 'G', 'H',  0 ,  0 , 'C', 'D',  0 ,  0 , 'M',  0 , 'K', 'N',  0 ,
        /* 0x50 */  0 ,  0 , 'Y', 'S', 'A', 'A', 'B', 'W',  0 , 'R',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0x60 */  0 , 'T', 'V', 'G', 'H',  0 ,  0 , 'C', 'D',  0 ,  0 , 'M',  0 , 'K', 'N',  0 ,
        /* 0x70 */  0 ,  0 , 'Y', 'S', 'A', 'A', 'B', 'W',  0 , 'R',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0x80 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0x90 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0xA0 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0xB0 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0xC0 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0xD0 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0xE0 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
        /* 0xF0 */  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,  0
    };

    INSDC_coord_len done;

    for ( done = 0; done < count; ++done )
    {
        /* walk the source from its last byte towards its first */
        dst[ done ] = complement_of[ src[ count - 1 - done ] ];
    }
}
250 
251 
/* number of reference bases shown on each side of the two edge bases */
#define PRE_POST_LEN 10

/* bases read per edge: PRE_POST_LEN before + 2 edge bases + PRE_POST_LEN after */
#define EDGE_LEN ( ( PRE_POST_LEN * 2 ) + 2 )
254 
255 
write_to_file(struct rna_splice_log * sl,const uint8_t * src,size_t len)256 static rc_t write_to_file( struct rna_splice_log * sl, const uint8_t * src, size_t len )
257 {
258     size_t num_writ;
259     rc_t rc = KFileWriteAll( sl->log_file, sl->log_file_pos, src, len, &num_writ );
260     if ( rc == 0 )
261         sl->log_file_pos += num_writ;
262     return rc;
263 }
264 
print_edge(struct rna_splice_log * sl,INSDC_coord_zero pos,bool const reverse_complement,bool const add_newline)265 static rc_t print_edge( struct rna_splice_log * sl,
266                         INSDC_coord_zero pos,
267                         bool const reverse_complement,
268                         bool const add_newline )
269 {
270     rc_t rc;
271     INSDC_coord_len from_ref_obj, to_read;
272     uint8_t buffer[ EDGE_LEN + 1 ];
273     INSDC_coord_zero rd_pos = 0;
274     uint32_t pre_len = PRE_POST_LEN;
275     uint32_t post_len = PRE_POST_LEN;
276 
277     if ( pos >= PRE_POST_LEN )
278         rd_pos = ( pos - PRE_POST_LEN ); /* in the rare case the delete is at the very beginning of the alignment */
279     else
280         pre_len = pos; /* rd_pos is still 0, what we want*/
281 
282     to_read = pre_len + post_len + 2;
283     rc = ReferenceObj_Read( sl->ref_obj, rd_pos, to_read, buffer, &from_ref_obj );
284     if ( rc == 0 )
285     {
286         uint8_t complement[ EDGE_LEN + 1 ];
287         uint8_t to_write[ EDGE_LEN + 5 ];
288         uint8_t * ref_bytes = buffer;
289 
290         if ( from_ref_obj < to_read )
291             post_len -= ( to_read - from_ref_obj );
292 
293         if ( reverse_complement )
294         {
295             copy_read_and_reverse_complement( complement, buffer, from_ref_obj );
296             ref_bytes = complement;
297         }
298         memmove( to_write, ref_bytes, pre_len );
299         to_write[ pre_len ] = '\t';
300         to_write[ pre_len + 1 ] = ref_bytes[ pre_len ];
301         to_write[ pre_len + 2 ] = ref_bytes[ pre_len + 1 ];
302         to_write[ pre_len + 3 ] = '\t';
303         memmove( &( to_write[ pre_len + 4 ] ), &( ref_bytes[ pre_len + 2 ] ), post_len );
304 
305         if ( add_newline )
306             to_write[ pre_len + post_len + 4 ] = '\n';
307         else
308             to_write[ pre_len + post_len + 4 ] = '\t';
309 
310         rc = write_to_file( sl, to_write, pre_len + post_len + 5 );
311     }
312     return rc;
313 }
314 
315 
/* Character written to the log for an intron type, indexed by the lowest two
   bits of the type. The note previously kept here listed the type values as
   INTRON_UNKNOWN 0, INTRON_FWD 1, INTRON_REV 2 - confirm against the header
   that defines them. Index 3 has no type of its own and is shown as 'u' too. */
static const char intron_type_to_ascii[] =
{
    'u',    /* 0 */
    '+',    /* 1 */
    '-',    /* 2 */
    'u'     /* 3 */
};
323 
on_dict_key_value(uint64_t key,uint64_t value,void * user_data)324 static rc_t CC on_dict_key_value( uint64_t key, uint64_t value, void * user_data )
325 {
326     rc_t rc = 0;
327     struct rna_splice_log * sl = ( struct rna_splice_log * )user_data;
328     if ( sl != NULL )
329     {
330         char tmp[ 512 ];
331         size_t num_writ;
332         union dict_key_union ku;
333         union dict_value_union vu;
334         char intron;
335         bool reverse_complement;
336 
337         ku.key = key;
338         vu.value = value;
339         intron = intron_type_to_ascii[ vu.entry.intron_type & 0x03 ];
340         reverse_complement = ( ( vu.entry.intron_type & 0x03 ) == INTRON_REV );
341 
342         rc = string_printf ( tmp, sizeof tmp, &num_writ,
343                              "%s\t%u\t%u\t%u\t%c\t",
344                              sl->ref_name, ku.key_struct.pos + 1, ku.key_struct.len, vu.entry.count, intron );
345         if ( rc == 0 )
346             rc = write_to_file( sl, ( uint8_t * )tmp, num_writ );
347 
348         if ( reverse_complement )
349         {
350             if ( rc == 0 )
351                 rc = print_edge( sl, ku.key_struct.pos + ku.key_struct.len - 2, true, false );
352             if ( rc == 0 )
353                 rc = print_edge( sl, ku.key_struct.pos, true, true );
354         }
355         else
356         {
357             if ( rc == 0 )
358                 rc = print_edge( sl, ku.key_struct.pos, false, false );
359             if ( rc == 0 )
360                 rc = print_edge( sl, ku.key_struct.pos + ku.key_struct.len - 2, false, true );
361         }
362     }
363     return rc;
364 }
365 
366 
rna_splice_log_exit_ref(struct rna_splice_log * sl,struct rna_splice_dict * dict)367 void rna_splice_log_exit_ref( struct rna_splice_log * sl, struct rna_splice_dict * dict )
368 {
369     if ( sl != NULL && dict != NULL )
370     {
371         KVectorVisitU64 ( dict->v, false, on_dict_key_value, sl );
372     }
373 }
374 
375