1 static char rcsid[] = "$Id: transcriptome.c 222806 2020-06-03 21:59:29Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11
12 #include "transcriptome.h"
13
14 #include <stdio.h>
15 #include <stddef.h>
16 #include <string.h> /* Needed for strlen and strcpy */
17
18 #include "mem.h"
19 #include "access.h"
20 #include "list.h"
21 #include "genomicpos.h"
22 #include "bitpack64-readtwo.h"
23
24
25 #ifdef WORDS_BIGENDIAN
26 #include "bigendian.h"
27 #else
28 #include "littleendian.h"
29 #endif
30
31
32
33 #define T Transcriptome_T
34 struct T {
35
36 Chrnum_T *chrnums;
37
38 Access_T offsetsmeta_access;
39 int offsetsmeta_shmid;
40 key_t offsetsmeta_key;
41 int offsetsmeta_fd;
42 size_t offsetsmeta_len;
43 UINT4 *offsetsmeta;
44
45 Access_T offsetsstrm_access;
46 int offsetsstrm_shmid;
47 key_t offsetsstrm_key;
48 int offsetsstrm_fd;
49 size_t offsetsstrm_len;
50 UINT4 *offsetsstrm;
51
52 Access_T exoninfo_access;
53 int exoninfo_shmid;
54 key_t exoninfo_key;
55 int exoninfo_fd;
56 size_t exoninfo_len;
57
58 /* exoninfo has exonbounds (int) and exonstarts (unsigned int) interleaved */
59 unsigned int *exoninfo;
60 };
61
62
63 #define BUFFERSIZE 1024000
64
65 void
Transcriptome_free(T * old)66 Transcriptome_free (T *old) {
67 if (*old) {
68
69 if ((*old)->exoninfo_access == ALLOCATED_PRIVATE) {
70 FREE((*old)->exoninfo);
71 } else if ((*old)->exoninfo_access == ALLOCATED_SHARED) {
72 Access_deallocate((*old)->exoninfo,(*old)->exoninfo_shmid,(*old)->exoninfo_key);
73 } else {
74 abort();
75 }
76
77 if ((*old)->offsetsstrm_access == ALLOCATED_PRIVATE) {
78 FREE((*old)->offsetsstrm);
79 } else if ((*old)->offsetsstrm_access == ALLOCATED_SHARED) {
80 Access_deallocate((*old)->offsetsstrm,(*old)->offsetsstrm_shmid,(*old)->offsetsstrm_key);
81 } else {
82 abort();
83 }
84
85 if ((*old)->offsetsmeta_access == ALLOCATED_PRIVATE) {
86 FREE((*old)->offsetsmeta);
87 } else if ((*old)->offsetsmeta_access == ALLOCATED_SHARED) {
88 Access_deallocate((*old)->offsetsmeta,(*old)->offsetsmeta_shmid,(*old)->offsetsmeta_key);
89 } else {
90 abort();
91 }
92
93 FREE((*old)->chrnums);
94 FREE(*old);
95 }
96 return;
97 }
98
99
100 T
Transcriptome_new(char * genomesubdir,char * genome_fileroot,char * transcriptome_fileroot,bool sharedp)101 Transcriptome_new (char *genomesubdir, char *genome_fileroot, char *transcriptome_fileroot,
102 bool sharedp) {
103 T new = (T) MALLOC(sizeof(*new));
104
105 FILE *fp;
106 char *info_root, *filename;
107
108 char *comma;
109 double seconds;
110 int ntranscripts, trnum;
111 int divint;
112
113
114 info_root = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
115 strlen(genome_fileroot)+strlen(".transcripts/")+strlen(transcriptome_fileroot)+1,
116 sizeof(char));
117 sprintf(info_root,"%s/%s.transcripts/%s",genomesubdir,genome_fileroot,transcriptome_fileroot);
118
119 filename = (char *) CALLOC(strlen(info_root)+strlen(".chrnums")+1,sizeof(char));
120 sprintf(filename,"%s.chrnums",info_root);
121 ntranscripts = Access_filesize(filename)/sizeof(int);
122
123 if ((fp = fopen(filename,"rb")) == NULL) {
124 fprintf(stderr,"Cannot open file %s\n",filename);
125 exit(9);
126 }
127 FREE(filename);
128
129 new->chrnums = (Chrnum_T *) MALLOC((ntranscripts+1)*sizeof(Chrnum_T));
130 new->chrnums[0] = 0;
131 for (trnum = 1; trnum <= ntranscripts; trnum++) {
132 FREAD_INT(&divint,fp);
133 new->chrnums[trnum] = divint;
134 }
135 fclose(fp);
136
137
138 filename = (char *) CALLOC(strlen(info_root)+strlen(".offsets64meta")+1,sizeof(char));
139 sprintf(filename,"%s.offsets64meta",info_root);
140 if (sharedp == true) {
141 new->offsetsmeta = (UINT4 *) Access_allocate_shared(&new->offsetsmeta_access,&new->offsetsmeta_shmid,&new->offsetsmeta_key,
142 &new->offsetsmeta_fd,&new->offsetsmeta_len,&seconds,
143 filename,sizeof(UINT4));
144 } else {
145 new->offsetsmeta = (UINT4 *) Access_allocate_private(&new->offsetsmeta_access,&new->offsetsmeta_len,&seconds,
146 filename,sizeof(UINT4));
147 }
148 comma = Genomicpos_commafmt(new->offsetsmeta_len);
149 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
150 FREE(comma);
151 FREE(filename);
152
153
154
155 filename = (char *) CALLOC(strlen(info_root)+strlen(".offsets64strm")+1,sizeof(char));
156 sprintf(filename,"%s.offsets64strm",info_root);
157 if (sharedp == true) {
158 new->offsetsstrm = (UINT4 *) Access_allocate_shared(&new->offsetsstrm_access,&new->offsetsstrm_shmid,&new->offsetsstrm_key,
159 &new->offsetsstrm_fd,&new->offsetsstrm_len,&seconds,
160 filename,sizeof(UINT4));
161 } else {
162 new->offsetsstrm = (UINT4 *) Access_allocate_private(&new->offsetsstrm_access,&new->offsetsstrm_len,&seconds,
163 filename,sizeof(UINT4));
164 }
165 comma = Genomicpos_commafmt(new->offsetsstrm_len);
166 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
167 FREE(comma);
168 FREE(filename);
169
170
171
172 /* Read exoninfo file */
173 filename = (char *) CALLOC(strlen(info_root)+strlen(".exoninfo")+1,sizeof(char));
174 sprintf(filename,"%s.exoninfo",info_root);
175 if (sharedp == true) {
176 new->exoninfo = (UINT4 *) Access_allocate_shared(&new->exoninfo_access,&new->exoninfo_shmid,&new->exoninfo_key,
177 &new->exoninfo_fd,&new->exoninfo_len,&seconds,
178 filename,sizeof(UINT4));
179 } else {
180 new->exoninfo = (UINT4 *) Access_allocate_private(&new->exoninfo_access,&new->exoninfo_len,&seconds,
181 filename,sizeof(UINT4));
182 }
183 comma = Genomicpos_commafmt(new->exoninfo_len);
184 fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
185 FREE(comma);
186 FREE(filename);
187
188 FREE(info_root);
189
190 return new;
191 }
192
193
194 Chrnum_T
Transcriptome_chrnum(int * transcript_genestrand,T this,int trnum)195 Transcriptome_chrnum (int *transcript_genestrand, T this, int trnum) {
196 Chrnum_T chrnum;
197
198 if ((chrnum = this->chrnums[trnum]) == 0) {
199 return 0;
200 } else if (chrnum > 0) {
201 *transcript_genestrand = +1;
202 return chrnum;
203 } else {
204 *transcript_genestrand = -1;
205 return -chrnum;
206 }
207 }
208
209
210
211 /* Assumes that we have already checked that this->chrnums != 0 */
212 int
Transcriptome_exons(int ** exonbounds,Chrpos_T ** exonstarts,T this,int trnum)213 Transcriptome_exons (int **exonbounds, Chrpos_T **exonstarts, T this, int trnum) {
214 UINT4 offset0, offset1;
215 int nexons;
216 #if 0
217 int i;
218 #endif
219
220 offset0 = Bitpack64_read_two(&offset1,(Oligospace_T) (trnum - 1),this->offsetsmeta,this->offsetsstrm);
221 nexons = (int) (offset1 - offset0);
222 *exonbounds = &(((int *) this->exoninfo)[2*offset0]);
223 *exonstarts = &(this->exoninfo[2*offset0 + nexons]);
224
225 #if 0
226 printf("Transcript %d has %d exons:\n",trnum,nexons);
227 for (i = 0; i < nexons; i++) {
228 printf("transcript pos %d chrpos %u\n",(*exonbounds)[i],(*exonstarts)[i]);
229 }
230 printf("\n");
231 #endif
232
233 return nexons;
234 }
235
236
237 bool
Transcriptome_genomic_bounded_p(int trnum,Chrnum_T chrbound,Chrpos_T lowbound,Chrpos_T highbound,T this)238 Transcriptome_genomic_bounded_p (int trnum, Chrnum_T chrbound, Chrpos_T lowbound, Chrpos_T highbound, T this) {
239 UINT4 offset0, offset1;
240 int nexons;
241 int *exonbounds;
242 Chrnum_T chrnum;
243 Chrpos_T *exonstarts;
244 Univcoord_T start, end;
245
246 if ((chrnum = this->chrnums[trnum]) == 0) {
247 return false;
248 } else if (chrnum > 0) {
249 if (chrnum != chrbound) {
250 return false;
251 }
252 } else {
253 if (-chrnum != chrbound) {
254 return false;
255 }
256 }
257
258 offset0 = Bitpack64_read_two(&offset1,(Oligospace_T) (trnum - 1),this->offsetsmeta,this->offsetsstrm);
259 nexons = (int) (offset1 - offset0);
260
261 exonbounds = &(((int *) this->exoninfo)[2*offset0]);
262 exonstarts = &(this->exoninfo[2*offset0 + nexons]);
263
264 start = exonstarts[0];
265 end = exonstarts[nexons - 1] + exonbounds[nexons - 1];
266 if (start < end) {
267 if (end < lowbound) {
268 return false;
269 } else if (start > highbound) {
270 return false;
271 } else {
272 return true;
273 }
274
275 } else {
276 if (start < lowbound) {
277 return false;
278 } else if (end > highbound) {
279 return false;
280 } else {
281 return true;
282 }
283 }
284 }
285
286