1 static char rcsid[] = "$Id: transcriptome.c 222806 2020-06-03 21:59:29Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 #ifndef HAVE_MEMCPY
6 # define memcpy(d,s,n) bcopy((s),(d),(n))
7 #endif
8 #ifndef HAVE_MEMMOVE
9 # define memmove(d,s,n) bcopy((s),(d),(n))
10 #endif
11 
12 #include "transcriptome.h"
13 
14 #include <stdio.h>
15 #include <stddef.h>
16 #include <string.h>		/* Needed for strlen and strcpy */
17 
18 #include "mem.h"
19 #include "access.h"
20 #include "list.h"
21 #include "genomicpos.h"
22 #include "bitpack64-readtwo.h"
23 
24 
25 #ifdef WORDS_BIGENDIAN
26 #include "bigendian.h"
27 #else
28 #include "littleendian.h"
29 #endif
30 
31 
32 
33 #define T Transcriptome_T
34 struct T {
35 
36   Chrnum_T *chrnums;
37 
38   Access_T offsetsmeta_access;
39   int offsetsmeta_shmid;
40   key_t offsetsmeta_key;
41   int offsetsmeta_fd;
42   size_t offsetsmeta_len;
43   UINT4 *offsetsmeta;
44 
45   Access_T offsetsstrm_access;
46   int offsetsstrm_shmid;
47   key_t offsetsstrm_key;
48   int offsetsstrm_fd;
49   size_t offsetsstrm_len;
50   UINT4 *offsetsstrm;
51 
52   Access_T exoninfo_access;
53   int exoninfo_shmid;
54   key_t exoninfo_key;
55   int exoninfo_fd;
56   size_t exoninfo_len;
57 
58   /* exoninfo has exonbounds (int) and exonstarts (unsigned int) interleaved */
59   unsigned int *exoninfo;
60 };
61 
62 
63 #define BUFFERSIZE 1024000
64 
65 void
Transcriptome_free(T * old)66 Transcriptome_free (T *old) {
67   if (*old) {
68 
69     if ((*old)->exoninfo_access == ALLOCATED_PRIVATE) {
70       FREE((*old)->exoninfo);
71     } else if ((*old)->exoninfo_access == ALLOCATED_SHARED) {
72       Access_deallocate((*old)->exoninfo,(*old)->exoninfo_shmid,(*old)->exoninfo_key);
73     } else {
74       abort();
75     }
76 
77     if ((*old)->offsetsstrm_access == ALLOCATED_PRIVATE) {
78       FREE((*old)->offsetsstrm);
79     } else if ((*old)->offsetsstrm_access == ALLOCATED_SHARED) {
80       Access_deallocate((*old)->offsetsstrm,(*old)->offsetsstrm_shmid,(*old)->offsetsstrm_key);
81     } else {
82       abort();
83     }
84 
85     if ((*old)->offsetsmeta_access == ALLOCATED_PRIVATE) {
86       FREE((*old)->offsetsmeta);
87     } else if ((*old)->offsetsmeta_access == ALLOCATED_SHARED) {
88       Access_deallocate((*old)->offsetsmeta,(*old)->offsetsmeta_shmid,(*old)->offsetsmeta_key);
89     } else {
90       abort();
91     }
92 
93     FREE((*old)->chrnums);
94     FREE(*old);
95   }
96   return;
97 }
98 
99 
100 T
Transcriptome_new(char * genomesubdir,char * genome_fileroot,char * transcriptome_fileroot,bool sharedp)101 Transcriptome_new (char *genomesubdir, char *genome_fileroot, char *transcriptome_fileroot,
102 		   bool sharedp) {
103   T new = (T) MALLOC(sizeof(*new));
104 
105   FILE *fp;
106   char *info_root, *filename;
107 
108   char *comma;
109   double seconds;
110   int ntranscripts, trnum;
111   int divint;
112 
113 
114   info_root = (char *) CALLOC(strlen(genomesubdir)+strlen("/")+
115 			      strlen(genome_fileroot)+strlen(".transcripts/")+strlen(transcriptome_fileroot)+1,
116 			      sizeof(char));
117   sprintf(info_root,"%s/%s.transcripts/%s",genomesubdir,genome_fileroot,transcriptome_fileroot);
118 
119   filename = (char *) CALLOC(strlen(info_root)+strlen(".chrnums")+1,sizeof(char));
120   sprintf(filename,"%s.chrnums",info_root);
121   ntranscripts = Access_filesize(filename)/sizeof(int);
122 
123   if ((fp = fopen(filename,"rb")) == NULL) {
124     fprintf(stderr,"Cannot open file %s\n",filename);
125     exit(9);
126   }
127   FREE(filename);
128 
129   new->chrnums = (Chrnum_T *) MALLOC((ntranscripts+1)*sizeof(Chrnum_T));
130   new->chrnums[0] = 0;
131   for (trnum = 1; trnum <= ntranscripts; trnum++) {
132     FREAD_INT(&divint,fp);
133     new->chrnums[trnum] = divint;
134   }
135   fclose(fp);
136 
137 
138   filename = (char *) CALLOC(strlen(info_root)+strlen(".offsets64meta")+1,sizeof(char));
139   sprintf(filename,"%s.offsets64meta",info_root);
140   if (sharedp == true) {
141     new->offsetsmeta = (UINT4 *) Access_allocate_shared(&new->offsetsmeta_access,&new->offsetsmeta_shmid,&new->offsetsmeta_key,
142 							&new->offsetsmeta_fd,&new->offsetsmeta_len,&seconds,
143 							filename,sizeof(UINT4));
144   } else {
145     new->offsetsmeta = (UINT4 *) Access_allocate_private(&new->offsetsmeta_access,&new->offsetsmeta_len,&seconds,
146 							filename,sizeof(UINT4));
147   }
148   comma = Genomicpos_commafmt(new->offsetsmeta_len);
149   fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
150   FREE(comma);
151   FREE(filename);
152 
153 
154 
155   filename = (char *) CALLOC(strlen(info_root)+strlen(".offsets64strm")+1,sizeof(char));
156   sprintf(filename,"%s.offsets64strm",info_root);
157   if (sharedp == true) {
158     new->offsetsstrm = (UINT4 *) Access_allocate_shared(&new->offsetsstrm_access,&new->offsetsstrm_shmid,&new->offsetsstrm_key,
159 							&new->offsetsstrm_fd,&new->offsetsstrm_len,&seconds,
160 							filename,sizeof(UINT4));
161   } else {
162     new->offsetsstrm = (UINT4 *) Access_allocate_private(&new->offsetsstrm_access,&new->offsetsstrm_len,&seconds,
163 							 filename,sizeof(UINT4));
164   }
165   comma = Genomicpos_commafmt(new->offsetsstrm_len);
166   fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
167   FREE(comma);
168   FREE(filename);
169 
170 
171 
172   /* Read exoninfo file */
173   filename = (char *) CALLOC(strlen(info_root)+strlen(".exoninfo")+1,sizeof(char));
174   sprintf(filename,"%s.exoninfo",info_root);
175   if (sharedp == true) {
176     new->exoninfo = (UINT4 *) Access_allocate_shared(&new->exoninfo_access,&new->exoninfo_shmid,&new->exoninfo_key,
177 						     &new->exoninfo_fd,&new->exoninfo_len,&seconds,
178 						     filename,sizeof(UINT4));
179   } else {
180     new->exoninfo = (UINT4 *) Access_allocate_private(&new->exoninfo_access,&new->exoninfo_len,&seconds,
181 						      filename,sizeof(UINT4));
182   }
183   comma = Genomicpos_commafmt(new->exoninfo_len);
184   fprintf(stderr,"done (%s bytes, %.2f sec)\n",comma,seconds);
185   FREE(comma);
186   FREE(filename);
187 
188   FREE(info_root);
189 
190   return new;
191 }
192 
193 
194 Chrnum_T
Transcriptome_chrnum(int * transcript_genestrand,T this,int trnum)195 Transcriptome_chrnum (int *transcript_genestrand, T this, int trnum) {
196   Chrnum_T chrnum;
197 
198   if ((chrnum = this->chrnums[trnum]) == 0) {
199     return 0;
200   } else if (chrnum > 0) {
201     *transcript_genestrand = +1;
202     return chrnum;
203   } else {
204     *transcript_genestrand = -1;
205     return -chrnum;
206   }
207 }
208 
209 
210 
211 /* Assumes that we have already checked that this->chrnums != 0 */
212 int
Transcriptome_exons(int ** exonbounds,Chrpos_T ** exonstarts,T this,int trnum)213 Transcriptome_exons (int **exonbounds, Chrpos_T **exonstarts, T this, int trnum) {
214   UINT4 offset0, offset1;
215   int nexons;
216 #if 0
217   int i;
218 #endif
219 
220   offset0 = Bitpack64_read_two(&offset1,(Oligospace_T) (trnum - 1),this->offsetsmeta,this->offsetsstrm);
221   nexons = (int) (offset1 - offset0);
222   *exonbounds = &(((int *) this->exoninfo)[2*offset0]);
223   *exonstarts = &(this->exoninfo[2*offset0 + nexons]);
224 
225 #if 0
226   printf("Transcript %d has %d exons:\n",trnum,nexons);
227   for (i = 0; i < nexons; i++) {
228     printf("transcript pos %d chrpos %u\n",(*exonbounds)[i],(*exonstarts)[i]);
229   }
230   printf("\n");
231 #endif
232 
233   return nexons;
234 }
235 
236 
237 bool
Transcriptome_genomic_bounded_p(int trnum,Chrnum_T chrbound,Chrpos_T lowbound,Chrpos_T highbound,T this)238 Transcriptome_genomic_bounded_p (int trnum, Chrnum_T chrbound, Chrpos_T lowbound, Chrpos_T highbound, T this) {
239   UINT4 offset0, offset1;
240   int nexons;
241   int *exonbounds;
242   Chrnum_T chrnum;
243   Chrpos_T *exonstarts;
244   Univcoord_T start, end;
245 
246   if ((chrnum = this->chrnums[trnum]) == 0) {
247     return false;
248   } else if (chrnum > 0) {
249     if (chrnum != chrbound) {
250       return false;
251     }
252   } else {
253     if (-chrnum != chrbound) {
254       return false;
255     }
256   }
257 
258   offset0 = Bitpack64_read_two(&offset1,(Oligospace_T) (trnum - 1),this->offsetsmeta,this->offsetsstrm);
259   nexons = (int) (offset1 - offset0);
260 
261   exonbounds = &(((int *) this->exoninfo)[2*offset0]);
262   exonstarts = &(this->exoninfo[2*offset0 + nexons]);
263 
264   start = exonstarts[0];
265   end = exonstarts[nexons - 1] + exonbounds[nexons - 1];
266   if (start < end) {
267     if (end < lowbound) {
268       return false;
269     } else if (start > highbound) {
270       return false;
271     } else {
272       return true;
273     }
274 
275   } else {
276     if (start < lowbound) {
277       return false;
278     } else if (end > highbound) {
279       return false;
280     } else {
281       return true;
282     }
283   }
284 }
285 
286