1 /************************************************************************/
2 /*! \file pdb.c
3
4 \brief Functions for parsing pdb files.
5
6 Pdb reader (parser). Loads arrays of pointers for easy backbone access.
7
8 \date Started 10/20/06
9 \author Kevin
10 \version $Id: pdb.c 10711 2011-08-31 22:23:04Z karypis $
11 */
12 /************************************************************************/
13 #include <GKlib.h>
14
15 /************************************************************************/
16 /*! \brief Converts three-letter amino acid codes to one-leter codes.
17
18 This function takes a three letter \c * and converts it to a single \c
19
20 \param res is the three-letter code to be converted.
21 \returns A \c representing the amino acid.
22 */
23 /************************************************************************/
gk_threetoone(char * res)24 char gk_threetoone(char *res) { /* {{{ */
25 /* make sure the matching works */
26 res[0] = toupper(res[0]);
27 res[1] = toupper(res[1]);
28 res[2] = toupper(res[2]);
29 if(strcmp(res,"ALA") == 0) {
30 return 'A';
31 }
32 else if(strcmp(res,"CYS") == 0) {
33 return 'C';
34 }
35 else if(strcmp(res,"ASP") == 0) {
36 return 'D';
37 }
38 else if(strcmp(res,"GLU") == 0) {
39 return 'E';
40 }
41 else if(strcmp(res,"PHE") == 0) {
42 return 'F';
43 }
44 else if(strcmp(res,"GLY") == 0) {
45 return 'G';
46 }
47 else if(strcmp(res,"HIS") == 0) {
48 return 'H';
49 }
50 else if(strcmp(res,"ILE") == 0) {
51 return 'I';
52 }
53 else if(strcmp(res,"LYS") == 0) {
54 return 'K';
55 }
56 else if(strcmp(res,"LEU") == 0) {
57 return 'L';
58 }
59 else if(strcmp(res,"MET") == 0) {
60 return 'M';
61 }
62 else if(strcmp(res,"ASN") == 0) {
63 return 'N';
64 }
65 else if(strcmp(res,"PRO") == 0) {
66 return 'P';
67 }
68 else if(strcmp(res,"GLN") == 0) {
69 return 'Q';
70 }
71 else if(strcmp(res,"ARG") == 0) {
72 return 'R';
73 }
74 else if(strcmp(res,"SER") == 0) {
75 return 'S';
76 }
77 else if(strcmp(res,"THR") == 0) {
78 return 'T';
79 }
80 else if(strcmp(res,"SCY") == 0) {
81 return 'U';
82 }
83 else if(strcmp(res,"VAL") == 0) {
84 return 'V';
85 }
86 else if(strcmp(res,"TRP") == 0) {
87 return 'W';
88 }
89 else if(strcmp(res,"TYR") == 0) {
90 return 'Y';
91 }
92 else {
93 return 'X';
94 }
95 } /* }}} */
96
97 /************************************************************************/
98 /*! \brief Frees the memory of a pdbf structure.
99
100 This function takes a pdbf pointer and frees all the memory below it.
101
102 \param p is the pdbf structure to be freed.
103 */
104 /************************************************************************/
gk_freepdbf(pdbf * p)105 void gk_freepdbf(pdbf *p) { /* {{{ */
106 int i;
107 if(p != NULL) {
108 gk_free((void **)&p->resSeq, LTERM);
109 for(i=0; i<p->natoms; i++) {
110 gk_free((void **)&p->atoms[i].name, &p->atoms[i].resname, LTERM);
111 }
112 for(i=0; i<p->nresidues; i++) {
113 gk_free((void *)&p->threeresSeq[i], LTERM);
114 }
115 /* this may look like it's wrong, but it's just a 1-d array of pointers, and
116 the pointers themselves are freed above */
117 gk_free((void **)&p->bbs, &p->cas, &p->atoms, &p->cm, &p->threeresSeq, LTERM);
118 }
119 gk_free((void **)&p, LTERM);
120 } /* }}} */
121
122 /************************************************************************/
123 /*! \brief Reads a pdb file into a pdbf structure
124
125 This function allocates a pdbf structure and reads the file fname into
126 that structure.
127
128 \param fname is the file name to be read
129 \returns A filled pdbf structure.
130 */
131 /************************************************************************/
gk_readpdbfile(char * fname)132 pdbf *gk_readpdbfile(char *fname) { /* {{{ */
133 int i=0, res=0;
134 char linetype[6];
135 int aserial;
136 char aname[5] = " \0";
137 char altLoc = ' ';
138 char rname[4] = " \0";
139 char chainid = ' ';
140 char oldchainid = ' ';
141 int rserial;
142 int oldRserial = -37;
143 char icode = ' ';
144 char element = ' ';
145 double x;
146 double y;
147 double z;
148 double avgx;
149 double avgy;
150 double avgz;
151 double opcy;
152 double tmpt;
153 char line[MAXLINELEN];
154 int corruption=0;
155 int nresatoms;
156
157 int atoms=0, residues=0, cas=0, bbs=0, firstres=1;
158 pdbf *toFill = gk_malloc(sizeof(pdbf),"fillme");
159 FILE *FPIN;
160
161 FPIN = gk_fopen(fname,"r",fname);
162 while(fgets(line, 256, FPIN)) {
163 sscanf(line,"%s ",linetype);
164 /* It seems the only reliable parts are through temperature, so we only use these parts */
165 /* if(strstr(linetype, "ATOM") != NULL || strstr(linetype, "HETATM") != NULL) { */
166 if(strstr(linetype, "ATOM") != NULL) {
167 sscanf(line, "%6s%5d%*1c%4c%1c%3c%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf %c\n",
168 linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,&element);
169 sscanf(linetype, " %s ",linetype);
170 sscanf(aname, " %s ",aname);
171 sscanf(rname, " %s ",rname);
172 if(altLoc != ' ') {
173 corruption = corruption|CRP_ALTLOCS;
174 }
175
176 if(firstres == 1) {
177 oldRserial = rserial;
178 oldchainid = chainid;
179 residues++;
180 firstres = 0;
181 }
182 if(oldRserial != rserial) {
183 residues++;
184 oldRserial = rserial;
185 }
186 if(oldchainid != chainid) {
187 corruption = corruption|CRP_MULTICHAIN;
188 }
189 oldchainid = chainid;
190 atoms++;
191 if(strcmp(aname,"CA") == 0) {
192 cas++;
193 }
194 if(strcmp(aname,"N") == 0 || strcmp(aname,"CA") == 0 ||
195 strcmp(aname,"C") == 0 || strcmp(aname,"O") == 0) {
196 bbs++;
197 }
198 }
199 else if(strstr(linetype, "ENDMDL") != NULL || strstr(linetype, "END") != NULL || strstr(linetype, "TER") != NULL) {
200 break;
201 }
202 }
203 fclose(FPIN);
204
205 /* printf("File has coordinates for %d atoms in %d residues\n",atoms,residues); */
206 toFill->natoms = atoms;
207 toFill->ncas = cas;
208 toFill->nbbs = bbs;
209 toFill->nresidues = residues;
210 toFill->resSeq = (char *) gk_malloc (residues*sizeof(char),"residue seq");
211 toFill->threeresSeq = (char **)gk_malloc (residues*sizeof(char *),"residue seq");
212 toFill->atoms = (atom *) gk_malloc (atoms*sizeof(atom), "atoms");
213 toFill->bbs = (atom **)gk_malloc ( bbs*sizeof(atom *),"bbs");
214 toFill->cas = (atom **)gk_malloc ( cas*sizeof(atom *),"cas");
215 toFill->cm = (center_of_mass *)gk_malloc(residues*sizeof(center_of_mass),"center of mass");
216 res=0; firstres=1; cas=0; bbs=0; i=0;
217 avgx = 0.0; avgy = 0.0; avgz = 0.0;
218 nresatoms = 0;
219
220 FPIN = gk_fopen(fname,"r",fname);
221 while(fgets(line, 256, FPIN)) {
222 sscanf(line,"%s ",linetype);
223 /* It seems the only reliable parts are through temperature, so we only use these parts */
224 /* if(strstr(linetype, "ATOM") != NULL || strstr(linetype, "HETATM") != NULL) { */
225 if(strstr(linetype, "ATOM") != NULL ) {
226
227 /* to ensure our memory doesn't get corrupted by the biologists, we only read this far */
228 sscanf(line, "%6s%5d%*1c%4c%1c%3c%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf %c\n",
229 linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,&element);
230 sscanf(aname, "%s",aname);
231 sscanf(rname, "%s",rname);
232
233 if(firstres == 1) {
234 toFill->resSeq[res] = gk_threetoone(rname);
235 toFill->threeresSeq[res] = gk_strdup(rname);
236 oldRserial = rserial;
237 res++;
238 firstres = 0;
239 }
240 if(oldRserial != rserial) {
241 /* we're changing residues. store the center of mass from the last one & reset */
242 toFill->cm[res-1].x = avgx/nresatoms;
243 toFill->cm[res-1].y = avgy/nresatoms;
244 toFill->cm[res-1].z = avgz/nresatoms;
245 avgx = 0.0; avgy = 0.0; avgz = 0.0;
246 nresatoms = 0;
247 toFill->cm[res-1].name = toFill->resSeq[res-1];
248
249 toFill->threeresSeq[res] = gk_strdup(rname);
250 toFill->resSeq[res] = gk_threetoone(rname);
251 res++;
252 oldRserial = rserial;
253 }
254 avgx += x;
255 avgy += y;
256 avgz += z;
257 nresatoms++;
258
259 toFill->atoms[i].x = x;
260 toFill->atoms[i].y = y;
261 toFill->atoms[i].z = z;
262 toFill->atoms[i].opcy = opcy;
263 toFill->atoms[i].tmpt = tmpt;
264 toFill->atoms[i].element = element;
265 toFill->atoms[i].serial = aserial;
266 toFill->atoms[i].chainid = chainid;
267 toFill->atoms[i].altLoc = altLoc;
268 toFill->atoms[i].rserial = rserial;
269 toFill->atoms[i].icode = icode;
270 toFill->atoms[i].name = gk_strdup(aname);
271 toFill->atoms[i].resname = gk_strdup(rname);
272 /* Set up pointers for the backbone and c-alpha shortcuts */
273 if(strcmp(aname,"CA") == 0) {
274 toFill->cas[cas] = &(toFill->atoms[i]);
275 cas++;
276 }
277 if(strcmp(aname,"N") == 0 || strcmp(aname,"CA") == 0 || strcmp(aname,"C") == 0 || strcmp(aname,"O") == 0) {
278 toFill->bbs[bbs] = &(toFill->atoms[i]);
279 bbs++;
280 }
281 i++;
282 }
283 else if(strstr(linetype, "ENDMDL") != NULL || strstr(linetype, "END") != NULL || strstr(linetype, "TER") != NULL) {
284 break;
285 }
286 }
287 /* get that last average */
288 toFill->cm[res-1].x = avgx/nresatoms;
289 toFill->cm[res-1].y = avgy/nresatoms;
290 toFill->cm[res-1].z = avgz/nresatoms;
291 /* Begin test code */
292 if(cas != residues) {
293 printf("Number of residues and CA coordinates differs by %d (!)\n",residues-cas);
294 if(cas < residues) {
295 corruption = corruption|CRP_MISSINGCA;
296 }
297 else if(cas > residues) {
298 corruption = corruption|CRP_MULTICA;
299 }
300 }
301 if(bbs < residues*4) {
302 corruption = corruption|CRP_MISSINGBB;
303 }
304 else if(bbs > residues*4) {
305 corruption = corruption|CRP_MULTIBB;
306 }
307 fclose(FPIN);
308 toFill->corruption = corruption;
309 /* if(corruption == 0)
310 printf("File was clean!\n"); */
311 return(toFill);
312 } /* }}} */
313
314 /************************************************************************/
315 /*! \brief Writes the sequence of residues from a pdb file.
316
317 This function takes a pdbf structure and a filename, and writes out
318 the amino acid sequence according to the atomic coordinates. The output
319 is in fasta format.
320
321
322 \param p is the pdbf structure with the sequence of interest
323 \param fname is the file name to be written
324 */
325 /************************************************************************/
gk_writefastafrompdb(pdbf * pb,char * fname)326 void gk_writefastafrompdb(pdbf *pb, char *fname) {
327 int i;
328 FILE *FPOUT;
329
330 FPOUT = gk_fopen(fname,"w",fname);
331 fprintf(FPOUT,"> %s\n",fname);
332
333 for(i=0; i<pb->nresidues; i++)
334 fprintf(FPOUT,"%c",pb->resSeq[i]);
335
336 fprintf(FPOUT,"\n");
337 fclose(FPOUT);
338 }
339
340 /************************************************************************/
341 /*! \brief Writes all centers of mass in pdb-format to file fname.
342
343 This function takes a pdbf structure and writes out the calculated
344 mass center information to file fname as though each one was a c-alpha.
345
346 \param p is the pdbf structure to write out
347 \param fname is the file name to be written
348 */
349 /************************************************************************/
gk_writecentersofmass(pdbf * p,char * fname)350 void gk_writecentersofmass(pdbf *p, char *fname) {
351 int i;
352 FILE *FPIN;
353 FPIN = gk_fopen(fname,"w",fname);
354 for(i=0; i<p->nresidues; i++) {
355 fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
356 "ATOM ",i,"CA",' ',p->threeresSeq[i],' ',i,' ',p->cm[i].x,p->cm[i].y,p->cm[i].z,1.0,-37.0);
357 }
358 fclose(FPIN);
359 }
360
361 /************************************************************************/
362 /*! \brief Writes all atoms in p in pdb-format to file fname.
363
364 This function takes a pdbf structure and writes out all the atom
365 information to file fname.
366
367 \param p is the pdbf structure to write out
368 \param fname is the file name to be written
369 */
370 /************************************************************************/
gk_writefullatom(pdbf * p,char * fname)371 void gk_writefullatom(pdbf *p, char *fname) {
372 int i;
373 FILE *FPIN;
374 FPIN = gk_fopen(fname,"w",fname);
375 for(i=0; i<p->natoms; i++) {
376 fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
377 "ATOM ",p->atoms[i].serial,p->atoms[i].name,p->atoms[i].altLoc,p->atoms[i].resname,p->atoms[i].chainid,p->atoms[i].rserial,p->atoms[i].icode,p->atoms[i].x,p->atoms[i].y,p->atoms[i].z,p->atoms[i].opcy,p->atoms[i].tmpt);
378 }
379 fclose(FPIN);
380 }
381
382 /************************************************************************/
383 /*! \brief Writes out all the backbone atoms of a structure in pdb format
384
385 This function takes a pdbf structure p and writes only the backbone atoms
386 to a filename fname.
387
388 \param p is the pdb structure to write out.
389 \param fname is the file name to be written.
390 */
391 /************************************************************************/
gk_writebackbone(pdbf * p,char * fname)392 void gk_writebackbone(pdbf *p, char *fname) {
393 int i;
394 FILE *FPIN;
395 FPIN = gk_fopen(fname,"w",fname);
396 for(i=0; i<p->nbbs; i++) {
397 fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
398 "ATOM ",p->bbs[i]->serial,p->bbs[i]->name,p->bbs[i]->altLoc,p->bbs[i]->resname,p->bbs[i]->chainid,p->bbs[i]->rserial,p->bbs[i]->icode,p->bbs[i]->x,p->bbs[i]->y,p->bbs[i]->z,p->bbs[i]->opcy,p->bbs[i]->tmpt);
399 }
400 fclose(FPIN);
401 }
402
403 /************************************************************************/
404 /*! \brief Writes out all the alpha carbon atoms of a structure
405
406 This function takes a pdbf structure p and writes only the alpha carbon
407 atoms to a filename fname.
408
409 \param p is the pdb structure to write out.
410 \param fname is the file name to be written.
411 */
412 /************************************************************************/
gk_writealphacarbons(pdbf * p,char * fname)413 void gk_writealphacarbons(pdbf *p, char *fname) {
414 int i;
415 FILE *FPIN;
416 FPIN = gk_fopen(fname,"w",fname);
417 for(i=0; i<p->ncas; i++) {
418 fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
419 "ATOM ",p->cas[i]->serial,p->cas[i]->name,p->cas[i]->altLoc,p->cas[i]->resname,p->cas[i]->chainid,p->cas[i]->rserial,p->cas[i]->icode,p->cas[i]->x,p->cas[i]->y,p->cas[i]->z,p->cas[i]->opcy,p->cas[i]->tmpt);
420 }
421 fclose(FPIN);
422 }
423
424 /************************************************************************/
425 /*! \brief Decodes the corruption bitswitch and prints any problems
426
427 Due to the totally unreliable nature of the pdb format, reading a pdb
428 file stores a corruption bitswitch, and this function decodes that switch
429 and prints the result on stdout.
430
431 \param p is the pdb structure to write out.
432 \param fname is the file name to be written.
433 */
434 /************************************************************************/
gk_showcorruption(pdbf * p)435 void gk_showcorruption(pdbf *p) {
436 int corruption = p->corruption;
437 if(corruption&CRP_ALTLOCS)
438 printf("Multiple coordinate sets for at least one atom\n");
439 if(corruption&CRP_MISSINGCA)
440 printf("Missing coordiantes for at least one CA atom\n");
441 if(corruption&CRP_MISSINGBB)
442 printf("Missing coordiantes for at least one backbone atom (N,CA,C,O)\n");
443 if(corruption&CRP_MULTICHAIN)
444 printf("File contains coordinates for multiple chains\n");
445 if(corruption&CRP_MULTICA)
446 printf("Multiple CA atoms found for the same residue (could be alternate locators)\n");
447 if(corruption&CRP_MULTICA)
448 printf("Multiple copies of backbone atoms found for the same residue (could be alternate locators)\n");
449 }
450 /* sscanf(line, "%6s%5d%*1c%4s%1c%3s%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf%*6c%4s%2s%2s\n",
451 linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,segId,element,charge);
452 printf(".%s.%s.%s.\n",segId,element,charge);
453 printf("%-6s%5d%-1s%-4s%1c%3s%1s%1c%4d%1c%3s%8.3lf%8.3lf%8.3lf%6.2f%6.2f%6s%4s%2s%2s\n",
454 linetype,aserial," ",aname,altLoc,rname," ",chainid,rserial,icode," ",x,y,z,opcy,tmpt," ",segId,element,charge); */
455
456 /* and we could probably get away with this using astral files, */
457 /* sscanf(line, "%6s%5d%*1c%4s%1c%3s%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf%*6c%6s\n",
458 linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,element);
459 printf("%-6s%5d%-1s%-4s%1c%3s%1s%1c%4d%1c%3s%8.3lf%8.3lf%8.3lf%6.2f%6.2f%6s%6s\n",
460 linetype,aserial," ",aname,altLoc,rname," ",chainid,rserial,icode," ",x,y,z,opcy,tmpt," ",element); */
461