1 /************************************************************************/
2 /*! \file pdb.c
3 
4 \brief Functions for parsing pdb files.
5 
6 Pdb reader (parser).  Loads arrays of pointers for easy backbone access.
7 
8 \date Started 10/20/06
9 \author Kevin
10 \version $Id: pdb.c 10711 2011-08-31 22:23:04Z karypis $
11 */
12 /************************************************************************/
13 #include <GKlib.h>
14 
15 /************************************************************************/
16 /*! \brief Converts three-letter amino acid codes to one-leter codes.
17 
18 This function takes a three letter \c * and converts it to a single \c
19 
20 \param res is the three-letter code to be converted.
21 \returns A \c representing the amino acid.
22 */
23 /************************************************************************/
gk_threetoone(char * res)24 char gk_threetoone(char *res) { /* {{{ */
25 	/* make sure the matching works */
26 	res[0] = toupper(res[0]);
27 	res[1] = toupper(res[1]);
28 	res[2] = toupper(res[2]);
29 	if(strcmp(res,"ALA") == 0) {
30 		return 'A';
31 	}
32 	else if(strcmp(res,"CYS") == 0) {
33 		return 'C';
34 	}
35 	else if(strcmp(res,"ASP") == 0) {
36 		return 'D';
37 	}
38 	else if(strcmp(res,"GLU") == 0) {
39 		return 'E';
40 	}
41 	else if(strcmp(res,"PHE") == 0) {
42 		return 'F';
43 	}
44 	else if(strcmp(res,"GLY") == 0) {
45 		return 'G';
46 	}
47 	else if(strcmp(res,"HIS") == 0) {
48 		return 'H';
49 	}
50 	else if(strcmp(res,"ILE") == 0) {
51 		return 'I';
52 	}
53 	else if(strcmp(res,"LYS") == 0) {
54 		return 'K';
55 	}
56 	else if(strcmp(res,"LEU") == 0) {
57 		return 'L';
58 	}
59 	else if(strcmp(res,"MET") == 0) {
60 		return 'M';
61 	}
62 	else if(strcmp(res,"ASN") == 0) {
63 		return 'N';
64 	}
65 	else if(strcmp(res,"PRO") == 0) {
66 		return 'P';
67 	}
68 	else if(strcmp(res,"GLN") == 0) {
69 		return 'Q';
70 	}
71 	else if(strcmp(res,"ARG") == 0) {
72 		return 'R';
73 	}
74 	else if(strcmp(res,"SER") == 0) {
75 		return 'S';
76 	}
77 	else if(strcmp(res,"THR") == 0) {
78 		return 'T';
79 	}
80 	else if(strcmp(res,"SCY") == 0) {
81 		return 'U';
82 	}
83 	else if(strcmp(res,"VAL") == 0) {
84 		return 'V';
85 	}
86 	else if(strcmp(res,"TRP") == 0) {
87 		return 'W';
88 	}
89 	else if(strcmp(res,"TYR") == 0) {
90 		return 'Y';
91 	}
92 	else  {
93 		return 'X';
94 	}
95 } /* }}} */
96 
97 /************************************************************************/
98 /*! \brief Frees the memory of a pdbf structure.
99 
100 This function takes a pdbf pointer and frees all the memory below it.
101 
102 \param p is the pdbf structure to be freed.
103 */
104 /************************************************************************/
gk_freepdbf(pdbf * p)105 void gk_freepdbf(pdbf *p) { /* {{{ */
106 	int i;
107 	if(p != NULL) {
108 		gk_free((void **)&p->resSeq, LTERM);
109 		for(i=0; i<p->natoms; i++) {
110 			gk_free((void **)&p->atoms[i].name, &p->atoms[i].resname, LTERM);
111     }
112 		for(i=0; i<p->nresidues; i++) {
113       gk_free((void *)&p->threeresSeq[i], LTERM);
114     }
115 		/* this may look like it's wrong, but it's just a 1-d array of pointers, and
116 			 the pointers themselves are freed above */
117 	  gk_free((void **)&p->bbs, &p->cas, &p->atoms, &p->cm, &p->threeresSeq, LTERM);
118 	}
119 	gk_free((void **)&p, LTERM);
120 } /* }}} */
121 
122 /************************************************************************/
123 /*! \brief Reads a pdb file into a pdbf structure
124 
125 This function allocates a pdbf structure and reads the file fname into
126 that structure.
127 
128 \param fname is the file name to be read
129 \returns A filled pdbf structure.
130 */
131 /************************************************************************/
gk_readpdbfile(char * fname)132 pdbf *gk_readpdbfile(char *fname) { /* {{{ */
133 	int i=0, res=0;
134 	char linetype[6];
135 	int  aserial;
136 	char aname[5] = "    \0";
137 	char altLoc   = ' ';
138 	char rname[4] = "   \0";
139 	char chainid  = ' ';
140 	char oldchainid  = ' ';
141 	int  rserial;
142 	int  oldRserial = -37;
143 	char icode    = ' ';
144 	char element  = ' ';
145 	double x;
146 	double y;
147 	double z;
148 	double avgx;
149 	double avgy;
150 	double avgz;
151 	double opcy;
152 	double tmpt;
153 	char line[MAXLINELEN];
154 	int corruption=0;
155   int nresatoms;
156 
157 	int atoms=0, residues=0, cas=0, bbs=0, firstres=1;
158 	pdbf *toFill = gk_malloc(sizeof(pdbf),"fillme");
159 	FILE *FPIN;
160 
161 	FPIN = gk_fopen(fname,"r",fname);
162 	while(fgets(line, 256, FPIN))	{
163 		sscanf(line,"%s ",linetype);
164 		/* It seems the only reliable parts are through temperature, so we only use these parts */
165 		/* if(strstr(linetype, "ATOM") != NULL || strstr(linetype, "HETATM") != NULL) { */
166 		if(strstr(linetype, "ATOM") != NULL) {
167 			sscanf(line, "%6s%5d%*1c%4c%1c%3c%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf %c\n",
168 			linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,&element);
169 			sscanf(linetype, " %s ",linetype);
170 			sscanf(aname, " %s ",aname);
171 			sscanf(rname, " %s ",rname);
172 			if(altLoc != ' ') {
173 				corruption = corruption|CRP_ALTLOCS;
174 			}
175 
176 			if(firstres == 1) {
177 				oldRserial = rserial;
178 				oldchainid = chainid;
179 				residues++;
180 				firstres = 0;
181 			}
182 			if(oldRserial != rserial) {
183 				residues++;
184 				oldRserial = rserial;
185 			}
186 			if(oldchainid != chainid) {
187 				corruption = corruption|CRP_MULTICHAIN;
188 			}
189 			oldchainid = chainid;
190 			atoms++;
191 		  if(strcmp(aname,"CA") == 0) {
192 				cas++;
193 			}
194 			if(strcmp(aname,"N") == 0 || strcmp(aname,"CA") == 0 ||
195          strcmp(aname,"C") == 0 || strcmp(aname,"O") == 0) {
196 				bbs++;
197 			}
198 		}
199 		else if(strstr(linetype, "ENDMDL") != NULL || strstr(linetype, "END") != NULL || strstr(linetype, "TER") != NULL) {
200 			break;
201 		}
202 	}
203 	fclose(FPIN);
204 
205 	/* printf("File has coordinates for %d atoms in %d residues\n",atoms,residues); */
206 	toFill->natoms      = atoms;
207 	toFill->ncas        = cas;
208 	toFill->nbbs        = bbs;
209 	toFill->nresidues   = residues;
210 	toFill->resSeq      = (char *) gk_malloc (residues*sizeof(char),"residue seq");
211 	toFill->threeresSeq = (char **)gk_malloc (residues*sizeof(char *),"residue seq");
212 	toFill->atoms       = (atom *) gk_malloc (atoms*sizeof(atom),  "atoms");
213 	toFill->bbs         = (atom **)gk_malloc (  bbs*sizeof(atom *),"bbs");
214 	toFill->cas         = (atom **)gk_malloc (  cas*sizeof(atom *),"cas");
215 	toFill->cm          = (center_of_mass *)gk_malloc(residues*sizeof(center_of_mass),"center of mass");
216 	res=0; firstres=1; cas=0; bbs=0; i=0;
217   avgx = 0.0; avgy = 0.0; avgz = 0.0;
218   nresatoms = 0;
219 
220 	FPIN = gk_fopen(fname,"r",fname);
221 	while(fgets(line, 256, FPIN))	{
222 		sscanf(line,"%s ",linetype);
223 		/* It seems the only reliable parts are through temperature, so we only use these parts */
224 		/* if(strstr(linetype, "ATOM") != NULL || strstr(linetype, "HETATM") != NULL) { */
225 		if(strstr(linetype, "ATOM") != NULL ) {
226 
227 			/* to ensure our memory doesn't get corrupted by the biologists, we only read this far */
228 			sscanf(line, "%6s%5d%*1c%4c%1c%3c%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf %c\n",
229 			linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,&element);
230 			sscanf(aname, "%s",aname);
231 			sscanf(rname, "%s",rname);
232 
233 			if(firstres == 1) {
234 				toFill->resSeq[res] = gk_threetoone(rname);
235 			  toFill->threeresSeq[res] = gk_strdup(rname);
236 				oldRserial = rserial;
237 				res++;
238 				firstres = 0;
239 			}
240 			if(oldRserial != rserial) {
241         /* we're changing residues. store the center of mass from the last one & reset */
242         toFill->cm[res-1].x = avgx/nresatoms;
243         toFill->cm[res-1].y = avgy/nresatoms;
244         toFill->cm[res-1].z = avgz/nresatoms;
245 	      avgx = 0.0; avgy = 0.0; avgz = 0.0;
246         nresatoms = 0;
247         toFill->cm[res-1].name = toFill->resSeq[res-1];
248 
249 			  toFill->threeresSeq[res] = gk_strdup(rname);
250 				toFill->resSeq[res] = gk_threetoone(rname);
251 				res++;
252 				oldRserial = rserial;
253 			}
254       avgx += x;
255       avgy += y;
256       avgz += z;
257       nresatoms++;
258 
259 			toFill->atoms[i].x       = x;
260 			toFill->atoms[i].y       = y;
261 			toFill->atoms[i].z       = z;
262 			toFill->atoms[i].opcy    = opcy;
263 			toFill->atoms[i].tmpt    = tmpt;
264 			toFill->atoms[i].element = element;
265 			toFill->atoms[i].serial  = aserial;
266 			toFill->atoms[i].chainid = chainid;
267 			toFill->atoms[i].altLoc  = altLoc;
268 			toFill->atoms[i].rserial = rserial;
269 			toFill->atoms[i].icode   = icode;
270 			toFill->atoms[i].name    = gk_strdup(aname);
271 			toFill->atoms[i].resname = gk_strdup(rname);
272 			/* Set up pointers for the backbone and c-alpha shortcuts */
273 			 if(strcmp(aname,"CA") == 0) {
274 				toFill->cas[cas] = &(toFill->atoms[i]);
275 				cas++;
276 			}
277 			if(strcmp(aname,"N") == 0 || strcmp(aname,"CA") == 0 || strcmp(aname,"C") == 0 || strcmp(aname,"O") == 0) {
278 				toFill->bbs[bbs] = &(toFill->atoms[i]);
279 				bbs++;
280 			}
281 			i++;
282 		}
283 		else if(strstr(linetype, "ENDMDL") != NULL || strstr(linetype, "END") != NULL || strstr(linetype, "TER") != NULL) {
284 			break;
285 		}
286 	}
287   /* get that last average */
288   toFill->cm[res-1].x = avgx/nresatoms;
289   toFill->cm[res-1].y = avgy/nresatoms;
290   toFill->cm[res-1].z = avgz/nresatoms;
291 	/* Begin test code */
292 	if(cas != residues) {
293 		printf("Number of residues and CA coordinates differs by %d (!)\n",residues-cas);
294 		if(cas < residues) {
295 			corruption = corruption|CRP_MISSINGCA;
296 		}
297 		else if(cas > residues) {
298 			corruption = corruption|CRP_MULTICA;
299 		}
300 	}
301 	if(bbs < residues*4) {
302 		corruption = corruption|CRP_MISSINGBB;
303 	}
304 	else if(bbs > residues*4) {
305 		corruption = corruption|CRP_MULTIBB;
306 	}
307 	fclose(FPIN);
308 	toFill->corruption = corruption;
309 	/* if(corruption == 0)
310 		printf("File was clean!\n"); */
311 	return(toFill);
312 } /* }}} */
313 
314 /************************************************************************/
315 /*! \brief Writes the sequence of residues from a pdb file.
316 
317 This function takes a pdbf structure and a filename, and writes out
318 the amino acid sequence according to the atomic coordinates.  The output
319 is in fasta format.
320 
321 
322 \param p is the pdbf structure with the sequence of interest
323 \param fname is the file name to be written
324 */
325 /************************************************************************/
gk_writefastafrompdb(pdbf * pb,char * fname)326 void gk_writefastafrompdb(pdbf *pb, char *fname) {
327   int i;
328   FILE *FPOUT;
329 
330   FPOUT = gk_fopen(fname,"w",fname);
331   fprintf(FPOUT,"> %s\n",fname);
332 
333   for(i=0; i<pb->nresidues; i++)
334     fprintf(FPOUT,"%c",pb->resSeq[i]);
335 
336   fprintf(FPOUT,"\n");
337   fclose(FPOUT);
338 }
339 
340 /************************************************************************/
341 /*! \brief Writes all centers of mass in pdb-format to file fname.
342 
343 This function takes a pdbf structure and writes out the calculated
344 mass center information to file fname as though each one was a c-alpha.
345 
346 \param p is the pdbf structure to write out
347 \param fname is the file name to be written
348 */
349 /************************************************************************/
gk_writecentersofmass(pdbf * p,char * fname)350 void gk_writecentersofmass(pdbf *p, char *fname) {
351 	int i;
352 	FILE *FPIN;
353 	FPIN = gk_fopen(fname,"w",fname);
354 	for(i=0; i<p->nresidues; i++) {
355 		 fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c   %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
356 		"ATOM  ",i,"CA",' ',p->threeresSeq[i],' ',i,' ',p->cm[i].x,p->cm[i].y,p->cm[i].z,1.0,-37.0);
357 	}
358 	fclose(FPIN);
359 }
360 
361 /************************************************************************/
362 /*! \brief Writes all atoms in p in pdb-format to file fname.
363 
364 This function takes a pdbf structure and writes out all the atom
365 information to file fname.
366 
367 \param p is the pdbf structure to write out
368 \param fname is the file name to be written
369 */
370 /************************************************************************/
gk_writefullatom(pdbf * p,char * fname)371 void gk_writefullatom(pdbf *p, char *fname) {
372 	int i;
373 	FILE *FPIN;
374 	FPIN = gk_fopen(fname,"w",fname);
375 	for(i=0; i<p->natoms; i++) {
376 		 fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c   %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
377 		"ATOM  ",p->atoms[i].serial,p->atoms[i].name,p->atoms[i].altLoc,p->atoms[i].resname,p->atoms[i].chainid,p->atoms[i].rserial,p->atoms[i].icode,p->atoms[i].x,p->atoms[i].y,p->atoms[i].z,p->atoms[i].opcy,p->atoms[i].tmpt);
378 	}
379 	fclose(FPIN);
380 }
381 
382 /************************************************************************/
383 /*! \brief Writes out all the backbone atoms of a structure in pdb format
384 
385 This function takes a pdbf structure p and writes only the backbone atoms
386 to a filename fname.
387 
388 \param p is the pdb structure to write out.
389 \param fname is the file name to be written.
390 */
391 /************************************************************************/
gk_writebackbone(pdbf * p,char * fname)392 void gk_writebackbone(pdbf *p, char *fname) {
393 	int i;
394 	FILE *FPIN;
395 	FPIN = gk_fopen(fname,"w",fname);
396 	for(i=0; i<p->nbbs; i++) {
397 		 fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c   %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
398 		"ATOM  ",p->bbs[i]->serial,p->bbs[i]->name,p->bbs[i]->altLoc,p->bbs[i]->resname,p->bbs[i]->chainid,p->bbs[i]->rserial,p->bbs[i]->icode,p->bbs[i]->x,p->bbs[i]->y,p->bbs[i]->z,p->bbs[i]->opcy,p->bbs[i]->tmpt);
399 	}
400 	fclose(FPIN);
401 }
402 
403 /************************************************************************/
404 /*! \brief Writes out all the alpha carbon atoms of a structure
405 
406 This function takes a pdbf structure p and writes only the alpha carbon
407 atoms to a filename fname.
408 
409 \param p is the pdb structure to write out.
410 \param fname is the file name to be written.
411 */
412 /************************************************************************/
gk_writealphacarbons(pdbf * p,char * fname)413 void gk_writealphacarbons(pdbf *p, char *fname) {
414 	int i;
415 	FILE *FPIN;
416 	FPIN = gk_fopen(fname,"w",fname);
417 	for(i=0; i<p->ncas; i++) {
418 		 fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c   %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
419 		"ATOM  ",p->cas[i]->serial,p->cas[i]->name,p->cas[i]->altLoc,p->cas[i]->resname,p->cas[i]->chainid,p->cas[i]->rserial,p->cas[i]->icode,p->cas[i]->x,p->cas[i]->y,p->cas[i]->z,p->cas[i]->opcy,p->cas[i]->tmpt);
420 	}
421 	fclose(FPIN);
422 }
423 
424 /************************************************************************/
425 /*! \brief Decodes the corruption bitswitch and prints any problems
426 
427 Due to the totally unreliable nature of the pdb format, reading a pdb
428 file stores a corruption bitswitch, and this function decodes that switch
429 and prints the result on stdout.
430 
431 \param p is the pdb structure to write out.
432 \param fname is the file name to be written.
433 */
434 /************************************************************************/
gk_showcorruption(pdbf * p)435 void gk_showcorruption(pdbf *p) {
436 	int corruption = p->corruption;
437 	if(corruption&CRP_ALTLOCS)
438 		printf("Multiple coordinate sets for at least one atom\n");
439 	if(corruption&CRP_MISSINGCA)
440 		printf("Missing coordiantes for at least one CA atom\n");
441 	if(corruption&CRP_MISSINGBB)
442 		printf("Missing coordiantes for at least one backbone atom (N,CA,C,O)\n");
443 	if(corruption&CRP_MULTICHAIN)
444 		printf("File contains coordinates for multiple chains\n");
445 	if(corruption&CRP_MULTICA)
446 		printf("Multiple CA atoms found for the same residue (could be alternate locators)\n");
447 	if(corruption&CRP_MULTICA)
448 		printf("Multiple copies of backbone atoms found for the same residue (could be alternate locators)\n");
449 }
450 			/* sscanf(line, "%6s%5d%*1c%4s%1c%3s%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf%*6c%4s%2s%2s\n",
451 			linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,segId,element,charge);
452 			printf(".%s.%s.%s.\n",segId,element,charge);
453 			printf("%-6s%5d%-1s%-4s%1c%3s%1s%1c%4d%1c%3s%8.3lf%8.3lf%8.3lf%6.2f%6.2f%6s%4s%2s%2s\n",
454 			linetype,aserial," ",aname,altLoc,rname," ",chainid,rserial,icode," ",x,y,z,opcy,tmpt," ",segId,element,charge); */
455 
456 			/* and we could probably get away with this using astral files, */
457 			/* sscanf(line, "%6s%5d%*1c%4s%1c%3s%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf%*6c%6s\n",
458 			linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,element);
459 			printf("%-6s%5d%-1s%-4s%1c%3s%1s%1c%4d%1c%3s%8.3lf%8.3lf%8.3lf%6.2f%6.2f%6s%6s\n",
460 			linetype,aserial," ",aname,altLoc,rname," ",chainid,rserial,icode," ",x,y,z,opcy,tmpt," ",element); */
461