1--$Revision: 96973 $
2--**********************************************************************
3--
4--  Biological Macromolecule 3-D Structure Data Types for MMDB,
5--                A Molecular Modeling Database
6--
7--  Definitions for structural models
8--
9--  By Hitomi Ohkawa, Jim Ostell, Chris Hogue and Steve Bryant
10--
11--  National Center for Biotechnology Information
12--  National Institutes of Health
13--  Bethesda, MD 20894 USA
14--
15--  July, 1996
16--
17--**********************************************************************
18
19MMDB-Structural-model DEFINITIONS ::=
20
21BEGIN
22
23EXPORTS Biostruc-model, Model-id, Model-coordinate-set-id;
24
25IMPORTS Chem-graph-pntrs, Atom-pntrs, Chem-graph-alignment,
26	Sphere, Cone, Cylinder, Brick, Transform FROM MMDB-Features
27	Biostruc-id FROM MMDB
28	Pub FROM NCBI-Pub;
29
30-- A structural model maps chemical components into a measured three-
31-- dimensional space. PDB-derived biostrucs generally contain 4 models,
32-- corresponding to "views" of the structure of a biomolecular assembly with
33-- increasing levels of complexity.  Model types indicate the complexity of the
34-- view.
35
36-- The model named "NCBI all atom" represents a view suitable for most
37-- computational biology applications.  It provides complete atomic coordinate
38-- data for a "single best" model, omitting statistical disorder information
39-- and/or ensemble structure descriptions provided in the source PDB file.
40-- Construction of the single best model is based on the assumption that the
41-- contents of the "alternate conformation" field from pdb imply no correlation
42-- among the occupancies of multiple sites assigned to sets of atoms: the best
43-- site is chosen only on the basis of highest occupancy. Note, however, that
44-- alternate conformation sets where correlation is implied are generally
45-- constrained in crystallographic refinement to have uniform occupancy, and
46-- will thus be selected as a set. For ensemble models the model which assigns
47-- coordinates to the most atoms is chosen.  If numbers of coordinates are the
48-- same, the model occurring first in the PDB file is selected.  The single
49-- best model includes complete coordinates for all nonpolymer components, but
50-- omits those classified as "solvent".  Model type is 3 for this model.
51
52-- The model named "NCBI backbone" represents a simple view intended for
53-- graphic displays and rapid transmission over a network.  It includes only
54-- alpha carbon or backbone phosphate coordinates for biopolymers. It is based
55-- on selection of alpha-carbon and backbone phosphate atoms from the "NCBI
56-- all atom" model. The model type is set to 2.  An even simpler model gives
57-- only a cartoon representation, using cylinders corresponding to secondary
58-- structure elements.  This is named "NCBI vector", and has model type 1.
59
60-- The models named "PDB Model 1", "PDB Model 2", etc. represent the complete
61-- information provided by PDB, including full descriptions of statistical
62-- disorder.  The name of the model is based on the contents of the PDB MODEL
63-- record, with a default name of "PDB Model 1" for PDB files which contain
64-- only a single model.  Construction of these models is based on the
65-- assumption that contents of the PDB "alternate conformation" field are
66-- intended to imply correlation among the occupancies of atom sets flagged by
67-- the same identifier.  The special flag " " (blank) is assumed to indicate
68-- sites occupied in all alternate conformations, and sites flagged otherwise,
69-- together with " ", to indicate a distinct member of an ensemble of
70-- alternate conformations.  Note that construction of ensemble members
71-- according to these assumptions requires two validation checks on PDB
72-- "alternate conformation" flags: they must be unique among sites assigned to
73-- the same atom, and the special " " flag must occur only for unique
74-- sites.  Sites which violate the first check are flagged as "u", for
75-- "unknown"; they are omitted from all ensemble definitions but are
76-- nonetheless retained in the coordinate list.  Sites which violate the second
77-- check are flagged "b" for "blank", and are included in an appropriately
78-- named ensemble.  The model type for pdb all models is 4.
79
80-- Note that in the MMDB database models are stored in the ASN.1 stream in
81-- order of increasing model type value.  Since models occur as the last item
82-- in a biostruc, parsers may avoid reading the entire stream if the desired
83-- model is one of the simplified types, which occur first in the stream. This
84-- can save considerable I/O time, particularly for large ensemble models from
85-- NMR determinations.
86
87Biostruc-model ::= SEQUENCE {	-- one structural model ("view") of a biostruc
88	id			Model-id,	-- integer identifier of this model
89	type			Model-type,	-- complexity of the view (see Model-type)
90	descr			SEQUENCE OF Model-descr OPTIONAL,	-- optional descriptive items: name, PDB remarks, comments, attribution
91	model-space		Model-space OPTIONAL,	-- optional measurement units and external reference frame
92	model-coordinates	SEQUENCE OF Model-coordinate-set OPTIONAL }	-- optional coordinate sets; a model may combine sets from different sources
93
94Model-id ::= INTEGER	-- integer identifier of a model (type of Biostruc-model.id); listed in this module's EXPORTS
95
96Model-type ::= INTEGER {	-- complexity of the view; MMDB stores models in order of increasing type value
97	ncbi-vector(1),		-- "NCBI vector": cartoon using cylinders for secondary structure elements
98	ncbi-backbone(2),	-- "NCBI backbone": alpha carbon or backbone phosphate coordinates only
99	ncbi-all-atom(3),	-- "NCBI all atom": single best model with complete atomic coordinates
100	pdb-model(4),		-- "PDB Model N": complete PDB information, including statistical disorder
101	other(255)}		-- NOTE(review): not described in this file; presumably a catch-all, confirm
102
103Model-descr ::= CHOICE {	-- one descriptive item attached to a model or a coordinate set
104	name			VisibleString,	-- model name, e.g. "NCBI all atom" or "PDB Model 1"
105	pdb-reso                VisibleString,	-- NOTE(review): presumably the PDB resolution text; not described in this file
106	pdb-method              VisibleString,	-- NOTE(review): presumably the PDB experimental method text; not described in this file
107	pdb-comment		VisibleString,	-- free-text comment; NOTE(review): presumably taken from the PDB entry, confirm
108	other-comment		VisibleString,	-- free-text comment from another origin
109	attribution		Pub }	-- publication reference; Pub is imported from NCBI-Pub
110
111-- The model space defines measurement units and any external reference frame.
112-- Coordinates refer to a right-handed orthogonal system defined on axes
113-- tagged x, y and z in the coordinate and feature definitions of a biostruc.
114-- Coordinates from PDB-derived structures are reported without change, in
115-- angstrom units.  The units of temperature and occupancy factors are not
116-- defined explicitly in PDB, but are inferred from their value range.
117
118Model-space ::= SEQUENCE {	-- measurement units and optional external reference frame for a model
119	coordinate-units	ENUMERATED {	-- unit of the x, y, z coordinates (required)
120					angstroms(1),	-- used for PDB-derived structures, reported without change
121					nanometers(2),
122					other(3),
123					unknown(255)},
124	thermal-factor-units	ENUMERATED {	-- unit convention of temperature factors; inferred from value range for PDB data
125					b(1),	-- NOTE(review): presumably the crystallographic B convention, confirm
126					u(2),	-- NOTE(review): presumably the crystallographic U convention, confirm
127					other(3),
128					unknown(255)} OPTIONAL,
129	occupancy-factor-units	ENUMERATED {	-- unit convention of occupancies; inferred from value range for PDB data
130					fractional(1),
131					electrons(2),
132					other(3),
133					unknown(255)} OPTIONAL,
134	density-units		ENUMERATED {	-- unit convention of density values
135					electrons-per-unit-volume(1),
136					arbitrary-scale(2),
137					other(3),
138					unknown(255)} OPTIONAL,
139	reference-frame		Reference-frame OPTIONAL }	-- external frame; intended for homology-derived models, absent for PDB structures
140
141-- An external reference frame is a pointer to another biostruc, with an
142-- optional operator to rotate and translate coordinates into its model space.
143-- This item is intended for representation of homology-derived model
144-- structures, and is not present for structures from PDB.
145
146Reference-frame ::= SEQUENCE {	-- pointer to another biostruc that serves as external reference frame
147	biostruc-id		Biostruc-id,	-- the referenced biostruc; Biostruc-id is imported from MMDB
148	rotation-translation	Transform OPTIONAL }	-- optional operator to rotate and translate coordinates into its model space
149
150-- Atomic coordinates may be assigned literally or by reference to another
151-- biostruc.  The reference coordinate type is used to represent homology-
152-- derived model structures.  PDB-derived structures have literal coordinates.
153
154-- Referenced coordinates identify another biostruc, any transformation to be
155-- applied to coordinates from that biostruc, and a mapping of the chemical
156-- graph of the present biostruc onto that of the referenced biostruc.  They
157-- give an "alignment" of atoms in the current biostruc with those in another,
158-- from which the coordinates of matched atoms may be retrieved.  For non-
159-- atomic models "alignment" may also be represented by molecule and residue
160-- equivalence lists.  Referenced coordinates are a data item intended for
161-- representation of homology models, with an explicit pointer to their source
162-- information. They do not occur in PDB-derived models.
163
164Model-coordinate-set ::= SEQUENCE {	-- one set of coordinates within a model, given literally or by reference
165	id			Model-coordinate-set-id OPTIONAL,	-- optional integer identifier of this set
166	descr			SEQUENCE OF Model-descr OPTIONAL,	-- optional descriptive items for this set
167	coordinates		CHOICE {
168		literal			Coordinates,	-- explicit coordinates; the form used by PDB-derived structures
169		reference		Chem-graph-alignment } }	-- alignment to another biostruc's atoms; used for homology-derived models
170
171Model-coordinate-set-id ::= INTEGER	-- integer identifier of a coordinate set; listed in this module's EXPORTS
172
173
174-- Literal coordinates map chemical components into the model space.  Three
175-- mapping types are allowed, atomic coordinate models, density-grid models,
176-- and surface models. A model consists of a sequence of such coordinate sets,
177-- and may thus combine coordinate subsets which have a different source.
178-- PDB-derived models contain a single atomic coordinate set, as they by
179-- definition represent information from a single source.
180
181Coordinates ::= CHOICE {	-- literal coordinates: exactly one of the three mapping types
182	atomic			Atomic-coordinates,	-- atomic coordinate model
183	surface			Surface-coordinates,	-- surface model
184	density			Density-coordinates }	-- density-grid model
185
186-- Literal atomic coordinate values give location, occupancy and order
187-- parameters, and a pointer to a specific atom defined in the biostruc graph.
188-- Temperature and occupancy factors have their conventional crystallographic
189-- definitions, with units defined in the model space declaration.  Atoms,
190-- sites, temperature-factors, occupancies and alternate-conformation-ids
191-- are parallel arrays, i.e. they have the same number of values as given by
192-- number-of-points. Conformation ensembles represent distinct correlated-
193-- disorder subsets of the coordinates.  They will be present only for certain
194-- "views" of PDB structures, as described above. Their derivation from PDB-
195-- supplied "alternate-conformation" ids is described below.
196
197Atomic-coordinates ::= SEQUENCE {	-- literal atomic coordinates, stored as parallel arrays
198	number-of-points	INTEGER,	-- number of values in each of the parallel arrays below
199	atoms			Atom-pntrs,	-- pointers to atoms in the chemical graph; Atom-pntrs is imported from MMDB-Features
200	sites			Model-space-points,	-- scaled integer x, y, z location of each point
201	temperature-factors	Atomic-temperature-factors OPTIONAL,	-- units are given in the model space declaration
202	occupancies		Atomic-occupancies OPTIONAL,	-- units are given in the model space declaration
203	alternate-conf-ids	Alternate-conformation-ids OPTIONAL,	-- one alternate conformation id per coordinate
204	conf-ensembles		SEQUENCE OF Conformation-ensemble OPTIONAL }	-- correlated-disorder subsets; present only for certain views of PDB structures
205
206-- The atoms whose location is described by each coordinate are identified
207-- via a hierarchical pointer to the chemical graph of the biomolecular
208-- assembly.  Coordinates may be matched with atoms in the chemical structure
209-- by the values of the molecule, residue and atom id's given here,  which
210-- match exactly the items of the same type defined in Biostruc-graph.
211
212-- Coordinates are given as integer values, with a scale factor to convert
213-- to real values for each x, y or z, in the units indicated in model-space.
214-- Integer values must be divided by the scale factor.  This use of integer
215-- values reduces the ASN.1 stream size. The scale factors for temperature
216-- factors and occupancies are given separately, but must be used in the same
217-- fashion to produce properly scaled real values.
218
219Model-space-points ::= SEQUENCE {	-- point locations as scaled integers, to reduce ASN.1 stream size
220	scale-factor		INTEGER,	-- divide each integer x, y, z by this to get real values in model-space units
221	x			SEQUENCE OF INTEGER,	-- scaled x coordinates
222	y			SEQUENCE OF INTEGER,	-- scaled y coordinates
223	z			SEQUENCE OF INTEGER }	-- scaled z coordinates
224
225Atomic-temperature-factors ::= CHOICE {	-- temperature factors in one of two representations
226	isotropic		Isotropic-temperature-factors,	-- one value array (b)
227	anisotropic		Anisotropic-temperature-factors }	-- six value arrays (b-11 .. b-33)
228
229Isotropic-temperature-factors ::= SEQUENCE {	-- isotropic temperature factors as scaled integers
230	scale-factor		INTEGER,	-- divide each integer value by this to get the real value
231	b			SEQUENCE OF INTEGER }	-- scaled temperature factor values
232
233Anisotropic-temperature-factors ::= SEQUENCE {	-- anisotropic temperature factors as scaled integers
234	scale-factor		INTEGER,	-- divide each integer value by this to get the real value
235	b-11			SEQUENCE OF INTEGER,	-- NOTE(review): the six b-ij arrays are presumably the unique elements
236	b-12			SEQUENCE OF INTEGER,	-- of a symmetric 3x3 tensor, one entry per coordinate;
237	b-13			SEQUENCE OF INTEGER,	-- this file does not state it, confirm
238	b-22			SEQUENCE OF INTEGER,
239	b-23			SEQUENCE OF INTEGER,
240	b-33			SEQUENCE OF INTEGER }
241
242Atomic-occupancies ::= SEQUENCE {	-- occupancy factors as scaled integers
243	scale-factor		INTEGER,	-- divide each integer value by this to get the real value
244	o			SEQUENCE OF INTEGER }	-- scaled occupancy values
245
246-- An alternate conformation id is optionally associated with each coordinate.
247-- Aside from corrections due to the validation checks described above, the
248-- contents of MMDB Alternate-conformation-ids are identical to the PDB
249-- "alternate conformation" field.
250
251Alternate-conformation-ids ::= SEQUENCE OF Alternate-conformation-id	-- array of ids parallel to the coordinates of an Atomic-coordinates set
252
253Alternate-conformation-id ::= VisibleString	-- PDB "alternate conformation" flag, e.g. " " or "A"; "u" and "b" mark sites that failed validation
254
255-- A correlated disorder ensemble is defined by a set of alternate conformation
256-- id's which identify coordinates relevant to that ensemble. These are
257-- defined from the validated and corrected contents of the PDB "alternate
258-- conformation" field as described above.  A given ensemble, for example, may
259-- consist of atom sites flagged by " " and "A" Alternate-conformation-ids.
260-- Names for ensembles are constructed from these flags. This example would be
261-- named, in its description, "PDB Ensemble blank plus A".
262
263-- Note that this interpretation is consistent with common PDB usage of the
264-- "alternate conformation" field, but that PDB specifications do not formally
265-- distinguish between correlated and uncorrelated disorder in crystallographic
266-- models. Ensembles identified in MMDB thus may not correspond to the meaning
267-- intended by PDB or the depositor.  No information is lost, however, and
268-- if the intended meaning is known alternative ensemble descriptions may be
269-- reconstructed directly from the Alternate-conformation-ids.
270
271-- Note that correlated disorder as defined here is allowed within an atomic
272-- coordinate set but not between the multiple sets which may define a model.
273-- Multiple sets within the same model are intended as a means to represent
274-- assemblies modeled from different experimentally determined structures,
275-- where correlated disorder between coordinate sets is not relevant.
276
277Conformation-ensemble ::= SEQUENCE {	-- one correlated-disorder ensemble within an atomic coordinate set
278	name		VisibleString,	-- name built from the member flags, e.g. "PDB Ensemble blank plus A"
279	alt-conf-ids	SEQUENCE OF Alternate-conformation-id }	-- ids of the coordinates that belong to this ensemble
280
281
282-- Literal surface coordinates define the chemical components whose structure
283-- is described by a surface, and the surface itself.  The surface may be
284-- either a regular geometric solid or a triangle-mesh of arbitrary shape.
285
286Surface-coordinates ::= SEQUENCE {	-- literal surface model: the described components plus the surface itself
287	contents		Chem-graph-pntrs,	-- chemical components whose structure the surface describes
288	surface			CHOICE {	sphere		Sphere,	-- regular solids: types imported from MMDB-Features
289						cone		Cone,
290						cylinder	Cylinder,
291						brick		Brick,
292						tmesh		T-mesh,	-- triangle mesh of arbitrary shape
293						triangles	Triangles } }	-- triangle list with explicit vertex arrays
294T-mesh ::= SEQUENCE {	-- triangle-mesh surface given as point arrays
295	number-of-points	INTEGER,	-- NOTE(review): presumably the length of the arrays below, confirm
296	scale-factor		INTEGER,	-- NOTE(review): presumably the divisor for x, y, z as in Model-space-points, confirm
297	swap			SEQUENCE OF BOOLEAN,	-- NOTE(review): meaning not described in this file; verify against the mesh consumer
298	x			SEQUENCE OF INTEGER,	-- integer x values of the mesh points
299	y			SEQUENCE OF INTEGER,	-- integer y values of the mesh points
300	z		        SEQUENCE OF INTEGER }	-- integer z values of the mesh points
301
302Triangles ::= SEQUENCE {	-- surface given as a point list plus a triangle list
303	number-of-points	INTEGER,	-- NOTE(review): presumably the length of x, y and z, confirm
304	scale-factor		INTEGER,	-- NOTE(review): presumably the divisor for x, y, z as in Model-space-points, confirm
305	x			SEQUENCE OF INTEGER,	-- integer x values of the points
306	y			SEQUENCE OF INTEGER,	-- integer y values of the points
307	z			SEQUENCE OF INTEGER,	-- integer z values of the points
308	number-of-triangles     INTEGER,	-- NOTE(review): presumably the length of v1, v2 and v3, confirm
309	v1			SEQUENCE OF INTEGER,	-- NOTE(review): v1, v2, v3 presumably index the three vertices of each
310	v2			SEQUENCE OF INTEGER,	-- triangle in the point arrays; index base is not stated in this file,
311	v3			SEQUENCE OF INTEGER }	-- confirm
312
313
314-- Literal density coordinates define the chemical components whose structure
315-- is described by a density grid, parameters of this grid, and density values.
316
317Density-coordinates ::= SEQUENCE {	-- literal density-grid model: components, grid parameters and density values
318	contents		Chem-graph-pntrs,	-- chemical components whose structure the grid describes
319	grid-corners		Brick,	-- grid extent; Brick is imported from MMDB-Features
320	grid-steps-x		INTEGER,	-- NOTE(review): presumably the number of grid steps along x, confirm
321	grid-steps-y		INTEGER,	-- NOTE(review): presumably the number of grid steps along y, confirm
322	grid-steps-z		INTEGER,	-- NOTE(review): presumably the number of grid steps along z, confirm
323	fastest-varying		ENUMERATED {	-- axis that varies fastest; NOTE(review): presumably refers to the order of the density array
324					x(1),
325					y(2),
326					z(3)},
327	slowest-varying		ENUMERATED {	-- axis that varies slowest; NOTE(review): presumably refers to the order of the density array
328					x(1),
329					y(2),
330					z(3)},
331	scale-factor		INTEGER,	-- NOTE(review): presumably the divisor for density values, as for coordinates, confirm
332	density			SEQUENCE OF INTEGER }	-- integer density values
333
334
335END
336