1# Copyright 2008-2015 by Peter Cock.  All rights reserved.
2#
3# This file is part of the Biopython distribution and governed by your
4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
5# Please see the LICENSE file that should have been included as part of this
6# package.
7"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.
8
9This module is for reading and writing PIR or NBRF format files as
10SeqRecord objects.
11
12You are expected to use this module via the Bio.SeqIO functions, or if
13the file contains a sequence alignment, optionally via Bio.AlignIO instead.
14
15This format was introduced for the Protein Information Resource (PIR), a
16project of the National Biomedical Research Foundation (NBRF).  The PIR
17database itself is now part of UniProt.
18
19The file format is described online at:
20http://www.ebi.ac.uk/help/pir_frame.html
21http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)
22
23An example file in this format would be::
24
25  >P1;CRAB_ANAPL
26  ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
27    MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
28    SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
29    GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
30    SDVPERSIPI TREEKPAIAG AQRK*
31
32  >P1;CRAB_BOVIN
33  ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
34    MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
35    PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
36    HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
37    QASGPERTIP ITREEKPAVT AAPKK*
38
39Or, an example of a multiple sequence alignment::
40
41  >P1;S27231
42  rhodopsin - northern leopard frog
43  MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
44  VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
45  GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
46  EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
47  ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
48  YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
49
50  >P1;I51200
51  rhodopsin - African clawed frog
52  MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
53  VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
54  GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
55  EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
56  LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
57  YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
58
59  >P1;JN0120
60  rhodopsin - Japanese lamprey
61  MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
62  VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
63  GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
64  EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
65  ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
66  YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
67
68
69As with the FASTA format, each record starts with a line beginning with ">"
70character.  There is then a two letter sequence type (P1, F1, DL, DC, RL,
71RC, or XX), a semi colon, and the identification code.  The second like is
72free text description.  The remaining lines contain the sequence itself,
73terminating in an asterisk.  Space separated blocks of ten letters as shown
74above are typical.
75
76Sequence codes and their meanings:
77 - P1 - Protein (complete)
78 - F1 - Protein (fragment)
79 - D1 - DNA (e.g. EMBOSS seqret output)
80 - DL - DNA (linear)
81 - DC - DNA (circular)
82 - RL - RNA (linear)
83 - RC - RNA (circular)
84 - N3 - tRNA
85 - N1 - Other functional RNA
86 - XX - Unknown
87
88"""
89from Bio.Seq import Seq
90from Bio.SeqRecord import SeqRecord
91
92from .Interfaces import _get_seq_string
93from .Interfaces import SequenceIterator
94from .Interfaces import SequenceWriter
95
96
97_pir_mol_type = {
98    "P1": "protein",
99    "F1": "protein",
100    "D1": "DNA",
101    "DL": "DNA",
102    "DC": "DNA",
103    "RL": "RNA",
104    "RC": "RNA",
105    "N3": "RNA",
106    "XX": None,
107}
108
109
110class PirIterator(SequenceIterator):
111    """Parser for PIR files."""
112
113    def __init__(self, source):
114        """Iterate over a PIR file and yield SeqRecord objects.
115
116        source - file-like object or a path to a file.
117
118        Examples
119        --------
120        >>> with open("NBRF/DMB_prot.pir") as handle:
121        ...    for record in PirIterator(handle):
122        ...        print("%s length %i" % (record.id, len(record)))
123        HLA:HLA00489 length 263
124        HLA:HLA00490 length 94
125        HLA:HLA00491 length 94
126        HLA:HLA00492 length 80
127        HLA:HLA00493 length 175
128        HLA:HLA01083 length 188
129
130        """
131        super().__init__(source, mode="t", fmt="Pir")
132
133    def parse(self, handle):
134        """Start parsing the file, and return a SeqRecord generator."""
135        records = self.iterate(handle)
136        return records
137
138    def iterate(self, handle):
139        """Iterate over the records in the PIR file."""
140        # Skip any text before the first record (e.g. blank lines, comments)
141        for line in handle:
142            if line[0] == ">":
143                break
144        else:
145            return  # Premature end of file, or just empty?
146
147        while True:
148            pir_type = line[1:3]
149            if pir_type not in _pir_mol_type or line[3] != ";":
150                raise ValueError(
151                    "Records should start with '>XX;' where XX is a valid sequence type"
152                )
153            identifier = line[4:].strip()
154            description = handle.readline().strip()
155
156            lines = []
157            for line in handle:
158                if line[0] == ">":
159                    break
160                # Remove trailing whitespace, and any internal spaces
161                lines.append(line.rstrip().replace(" ", ""))
162            else:
163                line = None
164            seq = "".join(lines)
165            if seq[-1] != "*":
166                # Note the * terminator is present on nucleotide sequences too,
167                # it is not a stop codon!
168                raise ValueError(
169                    "Sequences in PIR files should include a * terminator!"
170                )
171
172            # Return the record and then continue...
173            record = SeqRecord(
174                Seq(seq[:-1]), id=identifier, name=identifier, description=description,
175            )
176            record.annotations["PIR-type"] = pir_type
177            if _pir_mol_type[pir_type]:
178                record.annotations["molecule_type"] = _pir_mol_type[pir_type]
179            yield record
180
181            if line is None:
182                return  # StopIteration
183        raise ValueError("Unrecognised PIR record format.")
184
185
186class PirWriter(SequenceWriter):
187    """Class to write PIR format files."""
188
189    def __init__(self, handle, wrap=60, record2title=None, code=None):
190        """Create a PIR writer.
191
192        Arguments:
193         - handle - Handle to an output file, e.g. as returned
194           by open(filename, "w")
195         - wrap - Optional line length used to wrap sequence lines.
196           Defaults to wrapping the sequence at 60 characters
197           Use zero (or None) for no wrapping, giving a single
198           long line for the sequence.
199         - record2title - Optional function to return the text to be
200           used for the title line of each record.  By default
201           a combination of the record.id, record.name and
202           record.description is used.
203         - code - Optional sequence code must be one of P1, F1,
204           D1, DL, DC, RL, RC, N3 and XX. By default None is used,
205           which means auto detection based on the molecule type
206           in the record annotation.
207
208        You can either use::
209
210            handle = open(filename, "w")
211            writer = PirWriter(handle)
212            writer.write_file(myRecords)
213            handle.close()
214
215        Or, follow the sequential file writer system, for example::
216
217            handle = open(filename, "w")
218            writer = PirWriter(handle)
219            writer.write_header() # does nothing for PIR files
220            ...
221            Multiple writer.write_record() and/or writer.write_records() calls
222            ...
223            writer.write_footer() # does nothing for PIR files
224            handle.close()
225
226        """
227        super().__init__(handle)
228        self.wrap = None
229        if wrap:
230            if wrap < 1:
231                raise ValueError("wrap should be None, 0, or a positive integer")
232        self.wrap = wrap
233        self.record2title = record2title
234        self.code = code
235
236    def write_record(self, record):
237        """Write a single PIR record to the file."""
238        if self.record2title:
239            title = self.clean(self.record2title(record))
240        else:
241            title = self.clean(record.id)
242
243        if record.name and record.description:
244            description = self.clean(record.name + " - " + record.description)
245        elif record.name and not record.description:
246            description = self.clean(record.name)
247        else:
248            description = self.clean(record.description)
249
250        if self.code:
251            code = self.code
252        else:
253            molecule_type = record.annotations.get("molecule_type")
254            if molecule_type is None:
255                code = "XX"
256            elif "DNA" in molecule_type:
257                code = "D1"
258            elif "RNA" in molecule_type:
259                code = "RL"
260            elif "protein" in molecule_type:
261                code = "P1"
262            else:
263                code = "XX"
264
265        if code not in _pir_mol_type:
266            raise TypeError(
267                "Sequence code must be one of " + _pir_mol_type.keys() + "."
268            )
269        assert "\n" not in title
270        assert "\r" not in description
271
272        self.handle.write(">%s;%s\n%s\n" % (code, title, description))
273
274        data = _get_seq_string(record)  # Catches sequence being None
275
276        assert "\n" not in data
277        assert "\r" not in data
278
279        if self.wrap:
280            line = ""
281            for i in range(0, len(data), self.wrap):
282                line += data[i : i + self.wrap] + "\n"
283            line = line[:-1] + "*\n"
284            self.handle.write(line)
285        else:
286            self.handle.write(data + "*\n")
287
288
289if __name__ == "__main__":
290    from Bio._utils import run_doctest
291
292    run_doctest(verbose=0)
293