1# Copyright 2008-2015 by Peter Cock. All rights reserved. 2# 3# This file is part of the Biopython distribution and governed by your 4# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 5# Please see the LICENSE file that should have been included as part of this 6# package. 7"""Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format. 8 9This module is for reading and writing PIR or NBRF format files as 10SeqRecord objects. 11 12You are expected to use this module via the Bio.SeqIO functions, or if 13the file contains a sequence alignment, optionally via Bio.AlignIO instead. 14 15This format was introduced for the Protein Information Resource (PIR), a 16project of the National Biomedical Research Foundation (NBRF). The PIR 17database itself is now part of UniProt. 18 19The file format is described online at: 20http://www.ebi.ac.uk/help/pir_frame.html 21http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down) 22 23An example file in this format would be:: 24 25 >P1;CRAB_ANAPL 26 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). 27 MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR 28 SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH 29 GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ 30 SDVPERSIPI TREEKPAIAG AQRK* 31 32 >P1;CRAB_BOVIN 33 ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). 34 MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR 35 PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV 36 HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK 37 QASGPERTIP ITREEKPAVT AAPKK* 38 39Or, an example of a multiple sequence alignment:: 40 41 >P1;S27231 42 rhodopsin - northern leopard frog 43 MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY 44 VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG 45 GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP 46 EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES 47 ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI 48 YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA* 49 50 >P1;I51200 51 rhodopsin - African clawed frog 52 MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF 53 VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG 54 GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP 55 EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES 56 LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI 57 YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA* 58 59 >P1;JN0120 60 rhodopsin - Japanese lamprey 61 MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF 62 VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG 63 GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP 64 EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES 65 ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL 66 YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA* 67 68 69As with the FASTA format, each record starts with a line beginning with ">" 70character. There is then a two letter sequence type (P1, F1, DL, DC, RL, 71RC, or XX), a semi colon, and the identification code. The second like is 72free text description. The remaining lines contain the sequence itself, 73terminating in an asterisk. Space separated blocks of ten letters as shown 74above are typical. 75 76Sequence codes and their meanings: 77 - P1 - Protein (complete) 78 - F1 - Protein (fragment) 79 - D1 - DNA (e.g. EMBOSS seqret output) 80 - DL - DNA (linear) 81 - DC - DNA (circular) 82 - RL - RNA (linear) 83 - RC - RNA (circular) 84 - N3 - tRNA 85 - N1 - Other functional RNA 86 - XX - Unknown 87 88""" 89from Bio.Seq import Seq 90from Bio.SeqRecord import SeqRecord 91 92from .Interfaces import _get_seq_string 93from .Interfaces import SequenceIterator 94from .Interfaces import SequenceWriter 95 96 97_pir_mol_type = { 98 "P1": "protein", 99 "F1": "protein", 100 "D1": "DNA", 101 "DL": "DNA", 102 "DC": "DNA", 103 "RL": "RNA", 104 "RC": "RNA", 105 "N3": "RNA", 106 "XX": None, 107} 108 109 110class PirIterator(SequenceIterator): 111 """Parser for PIR files.""" 112 113 def __init__(self, source): 114 """Iterate over a PIR file and yield SeqRecord objects. 115 116 source - file-like object or a path to a file. 117 118 Examples 119 -------- 120 >>> with open("NBRF/DMB_prot.pir") as handle: 121 ... for record in PirIterator(handle): 122 ... print("%s length %i" % (record.id, len(record))) 123 HLA:HLA00489 length 263 124 HLA:HLA00490 length 94 125 HLA:HLA00491 length 94 126 HLA:HLA00492 length 80 127 HLA:HLA00493 length 175 128 HLA:HLA01083 length 188 129 130 """ 131 super().__init__(source, mode="t", fmt="Pir") 132 133 def parse(self, handle): 134 """Start parsing the file, and return a SeqRecord generator.""" 135 records = self.iterate(handle) 136 return records 137 138 def iterate(self, handle): 139 """Iterate over the records in the PIR file.""" 140 # Skip any text before the first record (e.g. blank lines, comments) 141 for line in handle: 142 if line[0] == ">": 143 break 144 else: 145 return # Premature end of file, or just empty? 146 147 while True: 148 pir_type = line[1:3] 149 if pir_type not in _pir_mol_type or line[3] != ";": 150 raise ValueError( 151 "Records should start with '>XX;' where XX is a valid sequence type" 152 ) 153 identifier = line[4:].strip() 154 description = handle.readline().strip() 155 156 lines = [] 157 for line in handle: 158 if line[0] == ">": 159 break 160 # Remove trailing whitespace, and any internal spaces 161 lines.append(line.rstrip().replace(" ", "")) 162 else: 163 line = None 164 seq = "".join(lines) 165 if seq[-1] != "*": 166 # Note the * terminator is present on nucleotide sequences too, 167 # it is not a stop codon! 168 raise ValueError( 169 "Sequences in PIR files should include a * terminator!" 170 ) 171 172 # Return the record and then continue... 173 record = SeqRecord( 174 Seq(seq[:-1]), id=identifier, name=identifier, description=description, 175 ) 176 record.annotations["PIR-type"] = pir_type 177 if _pir_mol_type[pir_type]: 178 record.annotations["molecule_type"] = _pir_mol_type[pir_type] 179 yield record 180 181 if line is None: 182 return # StopIteration 183 raise ValueError("Unrecognised PIR record format.") 184 185 186class PirWriter(SequenceWriter): 187 """Class to write PIR format files.""" 188 189 def __init__(self, handle, wrap=60, record2title=None, code=None): 190 """Create a PIR writer. 191 192 Arguments: 193 - handle - Handle to an output file, e.g. as returned 194 by open(filename, "w") 195 - wrap - Optional line length used to wrap sequence lines. 196 Defaults to wrapping the sequence at 60 characters 197 Use zero (or None) for no wrapping, giving a single 198 long line for the sequence. 199 - record2title - Optional function to return the text to be 200 used for the title line of each record. By default 201 a combination of the record.id, record.name and 202 record.description is used. 203 - code - Optional sequence code must be one of P1, F1, 204 D1, DL, DC, RL, RC, N3 and XX. By default None is used, 205 which means auto detection based on the molecule type 206 in the record annotation. 207 208 You can either use:: 209 210 handle = open(filename, "w") 211 writer = PirWriter(handle) 212 writer.write_file(myRecords) 213 handle.close() 214 215 Or, follow the sequential file writer system, for example:: 216 217 handle = open(filename, "w") 218 writer = PirWriter(handle) 219 writer.write_header() # does nothing for PIR files 220 ... 221 Multiple writer.write_record() and/or writer.write_records() calls 222 ... 223 writer.write_footer() # does nothing for PIR files 224 handle.close() 225 226 """ 227 super().__init__(handle) 228 self.wrap = None 229 if wrap: 230 if wrap < 1: 231 raise ValueError("wrap should be None, 0, or a positive integer") 232 self.wrap = wrap 233 self.record2title = record2title 234 self.code = code 235 236 def write_record(self, record): 237 """Write a single PIR record to the file.""" 238 if self.record2title: 239 title = self.clean(self.record2title(record)) 240 else: 241 title = self.clean(record.id) 242 243 if record.name and record.description: 244 description = self.clean(record.name + " - " + record.description) 245 elif record.name and not record.description: 246 description = self.clean(record.name) 247 else: 248 description = self.clean(record.description) 249 250 if self.code: 251 code = self.code 252 else: 253 molecule_type = record.annotations.get("molecule_type") 254 if molecule_type is None: 255 code = "XX" 256 elif "DNA" in molecule_type: 257 code = "D1" 258 elif "RNA" in molecule_type: 259 code = "RL" 260 elif "protein" in molecule_type: 261 code = "P1" 262 else: 263 code = "XX" 264 265 if code not in _pir_mol_type: 266 raise TypeError( 267 "Sequence code must be one of " + _pir_mol_type.keys() + "." 268 ) 269 assert "\n" not in title 270 assert "\r" not in description 271 272 self.handle.write(">%s;%s\n%s\n" % (code, title, description)) 273 274 data = _get_seq_string(record) # Catches sequence being None 275 276 assert "\n" not in data 277 assert "\r" not in data 278 279 if self.wrap: 280 line = "" 281 for i in range(0, len(data), self.wrap): 282 line += data[i : i + self.wrap] + "\n" 283 line = line[:-1] + "*\n" 284 self.handle.write(line) 285 else: 286 self.handle.write(data + "*\n") 287 288 289if __name__ == "__main__": 290 from Bio._utils import run_doctest 291 292 run_doctest(verbose=0) 293