1# Copyright 2000 by Jeffrey Chang, Brad Chapman.  All rights reserved.
2# Copyright 2006-2017 by Peter Cock.  All rights reserved.
3#
4# This code is part of the Biopython distribution and governed by its
5# license.  Please see the LICENSE file that should have been included
6# as part of this package.
7
8"""Code to work with GenBank formatted files.
9
10Rather than using Bio.GenBank, you are now encouraged to use Bio.SeqIO with
11the "genbank" or "embl" format names to parse GenBank or EMBL files into
12SeqRecord and SeqFeature objects (see the Biopython tutorial for details).
13
14Using Bio.GenBank directly to parse GenBank files is only useful if you want
15to obtain GenBank-specific Record objects, which is a much closer
16representation to the raw file contents than the SeqRecord alternative from
17the FeatureParser (used in Bio.SeqIO).
18
19To use the Bio.GenBank parser, there are two helper functions:
20
21    - read                  Parse a handle containing a single GenBank record
22      as Bio.GenBank specific Record objects.
23    - parse                 Iterate over a handle containing multiple GenBank
24      records as Bio.GenBank specific Record objects.
25
26The following internal classes are not intended for direct use and may
27be deprecated in a future release.
28
29Classes:
30 - Iterator              Iterate through a file of GenBank entries
31 - ErrorFeatureParser    Catch errors caused during parsing.
32 - FeatureParser         Parse GenBank data in SeqRecord and SeqFeature objects.
33 - RecordParser          Parse GenBank data into a Record object.
34
35Exceptions:
36 - ParserFailureError    Exception indicating a failure in the parser (ie.
37   scanner or consumer)
38 - LocationParserError   Exception indicating a problem with the spark based
39   location parser.
40
41"""
42
43import re
44import warnings
45
46from Bio import BiopythonParserWarning
47from Bio.Seq import Seq
48from Bio import SeqFeature
49
50# other Bio.GenBank stuff
51from .utils import FeatureValueCleaner
52from .Scanner import GenBankScanner
53
54
55# Constants used to parse GenBank header lines
56GENBANK_INDENT = 12
57GENBANK_SPACER = " " * GENBANK_INDENT
58
59# Constants for parsing GenBank feature lines
60FEATURE_KEY_INDENT = 5
61FEATURE_QUALIFIER_INDENT = 21
62FEATURE_KEY_SPACER = " " * FEATURE_KEY_INDENT
63FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
64
65# Regular expressions for location parsing
66_solo_location = r"[<>]?\d+"
67_pair_location = r"[<>]?\d+\.\.[<>]?\d+"
68_between_location = r"\d+\^\d+"
69
70_within_position = r"\(\d+\.\d+\)"
71_re_within_position = re.compile(_within_position)
72_within_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (
73    _within_position,
74    _within_position,
75)
76assert _re_within_position.match("(3.9)")
77assert re.compile(_within_location).match("(3.9)..10")
78assert re.compile(_within_location).match("26..(30.33)")
79assert re.compile(_within_location).match("(13.19)..(20.28)")
80
81_oneof_position = r"one\-of\(\d+(,\d+)+\)"
82_re_oneof_position = re.compile(_oneof_position)
83_oneof_location = r"([<>]?\d+|%s)\.\.([<>]?\d+|%s)" % (_oneof_position, _oneof_position)
84assert _re_oneof_position.match("one-of(6,9)")
85assert re.compile(_oneof_location).match("one-of(6,9)..101")
86assert re.compile(_oneof_location).match("one-of(6,9)..one-of(101,104)")
87assert re.compile(_oneof_location).match("6..one-of(101,104)")
88
89assert not _re_oneof_position.match("one-of(3)")
90assert _re_oneof_position.match("one-of(3,6)")
91assert _re_oneof_position.match("one-of(3,6,9)")
92
93
94_simple_location = r"\d+\.\.\d+"
95_re_simple_location = re.compile(r"^%s$" % _simple_location)
96_re_simple_compound = re.compile(
97    r"^(join|order|bond)\(%s(,%s)*\)$" % (_simple_location, _simple_location)
98)
99_complex_location = r"([a-zA-Z][a-zA-Z0-9_\.\|]*[a-zA-Z0-9]?\:)?(%s|%s|%s|%s|%s)" % (
100    _pair_location,
101    _solo_location,
102    _between_location,
103    _within_location,
104    _oneof_location,
105)
106_re_complex_location = re.compile(r"^%s$" % _complex_location)
107_possibly_complemented_complex_location = r"(%s|complement\(%s\))" % (
108    _complex_location,
109    _complex_location,
110)
111_re_complex_compound = re.compile(
112    r"^(join|order|bond)\(%s(,%s)*\)$"
113    % (_possibly_complemented_complex_location, _possibly_complemented_complex_location)
114)
115
116
117assert _re_simple_location.match("104..160")
118assert not _re_simple_location.match("68451760..68452073^68452074")
119assert not _re_simple_location.match("<104..>160")
120assert not _re_simple_location.match("104")
121assert not _re_simple_location.match("<1")
122assert not _re_simple_location.match(">99999")
123assert not _re_simple_location.match("join(104..160,320..390,504..579)")
124assert not _re_simple_compound.match("bond(12,63)")
125assert _re_simple_compound.match("join(104..160,320..390,504..579)")
126assert _re_simple_compound.match("order(1..69,1308..1465)")
127assert not _re_simple_compound.match("order(1..69,1308..1465,1524)")
128assert not _re_simple_compound.match("join(<1..442,992..1228,1524..>1983)")
129assert not _re_simple_compound.match("join(<1..181,254..336,422..497,574..>590)")
130assert not _re_simple_compound.match(
131    "join(1475..1577,2841..2986,3074..3193,3314..3481,4126..>4215)"
132)
133assert not _re_simple_compound.match("test(1..69,1308..1465)")
134assert not _re_simple_compound.match("complement(1..69)")
135assert not _re_simple_compound.match("(1..69)")
136assert _re_complex_location.match("(3.9)..10")
137assert _re_complex_location.match("26..(30.33)")
138assert _re_complex_location.match("(13.19)..(20.28)")
139assert _re_complex_location.match("41^42")  # between
140assert _re_complex_location.match("AL121804:41^42")
141assert _re_complex_location.match("AL121804:41..610")
142assert _re_complex_location.match("AL121804.2:41..610")
143assert _re_complex_location.match(
144    "AL358792.24.1.166931:3274..3461"
145)  # lots of dots in external reference
146assert _re_complex_location.match("one-of(3,6)..101")
147assert _re_complex_compound.match(
148    "join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)"
149)
150assert not _re_simple_compound.match(
151    "join(153490..154269,AL121804.2:41..610,AL121804.2:672..1487)"
152)
153assert _re_complex_compound.match("join(complement(69611..69724),139856..140650)")
154assert _re_complex_compound.match(
155    "join(complement(AL354868.10.1.164018:80837..81016),complement(AL354868.10.1.164018:80539..80835))"
156)
157
158# Trans-spliced example from NC_016406, note underscore in reference name:
159assert _re_complex_location.match("NC_016402.1:6618..6676")
160assert _re_complex_location.match("181647..181905")
161assert _re_complex_compound.match(
162    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
163)
164assert not _re_complex_location.match(
165    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
166)
167assert not _re_simple_compound.match(
168    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
169)
170assert not _re_complex_location.match(
171    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
172)
173assert not _re_simple_location.match(
174    "join(complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905)"
175)
176
177_solo_bond = re.compile(r"bond\(%s\)" % _solo_location)
178assert _solo_bond.match("bond(196)")
179assert _solo_bond.search("bond(196)")
180assert _solo_bond.search("join(bond(284),bond(305),bond(309),bond(305))")
181
182
183def _pos(pos_str, offset=0):
184    """Build a Position object (PRIVATE).
185
186    For an end position, leave offset as zero (default):
187
188    >>> _pos("5")
189    ExactPosition(5)
190
191    For a start position, set offset to minus one (for Python counting):
192
193    >>> _pos("5", -1)
194    ExactPosition(4)
195
196    This also covers fuzzy positions:
197
198    >>> p = _pos("<5")
199    >>> p
200    BeforePosition(5)
201    >>> print(p)
202    <5
203    >>> int(p)
204    5
205
206    >>> _pos(">5")
207    AfterPosition(5)
208
209    By default assumes an end position, so note the integer behaviour:
210
211    >>> p = _pos("one-of(5,8,11)")
212    >>> p
213    OneOfPosition(11, choices=[ExactPosition(5), ExactPosition(8), ExactPosition(11)])
214    >>> print(p)
215    one-of(5,8,11)
216    >>> int(p)
217    11
218
219    >>> _pos("(8.10)")
220    WithinPosition(10, left=8, right=10)
221
222    Fuzzy start positions:
223
224    >>> p = _pos("<5", -1)
225    >>> p
226    BeforePosition(4)
227    >>> print(p)
228    <4
229    >>> int(p)
230    4
231
232    Notice how the integer behaviour changes too!
233
234    >>> p = _pos("one-of(5,8,11)", -1)
235    >>> p
236    OneOfPosition(4, choices=[ExactPosition(4), ExactPosition(7), ExactPosition(10)])
237    >>> print(p)
238    one-of(4,7,10)
239    >>> int(p)
240    4
241
242    """
243    if pos_str.startswith("<"):
244        return SeqFeature.BeforePosition(int(pos_str[1:]) + offset)
245    elif pos_str.startswith(">"):
246        return SeqFeature.AfterPosition(int(pos_str[1:]) + offset)
247    elif _re_within_position.match(pos_str):
248        s, e = pos_str[1:-1].split(".")
249        s = int(s) + offset
250        e = int(e) + offset
251        if offset == -1:
252            default = s
253        else:
254            default = e
255        return SeqFeature.WithinPosition(default, left=s, right=e)
256    elif _re_oneof_position.match(pos_str):
257        assert pos_str.startswith("one-of(")
258        assert pos_str[-1] == ")"
259        parts = [
260            SeqFeature.ExactPosition(int(pos) + offset)
261            for pos in pos_str[7:-1].split(",")
262        ]
263        if offset == -1:
264            default = min(int(pos) for pos in parts)
265        else:
266            default = max(int(pos) for pos in parts)
267        return SeqFeature.OneOfPosition(default, choices=parts)
268    else:
269        return SeqFeature.ExactPosition(int(pos_str) + offset)
270
271
272def _loc(loc_str, expected_seq_length, strand, seq_type=None):
273    """Make FeatureLocation from non-compound non-complement location (PRIVATE).
274
275    This is also invoked to 'automatically' fix ambiguous formatting of features
276    that span the origin of a circular sequence.
277
278    Simple examples,
279
280    >>> _loc("123..456", 1000, +1)
281    FeatureLocation(ExactPosition(122), ExactPosition(456), strand=1)
282    >>> _loc("<123..>456", 1000, strand = -1)
283    FeatureLocation(BeforePosition(122), AfterPosition(456), strand=-1)
284
285    A more complex location using within positions,
286
287    >>> _loc("(9.10)..(20.25)", 1000, 1)
288    FeatureLocation(WithinPosition(8, left=8, right=9), WithinPosition(25, left=20, right=25), strand=1)
289
290    Notice how that will act as though it has overall start 8 and end 25.
291
292    Zero length between feature,
293
294    >>> _loc("123^124", 1000, 0)
295    FeatureLocation(ExactPosition(123), ExactPosition(123), strand=0)
296
297    The expected sequence length is needed for a special case, a between
298    position at the start/end of a circular genome:
299
300    >>> _loc("1000^1", 1000, 1)
301    FeatureLocation(ExactPosition(1000), ExactPosition(1000), strand=1)
302
303    Apart from this special case, between positions P^Q must have P+1==Q,
304
305    >>> _loc("123^456", 1000, 1)
306    Traceback (most recent call last):
307       ...
308    ValueError: Invalid between location '123^456'
309
310    You can optionally provide a reference name:
311
312    >>> _loc("AL391218.9:105173..108462", 2000000, 1)
313    FeatureLocation(ExactPosition(105172), ExactPosition(108462), strand=1, ref='AL391218.9')
314
315    >>> _loc("<2644..159", 2868, 1, "circular")
316    CompoundLocation([FeatureLocation(BeforePosition(2643), ExactPosition(2868), strand=1), FeatureLocation(ExactPosition(0), ExactPosition(159), strand=1)], 'join')
317    """
318    if ":" in loc_str:
319        ref, loc_str = loc_str.split(":")
320    else:
321        ref = None
322    try:
323        s, e = loc_str.split("..")
324    except ValueError:
325        assert ".." not in loc_str
326        if "^" in loc_str:
327            # A between location like "67^68" (one based counting) is a
328            # special case (note it has zero length). In python slice
329            # notation this is 67:67, a zero length slice.  See Bug 2622
330            # Further more, on a circular genome of length N you can have
331            # a location N^1 meaning the junction at the origin. See Bug 3098.
332            # NOTE - We can imagine between locations like "2^4", but this
333            # is just "3".  Similarly, "2^5" is just "3..4"
334            s, e = loc_str.split("^")
335            if int(s) + 1 == int(e):
336                pos = _pos(s)
337            elif int(s) == expected_seq_length and e == "1":
338                pos = _pos(s)
339            else:
340                raise ValueError("Invalid between location %r" % loc_str) from None
341            return SeqFeature.FeatureLocation(pos, pos, strand, ref=ref)
342        else:
343            # e.g. "123"
344            s = loc_str
345            e = loc_str
346
347    # Attempt to fix features that span the origin
348    s_pos = _pos(s, -1)
349    e_pos = _pos(e)
350    if int(s_pos) > int(e_pos):
351        if seq_type is None or "circular" not in seq_type.lower():
352            warnings.warn(
353                "It appears that %r is a feature that spans "
354                "the origin, but the sequence topology is "
355                "undefined. Skipping feature." % loc_str,
356                BiopythonParserWarning,
357            )
358            return None
359        warnings.warn(
360            "Attempting to fix invalid location %r as "
361            "it looks like incorrect origin wrapping. "
362            "Please fix input file, this could have "
363            "unintended behavior." % loc_str,
364            BiopythonParserWarning,
365        )
366
367        f1 = SeqFeature.FeatureLocation(s_pos, expected_seq_length, strand)
368        f2 = SeqFeature.FeatureLocation(0, int(e_pos), strand)
369
370        if strand == -1:
371            # For complementary features spanning the origin
372            return f2 + f1
373        else:
374            return f1 + f2
375
376    return SeqFeature.FeatureLocation(_pos(s, -1), _pos(e), strand, ref=ref)
377
378
379def _split_compound_loc(compound_loc):
380    """Split a tricky compound location string (PRIVATE).
381
382    >>> list(_split_compound_loc("123..145"))
383    ['123..145']
384    >>> list(_split_compound_loc("123..145,200..209"))
385    ['123..145', '200..209']
386    >>> list(_split_compound_loc("one-of(200,203)..300"))
387    ['one-of(200,203)..300']
388    >>> list(_split_compound_loc("complement(123..145),200..209"))
389    ['complement(123..145)', '200..209']
390    >>> list(_split_compound_loc("123..145,one-of(200,203)..209"))
391    ['123..145', 'one-of(200,203)..209']
392    >>> list(_split_compound_loc("123..145,one-of(200,203)..one-of(209,211),300"))
393    ['123..145', 'one-of(200,203)..one-of(209,211)', '300']
394    >>> list(_split_compound_loc("123..145,complement(one-of(200,203)..one-of(209,211)),300"))
395    ['123..145', 'complement(one-of(200,203)..one-of(209,211))', '300']
396    >>> list(_split_compound_loc("123..145,200..one-of(209,211),300"))
397    ['123..145', '200..one-of(209,211)', '300']
398    >>> list(_split_compound_loc("123..145,200..one-of(209,211)"))
399    ['123..145', '200..one-of(209,211)']
400    >>> list(_split_compound_loc("complement(149815..150200),complement(293787..295573),NC_016402.1:6618..6676,181647..181905"))
401    ['complement(149815..150200)', 'complement(293787..295573)', 'NC_016402.1:6618..6676', '181647..181905']
402    """
403    if "one-of(" in compound_loc:
404        # Hard case
405        while "," in compound_loc:
406            assert compound_loc[0] != ","
407            assert compound_loc[0:2] != ".."
408            i = compound_loc.find(",")
409            part = compound_loc[:i]
410            compound_loc = compound_loc[i:]  # includes the comma
411            while part.count("(") > part.count(")"):
412                assert "one-of(" in part, (part, compound_loc)
413                i = compound_loc.find(")")
414                part += compound_loc[: i + 1]
415                compound_loc = compound_loc[i + 1 :]
416            if compound_loc.startswith(".."):
417                i = compound_loc.find(",")
418                if i == -1:
419                    part += compound_loc
420                    compound_loc = ""
421                else:
422                    part += compound_loc[:i]
423                    compound_loc = compound_loc[i:]  # includes the comma
424            while part.count("(") > part.count(")"):
425                assert part.count("one-of(") == 2
426                i = compound_loc.find(")")
427                part += compound_loc[: i + 1]
428                compound_loc = compound_loc[i + 1 :]
429            if compound_loc.startswith(","):
430                compound_loc = compound_loc[1:]
431            assert part
432            yield part
433        if compound_loc:
434            yield compound_loc
435    else:
436        # Easy case
437        yield from compound_loc.split(",")
438
439
440class Iterator:
441    """Iterator interface to move over a file of GenBank entries one at a time (OBSOLETE).
442
443    This class is likely to be deprecated in a future release of Biopython.
444    Please use Bio.SeqIO.parse(..., format="gb") or Bio.GenBank.parse(...)
445    for SeqRecord and GenBank specific Record objects respectively instead.
446    """
447
448    def __init__(self, handle, parser=None):
449        """Initialize the iterator.
450
451        Arguments:
452         - handle - A handle with GenBank entries to iterate through.
453         - parser - An optional parser to pass the entries through before
454           returning them. If None, then the raw entry will be returned.
455
456        """
457        self.handle = handle
458        self._parser = parser
459
460    def __next__(self):
461        """Return the next GenBank record from the handle.
462
463        Will return None if we ran out of records.
464        """
465        if self._parser is None:
466            lines = []
467            while True:
468                line = self.handle.readline()
469                if not line:
470                    return None  # Premature end of file?
471                lines.append(line)
472                if line.rstrip() == "//":
473                    break
474            return "".join(lines)
475        try:
476            return self._parser.parse(self.handle)
477        except StopIteration:
478            return None
479
480    def __iter__(self):
481        """Iterate over the records."""
482        return iter(self.__next__, None)
483
484
485class ParserFailureError(Exception):
486    """Failure caused by some kind of problem in the parser."""
487
488    pass
489
490
491class LocationParserError(Exception):
492    """Could not Properly parse out a location from a GenBank file."""
493
494    pass
495
496
497_cleaner = FeatureValueCleaner()
498
499
500class FeatureParser:
501    """Parse GenBank files into Seq + Feature objects (OBSOLETE).
502
503    Direct use of this class is discouraged, and may be deprecated in
504    a future release of Biopython.
505
506    Please use Bio.SeqIO.parse(...) or Bio.SeqIO.read(...) instead.
507    """
508
509    def __init__(self, debug_level=0, use_fuzziness=1, feature_cleaner=None):
510        """Initialize a GenBank parser and Feature consumer.
511
512        Arguments:
513         - debug_level - An optional argument that species the amount of
514           debugging information the parser should spit out. By default we have
515           no debugging info (the fastest way to do things), but if you want
516           you can set this as high as two and see exactly where a parse fails.
517         - use_fuzziness - Specify whether or not to use fuzzy representations.
518           The default is 1 (use fuzziness).
519         - feature_cleaner - A class which will be used to clean out the
520           values of features. This class must implement the function
521           clean_value. GenBank.utils has a "standard" cleaner class, which
522           is used by default.
523
524        """
525        self._scanner = GenBankScanner(debug_level)
526        self.use_fuzziness = use_fuzziness
527        if feature_cleaner:
528            self._cleaner = feature_cleaner
529        else:
530            self._cleaner = _cleaner  # default
531
532    def parse(self, handle):
533        """Parse the specified handle."""
534        _consumer = _FeatureConsumer(self.use_fuzziness, self._cleaner)
535        self._scanner.feed(handle, _consumer)
536        return _consumer.data
537
538
539class RecordParser:
540    """Parse GenBank files into Record objects (OBSOLETE).
541
542    Direct use of this class is discouraged, and may be deprecated in
543    a future release of Biopython.
544
545    Please use the Bio.GenBank.parse(...) or Bio.GenBank.read(...) functions
546    instead.
547    """
548
549    def __init__(self, debug_level=0):
550        """Initialize the parser.
551
552        Arguments:
553         - debug_level - An optional argument that species the amount of
554           debugging information the parser should spit out. By default we have
555           no debugging info (the fastest way to do things), but if you want
556           you can set this as high as two and see exactly where a parse fails.
557
558        """
559        self._scanner = GenBankScanner(debug_level)
560
561    def parse(self, handle):
562        """Parse the specified handle into a GenBank record."""
563        _consumer = _RecordConsumer()
564
565        self._scanner.feed(handle, _consumer)
566        return _consumer.data
567
568
569class _BaseGenBankConsumer:
570    """Abstract GenBank consumer providing useful general functions (PRIVATE).
571
572    This just helps to eliminate some duplication in things that most
573    GenBank consumers want to do.
574    """
575
576    # Special keys in GenBank records that we should remove spaces from
577    # For instance, \translation keys have values which are proteins and
578    # should have spaces and newlines removed from them. This class
579    # attribute gives us more control over specific formatting problems.
580    remove_space_keys = ["translation"]
581
582    def __init__(self):
583        pass
584
585    @staticmethod
586    def _split_keywords(keyword_string):
587        """Split a string of keywords into a nice clean list (PRIVATE)."""
588        # process the keywords into a python list
589        if keyword_string == "" or keyword_string == ".":
590            keywords = ""
591        elif keyword_string[-1] == ".":
592            keywords = keyword_string[:-1]
593        else:
594            keywords = keyword_string
595        keyword_list = keywords.split(";")
596        return [x.strip() for x in keyword_list]
597
598    @staticmethod
599    def _split_accessions(accession_string):
600        """Split a string of accession numbers into a list (PRIVATE)."""
601        # first replace all line feeds with spaces
602        # Also, EMBL style accessions are split with ';'
603        accession = accession_string.replace("\n", " ").replace(";", " ")
604
605        return [x.strip() for x in accession.split() if x.strip()]
606
607    @staticmethod
608    def _split_taxonomy(taxonomy_string):
609        """Split a string with taxonomy info into a list (PRIVATE)."""
610        if not taxonomy_string or taxonomy_string == ".":
611            # Missing data, no taxonomy
612            return []
613
614        if taxonomy_string[-1] == ".":
615            tax_info = taxonomy_string[:-1]
616        else:
617            tax_info = taxonomy_string
618        tax_list = tax_info.split(";")
619        new_tax_list = []
620        for tax_item in tax_list:
621            new_items = tax_item.split("\n")
622            new_tax_list.extend(new_items)
623        while "" in new_tax_list:
624            new_tax_list.remove("")
625        return [x.strip() for x in new_tax_list]
626
627    @staticmethod
628    def _clean_location(location_string):
629        """Clean whitespace out of a location string (PRIVATE).
630
631        The location parser isn't a fan of whitespace, so we clean it out
632        before feeding it into the parser.
633        """
634        # Originally this imported string.whitespace and did a replace
635        # via a loop.  It's simpler to just split on whitespace and rejoin
636        # the string - and this avoids importing string too.  See Bug 2684.
637        return "".join(location_string.split())
638
639    @staticmethod
640    def _remove_newlines(text):
641        """Remove any newlines in the passed text, returning the new string (PRIVATE)."""
642        # get rid of newlines in the qualifier value
643        newlines = ["\n", "\r"]
644        for ws in newlines:
645            text = text.replace(ws, "")
646
647        return text
648
649    @staticmethod
650    def _normalize_spaces(text):
651        """Replace multiple spaces in the passed text with single spaces (PRIVATE)."""
652        # get rid of excessive spaces
653        return " ".join(x for x in text.split(" ") if x)
654
655    @staticmethod
656    def _remove_spaces(text):
657        """Remove all spaces from the passed text (PRIVATE)."""
658        return text.replace(" ", "")
659
660    @staticmethod
661    def _convert_to_python_numbers(start, end):
662        """Convert a start and end range to python notation (PRIVATE).
663
664        In GenBank, starts and ends are defined in "biological" coordinates,
665        where 1 is the first base and [i, j] means to include both i and j.
666
667        In python, 0 is the first base and [i, j] means to include i, but
668        not j.
669
670        So, to convert "biological" to python coordinates, we need to
671        subtract 1 from the start, and leave the end and things should
672        be converted happily.
673        """
674        new_start = start - 1
675        new_end = end
676
677        return new_start, new_end
678
679
680class _FeatureConsumer(_BaseGenBankConsumer):
681    """Create a SeqRecord object with Features to return (PRIVATE).
682
683    Attributes:
684     - use_fuzziness - specify whether or not to parse with fuzziness in
685       feature locations.
686     - feature_cleaner - a class that will be used to provide specialized
687       cleaning-up of feature values.
688
689    """
690
691    def __init__(self, use_fuzziness, feature_cleaner=None):
692        from Bio.SeqRecord import SeqRecord
693
694        _BaseGenBankConsumer.__init__(self)
695        self.data = SeqRecord(None, id=None)
696        self.data.id = None
697        self.data.description = ""
698
699        self._use_fuzziness = use_fuzziness
700        self._feature_cleaner = feature_cleaner
701
702        self._seq_type = ""
703        self._seq_data = []
704        self._cur_reference = None
705        self._cur_feature = None
706        self._expected_size = None
707
708    def locus(self, locus_name):
709        """Set the locus name is set as the name of the Sequence."""
710        self.data.name = locus_name
711
712    def size(self, content):
713        """Record the sequence length."""
714        self._expected_size = int(content)
715
716    def residue_type(self, type):
717        """Record the sequence type (SEMI-OBSOLETE).
718
719        This reflects the fact that the topology (linear/circular) and
720        molecule type (e.g. DNA vs RNA) were a single field in early
721        files. Current GenBank/EMBL files have two fields.
722        """
723        self._seq_type = type.strip()
724
725    def topology(self, topology):
726        """Validate and record sequence topology.
727
728        The topology argument should be "linear" or "circular" (string).
729        """
730        if topology:
731            if topology not in ["linear", "circular"]:
732                raise ParserFailureError(
733                    "Unexpected topology %r should be linear or circular" % topology
734                )
735            self.data.annotations["topology"] = topology
736
737    def molecule_type(self, mol_type):
738        """Validate and record the molecule type (for round-trip etc)."""
739        if mol_type:
740            if "circular" in mol_type or "linear" in mol_type:
741                raise ParserFailureError(
742                    "Molecule type %r should not include topology" % mol_type
743                )
744
745            # Writing out records will fail if we have a lower case DNA
746            # or RNA string in here, so upper case it.
747            # This is a bit ugly, but we don't want to upper case e.g.
748            # the m in mRNA, but thanks to the strip we lost the spaces
749            # so we need to index from the back
750            if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper():
751                warnings.warn(
752                    "Non-upper case molecule type in LOCUS line: %s" % mol_type,
753                    BiopythonParserWarning,
754                )
755
756            self.data.annotations["molecule_type"] = mol_type
757
758    def data_file_division(self, division):
759        self.data.annotations["data_file_division"] = division
760
761    def date(self, submit_date):
762        self.data.annotations["date"] = submit_date
763
764    def definition(self, definition):
765        """Set the definition as the description of the sequence."""
766        if self.data.description:
767            # Append to any existing description
768            # e.g. EMBL files with two DE lines.
769            self.data.description += " " + definition
770        else:
771            self.data.description = definition
772
773    def accession(self, acc_num):
774        """Set the accession number as the id of the sequence.
775
776        If we have multiple accession numbers, the first one passed is
777        used.
778        """
779        new_acc_nums = self._split_accessions(acc_num)
780
781        # Also record them ALL in the annotations
782        try:
783            # On the off chance there was more than one accession line:
784            for acc in new_acc_nums:
785                # Prevent repeat entries
786                if acc not in self.data.annotations["accessions"]:
787                    self.data.annotations["accessions"].append(acc)
788        except KeyError:
789            self.data.annotations["accessions"] = new_acc_nums
790
791        # if we haven't set the id information yet, add the first acc num
792        if not self.data.id:
793            if len(new_acc_nums) > 0:
794                # self.data.id = new_acc_nums[0]
795                # Use the FIRST accession as the ID, not the first on this line!
796                self.data.id = self.data.annotations["accessions"][0]
797
798    def tls(self, content):
799        self.data.annotations["tls"] = content.split("-")
800
801    def tsa(self, content):
802        self.data.annotations["tsa"] = content.split("-")
803
804    def wgs(self, content):
805        self.data.annotations["wgs"] = content.split("-")
806
807    def add_wgs_scafld(self, content):
808        self.data.annotations.setdefault("wgs_scafld", []).append(content.split("-"))
809
810    def nid(self, content):
811        self.data.annotations["nid"] = content
812
813    def pid(self, content):
814        self.data.annotations["pid"] = content
815
816    def version(self, version_id):
817        # Want to use the versioned accession as the record.id
818        # This comes from the VERSION line in GenBank files, or the
819        # obsolete SV line in EMBL.  For the new EMBL files we need
820        # both the version suffix from the ID line and the accession
821        # from the AC line.
822        if version_id.count(".") == 1 and version_id.split(".")[1].isdigit():
823            self.accession(version_id.split(".")[0])
824            self.version_suffix(version_id.split(".")[1])
825        elif version_id:
826            # For backwards compatibility...
827            self.data.id = version_id
828
829    def project(self, content):
830        """Handle the information from the PROJECT line as a list of projects.
831
832        e.g.::
833
834            PROJECT     GenomeProject:28471
835
836        or::
837
838            PROJECT     GenomeProject:13543  GenomeProject:99999
839
840        This is stored as dbxrefs in the SeqRecord to be consistent with the
841        projected switch of this line to DBLINK in future GenBank versions.
842        Note the NCBI plan to replace "GenomeProject:28471" with the shorter
843        "Project:28471" as part of this transition.
844        """
845        content = content.replace("GenomeProject:", "Project:")
846        self.data.dbxrefs.extend(p for p in content.split() if p)
847
848    def dblink(self, content):
849        """Store DBLINK cross references as dbxrefs in our record object.
850
851        This line type is expected to replace the PROJECT line in 2009. e.g.
852
853        During transition::
854
855            PROJECT     GenomeProject:28471
856            DBLINK      Project:28471
857                        Trace Assembly Archive:123456
858
859        Once the project line is dropped::
860
861            DBLINK      Project:28471
862                        Trace Assembly Archive:123456
863
864        Note GenomeProject -> Project.
865
866        We'll have to see some real examples to be sure, but based on the
867        above example we can expect one reference per line.
868
869        Note that at some point the NCBI have included an extra space, e.g.::
870
871            DBLINK      Project: 28471
872
873        """
874        # During the transition period with both PROJECT and DBLINK lines,
875        # we don't want to add the same cross reference twice.
876        while ": " in content:
877            content = content.replace(": ", ":")
878        if content.strip() not in self.data.dbxrefs:
879            self.data.dbxrefs.append(content.strip())
880
881    def version_suffix(self, version):
882        """Set the version to overwrite the id.
883
884        Since the version provides the same information as the accession
885        number, plus some extra info, we set this as the id if we have
886        a version.
887        """
888        # e.g. GenBank line:
889        # VERSION     U49845.1  GI:1293613
890        # or the obsolete EMBL line:
891        # SV   U49845.1
892        # Scanner calls consumer.version("U49845.1")
893        # which then calls consumer.version_suffix(1)
894        #
895        # e.g. EMBL new line:
896        # ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
897        # Scanner calls consumer.version_suffix(1)
898        assert version.isdigit()
899        self.data.annotations["sequence_version"] = int(version)
900
901    def db_source(self, content):
902        self.data.annotations["db_source"] = content.rstrip()
903
904    def gi(self, content):
905        self.data.annotations["gi"] = content
906
907    def keywords(self, content):
908        if "keywords" in self.data.annotations:
909            # Multi-line keywords, append to list
910            # Note EMBL states "A keyword is never split between lines."
911            self.data.annotations["keywords"].extend(self._split_keywords(content))
912        else:
913            self.data.annotations["keywords"] = self._split_keywords(content)
914
915    def segment(self, content):
916        self.data.annotations["segment"] = content
917
918    def source(self, content):
919        # Note that some software (e.g. VectorNTI) may produce an empty
920        # source (rather than using a dot/period as might be expected).
921        if content == "":
922            source_info = ""
923        elif content[-1] == ".":
924            source_info = content[:-1]
925        else:
926            source_info = content
927        self.data.annotations["source"] = source_info
928
929    def organism(self, content):
930        self.data.annotations["organism"] = content
931
932    def taxonomy(self, content):
933        """Record (another line of) the taxonomy lineage."""
934        lineage = self._split_taxonomy(content)
935        try:
936            self.data.annotations["taxonomy"].extend(lineage)
937        except KeyError:
938            self.data.annotations["taxonomy"] = lineage
939
940    def reference_num(self, content):
941        """Signal the beginning of a new reference object."""
942        # if we have a current reference that hasn't been added to
943        # the list of references, add it.
944        if self._cur_reference is not None:
945            self.data.annotations["references"].append(self._cur_reference)
946        else:
947            self.data.annotations["references"] = []
948
949        self._cur_reference = SeqFeature.Reference()
950
951    def reference_bases(self, content):
952        """Attempt to determine the sequence region the reference entails.
953
954        Possible types of information we may have to deal with:
955
956        (bases 1 to 86436)
957        (sites)
958        (bases 1 to 105654; 110423 to 111122)
959        1  (residues 1 to 182)
960        """
961        # first remove the parentheses
962        assert content.endswith(")"), content
963        ref_base_info = content[1:-1]
964
965        all_locations = []
966        # parse if we've got 'bases' and 'to'
967        if "bases" in ref_base_info and "to" in ref_base_info:
968            # get rid of the beginning 'bases'
969            ref_base_info = ref_base_info[5:]
970            locations = self._split_reference_locations(ref_base_info)
971            all_locations.extend(locations)
972        elif "residues" in ref_base_info and "to" in ref_base_info:
973            residues_start = ref_base_info.find("residues")
974            # get only the information after "residues"
975            ref_base_info = ref_base_info[(residues_start + len("residues ")) :]
976            locations = self._split_reference_locations(ref_base_info)
977            all_locations.extend(locations)
978
979        # make sure if we are not finding information then we have
980        # the string 'sites' or the string 'bases'
981        elif ref_base_info == "sites" or ref_base_info.strip() == "bases":
982            pass
983        # otherwise raise an error
984        else:
985            raise ValueError(
986                "Could not parse base info %s in record %s"
987                % (ref_base_info, self.data.id)
988            )
989
990        self._cur_reference.location = all_locations
991
992    def _split_reference_locations(self, location_string):
993        """Get reference locations out of a string of reference information (PRIVATE).
994
995        The passed string should be of the form::
996
997            1 to 20; 20 to 100
998
999        This splits the information out and returns a list of location objects
1000        based on the reference locations.
1001        """
1002        # split possibly multiple locations using the ';'
1003        all_base_info = location_string.split(";")
1004
1005        new_locations = []
1006        for base_info in all_base_info:
1007            start, end = base_info.split("to")
1008            new_start, new_end = self._convert_to_python_numbers(
1009                int(start.strip()), int(end.strip())
1010            )
1011            this_location = SeqFeature.FeatureLocation(new_start, new_end)
1012            new_locations.append(this_location)
1013        return new_locations
1014
1015    def authors(self, content):
1016        if self._cur_reference.authors:
1017            self._cur_reference.authors += " " + content
1018        else:
1019            self._cur_reference.authors = content
1020
1021    def consrtm(self, content):
1022        if self._cur_reference.consrtm:
1023            self._cur_reference.consrtm += " " + content
1024        else:
1025            self._cur_reference.consrtm = content
1026
1027    def title(self, content):
1028        if self._cur_reference is None:
1029            warnings.warn(
1030                "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning
1031            )
1032        elif self._cur_reference.title:
1033            self._cur_reference.title += " " + content
1034        else:
1035            self._cur_reference.title = content
1036
1037    def journal(self, content):
1038        if self._cur_reference.journal:
1039            self._cur_reference.journal += " " + content
1040        else:
1041            self._cur_reference.journal = content
1042
1043    def medline_id(self, content):
1044        self._cur_reference.medline_id = content
1045
1046    def pubmed_id(self, content):
1047        self._cur_reference.pubmed_id = content
1048
1049    def remark(self, content):
1050        """Deal with a reference comment."""
1051        if self._cur_reference.comment:
1052            self._cur_reference.comment += " " + content
1053        else:
1054            self._cur_reference.comment = content
1055
1056    def comment(self, content):
1057        try:
1058            self.data.annotations["comment"] += "\n" + "\n".join(content)
1059        except KeyError:
1060            self.data.annotations["comment"] = "\n".join(content)
1061
1062    def structured_comment(self, content):
1063        self.data.annotations["structured_comment"] = content
1064
1065    def features_line(self, content):
1066        """Get ready for the feature table when we reach the FEATURE line."""
1067        self.start_feature_table()
1068
1069    def start_feature_table(self):
1070        """Indicate we've got to the start of the feature table."""
1071        # make sure we've added on our last reference object
1072        if self._cur_reference is not None:
1073            self.data.annotations["references"].append(self._cur_reference)
1074            self._cur_reference = None
1075
1076    def feature_key(self, content):
1077        # start a new feature
1078        self._cur_feature = SeqFeature.SeqFeature()
1079        self._cur_feature.type = content
1080        self.data.features.append(self._cur_feature)
1081
1082    def location(self, content):
1083        """Parse out location information from the location string.
1084
1085        This uses simple Python code with some regular expressions to do the
1086        parsing, and then translates the results into appropriate objects.
1087        """
1088        # clean up newlines and other whitespace inside the location before
1089        # parsing - locations should have no whitespace whatsoever
1090        location_line = self._clean_location(content)
1091
1092        # Older records have junk like replace(266,"c") in the
1093        # location line. Newer records just replace this with
1094        # the number 266 and have the information in a more reasonable
1095        # place. So we'll just grab out the number and feed this to the
1096        # parser. We shouldn't really be losing any info this way.
1097        if "replace" in location_line:
1098            comma_pos = location_line.find(",")
1099            location_line = location_line[8:comma_pos]
1100
1101        cur_feature = self._cur_feature
1102
1103        # Handle top level complement here for speed
1104        if location_line.startswith("complement("):
1105            assert location_line.endswith(")")
1106            location_line = location_line[11:-1]
1107            strand = -1
1108        elif "PROTEIN" in self._seq_type.upper():
1109            strand = None
1110        else:
1111            # Assume nucleotide otherwise feature strand for
1112            # GenBank files with bad LOCUS lines set to None
1113            strand = 1
1114
1115        # Special case handling of the most common cases for speed
1116        if _re_simple_location.match(location_line):
1117            # e.g. "123..456"
1118            s, e = location_line.split("..")
1119            try:
1120                cur_feature.location = SeqFeature.FeatureLocation(
1121                    int(s) - 1, int(e), strand
1122                )
1123            except ValueError:
1124                # Could be non-integers, more likely bad origin wrapping
1125                cur_feature.location = _loc(
1126                    location_line,
1127                    self._expected_size,
1128                    strand,
1129                    seq_type=self._seq_type.lower(),
1130                )
1131            return
1132
1133        if ",)" in location_line:
1134            warnings.warn(
1135                "Dropping trailing comma in malformed feature location",
1136                BiopythonParserWarning,
1137            )
1138            location_line = location_line.replace(",)", ")")
1139
1140        if _solo_bond.search(location_line):
1141            # e.g. bond(196)
1142            # e.g. join(bond(284),bond(305),bond(309),bond(305))
1143            warnings.warn(
1144                "Dropping bond qualifier in feature location", BiopythonParserWarning
1145            )
1146            # There ought to be a better way to do this...
1147            for x in _solo_bond.finditer(location_line):
1148                x = x.group()
1149                location_line = location_line.replace(x, x[5:-1])
1150
1151        if _re_simple_compound.match(location_line):
1152            # e.g. join(<123..456,480..>500)
1153            i = location_line.find("(")
1154            # cur_feature.location_operator = location_line[:i]
1155            # we can split on the comma because these are simple locations
1156            locs = []
1157            for part in location_line[i + 1 : -1].split(","):
1158                s, e = part.split("..")
1159
1160                try:
1161                    locs.append(SeqFeature.FeatureLocation(int(s) - 1, int(e), strand))
1162                except ValueError:
1163                    # Could be non-integers, more likely bad origin wrapping
1164
1165                    # In the case of bad origin wrapping, _loc will return
1166                    # a CompoundLocation. CompoundLocation.parts returns a
1167                    # list of the FeatureLocation objects inside the
1168                    # CompoundLocation.
1169                    locs.extend(
1170                        _loc(
1171                            part, self._expected_size, strand, self._seq_type.lower()
1172                        ).parts
1173                    )
1174
1175            if len(locs) < 2:
1176                # The CompoundLocation will raise a ValueError here!
1177                warnings.warn(
1178                    "Should have at least 2 parts for compound location",
1179                    BiopythonParserWarning,
1180                )
1181                cur_feature.location = None
1182                return
1183            if strand == -1:
1184                cur_feature.location = SeqFeature.CompoundLocation(
1185                    locs[::-1], operator=location_line[:i]
1186                )
1187            else:
1188                cur_feature.location = SeqFeature.CompoundLocation(
1189                    locs, operator=location_line[:i]
1190                )
1191            return
1192
1193        # Handle the general case with more complex regular expressions
1194        if _re_complex_location.match(location_line):
1195            # e.g. "AL121804.2:41..610"
1196            cur_feature.location = _loc(
1197                location_line,
1198                self._expected_size,
1199                strand,
1200                seq_type=self._seq_type.lower(),
1201            )
1202            return
1203
1204        if _re_complex_compound.match(location_line):
1205            i = location_line.find("(")
1206            # cur_feature.location_operator = location_line[:i]
1207            # Can't split on the comma because of positions like one-of(1,2,3)
1208            locs = []
1209            for part in _split_compound_loc(location_line[i + 1 : -1]):
1210                if part.startswith("complement("):
1211                    assert part[-1] == ")"
1212                    part = part[11:-1]
1213                    assert strand != -1, "Double complement?"
1214                    part_strand = -1
1215                else:
1216                    part_strand = strand
1217                try:
1218                    # There is likely a problem with origin wrapping.
1219                    # Using _loc to return a CompoundLocation of the
1220                    # wrapped feature and returning the two FeatureLocation
1221                    # objects to extend to the list of feature locations.
1222                    loc = _loc(
1223                        part,
1224                        self._expected_size,
1225                        part_strand,
1226                        seq_type=self._seq_type.lower(),
1227                    ).parts
1228
1229                except ValueError:
1230                    print(location_line)
1231                    print(part)
1232                    raise
1233                # loc will be a list of one or two FeatureLocation items.
1234                locs.extend(loc)
1235            # Historically a join on the reverse strand has been represented
1236            # in Biopython with both the parent SeqFeature and its children
1237            # (the exons for a CDS) all given a strand of -1.  Likewise, for
1238            # a join feature on the forward strand they all have strand +1.
1239            # However, we must also consider evil mixed strand examples like
1240            # this, join(complement(69611..69724),139856..140087,140625..140650)
1241            if strand == -1:
1242                # Whole thing was wrapped in complement(...)
1243                for l in locs:
1244                    assert l.strand == -1
1245                # Reverse the backwards order used in GenBank files
1246                # with complement(join(...))
1247                cur_feature.location = SeqFeature.CompoundLocation(
1248                    locs[::-1], operator=location_line[:i]
1249                )
1250            else:
1251                cur_feature.location = SeqFeature.CompoundLocation(
1252                    locs, operator=location_line[:i]
1253                )
1254            return
1255        # Not recognised
1256        if "order" in location_line and "join" in location_line:
1257            # See Bug 3197
1258            msg = (
1259                'Combinations of "join" and "order" within the same '
1260                "location (nested operators) are illegal:\n" + location_line
1261            )
1262            raise LocationParserError(msg)
1263        # This used to be an error....
1264        cur_feature.location = None
1265        warnings.warn(
1266            BiopythonParserWarning(
1267                "Couldn't parse feature location: %r" % location_line
1268            )
1269        )
1270
1271    def feature_qualifier(self, key, value):
1272        """When we get a qualifier key and its value.
1273
1274        Can receive None, since you can have valueless keys such as /pseudo
1275        """
1276        # Hack to try to preserve historical behaviour of /pseudo etc
1277        if value is None:
1278            # if the key doesn't exist yet, add an empty string
1279            if key not in self._cur_feature.qualifiers:
1280                self._cur_feature.qualifiers[key] = [""]
1281                return
1282            # otherwise just skip this key
1283            return
1284
1285        # Remove enclosing quotation marks
1286        value = re.sub('^"|"$', "", value)
1287
1288        # Handle NCBI escaping
1289        # Warn if escaping is not according to standard
1290        if re.search(r'[^"]"[^"]|^"[^"]|[^"]"$', value):
1291            warnings.warn(
1292                'The NCBI states double-quote characters like " should be escaped as "" '
1293                "(two double - quotes), but here it was not: %r" % value,
1294                BiopythonParserWarning,
1295            )
1296        # Undo escaping, repeated double quotes -> one double quote
1297        value = value.replace('""', '"')
1298
1299        if self._feature_cleaner is not None:
1300            value = self._feature_cleaner.clean_value(key, value)
1301
1302        # if the qualifier name exists, append the value
1303        if key in self._cur_feature.qualifiers:
1304            self._cur_feature.qualifiers[key].append(value)
1305        # otherwise start a new list of the key with its values
1306        else:
1307            self._cur_feature.qualifiers[key] = [value]
1308
1309    def feature_qualifier_name(self, content_list):
1310        """Use feature_qualifier instead (OBSOLETE)."""
1311        raise NotImplementedError("Use the feature_qualifier method instead.")
1312
1313    def feature_qualifier_description(self, content):
1314        """Use feature_qualifier instead (OBSOLETE)."""
1315        raise NotImplementedError("Use the feature_qualifier method instead.")
1316
1317    def contig_location(self, content):
1318        """Deal with CONTIG information."""
1319        # Historically this was stored as a SeqFeature object, but it was
1320        # stored under record.annotations["contig"] and not under
1321        # record.features with the other SeqFeature objects.
1322        #
1323        # The CONTIG location line can include additional tokens like
1324        # Gap(), Gap(100) or Gap(unk100) which are not used in the feature
1325        # location lines, so storing it using SeqFeature based location
1326        # objects is difficult.
1327        #
1328        # We now store this a string, which means for BioSQL we are now in
1329        # much better agreement with how BioPerl records the CONTIG line
1330        # in the database.
1331        #
1332        # NOTE - This code assumes the scanner will return all the CONTIG
1333        # lines already combined into one long string!
1334        self.data.annotations["contig"] = content
1335
1336    def origin_name(self, content):
1337        pass
1338
1339    def base_count(self, content):
1340        pass
1341
1342    def base_number(self, content):
1343        pass
1344
1345    def sequence(self, content):
1346        """Add up sequence information as we get it.
1347
1348        To try and make things speedier, this puts all of the strings
1349        into a list of strings, and then uses string.join later to put
1350        them together. Supposedly, this is a big time savings
1351        """
1352        assert " " not in content
1353        self._seq_data.append(content.upper())
1354
1355    def record_end(self, content):
1356        """Clean up when we've finished the record."""
1357        # Try and append the version number to the accession for the full id
1358        if not self.data.id:
1359            if "accessions" in self.data.annotations:
1360                raise ValueError(
1361                    "Problem adding version number to accession: "
1362                    + str(self.data.annotations["accessions"])
1363                )
1364            self.data.id = self.data.name  # Good fall back?
1365        elif self.data.id.count(".") == 0:
1366            try:
1367                self.data.id += ".%i" % self.data.annotations["sequence_version"]
1368            except KeyError:
1369                pass
1370
1371        # add the sequence information
1372
1373        sequence = "".join(self._seq_data)
1374
1375        if (
1376            self._expected_size is not None
1377            and len(sequence) != 0
1378            and self._expected_size != len(sequence)
1379        ):
1380            warnings.warn(
1381                "Expected sequence length %i, found %i (%s)."
1382                % (self._expected_size, len(sequence), self.data.id),
1383                BiopythonParserWarning,
1384            )
1385
1386        molecule_type = None
1387        if self._seq_type:
1388            # mRNA is really also DNA, since it is actually cDNA
1389            if "DNA" in self._seq_type.upper() or "MRNA" in self._seq_type.upper():
1390                molecule_type = "DNA"
1391            # are there ever really RNA sequences in GenBank?
1392            elif "RNA" in self._seq_type.upper():
1393                # Even for data which was from RNA, the sequence string
1394                # is usually given as DNA (T not U).  Bug 3010
1395                molecule_type = "RNA"
1396            elif (
1397                "PROTEIN" in self._seq_type.upper() or self._seq_type == "PRT"
1398            ):  # PRT is used in EMBL-bank for patents
1399                molecule_type = "protein"
1400            # work around ugly GenBank records which have circular or
1401            # linear but no indication of sequence type
1402            elif self._seq_type in ["circular", "linear", "unspecified"]:
1403                pass
1404            # we have a bug if we get here
1405            else:
1406                raise ValueError(
1407                    "Could not determine molecule_type for seq_type %s" % self._seq_type
1408                )
1409        # Don't overwrite molecule_type
1410        if molecule_type is not None:
1411            self.data.annotations["molecule_type"] = self.data.annotations.get(
1412                "molecule_type", molecule_type
1413            )
1414        if not sequence and self._expected_size:
1415            self.data.seq = Seq(None, length=self._expected_size)
1416        else:
1417            self.data.seq = Seq(sequence)
1418
1419
1420class _RecordConsumer(_BaseGenBankConsumer):
1421    """Create a GenBank Record object from scanner generated information (PRIVATE)."""
1422
1423    def __init__(self):
1424        _BaseGenBankConsumer.__init__(self)
1425        from . import Record
1426
1427        self.data = Record.Record()
1428
1429        self._seq_data = []
1430        self._cur_reference = None
1431        self._cur_feature = None
1432        self._cur_qualifier = None
1433
1434    def tls(self, content):
1435        self.data.tls = content.split("-")
1436
1437    def tsa(self, content):
1438        self.data.tsa = content.split("-")
1439
1440    def wgs(self, content):
1441        self.data.wgs = content.split("-")
1442
1443    def add_wgs_scafld(self, content):
1444        self.data.wgs_scafld.append(content.split("-"))
1445
1446    def locus(self, content):
1447        self.data.locus = content
1448
1449    def size(self, content):
1450        self.data.size = content
1451
1452    def residue_type(self, content):
1453        # Be lenient about parsing, but technically lowercase residue types are malformed.
1454        if "dna" in content or "rna" in content:
1455            warnings.warn(
1456                "Invalid seq_type (%s): DNA/RNA should be uppercase." % content,
1457                BiopythonParserWarning,
1458            )
1459        self.data.residue_type = content
1460
1461    def data_file_division(self, content):
1462        self.data.data_file_division = content
1463
1464    def date(self, content):
1465        self.data.date = content
1466
1467    def definition(self, content):
1468        self.data.definition = content
1469
1470    def accession(self, content):
1471        for acc in self._split_accessions(content):
1472            if acc not in self.data.accession:
1473                self.data.accession.append(acc)
1474
1475    def molecule_type(self, mol_type):
1476        """Validate and record the molecule type (for round-trip etc)."""
1477        if mol_type:
1478            if "circular" in mol_type or "linear" in mol_type:
1479                raise ParserFailureError(
1480                    "Molecule type %r should not include topology" % mol_type
1481                )
1482
1483            # Writing out records will fail if we have a lower case DNA
1484            # or RNA string in here, so upper case it.
1485            # This is a bit ugly, but we don't want to upper case e.g.
1486            # the m in mRNA, but thanks to the strip we lost the spaces
1487            # so we need to index from the back
1488            if mol_type[-3:].upper() in ("DNA", "RNA") and not mol_type[-3:].isupper():
1489                warnings.warn(
1490                    "Non-upper case molecule type in LOCUS line: %s" % mol_type,
1491                    BiopythonParserWarning,
1492                )
1493
1494            self.data.molecule_type = mol_type
1495
1496    def topology(self, topology):
1497        """Validate and record sequence topology.
1498
1499        The topology argument should be "linear" or "circular" (string).
1500        """
1501        if topology:
1502            if topology not in ["linear", "circular"]:
1503                raise ParserFailureError(
1504                    "Unexpected topology %r should be linear or circular" % topology
1505                )
1506            self.data.topology = topology
1507
1508    def nid(self, content):
1509        self.data.nid = content
1510
1511    def pid(self, content):
1512        self.data.pid = content
1513
1514    def version(self, content):
1515        self.data.version = content
1516
1517    def db_source(self, content):
1518        self.data.db_source = content.rstrip()
1519
1520    def gi(self, content):
1521        self.data.gi = content
1522
1523    def keywords(self, content):
1524        self.data.keywords = self._split_keywords(content)
1525
1526    def project(self, content):
1527        self.data.projects.extend(p for p in content.split() if p)
1528
1529    def dblink(self, content):
1530        self.data.dblinks.append(content)
1531
1532    def segment(self, content):
1533        self.data.segment = content
1534
1535    def source(self, content):
1536        self.data.source = content
1537
1538    def organism(self, content):
1539        self.data.organism = content
1540
1541    def taxonomy(self, content):
1542        self.data.taxonomy = self._split_taxonomy(content)
1543
1544    def reference_num(self, content):
1545        """Grab the reference number and signal the start of a new reference."""
1546        # check if we have a reference to add
1547        if self._cur_reference is not None:
1548            self.data.references.append(self._cur_reference)
1549
1550        from . import Record
1551
1552        self._cur_reference = Record.Reference()
1553        self._cur_reference.number = content
1554
1555    def reference_bases(self, content):
1556        self._cur_reference.bases = content
1557
1558    def authors(self, content):
1559        self._cur_reference.authors = content
1560
1561    def consrtm(self, content):
1562        self._cur_reference.consrtm = content
1563
1564    def title(self, content):
1565        if self._cur_reference is None:
1566            warnings.warn(
1567                "GenBank TITLE line without REFERENCE line.", BiopythonParserWarning
1568            )
1569            return
1570        self._cur_reference.title = content
1571
1572    def journal(self, content):
1573        self._cur_reference.journal = content
1574
1575    def medline_id(self, content):
1576        self._cur_reference.medline_id = content
1577
1578    def pubmed_id(self, content):
1579        self._cur_reference.pubmed_id = content
1580
1581    def remark(self, content):
1582        self._cur_reference.remark = content
1583
1584    def comment(self, content):
1585        self.data.comment += "\n".join(content)
1586
1587    def structured_comment(self, content):
1588        self.data.structured_comment = content
1589
1590    def primary_ref_line(self, content):
1591        """Save reference data for the PRIMARY line."""
1592        self.data.primary.append(content)
1593
1594    def primary(self, content):
1595        pass
1596
1597    def features_line(self, content):
1598        """Get ready for the feature table when we reach the FEATURE line."""
1599        self.start_feature_table()
1600
1601    def start_feature_table(self):
1602        """Signal the start of the feature table."""
1603        # we need to add on the last reference
1604        if self._cur_reference is not None:
1605            self.data.references.append(self._cur_reference)
1606
1607    def feature_key(self, content):
1608        """Grab the key of the feature and signal the start of a new feature."""
1609        # first add on feature information if we've got any
1610        self._add_feature()
1611
1612        from . import Record
1613
1614        self._cur_feature = Record.Feature()
1615        self._cur_feature.key = content
1616
1617    def _add_feature(self):
1618        """Add a feature to the record, with relevant checks (PRIVATE).
1619
1620        This does all of the appropriate checking to make sure we haven't
1621        left any info behind, and that we are only adding info if it
1622        exists.
1623        """
1624        if self._cur_feature is not None:
1625            # if we have a left over qualifier, add it to the qualifiers
1626            # on the current feature
1627            if self._cur_qualifier is not None:
1628                self._cur_feature.qualifiers.append(self._cur_qualifier)
1629
1630            self._cur_qualifier = None
1631            self.data.features.append(self._cur_feature)
1632
1633    def location(self, content):
1634        self._cur_feature.location = self._clean_location(content)
1635
1636    def feature_qualifier(self, key, value):
1637        self.feature_qualifier_name([key])
1638        if value is not None:
1639            self.feature_qualifier_description(value)
1640
1641    def feature_qualifier_name(self, content_list):
1642        """Deal with qualifier names.
1643
1644        We receive a list of keys, since you can have valueless keys such as
1645        /pseudo which would be passed in with the next key (since no other
1646        tags separate them in the file)
1647        """
1648        from . import Record
1649
1650        for content in content_list:
1651            # the record parser keeps the /s -- add them if we don't have 'em
1652            if not content.startswith("/"):
1653                content = "/%s" % content
1654            # add on a qualifier if we've got one
1655            if self._cur_qualifier is not None:
1656                self._cur_feature.qualifiers.append(self._cur_qualifier)
1657
1658            self._cur_qualifier = Record.Qualifier()
1659            self._cur_qualifier.key = content
1660
1661    def feature_qualifier_description(self, content):
1662        # if we have info then the qualifier key should have a ='s
1663        if "=" not in self._cur_qualifier.key:
1664            self._cur_qualifier.key = "%s=" % self._cur_qualifier.key
1665        cur_content = self._remove_newlines(content)
1666        # remove all spaces from the value if it is a type where spaces
1667        # are not important
1668        for remove_space_key in self.__class__.remove_space_keys:
1669            if remove_space_key in self._cur_qualifier.key:
1670                cur_content = self._remove_spaces(cur_content)
1671        self._cur_qualifier.value = self._normalize_spaces(cur_content)
1672
1673    def base_count(self, content):
1674        self.data.base_counts = content
1675
1676    def origin_name(self, content):
1677        self.data.origin = content
1678
1679    def contig_location(self, content):
1680        """Signal that we have contig information to add to the record."""
1681        self.data.contig = self._clean_location(content)
1682
1683    def sequence(self, content):
1684        """Add sequence information to a list of sequence strings.
1685
1686        This removes spaces in the data and uppercases the sequence, and
1687        then adds it to a list of sequences. Later on we'll join this
1688        list together to make the final sequence. This is faster than
1689        adding on the new string every time.
1690        """
1691        assert " " not in content
1692        self._seq_data.append(content.upper())
1693
1694    def record_end(self, content):
1695        """Signal the end of the record and do any necessary clean-up."""
1696        # add together all of the sequence parts to create the
1697        # final sequence string
1698        self.data.sequence = "".join(self._seq_data)
1699        # add on the last feature
1700        self._add_feature()
1701
1702
1703def parse(handle):
1704    """Iterate over GenBank formatted entries as Record objects.
1705
1706    >>> from Bio import GenBank
1707    >>> with open("GenBank/NC_000932.gb") as handle:
1708    ...     for record in GenBank.parse(handle):
1709    ...         print(record.accession)
1710    ['NC_000932']
1711
1712    To get SeqRecord objects use Bio.SeqIO.parse(..., format="gb")
1713    instead.
1714    """
1715    return iter(Iterator(handle, RecordParser()))
1716
1717
1718def read(handle):
1719    """Read a handle containing a single GenBank entry as a Record object.
1720
1721    >>> from Bio import GenBank
1722    >>> with open("GenBank/NC_000932.gb") as handle:
1723    ...     record = GenBank.read(handle)
1724    ...     print(record.accession)
1725    ['NC_000932']
1726
1727    To get a SeqRecord object use Bio.SeqIO.read(..., format="gb")
1728    instead.
1729    """
1730    iterator = parse(handle)
1731    try:
1732        record = next(iterator)
1733    except StopIteration:
1734        raise ValueError("No records found in handle") from None
1735    try:
1736        next(iterator)
1737        raise ValueError("More than one record found in handle")
1738    except StopIteration:
1739        pass
1740    return record
1741
1742
1743if __name__ == "__main__":
1744    from Bio._utils import run_doctest
1745
1746    run_doctest()
1747