1#!/usr/bin/env python 2# 3# Restriction Analysis Libraries. 4# Copyright (C) 2004. Frederic Sohm. 5# 6# This code is part of the Biopython distribution and governed by its 7# license. Please see the LICENSE file that should have been included 8# as part of this package. 9# 10 11"""Restriction Enzyme classes. 12 13Notes about the diverses class of the restriction enzyme implementation:: 14 15 RestrictionType is the type of all restriction enzymes. 16 ----------------------------------------------------------------------- 17 AbstractCut implements some methods that are common to all enzymes. 18 ----------------------------------------------------------------------- 19 NoCut, OneCut,TwoCuts represent the number of double strand cuts 20 produced by the enzyme. 21 they correspond to the 4th field of the 22 rebase record emboss_e.NNN. 23 0->NoCut : the enzyme is not characterised. 24 2->OneCut : the enzyme produce one double strand cut. 25 4->TwoCuts : two double strand cuts. 26 ----------------------------------------------------------------------- 27 Meth_Dep, Meth_Undep represent the methylation susceptibility to 28 the enzyme. 29 Not implemented yet. 30 ----------------------------------------------------------------------- 31 Palindromic, if the site is palindromic or not. 32 NotPalindromic allow some optimisations of the code. 33 No need to check the reverse strand 34 with palindromic sites. 35 ----------------------------------------------------------------------- 36 Unknown, Blunt, represent the overhang. 37 Ov5, Ov3 Unknown is here for symmetry reasons and 38 correspond to enzymes that are not 39 characterised in rebase. 40 ----------------------------------------------------------------------- 41 Defined, Ambiguous, represent the sequence of the overhang. 42 NotDefined 43 NotDefined is for enzymes not characterised 44 in rebase. 45 46 Defined correspond to enzymes that display 47 a constant overhang whatever the sequence. 48 ex : EcoRI. G^AATTC -> overhang :AATT 49 CTTAA^G 50 51 Ambiguous : the overhang varies with the 52 sequence restricted. 53 Typically enzymes which cut outside their 54 restriction site or (but not always) 55 inside an ambiguous site. 56 ex: 57 AcuI CTGAAG(22/20) -> overhang : NN 58 AasI GACNNN^NNNGTC -> overhang : NN 59 CTGN^NNNNNCAG 60 61 note : these 3 classes refers to the overhang not the site. 62 So the enzyme ApoI (RAATTY) is defined even if its 63 restriction site is ambiguous. 64 65 ApoI R^AATTY -> overhang : AATT -> Defined 66 YTTAA^R 67 Accordingly, blunt enzymes are always Defined even 68 when they cut outside their restriction site. 69 ----------------------------------------------------------------------- 70 Not_available, as found in rebase file emboss_r.NNN files. 71 Commercially_available 72 allow the selection of the enzymes 73 according to their suppliers to reduce the 74 quantity of results. 75 Also will allow the implementation of 76 buffer compatibility tables. Not 77 implemented yet. 78 79 the list of suppliers is extracted from 80 emboss_s.NNN 81 ----------------------------------------------------------------------- 82 83""" 84 85 86import warnings 87 88import re 89import itertools 90 91from Bio.Seq import Seq, MutableSeq 92from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict 93from Bio.Restriction.Restriction_Dictionary import typedict 94from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict 95from Bio.Restriction.PrintFormat import PrintFormat 96from Bio import BiopythonWarning 97 98 99# Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this 100# namespace), but have deprecated that module. 101 102 103def _check_bases(seq_string): 104 """Check characters in a string (PRIVATE). 105 106 Remove digits and white space present in string. Allows any valid ambiguous 107 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, lower case are converted). 108 109 Other characters (e.g. symbols) trigger a TypeError. 110 111 Returns the string WITH A LEADING SPACE (!). This is for backwards 112 compatibility, and may in part be explained by the fact that 113 ``Bio.Restriction`` doesn't use zero based counting. 114 """ 115 # Remove white space and make upper case: 116 seq_string = "".join(seq_string.split()).upper() 117 # Remove digits 118 for c in "0123456789": 119 seq_string = seq_string.replace(c, "") 120 # Check only allowed IUPAC letters 121 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): 122 raise TypeError("Invalid character found in %r" % seq_string) 123 return " " + seq_string 124 125 126matching = { 127 "A": "ARWMHVDN", 128 "C": "CYSMHBVN", 129 "G": "GRSKBVDN", 130 "T": "TYWKHBDN", 131 "R": "ABDGHKMNSRWV", 132 "Y": "CBDHKMNSTWVY", 133 "W": "ABDHKMNRTWVY", 134 "S": "CBDGHKMNSRVY", 135 "M": "ACBDHMNSRWVY", 136 "K": "BDGHKNSRTWVY", 137 "H": "ACBDHKMNSRTWVY", 138 "B": "CBDGHKMNSRTWVY", 139 "V": "ACBDGHKMNSRWVY", 140 "D": "ABDGHKMNSRTWVY", 141 "N": "ACBDGHKMNSRTWVY", 142} 143 144DNA = Seq 145 146 147class FormattedSeq: 148 """A linear or circular sequence object for restriction analysis. 149 150 Translates a Bio.Seq into a formatted sequence to be used with Restriction. 151 152 Roughly: remove anything which is not IUPAC alphabet and then add a space 153 in front of the sequence to get a biological index instead of a 154 python index (i.e. index of the first base is 1 not 0). 155 156 Retains information about the shape of the molecule linear (default) or 157 circular. Restriction sites are search over the edges of circular sequence. 158 """ 159 160 def __init__(self, seq, linear=True): 161 """Initialize ``FormattedSeq`` with sequence and topology (optional). 162 163 ``seq`` is either a ``Bio.Seq``, ``Bio.MutableSeq`` or a 164 ``FormattedSeq``. If ``seq`` is a ``FormattedSeq``, ``linear`` 165 will have no effect on the shape of the sequence. 166 """ 167 if isinstance(seq, (Seq, MutableSeq)): 168 stringy = str(seq) 169 self.lower = stringy.islower() 170 # Note this adds a leading space to the sequence (!) 171 self.data = _check_bases(stringy) 172 self.linear = linear 173 self.klass = seq.__class__ 174 elif isinstance(seq, FormattedSeq): 175 self.lower = seq.lower 176 self.data = seq.data 177 self.linear = seq.linear 178 self.klass = seq.klass 179 else: 180 raise TypeError("expected Seq or MutableSeq, got %s" % type(seq)) 181 182 def __len__(self): 183 """Return length of ``FormattedSeq``. 184 185 ``FormattedSeq`` has a leading space, thus subtract 1. 186 """ 187 return len(self.data) - 1 188 189 def __repr__(self): 190 """Represent ``FormattedSeq`` class as a string.""" 191 return "FormattedSeq(%r, linear=%r)" % (self[1:], self.linear) 192 193 def __eq__(self, other): 194 """Implement equality operator for ``FormattedSeq`` object.""" 195 if isinstance(other, FormattedSeq): 196 if repr(self) == repr(other): 197 return True 198 else: 199 return False 200 return False 201 202 def circularise(self): 203 """Circularise sequence in place.""" 204 self.linear = False 205 206 def linearise(self): 207 """Linearise sequence in place.""" 208 self.linear = True 209 210 def to_linear(self): 211 """Make a new instance of sequence as linear.""" 212 new = self.__class__(self) 213 new.linear = True 214 return new 215 216 def to_circular(self): 217 """Make a new instance of sequence as circular.""" 218 new = self.__class__(self) 219 new.linear = False 220 return new 221 222 def is_linear(self): 223 """Return if sequence is linear (True) or circular (False).""" 224 return self.linear 225 226 def finditer(self, pattern, size): 227 """Return a list of a given pattern which occurs in the sequence. 228 229 The list is made of tuple (location, pattern.group). 230 The latter is used with non palindromic sites. 231 Pattern is the regular expression pattern corresponding to the 232 enzyme restriction site. 233 Size is the size of the restriction enzyme recognition-site size. 234 """ 235 if self.is_linear(): 236 data = self.data 237 else: 238 data = self.data + self.data[1:size] 239 return [(i.start(), i.group) for i in re.finditer(pattern, data)] 240 241 def __getitem__(self, i): 242 """Return substring of ``FormattedSeq``. 243 244 The class of the returned object is the class of the respective 245 sequence. Note that due to the leading space, indexing is 1-based: 246 247 >>> from Bio.Seq import Seq 248 >>> from Bio.Restriction.Restriction import FormattedSeq 249 >>> f_seq = FormattedSeq(Seq('ATGCATGC')) 250 >>> f_seq[1] 251 Seq('A') 252 253 """ 254 if self.lower: 255 return self.klass(self.data[i].lower()) 256 return self.klass(self.data[i]) 257 258 259class RestrictionType(type): 260 """RestrictionType. Type from which all enzyme classes are derived. 261 262 Implement the operator methods. 263 """ 264 265 def __init__(cls, name="", bases=(), dct=None): 266 """Initialize RestrictionType instance. 267 268 Not intended to be used in normal operation. The enzymes are 269 instantiated when importing the module. 270 See below. 271 """ 272 if "-" in name: 273 raise ValueError("Problem with hyphen in %r as enzyme name" % name) 274 # 2011/11/26 - Nobody knows what this call was supposed to accomplish, 275 # but all unit tests seem to pass without it. 276 # super().__init__(cls, name, bases, dct) 277 try: 278 cls.compsite = re.compile(cls.compsite) 279 except AttributeError: 280 # Can happen if initialised wrongly. 281 # (This was seen when Sphinx api-doc imports the classes, and 282 # tried to automatically general documentation for them) 283 pass 284 except Exception: 285 raise ValueError( 286 "Problem with regular expression, re.compiled(%r)" % cls.compsite 287 ) from None 288 289 def __add__(cls, other): 290 """Add restriction enzyme to a RestrictionBatch(). 291 292 If other is an enzyme returns a batch of the two enzymes. 293 If other is already a RestrictionBatch add enzyme to it. 294 """ 295 if isinstance(other, RestrictionType): 296 return RestrictionBatch([cls, other]) 297 elif isinstance(other, RestrictionBatch): 298 return other.add_nocheck(cls) 299 else: 300 raise TypeError 301 302 def __truediv__(cls, other): 303 """Override '/' operator to use as search method. 304 305 >>> from Bio.Restriction import EcoRI 306 >>> EcoRI/Seq('GAATTC') 307 [2] 308 309 Returns RE.search(other). 310 """ 311 return cls.search(other) 312 313 def __rtruediv__(cls, other): 314 """Override division with reversed operands to use as search method. 315 316 >>> from Bio.Restriction import EcoRI 317 >>> Seq('GAATTC')/EcoRI 318 [2] 319 320 Returns RE.search(other). 321 """ 322 return cls.search(other) 323 324 def __floordiv__(cls, other): 325 """Override '//' operator to use as catalyse method. 326 327 >>> from Bio.Restriction import EcoRI 328 >>> EcoRI//Seq('GAATTC') 329 (Seq('G'), Seq('AATTC')) 330 331 Returns RE.catalyse(other). 332 """ 333 return cls.catalyse(other) 334 335 def __rfloordiv__(cls, other): 336 """As __floordiv__, with reversed operands. 337 338 >>> from Bio.Restriction import EcoRI 339 >>> Seq('GAATTC')//EcoRI 340 (Seq('G'), Seq('AATTC')) 341 342 Returns RE.catalyse(other). 343 """ 344 return cls.catalyse(other) 345 346 def __str__(cls): 347 """Return the name of the enzyme as string.""" 348 return cls.__name__ 349 350 def __repr__(cls): 351 """Implement repr method. 352 353 Used with eval or exec will instantiate the enzyme. 354 """ 355 return "%s" % cls.__name__ 356 357 def __len__(cls): 358 """Return length of recognition site of enzyme as int.""" 359 try: 360 return cls.size 361 except AttributeError: 362 # Happens if the instance was not initialised as expected. 363 # e.g. if instance created by a documentation framework 364 # like Sphinx trying to inspect the class automatically, 365 # Also seen within IPython. 366 return 0 367 368 def __hash__(cls): 369 """Implement ``hash()`` method for ``RestrictionType``. 370 371 Python default is to use ``id(...)`` 372 This is consistent with the ``__eq__`` implementation 373 """ 374 return id(cls) 375 376 def __eq__(cls, other): 377 """Override '==' operator. 378 379 True if RE and other are the same enzyme. 380 381 Specifically this checks they are the same Python object. 382 """ 383 # assert (id(cls)==id(other)) == (other is cls) == (cls is other) 384 return id(cls) == id(other) 385 386 def __ne__(cls, other): 387 """Override '!=' operator. 388 389 Isoschizomer strict (same recognition site, same restriction) -> False 390 All the other-> True 391 392 WARNING - This is not the inverse of the __eq__ method 393 394 >>> from Bio.Restriction import SacI, SstI 395 >>> SacI != SstI # true isoschizomers 396 False 397 >>> SacI == SstI 398 False 399 """ 400 if not isinstance(other, RestrictionType): 401 return True 402 elif cls.charac == other.charac: 403 return False 404 else: 405 return True 406 407 def __rshift__(cls, other): 408 """Override '>>' operator to test for neoschizomers. 409 410 neoschizomer : same recognition site, different restriction. -> True 411 all the others : -> False 412 413 >>> from Bio.Restriction import SmaI, XmaI 414 >>> SmaI >> XmaI 415 True 416 """ 417 if not isinstance(other, RestrictionType): 418 return False 419 elif cls.site == other.site and cls.charac != other.charac: 420 return True 421 else: 422 return False 423 424 def __mod__(cls, other): 425 """Override '%' operator to test for compatible overhangs. 426 427 True if a and b have compatible overhang. 428 429 >>> from Bio.Restriction import XhoI, SalI 430 >>> XhoI % SalI 431 True 432 """ 433 if not isinstance(other, RestrictionType): 434 raise TypeError("expected RestrictionType, got %s instead" % type(other)) 435 return cls._mod1(other) 436 437 def __ge__(cls, other): 438 """Compare length of recognition site of two enzymes. 439 440 Override '>='. a is greater or equal than b if the a site is longer 441 than b site. If their site have the same length sort by alphabetical 442 order of their names. 443 444 >>> from Bio.Restriction import EcoRI, EcoRV 445 >>> EcoRI.size 446 6 447 >>> EcoRV.size 448 6 449 >>> EcoRI >= EcoRV 450 False 451 """ 452 if not isinstance(other, RestrictionType): 453 raise NotImplementedError 454 if len(cls) > len(other): 455 return True 456 elif cls.size == len(other) and cls.__name__ >= other.__name__: 457 return True 458 else: 459 return False 460 461 def __gt__(cls, other): 462 """Compare length of recognition site of two enzymes. 463 464 Override '>'. Sorting order: 465 466 1. size of the recognition site. 467 2. if equal size, alphabetical order of the names. 468 469 """ 470 if not isinstance(other, RestrictionType): 471 raise NotImplementedError 472 if len(cls) > len(other): 473 return True 474 elif cls.size == len(other) and cls.__name__ > other.__name__: 475 return True 476 else: 477 return False 478 479 def __le__(cls, other): 480 """Compare length of recognition site of two enzymes. 481 482 Override '<='. Sorting order: 483 484 1. size of the recognition site. 485 2. if equal size, alphabetical order of the names. 486 487 """ 488 if not isinstance(other, RestrictionType): 489 raise NotImplementedError 490 elif len(cls) < len(other): 491 return True 492 elif len(cls) == len(other) and cls.__name__ <= other.__name__: 493 return True 494 else: 495 return False 496 497 def __lt__(cls, other): 498 """Compare length of recognition site of two enzymes. 499 500 Override '<'. Sorting order: 501 502 1. size of the recognition site. 503 2. if equal size, alphabetical order of the names. 504 505 """ 506 if not isinstance(other, RestrictionType): 507 raise NotImplementedError 508 elif len(cls) < len(other): 509 return True 510 elif len(cls) == len(other) and cls.__name__ < other.__name__: 511 return True 512 else: 513 return False 514 515 516class AbstractCut(RestrictionType): 517 """Implement the methods that are common to all restriction enzymes. 518 519 All the methods are classmethod. 520 521 For internal use only. Not meant to be instantiated. 522 """ 523 524 @classmethod 525 def search(cls, dna, linear=True): 526 """Return a list of cutting sites of the enzyme in the sequence. 527 528 Compensate for circular sequences and so on. 529 530 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 531 532 If linear is False, the restriction sites that span over the boundaries 533 will be included. 534 535 The positions are the first base of the 3' fragment, 536 i.e. the first base after the position the enzyme will cut. 537 """ 538 # 539 # Separating search from _search allow a (very limited) optimisation 540 # of the search when using a batch of restriction enzymes. 541 # in this case the DNA is tested once by the class which implements 542 # the batch instead of being tested by each enzyme single. 543 # see RestrictionBatch.search() for example. 544 # 545 if isinstance(dna, FormattedSeq): 546 cls.dna = dna 547 return cls._search() 548 else: 549 cls.dna = FormattedSeq(dna, linear) 550 return cls._search() 551 552 @classmethod 553 def all_suppliers(cls): 554 """Print all the suppliers of restriction enzyme.""" 555 supply = sorted(x[0] for x in suppliers_dict.values()) 556 print(",\n".join(supply)) 557 558 @classmethod 559 def is_equischizomer(cls, other): 560 """Test for real isoschizomer. 561 562 True if other is an isoschizomer of RE, but not an neoschizomer, 563 else False. 564 565 Equischizomer: same site, same position of restriction. 566 567 >>> from Bio.Restriction import SacI, SstI, SmaI, XmaI 568 >>> SacI.is_equischizomer(SstI) 569 True 570 >>> SmaI.is_equischizomer(XmaI) 571 False 572 573 """ 574 return not cls != other 575 576 @classmethod 577 def is_neoschizomer(cls, other): 578 """Test for neoschizomer. 579 580 True if other is an isoschizomer of RE, else False. 581 Neoschizomer: same site, different position of restriction. 582 """ 583 return cls >> other 584 585 @classmethod 586 def is_isoschizomer(cls, other): 587 """Test for same recognition site. 588 589 True if other has the same recognition site, else False. 590 591 Isoschizomer: same site. 592 593 >>> from Bio.Restriction import SacI, SstI, SmaI, XmaI 594 >>> SacI.is_isoschizomer(SstI) 595 True 596 >>> SmaI.is_isoschizomer(XmaI) 597 True 598 599 """ 600 return (not cls != other) or cls >> other 601 602 @classmethod 603 def equischizomers(cls, batch=None): 604 """List equischizomers of the enzyme. 605 606 Return a tuple of all the isoschizomers of RE. 607 If batch is supplied it is used instead of the default AllEnzymes. 608 609 Equischizomer: same site, same position of restriction. 610 """ 611 if not batch: 612 batch = AllEnzymes 613 r = [x for x in batch if not cls != x] 614 i = r.index(cls) 615 del r[i] 616 r.sort() 617 return r 618 619 @classmethod 620 def neoschizomers(cls, batch=None): 621 """List neoschizomers of the enzyme. 622 623 Return a tuple of all the neoschizomers of RE. 624 If batch is supplied it is used instead of the default AllEnzymes. 625 626 Neoschizomer: same site, different position of restriction. 627 """ 628 if not batch: 629 batch = AllEnzymes 630 r = sorted(x for x in batch if cls >> x) 631 return r 632 633 @classmethod 634 def isoschizomers(cls, batch=None): 635 """List all isoschizomers of the enzyme. 636 637 Return a tuple of all the equischizomers and neoschizomers of RE. 638 If batch is supplied it is used instead of the default AllEnzymes. 639 """ 640 if not batch: 641 batch = AllEnzymes 642 r = [x for x in batch if (cls >> x) or (not cls != x)] 643 i = r.index(cls) 644 del r[i] 645 r.sort() 646 return r 647 648 @classmethod 649 def frequency(cls): 650 """Return the theoretically cutting frequency of the enzyme. 651 652 Frequency of the site, given as 'one cut per x bases' (int). 653 """ 654 return cls.freq 655 656 657class NoCut(AbstractCut): 658 """Implement the methods specific to the enzymes that do not cut. 659 660 These enzymes are generally enzymes that have been only partially 661 characterised and the way they cut the DNA is unknow or enzymes for 662 which the pattern of cut is to complex to be recorded in Rebase 663 (ncuts values of 0 in emboss_e.###). 664 665 When using search() with these enzymes the values returned are at the start 666 of the restriction site. 667 668 Their catalyse() method returns a TypeError. 669 670 Unknown and NotDefined are also part of the base classes of these enzymes. 671 672 Internal use only. Not meant to be instantiated. 673 """ 674 675 @classmethod 676 def cut_once(cls): 677 """Return if the cutting pattern has one cut. 678 679 True if the enzyme cut the sequence one time on each strand. 680 """ 681 return False 682 683 @classmethod 684 def cut_twice(cls): 685 """Return if the cutting pattern has two cuts. 686 687 True if the enzyme cut the sequence twice on each strand. 688 """ 689 return False 690 691 @classmethod 692 def _modify(cls, location): 693 """Return a generator that moves the cutting position by 1 (PRIVATE). 694 695 For internal use only. 696 697 location is an integer corresponding to the location of the match for 698 the enzyme pattern in the sequence. 699 _modify returns the real place where the enzyme will cut. 700 701 Example:: 702 703 EcoRI pattern : GAATTC 704 EcoRI will cut after the G. 705 so in the sequence: 706 ______ 707 GAATACACGGAATTCGA 708 | 709 10 710 dna.finditer(GAATTC, 6) will return 10 as G is the 10th base 711 EcoRI cut after the G so: 712 EcoRI._modify(10) -> 11. 713 714 If the enzyme cut twice _modify will returns two integer corresponding 715 to each cutting site. 716 """ 717 yield location 718 719 @classmethod 720 def _rev_modify(cls, location): 721 """Return a generator that moves the cutting position by 1 (PRIVATE). 722 723 For internal use only. 724 725 As _modify for site situated on the antiparallel strand when the 726 enzyme is not palindromic. 727 """ 728 yield location 729 730 @classmethod 731 def characteristic(cls): 732 """Return a list of the enzyme's characteristics as tuple. 733 734 the tuple contains the attributes: 735 736 - fst5 -> first 5' cut ((current strand) or None 737 - fst3 -> first 3' cut (complementary strand) or None 738 - scd5 -> second 5' cut (current strand) or None 739 - scd5 -> second 3' cut (complementary strand) or None 740 - site -> recognition site. 741 742 """ 743 return None, None, None, None, cls.site 744 745 746class OneCut(AbstractCut): 747 """Implement the methods for enzymes that cut the DNA only once. 748 749 Correspond to ncuts values of 2 in emboss_e.### 750 751 Internal use only. Not meant to be instantiated. 752 """ 753 754 @classmethod 755 def cut_once(cls): 756 """Return if the cutting pattern has one cut. 757 758 True if the enzyme cut the sequence one time on each strand. 759 """ 760 return True 761 762 @classmethod 763 def cut_twice(cls): 764 """Return if the cutting pattern has two cuts. 765 766 True if the enzyme cut the sequence twice on each strand. 767 """ 768 return False 769 770 @classmethod 771 def _modify(cls, location): 772 """Return a generator that moves the cutting position by 1 (PRIVATE). 773 774 For internal use only. 775 776 location is an integer corresponding to the location of the match for 777 the enzyme pattern in the sequence. 778 _modify returns the real place where the enzyme will cut. 779 780 Example:: 781 782 EcoRI pattern : GAATTC 783 EcoRI will cut after the G. 784 so in the sequence: 785 ______ 786 GAATACACGGAATTCGA 787 | 788 10 789 dna.finditer(GAATTC, 6) will return 10 as G is the 10th base 790 EcoRI cut after the G so: 791 EcoRI._modify(10) -> 11. 792 793 if the enzyme cut twice _modify will returns two integer corresponding 794 to each cutting site. 795 """ 796 yield location + cls.fst5 797 798 @classmethod 799 def _rev_modify(cls, location): 800 """Return a generator that moves the cutting position by 1 (PRIVATE). 801 802 For internal use only. 803 804 As _modify for site situated on the antiparallel strand when the 805 enzyme is not palindromic 806 """ 807 yield location - cls.fst3 808 809 @classmethod 810 def characteristic(cls): 811 """Return a list of the enzyme's characteristics as tuple. 812 813 The tuple contains the attributes: 814 815 - fst5 -> first 5' cut ((current strand) or None 816 - fst3 -> first 3' cut (complementary strand) or None 817 - scd5 -> second 5' cut (current strand) or None 818 - scd5 -> second 3' cut (complementary strand) or None 819 - site -> recognition site. 820 821 """ 822 return cls.fst5, cls.fst3, None, None, cls.site 823 824 825class TwoCuts(AbstractCut): 826 """Implement the methods for enzymes that cut the DNA twice. 827 828 Correspond to ncuts values of 4 in emboss_e.### 829 830 Internal use only. Not meant to be instantiated. 831 """ 832 833 @classmethod 834 def cut_once(cls): 835 """Return if the cutting pattern has one cut. 836 837 True if the enzyme cut the sequence one time on each strand. 838 """ 839 return False 840 841 @classmethod 842 def cut_twice(cls): 843 """Return if the cutting pattern has two cuts. 844 845 True if the enzyme cut the sequence twice on each strand. 846 """ 847 return True 848 849 @classmethod 850 def _modify(cls, location): 851 """Return a generator that moves the cutting position by 1 (PRIVATE). 852 853 For internal use only. 854 855 location is an integer corresponding to the location of the match for 856 the enzyme pattern in the sequence. 857 _modify returns the real place where the enzyme will cut. 858 859 example:: 860 861 EcoRI pattern : GAATTC 862 EcoRI will cut after the G. 863 so in the sequence: 864 ______ 865 GAATACACGGAATTCGA 866 | 867 10 868 dna.finditer(GAATTC, 6) will return 10 as G is the 10th base 869 EcoRI cut after the G so: 870 EcoRI._modify(10) -> 11. 871 872 if the enzyme cut twice _modify will returns two integer corresponding 873 to each cutting site. 874 """ 875 yield location + cls.fst5 876 yield location + cls.scd5 877 878 @classmethod 879 def _rev_modify(cls, location): 880 """Return a generator that moves the cutting position by 1 (PRIVATE). 881 882 for internal use only. 883 884 as _modify for site situated on the antiparallel strand when the 885 enzyme is not palindromic 886 """ 887 yield location - cls.fst3 888 yield location - cls.scd3 889 890 @classmethod 891 def characteristic(cls): 892 """Return a list of the enzyme's characteristics as tuple. 893 894 the tuple contains the attributes: 895 896 - fst5 -> first 5' cut ((current strand) or None 897 - fst3 -> first 3' cut (complementary strand) or None 898 - scd5 -> second 5' cut (current strand) or None 899 - scd5 -> second 3' cut (complementary strand) or None 900 - site -> recognition site. 901 902 """ 903 return cls.fst5, cls.fst3, cls.scd5, cls.scd3, cls.site 904 905 906class Meth_Dep(AbstractCut): 907 """Implement the information about methylation. 908 909 Enzymes of this class possess a site which is methylable. 910 """ 911 912 @classmethod 913 def is_methylable(cls): 914 """Return if recognition site can be methylated. 915 916 True if the recognition site is a methylable. 917 """ 918 return True 919 920 921class Meth_Undep(AbstractCut): 922 """Implement information about methylation sensitibility. 923 924 Enzymes of this class are not sensible to methylation. 925 """ 926 927 @classmethod 928 def is_methylable(cls): 929 """Return if recognition site can be methylated. 930 931 True if the recognition site is a methylable. 932 """ 933 return False 934 935 936class Palindromic(AbstractCut): 937 """Implement methods for enzymes with palindromic recognition sites. 938 939 palindromic means : the recognition site and its reverse complement are 940 identical. 941 Remarks : an enzyme with a site CGNNCG is palindromic even if some 942 of the sites that it will recognise are not. 943 for example here : CGAACG 944 945 Internal use only. Not meant to be instantiated. 946 """ 947 948 @classmethod 949 def _search(cls): 950 """Return a list of cutting sites of the enzyme in the sequence (PRIVATE). 951 952 For internal use only. 953 954 Implement the search method for palindromic enzymes. 955 """ 956 siteloc = cls.dna.finditer(cls.compsite, cls.size) 957 cls.results = [r for s, g in siteloc for r in cls._modify(s)] 958 if cls.results: 959 cls._drop() 960 return cls.results 961 962 @classmethod 963 def is_palindromic(cls): 964 """Return if the enzyme has a palindromic recoginition site.""" 965 return True 966 967 968class NonPalindromic(AbstractCut): 969 """Implement methods for enzymes with non-palindromic recognition sites. 970 971 Palindromic means : the recognition site and its reverse complement are 972 identical. 973 974 Internal use only. Not meant to be instantiated. 975 """ 976 977 @classmethod 978 def _search(cls): 979 """Return a list of cutting sites of the enzyme in the sequence (PRIVATE). 980 981 For internal use only. 982 983 Implement the search method for non palindromic enzymes. 984 """ 985 iterator = cls.dna.finditer(cls.compsite, cls.size) 986 cls.results = [] 987 modif = cls._modify 988 revmodif = cls._rev_modify 989 s = str(cls) 990 cls.on_minus = [] 991 992 for start, group in iterator: 993 if group(s): 994 cls.results += list(modif(start)) 995 else: 996 cls.on_minus += list(revmodif(start)) 997 cls.results += cls.on_minus 998 999 if cls.results: 1000 cls.results.sort() 1001 cls._drop() 1002 return cls.results 1003 1004 @classmethod 1005 def is_palindromic(cls): 1006 """Return if the enzyme has a palindromic recoginition site.""" 1007 return False 1008 1009 1010class Unknown(AbstractCut): 1011 """Implement methods for enzymes that produce unknown overhangs. 1012 1013 These enzymes are also NotDefined and NoCut. 1014 1015 Internal use only. Not meant to be instantiated. 1016 """ 1017 1018 @classmethod 1019 def catalyse(cls, dna, linear=True): 1020 """List the sequence fragments after cutting dna with enzyme. 1021 1022 Return a tuple of dna as will be produced by using RE to restrict the 1023 dna. 1024 1025 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 1026 1027 If linear is False, the sequence is considered to be circular and the 1028 output will be modified accordingly. 1029 """ 1030 raise NotImplementedError("%s restriction is unknown." % cls.__name__) 1031 1032 catalyze = catalyse 1033 1034 @classmethod 1035 def is_blunt(cls): 1036 """Return if the enzyme produces blunt ends. 1037 1038 True if the enzyme produces blunt end. 1039 1040 Related methods: 1041 1042 - RE.is_3overhang() 1043 - RE.is_5overhang() 1044 - RE.is_unknown() 1045 1046 """ 1047 return False 1048 1049 @classmethod 1050 def is_5overhang(cls): 1051 """Return if the enzymes produces 5' overhanging ends. 1052 1053 True if the enzyme produces 5' overhang sticky end. 1054 1055 Related methods: 1056 1057 - RE.is_3overhang() 1058 - RE.is_blunt() 1059 - RE.is_unknown() 1060 1061 """ 1062 return False 1063 1064 @classmethod 1065 def is_3overhang(cls): 1066 """Return if the enzyme produces 3' overhanging ends. 1067 1068 True if the enzyme produces 3' overhang sticky end. 1069 1070 Related methods: 1071 1072 - RE.is_5overhang() 1073 - RE.is_blunt() 1074 - RE.is_unknown() 1075 1076 """ 1077 return False 1078 1079 @classmethod 1080 def overhang(cls): 1081 """Return the type of the enzyme's overhang as string. 1082 1083 Can be "3' overhang", "5' overhang", "blunt", "unknown". 1084 """ 1085 return "unknown" 1086 1087 @classmethod 1088 def compatible_end(cls): 1089 """List all enzymes that produce compatible ends for the enzyme.""" 1090 return [] 1091 1092 @classmethod 1093 def _mod1(cls, other): 1094 """Test if other enzyme produces compatible ends for enzyme (PRIVATE). 1095 1096 For internal use only. 1097 1098 Test for the compatibility of restriction ending of RE and other. 1099 """ 1100 return False 1101 1102 1103class Blunt(AbstractCut): 1104 """Implement methods for enzymes that produce blunt ends. 1105 1106 The enzyme cuts the + strand and the - strand of the DNA at the same 1107 place. 1108 1109 Internal use only. Not meant to be instantiated. 1110 """ 1111 1112 @classmethod 1113 def catalyse(cls, dna, linear=True): 1114 """List the sequence fragments after cutting dna with enzyme. 1115 1116 Return a tuple of dna as will be produced by using RE to restrict the 1117 dna. 1118 1119 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 1120 1121 If linear is False, the sequence is considered to be circular and the 1122 output will be modified accordingly. 1123 """ 1124 r = cls.search(dna, linear) 1125 d = cls.dna 1126 if not r: 1127 return (d[1:],) 1128 fragments = [] 1129 length = len(r) - 1 1130 if d.is_linear(): 1131 # 1132 # START of the sequence to FIRST site. 1133 # 1134 fragments.append(d[1 : r[0]]) 1135 if length: 1136 # 1137 # if more than one site add them. 1138 # 1139 fragments += [d[r[x] : r[x + 1]] for x in range(length)] 1140 # 1141 # LAST site to END of the sequence. 1142 # 1143 fragments.append(d[r[-1] :]) 1144 else: 1145 # 1146 # circular : bridge LAST site to FIRST site. 1147 # 1148 fragments.append(d[r[-1] :] + d[1 : r[0]]) 1149 if not length: 1150 # 1151 # one site we finish here. 1152 # 1153 return tuple(fragments) 1154 # 1155 # add the others. 1156 # 1157 fragments += [d[r[x] : r[x + 1]] for x in range(length)] 1158 return tuple(fragments) 1159 1160 catalyze = catalyse 1161 1162 @classmethod 1163 def is_blunt(cls): 1164 """Return if the enzyme produces blunt ends. 1165 1166 True if the enzyme produces blunt end. 1167 1168 Related methods: 1169 1170 - RE.is_3overhang() 1171 - RE.is_5overhang() 1172 - RE.is_unknown() 1173 1174 """ 1175 return True 1176 1177 @classmethod 1178 def is_5overhang(cls): 1179 """Return if the enzymes produces 5' overhanging ends. 1180 1181 True if the enzyme produces 5' overhang sticky end. 1182 1183 Related methods: 1184 1185 - RE.is_3overhang() 1186 - RE.is_blunt() 1187 - RE.is_unknown() 1188 1189 """ 1190 return False 1191 1192 @classmethod 1193 def is_3overhang(cls): 1194 """Return if the enzyme produces 3' overhanging ends. 1195 1196 True if the enzyme produces 3' overhang sticky end. 1197 1198 Related methods: 1199 1200 - RE.is_5overhang() 1201 - RE.is_blunt() 1202 - RE.is_unknown() 1203 1204 """ 1205 return False 1206 1207 @classmethod 1208 def overhang(cls): 1209 """Return the type of the enzyme's overhang as string. 1210 1211 Can be "3' overhang", "5' overhang", "blunt", "unknown". 1212 """ 1213 return "blunt" 1214 1215 @classmethod 1216 def compatible_end(cls, batch=None): 1217 """List all enzymes that produce compatible ends for the enzyme.""" 1218 if not batch: 1219 batch = AllEnzymes 1220 r = sorted(x for x in iter(AllEnzymes) if x.is_blunt()) 1221 return r 1222 1223 @staticmethod 1224 def _mod1(other): 1225 """Test if other enzyme produces compatible ends for enzyme (PRIVATE). 1226 1227 For internal use only 1228 1229 Test for the compatibility of restriction ending of RE and other. 1230 """ 1231 return issubclass(other, Blunt) 1232 1233 1234class Ov5(AbstractCut): 1235 """Implement methods for enzymes that produce 5' overhanging ends. 1236 1237 The enzyme cuts the + strand after the - strand of the DNA. 1238 1239 Internal use only. Not meant to be instantiated. 1240 """ 1241 1242 @classmethod 1243 def catalyse(cls, dna, linear=True): 1244 """List the sequence fragments after cutting dna with enzyme. 1245 1246 Return a tuple of dna as will be produced by using RE to restrict the 1247 dna. 1248 1249 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 1250 1251 If linear is False, the sequence is considered to be circular and the 1252 output will be modified accordingly. 1253 """ 1254 r = cls.search(dna, linear) 1255 d = cls.dna 1256 if not r: 1257 return (d[1:],) 1258 length = len(r) - 1 1259 fragments = [] 1260 if d.is_linear(): 1261 # 1262 # START of the sequence to FIRST site. 1263 # 1264 fragments.append(d[1 : r[0]]) 1265 if length: 1266 # 1267 # if more than one site add them. 1268 # 1269 fragments += [d[r[x] : r[x + 1]] for x in range(length)] 1270 # 1271 # LAST site to END of the sequence. 1272 # 1273 fragments.append(d[r[-1] :]) 1274 else: 1275 # 1276 # circular : bridge LAST site to FIRST site. 1277 # 1278 fragments.append(d[r[-1] :] + d[1 : r[0]]) 1279 if not length: 1280 # 1281 # one site we finish here. 1282 # 1283 return tuple(fragments) 1284 # 1285 # add the others. 1286 # 1287 fragments += [d[r[x] : r[x + 1]] for x in range(length)] 1288 return tuple(fragments) 1289 1290 catalyze = catalyse 1291 1292 @classmethod 1293 def is_blunt(cls): 1294 """Return if the enzyme produces blunt ends. 1295 1296 True if the enzyme produces blunt end. 1297 1298 Related methods: 1299 1300 - RE.is_3overhang() 1301 - RE.is_5overhang() 1302 - RE.is_unknown() 1303 1304 """ 1305 return False 1306 1307 @classmethod 1308 def is_5overhang(cls): 1309 """Return if the enzymes produces 5' overhanging ends. 1310 1311 True if the enzyme produces 5' overhang sticky end. 1312 1313 Related methods: 1314 1315 - RE.is_3overhang() 1316 - RE.is_blunt() 1317 - RE.is_unknown() 1318 1319 """ 1320 return True 1321 1322 @classmethod 1323 def is_3overhang(cls): 1324 """Return if the enzyme produces 3' overhanging ends. 1325 1326 True if the enzyme produces 3' overhang sticky end. 1327 1328 Related methods: 1329 1330 - RE.is_5overhang() 1331 - RE.is_blunt() 1332 - RE.is_unknown() 1333 1334 """ 1335 return False 1336 1337 @classmethod 1338 def overhang(cls): 1339 """Return the type of the enzyme's overhang as string. 1340 1341 Can be "3' overhang", "5' overhang", "blunt", "unknown". 1342 """ 1343 return "5' overhang" 1344 1345 @classmethod 1346 def compatible_end(cls, batch=None): 1347 """List all enzymes that produce compatible ends for the enzyme.""" 1348 if not batch: 1349 batch = AllEnzymes 1350 r = sorted(x for x in iter(AllEnzymes) if x.is_5overhang() and x % cls) 1351 return r 1352 1353 @classmethod 1354 def _mod1(cls, other): 1355 """Test if other enzyme produces compatible ends for enzyme (PRIVATE). 1356 1357 For internal use only. 1358 1359 Test for the compatibility of restriction ending of RE and other. 1360 """ 1361 if issubclass(other, Ov5): 1362 return cls._mod2(other) 1363 else: 1364 return False 1365 1366 1367class Ov3(AbstractCut): 1368 """Implement methods for enzymes that produce 3' overhanging ends. 1369 1370 The enzyme cuts the - strand after the + strand of the DNA. 1371 1372 Internal use only. Not meant to be instantiated. 1373 """ 1374 1375 @classmethod 1376 def catalyse(cls, dna, linear=True): 1377 """List the sequence fragments after cutting dna with enzyme. 1378 1379 Return a tuple of dna as will be produced by using RE to restrict the 1380 dna. 1381 1382 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 1383 1384 If linear is False, the sequence is considered to be circular and the 1385 output will be modified accordingly. 1386 """ 1387 r = cls.search(dna, linear) 1388 d = cls.dna 1389 if not r: 1390 return (d[1:],) 1391 fragments = [] 1392 length = len(r) - 1 1393 if d.is_linear(): 1394 # 1395 # START of the sequence to FIRST site. 1396 # 1397 fragments.append(d[1 : r[0]]) 1398 if length: 1399 # 1400 # if more than one site add them. 1401 # 1402 fragments += [d[r[x] : r[x + 1]] for x in range(length)] 1403 # 1404 # LAST site to END of the sequence. 1405 # 1406 fragments.append(d[r[-1] :]) 1407 else: 1408 # 1409 # circular : bridge LAST site to FIRST site. 1410 # 1411 fragments.append(d[r[-1] :] + d[1 : r[0]]) 1412 if not length: 1413 # 1414 # one site we finish here. 1415 # 1416 return tuple(fragments) 1417 # 1418 # add the others. 1419 # 1420 fragments += [d[r[x] : r[x + 1]] for x in range(length)] 1421 return tuple(fragments) 1422 1423 catalyze = catalyse 1424 1425 @classmethod 1426 def is_blunt(cls): 1427 """Return if the enzyme produces blunt ends. 1428 1429 True if the enzyme produces blunt end. 1430 1431 Related methods: 1432 1433 - RE.is_3overhang() 1434 - RE.is_5overhang() 1435 - RE.is_unknown() 1436 1437 """ 1438 return False 1439 1440 @classmethod 1441 def is_5overhang(cls): 1442 """Return if the enzymes produces 5' overhanging ends. 1443 1444 True if the enzyme produces 5' overhang sticky end. 1445 1446 Related methods: 1447 1448 - RE.is_3overhang() 1449 - RE.is_blunt() 1450 - RE.is_unknown() 1451 1452 """ 1453 return False 1454 1455 @classmethod 1456 def is_3overhang(cls): 1457 """Return if the enzyme produces 3' overhanging ends. 1458 1459 True if the enzyme produces 3' overhang sticky end. 1460 1461 Related methods: 1462 1463 - RE.is_5overhang() 1464 - RE.is_blunt() 1465 - RE.is_unknown() 1466 1467 """ 1468 return True 1469 1470 @classmethod 1471 def overhang(cls): 1472 """Return the type of the enzyme's overhang as string. 1473 1474 Can be "3' overhang", "5' overhang", "blunt", "unknown". 1475 """ 1476 return "3' overhang" 1477 1478 @classmethod 1479 def compatible_end(cls, batch=None): 1480 """List all enzymes that produce compatible ends for the enzyme.""" 1481 if not batch: 1482 batch = AllEnzymes 1483 r = sorted(x for x in iter(AllEnzymes) if x.is_3overhang() and x % cls) 1484 return r 1485 1486 @classmethod 1487 def _mod1(cls, other): 1488 """Test if other enzyme produces compatible ends for enzyme (PRIVATE). 1489 1490 For internal use only. 1491 1492 Test for the compatibility of restriction ending of RE and other. 1493 """ 1494 # 1495 # called by RE._mod1(other) when the one of the enzyme is ambiguous 1496 # 1497 if issubclass(other, Ov3): 1498 return cls._mod2(other) 1499 else: 1500 return False 1501 1502 1503class Defined(AbstractCut): 1504 """Implement methods for enzymes with defined recognition site and cut. 1505 1506 Typical example : EcoRI -> G^AATT_C 1507 The overhang will always be AATT 1508 Notes: 1509 Blunt enzymes are always defined. Even if their site is GGATCCNNN^_N 1510 Their overhang is always the same : blunt! 1511 1512 Internal use only. Not meant to be instantiated. 1513 """ 1514 1515 @classmethod 1516 def _drop(cls): 1517 """Remove cuts that are outsite of the sequence (PRIVATE). 1518 1519 For internal use only. 1520 1521 Drop the site that are situated outside the sequence in linear 1522 sequence. Modify the index for site in circular sequences. 1523 """ 1524 # 1525 # remove or modify the results that are outside the sequence. 1526 # This is necessary since after finding the site we add the distance 1527 # from the site to the cut with the _modify and _rev_modify methods. 1528 # For linear we will remove these sites altogether. 1529 # For circular sequence, we modify the result rather than _drop it 1530 # since the site is in the sequence. 1531 # 1532 length = len(cls.dna) 1533 drop = itertools.dropwhile 1534 take = itertools.takewhile 1535 if cls.dna.is_linear(): 1536 cls.results = list(drop(lambda x: x <= 1, cls.results)) 1537 cls.results = list(take(lambda x: x <= length, cls.results)) 1538 else: 1539 for index, location in enumerate(cls.results): 1540 if location < 1: 1541 cls.results[index] += length 1542 else: 1543 break 1544 for index, location in enumerate(cls.results[::-1]): 1545 if location > length: 1546 cls.results[-(index + 1)] -= length 1547 else: 1548 break 1549 1550 @classmethod 1551 def is_defined(cls): 1552 """Return if recognition sequence and cut are defined. 1553 1554 True if the sequence recognised and cut is constant, 1555 i.e. the recognition site is not degenerated AND the enzyme cut inside 1556 the site. 1557 1558 Related methods: 1559 1560 - RE.is_ambiguous() 1561 - RE.is_unknown() 1562 1563 """ 1564 return True 1565 1566 @classmethod 1567 def is_ambiguous(cls): 1568 """Return if recognition sequence and cut may be ambiguous. 1569 1570 True if the sequence recognised and cut is ambiguous, 1571 i.e. the recognition site is degenerated AND/OR the enzyme cut outside 1572 the site. 1573 1574 Related methods: 1575 1576 - RE.is_defined() 1577 - RE.is_unknown() 1578 1579 """ 1580 return False 1581 1582 @classmethod 1583 def is_unknown(cls): 1584 """Return if recognition sequence is unknown. 1585 1586 True if the sequence is unknown, 1587 i.e. the recognition site has not been characterised yet. 1588 1589 Related methods: 1590 1591 - RE.is_defined() 1592 - RE.is_ambiguous() 1593 1594 """ 1595 return False 1596 1597 @classmethod 1598 def elucidate(cls): 1599 """Return a string representing the recognition site and cuttings. 1600 1601 Return a representation of the site with the cut on the (+) strand 1602 represented as '^' and the cut on the (-) strand as '_'. 1603 ie: 1604 1605 >>> from Bio.Restriction import EcoRI, KpnI, EcoRV, SnaI 1606 >>> EcoRI.elucidate() # 5' overhang 1607 'G^AATT_C' 1608 >>> KpnI.elucidate() # 3' overhang 1609 'G_GTAC^C' 1610 >>> EcoRV.elucidate() # blunt 1611 'GAT^_ATC' 1612 >>> SnaI.elucidate() # NotDefined, cut profile unknown. 1613 '? GTATAC ?' 1614 >>> 1615 1616 """ 1617 f5 = cls.fst5 1618 f3 = cls.fst3 1619 site = cls.site 1620 if cls.cut_twice(): 1621 re = "cut twice, not yet implemented sorry." 1622 elif cls.is_5overhang(): 1623 if f5 == f3 == 0: 1624 re = "N^" + cls.site + "_N" 1625 elif f3 == 0: 1626 re = site[:f5] + "^" + site[f5:] + "_N" 1627 else: 1628 re = site[:f5] + "^" + site[f5:f3] + "_" + site[f3:] 1629 elif cls.is_blunt(): 1630 re = site[:f5] + "^_" + site[f5:] 1631 else: 1632 if f5 == f3 == 0: 1633 re = "N_" + site + "^N" 1634 else: 1635 re = site[:f3] + "_" + site[f3:f5] + "^" + site[f5:] 1636 return re 1637 1638 @classmethod 1639 def _mod2(cls, other): 1640 """Test if other enzyme produces compatible ends for enzyme (PRIVATE). 1641 1642 For internal use only. 1643 1644 Test for the compatibility of restriction ending of RE and other. 1645 """ 1646 # 1647 # called by RE._mod1(other) when the one of the enzyme is ambiguous 1648 # 1649 if other.ovhgseq == cls.ovhgseq: 1650 return True 1651 elif issubclass(other, Ambiguous): 1652 return other._mod2(cls) 1653 else: 1654 return False 1655 1656 1657class Ambiguous(AbstractCut): 1658 """Implement methods for enzymes that produce variable overhangs. 1659 1660 Typical example : BstXI -> CCAN_NNNN^NTGG 1661 The overhang can be any sequence of 4 bases. 1662 1663 Notes: 1664 Blunt enzymes are always defined. Even if their site is GGATCCNNN^_N 1665 Their overhang is always the same : blunt! 1666 1667 Internal use only. Not meant to be instantiated. 1668 1669 """ 1670 1671 @classmethod 1672 def _drop(cls): 1673 """Remove cuts that are outsite of the sequence (PRIVATE). 1674 1675 For internal use only. 1676 1677 Drop the site that are situated outside the sequence in linear 1678 sequence. Modify the index for site in circular sequences. 1679 """ 1680 length = len(cls.dna) 1681 drop = itertools.dropwhile 1682 take = itertools.takewhile 1683 if cls.dna.is_linear(): 1684 cls.results = list(drop(lambda x: x <= 1, cls.results)) 1685 cls.results = list(take(lambda x: x <= length, cls.results)) 1686 else: 1687 for index, location in enumerate(cls.results): 1688 if location < 1: 1689 cls.results[index] += length 1690 else: 1691 break 1692 for index, location in enumerate(cls.results[::-1]): 1693 if location > length: 1694 cls.results[-(index + 1)] -= length 1695 else: 1696 break 1697 1698 @classmethod 1699 def is_defined(cls): 1700 """Return if recognition sequence and cut are defined. 1701 1702 True if the sequence recognised and cut is constant, 1703 i.e. the recognition site is not degenerated AND the enzyme cut inside 1704 the site. 1705 1706 Related methods: 1707 1708 - RE.is_ambiguous() 1709 - RE.is_unknown() 1710 1711 """ 1712 return False 1713 1714 @classmethod 1715 def is_ambiguous(cls): 1716 """Return if recognition sequence and cut may be ambiguous. 1717 1718 True if the sequence recognised and cut is ambiguous, 1719 i.e. the recognition site is degenerated AND/OR the enzyme cut outside 1720 the site. 1721 1722 Related methods: 1723 1724 - RE.is_defined() 1725 - RE.is_unknown() 1726 1727 """ 1728 return True 1729 1730 @classmethod 1731 def is_unknown(cls): 1732 """Return if recognition sequence is unknown. 1733 1734 True if the sequence is unknown, 1735 i.e. the recognition site has not been characterised yet. 1736 1737 Related methods: 1738 1739 - RE.is_defined() 1740 - RE.is_ambiguous() 1741 1742 """ 1743 return False 1744 1745 @classmethod 1746 def _mod2(cls, other): 1747 """Test if other enzyme produces compatible ends for enzyme (PRIVATE). 1748 1749 For internal use only. 1750 1751 Test for the compatibility of restriction ending of RE and other. 1752 """ 1753 # 1754 # called by RE._mod1(other) when the one of the enzyme is ambiguous 1755 # 1756 if len(cls.ovhgseq) != len(other.ovhgseq): 1757 return False 1758 else: 1759 se = cls.ovhgseq 1760 for base in se: 1761 if base in "ATCG": 1762 pass 1763 if base in "N": 1764 se = ".".join(se.split("N")) 1765 if base in "RYWMSKHDBV": 1766 expand = "[" + matching[base] + "]" 1767 se = expand.join(se.split(base)) 1768 if re.match(se, other.ovhgseq): 1769 return True 1770 else: 1771 return False 1772 1773 @classmethod 1774 def elucidate(cls): 1775 """Return a string representing the recognition site and cuttings. 1776 1777 Return a representation of the site with the cut on the (+) strand 1778 represented as '^' and the cut on the (-) strand as '_'. 1779 ie: 1780 1781 >>> from Bio.Restriction import EcoRI, KpnI, EcoRV, SnaI 1782 >>> EcoRI.elucidate() # 5' overhang 1783 'G^AATT_C' 1784 >>> KpnI.elucidate() # 3' overhang 1785 'G_GTAC^C' 1786 >>> EcoRV.elucidate() # blunt 1787 'GAT^_ATC' 1788 >>> SnaI.elucidate() # NotDefined, cut profile unknown. 1789 '? GTATAC ?' 1790 >>> 1791 1792 """ 1793 f5 = cls.fst5 1794 f3 = cls.fst3 1795 length = len(cls) 1796 site = cls.site 1797 if cls.cut_twice(): 1798 re = "cut twice, not yet implemented sorry." 1799 elif cls.is_5overhang(): 1800 if f3 == f5 == 0: 1801 re = "N^" + site + "_N" 1802 elif 0 <= f5 <= length and 0 <= f3 + length <= length: 1803 re = site[:f5] + "^" + site[f5:f3] + "_" + site[f3:] 1804 elif 0 <= f5 <= length: 1805 re = site[:f5] + "^" + site[f5:] + f3 * "N" + "_N" 1806 elif 0 <= f3 + length <= length: 1807 re = "N^" + abs(f5) * "N" + site[:f3] + "_" + site[f3:] 1808 elif f3 + length < 0: 1809 re = "N^" * abs(f5) * "N" + "_" + abs(length + f3) * "N" + site 1810 elif f5 > length: 1811 re = site + (f5 - length) * "N" + "^" + (length + f3 - f5) * "N" + "_N" 1812 else: 1813 re = "N^" + abs(f5) * "N" + site + f3 * "N" + "_N" 1814 elif cls.is_blunt(): 1815 if f5 < 0: 1816 re = "N^_" + abs(f5) * "N" + site 1817 elif f5 > length: 1818 re = site + (f5 - length) * "N" + "^_N" 1819 else: 1820 raise ValueError("%s.easyrepr() : error f5=%i" % (cls.name, f5)) 1821 else: 1822 if f3 == 0: 1823 if f5 == 0: 1824 re = "N_" + site + "^N" 1825 else: 1826 re = site + "_" + (f5 - length) * "N" + "^N" 1827 elif 0 < f3 + length <= length and 0 <= f5 <= length: 1828 re = site[:f3] + "_" + site[f3:f5] + "^" + site[f5:] 1829 elif 0 < f3 + length <= length: 1830 re = site[:f3] + "_" + site[f3:] + (f5 - length) * "N" + "^N" 1831 elif 0 <= f5 <= length: 1832 re = "N_" + "N" * (f3 + length) + site[:f5] + "^" + site[f5:] 1833 elif f3 > 0: 1834 re = site + f3 * "N" + "_" + (f5 - f3 - length) * "N" + "^N" 1835 elif f5 < 0: 1836 re = "N_" + abs(f3 - f5 + length) * "N" + "^" + abs(f5) * "N" + site 1837 else: 1838 re = "N_" + abs(f3 + length) * "N" + site + (f5 - length) * "N" + "^N" 1839 return re 1840 1841 1842class NotDefined(AbstractCut): 1843 """Implement methods for enzymes with non-characterized overhangs. 1844 1845 Correspond to NoCut and Unknown. 1846 1847 Internal use only. Not meant to be instantiated. 1848 """ 1849 1850 @classmethod 1851 def _drop(cls): 1852 """Remove cuts that are outsite of the sequence (PRIVATE). 1853 1854 For internal use only. 1855 1856 Drop the site that are situated outside the sequence in linear 1857 sequence. Modify the index for site in circular sequences. 1858 """ 1859 if cls.dna.is_linear(): 1860 return 1861 else: 1862 length = len(cls.dna) 1863 for index, location in enumerate(cls.results): 1864 if location < 1: 1865 cls.results[index] += length 1866 else: 1867 break 1868 for index, location in enumerate(cls.results[:-1]): 1869 if location > length: 1870 cls.results[-(index + 1)] -= length 1871 else: 1872 break 1873 1874 @classmethod 1875 def is_defined(cls): 1876 """Return if recognition sequence and cut are defined. 1877 1878 True if the sequence recognised and cut is constant, 1879 i.e. the recognition site is not degenerated AND the enzyme cut inside 1880 the site. 1881 1882 Related methods: 1883 1884 - RE.is_ambiguous() 1885 - RE.is_unknown() 1886 1887 """ 1888 return False 1889 1890 @classmethod 1891 def is_ambiguous(cls): 1892 """Return if recognition sequence and cut may be ambiguous. 1893 1894 True if the sequence recognised and cut is ambiguous, 1895 i.e. the recognition site is degenerated AND/OR the enzyme cut outside 1896 the site. 1897 1898 Related methods: 1899 1900 - RE.is_defined() 1901 - RE.is_unknown() 1902 1903 """ 1904 return False 1905 1906 @classmethod 1907 def is_unknown(cls): 1908 """Return if recognition sequence is unknown. 1909 1910 True if the sequence is unknown, 1911 i.e. the recognition site has not been characterised yet. 1912 1913 Related methods: 1914 1915 - RE.is_defined() 1916 - RE.is_ambiguous() 1917 1918 """ 1919 return True 1920 1921 @classmethod 1922 def _mod2(cls, other): 1923 """Test if other enzyme produces compatible ends for enzyme (PRIVATE). 1924 1925 For internal use only. 1926 1927 Test for the compatibility of restriction ending of RE and other. 1928 """ 1929 # 1930 # Normally we should not arrive here. But well better safe than 1931 # sorry. 1932 # the overhang is not defined we are compatible with nobody. 1933 # could raise an Error may be rather than return quietly. 1934 # 1935 # return False 1936 raise ValueError( 1937 "%s.mod2(%s), %s : NotDefined. pas glop pas glop!" 1938 % (str(cls), str(other), str(cls)) 1939 ) 1940 1941 @classmethod 1942 def elucidate(cls): 1943 """Return a string representing the recognition site and cuttings. 1944 1945 Return a representation of the site with the cut on the (+) strand 1946 represented as '^' and the cut on the (-) strand as '_'. 1947 ie: 1948 1949 >>> from Bio.Restriction import EcoRI, KpnI, EcoRV, SnaI 1950 >>> EcoRI.elucidate() # 5' overhang 1951 'G^AATT_C' 1952 >>> KpnI.elucidate() # 3' overhang 1953 'G_GTAC^C' 1954 >>> EcoRV.elucidate() # blunt 1955 'GAT^_ATC' 1956 >>> SnaI.elucidate() # NotDefined, cut profile unknown. 1957 '? GTATAC ?' 1958 >>> 1959 1960 """ 1961 return "? %s ?" % cls.site 1962 1963 1964class Commercially_available(AbstractCut): 1965 """Implement methods for enzymes which are commercially available. 1966 1967 Internal use only. Not meant to be instantiated. 1968 """ 1969 1970 # 1971 # Recent addition to Rebase make this naming convention uncertain. 1972 # May be better to says enzymes which have a supplier. 1973 # 1974 1975 @classmethod 1976 def suppliers(cls): 1977 """Print a list of suppliers of the enzyme.""" 1978 for s in cls.suppl: 1979 print(suppliers_dict[s][0] + ",") 1980 1981 @classmethod 1982 def supplier_list(cls): 1983 """Return a list of suppliers of the enzyme.""" 1984 return [v[0] for k, v in suppliers_dict.items() if k in cls.suppl] 1985 1986 @classmethod 1987 def buffers(cls, supplier): 1988 """Return the recommended buffer of the supplier for this enzyme. 1989 1990 Not implemented yet. 1991 """ 1992 1993 @classmethod 1994 def is_comm(cls): 1995 """Return if enzyme is commercially available. 1996 1997 True if RE has suppliers. 1998 """ 1999 return True 2000 2001 2002class Not_available(AbstractCut): 2003 """Implement methods for enzymes which are not commercially available. 2004 2005 Internal use only. Not meant to be instantiated. 2006 """ 2007 2008 @staticmethod 2009 def suppliers(): 2010 """Print a list of suppliers of the enzyme.""" 2011 return None 2012 2013 @classmethod 2014 def supplier_list(cls): 2015 """Return a list of suppliers of the enzyme.""" 2016 return [] 2017 2018 @classmethod 2019 def buffers(cls, supplier): 2020 """Return the recommended buffer of the supplier for this enzyme. 2021 2022 Not implemented yet. 2023 """ 2024 raise TypeError("Enzyme not commercially available.") 2025 2026 @classmethod 2027 def is_comm(cls): 2028 """Return if enzyme is commercially available. 2029 2030 True if RE has suppliers. 2031 """ 2032 return False 2033 2034 2035############################################################################### 2036# # 2037# Restriction Batch # 2038# # 2039############################################################################### 2040 2041 2042class RestrictionBatch(set): 2043 """Class for operations on more than one enzyme.""" 2044 2045 def __init__(self, first=(), suppliers=()): 2046 """Initialize empty RB or pre-fill with enzymes (from supplier).""" 2047 first = [self.format(x) for x in first] 2048 first += [eval(x) for n in suppliers for x in suppliers_dict[n][1]] 2049 set.__init__(self, first) 2050 self.mapping = dict.fromkeys(self) 2051 self.already_mapped = None 2052 self.suppliers = [x for x in suppliers if x in suppliers_dict] 2053 2054 def __str__(self): 2055 """Return a readable representation of the ``RestrictionBatch``.""" 2056 if len(self) < 5: 2057 return "+".join(self.elements()) 2058 else: 2059 return "...".join( 2060 ("+".join(self.elements()[:2]), "+".join(self.elements()[-2:])) 2061 ) 2062 2063 def __repr__(self): 2064 """Represent ``RestrictionBatch`` class as a string for debugging.""" 2065 return "RestrictionBatch(%s)" % self.elements() 2066 2067 def __contains__(self, other): 2068 """Implement ``in`` for ``RestrictionBatch``.""" 2069 try: 2070 other = self.format(other) 2071 except ValueError: # other is not a restriction enzyme 2072 return False 2073 return set.__contains__(self, other) 2074 2075 def __div__(self, other): 2076 """Override '/' operator to use as search method.""" 2077 return self.search(other) 2078 2079 def __rdiv__(self, other): 2080 """Override division with reversed operands to use as search method.""" 2081 return self.search(other) 2082 2083 def __truediv__(self, other): 2084 """Override Python 3 division operator to use as search method. 2085 2086 Like __div__. 2087 """ 2088 return self.search(other) 2089 2090 def __rtruediv__(self, other): 2091 """As __truediv___, with reversed operands. 2092 2093 Like __rdiv__. 2094 """ 2095 return self.search(other) 2096 2097 def get(self, enzyme, add=False): 2098 """Check if enzyme is in batch and return it. 2099 2100 If add is True and enzyme is not in batch add enzyme to batch. 2101 If add is False (which is the default) only return enzyme. 2102 If enzyme is not a RestrictionType or can not be evaluated to 2103 a RestrictionType, raise a ValueError. 2104 """ 2105 e = self.format(enzyme) 2106 if e in self: 2107 return e 2108 elif add: 2109 self.add(e) 2110 return e 2111 else: 2112 raise ValueError("enzyme %s is not in RestrictionBatch" % e.__name__) 2113 2114 def lambdasplit(self, func): 2115 """Filter enzymes in batch with supplied function. 2116 2117 The new batch will contain only the enzymes for which 2118 func return True. 2119 """ 2120 d = list(filter(func, self)) 2121 new = RestrictionBatch() 2122 new._data = dict(zip(d, [True] * len(d))) 2123 return new 2124 2125 def add_supplier(self, letter): 2126 """Add all enzymes from a given supplier to batch. 2127 2128 letter represents the suppliers as defined in the dictionary 2129 RestrictionDictionary.suppliers 2130 Returns None. 2131 Raise a KeyError if letter is not a supplier code. 2132 """ 2133 supplier = suppliers_dict[letter] 2134 self.suppliers.append(letter) 2135 for x in supplier[1]: 2136 self.add_nocheck(eval(x)) 2137 2138 def current_suppliers(self): 2139 """List the current suppliers for the restriction batch. 2140 2141 Return a sorted list of the suppliers which have been used to 2142 create the batch. 2143 """ 2144 suppl_list = sorted(suppliers_dict[x][0] for x in self.suppliers) 2145 return suppl_list 2146 2147 def __iadd__(self, other): 2148 """Override '+=' for use with sets. 2149 2150 b += other -> add other to b, check the type of other. 2151 """ 2152 self.add(other) 2153 return self 2154 2155 def __add__(self, other): 2156 """Overide '+' for use with sets. 2157 2158 b + other -> new RestrictionBatch. 2159 """ 2160 new = self.__class__(self) 2161 new.add(other) 2162 return new 2163 2164 def remove(self, other): 2165 """Remove enzyme from restriction batch. 2166 2167 Safe set.remove method. Verify that other is a RestrictionType or can 2168 be evaluated to a RestrictionType. 2169 Raise a ValueError if other can not be evaluated to a RestrictionType. 2170 Raise a KeyError if other is not in B. 2171 """ 2172 return set.remove(self, self.format(other)) 2173 2174 def add(self, other): 2175 """Add a restriction enzyme to the restriction batch. 2176 2177 Safe set.add method. Verify that other is a RestrictionType or can be 2178 evaluated to a RestrictionType. 2179 Raise a ValueError if other can not be evaluated to a RestrictionType. 2180 """ 2181 return set.add(self, self.format(other)) 2182 2183 def add_nocheck(self, other): 2184 """Add restriction enzyme to batch without checking its type.""" 2185 return set.add(self, other) 2186 2187 def format(self, y): 2188 """Evaluate enzyme (name) and return it (as RestrictionType). 2189 2190 If y is a RestrictionType return y. 2191 If y can be evaluated to a RestrictionType return eval(y). 2192 Raise a ValueError in all other case. 2193 """ 2194 try: 2195 if isinstance(y, RestrictionType): 2196 return y 2197 elif isinstance(eval(str(y)), RestrictionType): 2198 return eval(y) 2199 except (NameError, SyntaxError): 2200 pass 2201 raise ValueError("%s is not a RestrictionType" % y.__class__) 2202 2203 def is_restriction(self, y): 2204 """Return if enzyme (name) is a known enzyme. 2205 2206 True if y or eval(y) is a RestrictionType. 2207 """ 2208 return isinstance(y, RestrictionType) or isinstance( 2209 eval(str(y)), RestrictionType 2210 ) 2211 2212 def split(self, *classes, **bool): 2213 """Extract enzymes of a certain class and put in new RestrictionBatch. 2214 2215 It works but it is slow, so it has really an interest when splitting 2216 over multiple conditions. 2217 """ 2218 2219 def splittest(element): 2220 for klass in classes: 2221 b = bool.get(klass.__name__, True) 2222 if issubclass(element, klass): 2223 if b: 2224 continue 2225 else: 2226 return False 2227 elif b: 2228 return False 2229 else: 2230 continue 2231 return True 2232 2233 d = list(filter(splittest, self)) 2234 new = RestrictionBatch() 2235 new._data = dict(zip(d, [True] * len(d))) 2236 return new 2237 2238 def elements(self): 2239 """List the enzymes of the RestrictionBatch as list of strings. 2240 2241 Give all the names of the enzymes in B sorted alphabetically. 2242 """ 2243 return sorted(str(e) for e in self) 2244 2245 def as_string(self): 2246 """List the names of the enzymes of the RestrictionBatch. 2247 2248 Return a list of the name of the elements of the batch. 2249 """ 2250 return [str(e) for e in self] 2251 2252 @classmethod 2253 def suppl_codes(cls): 2254 """Return a dicionary with supplier codes. 2255 2256 Letter code for the suppliers. 2257 """ 2258 supply = {k: v[0] for k, v in suppliers_dict.items()} 2259 return supply 2260 2261 @classmethod 2262 def show_codes(cls): 2263 """Print a list of supplier codes.""" 2264 supply = [" = ".join(i) for i in cls.suppl_codes().items()] 2265 print("\n".join(supply)) 2266 2267 def search(self, dna, linear=True): 2268 """Return a dic of cutting sites in the seq for the batch enzymes.""" 2269 # 2270 # here we replace the search method of the individual enzymes 2271 # with one unique testing method. 2272 # 2273 if not hasattr(self, "already_mapped"): 2274 # TODO - Why does this happen! 2275 # Try the "doctest" at the start of PrintFormat.py 2276 self.already_mapped = None 2277 if isinstance(dna, DNA): 2278 # For the searching, we just care about the sequence as a string, 2279 # if that is the same we can use the cached search results. 2280 # At the time of writing, Seq == method isn't implemented, 2281 # and therefore does object identity which is stricter. 2282 if (str(dna), linear) == self.already_mapped: 2283 return self.mapping 2284 else: 2285 self.already_mapped = str(dna), linear 2286 fseq = FormattedSeq(dna, linear) 2287 self.mapping = {x: x.search(fseq) for x in self} 2288 return self.mapping 2289 elif isinstance(dna, FormattedSeq): 2290 if (str(dna), dna.linear) == self.already_mapped: 2291 return self.mapping 2292 else: 2293 self.already_mapped = str(dna), dna.linear 2294 self.mapping = {x: x.search(dna) for x in self} 2295 return self.mapping 2296 raise TypeError( 2297 "Expected Seq or MutableSeq instance, got %s instead" % type(dna) 2298 ) 2299 2300 2301############################################################################### 2302# # 2303# Restriction Analysis # 2304# # 2305############################################################################### 2306 2307_empty_DNA = DNA("") 2308_restrictionbatch = RestrictionBatch() 2309 2310 2311class Analysis(RestrictionBatch, PrintFormat): 2312 """Provide methods for enhanced analysis and pretty printing.""" 2313 2314 def __init__( 2315 self, restrictionbatch=_restrictionbatch, sequence=_empty_DNA, linear=True 2316 ): 2317 """Initialize an Analysis with RestrictionBatch and sequence. 2318 2319 For most of the methods of this class if a dictionary is given it will 2320 be used as the base to calculate the results. 2321 If no dictionary is given a new analysis using the RestrictionBatch 2322 which has been given when the Analysis class has been instantiated, 2323 will be carried out and used. 2324 """ 2325 RestrictionBatch.__init__(self, restrictionbatch) 2326 self.rb = restrictionbatch 2327 self.sequence = sequence 2328 self.linear = linear 2329 if self.sequence: 2330 self.search(self.sequence, self.linear) 2331 2332 def __repr__(self): 2333 """Represent ``Analysis`` class as a string.""" 2334 return "Analysis(%r,%r,%s)" % (self.rb, self.sequence, self.linear) 2335 2336 def _sub_set(self, wanted): 2337 """Filter result for keys which are in wanted (PRIVATE). 2338 2339 Internal use only. Returns a dict. 2340 2341 Screen the results through wanted set. 2342 Keep only the results for which the enzymes is in wanted set. 2343 """ 2344 # It seems that this method is not used in the whole class! 2345 return {k: v for k, v in self.mapping.items() if k in wanted} 2346 2347 def _boundaries(self, start, end): 2348 """Set boundaries to correct values (PRIVATE). 2349 2350 Format the boundaries for use with the methods that limit the 2351 search to only part of the sequence given to analyse. 2352 """ 2353 if not isinstance(start, int): 2354 raise TypeError("expected int, got %s instead" % type(start)) 2355 if not isinstance(end, int): 2356 raise TypeError("expected int, got %s instead" % type(end)) 2357 if start < 1: # Looks like this tries to do python list like indexing 2358 start += len(self.sequence) 2359 if end < 1: 2360 end += len(self.sequence) 2361 if start < end: 2362 pass 2363 else: 2364 start, end = end, start 2365 if start < end: 2366 return start, end, self._test_normal 2367 2368 def _test_normal(self, start, end, site): 2369 """Test if site is between start and end (PRIVATE). 2370 2371 Internal use only 2372 """ 2373 return start <= site < end 2374 2375 def _test_reverse(self, start, end, site): 2376 """Test if site is between end and start, for circular sequences (PRIVATE). 2377 2378 Internal use only. 2379 """ 2380 return start <= site <= len(self.sequence) or 1 <= site < end 2381 2382 def format_output(self, dct=None, title="", s1=""): 2383 """Collect data and pass to PrintFormat. 2384 2385 If dct is not given the full dictionary is used. 2386 """ 2387 if not dct: 2388 dct = self.mapping 2389 return PrintFormat.format_output(self, dct, title, s1) 2390 2391 def print_that(self, dct=None, title="", s1=""): 2392 """Print the output of the analysis. 2393 2394 If dct is not given the full dictionary is used. 2395 s1: Title for non-cutting enzymes 2396 This method prints the output of A.format_output() and it is here 2397 for backwards compatibility. 2398 """ 2399 print(self.format_output(dct, title, s1)) 2400 2401 def change(self, **what): 2402 """Change parameters of print output. 2403 2404 It is possible to change the width of the shell by setting 2405 self.ConsoleWidth to what you want. 2406 self.NameWidth refer to the maximal length of the enzyme name. 2407 2408 Changing one of these parameters here might not give the results 2409 you expect. In which case, you can settle back to a 80 columns shell 2410 or try to change self.Cmodulo and self.PrefWidth in PrintFormat until 2411 you get it right. 2412 """ 2413 for k, v in what.items(): 2414 if k in ("NameWidth", "ConsoleWidth"): 2415 setattr(self, k, v) 2416 self.Cmodulo = self.ConsoleWidth % self.NameWidth 2417 self.PrefWidth = self.ConsoleWidth - self.Cmodulo 2418 elif k == "sequence": 2419 setattr(self, "sequence", v) 2420 self.search(self.sequence, self.linear) 2421 elif k == "rb": 2422 self = Analysis.__init__(self, v, self.sequence, self.linear) 2423 elif k == "linear": 2424 setattr(self, "linear", v) 2425 self.search(self.sequence, v) 2426 elif k in ("Indent", "Maxsize"): 2427 setattr(self, k, v) 2428 elif k in ("Cmodulo", "PrefWidth"): 2429 raise AttributeError( 2430 "To change %s, change NameWidth and/or ConsoleWidth" % k 2431 ) 2432 else: 2433 raise AttributeError("Analysis has no attribute %s" % k) 2434 2435 def full(self, linear=True): 2436 """Perform analysis with all enzymes of batch and return all results. 2437 2438 Full Restriction Map of the sequence, as a dictionary. 2439 """ 2440 return self.mapping 2441 2442 def blunt(self, dct=None): 2443 """Return only cuts that have blunt ends.""" 2444 if not dct: 2445 dct = self.mapping 2446 return {k: v for k, v in dct.items() if k.is_blunt()} 2447 2448 def overhang5(self, dct=None): 2449 """Return only cuts that have 5' overhangs.""" 2450 if not dct: 2451 dct = self.mapping 2452 return {k: v for k, v in dct.items() if k.is_5overhang()} 2453 2454 def overhang3(self, dct=None): 2455 """Return only cuts that have 3' overhangs.""" 2456 if not dct: 2457 dct = self.mapping 2458 return {k: v for k, v in dct.items() if k.is_3overhang()} 2459 2460 def defined(self, dct=None): 2461 """Return only results from enzymes that produce defined overhangs.""" 2462 if not dct: 2463 dct = self.mapping 2464 return {k: v for k, v in dct.items() if k.is_defined()} 2465 2466 def with_sites(self, dct=None): 2467 """Return only results from enzyme with at least one cut.""" 2468 if not dct: 2469 dct = self.mapping 2470 return {k: v for k, v in dct.items() if v} 2471 2472 def without_site(self, dct=None): 2473 """Return only results from enzymes that don't cut the sequence.""" 2474 if not dct: 2475 dct = self.mapping 2476 return {k: v for k, v in dct.items() if not v} 2477 2478 def with_N_sites(self, N, dct=None): 2479 """Return only results from enzymes that cut the sequence N times.""" 2480 if not dct: 2481 dct = self.mapping 2482 return {k: v for k, v in dct.items() if len(v) == N} 2483 2484 def with_number_list(self, list, dct=None): 2485 """Return only results from enzymes that cut (x,y,z,...) times.""" 2486 if not dct: 2487 dct = self.mapping 2488 return {k: v for k, v in dct.items() if len(v) in list} 2489 2490 def with_name(self, names, dct=None): 2491 """Return only results from enzymes which names are listed.""" 2492 for i, enzyme in enumerate(names): 2493 if enzyme not in AllEnzymes: 2494 warnings.warn("no data for the enzyme: %s" % enzyme, BiopythonWarning) 2495 del names[i] 2496 if not dct: 2497 return RestrictionBatch(names).search(self.sequence, self.linear) 2498 return {n: dct[n] for n in names if n in dct} 2499 2500 def with_site_size(self, site_size, dct=None): 2501 """Return only results form enzymes with a given site size.""" 2502 sites = [name for name in self if name.size == site_size] 2503 if not dct: 2504 return RestrictionBatch(sites).search(self.sequence) 2505 return {k: v for k, v in dct.items() if k in site_size} 2506 2507 def only_between(self, start, end, dct=None): 2508 """Return only results from enzymes that only cut within start, end.""" 2509 start, end, test = self._boundaries(start, end) 2510 if not dct: 2511 dct = self.mapping 2512 d = dict(dct) 2513 for key, sites in dct.items(): 2514 if not sites: 2515 del d[key] 2516 continue 2517 for site in sites: 2518 if test(start, end, site): 2519 continue 2520 else: 2521 del d[key] 2522 break 2523 return d 2524 2525 def between(self, start, end, dct=None): 2526 """Return only results from enzymes that cut at least within borders. 2527 2528 Enzymes that cut the sequence at least in between start and end. 2529 They may cut outside as well. 2530 """ 2531 start, end, test = self._boundaries(start, end) 2532 d = {} 2533 if not dct: 2534 dct = self.mapping 2535 for key, sites in dct.items(): 2536 for site in sites: 2537 if test(start, end, site): 2538 d[key] = sites 2539 break 2540 continue 2541 return d 2542 2543 def show_only_between(self, start, end, dct=None): 2544 """Return only results from within start, end. 2545 2546 Enzymes must cut inside start/end and may also cut outside. However, 2547 only the cutting positions within start/end will be returned. 2548 """ 2549 d = [] 2550 if start <= end: 2551 d = [ 2552 (k, [vv for vv in v if start <= vv <= end]) 2553 for k, v in self.between(start, end, dct).items() 2554 ] 2555 else: 2556 d = [ 2557 (k, [vv for vv in v if start <= vv or vv <= end]) 2558 for k, v in self.between(start, end, dct).items() 2559 ] 2560 return dict(d) 2561 2562 def only_outside(self, start, end, dct=None): 2563 """Return only results from enzymes that only cut outside start, end. 2564 2565 Enzymes that cut the sequence outside of the region 2566 in between start and end but do not cut inside. 2567 """ 2568 start, end, test = self._boundaries(start, end) 2569 if not dct: 2570 dct = self.mapping 2571 d = dict(dct) 2572 for key, sites in dct.items(): 2573 if not sites: 2574 del d[key] 2575 continue 2576 for site in sites: 2577 if test(start, end, site): 2578 del d[key] 2579 break 2580 else: 2581 continue 2582 return d 2583 2584 def outside(self, start, end, dct=None): 2585 """Return only results from enzymes that at least cut outside borders. 2586 2587 Enzymes that cut outside the region in between start and end. 2588 They may cut inside as well. 2589 """ 2590 start, end, test = self._boundaries(start, end) 2591 if not dct: 2592 dct = self.mapping 2593 d = {} 2594 for key, sites in dct.items(): 2595 for site in sites: 2596 if test(start, end, site): 2597 continue 2598 else: 2599 d[key] = sites 2600 break 2601 return d 2602 2603 def do_not_cut(self, start, end, dct=None): 2604 """Return only results from enzymes that don't cut between borders.""" 2605 if not dct: 2606 dct = self.mapping 2607 d = self.without_site() 2608 d.update(self.only_outside(start, end, dct)) 2609 return d 2610 2611 2612# 2613# The restriction enzyme classes are created dynamically when the module is 2614# imported. Here is the magic which allow the creation of the 2615# restriction-enzyme classes. 2616# 2617# The reason for the two dictionaries in Restriction_Dictionary 2618# one for the types (which will be called pseudo-type as they really 2619# correspond to the values that instances of RestrictionType can take) 2620# and one for the enzymes is efficiency as the bases are evaluated 2621# once per pseudo-type. 2622# 2623# However Restriction is still a very inefficient module at import. But 2624# remember that around 660 classes (which is more or less the size of Rebase) 2625# have to be created dynamically. However, this processing take place only 2626# once. 2627# This inefficiency is however largely compensated by the use of metaclass 2628# which provide a very efficient layout for the class themselves mostly 2629# alleviating the need of if/else loops in the class methods. 2630# 2631# It is essential to run Restriction with doc string optimisation (-OO 2632# switch) as the doc string of 660 classes take a lot of processing. 2633# 2634CommOnly = RestrictionBatch() # commercial enzymes 2635NonComm = RestrictionBatch() # not available commercially 2636for TYPE, (bases, enzymes) in typedict.items(): 2637 # 2638 # The keys are the pseudo-types TYPE (stored as type1, type2...) 2639 # The names are not important and are only present to differentiate 2640 # the keys in the dict. All the pseudo-types are in fact RestrictionType. 2641 # These names will not be used after and the pseudo-types are not 2642 # kept in the locals() dictionary. It is therefore impossible to 2643 # import them. 2644 # Now, if you have look at the dictionary, you will see that not all the 2645 # types are present as those without corresponding enzymes have been 2646 # removed by Dictionary_Builder(). 2647 # 2648 # The values are tuples which contain 2649 # as first element a tuple of bases (as string) and 2650 # as second element the names of the enzymes. 2651 # 2652 # First eval the bases. 2653 # 2654 bases = tuple(eval(x) for x in bases) 2655 # 2656 # now create the particular value of RestrictionType for the classes 2657 # in enzymes. 2658 # 2659 T = type.__new__(RestrictionType, "RestrictionType", bases, {}) 2660 for k in enzymes: 2661 # 2662 # Now, we go through all the enzymes and assign them their type. 2663 # enzymedict[k] contains the values of the attributes for this 2664 # particular class (self.site, self.ovhg,....). 2665 # 2666 newenz = T(k, bases, enzymedict[k]) 2667 # 2668 # we add the enzymes to the corresponding batch. 2669 # 2670 # No need to verify the enzyme is a RestrictionType -> add_nocheck 2671 # 2672 if newenz.is_comm(): 2673 CommOnly.add_nocheck(newenz) 2674 else: 2675 NonComm.add_nocheck(newenz) 2676# 2677# AllEnzymes is a RestrictionBatch with all the enzymes from Rebase. 2678# 2679AllEnzymes = RestrictionBatch(CommOnly) 2680AllEnzymes.update(NonComm) 2681# 2682# Now, place the enzymes in locals so they can be imported. 2683# 2684names = [str(x) for x in AllEnzymes] 2685locals().update(dict(zip(names, AllEnzymes))) 2686__all__ = ( 2687 "FormattedSeq", 2688 "Analysis", 2689 "RestrictionBatch", 2690 "AllEnzymes", 2691 "CommOnly", 2692 "NonComm", 2693) + tuple(names) 2694del k, enzymes, TYPE, bases, names 2695