1#!/usr/bin/env python 2 3############################################################################### 4# # 5# This program is free software: you can redistribute it and/or modify # 6# it under the terms of the GNU General Public License as published by # 7# the Free Software Foundation, either version 3 of the License, or # 8# (at your option) any later version. # 9# # 10# This program is distributed in the hope that it will be useful, # 11# but WITHOUT ANY WARRANTY; without even the implied warranty of # 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 13# GNU General Public License for more details. # 14# # 15# You should have received a copy of the GNU General Public License # 16# along with this program. If not, see <http://www.gnu.org/licenses/>. # 17# # 18############################################################################### 19 20__prog_name__ = 'findCircularSeqs.py' 21__prog_desc__ = 'identify seqs with near identical start and end motifs' 22 23__author__ = 'Donovan Parks' 24__copyright__ = 'Copyright 2013' 25__credits__ = ['Donovan Parks'] 26__license__ = 'GPL3' 27__version__ = '0.1' 28__maintainer__ = 'Donovan Parks' 29__email__ = 'donovan.parks@gmail.com' 30__status__ = 'Development' 31 32import os 33import sys 34import argparse 35import difflib 36 37from checkm.lib.seqUtils import readFasta 38 39class FindCircularSeqs(object): 40 def __init__(self): 41 pass 42 43 def hamming(self, str1, str2): 44 diffs = 0 45 for ch1, ch2 in zip(str1, str2): 46 if ch1 != ch2: 47 diffs += 1 48 return diffs 49 50 def matchMotif(self, motif, seq, overlap, motifErrors): 51 # check for exact match 52 posEndMotif = seq.rfind(motif, max(0, len(seq) - overlap)) 53 if posEndMotif != -1 or motifErrors == 0: 54 return posEndMotif 55 56 # find first match with an acceptable number of mismatchers 57 for i in xrange(len(seq)-overlap, len(seq)-len(motif)): 58 diff = self.hamming(motif, seq[i:i+len(motif)]) 59 if diff <= motifErrors: 60 return i 61 62 return -1 63 64 def run(self, seqFile, length, trim, motifErrors, overlap, acceptDiff, minLen): 65 seqs = readFasta(seqFile) 66 67 for seqId, seq in seqs.iteritems(): 68 if len(seq) < minLen: 69 continue 70 71 trimmedSeq = seq[trim:len(seq)-trim] 72 73 startMotif = trimmedSeq[0:length] 74 75 posEndMotif = self.matchMotif(startMotif, trimmedSeq, overlap, motifErrors) 76 if posEndMotif != -1: 77 diff = self.hamming(trimmedSeq[0:len(trimmedSeq)-posEndMotif], trimmedSeq[posEndMotif:]) 78 if diff <= acceptDiff: 79 print '[Putative Circular Sequence]' 80 print seqId 81 print 'M: ' + startMotif 82 print 'S: ' + trimmedSeq[0:len(trimmedSeq)-posEndMotif] 83 print 'E: ' + trimmedSeq[posEndMotif:] 84 print 'Motif length: %d' % (len(trimmedSeq)-posEndMotif) 85 print 'Hamming distance: % d' % diff 86 print 'Sequence length: %d' % len(trimmedSeq) 87 print '--------------' 88 89if __name__ == '__main__': 90 print __prog_name__ + ' v' + __version__ + ': ' + __prog_desc__ 91 print ' by ' + __author__ + ' (' + __email__ + ')' + '\n' 92 93 parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 94 parser.add_argument('seq_file', help='sequences to search for circularity') 95 parser.add_argument('-l', '--length', default=50, help='length of motif to use in search') 96 parser.add_argument('-t', '--trim', default=15, help='number of bases to trim from start and end of sequence') 97 parser.add_argument('-e', '--errors', default=1, help='acceptable mismatched to motif') 98 parser.add_argument('-o', '--overlap', default=300, help='search range of start motif at end of sequence') 99 parser.add_argument('-d', '--diff', default=2, help='acceptable number of difference between start and end') 100 parser.add_argument('-m', '--min_len', default=1000, help='minimum sequences length') 101 102 args = parser.parse_args() 103 104 try: 105 findCircularSeqs = FindCircularSeqs() 106 findCircularSeqs.run(args.seq_file, args.length, args.trim, args.errors, args.overlap, args.diff, args.min_len) 107 except SystemExit: 108 print "\nControlled exit resulting from an unrecoverable error or warning." 109 except: 110 print "\nUnexpected error:", sys.exc_info()[0] 111 raise 112