1#!/usr/bin/env python
2
3###############################################################################
4#                                                                             #
5#    This program is free software: you can redistribute it and/or modify     #
6#    it under the terms of the GNU General Public License as published by     #
7#    the Free Software Foundation, either version 3 of the License, or        #
8#    (at your option) any later version.                                      #
9#                                                                             #
10#    This program is distributed in the hope that it will be useful,          #
11#    but WITHOUT ANY WARRANTY; without even the implied warranty of           #
12#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            #
13#    GNU General Public License for more details.                             #
14#                                                                             #
15#    You should have received a copy of the GNU General Public License        #
16#    along with this program. If not, see <http://www.gnu.org/licenses/>.     #
17#                                                                             #
18###############################################################################
19
20__prog_name__ = 'findCircularSeqs.py'
21__prog_desc__ = 'identify seqs with near identical start and end motifs'
22
23__author__ = 'Donovan Parks'
24__copyright__ = 'Copyright 2013'
25__credits__ = ['Donovan Parks']
26__license__ = 'GPL3'
27__version__ = '0.1'
28__maintainer__ = 'Donovan Parks'
29__email__ = 'donovan.parks@gmail.com'
30__status__ = 'Development'
31
32import os
33import sys
34import argparse
35import difflib
36
37from checkm.lib.seqUtils import readFasta
38
39class FindCircularSeqs(object):
40    def __init__(self):
41      pass
42
43    def hamming(self, str1, str2):
44      diffs = 0
45      for ch1, ch2 in zip(str1, str2):
46        if ch1 != ch2:
47          diffs += 1
48      return diffs
49
50    def matchMotif(self, motif, seq, overlap, motifErrors):
51      # check for exact match
52      posEndMotif = seq.rfind(motif, max(0, len(seq) - overlap))
53      if posEndMotif != -1 or motifErrors == 0:
54        return posEndMotif
55
56      # find first match with an acceptable number of mismatchers
57      for i in xrange(len(seq)-overlap, len(seq)-len(motif)):
58        diff = self.hamming(motif, seq[i:i+len(motif)])
59        if diff <= motifErrors:
60          return i
61
62      return -1
63
64    def run(self, seqFile, length, trim, motifErrors, overlap, acceptDiff, minLen):
65      seqs = readFasta(seqFile)
66
67      for seqId, seq in seqs.iteritems():
68        if len(seq) < minLen:
69          continue
70
71        trimmedSeq = seq[trim:len(seq)-trim]
72
73        startMotif = trimmedSeq[0:length]
74
75        posEndMotif = self.matchMotif(startMotif, trimmedSeq, overlap, motifErrors)
76        if posEndMotif != -1:
77          diff = self.hamming(trimmedSeq[0:len(trimmedSeq)-posEndMotif], trimmedSeq[posEndMotif:])
78          if diff <= acceptDiff:
79            print '[Putative Circular Sequence]'
80            print seqId
81            print 'M: ' + startMotif
82            print 'S: ' + trimmedSeq[0:len(trimmedSeq)-posEndMotif]
83            print 'E: ' + trimmedSeq[posEndMotif:]
84            print 'Motif length: %d' % (len(trimmedSeq)-posEndMotif)
85            print 'Hamming distance: % d' % diff
86            print 'Sequence length: %d' % len(trimmedSeq)
87            print '--------------'
88
89if __name__ == '__main__':
90    print __prog_name__ + ' v' + __version__ + ': ' + __prog_desc__
91    print '  by ' + __author__ + ' (' + __email__ + ')' + '\n'
92
93    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
94    parser.add_argument('seq_file', help='sequences to search for circularity')
95    parser.add_argument('-l', '--length', default=50, help='length of motif to use in search')
96    parser.add_argument('-t', '--trim', default=15, help='number of bases to trim from start and end of sequence')
97    parser.add_argument('-e', '--errors', default=1, help='acceptable mismatched to motif')
98    parser.add_argument('-o', '--overlap', default=300, help='search range of start motif at end of sequence')
99    parser.add_argument('-d', '--diff', default=2, help='acceptable number of difference between start and end')
100    parser.add_argument('-m', '--min_len', default=1000, help='minimum sequences length')
101
102    args = parser.parse_args()
103
104    try:
105        findCircularSeqs = FindCircularSeqs()
106        findCircularSeqs.run(args.seq_file, args.length, args.trim, args.errors, args.overlap, args.diff, args.min_len)
107    except SystemExit:
108        print "\nControlled exit resulting from an unrecoverable error or warning."
109    except:
110        print "\nUnexpected error:", sys.exc_info()[0]
111        raise
112