1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "KarlinSignatureDifferenceGraph.h"
23 
24 #include <U2Core/AppContext.h>
25 #include <U2Core/DNAAlphabet.h>
26 #include <U2Core/DNASequenceObject.h>
27 #include <U2Core/DNATranslation.h>
28 #include <U2Core/TextUtils.h>
29 
30 #include "DNAGraphPackPlugin.h"
31 
32 /* TRANSLATOR U2::KarlinGraphFactory */
33 
34 namespace U2 {
nameByType()35 static QString nameByType() {
36     return KarlinGraphFactory::tr("Karlin Signature Difference");
37 }
38 
KarlinGraphFactory(QObject * p)39 KarlinGraphFactory::KarlinGraphFactory(QObject *p)
40     : GSequenceGraphFactory(nameByType(), p) {
41 }
42 
43 //+
isEnabled(const U2SequenceObject * o) const44 bool KarlinGraphFactory::isEnabled(const U2SequenceObject *o) const {
45     const DNAAlphabet *al = o->getAlphabet();
46     return al->isNucleic();
47 }
48 
createGraphs(GSequenceGraphView * view)49 QList<QSharedPointer<GSequenceGraphData>> KarlinGraphFactory::createGraphs(GSequenceGraphView *view) {
50     assert(isEnabled(view->getSequenceObject()));
51     return {QSharedPointer<GSequenceGraphData>(new GSequenceGraphData(view, getGraphName(), new KarlinGraphAlgorithm()))};
52 }
53 
54 //////////////////////////////////////////////////////////////////////////
55 // KarlinGraphAlgorithm
56 
getIndex(char nucl)57 static int getIndex(char nucl) {
58     switch (nucl) {
59         case 'A':
60             return 0;
61         case 'C':
62             return 1;
63         case 'T':
64             return 2;
65         case 'G':
66             return 3;
67     }
68     return -1;
69 }
70 
71 #define IDX(x, y) ((x)*4 + (y))
72 
73 // todo:: use limits
74 static const float FLOAT_MIN = 0.000000001f;
75 
KarlinGraphAlgorithm()76 KarlinGraphAlgorithm::KarlinGraphAlgorithm()
77     : global_relative_abundance_values(nullptr) {
78 }
79 
~KarlinGraphAlgorithm()80 KarlinGraphAlgorithm::~KarlinGraphAlgorithm() {
81     delete[] global_relative_abundance_values;
82 }
83 
calculate(QVector<float> & result,U2SequenceObject * sequenceObject,qint64 window,qint64 step,U2OpStatus & os)84 void KarlinGraphAlgorithm::calculate(QVector<float> &result, U2SequenceObject *sequenceObject, qint64 window, qint64 step, U2OpStatus &os) {
85     U2Region vr(0, sequenceObject->getSequenceLength());
86     int nSteps = GSequenceGraphUtils::getNumSteps(vr, window, step);
87     result.reserve(nSteps);
88 
89     const DNAAlphabet *al = sequenceObject->getAlphabet();
90     assert(al->isNucleic());
91 
92     DNATranslationRegistry *tr = AppContext::getDNATranslationRegistry();
93     DNATranslation *complT = tr->lookupComplementTranslation(al);
94     assert(complT != nullptr);
95 
96     DNATranslation *complTrans = complT;
97     mapTrans = complTrans->getOne2OneMapper();
98 
99     QByteArray seq = sequenceObject->getWholeSequenceData(os);
100     CHECK_OP(os, );
101     int seqLen = seq.size();
102     const char *seqc = seq.constData();
103     if (global_relative_abundance_values == nullptr) {
104         global_relative_abundance_values = new float[16];
105         calculateRelativeAbundance(seqc, seqLen, global_relative_abundance_values, os);
106         CHECK_OP(os, );
107     }
108     // check!!
109     for (int i = 0; i < nSteps; i++) {
110         int start = vr.startPos + i * step;
111         int end = start + window;
112         float val = getValue(start, end, seq, os);
113         CHECK_OP(os, );
114         result.append(val);
115     }
116 }
117 
getValue(int start,int end,const QByteArray & s,U2OpStatus & os)118 float KarlinGraphAlgorithm::getValue(int start, int end, const QByteArray &s, U2OpStatus &os) {
119     float relative_abundance_values[16];
120     calculateRelativeAbundance(s.constData() + start, end - start, relative_abundance_values, os);
121     float signature_difference = 0;
122     for (int first_base = 0; first_base < 4; ++first_base) {
123         for (int second_base = 0; second_base < 4; ++second_base) {
124             CHECK_OP(os, 0);
125             int idx = IDX(first_base, second_base);
126             float global_value = global_relative_abundance_values[idx];
127             float local_value = relative_abundance_values[idx];
128             signature_difference += qAbs(global_value - local_value);
129         }
130     }
131     float res = signature_difference / 16.0;
132     return res;
133 }
134 
calculateRelativeAbundance(const char * seq,int length,float * results,U2OpStatus & os)135 void KarlinGraphAlgorithm::calculateRelativeAbundance(const char *seq, int length, float *results, U2OpStatus &os) {
136     QByteArray tmp;
137     tmp.resize(length);
138 
139     int base_counts[4] = {0, 0, 0, 0};
140     int dinucleotide_base_counts[16] = {
141         0,
142         0,
143         0,
144         0,
145         0,
146         0,
147         0,
148         0,
149         0,
150         0,
151         0,
152         0,
153         0,
154         0,
155         0,
156         0,
157     };
158 
159     int next_f_base_index = 0;
160     int next_r_base_index = 0;
161 
162     for (int i = 0; i < length - 1; ++i) {
163         CHECK_OP(os, );
164         char this_f_base = seq[i];
165         char next_f_base = seq[i + 1];
166 
167         int this_f_base_index = getIndex(this_f_base);
168         next_f_base_index = getIndex(next_f_base);
169 
170         if (this_f_base_index >= 0 && next_f_base_index >= 0) {
171             ++base_counts[this_f_base_index];
172             ++dinucleotide_base_counts[IDX(this_f_base_index, next_f_base_index)];
173         }
174 
175         char this_r_base = mapTrans.at(this_f_base);
176         char next_r_base = mapTrans.at(next_f_base);
177 
178         int this_r_base_index = getIndex(this_r_base);
179         next_r_base_index = getIndex(next_r_base);
180 
181         if (this_r_base_index >= 0 && next_r_base_index >= 0) {
182             ++base_counts[this_r_base_index];
183             ++dinucleotide_base_counts[IDX(this_r_base_index, next_r_base_index)];
184         }
185     }
186 
187     if (next_f_base_index >= 0) {
188         ++base_counts[next_f_base_index];
189     }
190 
191     if (next_r_base_index >= 0) {
192         ++base_counts[next_r_base_index];
193     }
194 
195     for (int first_base_index = 0; first_base_index < 4; ++first_base_index) {
196         for (int second_base_index = 0; second_base_index < 4; ++second_base_index) {
197             int idx = IDX(first_base_index, second_base_index);
198             float dinucleotide_frequency = float(dinucleotide_base_counts[idx]) / (2 * (length - 1));
199             float first_base_frequency = float(base_counts[first_base_index]) / (2 * length);
200             float second_base_frequency = float(base_counts[second_base_index]) / (2 * length);
201             results[idx] = dinucleotide_frequency / qMax(FLOAT_MIN, first_base_frequency * second_base_frequency);
202         }
203     }
204 }
205 
206 }  // namespace U2
207