1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "KarlinSignatureDifferenceGraph.h"
23
24 #include <U2Core/AppContext.h>
25 #include <U2Core/DNAAlphabet.h>
26 #include <U2Core/DNASequenceObject.h>
27 #include <U2Core/DNATranslation.h>
28 #include <U2Core/TextUtils.h>
29
30 #include "DNAGraphPackPlugin.h"
31
32 /* TRANSLATOR U2::KarlinGraphFactory */
33
34 namespace U2 {
nameByType()35 static QString nameByType() {
36 return KarlinGraphFactory::tr("Karlin Signature Difference");
37 }
38
KarlinGraphFactory(QObject * p)39 KarlinGraphFactory::KarlinGraphFactory(QObject *p)
40 : GSequenceGraphFactory(nameByType(), p) {
41 }
42
43 //+
isEnabled(const U2SequenceObject * o) const44 bool KarlinGraphFactory::isEnabled(const U2SequenceObject *o) const {
45 const DNAAlphabet *al = o->getAlphabet();
46 return al->isNucleic();
47 }
48
createGraphs(GSequenceGraphView * view)49 QList<QSharedPointer<GSequenceGraphData>> KarlinGraphFactory::createGraphs(GSequenceGraphView *view) {
50 assert(isEnabled(view->getSequenceObject()));
51 return {QSharedPointer<GSequenceGraphData>(new GSequenceGraphData(view, getGraphName(), new KarlinGraphAlgorithm()))};
52 }
53
54 //////////////////////////////////////////////////////////////////////////
55 // KarlinGraphAlgorithm
56
getIndex(char nucl)57 static int getIndex(char nucl) {
58 switch (nucl) {
59 case 'A':
60 return 0;
61 case 'C':
62 return 1;
63 case 'T':
64 return 2;
65 case 'G':
66 return 3;
67 }
68 return -1;
69 }
70
71 #define IDX(x, y) ((x)*4 + (y))
72
73 // todo:: use limits
74 static const float FLOAT_MIN = 0.000000001f;
75
KarlinGraphAlgorithm()76 KarlinGraphAlgorithm::KarlinGraphAlgorithm()
77 : global_relative_abundance_values(nullptr) {
78 }
79
~KarlinGraphAlgorithm()80 KarlinGraphAlgorithm::~KarlinGraphAlgorithm() {
81 delete[] global_relative_abundance_values;
82 }
83
calculate(QVector<float> & result,U2SequenceObject * sequenceObject,qint64 window,qint64 step,U2OpStatus & os)84 void KarlinGraphAlgorithm::calculate(QVector<float> &result, U2SequenceObject *sequenceObject, qint64 window, qint64 step, U2OpStatus &os) {
85 U2Region vr(0, sequenceObject->getSequenceLength());
86 int nSteps = GSequenceGraphUtils::getNumSteps(vr, window, step);
87 result.reserve(nSteps);
88
89 const DNAAlphabet *al = sequenceObject->getAlphabet();
90 assert(al->isNucleic());
91
92 DNATranslationRegistry *tr = AppContext::getDNATranslationRegistry();
93 DNATranslation *complT = tr->lookupComplementTranslation(al);
94 assert(complT != nullptr);
95
96 DNATranslation *complTrans = complT;
97 mapTrans = complTrans->getOne2OneMapper();
98
99 QByteArray seq = sequenceObject->getWholeSequenceData(os);
100 CHECK_OP(os, );
101 int seqLen = seq.size();
102 const char *seqc = seq.constData();
103 if (global_relative_abundance_values == nullptr) {
104 global_relative_abundance_values = new float[16];
105 calculateRelativeAbundance(seqc, seqLen, global_relative_abundance_values, os);
106 CHECK_OP(os, );
107 }
108 // check!!
109 for (int i = 0; i < nSteps; i++) {
110 int start = vr.startPos + i * step;
111 int end = start + window;
112 float val = getValue(start, end, seq, os);
113 CHECK_OP(os, );
114 result.append(val);
115 }
116 }
117
getValue(int start,int end,const QByteArray & s,U2OpStatus & os)118 float KarlinGraphAlgorithm::getValue(int start, int end, const QByteArray &s, U2OpStatus &os) {
119 float relative_abundance_values[16];
120 calculateRelativeAbundance(s.constData() + start, end - start, relative_abundance_values, os);
121 float signature_difference = 0;
122 for (int first_base = 0; first_base < 4; ++first_base) {
123 for (int second_base = 0; second_base < 4; ++second_base) {
124 CHECK_OP(os, 0);
125 int idx = IDX(first_base, second_base);
126 float global_value = global_relative_abundance_values[idx];
127 float local_value = relative_abundance_values[idx];
128 signature_difference += qAbs(global_value - local_value);
129 }
130 }
131 float res = signature_difference / 16.0;
132 return res;
133 }
134
calculateRelativeAbundance(const char * seq,int length,float * results,U2OpStatus & os)135 void KarlinGraphAlgorithm::calculateRelativeAbundance(const char *seq, int length, float *results, U2OpStatus &os) {
136 QByteArray tmp;
137 tmp.resize(length);
138
139 int base_counts[4] = {0, 0, 0, 0};
140 int dinucleotide_base_counts[16] = {
141 0,
142 0,
143 0,
144 0,
145 0,
146 0,
147 0,
148 0,
149 0,
150 0,
151 0,
152 0,
153 0,
154 0,
155 0,
156 0,
157 };
158
159 int next_f_base_index = 0;
160 int next_r_base_index = 0;
161
162 for (int i = 0; i < length - 1; ++i) {
163 CHECK_OP(os, );
164 char this_f_base = seq[i];
165 char next_f_base = seq[i + 1];
166
167 int this_f_base_index = getIndex(this_f_base);
168 next_f_base_index = getIndex(next_f_base);
169
170 if (this_f_base_index >= 0 && next_f_base_index >= 0) {
171 ++base_counts[this_f_base_index];
172 ++dinucleotide_base_counts[IDX(this_f_base_index, next_f_base_index)];
173 }
174
175 char this_r_base = mapTrans.at(this_f_base);
176 char next_r_base = mapTrans.at(next_f_base);
177
178 int this_r_base_index = getIndex(this_r_base);
179 next_r_base_index = getIndex(next_r_base);
180
181 if (this_r_base_index >= 0 && next_r_base_index >= 0) {
182 ++base_counts[this_r_base_index];
183 ++dinucleotide_base_counts[IDX(this_r_base_index, next_r_base_index)];
184 }
185 }
186
187 if (next_f_base_index >= 0) {
188 ++base_counts[next_f_base_index];
189 }
190
191 if (next_r_base_index >= 0) {
192 ++base_counts[next_r_base_index];
193 }
194
195 for (int first_base_index = 0; first_base_index < 4; ++first_base_index) {
196 for (int second_base_index = 0; second_base_index < 4; ++second_base_index) {
197 int idx = IDX(first_base_index, second_base_index);
198 float dinucleotide_frequency = float(dinucleotide_base_counts[idx]) / (2 * (length - 1));
199 float first_base_frequency = float(base_counts[first_base_index]) / (2 * length);
200 float second_base_frequency = float(base_counts[second_base_index]) / (2 * length);
201 results[idx] = dinucleotide_frequency / qMax(FLOAT_MIN, first_base_frequency * second_base_frequency);
202 }
203 }
204 }
205
206 } // namespace U2
207