1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include "DistanceMatrix.h"
23
24 #include <QSharedData>
25
26 #include "U2Core/global.h"
27
28 #ifdef __GNUC__
29 # pragma GCC diagnostic push
30 # pragma GCC diagnostic ignored "-Wunused-variable"
31 #endif
32 #include "dnadist.h"
33 #include "protdist.h"
34 #ifdef __GNUC__
35 # pragma GCC diagnostic pop
36 #endif
37
38 #include <float.h>
39 #include <iostream>
40
41 namespace U2 {
42
calculateOutOfAlignment(const MultipleSequenceAlignment & ma,const CreatePhyTreeSettings & settings)43 void DistanceMatrix::calculateOutOfAlignment(const MultipleSequenceAlignment &ma, const CreatePhyTreeSettings &settings) {
44 try {
45 malignment = &ma;
46 int index = 0;
47 int size = ma->getNumRows();
48 this->size = size;
49 printdata = false;
50
51 foreach (const MultipleSequenceAlignmentRow &r, ma->getMsaRows()) {
52 const QString &str = r->getName();
53 index_map.insert(str, index);
54 index++;
55 unprocessed_taxa.append(str);
56 }
57 if (!memoryLocker.tryAcquire(size * (sizeof(matrixrow) + sizeof(float) * size))) {
58 errorMessage = memoryLocker.getError();
59 return;
60 }
61
62 for (int i = 0; i < size; i++) {
63 matrixrow row;
64
65 for (int j = 0; j < size; j++) {
66 row.append(0);
67 }
68
69 rawMatrix.append(row);
70 }
71 spp = ma->getNumRows();
72 sites = ma->getLength();
73 chars = sites;
74 nonodes = 2 * sites - 1;
75 DNAAlphabetType alphabetType = ma->getAlphabet()->getType();
76
77 ibmpc = IBMCRT;
78 ansi = ANSICRT;
79 mulsets = false;
80 datasets = 1;
81 firstset = true;
82
83 if ((alphabetType == DNAAlphabet_RAW) || (alphabetType == DNAAlphabet_NUCL)) {
84 setDNADistSettings(settings);
85 doinit(memoryLocker);
86 if (memoryLocker.hasError()) {
87 errorMessage = memoryLocker.getError();
88 return;
89 }
90 // ttratio = ttratio0;
91 inputoptions();
92
93 for (int k = 0; k < spp; k++) {
94 for (int j = 0; j < sites; j++) {
95 y[k][j] = ma->getMsaRow(k)->charAt(j);
96 }
97 }
98 makeweights();
99 dnadist_makevalues();
100 dnadist_empiricalfreqs();
101
102 getbasefreqs(freqa, freqc, freqg, freqt, &freqr, &freqy, &freqar, &freqcy, &freqgr, &freqty, &ttratio, &xi, &xv, &fracchange, freqsfrom, printdata);
103 makedists();
104
105 } else {
106 prot_doinit(settings, memoryLocker);
107 if (memoryLocker.hasError()) {
108 errorMessage = memoryLocker.getError();
109 return;
110 }
111 if (!(kimura || similarity))
112 code();
113 if (!(usejtt || usepmb || usepam || kimura || similarity)) {
114 protdist_cats();
115 maketrans();
116 qreigen(prob, 20L);
117 } else {
118 if (kimura || similarity)
119 fracchange = 1.0;
120 else {
121 if (usejtt)
122 jtteigen();
123 else {
124 if (usepmb)
125 pmbeigen();
126 else
127 pameigen();
128 }
129 }
130 }
131
132 doinput();
133 Phylip_Char charstate;
134 aas aa = (aas)0;
135
136 for (int k = 0; k < spp; k++) {
137 for (int j = 0; j < sites; j++) {
138 charstate = ma->getMsaRow(k)->charAt(j);
139 switch (charstate) {
140 case 'A':
141 aa = ala;
142 break;
143 case 'B':
144 aa = asx;
145 break;
146
147 case 'C':
148 aa = cys;
149 break;
150
151 case 'D':
152 aa = asp;
153 break;
154
155 case 'E':
156 aa = glu;
157 break;
158
159 case 'F':
160 aa = phe;
161 break;
162
163 case 'G':
164 aa = gly;
165 break;
166
167 case 'H':
168 aa = his;
169 break;
170
171 case 'I':
172 aa = ileu;
173 break;
174
175 case 'K':
176 aa = lys;
177 break;
178
179 case 'L':
180 aa = leu;
181 break;
182
183 case 'M':
184 aa = met;
185 break;
186
187 case 'N':
188 aa = asn;
189 break;
190
191 case 'P':
192 aa = pro;
193 break;
194
195 case 'Q':
196 aa = gln;
197 break;
198
199 case 'R':
200 aa = arg;
201 break;
202
203 case 'S':
204 aa = ser;
205 break;
206
207 case 'T':
208 aa = thr;
209 break;
210
211 case 'V':
212 aa = val;
213 break;
214
215 case 'W':
216 aa = trp;
217 break;
218
219 case 'X':
220 aa = unk;
221 break;
222
223 case 'Y':
224 aa = tyr;
225 break;
226
227 case 'Z':
228 aa = glx;
229 break;
230
231 case '*':
232 aa = stop;
233 break;
234
235 case '?':
236 aa = quest;
237 break;
238
239 case '-':
240 aa = del;
241 break;
242 }
243 gnode[k][j] = aa;
244 }
245 }
246
247 if (ith == 1)
248 firstset = false;
249 prot_makedists();
250
251 for (int i = 0; i < spp; i++) {
252 free(gnode[i]);
253 }
254 }
255 for (int i = 0; i < spp; i++) {
256 for (int j = 0; j < spp; j++) {
257 rawMatrix[i][j] = d[i][j];
258 }
259 }
260 } catch (const std::bad_alloc &) {
261 errorMessage = QString("Not enough memory to calculate distance matrix for alignment \"%1\"").arg(ma->getName());
262 if (nullptr != gnode) {
263 for (int i = 0; i < spp; i++) {
264 free(gnode[i]);
265 }
266 free(gnode);
267 }
268 }
269 }
270
DistanceMatrix()271 DistanceMatrix::DistanceMatrix()
272 : rawdata(nullptr),
273 size(),
274 malignment(nullptr),
275 treedata(nullptr) {
276 }
277
~DistanceMatrix()278 DistanceMatrix::~DistanceMatrix() {
279 if (nullptr != y) {
280 for (int i = 0; i < spp; i++) {
281 free(y[i]);
282 }
283 free(y);
284 y = nullptr;
285 }
286
287 if (nullptr != nodep) {
288 for (int i = 0; i < spp; i++) {
289 for (int j = 0; j < endsite; j++) {
290 free(nodep[i]->x[j]);
291 }
292 free(nodep[i]->x);
293 free(nodep[i]);
294 }
295 free(nodep);
296 nodep = nullptr;
297 }
298 free(category);
299 category = nullptr;
300
301 free(oldweight);
302 oldweight = nullptr;
303
304 free(weight);
305 weight = nullptr;
306
307 free(alias);
308 alias = nullptr;
309
310 free(ally);
311 ally = nullptr;
312
313 free(location);
314 location = nullptr;
315
316 free(weightrat);
317 weightrat = nullptr;
318
319 if (nullptr != d) {
320 for (int i = 0; i < spp; i++) {
321 free(d[i]);
322 }
323 free(d);
324 d = nullptr;
325 }
326 }
327
printPhyTree()328 void DistanceMatrix::printPhyTree() {
329 DistanceMatrix::printPhyTree(treedata);
330 }
calculateQMatrix()331 void DistanceMatrix::calculateQMatrix() {
332 for (int i = 0; i < size; i++) {
333 matrixrow row;
334
335 for (int j = 0; j < size; j++) {
336 row.append(0);
337 }
338
339 qdata.append(row);
340 }
341
342 for (int i = 0; i < size; i++) {
343 qdata[i].reserve(size);
344 }
345 for (int i = 0; i < size; i++) {
346 for (int j = 0; j < size; j++) {
347 if (i == j) {
348 qdata[i][j] = 0.0;
349 } else {
350 float dij = rawMatrix[i][j];
351 float ri = calculateRawDivergence(i);
352 float rj = calculateRawDivergence(j);
353 float q = dij - (ri + rj) / (size - 2);
354 qdata[i][j] = q;
355 }
356 }
357 }
358 }
359
dumpRawData(matrix & m)360 void DistanceMatrix::dumpRawData(matrix &m) {
361 std::cout << "Distance Matrix " << std::endl;
362 for (int i = 0; i < size; i++) {
363 for (int j = 0; j < size; j++) {
364 float distance = m[i][j];
365 std::cout << distance << " ";
366 }
367 std::cout << std::endl;
368 }
369 }
370
dumpQData()371 void DistanceMatrix::dumpQData() {
372 std::cout << "Q Matrix " << std::endl;
373 for (int i = 0; i < size; i++) {
374 for (int j = 0; j < size; j++) {
375 float distance = qdata[i][j];
376 std::cout << distance << " ";
377 }
378 std::cout << std::endl;
379 }
380 }
generateNodeName(QString name1,QString name2)381 QString DistanceMatrix::generateNodeName(QString name1, QString name2) {
382 QString r = "___";
383 r.append(name1);
384 r = r.append("___");
385 r = r.append(name2);
386 return r;
387 }
switchName(PhyNode * node)388 void DistanceMatrix::switchName(PhyNode *node) {
389 QString str = node->getName();
390 if (str.startsWith("ROOT")) {
391 node->setName("");
392 }
393 if (str.startsWith("___")) {
394 node->setName("");
395 }
396 for (int i = 0; i < node->branchCount(); i++) {
397 node->setBranchesDistance(i, abs(node->getBranchesDistance(i)));
398 if (node->getBranchesDistance(i) != node->getBranchesDistance(i)) {
399 node->setBranchesDistance(i, 1.0);
400 }
401 }
402 }
403
addPairToTree(QPair<int,int> * location)404 void DistanceMatrix::addPairToTree(QPair<int, int> *location) {
405 // treedata->rootNode->dumpBranches();
406 QString name1 = getTaxaName(location->first);
407 if (unprocessed_taxa.contains(name1)) {
408 unprocessed_taxa.removeAll(name1);
409 }
410 PhyNode *oldnode1 = getNodeByName(name1);
411 if (oldnode1 == 0) {
412 return;
413 }
414 PhyNode *rootnode1 = oldnode1->getParentNode();
415
416 treedata->removeBranch(rootnode1, oldnode1);
417
418 // printPhyTree();
419
420 QString name2 = getTaxaName(location->second);
421 if (unprocessed_taxa.contains(name2)) {
422 unprocessed_taxa.removeAll(name2);
423 }
424 PhyNode *oldnode2 = getNodeByName(name2);
425 PhyNode *rootnode2 = oldnode2->getParentNode();
426 treedata->removeBranch(rootnode2, oldnode2);
427 if (oldnode1 == oldnode2) {
428 return;
429 }
430 // printPhyTree();
431
432 float distance1 = calculateRootDistance(location->first, location->second);
433 float distance2 = calculateAdjacentDistance(location->first, location->second, distance1);
434
435 PhyNode *new_node = new PhyNode();
436 QString new_name = generateNodeName(name1, name2);
437 new_node->setName(new_name);
438 treedata->addBranch(new_node, oldnode1, distance1);
439 if (!new_node->isConnected(oldnode2)) {
440 treedata->addBranch(new_node, oldnode2, distance2);
441 }
442 if (!rootnode1->isConnected(new_node)) {
443 treedata->addBranch(rootnode1, new_node, 1);
444 }
445 // printPhyTree();
446
447 for (int j = 0; j < this->size; j++) {
448 middlematrix.append(rawMatrix[j]);
449 }
450 // dumpRawData();
451 // dumpRawData(middlematrix);
452
453 for (int j = 0; j < size; j++) {
454 rawMatrix[j].remove(location->first);
455 rawMatrix[j].remove(location->second);
456 }
457 rawMatrix.remove(location->first);
458 rawMatrix.remove(location->second);
459 this->size -= 2;
460
461 // dumpRawData(rawMatrix);
462 QMap<QString, int> old_map;
463 QList<QString> namelist = index_map.uniqueKeys();
464 for (int j = 0; j < index_map.size(); j++) {
465 int index = index_map[namelist[j]];
466 old_map.insert(namelist[j], index);
467 }
468
469 QList<QString> names = index_map.uniqueKeys();
470 index_map.clear();
471 for (int j = 0; j < names.size(); j++) {
472 QString cur_name = names[j];
473 if (cur_name != name1 && cur_name != name2) {
474 int index = getNewIndex(cur_name, *location, old_map);
475 index_map.insert(cur_name, index);
476 }
477 }
478 // int new_index = index_map.size();
479
480 index_map.insert(new_name, index_map.size());
481 // printIndex();
482
483 for (int j = 0; j < size; j++) {
484 QString name = getTaxaName(j);
485 int old_index = old_map[name];
486 float distance = calculateNewDistance(location, old_index);
487 rawMatrix[j].append(distance);
488 }
489
490 this->size++;
491
492 matrixrow row;
493 for (int j = 0; j < size - 1; j++) {
494 float distance = rawMatrix[j][size - 1];
495 row.append(distance);
496 }
497 row.append(0);
498
499 rawMatrix.append(row);
500 middlematrix.clear();
501
502 // dumpRawData();
503 // treedata->rootNode->dumpBranches();
504 }
505
getNodeByName(QString name)506 PhyNode *DistanceMatrix::getNodeByName(QString name) {
507 visited_list.clear();
508 PhyNode *rootNode = treedata->getRootNode();
509 return findNode(rootNode, name);
510 }
511
findNode(PhyNode * startnode,QString name)512 PhyNode *DistanceMatrix::findNode(PhyNode *startnode, QString name) {
513 visited_list.append(startnode->getName());
514 if (startnode->getName() == name) {
515 return startnode;
516 } else {
517 for (int i = 0; i < startnode->branchCount(); i++) {
518 PhyBranch *branch = startnode->getBranch(i);
519 if (!visited_list.contains(branch->node2->getName())) {
520 PhyNode *node2 = findNode(branch->node2, name);
521 if (node2 != 0) {
522 return node2;
523 }
524 }
525 }
526 }
527 return 0;
528 }
529
calculateRootDistance(int i,int j)530 float DistanceMatrix::calculateRootDistance(int i, int j) {
531 // S(AU) =d(AB) / 2 + [r(A)-r(B)] / 2(N-2) = 1
532 float distance = rawMatrix[i][j] / 2;
533
534 float num1 = calculateRawDivergence(i);
535 float num2 = calculateRawDivergence(j);
536
537 float div = (num1 - num2) / (2 * (size - 2));
538 float result = distance + div;
539 return result;
540 }
541
calculateAdjacentDistance(int i,int j,float rootDistance)542 float DistanceMatrix::calculateAdjacentDistance(int i, int j, float rootDistance) {
543 float result = rawMatrix[i][j] - rootDistance;
544 return result;
545 }
546
calculateNewDistance(QPair<int,int> * location,int current)547 float DistanceMatrix::calculateNewDistance(QPair<int, int> *location, int current) {
548 float a = middlematrix[current][location->first];
549 float b = middlematrix[current][location->second];
550 float c = middlematrix[location->first][location->second] / 2;
551 return a + b - c;
552 }
553
calculateRawDivergence(int index)554 float DistanceMatrix::calculateRawDivergence(int index) {
555 float divergence = 0;
556 for (int i = 0; i < index; i++) {
557 divergence += rawMatrix[i][index];
558 }
559 for (int j = index; j < size; j++) {
560 divergence += rawMatrix[index][j];
561 }
562 return divergence;
563 }
564
getTaxaName(int index)565 QString DistanceMatrix::getTaxaName(int index) {
566 QMap<QString, int>::iterator i;
567 for (i = index_map.begin(); i != index_map.end(); ++i) {
568 if (i.value() == index) {
569 return i.key();
570 }
571 }
572 return " ";
573 }
574
printPhyTree(PhyTreeData * data)575 void DistanceMatrix::printPhyTree(PhyTreeData *data) {
576 data->print();
577 }
578
printPhyNode(PhyNode * node,int tab,QList<PhyNode * > & nodes)579 void DistanceMatrix::printPhyNode(PhyNode *node, int tab, QList<PhyNode *> &nodes) {
580 if (node == 0 || nodes.contains(node)) {
581 return;
582 }
583 nodes.append(node);
584 for (int i = 0; i < tab; i++) {
585 std::cout << " ";
586 }
587 tab++;
588 std::cout << "name: " << node->getName().toLatin1().constData() << std::endl;
589 for (int i = 0; i < node->branchCount(); i++) {
590 PhyBranch *branch = node->getBranch(i);
591 printPhyNode(branch->node2, tab, nodes);
592 }
593 }
areTreesEqual(PhyTree * tree1,PhyTree * tree2)594 bool DistanceMatrix::areTreesEqual(PhyTree *tree1, PhyTree *tree2) {
595 QMap<QString, int> nodes1;
596 QMap<QString, int> nodes2;
597
598 QList<PhyNode *> list1;
599 QList<PhyNode *> list2;
600
601 QList<PhyBranch *> branches1;
602 QList<PhyBranch *> branches2;
603
604 addNodeToList(list1, nodes1, branches1, tree1->data()->getRootNode());
605 addNodeToList(list2, nodes2, branches2, tree2->data()->getRootNode());
606
607 QList<QString> keys1 = nodes1.keys();
608
609 if (nodes1.count() != nodes2.count())
610 return false;
611
612 for (int i = 0; i < keys1.size(); i++) {
613 QString name = keys1[i];
614 // int d1 = nodes1[name];
615 if (!nodes2.contains(name)) {
616 return false;
617 }
618
619 if (!nodes2.contains(name)) {
620 return false;
621 }
622 /*int d2 = nodes2[name];
623 if(d1!=d2){
624 return false;
625 }*/
626 }
627
628 return true;
629 }
630
addNodeToList(QList<PhyNode * > & nodelist,QMap<QString,int> & nodemap,QList<PhyBranch * > & branches,PhyNode * node)631 void DistanceMatrix::addNodeToList(QList<PhyNode *> &nodelist, QMap<QString, int> &nodemap, QList<PhyBranch *> &branches, PhyNode *node) {
632 if ((node == 0) || (nodelist.contains(node))) {
633 return;
634 } else {
635 nodelist.append(node);
636 if (node->getName() != "ROOT" && node->getName() != "" && !node->getName().startsWith("___")) {
637 for (int i = 0; i < node->branchCount(); i++) {
638 PhyBranch *branch = node->getBranch(i);
639 if (branch->node2 == node) {
640 int d = branch->distance;
641 nodemap.insert(node->getName(), d);
642 }
643 }
644 }
645 for (int i = 0; i < node->branchCount(); i++) {
646 PhyBranch *branch = node->getBranch(i);
647 if (!branches.contains(branch)) {
648 branches.append(branch);
649 addNodeToList(nodelist, nodemap, branches, branch->node2);
650 }
651 }
652 }
653 }
654
switchNamesToAllNodes()655 void DistanceMatrix::switchNamesToAllNodes() {
656 QList<PhyNode *> nodes;
657 QList<PhyBranch *> branches;
658 QMap<QString, int> mmap;
659 addNodeToList(nodes, mmap, branches, treedata->getRootNode());
660 int size = nodes.size();
661
662 for (int i = 0; i < size; i++) {
663 switchName(nodes[i]);
664 }
665 }
666
getNewIndex(QString name,QPair<int,int> loc,QMap<QString,int> & old_map)667 int DistanceMatrix::getNewIndex(QString name, QPair<int, int> loc, QMap<QString, int> &old_map) {
668 int old_index = old_map[name];
669 if (old_index > loc.first && old_index > loc.second) {
670 int new_index = old_index - 2;
671 return new_index;
672 } else if (old_index > loc.first || old_index > loc.second) {
673 int new_index = old_index - 1;
674 return new_index;
675 }
676 return old_index;
677 }
printIndex()678 void DistanceMatrix::printIndex() {
679 for (int i = 0; i < index_map.size(); i++) {
680 QList<QString> names = index_map.keys(i);
681 std::cout << "Value :" << i << " Keys:";
682 for (int j = 0; j < names.size(); j++) {
683 std::cout << " " << names[j].toLatin1().constData();
684 }
685 std::cout << std::endl;
686 }
687 std::cout << std::endl;
688 }
dumpRawData()689 void DistanceMatrix::dumpRawData() {
690 dumpRawData(rawMatrix);
691 }
692
isFiniteNumber(double x)693 inline bool isFiniteNumber(double x) {
694 return (x <= DBL_MAX && x >= -DBL_MAX);
695 }
696
isValid()697 bool DistanceMatrix::isValid() {
698 int sz = rawMatrix.count();
699 int zeroCounter = 0;
700 // The matrix must be square and must not have infinite numbers
701 for (int i = 0; i < sz; ++i) {
702 int sz2 = rawMatrix[i].count();
703 if (sz2 != sz) {
704 return false;
705 }
706 for (int j = 0; j < sz; ++j) {
707 double value = rawMatrix[i][j];
708 if (!isFiniteNumber(value)) {
709 return false;
710 }
711 if (value == 0) {
712 ++zeroCounter;
713 }
714 }
715 }
716
717 if (zeroCounter == sz * sz) {
718 // All numbers are zeroes!
719 return false;
720 }
721
722 return true;
723 }
724
725 } // namespace U2
726