1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "DistanceMatrix.h"
23 
24 #include <QSharedData>
25 
26 #include "U2Core/global.h"
27 
28 #ifdef __GNUC__
29 #    pragma GCC diagnostic push
30 #    pragma GCC diagnostic ignored "-Wunused-variable"
31 #endif
32 #include "dnadist.h"
33 #include "protdist.h"
34 #ifdef __GNUC__
35 #    pragma GCC diagnostic pop
36 #endif
37 
38 #include <float.h>
39 #include <iostream>
40 
41 namespace U2 {
42 
calculateOutOfAlignment(const MultipleSequenceAlignment & ma,const CreatePhyTreeSettings & settings)43 void DistanceMatrix::calculateOutOfAlignment(const MultipleSequenceAlignment &ma, const CreatePhyTreeSettings &settings) {
44     try {
45         malignment = &ma;
46         int index = 0;
47         int size = ma->getNumRows();
48         this->size = size;
49         printdata = false;
50 
51         foreach (const MultipleSequenceAlignmentRow &r, ma->getMsaRows()) {
52             const QString &str = r->getName();
53             index_map.insert(str, index);
54             index++;
55             unprocessed_taxa.append(str);
56         }
57         if (!memoryLocker.tryAcquire(size * (sizeof(matrixrow) + sizeof(float) * size))) {
58             errorMessage = memoryLocker.getError();
59             return;
60         }
61 
62         for (int i = 0; i < size; i++) {
63             matrixrow row;
64 
65             for (int j = 0; j < size; j++) {
66                 row.append(0);
67             }
68 
69             rawMatrix.append(row);
70         }
71         spp = ma->getNumRows();
72         sites = ma->getLength();
73         chars = sites;
74         nonodes = 2 * sites - 1;
75         DNAAlphabetType alphabetType = ma->getAlphabet()->getType();
76 
77         ibmpc = IBMCRT;
78         ansi = ANSICRT;
79         mulsets = false;
80         datasets = 1;
81         firstset = true;
82 
83         if ((alphabetType == DNAAlphabet_RAW) || (alphabetType == DNAAlphabet_NUCL)) {
84             setDNADistSettings(settings);
85             doinit(memoryLocker);
86             if (memoryLocker.hasError()) {
87                 errorMessage = memoryLocker.getError();
88                 return;
89             }
90             // ttratio = ttratio0;
91             inputoptions();
92 
93             for (int k = 0; k < spp; k++) {
94                 for (int j = 0; j < sites; j++) {
95                     y[k][j] = ma->getMsaRow(k)->charAt(j);
96                 }
97             }
98             makeweights();
99             dnadist_makevalues();
100             dnadist_empiricalfreqs();
101 
102             getbasefreqs(freqa, freqc, freqg, freqt, &freqr, &freqy, &freqar, &freqcy, &freqgr, &freqty, &ttratio, &xi, &xv, &fracchange, freqsfrom, printdata);
103             makedists();
104 
105         } else {
106             prot_doinit(settings, memoryLocker);
107             if (memoryLocker.hasError()) {
108                 errorMessage = memoryLocker.getError();
109                 return;
110             }
111             if (!(kimura || similarity))
112                 code();
113             if (!(usejtt || usepmb || usepam || kimura || similarity)) {
114                 protdist_cats();
115                 maketrans();
116                 qreigen(prob, 20L);
117             } else {
118                 if (kimura || similarity)
119                     fracchange = 1.0;
120                 else {
121                     if (usejtt)
122                         jtteigen();
123                     else {
124                         if (usepmb)
125                             pmbeigen();
126                         else
127                             pameigen();
128                     }
129                 }
130             }
131 
132             doinput();
133             Phylip_Char charstate;
134             aas aa = (aas)0;
135 
136             for (int k = 0; k < spp; k++) {
137                 for (int j = 0; j < sites; j++) {
138                     charstate = ma->getMsaRow(k)->charAt(j);
139                     switch (charstate) {
140                         case 'A':
141                             aa = ala;
142                             break;
143                         case 'B':
144                             aa = asx;
145                             break;
146 
147                         case 'C':
148                             aa = cys;
149                             break;
150 
151                         case 'D':
152                             aa = asp;
153                             break;
154 
155                         case 'E':
156                             aa = glu;
157                             break;
158 
159                         case 'F':
160                             aa = phe;
161                             break;
162 
163                         case 'G':
164                             aa = gly;
165                             break;
166 
167                         case 'H':
168                             aa = his;
169                             break;
170 
171                         case 'I':
172                             aa = ileu;
173                             break;
174 
175                         case 'K':
176                             aa = lys;
177                             break;
178 
179                         case 'L':
180                             aa = leu;
181                             break;
182 
183                         case 'M':
184                             aa = met;
185                             break;
186 
187                         case 'N':
188                             aa = asn;
189                             break;
190 
191                         case 'P':
192                             aa = pro;
193                             break;
194 
195                         case 'Q':
196                             aa = gln;
197                             break;
198 
199                         case 'R':
200                             aa = arg;
201                             break;
202 
203                         case 'S':
204                             aa = ser;
205                             break;
206 
207                         case 'T':
208                             aa = thr;
209                             break;
210 
211                         case 'V':
212                             aa = val;
213                             break;
214 
215                         case 'W':
216                             aa = trp;
217                             break;
218 
219                         case 'X':
220                             aa = unk;
221                             break;
222 
223                         case 'Y':
224                             aa = tyr;
225                             break;
226 
227                         case 'Z':
228                             aa = glx;
229                             break;
230 
231                         case '*':
232                             aa = stop;
233                             break;
234 
235                         case '?':
236                             aa = quest;
237                             break;
238 
239                         case '-':
240                             aa = del;
241                             break;
242                     }
243                     gnode[k][j] = aa;
244                 }
245             }
246 
247             if (ith == 1)
248                 firstset = false;
249             prot_makedists();
250 
251             for (int i = 0; i < spp; i++) {
252                 free(gnode[i]);
253             }
254         }
255         for (int i = 0; i < spp; i++) {
256             for (int j = 0; j < spp; j++) {
257                 rawMatrix[i][j] = d[i][j];
258             }
259         }
260     } catch (const std::bad_alloc &) {
261         errorMessage = QString("Not enough memory to calculate distance matrix for alignment \"%1\"").arg(ma->getName());
262         if (nullptr != gnode) {
263             for (int i = 0; i < spp; i++) {
264                 free(gnode[i]);
265             }
266             free(gnode);
267         }
268     }
269 }
270 
DistanceMatrix()271 DistanceMatrix::DistanceMatrix()
272     : rawdata(nullptr),
273       size(),
274       malignment(nullptr),
275       treedata(nullptr) {
276 }
277 
~DistanceMatrix()278 DistanceMatrix::~DistanceMatrix() {
279     if (nullptr != y) {
280         for (int i = 0; i < spp; i++) {
281             free(y[i]);
282         }
283         free(y);
284         y = nullptr;
285     }
286 
287     if (nullptr != nodep) {
288         for (int i = 0; i < spp; i++) {
289             for (int j = 0; j < endsite; j++) {
290                 free(nodep[i]->x[j]);
291             }
292             free(nodep[i]->x);
293             free(nodep[i]);
294         }
295         free(nodep);
296         nodep = nullptr;
297     }
298     free(category);
299     category = nullptr;
300 
301     free(oldweight);
302     oldweight = nullptr;
303 
304     free(weight);
305     weight = nullptr;
306 
307     free(alias);
308     alias = nullptr;
309 
310     free(ally);
311     ally = nullptr;
312 
313     free(location);
314     location = nullptr;
315 
316     free(weightrat);
317     weightrat = nullptr;
318 
319     if (nullptr != d) {
320         for (int i = 0; i < spp; i++) {
321             free(d[i]);
322         }
323         free(d);
324         d = nullptr;
325     }
326 }
327 
printPhyTree()328 void DistanceMatrix::printPhyTree() {
329     DistanceMatrix::printPhyTree(treedata);
330 }
calculateQMatrix()331 void DistanceMatrix::calculateQMatrix() {
332     for (int i = 0; i < size; i++) {
333         matrixrow row;
334 
335         for (int j = 0; j < size; j++) {
336             row.append(0);
337         }
338 
339         qdata.append(row);
340     }
341 
342     for (int i = 0; i < size; i++) {
343         qdata[i].reserve(size);
344     }
345     for (int i = 0; i < size; i++) {
346         for (int j = 0; j < size; j++) {
347             if (i == j) {
348                 qdata[i][j] = 0.0;
349             } else {
350                 float dij = rawMatrix[i][j];
351                 float ri = calculateRawDivergence(i);
352                 float rj = calculateRawDivergence(j);
353                 float q = dij - (ri + rj) / (size - 2);
354                 qdata[i][j] = q;
355             }
356         }
357     }
358 }
359 
dumpRawData(matrix & m)360 void DistanceMatrix::dumpRawData(matrix &m) {
361     std::cout << "Distance Matrix " << std::endl;
362     for (int i = 0; i < size; i++) {
363         for (int j = 0; j < size; j++) {
364             float distance = m[i][j];
365             std::cout << distance << " ";
366         }
367         std::cout << std::endl;
368     }
369 }
370 
dumpQData()371 void DistanceMatrix::dumpQData() {
372     std::cout << "Q Matrix " << std::endl;
373     for (int i = 0; i < size; i++) {
374         for (int j = 0; j < size; j++) {
375             float distance = qdata[i][j];
376             std::cout << distance << " ";
377         }
378         std::cout << std::endl;
379     }
380 }
generateNodeName(QString name1,QString name2)381 QString DistanceMatrix::generateNodeName(QString name1, QString name2) {
382     QString r = "___";
383     r.append(name1);
384     r = r.append("___");
385     r = r.append(name2);
386     return r;
387 }
switchName(PhyNode * node)388 void DistanceMatrix::switchName(PhyNode *node) {
389     QString str = node->getName();
390     if (str.startsWith("ROOT")) {
391         node->setName("");
392     }
393     if (str.startsWith("___")) {
394         node->setName("");
395     }
396     for (int i = 0; i < node->branchCount(); i++) {
397         node->setBranchesDistance(i, abs(node->getBranchesDistance(i)));
398         if (node->getBranchesDistance(i) != node->getBranchesDistance(i)) {
399             node->setBranchesDistance(i, 1.0);
400         }
401     }
402 }
403 
addPairToTree(QPair<int,int> * location)404 void DistanceMatrix::addPairToTree(QPair<int, int> *location) {
405     //        treedata->rootNode->dumpBranches();
406     QString name1 = getTaxaName(location->first);
407     if (unprocessed_taxa.contains(name1)) {
408         unprocessed_taxa.removeAll(name1);
409     }
410     PhyNode *oldnode1 = getNodeByName(name1);
411     if (oldnode1 == 0) {
412         return;
413     }
414     PhyNode *rootnode1 = oldnode1->getParentNode();
415 
416     treedata->removeBranch(rootnode1, oldnode1);
417 
418     // printPhyTree();
419 
420     QString name2 = getTaxaName(location->second);
421     if (unprocessed_taxa.contains(name2)) {
422         unprocessed_taxa.removeAll(name2);
423     }
424     PhyNode *oldnode2 = getNodeByName(name2);
425     PhyNode *rootnode2 = oldnode2->getParentNode();
426     treedata->removeBranch(rootnode2, oldnode2);
427     if (oldnode1 == oldnode2) {
428         return;
429     }
430     // printPhyTree();
431 
432     float distance1 = calculateRootDistance(location->first, location->second);
433     float distance2 = calculateAdjacentDistance(location->first, location->second, distance1);
434 
435     PhyNode *new_node = new PhyNode();
436     QString new_name = generateNodeName(name1, name2);
437     new_node->setName(new_name);
438     treedata->addBranch(new_node, oldnode1, distance1);
439     if (!new_node->isConnected(oldnode2)) {
440         treedata->addBranch(new_node, oldnode2, distance2);
441     }
442     if (!rootnode1->isConnected(new_node)) {
443         treedata->addBranch(rootnode1, new_node, 1);
444     }
445     // printPhyTree();
446 
447     for (int j = 0; j < this->size; j++) {
448         middlematrix.append(rawMatrix[j]);
449     }
450     // dumpRawData();
451     //         dumpRawData(middlematrix);
452 
453     for (int j = 0; j < size; j++) {
454         rawMatrix[j].remove(location->first);
455         rawMatrix[j].remove(location->second);
456     }
457     rawMatrix.remove(location->first);
458     rawMatrix.remove(location->second);
459     this->size -= 2;
460 
461     //        dumpRawData(rawMatrix);
462     QMap<QString, int> old_map;
463     QList<QString> namelist = index_map.uniqueKeys();
464     for (int j = 0; j < index_map.size(); j++) {
465         int index = index_map[namelist[j]];
466         old_map.insert(namelist[j], index);
467     }
468 
469     QList<QString> names = index_map.uniqueKeys();
470     index_map.clear();
471     for (int j = 0; j < names.size(); j++) {
472         QString cur_name = names[j];
473         if (cur_name != name1 && cur_name != name2) {
474             int index = getNewIndex(cur_name, *location, old_map);
475             index_map.insert(cur_name, index);
476         }
477     }
478     //        int new_index = index_map.size();
479 
480     index_map.insert(new_name, index_map.size());
481     // printIndex();
482 
483     for (int j = 0; j < size; j++) {
484         QString name = getTaxaName(j);
485         int old_index = old_map[name];
486         float distance = calculateNewDistance(location, old_index);
487         rawMatrix[j].append(distance);
488     }
489 
490     this->size++;
491 
492     matrixrow row;
493     for (int j = 0; j < size - 1; j++) {
494         float distance = rawMatrix[j][size - 1];
495         row.append(distance);
496     }
497     row.append(0);
498 
499     rawMatrix.append(row);
500     middlematrix.clear();
501 
502     // dumpRawData();
503     // treedata->rootNode->dumpBranches();
504 }
505 
getNodeByName(QString name)506 PhyNode *DistanceMatrix::getNodeByName(QString name) {
507     visited_list.clear();
508     PhyNode *rootNode = treedata->getRootNode();
509     return findNode(rootNode, name);
510 }
511 
findNode(PhyNode * startnode,QString name)512 PhyNode *DistanceMatrix::findNode(PhyNode *startnode, QString name) {
513     visited_list.append(startnode->getName());
514     if (startnode->getName() == name) {
515         return startnode;
516     } else {
517         for (int i = 0; i < startnode->branchCount(); i++) {
518             PhyBranch *branch = startnode->getBranch(i);
519             if (!visited_list.contains(branch->node2->getName())) {
520                 PhyNode *node2 = findNode(branch->node2, name);
521                 if (node2 != 0) {
522                     return node2;
523                 }
524             }
525         }
526     }
527     return 0;
528 }
529 
calculateRootDistance(int i,int j)530 float DistanceMatrix::calculateRootDistance(int i, int j) {
531     // S(AU) =d(AB) / 2 + [r(A)-r(B)] / 2(N-2) = 1
532     float distance = rawMatrix[i][j] / 2;
533 
534     float num1 = calculateRawDivergence(i);
535     float num2 = calculateRawDivergence(j);
536 
537     float div = (num1 - num2) / (2 * (size - 2));
538     float result = distance + div;
539     return result;
540 }
541 
calculateAdjacentDistance(int i,int j,float rootDistance)542 float DistanceMatrix::calculateAdjacentDistance(int i, int j, float rootDistance) {
543     float result = rawMatrix[i][j] - rootDistance;
544     return result;
545 }
546 
calculateNewDistance(QPair<int,int> * location,int current)547 float DistanceMatrix::calculateNewDistance(QPair<int, int> *location, int current) {
548     float a = middlematrix[current][location->first];
549     float b = middlematrix[current][location->second];
550     float c = middlematrix[location->first][location->second] / 2;
551     return a + b - c;
552 }
553 
calculateRawDivergence(int index)554 float DistanceMatrix::calculateRawDivergence(int index) {
555     float divergence = 0;
556     for (int i = 0; i < index; i++) {
557         divergence += rawMatrix[i][index];
558     }
559     for (int j = index; j < size; j++) {
560         divergence += rawMatrix[index][j];
561     }
562     return divergence;
563 }
564 
getTaxaName(int index)565 QString DistanceMatrix::getTaxaName(int index) {
566     QMap<QString, int>::iterator i;
567     for (i = index_map.begin(); i != index_map.end(); ++i) {
568         if (i.value() == index) {
569             return i.key();
570         }
571     }
572     return " ";
573 }
574 
printPhyTree(PhyTreeData * data)575 void DistanceMatrix::printPhyTree(PhyTreeData *data) {
576     data->print();
577 }
578 
printPhyNode(PhyNode * node,int tab,QList<PhyNode * > & nodes)579 void DistanceMatrix::printPhyNode(PhyNode *node, int tab, QList<PhyNode *> &nodes) {
580     if (node == 0 || nodes.contains(node)) {
581         return;
582     }
583     nodes.append(node);
584     for (int i = 0; i < tab; i++) {
585         std::cout << " ";
586     }
587     tab++;
588     std::cout << "name: " << node->getName().toLatin1().constData() << std::endl;
589     for (int i = 0; i < node->branchCount(); i++) {
590         PhyBranch *branch = node->getBranch(i);
591         printPhyNode(branch->node2, tab, nodes);
592     }
593 }
areTreesEqual(PhyTree * tree1,PhyTree * tree2)594 bool DistanceMatrix::areTreesEqual(PhyTree *tree1, PhyTree *tree2) {
595     QMap<QString, int> nodes1;
596     QMap<QString, int> nodes2;
597 
598     QList<PhyNode *> list1;
599     QList<PhyNode *> list2;
600 
601     QList<PhyBranch *> branches1;
602     QList<PhyBranch *> branches2;
603 
604     addNodeToList(list1, nodes1, branches1, tree1->data()->getRootNode());
605     addNodeToList(list2, nodes2, branches2, tree2->data()->getRootNode());
606 
607     QList<QString> keys1 = nodes1.keys();
608 
609     if (nodes1.count() != nodes2.count())
610         return false;
611 
612     for (int i = 0; i < keys1.size(); i++) {
613         QString name = keys1[i];
614         // int d1 = nodes1[name];
615         if (!nodes2.contains(name)) {
616             return false;
617         }
618 
619         if (!nodes2.contains(name)) {
620             return false;
621         }
622         /*int d2 = nodes2[name];
623         if(d1!=d2){
624         return false;
625         }*/
626     }
627 
628     return true;
629 }
630 
addNodeToList(QList<PhyNode * > & nodelist,QMap<QString,int> & nodemap,QList<PhyBranch * > & branches,PhyNode * node)631 void DistanceMatrix::addNodeToList(QList<PhyNode *> &nodelist, QMap<QString, int> &nodemap, QList<PhyBranch *> &branches, PhyNode *node) {
632     if ((node == 0) || (nodelist.contains(node))) {
633         return;
634     } else {
635         nodelist.append(node);
636         if (node->getName() != "ROOT" && node->getName() != "" && !node->getName().startsWith("___")) {
637             for (int i = 0; i < node->branchCount(); i++) {
638                 PhyBranch *branch = node->getBranch(i);
639                 if (branch->node2 == node) {
640                     int d = branch->distance;
641                     nodemap.insert(node->getName(), d);
642                 }
643             }
644         }
645         for (int i = 0; i < node->branchCount(); i++) {
646             PhyBranch *branch = node->getBranch(i);
647             if (!branches.contains(branch)) {
648                 branches.append(branch);
649                 addNodeToList(nodelist, nodemap, branches, branch->node2);
650             }
651         }
652     }
653 }
654 
switchNamesToAllNodes()655 void DistanceMatrix::switchNamesToAllNodes() {
656     QList<PhyNode *> nodes;
657     QList<PhyBranch *> branches;
658     QMap<QString, int> mmap;
659     addNodeToList(nodes, mmap, branches, treedata->getRootNode());
660     int size = nodes.size();
661 
662     for (int i = 0; i < size; i++) {
663         switchName(nodes[i]);
664     }
665 }
666 
getNewIndex(QString name,QPair<int,int> loc,QMap<QString,int> & old_map)667 int DistanceMatrix::getNewIndex(QString name, QPair<int, int> loc, QMap<QString, int> &old_map) {
668     int old_index = old_map[name];
669     if (old_index > loc.first && old_index > loc.second) {
670         int new_index = old_index - 2;
671         return new_index;
672     } else if (old_index > loc.first || old_index > loc.second) {
673         int new_index = old_index - 1;
674         return new_index;
675     }
676     return old_index;
677 }
printIndex()678 void DistanceMatrix::printIndex() {
679     for (int i = 0; i < index_map.size(); i++) {
680         QList<QString> names = index_map.keys(i);
681         std::cout << "Value :" << i << " Keys:";
682         for (int j = 0; j < names.size(); j++) {
683             std::cout << " " << names[j].toLatin1().constData();
684         }
685         std::cout << std::endl;
686     }
687     std::cout << std::endl;
688 }
dumpRawData()689 void DistanceMatrix::dumpRawData() {
690     dumpRawData(rawMatrix);
691 }
692 
isFiniteNumber(double x)693 inline bool isFiniteNumber(double x) {
694     return (x <= DBL_MAX && x >= -DBL_MAX);
695 }
696 
isValid()697 bool DistanceMatrix::isValid() {
698     int sz = rawMatrix.count();
699     int zeroCounter = 0;
700     // The matrix must be square and must not have infinite numbers
701     for (int i = 0; i < sz; ++i) {
702         int sz2 = rawMatrix[i].count();
703         if (sz2 != sz) {
704             return false;
705         }
706         for (int j = 0; j < sz; ++j) {
707             double value = rawMatrix[i][j];
708             if (!isFiniteNumber(value)) {
709                 return false;
710             }
711             if (value == 0) {
712                 ++zeroCounter;
713             }
714         }
715     }
716 
717     if (zeroCounter == sz * sz) {
718         // All numbers are zeroes!
719         return false;
720     }
721 
722     return true;
723 }
724 
725 }  // namespace U2
726