1LoadFunctionLibrary("../IOFunctions.bf"); 2LoadFunctionLibrary("../all-terms.bf"); 3LoadFunctionLibrary("../models/frequencies.bf"); 4 5 6/** 7 * Compute all pairwise distances between sequences in a data set 8 * @name distances.nucleotide.tn93 9 * @param filter {String} - id of dataset 10 * @param freqs {Matrix/null} - if not null, use these nucleotide frequencies 11 * @param options{Dict/null} - options for ambiguity treatment 12 * @returns {Matrix} r - pairwise TN93 distances 13 */ 14lfunction distances.nucleotide.tn93 (filter, freqs, options) { 15 if (null == freqs) { 16 freqs = (frequencies._aux.empirical.singlechar ({}, null, filter))[^"terms.efv_estimate"]; 17 } 18 19 fY = freqs[1] + freqs[3]; 20 fR = 1 - fY; 21 22 if (Min (freqs, 0) == 0) { 23 K2P = TRUE; 24 } 25 else { 26 K2P = FALSE; 27 K1 = 2*freqs[0]*freqs[2]/fR; 28 K2 = 2*freqs[1]*freqs[3]/fY; 29 K3 = 2*(fR*fY-freqs[0]*freqs[2]*fY/fR-freqs[1]*freqs[3]*fR/fY); 30 } 31 32 sequence_count = ^(filter + ".species"); 33 distances = {sequence_count,sequence_count}; 34 resolution = "RESOLVE_AMBIGUITIES"; 35 if (utility.Has (options, "ambigs","String")) { 36 resolution = options["ambigs"]; 37 } 38 39 for (s1 = 0; s1 < sequence_count; s1 += 1) { 40 for (s2 = s1 + 1; s2 < sequence_count; s2 += 1) { 41 ExecuteCommands ("GetDataInfo (count, ^filter, s1, s2, `resolution`)"); 42 totalSitesCompared = +count; 43 d = 1000.; 44 if (totalSitesCompared > 0) { 45 count = count * (1/totalSitesCompared); 46 AG = count[0][2] + count[2][0]; 47 CT = count[1][3] + count[3][1]; 48 transversions = 1 - AG - CT - count[0][0] - count[1][1] - count[2][2] - count[3][3]; 49 if (K2P) { 50 d1 = 1-2*(AG+CT)-transversions; 51 d2 = 1-2*transversions; 52 53 if (d1 >0 && d2>0) { 54 d = -(0.5*Log(d1)+.25*Log(d2)); 55 } 56 57 } else { 58 d1 = 1-AG/K1-0.5*transversions/fR; 59 d2 = 1-CT/K2-0.5*transversions/fY; 60 d3 = 1-0.5*transversions/fR/fY; 61 if (d1>0 && d2 >0 && d3 >0) { 62 d = -K1*Log(d1)-K2*Log(d2)-K3*Log(d3); 63 } 64 } 65 } 66 distances [s1][s2] = d; 67 distances [s2][s1] = d; 68 } 69 } 70 71 72 return distances; 73} 74 75/** 76 * Compute all pairwise percent distances between sequences in a data set 77 * @name distances.nucleotide.p_distance 78 * @param filter {String} - id of dataset 79 * @param options {Dict/null} - options for ambiguity treatment 80 * @returns {Matrix} r - pairwise TN93 distances 81 */ 82lfunction distances.p_distance (filter, options) { 83 sequence_count = ^(filter + ".species"); 84 distances = {sequence_count,sequence_count}; 85 resolution = "RESOLVE_AMBIGUITIES"; 86 if (utility.Has (options, "ambigs","String")) { 87 resolution = options["ambigs"]; 88 } 89 90 if (sequence_count) { 91 ExecuteCommands ("GetDataInfo (count, ^filter, 0, 0, `resolution`)"); 92 off_diag = count["_MATRIX_ELEMENT_COLUMN_==_MATRIX_ELEMENT_ROW_"]; 93 } 94 95 expr := "GetDataInfo (count, ^filter, s1, s2, `resolution`)"; 96 97 for (s1 = 0; s1 < sequence_count; s1 += 1) { 98 for (s2 = s1 + 1; s2 < sequence_count; s2 += 1) { 99 ExecuteCommands (expr); 100 totalSitesCompared = +count; 101 if (totalSitesCompared != 0) { 102 identicalSites = +count[off_diag]; 103 d = (totalSitesCompared-identicalSites) / totalSitesCompared; 104 } else { 105 d = 1000; 106 } 107 distances [s1][s2] = d; 108 distances [s2][s1] = d; 109 } 110 } 111 112 return distances; 113} 114 115