package jasper;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;

/**
 * Sparse matrix of pairwise sketch comparisons, stored as one list of
 * {@link Comparison} objects per query node ID.
 *
 * Rows are indexed by the node IDs obtained from the {@link SparseTree}
 * supplied at construction time.
 */
public class SparseSimilarityMatrix {

    /*--------------------------------------------------------------*/
    /*----------------        Initialization        ----------------*/
    /*--------------------------------------------------------------*/

    /**
     * Takes in a file of sketch similarity percentages from SketchCompare
     * and builds a sparse matrix object containing each percentage.
     *
     * The file is read once. Lines starting with '#' are treated as header
     * lines (the last one seen is kept, split on tab). Empty lines are
     * skipped. Every other line is split on tab: column 0 is the query name,
     * column 1 the reference name and column 2 the similarity. A comparison
     * is stored only when the tree contains both names.
     *
     * @param inputFile The file containing pairwise comparisons of each sketch
     * @param tree_ The tree used to check names and map them to node IDs
     * @throws FileNotFoundException If the input file cannot be opened
     * @throws IOException If reading the input file fails
     */
    public SparseSimilarityMatrix(String inputFile, SparseTree tree_) throws FileNotFoundException, IOException {

        //Assigns the input tree object to the tree variable.
        tree = tree_;

        //Keep the file name of the comparison file.
        in = inputFile;

        //Get the total number of organisms in the tree.
        orgCount = tree.getOrgCount();

        //Allocate one row more than orgCount.
        //NOTE(review): presumably node IDs can be as large as orgCount - confirm against SparseTree.
        //Generic array creation is unchecked by nature; every slot is filled
        //with an ArrayList<Comparison> right below, so the cast is safe.
        @SuppressWarnings("unchecked")
        ArrayList<Comparison>[] rows = new ArrayList[orgCount + 1];
        for (int i = 0; i < rows.length; i++) {
            rows[i] = new ArrayList<Comparison>();
        }
        sparseMatrix = rows;

        //Read the file a single time, handling header and data lines together.
        try (BufferedReader br = new BufferedReader(new FileReader(in))) {
            String line;

            //readLine returns null at end of file.
            while ((line = br.readLine()) != null) {

                //An empty line (e.g. a trailing newline) carries no comparison.
                if (line.isEmpty()) {
                    continue;
                }

                //If line is the header line, split and assign to variable.
                //May be used when header becomes more complex.
                if (line.startsWith("#")) {
                    header = line.split("\t");
                    continue;
                }

                //If not a header line, split on tab.
                String[] data = line.split("\t");

                //Column 0 is query name.
                String queryName = data[0];

                //Column 1 is reference name.
                String refName = data[1];

                //Column 2 is the similarity percentage.
                double similarity = Double.parseDouble(data[2]);

                linesProcessed++;

                //Only keep comparisons whose two names are both known to the tree.
                if (tree.containsName(queryName) && tree.containsName(refName)) {

                    //Get the positions assigned to both organisms.
                    int queryPos = nameToNodeId(queryName);
                    int refPos = nameToNodeId(refName);

                    //Add the comparison to the row of the query organism.
                    sparseMatrix[queryPos].add(new Comparison(queryPos, refPos, similarity));
                }
            }
        }
    }

    /**
     * Method for taking the node name and returning the node ID value.
     *
     * @param orgName the organism node name (String).
     * @return int The node ID of the organism name taken as input.
     */
    public int nameToNodeId(String orgName) {

        //Get the node associated with the input name.
        TreeNode org = tree.getNode(orgName);

        //Asserts the org node is in the tree (only checked when assertions are enabled).
        assert (org != null) : orgName;

        //Return the int node ID.
        return org.nodeId;
    }

    /**
     * Prints out the entire matrix, one row per line, with each comparison
     * in a row followed by a single space.
     * Impractical in cases of large input datasets.
     *
     * @return The text form of every row of the matrix.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        for (ArrayList<Comparison> row : sparseMatrix) {
            for (Comparison comparison : row) {
                sb.append(comparison).append(' ');
            }
            sb.append('\n');
        }
        return sb.toString();
    }

    //TODO: This method is slow and doesn't work, need something better.
//  /**
//   * Returns the similarity of two specified organisms.
//   * Both organisms must have been compared using SketchCompare.
//   *
//   * @param org1 The Name of an organism.
//   * @param org2 The name of a second organism.
//   * @return similarity The Double percentage similarity between the two sketches.
//   */
//  public Comparison getComparison(String org1, String org2) {
//      int orgName1 = nameToNodeId(org1);
//      int orgName2 = nameToNodeId(org2);
//
//      return sparseMatrix[orgName1].get(orgName2);
//  }

    /**
     * Returns the organism count obtained from the tree at construction.
     * Note that the matrix itself holds one row more than this value.
     *
     * @return The organism count.
     */
    public int getSize() {
        return orgCount;
    }

    /**
     * Returns the row of comparisons stored for the given organism.
     * The returned list is the internal row itself, not a copy.
     *
     * NOTE(review): if the name is not present in the tree's nodeMap the
     * lookup presumably yields null and this call fails with a
     * NullPointerException - confirm against SparseTree.
     *
     * @param orgName The organism node name.
     * @return The list of comparisons whose query is the given organism.
     */
    public ArrayList<Comparison> getOrgRow(String orgName) {
        int rowNum = tree.nodeMap.get(orgName).nodeId;
        return sparseMatrix[rowNum];
    }

    /*--------------------------------------------------------------*/
    /*----------------            Fields            ----------------*/
    /*--------------------------------------------------------------*/

    /**
     * A SparseTree object that contains taxonomic information relevant to this matrix.
     */
    final SparseTree tree;

    /**
     * An array of ArrayLists containing comparisons between nodes in the tree,
     * indexed by the node ID of the query organism.
     */
    private final ArrayList<Comparison>[] sparseMatrix;

    /**
     * The number of sketches being analyzed.
     */
    private int orgCount;

    /**
     * ArrayList that will hold the lines of the input file.
     * Not populated by this class.
     */
    ArrayList<String> lines = new ArrayList<String>();

    /**
     * Header line of the comparison input file, split on tab.
     */
    private String[] header;

    /**
     * Input file name.
     */
    private String in = null;

    /**
     * Number of data lines (non-empty, non-header) parsed from the sketch comparison file.
     */
    private long linesProcessed = 0;

}