1 /*
2 * shhher.cpp
3 * Mothur
4 *
5 * Created by Pat Schloss on 12/27/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
7 *
8 */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
setParameters()13 vector<string> ShhherCommand::setParameters(){
14 try {
15 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
16 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
17 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
18 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
19 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
20 CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
21 CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
22 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
23 CommandParameter porder("order", "Multiple", "A-B-I", "A", "", "", "","",false,false, true); parameters.push_back(porder); CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
24 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26
27 vector<string> tempOutNames;
28 outputTypes["fasta"] = tempOutNames;
29 outputTypes["name"] = tempOutNames;
30 outputTypes["group"] = tempOutNames;
31 outputTypes["counts"] = tempOutNames;
32 outputTypes["qfile"] = tempOutNames;
33
34 abort = false; calledHelp = false;
35
36 vector<string> myArray;
37 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
38 return myArray;
39 }
40 catch(exception& e) {
41 m->errorOut(e, "ShhherCommand", "setParameters");
42 exit(1);
43 }
44 }
45 //**********************************************************************************************************************
getHelpString()46 string ShhherCommand::getHelpString(){
47 try {
48 string helpString = "";
49 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
50 helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
51 helpString += "The flow parameter is used to input your flow file.\n";
52 helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
53 helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
54 helpString += "The order parameter options are A, B or I. Default=A. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n";
55 return helpString;
56 }
57 catch(exception& e) {
58 m->errorOut(e, "ShhherCommand", "getHelpString");
59 exit(1);
60 }
61 }
62 //**********************************************************************************************************************
getOutputPattern(string type)63 string ShhherCommand::getOutputPattern(string type) {
64 try {
65 string pattern = "";
66
67 if (type == "fasta") { pattern = "[filename],shhh.fasta"; }
68 else if (type == "name") { pattern = "[filename],shhh.names"; }
69 else if (type == "group") { pattern = "[filename],shhh.groups"; }
70 else if (type == "counts") { pattern = "[filename],shhh.counts"; }
71 else if (type == "qfile") { pattern = "[filename],shhh.qual"; }
72 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true); }
73
74 return pattern;
75 }
76 catch(exception& e) {
77 m->errorOut(e, "ShhherCommand", "getOutputPattern");
78 exit(1);
79 }
80 }
81 //**********************************************************************************************************************
ShhherCommand(string option)82 ShhherCommand::ShhherCommand(string option) : Command() {
83 try {
84 if(option == "help") { help(); abort = true; calledHelp = true; }
85 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
86 else if(option == "category") { abort = true; calledHelp = true; }
87
88 else {
89 OptionParser parser(option, setParameters());
90 map<string,string> parameters = parser.getParameters();
91
92 ValidParameters validParameter;
93
94
95 //check for required parameters
96 flowFileName = validParameter.validFile(parameters, "flow");
97 flowFilesFileName = validParameter.validFile(parameters, "file");
98 if (flowFileName == "not found" && flowFilesFileName == "not found") {
99 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.\n"); abort = true;
100 }
101 else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
102
103 if(flowFileName != "not found"){
104 compositeFASTAFileName = "";
105 compositeNamesFileName = "";
106 }
107 else{
108 ofstream temp;
109
110 string thisoutputDir = outputdir;
111 if (outputdir == "") { thisoutputDir = util.hasPath(flowFilesFileName); }
112
113 //we want to rip off .files, and also .flow if its there
114 string fileroot = util.getRootName(util.getSimpleName(flowFilesFileName));
115 if (fileroot[fileroot.length()-1] == '.') { fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
116 string extension = util.getExtension(fileroot);
117 if (extension == ".flow") { fileroot = util.getRootName(fileroot); }
118 else { fileroot += "."; } //add back if needed
119
120 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
121 util.openOutputFile(compositeFASTAFileName, temp);
122 temp.close();
123
124 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
125 util.openOutputFile(compositeNamesFileName, temp);
126 temp.close();
127 }
128
129 if(flowFilesFileName != "not found"){
130 ifstream flowFilesFile;
131 util.openInputFile(flowFilesFileName, flowFilesFile);
132 while(flowFilesFile){
133 string fName = util.getline(flowFilesFile); util.gobble(flowFilesFile);
134
135 //check to make sure both are able to be opened
136 bool ableToOpen = util.checkLocations(fName, current->getLocations());
137 if (ableToOpen) {
138 if (util.isBlank(fName)) { m->mothurOut("[WARNING]: " + fName + " is blank, disregarding.\n"); }
139 else { flowFileVector.push_back(fName); }
140 }else { m->mothurOut("Unable to open " + fName + ". Disregarding.\n"); }
141 }
142 flowFilesFile.close();
143 if (flowFileVector.size() == 0) { m->mothurOut("[ERROR]: no valid files.\n"); abort = true; }
144 }
145 else{
146 if (outputdir == "") { outputdir = util.hasPath(flowFileName); }
147 flowFileVector.push_back(flowFileName);
148 }
149
150 //check for optional parameter and set defaults
151 // ...at some point should added some additional type checking...
152 string temp;
153 temp = validParameter.validFile(parameters, "lookup");
154 if (temp == "not found") {
155 string path = current->getProgramPath();
156
157 #if defined NON_WINDOWS
158 path += "lookupFiles/";
159 #else
160 path += "lookupFiles\\";
161 #endif
162 lookupFileName = util.getFullPathName(path) + "LookUp_Titanium.pat";
163
164 //check to make sure both are able to be opened
165 bool ableToOpen = util.checkLocations(lookupFileName, current->getLocations());
166 if (ableToOpen) {
167 if (util.isBlank(lookupFileName)) { m->mothurOut("[ERROR]: " + lookupFileName + " is blank, aborting.\n"); abort=true; }
168 }else { m->mothurOut("[ERROR]: Unable to open " + lookupFileName + ".\n"); abort=true; }
169 }
170 else if(temp == "not open") {
171
172 lookupFileName = validParameter.validPath(parameters, "lookup");
173
174 //check to make sure both are able to be opened
175 bool ableToOpen = util.checkLocations(lookupFileName, current->getLocations());
176 if (ableToOpen) {
177 if (util.isBlank(lookupFileName)) { m->mothurOut("[ERROR]: " + lookupFileName + " is blank, aborting.\n"); abort=true; }
178 }else { m->mothurOut("[ERROR]: Unable to open " + lookupFileName + ".\n"); abort=true; }
179
180 }else { lookupFileName = temp; }
181
182 temp = validParameter.valid(parameters, "cutoff"); if (temp == "not found"){ temp = "0.01"; }
183 util.mothurConvert(temp, cutoff);
184
185 temp = validParameter.valid(parameters, "mindelta"); if (temp == "not found"){ temp = "0.000001"; }
186 util.mothurConvert(temp, minDelta);
187
188 temp = validParameter.valid(parameters, "maxiter"); if (temp == "not found"){ temp = "1000"; }
189 util.mothurConvert(temp, maxIters);
190
191 temp = validParameter.valid(parameters, "large"); if (temp == "not found"){ temp = "0"; }
192 util.mothurConvert(temp, largeSize);
193 if (largeSize != 0) { large = true; }
194 else { large = false; }
195 if (largeSize < 0) { m->mothurOut("The value of the large cannot be negative.\n"); }
196
197 temp = validParameter.valid(parameters, "sigma"); if (temp == "not found") { temp = "60"; }
198 util.mothurConvert(temp, sigma);
199
200 temp = validParameter.valid(parameters, "order"); if (temp == "not found"){ temp = "A"; }
201 if (temp.length() > 1) { m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n"); abort=true;
202 }
203 else {
204 if (toupper(temp[0]) == 'A') { flowOrder = "TACG"; }
205 else if(toupper(temp[0]) == 'B'){
206 flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC"; }
207 else if(toupper(temp[0]) == 'I'){
208 flowOrder = "TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC"; }
209 else {
210 m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n"); abort=true;
211 }
212 }
213
214
215 }
216
217 }
218 catch(exception& e) {
219 m->errorOut(e, "ShhherCommand", "ShhherCommand");
220 exit(1);
221 }
222 }
223 //**********************************************************************************************************************
224
execute()225 int ShhherCommand::execute(){
226 try {
227 if (abort) { if (calledHelp) { return 0; } return 2; }
228
229 getSingleLookUp(); if (m->getControl_pressed()) { return 0; }
230 getJointLookUp(); if (m->getControl_pressed()) { return 0; }
231
232 driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
233
234 if(compositeFASTAFileName != ""){
235 outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
236 outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
237 }
238
239 m->mothurOut("\nOutput File Names: \n");
240 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i] +"\n"); } m->mothurOutEndLine();
241
242 return 0;
243 }
244 catch(exception& e) {
245 m->errorOut(e, "ShhherCommand", "execute");
246 exit(1);
247 }
248 }
249 //********************************************************************************************************************
250 //sorts biggest to smallest
compareFileSizes(string left,string right)251 inline bool compareFileSizes(string left, string right){
252
253 FILE * pFile;
254 long leftsize = 0;
255
256 //get num bytes in file
257 string filename = left;
258 pFile = fopen (filename.c_str(),"rb");
259 string error = "Error opening " + filename;
260 if (pFile==NULL) perror (error.c_str());
261 else{
262 fseek (pFile, 0, SEEK_END);
263 leftsize=ftell (pFile);
264 fclose (pFile);
265 }
266
267 FILE * pFile2;
268 long rightsize = 0;
269
270 //get num bytes in file
271 filename = right;
272 pFile2 = fopen (filename.c_str(),"rb");
273 error = "Error opening " + filename;
274 if (pFile2==NULL) perror (error.c_str());
275 else{
276 fseek (pFile2, 0, SEEK_END);
277 rightsize=ftell (pFile2);
278 fclose (pFile2);
279 }
280
281 return (leftsize > rightsize);
282 }
283 /**************************************************************************************************/
284
parseFlowFiles(string filename)285 vector<string> ShhherCommand::parseFlowFiles(string filename){
286 try {
287 vector<string> files;
288 int count = 0;
289
290 ifstream in;
291 util.openInputFile(filename, in);
292
293 int thisNumFLows = 0;
294 in >> thisNumFLows; util.gobble(in);
295
296 while (!in.eof()) {
297 if (m->getControl_pressed()) { break; }
298
299 ofstream out;
300 string outputFileName = filename + toString(count) + ".temp";
301 util.openOutputFile(outputFileName, out);
302 out << thisNumFLows << endl;
303 files.push_back(outputFileName);
304
305 int numLinesWrote = 0;
306 for (int i = 0; i < largeSize; i++) {
307 if (in.eof()) { break; }
308 string line = util.getline(in); util.gobble(in);
309 out << line << endl;
310 numLinesWrote++;
311 }
312 out.close();
313
314 if (numLinesWrote == 0) { util.mothurRemove(outputFileName); files.pop_back(); }
315 count++;
316 }
317 in.close();
318
319 if (m->getControl_pressed()) { for (int i = 0; i < files.size(); i++) { util.mothurRemove(files[i]); } files.clear(); }
320
321 m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n");
322
323 return files;
324 }
325 catch(exception& e) {
326 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
327 exit(1);
328 }
329 }
330 /**************************************************************************************************/
331
driver(vector<string> filenames,string thisCompositeFASTAFileName,string thisCompositeNamesFileName)332 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
333 try {
334
335 int numCompleted = 0;
336
337 for(int i=0;i<filenames.size();i++){
338
339 if (m->getControl_pressed()) { break; }
340
341 vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
342 if (large) { theseFlowFileNames = parseFlowFiles(filenames[i]); }
343
344 if (m->getControl_pressed()) { break; }
345
346 double begClock = clock();
347 unsigned long long begTime;
348
349 string fileNameForOutput = filenames[i];
350
351 for (int g = 0; g < theseFlowFileNames.size(); g++) {
352
353 string flowFileName = theseFlowFileNames[g];
354 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
355 m->mothurOut("Reading flowgrams...\n");
356
357 vector<string> seqNameVector;
358 vector<int> lengths;
359 vector<short> flowDataIntI;
360 vector<double> flowDataPrI;
361 map<string, int> nameMap;
362 vector<short> uniqueFlowgrams;
363 vector<int> uniqueCount;
364 vector<int> mapSeqToUnique;
365 vector<int> mapUniqueToSeq;
366 vector<int> uniqueLengths;
367 int numFlowCells;
368
369 if (m->getDebug()) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
370 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
371
372 if (m->getControl_pressed()) { break; }
373
374 m->mothurOut("Identifying unique flowgrams...\n");
375 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
376
377 if (m->getControl_pressed()) { break; }
378
379 m->mothurOut("Calculating distances between flowgrams...\n");
380 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
381 begTime = time(NULL);
382
383
384 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
385
386 m->mothurOutEndLine();
387 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
388
389
390 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
391 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
392
393 if (m->getControl_pressed()) { break; }
394
395 m->mothurOut("\nClustering flowgrams...\n");
396 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
397 cluster(listFileName, distFileName, namesFileName);
398
399 if (m->getControl_pressed()) { break; }
400
401 vector<int> otuData;
402 vector<int> cumNumSeqs;
403 vector<int> nSeqsPerOTU;
404 vector<vector<int> > aaP; //tMaster->aanP: each row is a different otu / each col contains the sequence indices
405 vector<vector<int> > aaI; //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
406 vector<int> seqNumber; //tMaster->anP: the sequence id number sorted by OTU
407 vector<int> seqIndex; //tMaster->anI; the index that corresponds to seqNumber
408
409
410 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
411
412 if (m->getControl_pressed()) { break; }
413
414 util.mothurRemove(distFileName);
415 util.mothurRemove(namesFileName);
416 util.mothurRemove(listFileName);
417
418 vector<double> dist; //adDist - distance of sequences to centroids
419 vector<short> change; //did the centroid sequence change? 0 = no; 1 = yes
420 vector<int> centroids; //the representative flowgram for each cluster m
421 vector<double> weight;
422 vector<double> singleTau; //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
423 vector<int> nSeqsBreaks;
424 vector<int> nOTUsBreaks;
425
426 if (m->getDebug()) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
427
428 dist.assign(numSeqs * numOTUs, 0);
429 change.assign(numOTUs, 1);
430 centroids.assign(numOTUs, -1);
431 weight.assign(numOTUs, 0);
432 singleTau.assign(numSeqs, 1.0);
433
434 nSeqsBreaks.assign(2, 0);
435 nOTUsBreaks.assign(2, 0);
436
437 nSeqsBreaks[0] = 0;
438 nSeqsBreaks[1] = numSeqs;
439 nOTUsBreaks[1] = numOTUs;
440
441 if (m->getDebug()) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
442
443 if (m->getControl_pressed()) { break; }
444
445 double maxDelta = 0;
446 int iter = 0;
447
448 begClock = clock();
449 begTime = time(NULL);
450
451 m->mothurOut("\nDenoising flowgrams...\n");
452 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
453
454 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
455
456 if (m->getControl_pressed()) { break; }
457
458 double cycClock = clock();
459 unsigned long long cycTime = time(NULL);
460 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
461
462 if (m->getControl_pressed()) { break; }
463
464 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
465
466 if (m->getControl_pressed()) { break; }
467
468 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
469
470 if (m->getControl_pressed()) { break; }
471
472 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
473
474 if (m->getControl_pressed()) { break; }
475
476 checkCentroids(numOTUs, centroids, weight);
477
478 if (m->getControl_pressed()) { break; }
479
480 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU, dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
481
482 if (m->getControl_pressed()) { break; }
483
484 iter++;
485
486 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
487
488 }
489
490 if (m->getControl_pressed()) { break; }
491
492 m->mothurOut("\nFinalizing...\n");
493 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
494
495 if (m->getDebug()) { m->mothurOut("[DEBUG]: done fill().\n"); }
496
497 if (m->getControl_pressed()) { break; }
498
499 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
500
501 if (m->getDebug()) { m->mothurOut("[DEBUG]: done setOTUs().\n"); }
502
503 if (m->getControl_pressed()) { break; }
504
505 vector<int> otuCounts(numOTUs, 0);
506 for(int j=0;j<numSeqs;j++) { otuCounts[otuData[j]]++; }
507
508 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
509
510 if (m->getDebug()) { m->mothurOut("[DEBUG]: done calcCentroidsDriver().\n"); }
511
512 if (m->getControl_pressed()) { break; }
513
514 if ((large) && (g == 0)) { flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
515 string thisOutputDir = outputdir;
516 if (outputdir == "") { thisOutputDir = util.hasPath(flowFileName); }
517 map<string, string> variables;
518 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(flowFileName));
519 string qualityFileName = getOutputFileName("qfile",variables);
520 string fastaFileName = getOutputFileName("fasta",variables);
521 string nameFileName = getOutputFileName("name",variables);
522 string otuCountsFileName = getOutputFileName("counts",variables);
523 string fileRoot = util.getRootName(util.getSimpleName(flowFileName));
524 int pos = fileRoot.find_first_of('.');
525 string fileGroup = fileRoot;
526 if (pos != string::npos) { fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1))); }
527 string groupFileName = getOutputFileName("group",variables);
528
529
530 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->getControl_pressed()) { break; }
531 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->getControl_pressed()) { break; }
532 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU); if (m->getControl_pressed()) { break; }
533 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI); if (m->getControl_pressed()) { break; }
534 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector); if (m->getControl_pressed()) { break; }
535
536 if (large) {
537 if (g > 0) {
538 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(theseFlowFileNames[0]));
539 util.appendFiles(qualityFileName, getOutputFileName("qfile",variables));
540 util.mothurRemove(qualityFileName);
541 util.appendFiles(fastaFileName, getOutputFileName("fasta",variables));
542 util.mothurRemove(fastaFileName);
543 util.appendFiles(nameFileName, getOutputFileName("name",variables));
544 util.mothurRemove(nameFileName);
545 util.appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
546 util.mothurRemove(otuCountsFileName);
547 util.appendFiles(groupFileName, getOutputFileName("group",variables));
548 util.mothurRemove(groupFileName);
549 }
550 util.mothurRemove(theseFlowFileNames[g]);
551 }
552 }
553
554 numCompleted++;
555 m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
556 }
557
558 if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
559
560 return numCompleted;
561
562 }catch(exception& e) {
563 m->errorOut(e, "ShhherCommand", "driver");
564 exit(1);
565 }
566 }
567
568 /**************************************************************************************************/
getFlowData(string filename,vector<string> & thisSeqNameVector,vector<int> & thisLengths,vector<short> & thisFlowDataIntI,map<string,int> & thisNameMap,int & numFlowCells)569 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
570 try{
571
572 ifstream flowFile;
573
574 util.openInputFile(filename, flowFile);
575
576 string seqName;
577 int currentNumFlowCells;
578 float intensity;
579 thisSeqNameVector.clear();
580 thisLengths.clear();
581 thisFlowDataIntI.clear();
582 thisNameMap.clear();
583
584 string numFlowTest;
585 flowFile >> numFlowTest;
586
587 if (!util.isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?\n"); exit(1); }
588 else { convert(numFlowTest, numFlowCells); }
589
590 if (m->getDebug()) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
591 int index = 0;//pcluster
592 while(!flowFile.eof()){
593
594 if (m->getControl_pressed()) { break; }
595
596 flowFile >> seqName >> currentNumFlowCells;
597
598 thisLengths.push_back(currentNumFlowCells);
599
600 thisSeqNameVector.push_back(seqName);
601 thisNameMap[seqName] = index++;//pcluster
602
603 if (m->getDebug()) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
604
605 for(int i=0;i<numFlowCells;i++){
606 flowFile >> intensity;
607 if(intensity > 9.99) { intensity = 9.99; }
608 int intI = int(100 * intensity + 0.0001);
609 thisFlowDataIntI.push_back(intI);
610 }
611 util.gobble(flowFile);
612 }
613 flowFile.close();
614
615 int numSeqs = thisSeqNameVector.size();
616
617 for(int i=0;i<numSeqs;i++){
618
619 if (m->getControl_pressed()) { break; }
620
621 int iNumFlowCells = i * numFlowCells;
622 for(int j=thisLengths[i];j<numFlowCells;j++){
623 thisFlowDataIntI[iNumFlowCells + j] = 0;
624 }
625 }
626
627 return numSeqs;
628
629 }
630 catch(exception& e) {
631 m->errorOut(e, "ShhherCommand", "getFlowData");
632 exit(1);
633 }
634 }
635 /**************************************************************************************************/
636
flowDistParentFork(int numFlowCells,string distFileName,int stopSeq,vector<int> & mapUniqueToSeq,vector<int> & mapSeqToUnique,vector<int> & lengths,vector<double> & flowDataPrI,vector<short> & flowDataIntI)637 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
638 try{
639
640 ostringstream outStream;
641 outStream.setf(ios::fixed, ios::floatfield);
642 outStream.setf(ios::dec, ios::basefield);
643 outStream.setf(ios::showpoint);
644 outStream.precision(6);
645
646 int begTime = time(NULL);
647 double begClock = clock();
648
649 for(int i=0;i<stopSeq;i++){
650
651 if (m->getControl_pressed()) { break; }
652
653 for(int j=0;j<i;j++){
654 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
655
656 if(flowDistance < 1e-6){
657 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
658 }
659 else if(flowDistance <= cutoff){
660 outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
661 }
662 }
663 if(i % 100 == 0){
664 m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - begTime));
665 m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
666 }
667 }
668
669 ofstream distFile(distFileName.c_str());
670 distFile << outStream.str();
671 distFile.close();
672
673 if (m->getControl_pressed()) {}
674 else {
675 m->mothurOutJustToScreen(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
676 m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
677 }
678
679 return 0;
680 }
681 catch(exception& e) {
682 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
683 exit(1);
684 }
685 }
686 /**************************************************************************************************/
687
calcPairwiseDist(int numFlowCells,int seqA,int seqB,vector<int> & mapSeqToUnique,vector<int> & lengths,vector<double> & flowDataPrI,vector<short> & flowDataIntI)688 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
689 try{
690 int minLength = lengths[mapSeqToUnique[seqA]];
691 if(lengths[seqB] < minLength){ minLength = lengths[mapSeqToUnique[seqB]]; }
692
693 int ANumFlowCells = seqA * numFlowCells;
694 int BNumFlowCells = seqB * numFlowCells;
695
696 float dist = 0;
697
698 for(int i=0;i<minLength;i++){
699
700 if (m->getControl_pressed()) { break; }
701
702 int flowAIntI = flowDataIntI[ANumFlowCells + i];
703 float flowAPrI = flowDataPrI[ANumFlowCells + i];
704
705 int flowBIntI = flowDataIntI[BNumFlowCells + i];
706 float flowBPrI = flowDataPrI[BNumFlowCells + i];
707 dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
708 }
709
710 dist /= (float) minLength;
711 return dist;
712 }
713 catch(exception& e) {
714 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
715 exit(1);
716 }
717 }
718
719 /**************************************************************************************************/
720
getUniques(int numSeqs,int numFlowCells,vector<short> & uniqueFlowgrams,vector<int> & uniqueCount,vector<int> & uniqueLengths,vector<int> & mapSeqToUnique,vector<int> & mapUniqueToSeq,vector<int> & lengths,vector<double> & flowDataPrI,vector<short> & flowDataIntI)721 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
722 try{
723 int numUniques = 0;
724 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
725 uniqueCount.assign(numSeqs, 0); // anWeights
726 uniqueLengths.assign(numSeqs, 0);
727 mapSeqToUnique.assign(numSeqs, -1);
728 mapUniqueToSeq.assign(numSeqs, -1);
729
730 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
731
732 for(int i=0;i<numSeqs;i++){
733
734 if (m->getControl_pressed()) { break; }
735
736 int index = 0;
737
738 vector<short> current(numFlowCells);
739 for(int j=0;j<numFlowCells;j++){
740 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
741 }
742
743 for(int j=0;j<numUniques;j++){
744 int offset = j * numFlowCells;
745 bool toEnd = 1;
746
747 int shorterLength;
748 if(lengths[i] < uniqueLengths[j]) { shorterLength = lengths[i]; }
749 else { shorterLength = uniqueLengths[j]; }
750
751 for(int k=0;k<shorterLength;k++){
752 if(current[k] != uniqueFlowgrams[offset + k]){
753 toEnd = 0;
754 break;
755 }
756 }
757
758 if(toEnd){
759 mapSeqToUnique[i] = j;
760 uniqueCount[j]++;
761 index = j;
762 if(lengths[i] > uniqueLengths[j]) { uniqueLengths[j] = lengths[i]; }
763 break;
764 }
765 index++;
766 }
767
768 if(index == numUniques){
769 uniqueLengths[numUniques] = lengths[i];
770 uniqueCount[numUniques] = 1;
771 mapSeqToUnique[i] = numUniques;//anMap
772 mapUniqueToSeq[numUniques] = i;//anF
773
774 for(int k=0;k<numFlowCells;k++){
775 uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
776 uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
777 }
778
779 numUniques++;
780 }
781 }
782 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
783 uniqueLengths.resize(numUniques);
784
785 flowDataPrI.resize(numSeqs * numFlowCells, 0);
786 for(int i=0;i<flowDataPrI.size();i++) { if (m->getControl_pressed()) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]); }
787
788 return numUniques;
789 }
790 catch(exception& e) {
791 m->errorOut(e, "ShhherCommand", "getUniques");
792 exit(1);
793 }
794 }
795 /**************************************************************************************************/
createNamesFile(int numSeqs,int numUniques,string filename,vector<string> & seqNameVector,vector<int> & mapSeqToUnique,vector<int> & mapUniqueToSeq)796 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
797 try{
798
799 vector<string> duplicateNames(numUniques, "");
800 for(int i=0;i<numSeqs;i++){
801 duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
802 }
803
804 ofstream nameFile;
805 util.openOutputFile(filename, nameFile);
806
807 for(int i=0;i<numUniques;i++){
808
809 if (m->getControl_pressed()) { break; }
810
811 // nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
812 nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
813 }
814
815 nameFile.close();
816
817 return 0;
818 }
819 catch(exception& e) {
820 m->errorOut(e, "ShhherCommand", "createNamesFile");
821 exit(1);
822 }
823 }
824 //**********************************************************************************************************************
825
cluster(string filename,string distFileName,string namesFileName)826 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
827 try {
828
829 ReadMatrix* read = new ReadColumnMatrix(distFileName);
830 read->setCutoff(cutoff);
831
832 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
833 clusterNameMap->readMap();
834 read->read(clusterNameMap);
835
836 ListVector* list = read->getListVector();
837 SparseDistanceMatrix* matrix = read->getDMatrix();
838
839 delete read;
840 delete clusterNameMap;
841
842 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
843
844 float adjust = -1.0;
845 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest", adjust);
846 string tag = cluster->getTag();
847
848 double clusterCutoff = cutoff;
849 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
850
851 if (m->getControl_pressed()) { break; }
852
853 cluster->update(clusterCutoff);
854 }
855
856 list->setLabel(toString(cutoff));
857
858 ofstream listFile;
859 util.openOutputFile(filename, listFile);
860 list->print(listFile, true);
861 listFile.close();
862
863 delete matrix; delete cluster; delete rabund; delete list;
864
865 return 0;
866 }
867 catch(exception& e) {
868 m->errorOut(e, "ShhherCommand", "cluster");
869 exit(1);
870 }
871 }
872 /**************************************************************************************************/
873
getOTUData(int numSeqs,string fileName,vector<int> & otuData,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<vector<int>> & aaP,vector<vector<int>> & aaI,vector<int> & seqNumber,vector<int> & seqIndex,map<string,int> & nameMap)874 int ShhherCommand::getOTUData(int numSeqs, string fileName, vector<int>& otuData,
875 vector<int>& cumNumSeqs,
876 vector<int>& nSeqsPerOTU,
877 vector<vector<int> >& aaP, //tMaster->aanP: each row is a different otu / each col contains the sequence indices
878 vector<vector<int> >& aaI, //tMaster->aanI: that are in each otu - can't differentiate between aaP and aaI
879 vector<int>& seqNumber, //tMaster->anP: the sequence id number sorted by OTU
880 vector<int>& seqIndex,
881 map<string, int>& nameMap){
882 try {
883 InputData input(fileName, "list", nullVector);
884 ListVector* list = input.getListVector();
885
886 string label = list->getLabel();
887 int numOTUs = list->getNumBins();
888
889 if (m->getDebug()) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
890
891 otuData.assign(numSeqs, 0);
892 cumNumSeqs.assign(numOTUs, 0);
893 nSeqsPerOTU.assign(numOTUs, 0);
894 aaP.clear();aaP.resize(numOTUs);
895
896 seqNumber.clear();
897 aaI.clear();
898 seqIndex.clear();
899
900 for(int i=0;i<numOTUs;i++){
901
902 if (m->getControl_pressed()) { break; }
903 if (m->getDebug()) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
904
905 string singleOTU = list->get(i);
906
907 vector<string> otuSeqs; util.splitAtComma(singleOTU, otuSeqs);
908
909 for(int j=0;j<otuSeqs.size();j++){
910
911 string seqName = otuSeqs[j];
912 map<string,int>::iterator nmIt = nameMap.find(seqName);
913 int index = nmIt->second;
914
915 nameMap.erase(nmIt);
916 otuData[index] = i;
917 nSeqsPerOTU[i]++;
918 aaP[i].push_back(index);
919 }
920
921 sort(aaP[i].begin(), aaP[i].end());
922 for(int j=0;j<nSeqsPerOTU[i];j++) { seqNumber.push_back(aaP[i][j]); }
923 for(int j=nSeqsPerOTU[i];j<numSeqs;j++) { aaP[i].push_back(0); }
924 }
925
926 for(int i=1;i<numOTUs;i++){ cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1]; }
927 aaI = aaP;
928 seqIndex = seqNumber;
929 delete list;
930
931 return numOTUs;
932 }
933 catch(exception& e) {
934 m->errorOut(e, "ShhherCommand", "getOTUData");
935 exit(1);
936 }
937 }
938 /**************************************************************************************************/
939
calcCentroidsDriver(int numOTUs,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<int> & seqIndex,vector<short> & change,vector<int> & centroids,vector<double> & singleTau,vector<int> & mapSeqToUnique,vector<short> & uniqueFlowgrams,vector<short> & flowDataIntI,vector<int> & lengths,int numFlowCells,vector<int> & seqNumber)940 int ShhherCommand::calcCentroidsDriver(int numOTUs,
941 vector<int>& cumNumSeqs,
942 vector<int>& nSeqsPerOTU,
943 vector<int>& seqIndex,
944 vector<short>& change, //did the centroid sequence change? 0 = no; 1 = yes
945 vector<int>& centroids, //the representative flowgram for each cluster m
946 vector<double>& singleTau, //tMaster->adTau: 1-D Tau vector (1xnumSeqs)
947 vector<int>& mapSeqToUnique,
948 vector<short>& uniqueFlowgrams,
949 vector<short>& flowDataIntI,
950 vector<int>& lengths,
951 int numFlowCells,
952 vector<int>& seqNumber){
953
954 //this function gets the most likely homopolymer length at a flow position for a group of sequences
955 //within an otu
956
957 try{
958
959 for(int i=0;i<numOTUs;i++){
960
961 if (m->getControl_pressed()) { break; }
962
963 double count = 0;
964 int position = 0;
965 int minFlowGram = 100000000;
966 double minFlowValue = 1e8;
967 change[i] = 0; //FALSE
968
969 for(int j=0;j<nSeqsPerOTU[i];j++){
970 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
971 }
972
973 if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
974 vector<double> adF(nSeqsPerOTU[i]);
975 vector<int> anL(nSeqsPerOTU[i]);
976
977 for(int j=0;j<nSeqsPerOTU[i];j++){
978 int index = cumNumSeqs[i] + j;
979 int nI = seqIndex[index];
980 int nIU = mapSeqToUnique[nI];
981
982 int k;
983 for(k=0;k<position;k++){
984 if(nIU == anL[k]){
985 break;
986 }
987 }
988 if(k == position){
989 anL[position] = nIU;
990 adF[position] = 0.0000;
991 position++;
992 }
993 }
994
995 for(int j=0;j<nSeqsPerOTU[i];j++){
996 int index = cumNumSeqs[i] + j;
997 int nI = seqIndex[index];
998
999 double tauValue = singleTau[seqNumber[index]];
1000
1001 for(int k=0;k<position;k++){
1002 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
1003 adF[k] += dist * tauValue;
1004 }
1005 }
1006
1007 for(int j=0;j<position;j++){
1008 if(adF[j] < minFlowValue){
1009 minFlowGram = j;
1010 minFlowValue = adF[j];
1011 }
1012 }
1013
1014 if(centroids[i] != anL[minFlowGram]){
1015 change[i] = 1;
1016 centroids[i] = anL[minFlowGram];
1017 }
1018 }
1019 else if(centroids[i] != -1){
1020 change[i] = 1;
1021 centroids[i] = -1;
1022 }
1023 }
1024
1025 return 0;
1026 }
1027 catch(exception& e) {
1028 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1029 exit(1);
1030 }
1031 }
1032 /**************************************************************************************************/
1033
getDistToCentroid(int cent,int flow,int length,vector<short> & uniqueFlowgrams,vector<short> & flowDataIntI,int numFlowCells)1034 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
1035 vector<short>& flowDataIntI, int numFlowCells){
1036 try{
1037
1038 int flowAValue = cent * numFlowCells;
1039 int flowBValue = flow * numFlowCells;
1040
1041 double dist = 0;
1042
1043 for(int i=0;i<length;i++){
1044 dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1045 flowAValue++;
1046 flowBValue++;
1047 }
1048
1049 return dist / (double)length;
1050 }
1051 catch(exception& e) {
1052 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1053 exit(1);
1054 }
1055 }
1056 /**************************************************************************************************/
1057
getNewWeights(int numOTUs,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<double> & singleTau,vector<int> & seqNumber,vector<double> & weight)1058 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
1059 try{
1060
1061 double maxChange = 0;
1062
1063 for(int i=0;i<numOTUs;i++){
1064
1065 if (m->getControl_pressed()) { break; }
1066
1067 double difference = weight[i];
1068 weight[i] = 0;
1069
1070 for(int j=0;j<nSeqsPerOTU[i];j++){
1071 int index = cumNumSeqs[i] + j;
1072 double tauValue = singleTau[seqNumber[index]];
1073 weight[i] += tauValue;
1074 }
1075
1076 difference = fabs(weight[i] - difference);
1077 if(difference > maxChange){ maxChange = difference; }
1078 }
1079 return maxChange;
1080 }
1081 catch(exception& e) {
1082 m->errorOut(e, "ShhherCommand", "getNewWeights");
1083 exit(1);
1084 }
1085 }
1086
1087 /**************************************************************************************************/
1088
getLikelihood(int numSeqs,int numOTUs,vector<int> & nSeqsPerOTU,vector<int> & seqNumber,vector<int> & cumNumSeqs,vector<int> & seqIndex,vector<double> & dist,vector<double> & weight)1089 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
1090
1091 try{
1092
1093 vector<long double> P(numSeqs, 0);
1094 int effNumOTUs = 0;
1095
1096 for(int i=0;i<numOTUs;i++){
1097 if(weight[i] > MIN_WEIGHT){
1098 effNumOTUs++;
1099 }
1100 }
1101
1102 string hold;
1103 for(int i=0;i<numOTUs;i++){
1104
1105 if (m->getControl_pressed()) { break; }
1106
1107 for(int j=0;j<nSeqsPerOTU[i];j++){
1108 int index = cumNumSeqs[i] + j;
1109 int nI = seqIndex[index];
1110 double singleDist = dist[seqNumber[index]];
1111
1112 P[nI] += weight[i] * exp(-singleDist * sigma);
1113 }
1114 }
1115 double nLL = 0.00;
1116 for(int i=0;i<numSeqs;i++){
1117 if(P[i] == 0){ P[i] = DBL_EPSILON; }
1118
1119 nLL += -log(P[i]);
1120 }
1121
1122 nLL = nLL -(double)numSeqs * log(sigma);
1123
1124 return nLL;
1125 }
1126 catch(exception& e) {
1127 m->errorOut(e, "ShhherCommand", "getNewWeights");
1128 exit(1);
1129 }
1130 }
1131
1132 /**************************************************************************************************/
1133
checkCentroids(int numOTUs,vector<int> & centroids,vector<double> & weight)1134 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
1135 try{
1136 vector<int> unique(numOTUs, 1);
1137
1138 for(int i=0;i<numOTUs;i++){
1139 if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1140 unique[i] = -1;
1141 }
1142 }
1143
1144 for(int i=0;i<numOTUs;i++){
1145
1146 if (m->getControl_pressed()) { break; }
1147
1148 if(unique[i] == 1){
1149 for(int j=i+1;j<numOTUs;j++){
1150 if(unique[j] == 1){
1151
1152 if(centroids[j] == centroids[i]){
1153 unique[j] = 0;
1154 centroids[j] = -1;
1155
1156 weight[i] += weight[j];
1157 weight[j] = 0.0;
1158 }
1159 }
1160 }
1161 }
1162 }
1163
1164 return 0;
1165 }
1166 catch(exception& e) {
1167 m->errorOut(e, "ShhherCommand", "checkCentroids");
1168 exit(1);
1169 }
1170 }
1171 /**************************************************************************************************/
1172
calcNewDistances(int numSeqs,int numOTUs,vector<int> & nSeqsPerOTU,vector<double> & dist,vector<double> & weight,vector<short> & change,vector<int> & centroids,vector<vector<int>> & aaP,vector<double> & singleTau,vector<vector<int>> & aaI,vector<int> & seqNumber,vector<int> & seqIndex,vector<short> & uniqueFlowgrams,vector<short> & flowDataIntI,int numFlowCells,vector<int> & lengths)1173 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist,
1174 vector<double>& weight, vector<short>& change, vector<int>& centroids,
1175 vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,
1176 vector<int>& seqNumber, vector<int>& seqIndex,
1177 vector<short>& uniqueFlowgrams,
1178 vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
1179
1180 try{
1181
1182 int total = 0;
1183 vector<double> newTau(numOTUs,0);
1184 vector<double> norms(numSeqs, 0);
1185 nSeqsPerOTU.assign(numOTUs, 0);
1186
1187 for(int i=0;i<numSeqs;i++){
1188
1189 if (m->getControl_pressed()) { break; }
1190
1191 int indexOffset = i * numOTUs;
1192
1193 double offset = 1e8;
1194
1195 for(int j=0;j<numOTUs;j++){
1196
1197 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1198 dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
1199 }
1200
1201 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1202 offset = dist[indexOffset + j];
1203 }
1204 }
1205
1206 for(int j=0;j<numOTUs;j++){
1207 if(weight[j] > MIN_WEIGHT){
1208 newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1209 norms[i] += newTau[j];
1210 }
1211 else{
1212 newTau[j] = 0.0;
1213 }
1214 }
1215
1216 for(int j=0;j<numOTUs;j++){
1217 newTau[j] /= norms[i];
1218 }
1219
1220 for(int j=0;j<numOTUs;j++){
1221 if(newTau[j] > MIN_TAU){
1222
1223 int oldTotal = total;
1224
1225 total++;
1226
1227 singleTau.resize(total, 0);
1228 seqNumber.resize(total, 0);
1229 seqIndex.resize(total, 0);
1230
1231 singleTau[oldTotal] = newTau[j];
1232
1233 aaP[j][nSeqsPerOTU[j]] = oldTotal;
1234 aaI[j][nSeqsPerOTU[j]] = i;
1235 nSeqsPerOTU[j]++;
1236 }
1237 }
1238
1239 }
1240
1241 }
1242 catch(exception& e) {
1243 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1244 exit(1);
1245 }
1246 }
1247 /**************************************************************************************************/
1248
fill(int numOTUs,vector<int> & seqNumber,vector<int> & seqIndex,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<vector<int>> & aaP,vector<vector<int>> & aaI)1249 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
1250 try {
1251 int index = 0;
1252 for(int i=0;i<numOTUs;i++){
1253
1254 if (m->getControl_pressed()) { return 0; }
1255
1256 cumNumSeqs[i] = index;
1257 for(int j=0;j<nSeqsPerOTU[i];j++){
1258 seqNumber[index] = aaP[i][j];
1259 seqIndex[index] = aaI[i][j];
1260
1261 index++;
1262 }
1263 }
1264
1265 return 0;
1266 }
1267 catch(exception& e) {
1268 m->errorOut(e, "ShhherCommand", "fill");
1269 exit(1);
1270 }
1271 }
1272 /**************************************************************************************************/
1273
setOTUs(int numOTUs,int numSeqs,vector<int> & seqNumber,vector<int> & seqIndex,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<int> & otuData,vector<double> & singleTau,vector<double> & dist,vector<vector<int>> & aaP,vector<vector<int>> & aaI)1274 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
1275 vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
1276
1277 try {
1278 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1279
1280 for(int i=0;i<numOTUs;i++){
1281
1282 if (m->getControl_pressed()) { break; }
1283
1284 for(int j=0;j<nSeqsPerOTU[i];j++){
1285 int index = cumNumSeqs[i] + j;
1286 double tauValue = singleTau[seqNumber[index]];
1287 int sIndex = seqIndex[index];
1288 bigTauMatrix[sIndex * numOTUs + i] = tauValue;
1289 }
1290 }
1291
1292 for(int i=0;i<numSeqs;i++){
1293 double maxTau = -1.0000;
1294 int maxOTU = -1;
1295 for(int j=0;j<numOTUs;j++){
1296 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1297 maxTau = bigTauMatrix[i * numOTUs + j];
1298 maxOTU = j;
1299 }
1300 }
1301
1302 otuData[i] = maxOTU;
1303 }
1304
1305 nSeqsPerOTU.assign(numOTUs, 0);
1306
1307 for(int i=0;i<numSeqs;i++){
1308 int index = otuData[i];
1309
1310 singleTau[i] = 1.0000;
1311 dist[i] = 0.0000;
1312
1313 aaP[index][nSeqsPerOTU[index]] = i;
1314 aaI[index][nSeqsPerOTU[index]] = i;
1315
1316 nSeqsPerOTU[index]++;
1317 }
1318
1319 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
1320 }
1321 catch(exception& e) {
1322 m->errorOut(e, "ShhherCommand", "setOTUs");
1323 exit(1);
1324 }
1325 }
1326 /**************************************************************************************************/
1327
writeQualities(int numOTUs,int numFlowCells,string qualityFileName,vector<int> otuCounts,vector<int> & nSeqsPerOTU,vector<int> & seqNumber,vector<double> & singleTau,vector<short> & flowDataIntI,vector<short> & uniqueFlowgrams,vector<int> & cumNumSeqs,vector<int> & mapUniqueToSeq,vector<string> & seqNameVector,vector<int> & centroids,vector<vector<int>> & aaI)1328 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
1329 vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
1330 vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
1331
1332 try {
1333
1334 ofstream qualityFile;
1335 util.openOutputFile(qualityFileName, qualityFile);
1336
1337 qualityFile.setf(ios::fixed, ios::floatfield);
1338 qualityFile.setf(ios::showpoint);
1339 qualityFile << setprecision(6);
1340
1341 vector<vector<int> > qualities(numOTUs);
1342 vector<double> pr(HOMOPS, 0);
1343
1344
1345 for(int i=0;i<numOTUs;i++){
1346
1347 if (m->getControl_pressed()) { break; }
1348
1349 int index = 0;
1350
1351 if(nSeqsPerOTU[i] > 0){
1352
1353 while(index < numFlowCells){
1354
1355 double maxPrValue = 1e8;
1356 short maxPrIndex = -1;
1357 double count = 0.0000;
1358
1359 pr.assign(HOMOPS, 0);
1360
1361 for(int j=0;j<nSeqsPerOTU[i];j++){
1362 int lIndex = cumNumSeqs[i] + j;
1363 double tauValue = singleTau[seqNumber[lIndex]];
1364 int sequenceIndex = aaI[i][j];
1365 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1366
1367 count += tauValue;
1368
1369 for(int s=0;s<HOMOPS;s++){
1370 pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1371 }
1372 }
1373
1374 maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1375 maxPrValue = pr[maxPrIndex];
1376
1377 if(count > MIN_COUNT){
1378 double U = 0.0000;
1379 double norm = 0.0000;
1380
1381 for(int s=0;s<HOMOPS;s++){
1382 norm += exp(-(pr[s] - maxPrValue));
1383 }
1384
1385 for(int s=1;s<=maxPrIndex;s++){
1386 int value = 0;
1387 double temp = 0.0000;
1388
1389 U += exp(-(pr[s-1]-maxPrValue))/norm;
1390
1391 if(U>0.00){
1392 temp = log10(U);
1393 }
1394 else{
1395 temp = -10.1;
1396 }
1397 temp = floor(-10 * temp);
1398 value = (int)floor(temp);
1399 if(value > 100){ value = 100; }
1400
1401 qualities[i].push_back((int)value);
1402 }
1403 }//end if
1404
1405 index++;
1406
1407 }//end while
1408
1409 }//end if
1410
1411
1412 if(otuCounts[i] > 0){
1413 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1414 //need to get past the first four bases
1415 for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
1416 qualityFile << endl;
1417 }
1418 }//end for
1419 qualityFile.close();
1420 outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
1421
1422 }
1423 catch(exception& e) {
1424 m->errorOut(e, "ShhherCommand", "writeQualities");
1425 exit(1);
1426 }
1427 }
1428
1429 /**************************************************************************************************/
1430
writeSequences(string thisCompositeFASTAFileName,int numOTUs,int numFlowCells,string fastaFileName,vector<int> otuCounts,vector<short> & uniqueFlowgrams,vector<string> & seqNameVector,vector<vector<int>> & aaI,vector<int> & centroids)1431 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
1432 try {
1433
1434 ofstream fastaFile;
1435 util.openOutputFile(fastaFileName, fastaFile);
1436
1437 vector<string> names(numOTUs, "");
1438
1439 for(int i=0;i<numOTUs;i++){
1440
1441 if (m->getControl_pressed()) { break; }
1442
1443 int index = centroids[i];
1444
1445 if(otuCounts[i] > 0){
1446 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1447
1448 string newSeq = "";
1449
1450 for(int j=0;j<numFlowCells;j++){
1451
1452 char base = flowOrder[j % flowOrder.length()];
1453 for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1454 newSeq += base;
1455 }
1456 }
1457
1458 if (newSeq.length() >= 4) { fastaFile << newSeq.substr(4) << endl; }
1459 else { fastaFile << "NNNN" << endl; }
1460 }
1461 }
1462 fastaFile.close();
1463
1464 outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
1465
1466 if(thisCompositeFASTAFileName != ""){
1467 util.appendFiles(fastaFileName, thisCompositeFASTAFileName);
1468 }
1469 }
1470 catch(exception& e) {
1471 m->errorOut(e, "ShhherCommand", "writeSequences");
1472 exit(1);
1473 }
1474 }
1475
1476 /**************************************************************************************************/
1477
writeNames(string thisCompositeNamesFileName,int numOTUs,string nameFileName,vector<int> otuCounts,vector<string> & seqNameVector,vector<vector<int>> & aaI,vector<int> & nSeqsPerOTU)1478 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
1479 try {
1480
1481 ofstream nameFile;
1482 util.openOutputFile(nameFileName, nameFile);
1483
1484 for(int i=0;i<numOTUs;i++){
1485
1486 if (m->getControl_pressed()) { break; }
1487
1488 if(otuCounts[i] > 0){
1489 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1490
1491 for(int j=1;j<nSeqsPerOTU[i];j++){
1492 nameFile << ',' << seqNameVector[aaI[i][j]];
1493 }
1494
1495 nameFile << endl;
1496 }
1497 }
1498 nameFile.close();
1499 outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
1500
1501
1502 if(thisCompositeNamesFileName != ""){
1503 util.appendFiles(nameFileName, thisCompositeNamesFileName);
1504 }
1505 }
1506 catch(exception& e) {
1507 m->errorOut(e, "ShhherCommand", "writeNames");
1508 exit(1);
1509 }
1510 }
1511
1512 /**************************************************************************************************/
1513
writeGroups(string groupFileName,string fileRoot,int numSeqs,vector<string> & seqNameVector)1514 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
1515 try {
1516 ofstream groupFile;
1517 util.openOutputFile(groupFileName, groupFile);
1518
1519 for(int i=0;i<numSeqs;i++){
1520 if (m->getControl_pressed()) { break; }
1521 groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1522 }
1523 groupFile.close();
1524 outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
1525
1526 }
1527 catch(exception& e) {
1528 m->errorOut(e, "ShhherCommand", "writeGroups");
1529 exit(1);
1530 }
1531 }
1532
1533 /**************************************************************************************************/
1534
writeClusters(string otuCountsFileName,int numOTUs,int numFlowCells,vector<int> otuCounts,vector<int> & centroids,vector<short> & uniqueFlowgrams,vector<string> & seqNameVector,vector<vector<int>> & aaI,vector<int> & nSeqsPerOTU,vector<int> & lengths,vector<short> & flowDataIntI)1535 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
1536 try {
1537 ofstream otuCountsFile;
1538 util.openOutputFile(otuCountsFileName, otuCountsFile);
1539
1540 string bases = flowOrder;
1541
1542 for(int i=0;i<numOTUs;i++){
1543
1544 if (m->getControl_pressed()) {
1545 break;
1546 }
1547 //output the translated version of the centroid sequence for the otu
1548 if(otuCounts[i] > 0){
1549 int index = centroids[i];
1550
1551 otuCountsFile << "ideal\t";
1552 for(int j=8;j<numFlowCells;j++){
1553 char base = bases[j % bases.length()];
1554 for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1555 otuCountsFile << base;
1556 }
1557 }
1558 otuCountsFile << endl;
1559
1560 for(int j=0;j<nSeqsPerOTU[i];j++){
1561 int sequence = aaI[i][j];
1562 otuCountsFile << seqNameVector[sequence] << '\t';
1563
1564 string newSeq = "";
1565
1566 for(int k=0;k<lengths[sequence];k++){
1567 char base = bases[k % bases.length()];
1568 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1569
1570 for(int s=0;s<freq;s++){
1571 newSeq += base;
1572 //otuCountsFile << base;
1573 }
1574 }
1575
1576 if (newSeq.length() >= 4) { otuCountsFile << newSeq.substr(4) << endl; }
1577 else { otuCountsFile << "NNNN" << endl; }
1578 }
1579 otuCountsFile << endl;
1580 }
1581 }
1582 otuCountsFile.close();
1583 outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
1584
1585 }
1586 catch(exception& e) {
1587 m->errorOut(e, "ShhherCommand", "writeClusters");
1588 exit(1);
1589 }
1590 }
1591
1592 /**************************************************************************************************/
1593
getSingleLookUp()1594 void ShhherCommand::getSingleLookUp(){
1595 try{
1596 // these are the -log probabilities that a signal corresponds to a particular homopolymer length
1597 singleLookUp.assign(HOMOPS * NUMBINS, 0);
1598
1599 int index = 0;
1600 ifstream lookUpFile;
1601 util.openInputFile(lookupFileName, lookUpFile);
1602
1603 for(int i=0;i<HOMOPS;i++){
1604
1605 if (m->getControl_pressed()) { break; }
1606
1607 float logFracFreq;
1608 lookUpFile >> logFracFreq;
1609
1610 for(int j=0;j<NUMBINS;j++) {
1611 lookUpFile >> singleLookUp[index];
1612 index++;
1613 }
1614 }
1615 lookUpFile.close();
1616 }
1617 catch(exception& e) {
1618 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
1619 exit(1);
1620 }
1621 }
1622
1623 /**************************************************************************************************/
1624
getJointLookUp()1625 void ShhherCommand::getJointLookUp(){
1626 try{
1627
1628 // the most likely joint probability (-log) that two intenities have the same polymer length
1629 jointLookUp.resize(NUMBINS * NUMBINS, 0);
1630
1631 for(int i=0;i<NUMBINS;i++){
1632
1633 if (m->getControl_pressed()) { break; }
1634
1635 for(int j=0;j<NUMBINS;j++){
1636
1637 double minSum = 100000000;
1638
1639 for(int k=0;k<HOMOPS;k++){
1640 double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
1641
1642 if(sum < minSum) { minSum = sum; }
1643 }
1644 jointLookUp[i * NUMBINS + j] = minSum;
1645 }
1646 }
1647 }
1648 catch(exception& e) {
1649 m->errorOut(e, "ShhherCommand", "getJointLookUp");
1650 exit(1);
1651 }
1652 }
1653
1654 /**************************************************************************************************/
1655
getProbIntensity(int intIntensity)1656 double ShhherCommand::getProbIntensity(int intIntensity){
1657 try{
1658
1659 double minNegLogProb = 100000000;
1660
1661
1662 for(int i=0;i<HOMOPS;i++){//loop signal strength
1663
1664 if (m->getControl_pressed()) { break; }
1665
1666 float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
1667 if(negLogProb < minNegLogProb) { minNegLogProb = negLogProb; }
1668 }
1669
1670 return minNegLogProb;
1671 }
1672 catch(exception& e) {
1673 m->errorOut(e, "ShhherCommand", "getProbIntensity");
1674 exit(1);
1675 }
1676 }
1677
1678
1679
1680
1681