1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9 
10 #include "shhhercommand.h"
11 
12 //**********************************************************************************************************************
setParameters()13 vector<string> ShhherCommand::setParameters(){
14 	try {
15 		CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
16 		CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
17 		CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
18 		CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
19 		CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
20         CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
21 		CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
22 		CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
23         CommandParameter porder("order", "Multiple", "A-B-I", "A", "", "", "","",false,false, true); parameters.push_back(porder);		CommandParameter pseed("seed", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pseed);
24         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25 		CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26 
27         vector<string> tempOutNames;
28         outputTypes["fasta"] = tempOutNames;
29         outputTypes["name"] = tempOutNames;
30         outputTypes["group"] = tempOutNames;
31         outputTypes["counts"] = tempOutNames;
32         outputTypes["qfile"] = tempOutNames;
33 
34         abort = false; calledHelp = false;
35 
36 		vector<string> myArray;
37 		for (int i = 0; i < parameters.size(); i++) {	myArray.push_back(parameters[i].name);		}
38 		return myArray;
39 	}
40 	catch(exception& e) {
41 		m->errorOut(e, "ShhherCommand", "setParameters");
42 		exit(1);
43 	}
44 }
45 //**********************************************************************************************************************
getHelpString()46 string ShhherCommand::getHelpString(){
47 	try {
48 		string helpString = "";
49 		helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
50         helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
51         helpString += "The flow parameter is used to input your flow file.\n";
52         helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
53         helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
54         helpString += "The order parameter options are A, B or I.  Default=A. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n";
55 		return helpString;
56 	}
57 	catch(exception& e) {
58 		m->errorOut(e, "ShhherCommand", "getHelpString");
59 		exit(1);
60 	}
61 }
62 //**********************************************************************************************************************
getOutputPattern(string type)63 string ShhherCommand::getOutputPattern(string type) {
64     try {
65         string pattern = "";
66 
67         if (type == "fasta")            {   pattern = "[filename],shhh.fasta";   }
68         else if (type == "name")    {   pattern = "[filename],shhh.names";   }
69         else if (type == "group")        {   pattern = "[filename],shhh.groups";   }
70         else if (type == "counts")        {   pattern = "[filename],shhh.counts";   }
71         else if (type == "qfile")        {   pattern = "[filename],shhh.qual";   }
72         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->setControl_pressed(true);  }
73 
74         return pattern;
75     }
76     catch(exception& e) {
77         m->errorOut(e, "ShhherCommand", "getOutputPattern");
78         exit(1);
79     }
80 }
81 //**********************************************************************************************************************
ShhherCommand(string option)82 ShhherCommand::ShhherCommand(string option) : Command() {
83 	try {
84 		if(option == "help") { help(); abort = true; calledHelp = true; }
85 		else if(option == "citation") { citation(); abort = true; calledHelp = true;}
86         else if(option == "category") {  abort = true; calledHelp = true;  }
87 
88 		else {
89             OptionParser parser(option, setParameters());
90 			map<string,string> parameters = parser.getParameters();
91 
92 			ValidParameters validParameter;
93 
94 
95 			//check for required parameters
96 			flowFileName = validParameter.validFile(parameters, "flow");
97 			flowFilesFileName = validParameter.validFile(parameters, "file");
98 			if (flowFileName == "not found" && flowFilesFileName == "not found") {
99 				m->mothurOut("values for either flow or file must be provided for the shhh.flows command.\n"); abort = true;
100 			}
101 			else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
102 
103 			if(flowFileName != "not found"){
104 				compositeFASTAFileName = "";
105 				compositeNamesFileName = "";
106 			}
107 			else{
108 				ofstream temp;
109 
110                 string thisoutputDir = outputdir;
111                 if (outputdir == "") {  thisoutputDir =  util.hasPath(flowFilesFileName); }
112 
113 				//we want to rip off .files, and also .flow if its there
114                 string fileroot = util.getRootName(util.getSimpleName(flowFilesFileName));
115                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
116                 string extension = util.getExtension(fileroot);
117                 if (extension == ".flow") { fileroot = util.getRootName(fileroot);  }
118                 else { fileroot += "."; } //add back if needed
119 
120 				compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
121 				util.openOutputFile(compositeFASTAFileName, temp);
122 				temp.close();
123 
124 				compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
125 				util.openOutputFile(compositeNamesFileName, temp);
126 				temp.close();
127 			}
128 
129             if(flowFilesFileName != "not found"){
130                 ifstream flowFilesFile;
131                 util.openInputFile(flowFilesFileName, flowFilesFile);
132                 while(flowFilesFile){
133                     string fName = util.getline(flowFilesFile); util.gobble(flowFilesFile);
134 
135                     //check to make sure both are able to be opened
136                     bool ableToOpen = util.checkLocations(fName, current->getLocations());
137                     if (ableToOpen) {
138                         if (util.isBlank(fName)) { m->mothurOut("[WARNING]: " + fName + " is blank, disregarding.\n"); }
139                         else { flowFileVector.push_back(fName); }
140                     }else { m->mothurOut("Unable to open " + fName + ". Disregarding.\n");  }
141                 }
142                 flowFilesFile.close();
143                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files.\n");  abort = true; }
144             }
145             else{
146                 if (outputdir == "") { outputdir = util.hasPath(flowFileName); }
147                 flowFileVector.push_back(flowFileName);
148             }
149 
150 			//check for optional parameter and set defaults
151 			// ...at some point should added some additional type checking...
152 			string temp;
153 			temp = validParameter.validFile(parameters, "lookup");
154 			if (temp == "not found")	{
155 				string path = current->getProgramPath();
156 
157 #if defined NON_WINDOWS
158                 path += "lookupFiles/";
159 #else
160                 path += "lookupFiles\\";
161 #endif
162 				lookupFileName = util.getFullPathName(path) + "LookUp_Titanium.pat";
163 
164                 //check to make sure both are able to be opened
165                 bool ableToOpen = util.checkLocations(lookupFileName, current->getLocations());
166                 if (ableToOpen) {
167                     if (util.isBlank(lookupFileName)) { m->mothurOut("[ERROR]: " + lookupFileName + " is blank, aborting.\n"); abort=true;  }
168                 }else { m->mothurOut("[ERROR]: Unable to open " + lookupFileName + ".\n");  abort=true;   }
169 			}
170 			else if(temp == "not open")	{
171 
172 				lookupFileName = validParameter.validPath(parameters, "lookup");
173 
174                 //check to make sure both are able to be opened
175                 bool ableToOpen = util.checkLocations(lookupFileName, current->getLocations());
176                 if (ableToOpen) {
177                     if (util.isBlank(lookupFileName)) { m->mothurOut("[ERROR]: " + lookupFileName + " is blank, aborting.\n"); abort=true;  }
178                 }else { m->mothurOut("[ERROR]: Unable to open " + lookupFileName + ".\n");  abort=true;   }
179 
180             }else						{	lookupFileName = temp;	}
181 
182 			temp = validParameter.valid(parameters, "cutoff");	if (temp == "not found"){	temp = "0.01";		}
183 			util.mothurConvert(temp, cutoff);
184 
185 			temp = validParameter.valid(parameters, "mindelta");	if (temp == "not found"){	temp = "0.000001";	}
186 			util.mothurConvert(temp, minDelta);
187 
188 			temp = validParameter.valid(parameters, "maxiter");	if (temp == "not found"){	temp = "1000";		}
189 			util.mothurConvert(temp, maxIters);
190 
191             temp = validParameter.valid(parameters, "large");	if (temp == "not found"){	temp = "0";		}
192 			util.mothurConvert(temp, largeSize);
193             if (largeSize != 0) { large = true; }
194             else { large = false;  }
195             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
196 
197 			temp = validParameter.valid(parameters, "sigma"); if (temp == "not found")	{	temp = "60";		}
198 			util.mothurConvert(temp, sigma);
199 
200 			temp = validParameter.valid(parameters, "order");  if (temp == "not found"){ 	temp = "A";	}
201             if (temp.length() > 1) {  m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
202             }
203             else {
204                 if (toupper(temp[0]) == 'A') {  flowOrder = "TACG";   }
205                 else if(toupper(temp[0]) == 'B'){
206                     flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC";   }
207                 else if(toupper(temp[0]) == 'I'){
208                     flowOrder = "TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC";   }
209                 else {
210                     m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A, B, or I. A = TACG, B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC, and I = TACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGCTACGTACGTCTGAGCATCGATCGATGTACAGC.\n");  abort=true;
211                 }
212             }
213 
214 
215 		}
216 
217 	}
218 	catch(exception& e) {
219 		m->errorOut(e, "ShhherCommand", "ShhherCommand");
220 		exit(1);
221 	}
222 }
223 //**********************************************************************************************************************
224 
execute()225 int ShhherCommand::execute(){
226 	try {
227 		if (abort) { if (calledHelp) { return 0; }  return 2;	}
228 
229 		getSingleLookUp();	if (m->getControl_pressed()) { return 0; }
230 		getJointLookUp();	if (m->getControl_pressed()) { return 0; }
231 
232         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
233 
234 		if(compositeFASTAFileName != ""){
235 			outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
236 			outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
237 		}
238 
239 		m->mothurOut("\nOutput File Names: \n");
240 		for (int i = 0; i < outputNames.size(); i++) {	m->mothurOut(outputNames[i] +"\n"); 	} m->mothurOutEndLine();
241 
242 		return 0;
243 	}
244 	catch(exception& e) {
245 		m->errorOut(e, "ShhherCommand", "execute");
246 		exit(1);
247 	}
248 }
249 //********************************************************************************************************************
250 //sorts biggest to smallest
compareFileSizes(string left,string right)251 inline bool compareFileSizes(string left, string right){
252 
253     FILE * pFile;
254     long leftsize = 0;
255 
256     //get num bytes in file
257     string filename = left;
258     pFile = fopen (filename.c_str(),"rb");
259     string error = "Error opening " + filename;
260     if (pFile==NULL) perror (error.c_str());
261     else{
262         fseek (pFile, 0, SEEK_END);
263         leftsize=ftell (pFile);
264         fclose (pFile);
265     }
266 
267     FILE * pFile2;
268     long rightsize = 0;
269 
270     //get num bytes in file
271     filename = right;
272     pFile2 = fopen (filename.c_str(),"rb");
273     error = "Error opening " + filename;
274     if (pFile2==NULL) perror (error.c_str());
275     else{
276         fseek (pFile2, 0, SEEK_END);
277         rightsize=ftell (pFile2);
278         fclose (pFile2);
279     }
280 
281     return (leftsize > rightsize);
282 }
283 /**************************************************************************************************/
284 
parseFlowFiles(string filename)285 vector<string> ShhherCommand::parseFlowFiles(string filename){
286     try {
287         vector<string> files;
288         int count = 0;
289 
290         ifstream in;
291         util.openInputFile(filename, in);
292 
293         int thisNumFLows = 0;
294         in >> thisNumFLows; util.gobble(in);
295 
296         while (!in.eof()) {
297             if (m->getControl_pressed()) { break; }
298 
299             ofstream out;
300             string outputFileName = filename + toString(count) + ".temp";
301             util.openOutputFile(outputFileName, out);
302             out << thisNumFLows << endl;
303             files.push_back(outputFileName);
304 
305             int numLinesWrote = 0;
306             for (int i = 0; i < largeSize; i++) {
307                 if (in.eof()) { break; }
308                 string line = util.getline(in); util.gobble(in);
309                 out << line << endl;
310                 numLinesWrote++;
311             }
312             out.close();
313 
314             if (numLinesWrote == 0) {  util.mothurRemove(outputFileName); files.pop_back();  }
315             count++;
316         }
317         in.close();
318 
319         if (m->getControl_pressed()) { for (int i = 0; i < files.size(); i++) { util.mothurRemove(files[i]); }  files.clear(); }
320 
321         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n");
322 
323         return files;
324     }
325 	catch(exception& e) {
326 		m->errorOut(e, "ShhherCommand", "parseFlowFiles");
327 		exit(1);
328 	}
329 }
330 /**************************************************************************************************/
331 
driver(vector<string> filenames,string thisCompositeFASTAFileName,string thisCompositeNamesFileName)332 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
333     try {
334 
335         int numCompleted = 0;
336 
337         for(int i=0;i<filenames.size();i++){
338 
339 			if (m->getControl_pressed()) { break; }
340 
341             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
342             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
343 
344             if (m->getControl_pressed()) { break; }
345 
346             double begClock = clock();
347             unsigned long long begTime;
348 
349             string fileNameForOutput = filenames[i];
350 
351             for (int g = 0; g < theseFlowFileNames.size(); g++) {
352 
353                 string flowFileName = theseFlowFileNames[g];
354                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
355                 m->mothurOut("Reading flowgrams...\n");
356 
357                 vector<string> seqNameVector;
358                 vector<int> lengths;
359                 vector<short> flowDataIntI;
360                 vector<double> flowDataPrI;
361                 map<string, int> nameMap;
362                 vector<short> uniqueFlowgrams;
363                 vector<int> uniqueCount;
364                 vector<int> mapSeqToUnique;
365                 vector<int> mapUniqueToSeq;
366                 vector<int> uniqueLengths;
367                 int numFlowCells;
368 
369                 if (m->getDebug()) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
370                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
371 
372                 if (m->getControl_pressed()) { break; }
373 
374                 m->mothurOut("Identifying unique flowgrams...\n");
375                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
376 
377                 if (m->getControl_pressed()) { break; }
378 
379                 m->mothurOut("Calculating distances between flowgrams...\n");
380                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
381                 begTime = time(NULL);
382 
383 
384                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
385 
386                 m->mothurOutEndLine();
387                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
388 
389 
390                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
391                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
392 
393                 if (m->getControl_pressed()) { break; }
394 
395                 m->mothurOut("\nClustering flowgrams...\n");
396                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
397                 cluster(listFileName, distFileName, namesFileName);
398 
399                 if (m->getControl_pressed()) { break; }
400 
401                 vector<int> otuData;
402                 vector<int> cumNumSeqs;
403                 vector<int> nSeqsPerOTU;
404                 vector<vector<int> > aaP;	//tMaster->aanP:	each row is a different otu / each col contains the sequence indices
405                 vector<vector<int> > aaI;	//tMaster->aanI:	that are in each otu - can't differentiate between aaP and aaI
406                 vector<int> seqNumber;		//tMaster->anP:		the sequence id number sorted by OTU
407                 vector<int> seqIndex;		//tMaster->anI;		the index that corresponds to seqNumber
408 
409 
410                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
411 
412                 if (m->getControl_pressed()) { break; }
413 
414                 util.mothurRemove(distFileName);
415                 util.mothurRemove(namesFileName);
416                 util.mothurRemove(listFileName);
417 
418                 vector<double> dist;		//adDist - distance of sequences to centroids
419                 vector<short> change;		//did the centroid sequence change? 0 = no; 1 = yes
420                 vector<int> centroids;		//the representative flowgram for each cluster m
421                 vector<double> weight;
422                 vector<double> singleTau;	//tMaster->adTau:	1-D Tau vector (1xnumSeqs)
423                 vector<int> nSeqsBreaks;
424                 vector<int> nOTUsBreaks;
425 
426                 if (m->getDebug()) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
427 
428                 dist.assign(numSeqs * numOTUs, 0);
429                 change.assign(numOTUs, 1);
430                 centroids.assign(numOTUs, -1);
431                 weight.assign(numOTUs, 0);
432                 singleTau.assign(numSeqs, 1.0);
433 
434                 nSeqsBreaks.assign(2, 0);
435                 nOTUsBreaks.assign(2, 0);
436 
437                 nSeqsBreaks[0] = 0;
438                 nSeqsBreaks[1] = numSeqs;
439                 nOTUsBreaks[1] = numOTUs;
440 
441                 if (m->getDebug()) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
442 
443                 if (m->getControl_pressed()) { break; }
444 
445                 double maxDelta = 0;
446                 int iter = 0;
447 
448                 begClock = clock();
449                 begTime = time(NULL);
450 
451                 m->mothurOut("\nDenoising flowgrams...\n");
452                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
453 
454                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
455 
456                     if (m->getControl_pressed()) { break; }
457 
458                     double cycClock = clock();
459                     unsigned long long cycTime = time(NULL);
460                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
461 
462                     if (m->getControl_pressed()) { break; }
463 
464                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
465 
466                     if (m->getControl_pressed()) { break; }
467 
468                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);
469 
470                     if (m->getControl_pressed()) { break; }
471 
472                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight);
473 
474                     if (m->getControl_pressed()) { break; }
475 
476                     checkCentroids(numOTUs, centroids, weight);
477 
478                     if (m->getControl_pressed()) { break; }
479 
480                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
481 
482                     if (m->getControl_pressed()) { break; }
483 
484                     iter++;
485 
486                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
487 
488                 }
489 
490                 if (m->getControl_pressed()) { break; }
491 
492                 m->mothurOut("\nFinalizing...\n");
493                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
494 
495                 if (m->getDebug()) { m->mothurOut("[DEBUG]: done fill().\n"); }
496 
497                 if (m->getControl_pressed()) { break; }
498 
499                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
500 
501                 if (m->getDebug()) { m->mothurOut("[DEBUG]: done setOTUs().\n"); }
502 
503                 if (m->getControl_pressed()) { break; }
504 
505                 vector<int> otuCounts(numOTUs, 0);
506                 for(int j=0;j<numSeqs;j++)	{	otuCounts[otuData[j]]++;	}
507 
508                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
509 
510                 if (m->getDebug()) { m->mothurOut("[DEBUG]: done calcCentroidsDriver().\n"); }
511 
512                 if (m->getControl_pressed()) { break; }
513 
514                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
515                 string thisOutputDir = outputdir;
516                 if (outputdir == "") {  thisOutputDir = util.hasPath(flowFileName);  }
517                 map<string, string> variables;
518                 variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(flowFileName));
519                 string qualityFileName = getOutputFileName("qfile",variables);
520                 string fastaFileName = getOutputFileName("fasta",variables);
521                 string nameFileName = getOutputFileName("name",variables);
522                 string otuCountsFileName = getOutputFileName("counts",variables);
523                 string fileRoot = util.getRootName(util.getSimpleName(flowFileName));
524                 int pos = fileRoot.find_first_of('.');
525                 string fileGroup = fileRoot;
526                 if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
527                 string groupFileName = getOutputFileName("group",variables);
528 
529 
530                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->getControl_pressed()) { break; }
531                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->getControl_pressed()) { break; }
532                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);				if (m->getControl_pressed()) { break; }
533                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);			if (m->getControl_pressed()) { break; }
534                 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);						if (m->getControl_pressed()) { break; }
535 
536                 if (large) {
537                     if (g > 0) {
538                         variables["[filename]"] = thisOutputDir + util.getRootName(util.getSimpleName(theseFlowFileNames[0]));
539                         util.appendFiles(qualityFileName, getOutputFileName("qfile",variables));
540                         util.mothurRemove(qualityFileName);
541                         util.appendFiles(fastaFileName, getOutputFileName("fasta",variables));
542                         util.mothurRemove(fastaFileName);
543                         util.appendFiles(nameFileName, getOutputFileName("name",variables));
544                         util.mothurRemove(nameFileName);
545                         util.appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
546                         util.mothurRemove(otuCountsFileName);
547                         util.appendFiles(groupFileName, getOutputFileName("group",variables));
548                         util.mothurRemove(groupFileName);
549                     }
550                     util.mothurRemove(theseFlowFileNames[g]);
551                 }
552 			}
553 
554             numCompleted++;
555 			m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
556 		}
557 
558         if (m->getControl_pressed()) { for (int i = 0; i < outputNames.size(); i++) { util.mothurRemove(outputNames[i]); } return 0; }
559 
560         return numCompleted;
561 
562     }catch(exception& e) {
563             m->errorOut(e, "ShhherCommand", "driver");
564             exit(1);
565     }
566 }
567 
568 /**************************************************************************************************/
getFlowData(string filename,vector<string> & thisSeqNameVector,vector<int> & thisLengths,vector<short> & thisFlowDataIntI,map<string,int> & thisNameMap,int & numFlowCells)569 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
570 	try{
571 
572 		ifstream flowFile;
573 
574 		util.openInputFile(filename, flowFile);
575 
576 		string seqName;
577 		int currentNumFlowCells;
578 		float intensity;
579         thisSeqNameVector.clear();
580 		thisLengths.clear();
581 		thisFlowDataIntI.clear();
582 		thisNameMap.clear();
583 
584 		string numFlowTest;
585         flowFile >> numFlowTest;
586 
587         if (!util.isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?\n");  exit(1); }
588         else { convert(numFlowTest, numFlowCells); }
589 
590         if (m->getDebug()) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
591 		int index = 0;//pcluster
592 		while(!flowFile.eof()){
593 
594 			if (m->getControl_pressed()) { break; }
595 
596 			flowFile >> seqName >> currentNumFlowCells;
597 
598 			thisLengths.push_back(currentNumFlowCells);
599 
600 			thisSeqNameVector.push_back(seqName);
601 			thisNameMap[seqName] = index++;//pcluster
602 
603             if (m->getDebug()) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
604 
605 			for(int i=0;i<numFlowCells;i++){
606 				flowFile >> intensity;
607 				if(intensity > 9.99)	{	intensity = 9.99;	}
608 				int intI = int(100 * intensity + 0.0001);
609 				thisFlowDataIntI.push_back(intI);
610 			}
611 			util.gobble(flowFile);
612 		}
613 		flowFile.close();
614 
615 		int numSeqs = thisSeqNameVector.size();
616 
617 		for(int i=0;i<numSeqs;i++){
618 
619 			if (m->getControl_pressed()) { break; }
620 
621 			int iNumFlowCells = i * numFlowCells;
622 			for(int j=thisLengths[i];j<numFlowCells;j++){
623 				thisFlowDataIntI[iNumFlowCells + j] = 0;
624 			}
625 		}
626 
627         return numSeqs;
628 
629 	}
630 	catch(exception& e) {
631 		m->errorOut(e, "ShhherCommand", "getFlowData");
632 		exit(1);
633 	}
634 }
635 /**************************************************************************************************/
636 
flowDistParentFork(int numFlowCells,string distFileName,int stopSeq,vector<int> & mapUniqueToSeq,vector<int> & mapSeqToUnique,vector<int> & lengths,vector<double> & flowDataPrI,vector<short> & flowDataIntI)637 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
638 	try{
639 
640 		ostringstream outStream;
641 		outStream.setf(ios::fixed, ios::floatfield);
642 		outStream.setf(ios::dec, ios::basefield);
643 		outStream.setf(ios::showpoint);
644 		outStream.precision(6);
645 
646 		int begTime = time(NULL);
647 		double begClock = clock();
648 
649 		for(int i=0;i<stopSeq;i++){
650 
651 			if (m->getControl_pressed()) { break; }
652 
653 			for(int j=0;j<i;j++){
654 				float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
655 
656 				if(flowDistance < 1e-6){
657 					outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
658 				}
659 				else if(flowDistance <= cutoff){
660 					outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
661 				}
662 			}
663 			if(i % 100 == 0){
664 				m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - begTime));
665 				m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
666 			}
667 		}
668 
669 		ofstream distFile(distFileName.c_str());
670 		distFile << outStream.str();
671 		distFile.close();
672 
673 		if (m->getControl_pressed()) {}
674 		else {
675 			m->mothurOutJustToScreen(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
676 			m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
677 		}
678 
679         return 0;
680 	}
681 	catch(exception& e) {
682 		m->errorOut(e, "ShhherCommand", "flowDistParentFork");
683 		exit(1);
684 	}
685 }
686 /**************************************************************************************************/
687 
calcPairwiseDist(int numFlowCells,int seqA,int seqB,vector<int> & mapSeqToUnique,vector<int> & lengths,vector<double> & flowDataPrI,vector<short> & flowDataIntI)688 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
689 	try{
690 		int minLength = lengths[mapSeqToUnique[seqA]];
691 		if(lengths[seqB] < minLength){	minLength = lengths[mapSeqToUnique[seqB]];	}
692 
693 		int ANumFlowCells = seqA * numFlowCells;
694 		int BNumFlowCells = seqB * numFlowCells;
695 
696 		float dist = 0;
697 
698 		for(int i=0;i<minLength;i++){
699 
700 			if (m->getControl_pressed()) { break; }
701 
702 			int flowAIntI = flowDataIntI[ANumFlowCells + i];
703 			float flowAPrI = flowDataPrI[ANumFlowCells + i];
704 
705 			int flowBIntI = flowDataIntI[BNumFlowCells + i];
706 			float flowBPrI = flowDataPrI[BNumFlowCells + i];
707 			dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
708 		}
709 
710 		dist /= (float) minLength;
711 		return dist;
712 	}
713 	catch(exception& e) {
714 		m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
715 		exit(1);
716 	}
717 }
718 
719 /**************************************************************************************************/
720 
getUniques(int numSeqs,int numFlowCells,vector<short> & uniqueFlowgrams,vector<int> & uniqueCount,vector<int> & uniqueLengths,vector<int> & mapSeqToUnique,vector<int> & mapUniqueToSeq,vector<int> & lengths,vector<double> & flowDataPrI,vector<short> & flowDataIntI)721 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
722 	try{
723 		int numUniques = 0;
724 		uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
725 		uniqueCount.assign(numSeqs, 0);							//	anWeights
726 		uniqueLengths.assign(numSeqs, 0);
727 		mapSeqToUnique.assign(numSeqs, -1);
728 		mapUniqueToSeq.assign(numSeqs, -1);
729 
730 		vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
731 
732 		for(int i=0;i<numSeqs;i++){
733 
734 			if (m->getControl_pressed()) { break; }
735 
736 			int index = 0;
737 
738 			vector<short> current(numFlowCells);
739 			for(int j=0;j<numFlowCells;j++){
740 				current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
741 			}
742 
743 			for(int j=0;j<numUniques;j++){
744 				int offset = j * numFlowCells;
745 				bool toEnd = 1;
746 
747 				int shorterLength;
748 				if(lengths[i] < uniqueLengths[j])	{	shorterLength = lengths[i];			}
749 				else								{	shorterLength = uniqueLengths[j];	}
750 
751 				for(int k=0;k<shorterLength;k++){
752 					if(current[k] != uniqueFlowgrams[offset + k]){
753 						toEnd = 0;
754 						break;
755 					}
756 				}
757 
758 				if(toEnd){
759 					mapSeqToUnique[i] = j;
760 					uniqueCount[j]++;
761 					index = j;
762 					if(lengths[i] > uniqueLengths[j])	{	uniqueLengths[j] = lengths[i];	}
763 					break;
764 				}
765 				index++;
766 			}
767 
768 			if(index == numUniques){
769 				uniqueLengths[numUniques] = lengths[i];
770 				uniqueCount[numUniques] = 1;
771 				mapSeqToUnique[i] = numUniques;//anMap
772 				mapUniqueToSeq[numUniques] = i;//anF
773 
774 				for(int k=0;k<numFlowCells;k++){
775 					uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
776 					uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
777 				}
778 
779 				numUniques++;
780 			}
781 		}
782 		uniqueFlowDataIntI.resize(numFlowCells * numUniques);
783 		uniqueLengths.resize(numUniques);
784 
785 		flowDataPrI.resize(numSeqs * numFlowCells, 0);
786 		for(int i=0;i<flowDataPrI.size();i++)	{	if (m->getControl_pressed()) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);		}
787 
788         return numUniques;
789 	}
790 	catch(exception& e) {
791 		m->errorOut(e, "ShhherCommand", "getUniques");
792 		exit(1);
793 	}
794 }
795 /**************************************************************************************************/
createNamesFile(int numSeqs,int numUniques,string filename,vector<string> & seqNameVector,vector<int> & mapSeqToUnique,vector<int> & mapUniqueToSeq)796 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
797 	try{
798 
799 		vector<string> duplicateNames(numUniques, "");
800 		for(int i=0;i<numSeqs;i++){
801 			duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
802 		}
803 
804 		ofstream nameFile;
805 		util.openOutputFile(filename, nameFile);
806 
807 		for(int i=0;i<numUniques;i++){
808 
809 			if (m->getControl_pressed()) { break; }
810 
811             //			nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
812 			nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
813 		}
814 
815 		nameFile.close();
816 
817 		return 0;
818 	}
819 	catch(exception& e) {
820 		m->errorOut(e, "ShhherCommand", "createNamesFile");
821 		exit(1);
822 	}
823 }
824 //**********************************************************************************************************************
825 
cluster(string filename,string distFileName,string namesFileName)826 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
827 	try {
828 
829 		ReadMatrix* read = new ReadColumnMatrix(distFileName);
830 		read->setCutoff(cutoff);
831 
832 		NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
833 		clusterNameMap->readMap();
834 		read->read(clusterNameMap);
835 
836 		ListVector* list = read->getListVector();
837 		SparseDistanceMatrix* matrix = read->getDMatrix();
838 
839 		delete read;
840 		delete clusterNameMap;
841 
842 		RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
843 
844         float adjust = -1.0;
845 		Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest", adjust);
846 		string tag = cluster->getTag();
847 
848 		double clusterCutoff = cutoff;
849 		while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
850 
851 			if (m->getControl_pressed()) { break; }
852 
853 			cluster->update(clusterCutoff);
854 		}
855 
856 		list->setLabel(toString(cutoff));
857 
858 		ofstream listFile;
859 		util.openOutputFile(filename, listFile);
860 		list->print(listFile, true);
861 		listFile.close();
862 
863 		delete matrix;	delete cluster;	delete rabund; delete list;
864 
865 		return 0;
866 	}
867 	catch(exception& e) {
868 		m->errorOut(e, "ShhherCommand", "cluster");
869 		exit(1);
870 	}
871 }
872 /**************************************************************************************************/
873 
getOTUData(int numSeqs,string fileName,vector<int> & otuData,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<vector<int>> & aaP,vector<vector<int>> & aaI,vector<int> & seqNumber,vector<int> & seqIndex,map<string,int> & nameMap)874 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
875                                vector<int>& cumNumSeqs,
876                                vector<int>& nSeqsPerOTU,
877                                vector<vector<int> >& aaP,	//tMaster->aanP:	each row is a different otu / each col contains the sequence indices
878                                vector<vector<int> >& aaI,	//tMaster->aanI:	that are in each otu - can't differentiate between aaP and aaI
879                                vector<int>& seqNumber,		//tMaster->anP:		the sequence id number sorted by OTU
880                                vector<int>& seqIndex,
881                                map<string, int>& nameMap){
882 	try {
883         InputData input(fileName, "list", nullVector);
884         ListVector* list = input.getListVector();
885 
886         string label = list->getLabel();
887         int numOTUs = list->getNumBins();
888 
889         if (m->getDebug()) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
890 
891 		otuData.assign(numSeqs, 0);
892 		cumNumSeqs.assign(numOTUs, 0);
893 		nSeqsPerOTU.assign(numOTUs, 0);
894 		aaP.clear();aaP.resize(numOTUs);
895 
896 		seqNumber.clear();
897 		aaI.clear();
898 		seqIndex.clear();
899 
900 		for(int i=0;i<numOTUs;i++){
901 
902 			if (m->getControl_pressed()) { break; }
903             if (m->getDebug()) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
904 
905 			string singleOTU = list->get(i);
906 
907             vector<string> otuSeqs; util.splitAtComma(singleOTU, otuSeqs);
908 
909 			for(int j=0;j<otuSeqs.size();j++){
910 
911                 string seqName = otuSeqs[j];
912                 map<string,int>::iterator nmIt = nameMap.find(seqName);
913                 int index = nmIt->second;
914 
915                 nameMap.erase(nmIt);
916                 otuData[index] = i;
917                 nSeqsPerOTU[i]++;
918                 aaP[i].push_back(index);
919             }
920 
921 			sort(aaP[i].begin(), aaP[i].end());
922 			for(int j=0;j<nSeqsPerOTU[i];j++)       { seqNumber.push_back(aaP[i][j]);   }
923 			for(int j=nSeqsPerOTU[i];j<numSeqs;j++) { aaP[i].push_back(0);              }
924 		}
925 
926 		for(int i=1;i<numOTUs;i++){ cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1]; }
927 		aaI = aaP;
928 		seqIndex = seqNumber;
929         delete list;
930 
931         return numOTUs;
932 	}
933 	catch(exception& e) {
934 		m->errorOut(e, "ShhherCommand", "getOTUData");
935 		exit(1);
936 	}
937 }
938 /**************************************************************************************************/
939 
calcCentroidsDriver(int numOTUs,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<int> & seqIndex,vector<short> & change,vector<int> & centroids,vector<double> & singleTau,vector<int> & mapSeqToUnique,vector<short> & uniqueFlowgrams,vector<short> & flowDataIntI,vector<int> & lengths,int numFlowCells,vector<int> & seqNumber)940 int ShhherCommand::calcCentroidsDriver(int numOTUs,
941                                           vector<int>& cumNumSeqs,
942                                           vector<int>& nSeqsPerOTU,
943                                           vector<int>& seqIndex,
944                                           vector<short>& change,		//did the centroid sequence change? 0 = no; 1 = yes
945                                           vector<int>& centroids,		//the representative flowgram for each cluster m
946                                           vector<double>& singleTau,	//tMaster->adTau:	1-D Tau vector (1xnumSeqs)
947                                           vector<int>& mapSeqToUnique,
948                                           vector<short>& uniqueFlowgrams,
949                                           vector<short>& flowDataIntI,
950                                           vector<int>& lengths,
951                                           int numFlowCells,
952                                           vector<int>& seqNumber){
953 
954 	//this function gets the most likely homopolymer length at a flow position for a group of sequences
955 	//within an otu
956 
957 	try{
958 
959 		for(int i=0;i<numOTUs;i++){
960 
961 			if (m->getControl_pressed()) { break; }
962 
963 			double count = 0;
964 			int position = 0;
965 			int minFlowGram = 100000000;
966 			double minFlowValue = 1e8;
967 			change[i] = 0; //FALSE
968 
969 			for(int j=0;j<nSeqsPerOTU[i];j++){
970 				count += singleTau[seqNumber[cumNumSeqs[i] + j]];
971 			}
972 
973 			if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
974 				vector<double> adF(nSeqsPerOTU[i]);
975 				vector<int> anL(nSeqsPerOTU[i]);
976 
977 				for(int j=0;j<nSeqsPerOTU[i];j++){
978 					int index = cumNumSeqs[i] + j;
979 					int nI = seqIndex[index];
980 					int nIU = mapSeqToUnique[nI];
981 
982 					int k;
983 					for(k=0;k<position;k++){
984 						if(nIU == anL[k]){
985 							break;
986 						}
987 					}
988 					if(k == position){
989 						anL[position] = nIU;
990 						adF[position] = 0.0000;
991 						position++;
992 					}
993 				}
994 
995 				for(int j=0;j<nSeqsPerOTU[i];j++){
996 					int index = cumNumSeqs[i] + j;
997 					int nI = seqIndex[index];
998 
999 					double tauValue = singleTau[seqNumber[index]];
1000 
1001 					for(int k=0;k<position;k++){
1002 						double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
1003 						adF[k] += dist * tauValue;
1004 					}
1005 				}
1006 
1007 				for(int j=0;j<position;j++){
1008 					if(adF[j] < minFlowValue){
1009 						minFlowGram = j;
1010 						minFlowValue = adF[j];
1011 					}
1012 				}
1013 
1014 				if(centroids[i] != anL[minFlowGram]){
1015 					change[i] = 1;
1016 					centroids[i] = anL[minFlowGram];
1017 				}
1018 			}
1019 			else if(centroids[i] != -1){
1020 				change[i] = 1;
1021 				centroids[i] = -1;
1022 			}
1023 		}
1024 
1025         return 0;
1026 	}
1027 	catch(exception& e) {
1028 		m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1029 		exit(1);
1030 	}
1031 }
1032 /**************************************************************************************************/
1033 
getDistToCentroid(int cent,int flow,int length,vector<short> & uniqueFlowgrams,vector<short> & flowDataIntI,int numFlowCells)1034 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
1035                                         vector<short>& flowDataIntI, int numFlowCells){
1036 	try{
1037 
1038 		int flowAValue = cent * numFlowCells;
1039 		int flowBValue = flow * numFlowCells;
1040 
1041 		double dist = 0;
1042 
1043 		for(int i=0;i<length;i++){
1044 			dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1045 			flowAValue++;
1046 			flowBValue++;
1047 		}
1048 
1049 		return dist / (double)length;
1050 	}
1051 	catch(exception& e) {
1052 		m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1053 		exit(1);
1054 	}
1055 }
1056 /**************************************************************************************************/
1057 
getNewWeights(int numOTUs,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<double> & singleTau,vector<int> & seqNumber,vector<double> & weight)1058 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
1059 	try{
1060 
1061 		double maxChange = 0;
1062 
1063 		for(int i=0;i<numOTUs;i++){
1064 
1065 			if (m->getControl_pressed()) { break; }
1066 
1067 			double difference = weight[i];
1068 			weight[i] = 0;
1069 
1070 			for(int j=0;j<nSeqsPerOTU[i];j++){
1071 				int index = cumNumSeqs[i] + j;
1072 				double tauValue = singleTau[seqNumber[index]];
1073 				weight[i] += tauValue;
1074 			}
1075 
1076 			difference = fabs(weight[i] - difference);
1077 			if(difference > maxChange){	maxChange = difference;	}
1078 		}
1079 		return maxChange;
1080 	}
1081 	catch(exception& e) {
1082 		m->errorOut(e, "ShhherCommand", "getNewWeights");
1083 		exit(1);
1084 	}
1085 }
1086 
1087 /**************************************************************************************************/
1088 
getLikelihood(int numSeqs,int numOTUs,vector<int> & nSeqsPerOTU,vector<int> & seqNumber,vector<int> & cumNumSeqs,vector<int> & seqIndex,vector<double> & dist,vector<double> & weight)1089 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
1090 
1091 	try{
1092 
1093 		vector<long double> P(numSeqs, 0);
1094 		int effNumOTUs = 0;
1095 
1096 		for(int i=0;i<numOTUs;i++){
1097 			if(weight[i] > MIN_WEIGHT){
1098 				effNumOTUs++;
1099 			}
1100 		}
1101 
1102 		string hold;
1103 		for(int i=0;i<numOTUs;i++){
1104 
1105 			if (m->getControl_pressed()) { break; }
1106 
1107 			for(int j=0;j<nSeqsPerOTU[i];j++){
1108 				int index = cumNumSeqs[i] + j;
1109 				int nI = seqIndex[index];
1110 				double singleDist = dist[seqNumber[index]];
1111 
1112 				P[nI] += weight[i] * exp(-singleDist * sigma);
1113 			}
1114 		}
1115 		double nLL = 0.00;
1116 		for(int i=0;i<numSeqs;i++){
1117 			if(P[i] == 0){	P[i] = DBL_EPSILON;	}
1118 
1119 			nLL += -log(P[i]);
1120 		}
1121 
1122 		nLL = nLL -(double)numSeqs * log(sigma);
1123 
1124 		return nLL;
1125 	}
1126 	catch(exception& e) {
1127 		m->errorOut(e, "ShhherCommand", "getNewWeights");
1128 		exit(1);
1129 	}
1130 }
1131 
1132 /**************************************************************************************************/
1133 
checkCentroids(int numOTUs,vector<int> & centroids,vector<double> & weight)1134 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
1135 	try{
1136 		vector<int> unique(numOTUs, 1);
1137 
1138 		for(int i=0;i<numOTUs;i++){
1139 			if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1140 				unique[i] = -1;
1141 			}
1142 		}
1143 
1144 		for(int i=0;i<numOTUs;i++){
1145 
1146 			if (m->getControl_pressed()) { break; }
1147 
1148 			if(unique[i] == 1){
1149 				for(int j=i+1;j<numOTUs;j++){
1150 					if(unique[j] == 1){
1151 
1152 						if(centroids[j] == centroids[i]){
1153 							unique[j] = 0;
1154 							centroids[j] = -1;
1155 
1156 							weight[i] += weight[j];
1157 							weight[j] = 0.0;
1158 						}
1159 					}
1160 				}
1161 			}
1162 		}
1163 
1164         return 0;
1165 	}
1166 	catch(exception& e) {
1167 		m->errorOut(e, "ShhherCommand", "checkCentroids");
1168 		exit(1);
1169 	}
1170 }
1171 /**************************************************************************************************/
1172 
calcNewDistances(int numSeqs,int numOTUs,vector<int> & nSeqsPerOTU,vector<double> & dist,vector<double> & weight,vector<short> & change,vector<int> & centroids,vector<vector<int>> & aaP,vector<double> & singleTau,vector<vector<int>> & aaI,vector<int> & seqNumber,vector<int> & seqIndex,vector<short> & uniqueFlowgrams,vector<short> & flowDataIntI,int numFlowCells,vector<int> & lengths)1173 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist,
1174                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
1175                                      vector<vector<int> >& aaP,	vector<double>& singleTau, vector<vector<int> >& aaI,
1176                                      vector<int>& seqNumber, vector<int>& seqIndex,
1177                                      vector<short>& uniqueFlowgrams,
1178                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
1179 
1180 	try{
1181 
1182 		int total = 0;
1183 		vector<double> newTau(numOTUs,0);
1184 		vector<double> norms(numSeqs, 0);
1185 		nSeqsPerOTU.assign(numOTUs, 0);
1186 
1187 		for(int i=0;i<numSeqs;i++){
1188 
1189 			if (m->getControl_pressed()) { break; }
1190 
1191 			int indexOffset = i * numOTUs;
1192 
1193 			double offset = 1e8;
1194 
1195 			for(int j=0;j<numOTUs;j++){
1196 
1197 				if(weight[j] > MIN_WEIGHT && change[j] == 1){
1198 					dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
1199 				}
1200 
1201 				if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1202 					offset = dist[indexOffset + j];
1203 				}
1204 			}
1205 
1206 			for(int j=0;j<numOTUs;j++){
1207 				if(weight[j] > MIN_WEIGHT){
1208 					newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1209 					norms[i] += newTau[j];
1210 				}
1211 				else{
1212 					newTau[j] = 0.0;
1213 				}
1214 			}
1215 
1216 			for(int j=0;j<numOTUs;j++){
1217 				newTau[j] /= norms[i];
1218 			}
1219 
1220 			for(int j=0;j<numOTUs;j++){
1221 				if(newTau[j] > MIN_TAU){
1222 
1223 					int oldTotal = total;
1224 
1225 					total++;
1226 
1227 					singleTau.resize(total, 0);
1228 					seqNumber.resize(total, 0);
1229 					seqIndex.resize(total, 0);
1230 
1231 					singleTau[oldTotal] = newTau[j];
1232 
1233 					aaP[j][nSeqsPerOTU[j]] = oldTotal;
1234 					aaI[j][nSeqsPerOTU[j]] = i;
1235 					nSeqsPerOTU[j]++;
1236 				}
1237 			}
1238 
1239 		}
1240 
1241 	}
1242 	catch(exception& e) {
1243 		m->errorOut(e, "ShhherCommand", "calcNewDistances");
1244 		exit(1);
1245 	}
1246 }
1247 /**************************************************************************************************/
1248 
fill(int numOTUs,vector<int> & seqNumber,vector<int> & seqIndex,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<vector<int>> & aaP,vector<vector<int>> & aaI)1249 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
1250 	try {
1251 		int index = 0;
1252 		for(int i=0;i<numOTUs;i++){
1253 
1254 			if (m->getControl_pressed()) { return 0; }
1255 
1256 			cumNumSeqs[i] = index;
1257 			for(int j=0;j<nSeqsPerOTU[i];j++){
1258 				seqNumber[index] = aaP[i][j];
1259 				seqIndex[index] = aaI[i][j];
1260 
1261 				index++;
1262 			}
1263 		}
1264 
1265         return 0;
1266 	}
1267 	catch(exception& e) {
1268 		m->errorOut(e, "ShhherCommand", "fill");
1269 		exit(1);
1270 	}
1271 }
1272 /**************************************************************************************************/
1273 
setOTUs(int numOTUs,int numSeqs,vector<int> & seqNumber,vector<int> & seqIndex,vector<int> & cumNumSeqs,vector<int> & nSeqsPerOTU,vector<int> & otuData,vector<double> & singleTau,vector<double> & dist,vector<vector<int>> & aaP,vector<vector<int>> & aaI)1274 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
1275                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
1276 
1277 	try {
1278 		vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1279 
1280 		for(int i=0;i<numOTUs;i++){
1281 
1282 			if (m->getControl_pressed()) { break; }
1283 
1284 			for(int j=0;j<nSeqsPerOTU[i];j++){
1285 				int index = cumNumSeqs[i] + j;
1286 				double tauValue = singleTau[seqNumber[index]];
1287 				int sIndex = seqIndex[index];
1288 				bigTauMatrix[sIndex * numOTUs + i] = tauValue;
1289 			}
1290 		}
1291 
1292 		for(int i=0;i<numSeqs;i++){
1293 			double maxTau = -1.0000;
1294 			int maxOTU = -1;
1295 			for(int j=0;j<numOTUs;j++){
1296 				if(bigTauMatrix[i * numOTUs + j] > maxTau){
1297 					maxTau = bigTauMatrix[i * numOTUs + j];
1298 					maxOTU = j;
1299 				}
1300 			}
1301 
1302 			otuData[i] = maxOTU;
1303 		}
1304 
1305 		nSeqsPerOTU.assign(numOTUs, 0);
1306 
1307 		for(int i=0;i<numSeqs;i++){
1308 			int index = otuData[i];
1309 
1310 			singleTau[i] = 1.0000;
1311 			dist[i] = 0.0000;
1312 
1313 			aaP[index][nSeqsPerOTU[index]] = i;
1314 			aaI[index][nSeqsPerOTU[index]] = i;
1315 
1316 			nSeqsPerOTU[index]++;
1317 		}
1318 
1319 		fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
1320 	}
1321 	catch(exception& e) {
1322 		m->errorOut(e, "ShhherCommand", "setOTUs");
1323 		exit(1);
1324 	}
1325 }
1326 /**************************************************************************************************/
1327 
writeQualities(int numOTUs,int numFlowCells,string qualityFileName,vector<int> otuCounts,vector<int> & nSeqsPerOTU,vector<int> & seqNumber,vector<double> & singleTau,vector<short> & flowDataIntI,vector<short> & uniqueFlowgrams,vector<int> & cumNumSeqs,vector<int> & mapUniqueToSeq,vector<string> & seqNameVector,vector<int> & centroids,vector<vector<int>> & aaI)1328 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
1329                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
1330                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
1331 
1332 	try {
1333 
1334 		ofstream qualityFile;
1335 		util.openOutputFile(qualityFileName, qualityFile);
1336 
1337 		qualityFile.setf(ios::fixed, ios::floatfield);
1338 		qualityFile.setf(ios::showpoint);
1339 		qualityFile << setprecision(6);
1340 
1341 		vector<vector<int> > qualities(numOTUs);
1342 		vector<double> pr(HOMOPS, 0);
1343 
1344 
1345 		for(int i=0;i<numOTUs;i++){
1346 
1347 			if (m->getControl_pressed()) { break; }
1348 
1349 			int index = 0;
1350 
1351 			if(nSeqsPerOTU[i] > 0){
1352 
1353 				while(index < numFlowCells){
1354 
1355 					double maxPrValue = 1e8;
1356 					short maxPrIndex = -1;
1357 					double count = 0.0000;
1358 
1359 					pr.assign(HOMOPS, 0);
1360 
1361 					for(int j=0;j<nSeqsPerOTU[i];j++){
1362 						int lIndex = cumNumSeqs[i] + j;
1363 						double tauValue = singleTau[seqNumber[lIndex]];
1364 						int sequenceIndex = aaI[i][j];
1365 						short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1366 
1367 						count += tauValue;
1368 
1369 						for(int s=0;s<HOMOPS;s++){
1370 							pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1371 						}
1372 					}
1373 
1374 					maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1375 					maxPrValue = pr[maxPrIndex];
1376 
1377 					if(count > MIN_COUNT){
1378 						double U = 0.0000;
1379 						double norm = 0.0000;
1380 
1381 						for(int s=0;s<HOMOPS;s++){
1382 							norm += exp(-(pr[s] - maxPrValue));
1383 						}
1384 
1385 						for(int s=1;s<=maxPrIndex;s++){
1386 							int value = 0;
1387 							double temp = 0.0000;
1388 
1389 							U += exp(-(pr[s-1]-maxPrValue))/norm;
1390 
1391 							if(U>0.00){
1392 								temp = log10(U);
1393 							}
1394 							else{
1395 								temp = -10.1;
1396 							}
1397 							temp = floor(-10 * temp);
1398 							value = (int)floor(temp);
1399 							if(value > 100){	value = 100;	}
1400 
1401                             qualities[i].push_back((int)value);
1402 						}
1403 					}//end if
1404 
1405 					index++;
1406 
1407 				}//end while
1408 
1409 			}//end if
1410 
1411 
1412 			if(otuCounts[i] > 0){
1413 				qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1414                 //need to get past the first four bases
1415                 for (int j = 4; j < qualities[i].size(); j++) { qualityFile << qualities[i][j] << ' '; }
1416 				qualityFile << endl;
1417 			}
1418 		}//end for
1419 		qualityFile.close();
1420 		outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
1421 
1422 	}
1423 	catch(exception& e) {
1424 		m->errorOut(e, "ShhherCommand", "writeQualities");
1425 		exit(1);
1426 	}
1427 }
1428 
1429 /**************************************************************************************************/
1430 
writeSequences(string thisCompositeFASTAFileName,int numOTUs,int numFlowCells,string fastaFileName,vector<int> otuCounts,vector<short> & uniqueFlowgrams,vector<string> & seqNameVector,vector<vector<int>> & aaI,vector<int> & centroids)1431 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
1432 	try {
1433 
1434 		ofstream fastaFile;
1435 		util.openOutputFile(fastaFileName, fastaFile);
1436 
1437 		vector<string> names(numOTUs, "");
1438 
1439 		for(int i=0;i<numOTUs;i++){
1440 
1441 			if (m->getControl_pressed()) { break; }
1442 
1443 			int index = centroids[i];
1444 
1445 			if(otuCounts[i] > 0){
1446 				fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1447 
1448 				string newSeq = "";
1449 
1450 				for(int j=0;j<numFlowCells;j++){
1451 
1452 					char base = flowOrder[j % flowOrder.length()];
1453 					for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1454 						newSeq += base;
1455 					}
1456 				}
1457 
1458 				if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
1459                 else {  fastaFile << "NNNN" << endl;  }
1460 			}
1461 		}
1462 		fastaFile.close();
1463 
1464 		outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
1465 
1466 		if(thisCompositeFASTAFileName != ""){
1467 			util.appendFiles(fastaFileName, thisCompositeFASTAFileName);
1468 		}
1469 	}
1470 	catch(exception& e) {
1471 		m->errorOut(e, "ShhherCommand", "writeSequences");
1472 		exit(1);
1473 	}
1474 }
1475 
1476 /**************************************************************************************************/
1477 
writeNames(string thisCompositeNamesFileName,int numOTUs,string nameFileName,vector<int> otuCounts,vector<string> & seqNameVector,vector<vector<int>> & aaI,vector<int> & nSeqsPerOTU)1478 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
1479 	try {
1480 
1481 		ofstream nameFile;
1482 		util.openOutputFile(nameFileName, nameFile);
1483 
1484 		for(int i=0;i<numOTUs;i++){
1485 
1486 			if (m->getControl_pressed()) { break; }
1487 
1488 			if(otuCounts[i] > 0){
1489 				nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1490 
1491 				for(int j=1;j<nSeqsPerOTU[i];j++){
1492 					nameFile << ',' << seqNameVector[aaI[i][j]];
1493 				}
1494 
1495 				nameFile << endl;
1496 			}
1497 		}
1498 		nameFile.close();
1499 		outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
1500 
1501 
1502 		if(thisCompositeNamesFileName != ""){
1503 			util.appendFiles(nameFileName, thisCompositeNamesFileName);
1504 		}
1505 	}
1506 	catch(exception& e) {
1507 		m->errorOut(e, "ShhherCommand", "writeNames");
1508 		exit(1);
1509 	}
1510 }
1511 
1512 /**************************************************************************************************/
1513 
writeGroups(string groupFileName,string fileRoot,int numSeqs,vector<string> & seqNameVector)1514 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
1515 	try {
1516         ofstream groupFile;
1517 		util.openOutputFile(groupFileName, groupFile);
1518 
1519 		for(int i=0;i<numSeqs;i++){
1520 			if (m->getControl_pressed()) { break; }
1521 			groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1522 		}
1523 		groupFile.close();
1524 		outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
1525 
1526 	}
1527 	catch(exception& e) {
1528 		m->errorOut(e, "ShhherCommand", "writeGroups");
1529 		exit(1);
1530 	}
1531 }
1532 
1533 /**************************************************************************************************/
1534 
writeClusters(string otuCountsFileName,int numOTUs,int numFlowCells,vector<int> otuCounts,vector<int> & centroids,vector<short> & uniqueFlowgrams,vector<string> & seqNameVector,vector<vector<int>> & aaI,vector<int> & nSeqsPerOTU,vector<int> & lengths,vector<short> & flowDataIntI)1535 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
1536 	try {
1537 		ofstream otuCountsFile;
1538 		util.openOutputFile(otuCountsFileName, otuCountsFile);
1539 
1540 		string bases = flowOrder;
1541 
1542 		for(int i=0;i<numOTUs;i++){
1543 
1544 			if (m->getControl_pressed()) {
1545 				break;
1546 			}
1547 			//output the translated version of the centroid sequence for the otu
1548 			if(otuCounts[i] > 0){
1549 				int index = centroids[i];
1550 
1551 				otuCountsFile << "ideal\t";
1552 				for(int j=8;j<numFlowCells;j++){
1553 					char base = bases[j % bases.length()];
1554 					for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1555 						otuCountsFile << base;
1556 					}
1557 				}
1558 				otuCountsFile << endl;
1559 
1560 				for(int j=0;j<nSeqsPerOTU[i];j++){
1561 					int sequence = aaI[i][j];
1562 					otuCountsFile << seqNameVector[sequence] << '\t';
1563 
1564 					string newSeq = "";
1565 
1566 					for(int k=0;k<lengths[sequence];k++){
1567 						char base = bases[k % bases.length()];
1568 						int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1569 
1570 						for(int s=0;s<freq;s++){
1571 							newSeq += base;
1572 							//otuCountsFile << base;
1573 						}
1574 					}
1575 
1576                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
1577                     else {  otuCountsFile << "NNNN" << endl;  }
1578 				}
1579 				otuCountsFile << endl;
1580 			}
1581 		}
1582 		otuCountsFile.close();
1583 		outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
1584 
1585 	}
1586 	catch(exception& e) {
1587 		m->errorOut(e, "ShhherCommand", "writeClusters");
1588 		exit(1);
1589 	}
1590 }
1591 
1592 /**************************************************************************************************/
1593 
getSingleLookUp()1594 void ShhherCommand::getSingleLookUp(){
1595 	try{
1596 		//	these are the -log probabilities that a signal corresponds to a particular homopolymer length
1597 		singleLookUp.assign(HOMOPS * NUMBINS, 0);
1598 
1599 		int index = 0;
1600 		ifstream lookUpFile;
1601 		util.openInputFile(lookupFileName, lookUpFile);
1602 
1603 		for(int i=0;i<HOMOPS;i++){
1604 
1605 			if (m->getControl_pressed()) { break; }
1606 
1607 			float logFracFreq;
1608 			lookUpFile >> logFracFreq;
1609 
1610 			for(int j=0;j<NUMBINS;j++)	{
1611 				lookUpFile >> singleLookUp[index];
1612 				index++;
1613 			}
1614 		}
1615 		lookUpFile.close();
1616 	}
1617 	catch(exception& e) {
1618 		m->errorOut(e, "ShhherCommand", "getSingleLookUp");
1619 		exit(1);
1620 	}
1621 }
1622 
1623 /**************************************************************************************************/
1624 
getJointLookUp()1625 void ShhherCommand::getJointLookUp(){
1626 	try{
1627 
1628 		//	the most likely joint probability (-log) that two intenities have the same polymer length
1629 		jointLookUp.resize(NUMBINS * NUMBINS, 0);
1630 
1631 		for(int i=0;i<NUMBINS;i++){
1632 
1633 			if (m->getControl_pressed()) { break; }
1634 
1635 			for(int j=0;j<NUMBINS;j++){
1636 
1637 				double minSum = 100000000;
1638 
1639 				for(int k=0;k<HOMOPS;k++){
1640 					double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
1641 
1642 					if(sum < minSum)	{	minSum = sum;		}
1643 				}
1644 				jointLookUp[i * NUMBINS + j] = minSum;
1645 			}
1646 		}
1647 	}
1648 	catch(exception& e) {
1649 		m->errorOut(e, "ShhherCommand", "getJointLookUp");
1650 		exit(1);
1651 	}
1652 }
1653 
1654 /**************************************************************************************************/
1655 
getProbIntensity(int intIntensity)1656 double ShhherCommand::getProbIntensity(int intIntensity){
1657 	try{
1658 
1659 		double minNegLogProb = 100000000;
1660 
1661 
1662 		for(int i=0;i<HOMOPS;i++){//loop signal strength
1663 
1664 			if (m->getControl_pressed()) { break; }
1665 
1666 			float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
1667 			if(negLogProb < minNegLogProb)	{	minNegLogProb = negLogProb; }
1668 		}
1669 
1670 		return minNegLogProb;
1671 	}
1672 	catch(exception& e) {
1673 		m->errorOut(e, "ShhherCommand", "getProbIntensity");
1674 		exit(1);
1675 	}
1676 }
1677 
1678 
1679 
1680 
1681