1 /*************************************************************************** 2 * Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler * 3 * minh.bui@univie.ac.at * 4 * * 5 * This program is free software; you can redistribute it and/or modify * 6 * it under the terms of the GNU General Public License as published by * 7 * the Free Software Foundation; either version 2 of the License, or * 8 * (at your option) any later version. * 9 * * 10 * This program is distributed in the hope that it will be useful, * 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 13 * GNU General Public License for more details. * 14 * * 15 * You should have received a copy of the GNU General Public License * 16 * along with this program; if not, write to the * 17 * Free Software Foundation, Inc., * 18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * 19 ***************************************************************************/ 20 #ifndef PDNETWORK_H 21 #define PDNETWORK_H 22 23 #include "splitgraph.h" 24 25 /** 26 General Split Network for Phylogenetic Diversity Algorithm 27 28 @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler 29 */ 30 class PDNetwork : public SplitGraph 31 { 32 public: 33 34 friend class MTree; 35 friend class ECOpd; 36 37 /** 38 empty constructor 39 */ 40 PDNetwork(); 41 42 /** 43 construct PD network from a NEXUS or NEWICK file, e.g. produced by SplitsTree 44 @param params program parameters 45 */ 46 PDNetwork(Params ¶ms); 47 48 49 /** 50 Identify the root node if specified, include it into the initial set 51 @param root_name name of the root node 52 */ 53 void readRootNode(const char *root_name); 54 55 /** 56 read the parameter from the file and incoporate into split system 57 @param params program parameters 58 */ 59 void readParams(Params ¶ms); 60 61 /** 62 read the initial set of taxa to be included into PD-tree 63 @param params program parameters 64 */ 65 void readInitialSet(Params ¶ms); 66 67 68 /** 69 read the initial areas to be included into PD set 70 @param params program parameters 71 */ 72 void readInitialAreas(Params ¶ms); 73 74 /** 75 increase the weight of the split associated with initial set 76 */ 77 void proceedInitialSet(); 78 79 /** 80 initialize when PD min specified 81 */ 82 void initPDMin(); 83 84 85 /** 86 compute the minimum required costs to conserve a taxa set 87 @param taxset set of taxa 88 @return budget required 89 */ 90 int calcCost(IntVector &taxset); 91 92 /** 93 compute the minimum required costs to conserve a taxa set 94 @param taxset set of taxa 95 @return budget required 96 */ 97 int calcCost(Split &taxset); 98 99 void printOutputSetScore(Params ¶ms, vector<SplitSet> &pd_set); 100 101 /** 102 compute the PD score of a given taxa set in filename 103 @param params program parameters 104 @param taxa_set (OUT) corresponding set of taxa 105 @param pd_more (OUT) more computed PD measures will be stored here 106 */ 107 void computePD(Params ¶ms, SplitSet &taxa_set, PDRelatedMeasures &pd_more); 108 109 /** 110 this will be called by findPD at the beginning 111 @param params program parameters 112 */ 113 virtual void enterFindPD(Params ¶ms); 114 115 /** 116 main function to search for maximal phylogenetic diversity 117 @param params program parameters 118 @param taxa_set (OUT) the vector of set of taxa in the maximal PD set 119 @param taxa_order (OUT) order of inserted taxa 120 */ 121 virtual void findPD(Params ¶ms, vector<SplitSet> &taxa_set, vector<int> &taxa_order); 122 123 124 /** 125 this function will be called by findPD at the end 126 @param taxa_set (IN/OUT) the vector of set of taxa in the maximal PD set 127 */ 128 virtual void leaveFindPD(vector<SplitSet> &taxa_set); 129 130 /** 131 calculate the PD gain matrix in terms of delta_k^j = pd(PD_k \/ {j}) - pd_k 132 @param pd_set set of optimal PD sets 133 @param delta (OUT) PD gain matrix 134 */ 135 void calcPDGain(vector<SplitSet> &pd_set, mmatrix(double) &delta); 136 137 /** 138 compute the PD score of a given taxa set with name in taxa_name, result is written to id_set.weight. 139 The difference from calcWeight() is that calcPD takes initialset into account 140 @param id_set (IN/OUT) corresponding set of taxa 141 142 */ 143 void calcPD(Split &id_set); 144 145 /** 146 compute PD of a set of areas. It implicitly takes area_taxa map into account. 147 @param area_id_set IDs of areas in the set 148 */ 149 void calcPDArea(Split &area_id_set); 150 151 /** 152 compute the EXCLUSIVE PD score of a given taxa set with name in taxa_name, result is written to id_set.weight 153 @param id_set (IN/OUT) corresponding set of taxa IDs 154 */ 155 void calcExclusivePD(Split &id_set); 156 157 /** 158 compute the area's PD ENDEMISM of set of area 159 @param area_set set of area 160 @param pd_endem (OUT) corresponding PD endemism 161 */ 162 void calcPDEndemism(SplitSet &area_set, DoubleVector &pd_endem); 163 164 /** 165 compute the area's PD complementarity given a specific area 166 @param area_set set of area 167 @param area_names given area names as string separated by commas 168 @param all_names all area names 169 @param pd_comp (OUT) corresponding PD endemism 170 */ 171 void calcPDComplementarity(SplitSet &area_set, char *area_names, 172 vector<string> &all_names, DoubleVector &pd_comp); 173 174 175 /** 176 transform the problem into an Integer Linear Programming and write to .lp file 177 @param params program parameters 178 @param outfile name of output file in LP format 179 @param total_size k for PD_k or total budget 180 @param make_bin TRUE if creating binary programming 181 */ 182 void transformLP(Params ¶ms, const char *outfile, int total_size, bool make_bin); 183 void transformLP2(Params ¶ms, const char *outfile, int total_size, bool make_bin); 184 void transformEcoLP(Params ¶ms, const char *outline, int total_size); 185 186 /** 187 transform the problem into an Integer Linear Programming and write to .lp file 188 @param params program parameters 189 @param outfile name of output file in LP format 190 @param total_size k for PD_k or total budget 191 @param make_bin TRUE if creating binary programming 192 */ 193 void transformLP_Area(Params ¶ms, const char *outfile, int total_size, bool make_bin); 194 void transformLP_Area2(Params ¶ms, const char *outfile, int total_size, bool make_bin); 195 196 /** 197 transform the problem into an Integer Linear Programming and write to .lp file 198 @param params program parameters 199 @param outfile name of output file in LP format 200 @param pd_proportion minimum PD proprotion to be conserved 201 @param make_bin TRUE if creating binary programming 202 */ 203 void transformMinK_Area(Params ¶ms, const char *outfile, double pd_proprotion, bool make_bin); 204 void transformMinK_Area2(Params ¶ms, const char *outfile, double pd_proportion, bool make_bin); 205 206 /** 207 transform the PD problem into linear programming and solve it 208 @param params program parameters 209 @param taxa_set (OUT) the vector of set of taxa in the maximal PD set 210 */ 211 void findPD_LP(Params ¶ms, vector<SplitSet> &taxa_set); 212 213 /** 214 transform the PD problem into linear programming and solve it 215 @param params program parameters 216 @param areas_set (OUT) the vector of set of areas in the maximal PD set 217 */ 218 void findPDArea_LP(Params ¶ms, vector<SplitSet> &areas_set); 219 220 double findMinKArea_LP(Params ¶ms, const char* filename, double pd_proportion, Split &area); 221 222 /** 223 @return TRUE if we are doing PD area optimization 224 */ 225 virtual bool isPDArea(); 226 227 /** 228 check if all taxa are covered by the set of areas 229 @return false if there exists some taxon which is not covered by any areas 230 */ 231 bool checkAreaCoverage(); 232 233 /** 234 transform the problem into an Integer Linear Programming and write to .lp file 235 @param outfile name of output file in LP format 236 @param included_area (OUT) collection of areas that should always be included 237 */ 238 void transformLP_Area_Coverage(const char *outfile, Params ¶ms, Split &included_area); 239 240 241 /** 242 @return the minimum number of areas needed to cover all taxa 243 @param params program parameters 244 @param area_id (OUT) minimal set of areas which cover all taxa 245 */ 246 int findMinAreas(Params ¶ms, Split &area_id); 247 248 249 250 /** 251 the set of areas, each item contains the set of taxa in the area. 252 */ 253 SplitSet area_taxa; 254 255 /** 256 speciesList is used in ECOpd analysis for synchronization of species in SplitNetwork with species in FoodWeb 257 */ 258 void speciesList(vector<string> *speciesNames); 259 260 protected: 261 262 /** 263 extra PD when integrating initial set 264 */ 265 double extra_pd; 266 267 /** 268 when computing PD min (instead of PD max) 269 */ 270 bool min_pd; 271 272 /** 273 taxa set to be included into optimal PD set (with -i option) 274 */ 275 IntVector initialset; 276 277 278 /** 279 areas to be included into optimal PD set (with -ia option) 280 */ 281 IntVector initialareas; 282 283 /** 284 calculate the total maximum budget required 285 @return total maximum budget required 286 */ 287 int calcMaxBudget(); 288 289 /******************************************************** 290 hill-climbing and greedy heuristics 291 ********************************************************/ 292 293 /** 294 greedy algorithm for phylogenetic diversity of a given size 295 @param subsize the subset size 296 @param taxa_set (OUT) the set of taxa in the PD-set 297 @param taxa_order (OUT) order of inserted taxa 298 @return the PD score of the maximal set, also returned in taxa_set.weight 299 */ 300 double greedyPD(int subsize, Split &taxa_set, vector<int> &taxa_order); 301 302 303 /** 304 local search algorithm for phylogenetic diversity of a given size 305 @param subsize the subset size 306 @param taxa_set (OUT) the set of taxa in the PD-set 307 @param taxa_order (IN) order of inserted taxa 308 @return the PD score of the maximal set, also returned in taxa_set.weight 309 */ 310 double localSearchPD(int subsize, Split &taxa_set, vector<int> &taxa_order); 311 312 /******************************************************** 313 exhaustive search 314 ********************************************************/ 315 316 /** 317 exhaustive search version 2 for maximal phylogenetic diversity of a given size 318 @param subsize the subset size 319 @param cur_tax current taxon 320 @param curset current set 321 @param find_all TRUE if wanting all max PD set 322 @param best_set (OUT) the set of taxa in the maximal PD set 323 @param taxa_order (OUT) order of inserted taxa 324 @param rem_splits (IN) remaining splits 325 @param rem_it (IN) begin of remaining iterator 326 @return the PD score of the maximal set 327 */ 328 double exhaustPD2(int subsize, int cur_tax, Split &curset, 329 bool find_all,SplitSet &best_set, vector<int> &taxa_order, 330 IntList &rem_splits, IntList::iterator &rem_it); 331 332 /** 333 exhaustive search for maximal PD with cost-constrained 334 @param cur_budget current budget 335 @param cur_tax current taxon 336 @param curset current set 337 @param find_all TRUE if wanting all max PD set 338 @param best_set (OUT) the set of taxa in the maximal PD set 339 @param taxa_order (OUT) order of inserted taxa 340 @param rem_splits (IN) remaining splits 341 @param rem_it (IN) begin of remaining iterator 342 @return the PD score of the maximal set 343 */ 344 double exhaustPDBudget(int cur_budget, int cur_tax, Split &curset, 345 bool find_all,SplitSet &best_set, vector<int> &taxa_order, 346 IntList &rem_splits, IntList::iterator &rem_it); 347 348 /** 349 calculate sum of weights of preserved splits in the taxa_set 350 @param taxa_set a set of taxa 351 @param rem_splits remaining splits 352 @param rem_it begin iterator of remaining splits 353 */ 354 double calcRaisedWeight(Split &taxa_set, IntList &rem_splits, IntList::iterator & rem_it); 355 356 /** 357 update the best taxa set during the search 358 @param curset the current taxa set 359 @param best_set the list of best taxa set so far 360 */ 361 void updateSplitVector(Split &curset, SplitSet &best_set); 362 363 /******************************************************** 364 linear programming support 365 ********************************************************/ 366 367 /** 368 y variables in the LP formulation, check if it can be dropped or equals some x variable. 369 @param total_size k for PD_k or total budget 370 @param y_value (OUT): vector of: -1 if cannot reduce, 1 if equals 1, or id+2 where id is the trivial split id 371 */ 372 void checkYValue(int total_size, vector<int> &y_value); 373 374 /** 375 y variables in the LP formulation for PD area optimization, check if it can be dropped or equals some x variable. 376 @param total_size k for PD_k or total budget 377 @param y_value (OUT) vector of: -1 if cannot reduce, 1 if can be dropped, or id+2 where id is the trivial area id 378 @param count1 (OUT) count of x variables in the inequality 1 379 @param count2 (OUT) count of x variables in the inequality 2 380 */ 381 void checkYValue_Area(int total_size, vector<int> &y_value, vector<int> &count1, vector<int> &count2); 382 383 /** 384 check if a taxon is uniquely covered by one area 385 @param taxon the taxon ID 386 @param area (OUT) area the area ID that covers taxon 387 @return TRUE if the 'taxon' is uniquely covered by only one area. Otherwise FALSE. 388 */ 389 bool isUniquelyCovered(int taxon, int &area); 390 391 void lpObjectiveMaxSD(ostream &out, Params ¶ms, IntVector &y_value, int total_size); 392 393 void lpObjectiveMinK(ostream &out, Params ¶ms); 394 395 void lpSplitConstraint_RS(ostream &out, Params ¶ms, IntVector &y_value, IntVector &count1, IntVector &count2, int total_size); 396 void lpSplitConstraint_TS(ostream &out, Params ¶ms, IntVector &y_value, int total_size); 397 398 void lpK_BudgetConstraint(ostream &out, Params ¶ms, int total_size); 399 400 void lpMinSDConstraint(ostream &out, Params ¶ms, IntVector &y_value, double pd_proportion); 401 402 void lpVariableBound(ostream &out, Params ¶ms, Split &included_vars, IntVector &y_value); 403 404 void lpBoundaryConstraint(ostream &out, Params ¶ms); 405 406 void lpVariableBinary(ostream &out, Params ¶ms, Split &included_vars); 407 408 void lpVariableBinary(const char *outfile, Params ¶ms, Split &included_vars); 409 void lpVariableBinary(const char *outfile, Params ¶ms, IntVector &initialset); 410 void lpInitialArea(ostream &out, Params ¶ms); 411 412 void computeFeasibleBudget(Params ¶ms, IntVector &list_k); 413 414 }; 415 416 #endif 417