1 /***************************************************************************
2  *   Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler   *
3  *   minh.bui@univie.ac.at   *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20 #ifndef PDNETWORK_H
21 #define PDNETWORK_H
22 
23 #include "splitgraph.h"
24 
25 /**
26 General Split Network for Phylogenetic Diversity Algorithm
27 
28 @author BUI Quang Minh, Steffen Klaere, Arndt von Haeseler
29 */
30 class PDNetwork : public SplitGraph
31 {
32 public:
33 
34 	friend class MTree;
35 	friend class ECOpd;
36 
37 	/**
38 		empty constructor
39 	*/
40     PDNetwork();
41 
42 	/**
43 		construct PD network from a NEXUS or NEWICK file, e.g. produced by SplitsTree
44 		@param params program parameters
45 	*/
46     PDNetwork(Params &params);
47 
48 
49 	/**
50 		Identify the root node if specified, include it into the initial set
51 		@param root_name name of the root node
52 	*/
53 	void readRootNode(const char *root_name);
54 
55 	/**
56 		read the parameter from the file and incoporate into split system
57 		@param params program parameters
58 	*/
59 	void readParams(Params &params);
60 
61 	/**
62 		read the initial set of taxa to be included into PD-tree
63 		@param params program parameters
64 	*/
65 	void readInitialSet(Params &params);
66 
67 
68 	/**
69 		read the initial areas to be included into PD set
70 		@param params program parameters
71 	*/
72 	void readInitialAreas(Params &params);
73 
74 	/**
75 		increase the weight of the split associated with initial set
76 	*/
77 	void proceedInitialSet();
78 
79 	/**
80 		initialize when PD min specified
81 	*/
82 	void initPDMin();
83 
84 
85 	/**
86 		compute the minimum required costs to conserve a taxa set
87 		@param taxset set of taxa
88 		@return budget required
89 	*/
90 	int calcCost(IntVector &taxset);
91 
92 	/**
93 		compute the minimum required costs to conserve a taxa set
94 		@param taxset set of taxa
95 		@return budget required
96 	*/
97 	int calcCost(Split &taxset);
98 
99 	void printOutputSetScore(Params &params, vector<SplitSet> &pd_set);
100 
101 	/**
102 		compute the PD score of a given taxa set in filename
103 		@param params program parameters
104 		@param taxa_set (OUT) corresponding set of taxa
105 		@param pd_more (OUT) more computed PD measures will be stored here
106 	*/
107 	void computePD(Params &params, SplitSet &taxa_set, PDRelatedMeasures &pd_more);
108 
109 	/**
110 		this will be called by findPD at the beginning
111 		@param params program parameters
112 	*/
113 	virtual void enterFindPD(Params &params);
114 
115 	/**
116 		main function to search for maximal phylogenetic diversity
117 		@param params program parameters
118 		@param taxa_set (OUT) the vector of set of taxa in the maximal PD set
119 		@param taxa_order (OUT) order of inserted taxa
120 	*/
121 	virtual void findPD(Params &params, vector<SplitSet> &taxa_set, vector<int> &taxa_order);
122 
123 
124 	/**
125 		this function will be called by findPD at the end
126 		@param taxa_set (IN/OUT) the vector of set of taxa in the maximal PD set
127 	*/
128 	virtual void leaveFindPD(vector<SplitSet> &taxa_set);
129 
130 	/**
131 		calculate the PD gain matrix in terms of delta_k^j = pd(PD_k \/ {j}) - pd_k
132 		@param pd_set set of optimal PD sets
133 		@param delta (OUT) PD gain matrix
134 	*/
135 	void calcPDGain(vector<SplitSet> &pd_set, mmatrix(double) &delta);
136 
137 	/**
138 		compute the PD score of a given taxa set with name in taxa_name, result is written to id_set.weight.
139 		The difference from calcWeight() is that calcPD takes initialset into account
140 		@param id_set (IN/OUT) corresponding set of taxa
141 
142 	*/
143 	void calcPD(Split &id_set);
144 
145 	/**
146 		compute PD of a set of areas. It implicitly takes area_taxa map into account.
147 		@param area_id_set IDs of areas in the set
148 	*/
149 	void calcPDArea(Split &area_id_set);
150 
151 	/**
152 		compute the EXCLUSIVE PD score of a given taxa set with name in taxa_name, result is written to id_set.weight
153 		@param id_set (IN/OUT) corresponding set of taxa IDs
154 	*/
155 	void calcExclusivePD(Split &id_set);
156 
157 	/**
158 		compute the area's PD ENDEMISM of set of area
159 		@param area_set set of area
160 		@param pd_endem (OUT) corresponding PD endemism
161 	*/
162 	void calcPDEndemism(SplitSet &area_set, DoubleVector &pd_endem);
163 
164 	/**
165 		compute the area's PD complementarity given a specific area
166 		@param area_set set of area
167 		@param area_names given area names as string separated by commas
168 		@param all_names all area names
169 		@param pd_comp (OUT) corresponding PD endemism
170 	*/
171 	void calcPDComplementarity(SplitSet &area_set, char *area_names,
172 		vector<string> &all_names, DoubleVector &pd_comp);
173 
174 
175 	/**
176 		transform the problem into an Integer Linear Programming and write to .lp file
177 		@param params program parameters
178 		@param outfile name of output file in LP format
179 		@param total_size k for PD_k or total budget
180 		@param make_bin TRUE if creating binary programming
181 	*/
182 	void transformLP(Params &params, const char *outfile, int total_size, bool make_bin);
183 	void transformLP2(Params &params, const char *outfile, int total_size, bool make_bin);
184 	void transformEcoLP(Params &params, const char *outline, int total_size);
185 
186 	/**
187 		transform the problem into an Integer Linear Programming and write to .lp file
188 		@param params program parameters
189 		@param outfile name of output file in LP format
190 		@param total_size k for PD_k or total budget
191 		@param make_bin TRUE if creating binary programming
192 	*/
193 	void transformLP_Area(Params &params, const char *outfile, int total_size, bool make_bin);
194 	void transformLP_Area2(Params &params, const char *outfile, int total_size, bool make_bin);
195 
196 	/**
197 		transform the problem into an Integer Linear Programming and write to .lp file
198 		@param params program parameters
199 		@param outfile name of output file in LP format
200 		@param pd_proportion minimum PD proprotion to be conserved
201 		@param make_bin TRUE if creating binary programming
202 	*/
203 	void transformMinK_Area(Params &params, const char *outfile, double pd_proprotion, bool make_bin);
204 	void transformMinK_Area2(Params &params, const char *outfile, double pd_proportion, bool make_bin);
205 
206 	/**
207 		transform the PD problem into linear programming and solve it
208 		@param params program parameters
209 		@param taxa_set (OUT) the vector of set of taxa in the maximal PD set
210 	*/
211 	void findPD_LP(Params &params, vector<SplitSet> &taxa_set);
212 
213 	/**
214 		transform the PD problem into linear programming and solve it
215 		@param params program parameters
216 		@param areas_set (OUT) the vector of set of areas in the maximal PD set
217 	*/
218 	void findPDArea_LP(Params &params, vector<SplitSet> &areas_set);
219 
220 	double findMinKArea_LP(Params &params, const char* filename, double pd_proportion, Split &area);
221 
222 	/**
223 		@return TRUE if we are doing PD area optimization
224 	*/
225 	virtual bool isPDArea();
226 
227 	/**
228 		check if all taxa are covered by the set of areas
229 		@return false if there exists some taxon which is not covered by any areas
230 	*/
231 	bool checkAreaCoverage();
232 
233 	/**
234 		transform the problem into an Integer Linear Programming and write to .lp file
235 		@param outfile name of output file in LP format
236 		@param included_area (OUT) collection of areas that should always be included
237 	*/
238 	void transformLP_Area_Coverage(const char *outfile, Params &params, Split &included_area);
239 
240 
241 	/**
242 		@return the minimum number of areas needed to cover all taxa
243 		@param params program parameters
244 		@param area_id (OUT) minimal set of areas which cover all taxa
245 	*/
246 	int findMinAreas(Params &params, Split &area_id);
247 
248 
249 
250 	/**
251 		the set of areas, each item contains the set of taxa in the area.
252 	*/
253 	SplitSet area_taxa;
254 
255 	/**
256 	 	speciesList is used in ECOpd analysis for synchronization of species in SplitNetwork with species in FoodWeb
257 	 */
258 	void speciesList(vector<string> *speciesNames);
259 
260 protected:
261 
262 	/**
263 		extra PD when integrating initial set
264 	*/
265 	double extra_pd;
266 
267 	/**
268 		when computing PD min (instead of PD max)
269 	*/
270 	bool min_pd;
271 
272 	/**
273 		taxa set to be included into optimal PD set (with -i option)
274 	*/
275 	IntVector initialset;
276 
277 
278 	/**
279 		areas to be included into optimal PD set (with -ia option)
280 	*/
281 	IntVector initialareas;
282 
283 	/**
284 		calculate the total maximum budget required
285 		@return total maximum budget required
286 	*/
287 	int calcMaxBudget();
288 
289 /********************************************************
290 	hill-climbing and greedy heuristics
291 ********************************************************/
292 
293 	/**
294 		greedy algorithm for phylogenetic diversity of a given size
295 		@param subsize the subset size
296 		@param taxa_set (OUT) the set of taxa in the PD-set
297 		@param taxa_order (OUT) order of inserted taxa
298 		@return the PD score of the maximal set, also returned in taxa_set.weight
299 	*/
300 	double greedyPD(int subsize, Split &taxa_set, vector<int> &taxa_order);
301 
302 
303 	/**
304 		local search algorithm for phylogenetic diversity of a given size
305 		@param subsize the subset size
306 		@param taxa_set (OUT) the set of taxa in the PD-set
307 		@param taxa_order (IN) order of inserted taxa
308 		@return the PD score of the maximal set, also returned in taxa_set.weight
309 	*/
310 	double localSearchPD(int subsize, Split &taxa_set, vector<int> &taxa_order);
311 
312 /********************************************************
313 	exhaustive search
314 ********************************************************/
315 
316 	/**
317 		exhaustive search version 2 for maximal phylogenetic diversity of a given size
318 		@param subsize the subset size
319 		@param cur_tax current taxon
320 		@param curset current set
321 		@param find_all TRUE if wanting all max PD set
322 		@param best_set (OUT) the set of taxa in the maximal PD set
323 		@param taxa_order (OUT) order of inserted taxa
324 		@param rem_splits (IN) remaining splits
325 		@param rem_it (IN) begin of remaining iterator
326 		@return the PD score of the maximal set
327 	*/
328 	double exhaustPD2(int subsize, int cur_tax, Split &curset,
329 		bool find_all,SplitSet &best_set, vector<int> &taxa_order,
330 		IntList &rem_splits, IntList::iterator &rem_it);
331 
332 	/**
333 		exhaustive search for maximal PD with cost-constrained
334 		@param cur_budget  current budget
335 		@param cur_tax current taxon
336 		@param curset current set
337 		@param find_all TRUE if wanting all max PD set
338 		@param best_set (OUT) the set of taxa in the maximal PD set
339 		@param taxa_order (OUT) order of inserted taxa
340 		@param rem_splits (IN) remaining splits
341 		@param rem_it (IN) begin of remaining iterator
342 		@return the PD score of the maximal set
343 	*/
344 	double exhaustPDBudget(int cur_budget, int cur_tax, Split &curset,
345 		bool find_all,SplitSet &best_set, vector<int> &taxa_order,
346 		IntList &rem_splits, IntList::iterator &rem_it);
347 
348 	/**
349 		calculate sum of weights of preserved splits in the taxa_set
350 		@param taxa_set a set of taxa
351 		@param rem_splits remaining splits
352 		@param rem_it begin iterator of remaining splits
353 	*/
354 	double calcRaisedWeight(Split &taxa_set, IntList &rem_splits, IntList::iterator & rem_it);
355 
356 	/**
357 		update the best taxa set during the search
358 		@param curset the current taxa set
359 		@param best_set the list of best taxa set so far
360 	*/
361 	void updateSplitVector(Split &curset, SplitSet &best_set);
362 
363 /********************************************************
364 	linear programming support
365 ********************************************************/
366 
367 	/**
368 		y variables in the LP formulation, check if it can be dropped or equals some x variable.
369 		@param total_size k for PD_k or total budget
370 		@param y_value (OUT): vector of: -1 if cannot reduce, 1 if equals 1, or id+2 where id is the trivial split id
371 	*/
372 	void checkYValue(int total_size, vector<int> &y_value);
373 
374 	/**
375 		y variables in the LP formulation for PD area optimization, check if it can be dropped or equals some x variable.
376 		@param total_size k for PD_k or total budget
377 		@param y_value (OUT) vector of: -1 if cannot reduce, 1 if can be dropped, or id+2 where id is the trivial area id
378 		@param count1 (OUT) count of x variables in the inequality 1
379 		@param count2 (OUT) count of x variables in the inequality 2
380 	*/
381 	void checkYValue_Area(int total_size, vector<int> &y_value, vector<int> &count1, vector<int> &count2);
382 
383 	/**
384 		check if a taxon is uniquely covered by one area
385 		@param taxon the taxon ID
386 		@param area (OUT) area the area ID that covers taxon
387 		@return TRUE if the 'taxon' is uniquely covered by only one area. Otherwise FALSE.
388 	*/
389 	bool isUniquelyCovered(int taxon, int &area);
390 
391 	void lpObjectiveMaxSD(ostream &out, Params &params, IntVector &y_value, int total_size);
392 
393 	void lpObjectiveMinK(ostream &out, Params &params);
394 
395 	void lpSplitConstraint_RS(ostream &out, Params &params, IntVector &y_value, IntVector &count1, IntVector &count2, int total_size);
396 	void lpSplitConstraint_TS(ostream &out, Params &params, IntVector &y_value, int total_size);
397 
398 	void lpK_BudgetConstraint(ostream &out, Params &params, int total_size);
399 
400 	void lpMinSDConstraint(ostream &out, Params &params, IntVector &y_value, double pd_proportion);
401 
402 	void lpVariableBound(ostream &out, Params &params, Split &included_vars, IntVector &y_value);
403 
404 	void lpBoundaryConstraint(ostream &out, Params &params);
405 
406 	void lpVariableBinary(ostream &out, Params &params, Split &included_vars);
407 
408 	void lpVariableBinary(const char *outfile, Params &params, Split &included_vars);
409 	void lpVariableBinary(const char *outfile, Params &params, IntVector &initialset);
410 	void lpInitialArea(ostream &out, Params &params);
411 
412 	void computeFeasibleBudget(Params &params, IntVector &list_k);
413 
414 };
415 
416 #endif
417