1 //---------------------------------------------------------------------------------
2 //  GBM alteration by Stefan Schroedl (schroedl@a9.com)
3 //
4 //  File:       pairwise
5 //
6 //  Contains:   Distribution object to implement pairwise distributions for ranking
7 //
8 //  History:    12/15/2011   Created
9 //
10 //---------------------------------------------------------------------------------
11 
12 //  This file implements the LambdaMart algorithm for learning ranking functions.
13 //  The main idea is to model p_ij, the probability that item i should rank higher
14 //  than j, as
15 //       p_ij = 1 / (1 + exp(s_i - s_j)),
16 //  where s_i, s_j are the model scores for the two items.
17 //
18 //  While scores are still generated one item at a time, gradients for learning
19 //  depend on _pairs_ of items. The algorithm is aware of _groups_; all pairs of items
20 //  with different labels, belonging to the same group, are used for training. A
21 //  typical application is ranking for web search: groups correspond to user queries,
22 //  and items to (feature vectors of) web pages in the associated match set.
23 //
24 //  Different IR measures can be chosen, to weight instances based on their rank.
25 //  Generally, changes in top ranks should have more influence than changes at the
26 //  bottom of the result list. This function provides the following options:
27 //
28 //  * CONC (concordance index, fraction of correctly raked pairs. This is a generalization
29 //    of Area under the ROC Curve (AUC) from binary to multivalued labels.
30 //  * Normalized Discounted Cumulative Gain (NDCG)
31 //  * Mean Reciprocal Rank (MRR) of the highest-ranked positive instance.
32 //  * Mean Average Precision (MAP), a generalization of MRR to multiple positive instances.
33 //
34 //  While MRR and MAP expect binary target labels, CONC and NDCG can equally work with
35 //  continuous values. More precisely, NDCG is defined as
36 //     \Sum_{r=1..n} val_r / log2(r+1),
37 //  where val_r is the user-specified target for the item at rank r. Note that this is
38 //  contrast to some definitions of NDCG that assume integer targets s_i, and
39 //  implicitly transform val_r = 2^{s+i}-1.
40 //
41 //  Groups are specified using an integer vector of the same length as the training instances.
42 //
43 //  Optionally, item weights can be supplied; it is assumed that all instances belonging
44 //  to the same group have the same weight.
45 //
46 //  For background information on LambdaMart, please see e.g. the following papers:
47 //
48 //  * Burges, C., "From RankNet to LambdaRank to LambdaMART: An Overview", Microsoft
49 //    Research Technical Report MSR-TR-2010-82, 2010
50 //  * Donmez, P., K. Svore, K.,  and Burges, C., "On the Local Optimality of
51 //    LambdaRank", SIGIR 2009
52 //  * Burges, C., Ragno, R., and Le, Q., "Learning to Rank with Non-Smooth Cost
53 //    Functions", NIPS 2006
54 
55 #ifndef PAIRWISE_H
56 #define PAIRWISE_H
57 
58 #include "distribution.h"
59 #include "buildinfo.h"
60 
61 // A class to rerank groups based on (intermediate) scores
62 // Note: Smaller ranks are better, the top rank is 1
63 
64 class CRanker
65 {
66 public:
67     // Auxiliary structure to store score and rank
68     typedef std::pair<double, unsigned int> CDoubleUintPair;
69 
70     // Buffer memory allocation
71     void Init(unsigned int cMaxItemsPerGroup);
72 
73     // Initialize ranker with scores of items belonging to the same group
74     // - adScores is a score array, (at least) cNumItems long
75     bool SetGroupScores(const double* const adScores, unsigned int cNumItems);
76 
77     // Perform the ranking
78     // - Return true if any item changed its rank
79     bool Rank();
80 
81     // Getter / setter
GetNumItems()82     unsigned int GetNumItems() const               { return cNumItems; }
GetRank(int i)83     unsigned int GetRank(int i) const              { return vecdipScoreRank[i].second; }
GetItem(unsigned int iRank)84     unsigned int GetItem(unsigned int iRank) const { return (vecpdipScoreRank[iRank-1] - &(vecdipScoreRank[0])); }
SetRank(int i,unsigned int r)85     void SetRank(int i, unsigned int r)            { vecdipScoreRank[i].second = r; }
AddToScore(int i,double delta)86     void AddToScore(int i, double delta)           { vecdipScoreRank[i].first += delta; }
87 
88 protected:
89     // Number of items in current group
90     unsigned int cNumItems;
91 
92     // Pairs of (score, rank) for current group
93     vector<CDoubleUintPair> vecdipScoreRank;
94 
95     // Array of pointers to elements of vecdipScoreRank, used for sorting
96     // Note: We need a separate array for sorting in order to be able to
97     // quickly look up the rank for any given item.
98     vector<CDoubleUintPair*> vecpdipScoreRank;
99 };
100 
101 
102 // Abstract base class for all IR Measures
103 
104 class CIRMeasure
105 {
106 public:
107     // Constructor
CIRMeasure()108     CIRMeasure() : cRankCutoff(UINT_MAX) {}
109 
110     // Destructor
~CIRMeasure()111     virtual ~CIRMeasure() { }
112 
113     // Getter / Setter
GetCutoffRank()114     unsigned int GetCutoffRank() const { return cRankCutoff; }
SetCutoffRank(unsigned int cRankCutoff)115     void SetCutoffRank(unsigned int cRankCutoff) { this->cRankCutoff = cRankCutoff; }
116 
117     // Auxiliary function for sanity check
AnyPairs(const double * const adY,unsigned int cNumItems)118     bool AnyPairs(const double* const adY, unsigned int cNumItems) const
119     {
120         return (cNumItems >= 2                    // at least two instances
121                 && adY[0] > 0.0                   // at least one positive example (targets are non-increasing)
122                 && adY[cNumItems-1] != adY[0]);   // at least two different targets
123     }
124 
125     // Memory allocation
126     virtual void Init(unsigned long cMaxGroup, unsigned long cNumItems, unsigned int cRankCutoff = UINT_MAX) { this->cRankCutoff = cRankCutoff; }
127 
128      // Calculate the IR measure for the group of items set in the ranker.
129      // Precondition: CRanker::SetGroupScores() has been called
130      // - adY are the target scores
131     virtual double Measure(const double* const adY, const CRanker& ranker) = 0;
132 
133     // Calculate the maximum achievable IR measure for a given group.
134     // Side effect: the ranker state might change
135     // Default implementation for MRR and MAP: if any positive items exist,
136     // ranking them at the top yields a perfect measure of 1.
MaxMeasure(unsigned int iGroup,const double * const adY,unsigned int cNumItems)137     virtual double MaxMeasure(unsigned int iGroup, const double* const adY, unsigned int cNumItems)
138     {
139         return (AnyPairs(adY, cNumItems) ? 1.0 : 0.0);
140     }
141 
142     // Calculate the difference in the IR measure caused by swapping the ranks of two items.
143     // Assumptions:
144     // * iItemBetter has a higher label than iItemWorse (i.e., adY[iItemBetter] > adY[iItemWorse]).
145     // * ranker.setGroup() has been called.
146     virtual double SwapCost(int iItemBetter, int iItemWorse, const double* const adY, const CRanker& ranker) const = 0;
147 
148 protected:
149     // Cut-off rank below which items are ignored for measure
150     unsigned int cRankCutoff;
151 };
152 
153 // Class to implement IR Measure 'CONC' (fraction of concordant pairs). For the case of binary labels, this is
154 // equivalent to the area under the ROC curve (AUC).
155 
156 class CConc : public CIRMeasure
157 {
158 public:
~CConc()159     virtual ~CConc() { }
160 
161     void Init(unsigned long cMaxGroup, unsigned long cNumItems, unsigned int cRankCutoff = UINT_MAX);
162 
163     double Measure(const double* const adY, const CRanker& ranker);
164 
165     // The maximum number of correctly classified pairs is simply all pairs with different labels
MaxMeasure(unsigned int iGroup,const double * const adY,unsigned int cNumItems)166     double MaxMeasure(unsigned int iGroup, const double* const adY, unsigned int cNumItems)
167     {
168         return PairCount(iGroup, adY, cNumItems);
169     }
170 
171     // (Cached) calculation of the number of pairs with different labels
172     unsigned int PairCount(unsigned int iGroup, const double* const adY, unsigned int cNumItems);
173 
174     double SwapCost(int iItemBetter, int iItemWorse, const double* const adY, const CRanker& ranker) const;
175 
176 protected:
177     // Calculate the number of pairs with different labels
178     int  ComputePairCount(const double* const adY, unsigned int cNumItems);
179 
180     // Caches the number of pairs with different labels, for each group
181     vector<int> veccPairCount;
182 };
183 
184 // Class to implement IR Measure 'Normalized Discounted Cumulative Gain'
185 // Note: Labels can have any non-negative value
186 
187 class CNDCG : public CIRMeasure
188 {
189 public:
190 
191     void Init(unsigned long cMaxGroup, unsigned long cNumItems, unsigned int cRankCutoff = UINT_MAX);
192 
193     // Compute DCG
194     double Measure(const double* const adY, const CRanker& ranker);
195 
196     // Compute best possible DCG
197     double MaxMeasure(unsigned int iGroup, const double* const adY, unsigned int cNumItems);
198 
199     double SwapCost(int iItemBetter, int iItemWorse, const double* const adY, const CRanker& ranker) const;
200 
201 protected:
202      // Lookup table for rank weight (w(rank) = 1/log2(1+rank))
203     vector<double> vecdRankWeight;
204 
205     // Caches the maximum achievable DCG, for each group
206     vector<double> vecdMaxDCG;
207 };
208 
209 // Class to implement IR Measure 'Mean Reciprocal Rank'
210 // Assumption: Labels are 0 or 1
211 
212 class CMRR : public CIRMeasure
213 {
214 public:
215     double Measure(const double* const adY, const CRanker& ranker);
216 
217     double SwapCost(int iItemPos, int iItemNeg, const double* const adY, const CRanker& ranker) const;
218 
219 };
220 
221 
222 // Class to implement IR Measure 'Mean Average Precision'
223 // Assumption: Labels are 0 or 1
224 
225 class CMAP : public CIRMeasure
226 {
227 public:
228 
229     void Init(unsigned long cMaxGroup, unsigned long cNumItems, unsigned int cRankCutoff = UINT_MAX);
230 
231     double Measure(const double* const adY, const CRanker& ranker);
232 
233     double SwapCost(int iItemPos, int iItemNeg, const double* const adY, const CRanker& ranker) const;
234 protected:
235 
236     // Buffer to hold positions of positive examples
237     mutable vector<int> veccRankPos;
238 };
239 
240 
241 // Main class for 'pairwise' distribution
242 // Notes and Assumptions:
243 // * The items are sorted such that
244 //   * Instances belonging to the same group occur in
245 //     a contiguous range
246 //   * Within a group, labels are non-increasing.
247 // * adGroup supplies the group ID (positive integer, but double
248 //   format for compliance with the base class interface).
249 // * The targets adY are non-negative values, and binary {0,1}
250 //   for measures MRR and MAP.
251 // * Higher IR measures are better.
252 // * Only pairs with different labels are used for training.
253 // * Instance weights (adWeight) are constant among groups.
254 // * CPairwise::Initialize() is called before any of the other
255 //   functions, with same values for adY, adGroup, adWeight, and
256 //   nTrain. Certain values have to be precomputed for
257 //   efficiency.
258 
259 class CPairwise : public CDistribution
260 {
261 public:
262 
263     // Constructor: determine IR measure as either "conc", "map", "mrr", or "ndcg"
264     CPairwise(const char* szIRMeasure);
265 
266     virtual ~CPairwise();
267 
268     GBMRESULT Initialize(double *adY,
269                          double *adGroup,
270                          double *adOffset,
271                          double *adWeight,
272                          unsigned long cLength);
273 
UpdateParams(double * adF,double * adOffset,double * adWeight,unsigned long cLength)274     GBMRESULT UpdateParams(double *adF,
275                            double *adOffset,
276                            double *adWeight,
277                            unsigned long cLength)
278     {
279         return GBM_OK;
280     };
281 
282     GBMRESULT ComputeWorkingResponse(double *adY,
283                                      double *adGroup,
284                                      double *adOffset,
285                                      double *adF,
286                                      double *adZ,
287                                      double *adWeight,
288                                      bool *afInBag,
289                                      unsigned long nTrain,
290                                      int cIdxOff);
291 
292     double Deviance(double *adY,
293                     double *adGroup,
294                     double *adOffset,
295                     double *adWeight,
296                     double *adF,
297                     unsigned long cLength,
298                     int cIdxOff);
299 
300     GBMRESULT InitF(double *adY,
301                     double *adGroup,
302                     double *adOffset,
303                     double *adWeight,
304                     double &dInitF,
305                     unsigned long cLength);
306 
307     GBMRESULT FitBestConstant(double *adY,
308                               double *adGroup,
309                               double *adOffset,
310                               double *adW,
311                               double *adF,
312                               double *adZ,
313                               unsigned long *aiNodeAssign,
314                               unsigned long nTrain,
315                               VEC_P_NODETERMINAL vecpTermNodes,
316                               unsigned long cTermNodes,
317                               unsigned long cMinObsInNode,
318                               bool *afInBag,
319                               double *adFadj,
320                               int cIdxOff);
321 
322     double BagImprovement(double *adY,
323                           double *adGroup,
324                           double *adOffset,
325                           double *adWeight,
326                           double *adF,
327                           double *adFadj,
328                           bool *afInBag,
329                           double dStepSize,
330                           unsigned long nTrain);
331 
332 protected:
333 
334     // Calculate and accumulate up the gradients and Hessians from all training pairs
335     void ComputeLambdas(int iGroup, unsigned int cNumItems, const double* const adY, const double* const adF, const double* const adWeight, double* adZ, double* adDeriv);
336 
337     CIRMeasure* pirm;                 // The IR measure to use
338     CRanker ranker;                   // The ranker
339 
340     vector<double> vecdHessian;       // Second derivative of loss function, for each training instance; used for Newton step
341 
342     vector<double> vecdNum;           // Buffer used for numerator   in FitBestConstant(), for each node
343     vector<double> vecdDenom;         // Buffer used for denominator in FitBestConstant(), for each node
344 
345     vector<double> vecdFPlusOffset;   // Temporary buffer for (adF + adOffset), if the latter is not null
346 };
347 
348 #endif // PAIRWISE_H
349