1 #include <string.h>
2 #include <stdlib.h>
3 #include "lib/mlrutil.h"
4 #include "containers/percentile_keeper.h"
5 #include "lib/mvfuncs.h"
6 
7 #define INITIAL_CAPACITY 10000
8 #define GROWTH_FACTOR    2.0
9 
10 // ----------------------------------------------------------------
percentile_keeper_alloc()11 percentile_keeper_t* percentile_keeper_alloc() {
12 	unsigned long long capacity = INITIAL_CAPACITY;
13 	percentile_keeper_t* ppercentile_keeper = mlr_malloc_or_die(sizeof(percentile_keeper_t));
14 	ppercentile_keeper->data     = mlr_malloc_or_die(capacity*sizeof(mv_t));
15 	ppercentile_keeper->size     = 0LL;
16 	ppercentile_keeper->capacity = capacity;
17 	ppercentile_keeper->sorted   = FALSE;
18 	return ppercentile_keeper;
19 }
20 
21 // ----------------------------------------------------------------
percentile_keeper_free(percentile_keeper_t * ppercentile_keeper)22 void percentile_keeper_free(percentile_keeper_t* ppercentile_keeper) {
23 	if (ppercentile_keeper == NULL)
24 		return;
25 	for (unsigned long long i = 0; i < ppercentile_keeper->size; i++) {
26 		mv_free(&ppercentile_keeper->data[i]);
27 	}
28 	free(ppercentile_keeper->data);
29 	ppercentile_keeper->data = NULL;
30 	ppercentile_keeper->size = 0LL;
31 	ppercentile_keeper->capacity = 0LL;
32 	free(ppercentile_keeper);
33 }
34 
35 // ----------------------------------------------------------------
percentile_keeper_ingest(percentile_keeper_t * ppercentile_keeper,mv_t value)36 void percentile_keeper_ingest(percentile_keeper_t* ppercentile_keeper, mv_t value) {
37 	if (ppercentile_keeper->size >= ppercentile_keeper->capacity) {
38 		ppercentile_keeper->capacity = (unsigned long long)(ppercentile_keeper->capacity * GROWTH_FACTOR);
39 		ppercentile_keeper->data = (mv_t*)mlr_realloc_or_die(ppercentile_keeper->data,
40 			ppercentile_keeper->capacity*sizeof(mv_t));
41 	}
42 	ppercentile_keeper->data[ppercentile_keeper->size++] = value;
43 	ppercentile_keeper->sorted = FALSE;
44 }
45 
46 // ================================================================
47 // Non-interpolated percentiles (see also https://en.wikipedia.org/wiki/Percentile)
48 
49 // ----------------------------------------------------------------
50 // OPTION 1: int index = p*n/100.0;
51 //
52 // x
53 // 0
54 // 20
55 // 40
56 // 60
57 // 80
58 // 100
59 //
60 // x_p00 0 x_p10  0 x_p20 20 x_p30 20 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80  80 x_p90 100 x_p100 100
61 // x_p01 0 x_p11  0 x_p21 20 x_p31 20 x_p41 40 x_p51 60 x_p61 60 x_p71 80 x_p81  80 x_p91 100
62 // x_p02 0 x_p12  0 x_p22 20 x_p32 20 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82  80 x_p92 100
63 // x_p03 0 x_p13  0 x_p23 20 x_p33 20 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83  80 x_p93 100
64 // x_p04 0 x_p14  0 x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 100 x_p94 100
65 // x_p05 0 x_p15  0 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75 80 x_p85 100 x_p95 100
66 // x_p06 0 x_p16  0 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 100 x_p96 100
67 // x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100
68 // x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100
69 // x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100
70 //
71 // x
72 // 0
73 // 25
74 // 50
75 // 75
76 // 100
77 //
78 // x_p00 0 x_p10 0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 75 x_p70 75 x_p80 100 x_p90 100 x_p100 100
79 // x_p01 0 x_p11 0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 75 x_p71 75 x_p81 100 x_p91 100
80 // x_p02 0 x_p12 0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 75 x_p72 75 x_p82 100 x_p92 100
81 // x_p03 0 x_p13 0 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83 100 x_p93 100
82 // x_p04 0 x_p14 0 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84 100 x_p94 100
83 // x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85 100 x_p95 100
84 // x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86 100 x_p96 100
85 // x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87 100 x_p97 100
86 // x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100
87 // x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100
88 //
89 // ----------------------------------------------------------------
90 // OPTION 2: int index = p*(n-1)/100.0;
91 //
92 // x
93 // 0
94 // 20
95 // 40
96 // 60
97 // 80
98 // 100
99 //
100 // x_p00 0 x_p10 0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90 80 x_p100 100
101 // x_p01 0 x_p11 0 x_p21 20 x_p31 20 x_p41 40 x_p51 40 x_p61 60 x_p71 60 x_p81 80 x_p91 80
102 // x_p02 0 x_p12 0 x_p22 20 x_p32 20 x_p42 40 x_p52 40 x_p62 60 x_p72 60 x_p82 80 x_p92 80
103 // x_p03 0 x_p13 0 x_p23 20 x_p33 20 x_p43 40 x_p53 40 x_p63 60 x_p73 60 x_p83 80 x_p93 80
104 // x_p04 0 x_p14 0 x_p24 20 x_p34 20 x_p44 40 x_p54 40 x_p64 60 x_p74 60 x_p84 80 x_p94 80
105 // x_p05 0 x_p15 0 x_p25 20 x_p35 20 x_p45 40 x_p55 40 x_p65 60 x_p75 60 x_p85 80 x_p95 80
106 // x_p06 0 x_p16 0 x_p26 20 x_p36 20 x_p46 40 x_p56 40 x_p66 60 x_p76 60 x_p86 80 x_p96 80
107 // x_p07 0 x_p17 0 x_p27 20 x_p37 20 x_p47 40 x_p57 40 x_p67 60 x_p77 60 x_p87 80 x_p97 80
108 // x_p08 0 x_p18 0 x_p28 20 x_p38 20 x_p48 40 x_p58 40 x_p68 60 x_p78 60 x_p88 80 x_p98 80
109 // x_p09 0 x_p19 0 x_p29 20 x_p39 20 x_p49 40 x_p59 40 x_p69 60 x_p79 60 x_p89 80 x_p99 80
110 //
111 // x
112 // 0
113 // 25
114 // 50
115 // 75
116 // 100
117 //
118 // x_p00 0 x_p10 0 x_p20  0 x_p30 25 x_p40 25 x_p50 50 x_p60 50 x_p70 50 x_p80 75 x_p90 75 x_p100 100
119 // x_p01 0 x_p11 0 x_p21  0 x_p31 25 x_p41 25 x_p51 50 x_p61 50 x_p71 50 x_p81 75 x_p91 75
120 // x_p02 0 x_p12 0 x_p22  0 x_p32 25 x_p42 25 x_p52 50 x_p62 50 x_p72 50 x_p82 75 x_p92 75
121 // x_p03 0 x_p13 0 x_p23  0 x_p33 25 x_p43 25 x_p53 50 x_p63 50 x_p73 50 x_p83 75 x_p93 75
122 // x_p04 0 x_p14 0 x_p24  0 x_p34 25 x_p44 25 x_p54 50 x_p64 50 x_p74 50 x_p84 75 x_p94 75
123 // x_p05 0 x_p15 0 x_p25 25 x_p35 25 x_p45 25 x_p55 50 x_p65 50 x_p75 75 x_p85 75 x_p95 75
124 // x_p06 0 x_p16 0 x_p26 25 x_p36 25 x_p46 25 x_p56 50 x_p66 50 x_p76 75 x_p86 75 x_p96 75
125 // x_p07 0 x_p17 0 x_p27 25 x_p37 25 x_p47 25 x_p57 50 x_p67 50 x_p77 75 x_p87 75 x_p97 75
126 // x_p08 0 x_p18 0 x_p28 25 x_p38 25 x_p48 25 x_p58 50 x_p68 50 x_p78 75 x_p88 75 x_p98 75
127 // x_p09 0 x_p19 0 x_p29 25 x_p39 25 x_p49 25 x_p59 50 x_p69 50 x_p79 75 x_p89 75 x_p99 75
128 //
129 // ----------------------------------------------------------------
130 // OPTION 3: int index = (int)ceil(p*(n-1)/100.0);
131 //
132 // x
133 // 0
134 // 20
135 // 40
136 // 60
137 // 80
138 // 100
139 //
140 // x_p00  0 x_p10 20 x_p20 20 x_p30 40 x_p40 40 x_p50 60 x_p60 60 x_p70 80 x_p80  80 x_p90 100 x_p100 100
141 // x_p01 20 x_p11 20 x_p21 40 x_p31 40 x_p41 60 x_p51 60 x_p61 80 x_p71 80 x_p81 100 x_p91 100
142 // x_p02 20 x_p12 20 x_p22 40 x_p32 40 x_p42 60 x_p52 60 x_p62 80 x_p72 80 x_p82 100 x_p92 100
143 // x_p03 20 x_p13 20 x_p23 40 x_p33 40 x_p43 60 x_p53 60 x_p63 80 x_p73 80 x_p83 100 x_p93 100
144 // x_p04 20 x_p14 20 x_p24 40 x_p34 40 x_p44 60 x_p54 60 x_p64 80 x_p74 80 x_p84 100 x_p94 100
145 // x_p05 20 x_p15 20 x_p25 40 x_p35 40 x_p45 60 x_p55 60 x_p65 80 x_p75 80 x_p85 100 x_p95 100
146 // x_p06 20 x_p16 20 x_p26 40 x_p36 40 x_p46 60 x_p56 60 x_p66 80 x_p76 80 x_p86 100 x_p96 100
147 // x_p07 20 x_p17 20 x_p27 40 x_p37 40 x_p47 60 x_p57 60 x_p67 80 x_p77 80 x_p87 100 x_p97 100
148 // x_p08 20 x_p18 20 x_p28 40 x_p38 40 x_p48 60 x_p58 60 x_p68 80 x_p78 80 x_p88 100 x_p98 100
149 // x_p09 20 x_p19 20 x_p29 40 x_p39 40 x_p49 60 x_p59 60 x_p69 80 x_p79 80 x_p89 100 x_p99 100
150 //
151 // x
152 // 0
153 // 25
154 // 50
155 // 75
156 // 100
157 //
158 // x_p00  0 x_p10 25 x_p20 25 x_p30 50 x_p40 50 x_p50 50 x_p60 75 x_p70  75 x_p80 100 x_p90 100 x_p100 100
159 // x_p01 25 x_p11 25 x_p21 25 x_p31 50 x_p41 50 x_p51 75 x_p61 75 x_p71  75 x_p81 100 x_p91 100
160 // x_p02 25 x_p12 25 x_p22 25 x_p32 50 x_p42 50 x_p52 75 x_p62 75 x_p72  75 x_p82 100 x_p92 100
161 // x_p03 25 x_p13 25 x_p23 25 x_p33 50 x_p43 50 x_p53 75 x_p63 75 x_p73  75 x_p83 100 x_p93 100
162 // x_p04 25 x_p14 25 x_p24 25 x_p34 50 x_p44 50 x_p54 75 x_p64 75 x_p74  75 x_p84 100 x_p94 100
163 // x_p05 25 x_p15 25 x_p25 25 x_p35 50 x_p45 50 x_p55 75 x_p65 75 x_p75  75 x_p85 100 x_p95 100
164 // x_p06 25 x_p16 25 x_p26 50 x_p36 50 x_p46 50 x_p56 75 x_p66 75 x_p76 100 x_p86 100 x_p96 100
165 // x_p07 25 x_p17 25 x_p27 50 x_p37 50 x_p47 50 x_p57 75 x_p67 75 x_p77 100 x_p87 100 x_p97 100
166 // x_p08 25 x_p18 25 x_p28 50 x_p38 50 x_p48 50 x_p58 75 x_p68 75 x_p78 100 x_p88 100 x_p98 100
167 // x_p09 25 x_p19 25 x_p29 50 x_p39 50 x_p49 50 x_p59 75 x_p69 75 x_p79 100 x_p89 100 x_p99 100
168 //
169 // ----------------------------------------------------------------
170 // OPTION 4: int index = (int)ceil(-0.5 + p*(n-1)/100.0);
171 //
172 // x
173 // 0
174 // 20
175 // 40
176 // 60
177 // 80
178 // 100
179 //
180 // x_p00 0 x_p10  0 x_p20 20 x_p30 20 x_p40 40 x_p50 40 x_p60 60 x_p70 60 x_p80 80 x_p90  80 x_p100 100
181 // x_p01 0 x_p11 20 x_p21 20 x_p31 40 x_p41 40 x_p51 60 x_p61 60 x_p71 80 x_p81 80 x_p91 100
182 // x_p02 0 x_p12 20 x_p22 20 x_p32 40 x_p42 40 x_p52 60 x_p62 60 x_p72 80 x_p82 80 x_p92 100
183 // x_p03 0 x_p13 20 x_p23 20 x_p33 40 x_p43 40 x_p53 60 x_p63 60 x_p73 80 x_p83 80 x_p93 100
184 // x_p04 0 x_p14 20 x_p24 20 x_p34 40 x_p44 40 x_p54 60 x_p64 60 x_p74 80 x_p84 80 x_p94 100
185 // x_p05 0 x_p15 20 x_p25 20 x_p35 40 x_p45 40 x_p55 60 x_p65 60 x_p75 80 x_p85 80 x_p95 100
186 // x_p06 0 x_p16 20 x_p26 20 x_p36 40 x_p46 40 x_p56 60 x_p66 60 x_p76 80 x_p86 80 x_p96 100
187 // x_p07 0 x_p17 20 x_p27 20 x_p37 40 x_p47 40 x_p57 60 x_p67 60 x_p77 80 x_p87 80 x_p97 100
188 // x_p08 0 x_p18 20 x_p28 20 x_p38 40 x_p48 40 x_p58 60 x_p68 60 x_p78 80 x_p88 80 x_p98 100
189 // x_p09 0 x_p19 20 x_p29 20 x_p39 40 x_p49 40 x_p59 60 x_p69 60 x_p79 80 x_p89 80 x_p99 100
190 //
191 // x
192 // 0
193 // 25
194 // 50
195 // 75
196 // 100
197 //
198 // x_p00 0 x_p10  0 x_p20 25 x_p30 25 x_p40 50 x_p50 50 x_p60 50 x_p70 75 x_p80  75 x_p90 100 x_p100 100
199 // x_p01 0 x_p11  0 x_p21 25 x_p31 25 x_p41 50 x_p51 50 x_p61 50 x_p71 75 x_p81  75 x_p91 100
200 // x_p02 0 x_p12  0 x_p22 25 x_p32 25 x_p42 50 x_p52 50 x_p62 50 x_p72 75 x_p82  75 x_p92 100
201 // x_p03 0 x_p13 25 x_p23 25 x_p33 25 x_p43 50 x_p53 50 x_p63 75 x_p73 75 x_p83  75 x_p93 100
202 // x_p04 0 x_p14 25 x_p24 25 x_p34 25 x_p44 50 x_p54 50 x_p64 75 x_p74 75 x_p84  75 x_p94 100
203 // x_p05 0 x_p15 25 x_p25 25 x_p35 25 x_p45 50 x_p55 50 x_p65 75 x_p75 75 x_p85  75 x_p95 100
204 // x_p06 0 x_p16 25 x_p26 25 x_p36 25 x_p46 50 x_p56 50 x_p66 75 x_p76 75 x_p86  75 x_p96 100
205 // x_p07 0 x_p17 25 x_p27 25 x_p37 25 x_p47 50 x_p57 50 x_p67 75 x_p77 75 x_p87  75 x_p97 100
206 // x_p08 0 x_p18 25 x_p28 25 x_p38 50 x_p48 50 x_p58 50 x_p68 75 x_p78 75 x_p88 100 x_p98 100
207 // x_p09 0 x_p19 25 x_p29 25 x_p39 50 x_p49 50 x_p59 50 x_p69 75 x_p79 75 x_p89 100 x_p99 100
208 //
209 // ----------------------------------------------------------------
210 // CONCLUSION:
211 // * I like option 2 for its simplicity ...
212 // * ... but option 1 matches R's quantile with type=1.
213 // * (Note that Miller's interpolated percentiles match match R's quantile with type=7)
214 // ----------------------------------------------------------------
215 
compute_index_non_interpolated(unsigned long long n,double p)216 static unsigned long long compute_index_non_interpolated(unsigned long long n, double p) {
217 	long long index = p*n/100.0;
218 	//unsigned long long index = p*(n-1)/100.0;
219 	//unsigned long long index = (unsigned long long)ceil(p*(n-1)/100.0);
220 	//unsigned long long index = (unsigned long long)ceil(-0.5 + p*(n-1)/100.0);
221 	if (index >= n)
222 		index = n-1;
223 	if (index < 0)
224 		index = 0;
225 	return (unsigned long long)index;
226 }
227 
get_percentile_linearly_interpolated(mv_t * array,unsigned long long n,double p)228 static mv_t get_percentile_linearly_interpolated(mv_t* array, unsigned long long n, double p) {
229 	double findex = (p/100.0)*(n-1);
230 	if (findex < 0)
231 		findex = 0;
232 	unsigned long long iindex = (unsigned long long)floor(findex);
233 	if (iindex >= n-1) {
234 		return array[iindex];
235 	} else {
236 		// array[iindex] + frac * (array[iindex+1] - array[iindex]);
237 		mv_t frac = mv_from_float(findex - iindex);
238 		mv_t* pa = &array[iindex];
239 		mv_t* pb = &array[iindex+1];
240 		mv_t diff = x_xx_minus_func(pb, pa);
241 		mv_t prod = x_xx_times_func(&frac, &diff);
242 		mv_t rv = x_xx_plus_func(pa, &prod);
243 		return rv;
244 	}
245 }
246 
247 // ----------------------------------------------------------------
percentile_keeper_emit_non_interpolated(percentile_keeper_t * ppercentile_keeper,double percentile)248 mv_t percentile_keeper_emit_non_interpolated(percentile_keeper_t* ppercentile_keeper, double percentile) {
249 	if (ppercentile_keeper->size == 0) {
250 		return mv_absent();
251 	}
252 	if (!ppercentile_keeper->sorted) {
253 		qsort(ppercentile_keeper->data, ppercentile_keeper->size, sizeof(mv_t), mv_xx_comparator);
254 		ppercentile_keeper->sorted = TRUE;
255 	}
256 	return ppercentile_keeper->data[compute_index_non_interpolated(ppercentile_keeper->size, percentile)];
257 }
258 
percentile_keeper_emit_linearly_interpolated(percentile_keeper_t * ppercentile_keeper,double percentile)259 mv_t percentile_keeper_emit_linearly_interpolated(percentile_keeper_t* ppercentile_keeper, double percentile) {
260 	if (ppercentile_keeper->size == 0) {
261 		return mv_absent();
262 	}
263 	if (!ppercentile_keeper->sorted) {
264 		qsort(ppercentile_keeper->data, ppercentile_keeper->size, sizeof(mv_t), mv_xx_comparator);
265 		ppercentile_keeper->sorted = TRUE;
266 	}
267 	return get_percentile_linearly_interpolated(ppercentile_keeper->data, ppercentile_keeper->size, percentile);
268 }
269 
270 // ----------------------------------------------------------------
percentile_keeper_print(percentile_keeper_t * ppercentile_keeper)271 void percentile_keeper_print(percentile_keeper_t* ppercentile_keeper) {
272 	printf("percentile_keeper dump:\n");
273 	for (unsigned long long i = 0; i < ppercentile_keeper->size; i++) {
274 		mv_t* pa = &ppercentile_keeper->data[i];
275 		if (pa->type == MT_FLOAT)
276 			printf("[%02llu] %.8lf\n", i, ppercentile_keeper->data[i].u.fltv);
277 		else
278 			printf("[%02llu] %8lld\n", i, ppercentile_keeper->data[i].u.intv);
279 	}
280 }
281