1 /* stats-cmd.c -- implements the size stats sub-command.
2  *
3  * ====================================================================
4  *    Licensed to the Apache Software Foundation (ASF) under one
5  *    or more contributor license agreements.  See the NOTICE file
6  *    distributed with this work for additional information
7  *    regarding copyright ownership.  The ASF licenses this file
8  *    to you under the Apache License, Version 2.0 (the
9  *    "License"); you may not use this file except in compliance
10  *    with the License.  You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  *    Unless required by applicable law or agreed to in writing,
15  *    software distributed under the License is distributed on an
16  *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17  *    KIND, either express or implied.  See the License for the
18  *    specific language governing permissions and limitations
19  *    under the License.
20  * ====================================================================
21  */
22 
23 #include <assert.h>
24 
25 #include "svn_fs.h"
26 #include "svn_pools.h"
27 #include "svn_sorts.h"
28 
29 #include "private/svn_sorts_private.h"
30 #include "private/svn_string_private.h"
31 #include "private/svn_fs_fs_private.h"
32 
33 #include "svn_private_config.h"
34 #include "svnfsfs.h"
35 
36 /* Return the string, allocated in RESULT_POOL, describing the value 2**I.
37  */
38 static const char *
print_two_power(int i,apr_pool_t * result_pool)39 print_two_power(int i,
40                 apr_pool_t *result_pool)
41 {
42   /* These are the SI prefixes for base-1000, the binary ones with base-1024
43      are too clumsy and require appending B for "byte" to be intelligible,
44      e.g. "MiB".
45 
46      Therefore, we ignore the official standard and revert to the traditional
47      contextual use were the base-1000 prefixes are understood as base-1024
48      when it came to data sizes.
49    */
50   const char *si_prefixes = " kMGTPEZY";
51 
52   int number = (i >= 0) ? (1 << (i % 10)) : 0;
53   int thousands = (i >= 0) ? (i / 10) : 0;
54 
55   char si_prefix = (thousands < strlen(si_prefixes))
56                  ? si_prefixes[thousands]
57                  : '?';
58 
59   if (si_prefix == ' ')
60     return apr_psprintf(result_pool, "%d", number);
61 
62   return apr_psprintf(result_pool, "%d%c", number, si_prefix);
63 }
64 
65 /* Print statistics for the given group of representations to console.
66  * Use POOL for allocations.
67  */
68 static void
print_rep_stats(svn_fs_fs__representation_stats_t * stats,apr_pool_t * pool)69 print_rep_stats(svn_fs_fs__representation_stats_t *stats,
70                 apr_pool_t *pool)
71 {
72   printf(_("%20s bytes in %12s reps\n"
73            "%20s bytes in %12s shared reps\n"
74            "%20s bytes expanded size\n"
75            "%20s bytes expanded shared size\n"
76            "%20s bytes with rep-sharing off\n"
77            "%20s shared references\n"
78            "%20.3f average delta chain length\n"),
79          svn__ui64toa_sep(stats->total.packed_size, ',', pool),
80          svn__ui64toa_sep(stats->total.count, ',', pool),
81          svn__ui64toa_sep(stats->shared.packed_size, ',', pool),
82          svn__ui64toa_sep(stats->shared.count, ',', pool),
83          svn__ui64toa_sep(stats->total.expanded_size, ',', pool),
84          svn__ui64toa_sep(stats->shared.expanded_size, ',', pool),
85          svn__ui64toa_sep(stats->expanded_size, ',', pool),
86          svn__ui64toa_sep(stats->references - stats->total.count, ',', pool),
87          stats->chain_len / MAX(1.0, (double)stats->total.count));
88 }
89 
90 /* Print the (used) contents of CHANGES.  Use POOL for allocations.
91  */
92 static void
print_largest_reps(svn_fs_fs__largest_changes_t * changes,apr_pool_t * pool)93 print_largest_reps(svn_fs_fs__largest_changes_t *changes,
94                    apr_pool_t *pool)
95 {
96   apr_size_t i;
97   for (i = 0; i < changes->count && changes->changes[i]->size; ++i)
98     printf(_("%12s r%-8ld %s\n"),
99            svn__ui64toa_sep(changes->changes[i]->size, ',', pool),
100            changes->changes[i]->revision,
101            changes->changes[i]->path->data);
102 }
103 
104 /* Print the non-zero section of HISTOGRAM to console.
105  * Use POOL for allocations.
106  */
107 static void
print_histogram(svn_fs_fs__histogram_t * histogram,apr_pool_t * pool)108 print_histogram(svn_fs_fs__histogram_t *histogram,
109                 apr_pool_t *pool)
110 {
111   int first = 0;
112   int last = 63;
113   int i;
114 
115   /* identify non-zero range */
116   while (last > 0 && histogram->lines[last].count == 0)
117     --last;
118 
119   while (first <= last && histogram->lines[first].count == 0)
120     ++first;
121 
122   /* display histogram lines */
123   for (i = last; i >= first; --i)
124     printf(_("  %4s .. < %-4s %19s (%2d%%) bytes in %12s (%2d%%) items\n"),
125            print_two_power(i-1, pool), print_two_power(i, pool),
126            svn__ui64toa_sep(histogram->lines[i].sum, ',', pool),
127            (int)(histogram->lines[i].sum * 100 / histogram->total.sum),
128            svn__ui64toa_sep(histogram->lines[i].count, ',', pool),
129            (int)(histogram->lines[i].count * 100 / histogram->total.count));
130 }
131 
132 /* COMPARISON_FUNC for svn_sort__hash.
133  * Sort extension_info_t values by total count in descending order.
134  */
135 static int
compare_count(const svn_sort__item_t * a,const svn_sort__item_t * b)136 compare_count(const svn_sort__item_t *a,
137               const svn_sort__item_t *b)
138 {
139   const svn_fs_fs__extension_info_t *lhs = a->value;
140   const svn_fs_fs__extension_info_t *rhs = b->value;
141   apr_int64_t diff = lhs->node_histogram.total.count
142                    - rhs->node_histogram.total.count;
143 
144   return diff > 0 ? -1 : (diff < 0 ? 1 : 0);
145 }
146 
147 /* COMPARISON_FUNC for svn_sort__hash.
148  * Sort extension_info_t values by total uncompressed size in descending order.
149  */
150 static int
compare_node_size(const svn_sort__item_t * a,const svn_sort__item_t * b)151 compare_node_size(const svn_sort__item_t *a,
152                   const svn_sort__item_t *b)
153 {
154   const svn_fs_fs__extension_info_t *lhs = a->value;
155   const svn_fs_fs__extension_info_t *rhs = b->value;
156   apr_int64_t diff = lhs->node_histogram.total.sum
157                    - rhs->node_histogram.total.sum;
158 
159   return diff > 0 ? -1 : (diff < 0 ? 1 : 0);
160 }
161 
162 /* COMPARISON_FUNC for svn_sort__hash.
163  * Sort extension_info_t values by total prep count in descending order.
164  */
165 static int
compare_rep_size(const svn_sort__item_t * a,const svn_sort__item_t * b)166 compare_rep_size(const svn_sort__item_t *a,
167                  const svn_sort__item_t *b)
168 {
169   const svn_fs_fs__extension_info_t *lhs = a->value;
170   const svn_fs_fs__extension_info_t *rhs = b->value;
171   apr_int64_t diff = lhs->rep_histogram.total.sum
172                    - rhs->rep_histogram.total.sum;
173 
174   return diff > 0 ? -1 : (diff < 0 ? 1 : 0);
175 }
176 
177 /* Return an array of extension_info_t* for the (up to) 16 most prominent
178  * extensions in STATS according to the sort criterion COMPARISON_FUNC.
179  * Allocate results in POOL.
180  */
181 static apr_array_header_t *
get_by_extensions(svn_fs_fs__stats_t * stats,int (* comparison_func)(const svn_sort__item_t *,const svn_sort__item_t *),apr_pool_t * pool)182 get_by_extensions(svn_fs_fs__stats_t *stats,
183                   int (*comparison_func)(const svn_sort__item_t *,
184                                          const svn_sort__item_t *),
185                   apr_pool_t *pool)
186 {
187   /* sort all data by extension */
188   apr_array_header_t *sorted
189     = svn_sort__hash(stats->by_extension, comparison_func, pool);
190 
191   /* select the top (first) 16 entries */
192   int count = MIN(sorted->nelts, 16);
193   apr_array_header_t *result
194     = apr_array_make(pool, count, sizeof(svn_fs_fs__extension_info_t*));
195   int i;
196 
197   for (i = 0; i < count; ++i)
198     APR_ARRAY_PUSH(result, svn_fs_fs__extension_info_t*)
199      = APR_ARRAY_IDX(sorted, i, svn_sort__item_t).value;
200 
201   return result;
202 }
203 
204 /* Add all extension_info_t* entries of TO_ADD not already in TARGET to
205  * TARGET.
206  */
207 static void
merge_by_extension(apr_array_header_t * target,apr_array_header_t * to_add)208 merge_by_extension(apr_array_header_t *target,
209                    apr_array_header_t *to_add)
210 {
211   int i, k, count;
212 
213   count = target->nelts;
214   for (i = 0; i < to_add->nelts; ++i)
215     {
216       svn_fs_fs__extension_info_t *info
217         = APR_ARRAY_IDX(to_add, i, svn_fs_fs__extension_info_t *);
218       for (k = 0; k < count; ++k)
219         if (info == APR_ARRAY_IDX(target, k, svn_fs_fs__extension_info_t *))
220           break;
221 
222       if (k == count)
223         APR_ARRAY_PUSH(target, svn_fs_fs__extension_info_t*) = info;
224     }
225 }
226 
227 /* Print the (up to) 16 extensions in STATS with the most changes.
228  * Use POOL for allocations.
229  */
230 static void
print_extensions_by_changes(svn_fs_fs__stats_t * stats,apr_pool_t * pool)231 print_extensions_by_changes(svn_fs_fs__stats_t *stats,
232                             apr_pool_t *pool)
233 {
234   apr_array_header_t *data = get_by_extensions(stats, compare_count, pool);
235   apr_int64_t sum = 0;
236   int i;
237 
238   for (i = 0; i < data->nelts; ++i)
239     {
240       svn_fs_fs__extension_info_t *info
241         = APR_ARRAY_IDX(data, i, svn_fs_fs__extension_info_t *);
242 
243       /* If there are elements, then their count cannot be 0. */
244       assert(stats->file_histogram.total.count);
245 
246       sum += info->node_histogram.total.count;
247       printf(_("%11s %20s (%2d%%) representations\n"),
248              info->extension,
249              svn__ui64toa_sep(info->node_histogram.total.count, ',', pool),
250              (int)(info->node_histogram.total.count * 100 /
251                    stats->file_histogram.total.count));
252     }
253 
254   if (stats->file_histogram.total.count)
255     {
256       printf(_("%11s %20s (%2d%%) representations\n"),
257              "(others)",
258              svn__ui64toa_sep(stats->file_histogram.total.count - sum, ',',
259                               pool),
260              (int)((stats->file_histogram.total.count - sum) * 100 /
261                    stats->file_histogram.total.count));
262     }
263 }
264 
/* Return PART as a percentage of TOTAL, truncated towards zero.
 * The result is capped at 100 whenever PART is not smaller than TOTAL;
 * this includes the case TOTAL == 0, so no division by zero can occur.
 */
static int
get_percentage(apr_uint64_t part,
               apr_uint64_t total)
{
  /* PART < TOTAL implies TOTAL > 0, which makes the division safe. */
  return (part < total) ? (int)(part * 100.0 / total) : 100;
}
277 
278 /* Print the (up to) 16 extensions in STATS with the largest total size of
279  * changed file content.  Use POOL for allocations.
280  */
281 static void
print_extensions_by_nodes(svn_fs_fs__stats_t * stats,apr_pool_t * pool)282 print_extensions_by_nodes(svn_fs_fs__stats_t *stats,
283                           apr_pool_t *pool)
284 {
285   apr_array_header_t *data = get_by_extensions(stats, compare_node_size, pool);
286   apr_int64_t sum = 0;
287   int i;
288 
289   for (i = 0; i < data->nelts; ++i)
290     {
291       svn_fs_fs__extension_info_t *info
292         = APR_ARRAY_IDX(data, i, svn_fs_fs__extension_info_t *);
293       sum += info->node_histogram.total.sum;
294       printf(_("%11s %20s (%2d%%) bytes\n"),
295              info->extension,
296              svn__ui64toa_sep(info->node_histogram.total.sum, ',', pool),
297              get_percentage(info->node_histogram.total.sum,
298                             stats->file_histogram.total.sum));
299     }
300 
301   if (stats->file_histogram.total.sum > sum)
302     {
303       /* Total sum can't be zero here. */
304       printf(_("%11s %20s (%2d%%) bytes\n"),
305              "(others)",
306              svn__ui64toa_sep(stats->file_histogram.total.sum - sum, ',',
307                               pool),
308              get_percentage(stats->file_histogram.total.sum - sum,
309                             stats->file_histogram.total.sum));
310     }
311 }
312 
313 /* Print the (up to) 16 extensions in STATS with the largest total size of
314  * changed file content.  Use POOL for allocations.
315  */
316 static void
print_extensions_by_reps(svn_fs_fs__stats_t * stats,apr_pool_t * pool)317 print_extensions_by_reps(svn_fs_fs__stats_t *stats,
318                          apr_pool_t *pool)
319 {
320   apr_array_header_t *data = get_by_extensions(stats, compare_rep_size, pool);
321   apr_int64_t sum = 0;
322   int i;
323 
324   for (i = 0; i < data->nelts; ++i)
325     {
326       svn_fs_fs__extension_info_t *info
327         = APR_ARRAY_IDX(data, i, svn_fs_fs__extension_info_t *);
328       sum += info->rep_histogram.total.sum;
329       printf(_("%11s %20s (%2d%%) bytes\n"),
330              info->extension,
331              svn__ui64toa_sep(info->rep_histogram.total.sum, ',', pool),
332              get_percentage(info->rep_histogram.total.sum,
333                             stats->rep_size_histogram.total.sum));
334     }
335 
336   if (stats->rep_size_histogram.total.sum > sum)
337     {
338       /* Total sum can't be zero here. */
339       printf(_("%11s %20s (%2d%%) bytes\n"),
340              "(others)",
341              svn__ui64toa_sep(stats->rep_size_histogram.total.sum - sum, ',',
342                               pool),
343              get_percentage(stats->rep_size_histogram.total.sum - sum,
344                             stats->rep_size_histogram.total.sum));
345     }
346 }
347 
348 /* Print per-extension histograms for the most frequent extensions in STATS.
349  * Use POOL for allocations. */
350 static void
print_histograms_by_extension(svn_fs_fs__stats_t * stats,apr_pool_t * pool)351 print_histograms_by_extension(svn_fs_fs__stats_t *stats,
352                               apr_pool_t *pool)
353 {
354   apr_array_header_t *data = get_by_extensions(stats, compare_count, pool);
355   int i;
356 
357   merge_by_extension(data, get_by_extensions(stats, compare_node_size, pool));
358   merge_by_extension(data, get_by_extensions(stats, compare_rep_size, pool));
359 
360   for (i = 0; i < data->nelts; ++i)
361     {
362       svn_fs_fs__extension_info_t *info
363         = APR_ARRAY_IDX(data, i, svn_fs_fs__extension_info_t *);
364       printf("\nHistogram of '%s' file sizes:\n", info->extension);
365       print_histogram(&info->node_histogram, pool);
366       printf("\nHistogram of '%s' file representation sizes:\n",
367              info->extension);
368       print_histogram(&info->rep_histogram, pool);
369     }
370 }
371 
372 /* Print the contents of STATS to the console.
373  * Use POOL for allocations.
374  */
375 static void
print_stats(svn_fs_fs__stats_t * stats,apr_pool_t * pool)376 print_stats(svn_fs_fs__stats_t *stats,
377             apr_pool_t *pool)
378 {
379   /* print results */
380   printf("\n\nGlobal statistics:\n");
381   printf(_("%20s bytes in %12s revisions\n"
382            "%20s bytes in %12s changes\n"
383            "%20s bytes in %12s node revision records\n"
384            "%20s bytes in %12s representations\n"
385            "%20s bytes expanded representation size\n"
386            "%20s bytes with rep-sharing off\n"),
387          svn__ui64toa_sep(stats->total_size, ',', pool),
388          svn__ui64toa_sep(stats->revision_count, ',', pool),
389          svn__ui64toa_sep(stats->change_len, ',', pool),
390          svn__ui64toa_sep(stats->change_count, ',', pool),
391          svn__ui64toa_sep(stats->total_node_stats.size, ',', pool),
392          svn__ui64toa_sep(stats->total_node_stats.count, ',', pool),
393          svn__ui64toa_sep(stats->total_rep_stats.total.packed_size, ',',
394                          pool),
395          svn__ui64toa_sep(stats->total_rep_stats.total.count, ',', pool),
396          svn__ui64toa_sep(stats->total_rep_stats.total.expanded_size, ',',
397                          pool),
398          svn__ui64toa_sep(stats->total_rep_stats.expanded_size, ',', pool));
399 
400   printf("\nNoderev statistics:\n");
401   printf(_("%20s bytes in %12s nodes total\n"
402            "%20s bytes in %12s directory noderevs\n"
403            "%20s bytes in %12s file noderevs\n"),
404          svn__ui64toa_sep(stats->total_node_stats.size, ',', pool),
405          svn__ui64toa_sep(stats->total_node_stats.count, ',', pool),
406          svn__ui64toa_sep(stats->dir_node_stats.size, ',', pool),
407          svn__ui64toa_sep(stats->dir_node_stats.count, ',', pool),
408          svn__ui64toa_sep(stats->file_node_stats.size, ',', pool),
409          svn__ui64toa_sep(stats->file_node_stats.count, ',', pool));
410 
411   printf("\nRepresentation statistics:\n");
412   printf(_("%20s bytes in %12s representations total\n"
413            "%20s bytes in %12s directory representations\n"
414            "%20s bytes in %12s file representations\n"
415            "%20s bytes in %12s representations of added file nodes\n"
416            "%20s bytes in %12s directory property representations\n"
417            "%20s bytes in %12s file property representations\n"
418            "                         with %12.3f average delta chain length\n"
419            "%20s bytes in header & footer overhead\n"),
420          svn__ui64toa_sep(stats->total_rep_stats.total.packed_size, ',',
421                          pool),
422          svn__ui64toa_sep(stats->total_rep_stats.total.count, ',', pool),
423          svn__ui64toa_sep(stats->dir_rep_stats.total.packed_size, ',',
424                          pool),
425          svn__ui64toa_sep(stats->dir_rep_stats.total.count, ',', pool),
426          svn__ui64toa_sep(stats->file_rep_stats.total.packed_size, ',',
427                          pool),
428          svn__ui64toa_sep(stats->file_rep_stats.total.count, ',', pool),
429          svn__ui64toa_sep(stats->added_rep_size_histogram.total.sum, ',',
430                          pool),
431          svn__ui64toa_sep(stats->added_rep_size_histogram.total.count, ',',
432                          pool),
433          svn__ui64toa_sep(stats->dir_prop_rep_stats.total.packed_size, ',',
434                          pool),
435          svn__ui64toa_sep(stats->dir_prop_rep_stats.total.count, ',', pool),
436          svn__ui64toa_sep(stats->file_prop_rep_stats.total.packed_size, ',',
437                          pool),
438          svn__ui64toa_sep(stats->file_prop_rep_stats.total.count, ',', pool),
439          stats->total_rep_stats.chain_len
440             / (double)stats->total_rep_stats.total.count,
441          svn__ui64toa_sep(stats->total_rep_stats.total.overhead_size, ',',
442                          pool));
443 
444   printf("\nDirectory representation statistics:\n");
445   print_rep_stats(&stats->dir_rep_stats, pool);
446   printf("\nFile representation statistics:\n");
447   print_rep_stats(&stats->file_rep_stats, pool);
448   printf("\nDirectory property representation statistics:\n");
449   print_rep_stats(&stats->dir_prop_rep_stats, pool);
450   printf("\nFile property representation statistics:\n");
451   print_rep_stats(&stats->file_prop_rep_stats, pool);
452 
453   printf("\nLargest representations:\n");
454   print_largest_reps(stats->largest_changes, pool);
455   printf("\nExtensions by number of representations:\n");
456   print_extensions_by_changes(stats, pool);
457   printf("\nExtensions by size of changed files:\n");
458   print_extensions_by_nodes(stats, pool);
459   printf("\nExtensions by size of representations:\n");
460   print_extensions_by_reps(stats, pool);
461 
462   printf("\nHistogram of expanded node sizes:\n");
463   print_histogram(&stats->node_size_histogram, pool);
464   printf("\nHistogram of representation sizes:\n");
465   print_histogram(&stats->rep_size_histogram, pool);
466   printf("\nHistogram of file sizes:\n");
467   print_histogram(&stats->file_histogram, pool);
468   printf("\nHistogram of file representation sizes:\n");
469   print_histogram(&stats->file_rep_histogram, pool);
470   printf("\nHistogram of file property sizes:\n");
471   print_histogram(&stats->file_prop_histogram, pool);
472   printf("\nHistogram of file property representation sizes:\n");
473   print_histogram(&stats->file_prop_rep_histogram, pool);
474   printf("\nHistogram of directory sizes:\n");
475   print_histogram(&stats->dir_histogram, pool);
476   printf("\nHistogram of directory representation sizes:\n");
477   print_histogram(&stats->dir_rep_histogram, pool);
478   printf("\nHistogram of directory property sizes:\n");
479   print_histogram(&stats->dir_prop_histogram, pool);
480   printf("\nHistogram of directory property representation sizes:\n");
481   print_histogram(&stats->dir_prop_rep_histogram, pool);
482 
483   print_histograms_by_extension(stats, pool);
484 }
485 
486 /* Our progress function simply prints the REVISION number and makes it
487  * appear immediately.
488  */
489 static void
print_progress(svn_revnum_t revision,void * baton,apr_pool_t * pool)490 print_progress(svn_revnum_t revision,
491                void *baton,
492                apr_pool_t *pool)
493 {
494   printf("%8ld", revision);
495   fflush(stdout);
496 }
497 
498 /* This implements `svn_opt_subcommand_t'. */
499 svn_error_t *
subcommand__stats(apr_getopt_t * os,void * baton,apr_pool_t * pool)500 subcommand__stats(apr_getopt_t *os, void *baton, apr_pool_t *pool)
501 {
502   svnfsfs__opt_state *opt_state = baton;
503   svn_fs_t *fs;
504   svn_fs_fs__ioctl_get_stats_input_t input = {0};
505   svn_fs_fs__ioctl_get_stats_output_t *output;
506 
507   printf("Reading revisions\n");
508   SVN_ERR(open_fs(&fs, opt_state->repository_path, pool));
509 
510   input.progress_func = print_progress;
511   SVN_ERR(svn_fs_ioctl(fs, SVN_FS_FS__IOCTL_GET_STATS, &input, (void **)&output,
512                        check_cancel, NULL, pool, pool));
513   print_stats(output->stats, pool);
514 
515   return SVN_NO_ERROR;
516 }
517