xref: /freebsd/sys/contrib/zstd/lib/dictBuilder/cover.c (revision 5ff13fbc)
10c16b537SWarner Losh /*
25ff13fbcSAllan Jude  * Copyright (c) Yann Collet, Facebook, Inc.
30c16b537SWarner Losh  * All rights reserved.
40c16b537SWarner Losh  *
50c16b537SWarner Losh  * This source code is licensed under both the BSD-style license (found in the
60c16b537SWarner Losh  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
70c16b537SWarner Losh  * in the COPYING file in the root directory of this source tree).
80c16b537SWarner Losh  * You may select, at your option, one of the above-listed licenses.
90c16b537SWarner Losh  */
100c16b537SWarner Losh 
110c16b537SWarner Losh /* *****************************************************************************
120c16b537SWarner Losh  * Constructs a dictionary using a heuristic based on the following paper:
130c16b537SWarner Losh  *
140c16b537SWarner Losh  * Liao, Petri, Moffat, Wirth
150c16b537SWarner Losh  * Effective Construction of Relative Lempel-Ziv Dictionaries
160c16b537SWarner Losh  * Published in WWW 2016.
170c16b537SWarner Losh  *
180c16b537SWarner Losh  * Adapted from code originally written by @ot (Giuseppe Ottaviano).
190c16b537SWarner Losh  ******************************************************************************/
200c16b537SWarner Losh 
210c16b537SWarner Losh /*-*************************************
220c16b537SWarner Losh *  Dependencies
230c16b537SWarner Losh ***************************************/
240c16b537SWarner Losh #include <stdio.h>  /* fprintf */
250c16b537SWarner Losh #include <stdlib.h> /* malloc, free, qsort */
260c16b537SWarner Losh #include <string.h> /* memset */
270c16b537SWarner Losh #include <time.h>   /* clock */
280c16b537SWarner Losh 
290c16b537SWarner Losh #ifndef ZDICT_STATIC_LINKING_ONLY
300c16b537SWarner Losh #  define ZDICT_STATIC_LINKING_ONLY
310c16b537SWarner Losh #endif
325ff13fbcSAllan Jude 
335ff13fbcSAllan Jude #include "../common/mem.h" /* read */
345ff13fbcSAllan Jude #include "../common/pool.h"
355ff13fbcSAllan Jude #include "../common/threading.h"
365ff13fbcSAllan Jude #include "../common/zstd_internal.h" /* includes zstd.h */
375ff13fbcSAllan Jude #include "../zdict.h"
385ff13fbcSAllan Jude #include "cover.h"
390c16b537SWarner Losh 
400c16b537SWarner Losh /*-*************************************
410c16b537SWarner Losh *  Constants
420c16b537SWarner Losh ***************************************/
435ff13fbcSAllan Jude /**
445ff13fbcSAllan Jude * There are 32bit indexes used to ref samples, so limit samples size to 4GB
455ff13fbcSAllan Jude * on 64bit builds.
465ff13fbcSAllan Jude * For 32bit builds we choose 1 GB.
475ff13fbcSAllan Jude * Most 32bit platforms have 2GB user-mode addressable space and we allocate a large
485ff13fbcSAllan Jude * contiguous buffer, so 1GB is already a high limit.
495ff13fbcSAllan Jude */
50a0483764SConrad Meyer #define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
51f7cd7fe5SConrad Meyer #define COVER_DEFAULT_SPLITPOINT 1.0
520c16b537SWarner Losh 
530c16b537SWarner Losh /*-*************************************
540c16b537SWarner Losh *  Console display
550c16b537SWarner Losh ***************************************/
56f7cd7fe5SConrad Meyer #ifndef LOCALDISPLAYLEVEL
575ff13fbcSAllan Jude static int g_displayLevel = 0;
58f7cd7fe5SConrad Meyer #endif
59f7cd7fe5SConrad Meyer #undef  DISPLAY
600c16b537SWarner Losh #define DISPLAY(...)                                                           \
610c16b537SWarner Losh   {                                                                            \
620c16b537SWarner Losh     fprintf(stderr, __VA_ARGS__);                                              \
630c16b537SWarner Losh     fflush(stderr);                                                            \
640c16b537SWarner Losh   }
65f7cd7fe5SConrad Meyer #undef  LOCALDISPLAYLEVEL
660c16b537SWarner Losh #define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
670c16b537SWarner Losh   if (displayLevel >= l) {                                                     \
680c16b537SWarner Losh     DISPLAY(__VA_ARGS__);                                                      \
690c16b537SWarner Losh   } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
70f7cd7fe5SConrad Meyer #undef  DISPLAYLEVEL
710c16b537SWarner Losh #define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)
720c16b537SWarner Losh 
73f7cd7fe5SConrad Meyer #ifndef LOCALDISPLAYUPDATE
74f7cd7fe5SConrad Meyer static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
75f7cd7fe5SConrad Meyer static clock_t g_time = 0;
76f7cd7fe5SConrad Meyer #endif
77f7cd7fe5SConrad Meyer #undef  LOCALDISPLAYUPDATE
780c16b537SWarner Losh #define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
790c16b537SWarner Losh   if (displayLevel >= l) {                                                     \
80f7cd7fe5SConrad Meyer     if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) {             \
810c16b537SWarner Losh       g_time = clock();                                                        \
820c16b537SWarner Losh       DISPLAY(__VA_ARGS__);                                                    \
830c16b537SWarner Losh     }                                                                          \
840c16b537SWarner Losh   }
85f7cd7fe5SConrad Meyer #undef  DISPLAYUPDATE
860c16b537SWarner Losh #define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)
870c16b537SWarner Losh 
880c16b537SWarner Losh /*-*************************************
890c16b537SWarner Losh * Hash table
900c16b537SWarner Losh ***************************************
910c16b537SWarner Losh * A small specialized hash map for storing activeDmers.
920c16b537SWarner Losh * The map does not resize, so if it becomes full it will loop forever.
930c16b537SWarner Losh * Thus, the map must be large enough to store every value.
940c16b537SWarner Losh * The map implements linear probing and keeps its load less than 0.5.
950c16b537SWarner Losh */
960c16b537SWarner Losh 
970c16b537SWarner Losh #define MAP_EMPTY_VALUE ((U32)-1)
980c16b537SWarner Losh typedef struct COVER_map_pair_t_s {
990c16b537SWarner Losh   U32 key;
1000c16b537SWarner Losh   U32 value;
1010c16b537SWarner Losh } COVER_map_pair_t;
1020c16b537SWarner Losh 
1030c16b537SWarner Losh typedef struct COVER_map_s {
1040c16b537SWarner Losh   COVER_map_pair_t *data;
1050c16b537SWarner Losh   U32 sizeLog;
1060c16b537SWarner Losh   U32 size;
1070c16b537SWarner Losh   U32 sizeMask;
1080c16b537SWarner Losh } COVER_map_t;
1090c16b537SWarner Losh 
1100c16b537SWarner Losh /**
1110c16b537SWarner Losh  * Clear the map.
1120c16b537SWarner Losh  */
COVER_map_clear(COVER_map_t * map)1130c16b537SWarner Losh static void COVER_map_clear(COVER_map_t *map) {
1140c16b537SWarner Losh   memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
1150c16b537SWarner Losh }
1160c16b537SWarner Losh 
1170c16b537SWarner Losh /**
1180c16b537SWarner Losh  * Initializes a map of the given size.
1190c16b537SWarner Losh  * Returns 1 on success and 0 on failure.
1200c16b537SWarner Losh  * The map must be destroyed with COVER_map_destroy().
1210c16b537SWarner Losh  * The map is only guaranteed to be large enough to hold size elements.
1220c16b537SWarner Losh  */
COVER_map_init(COVER_map_t * map,U32 size)1230c16b537SWarner Losh static int COVER_map_init(COVER_map_t *map, U32 size) {
1240c16b537SWarner Losh   map->sizeLog = ZSTD_highbit32(size) + 2;
1250c16b537SWarner Losh   map->size = (U32)1 << map->sizeLog;
1260c16b537SWarner Losh   map->sizeMask = map->size - 1;
1270c16b537SWarner Losh   map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
1280c16b537SWarner Losh   if (!map->data) {
1290c16b537SWarner Losh     map->sizeLog = 0;
1300c16b537SWarner Losh     map->size = 0;
1310c16b537SWarner Losh     return 0;
1320c16b537SWarner Losh   }
1330c16b537SWarner Losh   COVER_map_clear(map);
1340c16b537SWarner Losh   return 1;
1350c16b537SWarner Losh }
1360c16b537SWarner Losh 
1370c16b537SWarner Losh /**
1380c16b537SWarner Losh  * Internal hash function
1390c16b537SWarner Losh  */
140f7cd7fe5SConrad Meyer static const U32 COVER_prime4bytes = 2654435761U;
COVER_map_hash(COVER_map_t * map,U32 key)1410c16b537SWarner Losh static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
142f7cd7fe5SConrad Meyer   return (key * COVER_prime4bytes) >> (32 - map->sizeLog);
1430c16b537SWarner Losh }
1440c16b537SWarner Losh 
1450c16b537SWarner Losh /**
1460c16b537SWarner Losh  * Helper function that returns the index that a key should be placed into.
1470c16b537SWarner Losh  */
COVER_map_index(COVER_map_t * map,U32 key)1480c16b537SWarner Losh static U32 COVER_map_index(COVER_map_t *map, U32 key) {
1490c16b537SWarner Losh   const U32 hash = COVER_map_hash(map, key);
1500c16b537SWarner Losh   U32 i;
1510c16b537SWarner Losh   for (i = hash;; i = (i + 1) & map->sizeMask) {
1520c16b537SWarner Losh     COVER_map_pair_t *pos = &map->data[i];
1530c16b537SWarner Losh     if (pos->value == MAP_EMPTY_VALUE) {
1540c16b537SWarner Losh       return i;
1550c16b537SWarner Losh     }
1560c16b537SWarner Losh     if (pos->key == key) {
1570c16b537SWarner Losh       return i;
1580c16b537SWarner Losh     }
1590c16b537SWarner Losh   }
1600c16b537SWarner Losh }
1610c16b537SWarner Losh 
1620c16b537SWarner Losh /**
1630c16b537SWarner Losh  * Returns the pointer to the value for key.
1640c16b537SWarner Losh  * If key is not in the map, it is inserted and the value is set to 0.
1650c16b537SWarner Losh  * The map must not be full.
1660c16b537SWarner Losh  */
COVER_map_at(COVER_map_t * map,U32 key)1670c16b537SWarner Losh static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
1680c16b537SWarner Losh   COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
1690c16b537SWarner Losh   if (pos->value == MAP_EMPTY_VALUE) {
1700c16b537SWarner Losh     pos->key = key;
1710c16b537SWarner Losh     pos->value = 0;
1720c16b537SWarner Losh   }
1730c16b537SWarner Losh   return &pos->value;
1740c16b537SWarner Losh }
1750c16b537SWarner Losh 
1760c16b537SWarner Losh /**
1770c16b537SWarner Losh  * Deletes key from the map if present.
1780c16b537SWarner Losh  */
COVER_map_remove(COVER_map_t * map,U32 key)1790c16b537SWarner Losh static void COVER_map_remove(COVER_map_t *map, U32 key) {
1800c16b537SWarner Losh   U32 i = COVER_map_index(map, key);
1810c16b537SWarner Losh   COVER_map_pair_t *del = &map->data[i];
1820c16b537SWarner Losh   U32 shift = 1;
1830c16b537SWarner Losh   if (del->value == MAP_EMPTY_VALUE) {
1840c16b537SWarner Losh     return;
1850c16b537SWarner Losh   }
1860c16b537SWarner Losh   for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
1870c16b537SWarner Losh     COVER_map_pair_t *const pos = &map->data[i];
1880c16b537SWarner Losh     /* If the position is empty we are done */
1890c16b537SWarner Losh     if (pos->value == MAP_EMPTY_VALUE) {
1900c16b537SWarner Losh       del->value = MAP_EMPTY_VALUE;
1910c16b537SWarner Losh       return;
1920c16b537SWarner Losh     }
1930c16b537SWarner Losh     /* If pos can be moved to del do so */
1940c16b537SWarner Losh     if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
1950c16b537SWarner Losh       del->key = pos->key;
1960c16b537SWarner Losh       del->value = pos->value;
1970c16b537SWarner Losh       del = pos;
1980c16b537SWarner Losh       shift = 1;
1990c16b537SWarner Losh     } else {
2000c16b537SWarner Losh       ++shift;
2010c16b537SWarner Losh     }
2020c16b537SWarner Losh   }
2030c16b537SWarner Losh }
2040c16b537SWarner Losh 
2050c16b537SWarner Losh /**
2060f743729SConrad Meyer  * Destroys a map that is inited with COVER_map_init().
2070c16b537SWarner Losh  */
COVER_map_destroy(COVER_map_t * map)2080c16b537SWarner Losh static void COVER_map_destroy(COVER_map_t *map) {
2090c16b537SWarner Losh   if (map->data) {
2100c16b537SWarner Losh     free(map->data);
2110c16b537SWarner Losh   }
2120c16b537SWarner Losh   map->data = NULL;
2130c16b537SWarner Losh   map->size = 0;
2140c16b537SWarner Losh }
2150c16b537SWarner Losh 
2160c16b537SWarner Losh /*-*************************************
2170c16b537SWarner Losh * Context
2180c16b537SWarner Losh ***************************************/
2190c16b537SWarner Losh 
2200c16b537SWarner Losh typedef struct {
2210c16b537SWarner Losh   const BYTE *samples;
2220c16b537SWarner Losh   size_t *offsets;
2230c16b537SWarner Losh   const size_t *samplesSizes;
2240c16b537SWarner Losh   size_t nbSamples;
2250f743729SConrad Meyer   size_t nbTrainSamples;
2260f743729SConrad Meyer   size_t nbTestSamples;
2270c16b537SWarner Losh   U32 *suffix;
2280c16b537SWarner Losh   size_t suffixSize;
2290c16b537SWarner Losh   U32 *freqs;
2300c16b537SWarner Losh   U32 *dmerAt;
2310c16b537SWarner Losh   unsigned d;
2320c16b537SWarner Losh } COVER_ctx_t;
2330c16b537SWarner Losh 
2340c16b537SWarner Losh /* We need a global context for qsort... */
235f7cd7fe5SConrad Meyer static COVER_ctx_t *g_coverCtx = NULL;
2360c16b537SWarner Losh 
2370c16b537SWarner Losh /*-*************************************
2380c16b537SWarner Losh *  Helper functions
2390c16b537SWarner Losh ***************************************/
2400c16b537SWarner Losh 
2410c16b537SWarner Losh /**
2420c16b537SWarner Losh  * Returns the sum of the sample sizes.
2430c16b537SWarner Losh  */
COVER_sum(const size_t * samplesSizes,unsigned nbSamples)2440f743729SConrad Meyer size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
2450c16b537SWarner Losh   size_t sum = 0;
2460f743729SConrad Meyer   unsigned i;
2470c16b537SWarner Losh   for (i = 0; i < nbSamples; ++i) {
2480c16b537SWarner Losh     sum += samplesSizes[i];
2490c16b537SWarner Losh   }
2500c16b537SWarner Losh   return sum;
2510c16b537SWarner Losh }
2520c16b537SWarner Losh 
2530c16b537SWarner Losh /**
2540c16b537SWarner Losh  * Returns -1 if the dmer at lp is less than the dmer at rp.
2550c16b537SWarner Losh  * Return 0 if the dmers at lp and rp are equal.
2560c16b537SWarner Losh  * Returns 1 if the dmer at lp is greater than the dmer at rp.
2570c16b537SWarner Losh  */
COVER_cmp(COVER_ctx_t * ctx,const void * lp,const void * rp)2580c16b537SWarner Losh static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
2590c16b537SWarner Losh   U32 const lhs = *(U32 const *)lp;
2600c16b537SWarner Losh   U32 const rhs = *(U32 const *)rp;
2610c16b537SWarner Losh   return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
2620c16b537SWarner Losh }
2630c16b537SWarner Losh /**
2640c16b537SWarner Losh  * Faster version for d <= 8.
2650c16b537SWarner Losh  */
COVER_cmp8(COVER_ctx_t * ctx,const void * lp,const void * rp)2660c16b537SWarner Losh static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
2670c16b537SWarner Losh   U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
2680c16b537SWarner Losh   U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
2690c16b537SWarner Losh   U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
2700c16b537SWarner Losh   if (lhs < rhs) {
2710c16b537SWarner Losh     return -1;
2720c16b537SWarner Losh   }
2730c16b537SWarner Losh   return (lhs > rhs);
2740c16b537SWarner Losh }
2750c16b537SWarner Losh 
2760c16b537SWarner Losh /**
2770c16b537SWarner Losh  * Same as COVER_cmp() except ties are broken by pointer value
278f7cd7fe5SConrad Meyer  * NOTE: g_coverCtx must be set to call this function.  A global is required because
2790c16b537SWarner Losh  * qsort doesn't take an opaque pointer.
2800c16b537SWarner Losh  */
COVER_strict_cmp(const void * lp,const void * rp)281f7cd7fe5SConrad Meyer static int WIN_CDECL COVER_strict_cmp(const void *lp, const void *rp) {
282f7cd7fe5SConrad Meyer   int result = COVER_cmp(g_coverCtx, lp, rp);
2830c16b537SWarner Losh   if (result == 0) {
2840c16b537SWarner Losh     result = lp < rp ? -1 : 1;
2850c16b537SWarner Losh   }
2860c16b537SWarner Losh   return result;
2870c16b537SWarner Losh }
2880c16b537SWarner Losh /**
2890c16b537SWarner Losh  * Faster version for d <= 8.
2900c16b537SWarner Losh  */
COVER_strict_cmp8(const void * lp,const void * rp)291f7cd7fe5SConrad Meyer static int WIN_CDECL COVER_strict_cmp8(const void *lp, const void *rp) {
292f7cd7fe5SConrad Meyer   int result = COVER_cmp8(g_coverCtx, lp, rp);
2930c16b537SWarner Losh   if (result == 0) {
2940c16b537SWarner Losh     result = lp < rp ? -1 : 1;
2950c16b537SWarner Losh   }
2960c16b537SWarner Losh   return result;
2970c16b537SWarner Losh }
2980c16b537SWarner Losh 
2990c16b537SWarner Losh /**
3000c16b537SWarner Losh  * Returns the first pointer in [first, last) whose element does not compare
3010c16b537SWarner Losh  * less than value.  If no such element exists it returns last.
3020c16b537SWarner Losh  */
COVER_lower_bound(const size_t * first,const size_t * last,size_t value)3030c16b537SWarner Losh static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
3040c16b537SWarner Losh                                        size_t value) {
3050c16b537SWarner Losh   size_t count = last - first;
3060c16b537SWarner Losh   while (count != 0) {
3070c16b537SWarner Losh     size_t step = count / 2;
3080c16b537SWarner Losh     const size_t *ptr = first;
3090c16b537SWarner Losh     ptr += step;
3100c16b537SWarner Losh     if (*ptr < value) {
3110c16b537SWarner Losh       first = ++ptr;
3120c16b537SWarner Losh       count -= step + 1;
3130c16b537SWarner Losh     } else {
3140c16b537SWarner Losh       count = step;
3150c16b537SWarner Losh     }
3160c16b537SWarner Losh   }
3170c16b537SWarner Losh   return first;
3180c16b537SWarner Losh }
3190c16b537SWarner Losh 
3200c16b537SWarner Losh /**
3210c16b537SWarner Losh  * Generic groupBy function.
3220c16b537SWarner Losh  * Groups an array sorted by cmp into groups with equivalent values.
3230c16b537SWarner Losh  * Calls grp for each group.
3240c16b537SWarner Losh  */
3250c16b537SWarner Losh static void
COVER_groupBy(const void * data,size_t count,size_t size,COVER_ctx_t * ctx,int (* cmp)(COVER_ctx_t *,const void *,const void *),void (* grp)(COVER_ctx_t *,const void *,const void *))3260c16b537SWarner Losh COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
3270c16b537SWarner Losh               int (*cmp)(COVER_ctx_t *, const void *, const void *),
3280c16b537SWarner Losh               void (*grp)(COVER_ctx_t *, const void *, const void *)) {
3290c16b537SWarner Losh   const BYTE *ptr = (const BYTE *)data;
3300c16b537SWarner Losh   size_t num = 0;
3310c16b537SWarner Losh   while (num < count) {
3320c16b537SWarner Losh     const BYTE *grpEnd = ptr + size;
3330c16b537SWarner Losh     ++num;
3340c16b537SWarner Losh     while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
3350c16b537SWarner Losh       grpEnd += size;
3360c16b537SWarner Losh       ++num;
3370c16b537SWarner Losh     }
3380c16b537SWarner Losh     grp(ctx, ptr, grpEnd);
3390c16b537SWarner Losh     ptr = grpEnd;
3400c16b537SWarner Losh   }
3410c16b537SWarner Losh }
3420c16b537SWarner Losh 
3430c16b537SWarner Losh /*-*************************************
3440c16b537SWarner Losh *  Cover functions
3450c16b537SWarner Losh ***************************************/
3460c16b537SWarner Losh 
3470c16b537SWarner Losh /**
3480c16b537SWarner Losh  * Called on each group of positions with the same dmer.
3490c16b537SWarner Losh  * Counts the frequency of each dmer and saves it in the suffix array.
3500c16b537SWarner Losh  * Fills `ctx->dmerAt`.
3510c16b537SWarner Losh  */
COVER_group(COVER_ctx_t * ctx,const void * group,const void * groupEnd)3520c16b537SWarner Losh static void COVER_group(COVER_ctx_t *ctx, const void *group,
3530c16b537SWarner Losh                         const void *groupEnd) {
3540c16b537SWarner Losh   /* The group consists of all the positions with the same first d bytes. */
3550c16b537SWarner Losh   const U32 *grpPtr = (const U32 *)group;
3560c16b537SWarner Losh   const U32 *grpEnd = (const U32 *)groupEnd;
3570c16b537SWarner Losh   /* The dmerId is how we will reference this dmer.
3580c16b537SWarner Losh    * This allows us to map the whole dmer space to a much smaller space, the
3590c16b537SWarner Losh    * size of the suffix array.
3600c16b537SWarner Losh    */
3610c16b537SWarner Losh   const U32 dmerId = (U32)(grpPtr - ctx->suffix);
3620c16b537SWarner Losh   /* Count the number of samples this dmer shows up in */
3630c16b537SWarner Losh   U32 freq = 0;
3640c16b537SWarner Losh   /* Details */
3650c16b537SWarner Losh   const size_t *curOffsetPtr = ctx->offsets;
3660c16b537SWarner Losh   const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
3670c16b537SWarner Losh   /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
3680c16b537SWarner Losh    * different sample than the last.
3690c16b537SWarner Losh    */
3700c16b537SWarner Losh   size_t curSampleEnd = ctx->offsets[0];
3710c16b537SWarner Losh   for (; grpPtr != grpEnd; ++grpPtr) {
3720c16b537SWarner Losh     /* Save the dmerId for this position so we can get back to it. */
3730c16b537SWarner Losh     ctx->dmerAt[*grpPtr] = dmerId;
3740c16b537SWarner Losh     /* Dictionaries only help for the first reference to the dmer.
3750c16b537SWarner Losh      * After that zstd can reference the match from the previous reference.
3760c16b537SWarner Losh      * So only count each dmer once for each sample it is in.
3770c16b537SWarner Losh      */
3780c16b537SWarner Losh     if (*grpPtr < curSampleEnd) {
3790c16b537SWarner Losh       continue;
3800c16b537SWarner Losh     }
3810c16b537SWarner Losh     freq += 1;
3820c16b537SWarner Losh     /* Binary search to find the end of the sample *grpPtr is in.
3830c16b537SWarner Losh      * In the common case that grpPtr + 1 == grpEnd we can skip the binary
3840c16b537SWarner Losh      * search because the loop is over.
3850c16b537SWarner Losh      */
3860c16b537SWarner Losh     if (grpPtr + 1 != grpEnd) {
3870c16b537SWarner Losh       const size_t *sampleEndPtr =
3880c16b537SWarner Losh           COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
3890c16b537SWarner Losh       curSampleEnd = *sampleEndPtr;
3900c16b537SWarner Losh       curOffsetPtr = sampleEndPtr + 1;
3910c16b537SWarner Losh     }
3920c16b537SWarner Losh   }
3930c16b537SWarner Losh   /* At this point we are never going to look at this segment of the suffix
3940c16b537SWarner Losh    * array again.  We take advantage of this fact to save memory.
3950c16b537SWarner Losh    * We store the frequency of the dmer in the first position of the group,
3960c16b537SWarner Losh    * which is dmerId.
3970c16b537SWarner Losh    */
3980c16b537SWarner Losh   ctx->suffix[dmerId] = freq;
3990c16b537SWarner Losh }
4000c16b537SWarner Losh 
4010c16b537SWarner Losh 
4020c16b537SWarner Losh /**
4030c16b537SWarner Losh  * Selects the best segment in an epoch.
4040c16b537SWarner Losh  * Segments of are scored according to the function:
4050c16b537SWarner Losh  *
4060c16b537SWarner Losh  * Let F(d) be the frequency of dmer d.
4070c16b537SWarner Losh  * Let S_i be the dmer at position i of segment S which has length k.
4080c16b537SWarner Losh  *
4090c16b537SWarner Losh  *     Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
4100c16b537SWarner Losh  *
4112b9c00cbSConrad Meyer  * Once the dmer d is in the dictionary we set F(d) = 0.
4120c16b537SWarner Losh  */
COVER_selectSegment(const COVER_ctx_t * ctx,U32 * freqs,COVER_map_t * activeDmers,U32 begin,U32 end,ZDICT_cover_params_t parameters)4130c16b537SWarner Losh static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
4140c16b537SWarner Losh                                            COVER_map_t *activeDmers, U32 begin,
4150c16b537SWarner Losh                                            U32 end,
4160c16b537SWarner Losh                                            ZDICT_cover_params_t parameters) {
4170c16b537SWarner Losh   /* Constants */
4180c16b537SWarner Losh   const U32 k = parameters.k;
4190c16b537SWarner Losh   const U32 d = parameters.d;
4200c16b537SWarner Losh   const U32 dmersInK = k - d + 1;
4210c16b537SWarner Losh   /* Try each segment (activeSegment) and save the best (bestSegment) */
4220c16b537SWarner Losh   COVER_segment_t bestSegment = {0, 0, 0};
4230c16b537SWarner Losh   COVER_segment_t activeSegment;
4240c16b537SWarner Losh   /* Reset the activeDmers in the segment */
4250c16b537SWarner Losh   COVER_map_clear(activeDmers);
4260c16b537SWarner Losh   /* The activeSegment starts at the beginning of the epoch. */
4270c16b537SWarner Losh   activeSegment.begin = begin;
4280c16b537SWarner Losh   activeSegment.end = begin;
4290c16b537SWarner Losh   activeSegment.score = 0;
4300c16b537SWarner Losh   /* Slide the activeSegment through the whole epoch.
4310c16b537SWarner Losh    * Save the best segment in bestSegment.
4320c16b537SWarner Losh    */
4330c16b537SWarner Losh   while (activeSegment.end < end) {
4340c16b537SWarner Losh     /* The dmerId for the dmer at the next position */
4350c16b537SWarner Losh     U32 newDmer = ctx->dmerAt[activeSegment.end];
4360c16b537SWarner Losh     /* The entry in activeDmers for this dmerId */
4370c16b537SWarner Losh     U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
4380c16b537SWarner Losh     /* If the dmer isn't already present in the segment add its score. */
4390c16b537SWarner Losh     if (*newDmerOcc == 0) {
4400c16b537SWarner Losh       /* The paper suggest using the L-0.5 norm, but experiments show that it
4410c16b537SWarner Losh        * doesn't help.
4420c16b537SWarner Losh        */
4430c16b537SWarner Losh       activeSegment.score += freqs[newDmer];
4440c16b537SWarner Losh     }
4450c16b537SWarner Losh     /* Add the dmer to the segment */
4460c16b537SWarner Losh     activeSegment.end += 1;
4470c16b537SWarner Losh     *newDmerOcc += 1;
4480c16b537SWarner Losh 
4490c16b537SWarner Losh     /* If the window is now too large, drop the first position */
4500c16b537SWarner Losh     if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
4510c16b537SWarner Losh       U32 delDmer = ctx->dmerAt[activeSegment.begin];
4520c16b537SWarner Losh       U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
4530c16b537SWarner Losh       activeSegment.begin += 1;
4540c16b537SWarner Losh       *delDmerOcc -= 1;
4552b9c00cbSConrad Meyer       /* If this is the last occurrence of the dmer, subtract its score */
4560c16b537SWarner Losh       if (*delDmerOcc == 0) {
4570c16b537SWarner Losh         COVER_map_remove(activeDmers, delDmer);
4580c16b537SWarner Losh         activeSegment.score -= freqs[delDmer];
4590c16b537SWarner Losh       }
4600c16b537SWarner Losh     }
4610c16b537SWarner Losh 
4620c16b537SWarner Losh     /* If this segment is the best so far save it */
4630c16b537SWarner Losh     if (activeSegment.score > bestSegment.score) {
4640c16b537SWarner Losh       bestSegment = activeSegment;
4650c16b537SWarner Losh     }
4660c16b537SWarner Losh   }
4670c16b537SWarner Losh   {
4680c16b537SWarner Losh     /* Trim off the zero frequency head and tail from the segment. */
4690c16b537SWarner Losh     U32 newBegin = bestSegment.end;
4700c16b537SWarner Losh     U32 newEnd = bestSegment.begin;
4710c16b537SWarner Losh     U32 pos;
4720c16b537SWarner Losh     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
4730c16b537SWarner Losh       U32 freq = freqs[ctx->dmerAt[pos]];
4740c16b537SWarner Losh       if (freq != 0) {
4750c16b537SWarner Losh         newBegin = MIN(newBegin, pos);
4760c16b537SWarner Losh         newEnd = pos + 1;
4770c16b537SWarner Losh       }
4780c16b537SWarner Losh     }
4790c16b537SWarner Losh     bestSegment.begin = newBegin;
4800c16b537SWarner Losh     bestSegment.end = newEnd;
4810c16b537SWarner Losh   }
4820c16b537SWarner Losh   {
4830c16b537SWarner Losh     /* Zero out the frequency of each dmer covered by the chosen segment. */
4840c16b537SWarner Losh     U32 pos;
4850c16b537SWarner Losh     for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
4860c16b537SWarner Losh       freqs[ctx->dmerAt[pos]] = 0;
4870c16b537SWarner Losh     }
4880c16b537SWarner Losh   }
4890c16b537SWarner Losh   return bestSegment;
4900c16b537SWarner Losh }
4910c16b537SWarner Losh 
4920c16b537SWarner Losh /**
4930c16b537SWarner Losh  * Check the validity of the parameters.
4940c16b537SWarner Losh  * Returns non-zero if the parameters are valid and 0 otherwise.
4950c16b537SWarner Losh  */
COVER_checkParameters(ZDICT_cover_params_t parameters,size_t maxDictSize)4960c16b537SWarner Losh static int COVER_checkParameters(ZDICT_cover_params_t parameters,
4970c16b537SWarner Losh                                  size_t maxDictSize) {
4980c16b537SWarner Losh   /* k and d are required parameters */
4990c16b537SWarner Losh   if (parameters.d == 0 || parameters.k == 0) {
5000c16b537SWarner Losh     return 0;
5010c16b537SWarner Losh   }
5020c16b537SWarner Losh   /* k <= maxDictSize */
5030c16b537SWarner Losh   if (parameters.k > maxDictSize) {
5040c16b537SWarner Losh     return 0;
5050c16b537SWarner Losh   }
5060c16b537SWarner Losh   /* d <= k */
5070c16b537SWarner Losh   if (parameters.d > parameters.k) {
5080c16b537SWarner Losh     return 0;
5090c16b537SWarner Losh   }
5100f743729SConrad Meyer   /* 0 < splitPoint <= 1 */
5110f743729SConrad Meyer   if (parameters.splitPoint <= 0 || parameters.splitPoint > 1){
5120f743729SConrad Meyer     return 0;
5130f743729SConrad Meyer   }
5140c16b537SWarner Losh   return 1;
5150c16b537SWarner Losh }
5160c16b537SWarner Losh 
5170c16b537SWarner Losh /**
5180c16b537SWarner Losh  * Clean up a context initialized with `COVER_ctx_init()`.
5190c16b537SWarner Losh  */
COVER_ctx_destroy(COVER_ctx_t * ctx)5200c16b537SWarner Losh static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
5210c16b537SWarner Losh   if (!ctx) {
5220c16b537SWarner Losh     return;
5230c16b537SWarner Losh   }
5240c16b537SWarner Losh   if (ctx->suffix) {
5250c16b537SWarner Losh     free(ctx->suffix);
5260c16b537SWarner Losh     ctx->suffix = NULL;
5270c16b537SWarner Losh   }
5280c16b537SWarner Losh   if (ctx->freqs) {
5290c16b537SWarner Losh     free(ctx->freqs);
5300c16b537SWarner Losh     ctx->freqs = NULL;
5310c16b537SWarner Losh   }
5320c16b537SWarner Losh   if (ctx->dmerAt) {
5330c16b537SWarner Losh     free(ctx->dmerAt);
5340c16b537SWarner Losh     ctx->dmerAt = NULL;
5350c16b537SWarner Losh   }
5360c16b537SWarner Losh   if (ctx->offsets) {
5370c16b537SWarner Losh     free(ctx->offsets);
5380c16b537SWarner Losh     ctx->offsets = NULL;
5390c16b537SWarner Losh   }
5400c16b537SWarner Losh }
5410c16b537SWarner Losh 
5420c16b537SWarner Losh /**
5430c16b537SWarner Losh  * Prepare a context for dictionary building.
5440c16b537SWarner Losh  * The context is only dependent on the parameter `d` and can used multiple
5450c16b537SWarner Losh  * times.
5464d3f1eafSConrad Meyer  * Returns 0 on success or error code on error.
5470c16b537SWarner Losh  * The context must be destroyed with `COVER_ctx_destroy()`.
5480c16b537SWarner Losh  */
COVER_ctx_init(COVER_ctx_t * ctx,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,unsigned d,double splitPoint)5494d3f1eafSConrad Meyer static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
5500c16b537SWarner Losh                           const size_t *samplesSizes, unsigned nbSamples,
5510f743729SConrad Meyer                           unsigned d, double splitPoint) {
5520c16b537SWarner Losh   const BYTE *const samples = (const BYTE *)samplesBuffer;
5530c16b537SWarner Losh   const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
5540f743729SConrad Meyer   /* Split samples into testing and training sets */
5550f743729SConrad Meyer   const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
5560f743729SConrad Meyer   const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
5570f743729SConrad Meyer   const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
5580f743729SConrad Meyer   const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
5590c16b537SWarner Losh   /* Checks */
5600c16b537SWarner Losh   if (totalSamplesSize < MAX(d, sizeof(U64)) ||
5610c16b537SWarner Losh       totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
56219fcbaf1SConrad Meyer     DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
563a0483764SConrad Meyer                  (unsigned)(totalSamplesSize>>20), (COVER_MAX_SAMPLES_SIZE >> 20));
5644d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
5650c16b537SWarner Losh   }
5660f743729SConrad Meyer   /* Check if there are at least 5 training samples */
5670f743729SConrad Meyer   if (nbTrainSamples < 5) {
5680f743729SConrad Meyer     DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
5694d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
5700f743729SConrad Meyer   }
5710f743729SConrad Meyer   /* Check if there's testing sample */
5720f743729SConrad Meyer   if (nbTestSamples < 1) {
5730f743729SConrad Meyer     DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
5744d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
5750f743729SConrad Meyer   }
5760c16b537SWarner Losh   /* Zero the context */
5770c16b537SWarner Losh   memset(ctx, 0, sizeof(*ctx));
5780f743729SConrad Meyer   DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
579a0483764SConrad Meyer                (unsigned)trainingSamplesSize);
5800f743729SConrad Meyer   DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
581a0483764SConrad Meyer                (unsigned)testSamplesSize);
5820c16b537SWarner Losh   ctx->samples = samples;
5830c16b537SWarner Losh   ctx->samplesSizes = samplesSizes;
5840c16b537SWarner Losh   ctx->nbSamples = nbSamples;
5850f743729SConrad Meyer   ctx->nbTrainSamples = nbTrainSamples;
5860f743729SConrad Meyer   ctx->nbTestSamples = nbTestSamples;
5870c16b537SWarner Losh   /* Partial suffix array */
5880f743729SConrad Meyer   ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
5890c16b537SWarner Losh   ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
5900c16b537SWarner Losh   /* Maps index to the dmerID */
5910c16b537SWarner Losh   ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
5920c16b537SWarner Losh   /* The offsets of each file */
5930c16b537SWarner Losh   ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
5940c16b537SWarner Losh   if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
5950c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
5960c16b537SWarner Losh     COVER_ctx_destroy(ctx);
5974d3f1eafSConrad Meyer     return ERROR(memory_allocation);
5980c16b537SWarner Losh   }
5990c16b537SWarner Losh   ctx->freqs = NULL;
6000c16b537SWarner Losh   ctx->d = d;
6010c16b537SWarner Losh 
6020f743729SConrad Meyer   /* Fill offsets from the samplesSizes */
6030c16b537SWarner Losh   {
6040c16b537SWarner Losh     U32 i;
6050c16b537SWarner Losh     ctx->offsets[0] = 0;
6060c16b537SWarner Losh     for (i = 1; i <= nbSamples; ++i) {
6070c16b537SWarner Losh       ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
6080c16b537SWarner Losh     }
6090c16b537SWarner Losh   }
6100c16b537SWarner Losh   DISPLAYLEVEL(2, "Constructing partial suffix array\n");
6110c16b537SWarner Losh   {
6120c16b537SWarner Losh     /* suffix is a partial suffix array.
6130c16b537SWarner Losh      * It only sorts suffixes by their first parameters.d bytes.
6140c16b537SWarner Losh      * The sort is stable, so each dmer group is sorted by position in input.
6150c16b537SWarner Losh      */
6160c16b537SWarner Losh     U32 i;
6170c16b537SWarner Losh     for (i = 0; i < ctx->suffixSize; ++i) {
6180c16b537SWarner Losh       ctx->suffix[i] = i;
6190c16b537SWarner Losh     }
6200f743729SConrad Meyer     /* qsort doesn't take an opaque pointer, so pass as a global.
6210f743729SConrad Meyer      * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
6220f743729SConrad Meyer      */
623f7cd7fe5SConrad Meyer     g_coverCtx = ctx;
6240f743729SConrad Meyer #if defined(__OpenBSD__)
6250f743729SConrad Meyer     mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
6260f743729SConrad Meyer           (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
6270f743729SConrad Meyer #else
6280c16b537SWarner Losh     qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
6290c16b537SWarner Losh           (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
6300f743729SConrad Meyer #endif
6310c16b537SWarner Losh   }
6320c16b537SWarner Losh   DISPLAYLEVEL(2, "Computing frequencies\n");
6330c16b537SWarner Losh   /* For each dmer group (group of positions with the same first d bytes):
6340c16b537SWarner Losh    * 1. For each position we set dmerAt[position] = dmerID.  The dmerID is
6350c16b537SWarner Losh    *    (groupBeginPtr - suffix).  This allows us to go from position to
6360c16b537SWarner Losh    *    dmerID so we can look up values in freq.
6370c16b537SWarner Losh    * 2. We calculate how many samples the dmer occurs in and save it in
6380c16b537SWarner Losh    *    freqs[dmerId].
6390c16b537SWarner Losh    */
6400c16b537SWarner Losh   COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
6410c16b537SWarner Losh                 (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
6420c16b537SWarner Losh   ctx->freqs = ctx->suffix;
6430c16b537SWarner Losh   ctx->suffix = NULL;
6444d3f1eafSConrad Meyer   return 0;
6450c16b537SWarner Losh }
6460c16b537SWarner Losh 
COVER_warnOnSmallCorpus(size_t maxDictSize,size_t nbDmers,int displayLevel)6472b9c00cbSConrad Meyer void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
6482b9c00cbSConrad Meyer {
6492b9c00cbSConrad Meyer   const double ratio = (double)nbDmers / maxDictSize;
6502b9c00cbSConrad Meyer   if (ratio >= 10) {
6512b9c00cbSConrad Meyer       return;
6522b9c00cbSConrad Meyer   }
6532b9c00cbSConrad Meyer   LOCALDISPLAYLEVEL(displayLevel, 1,
6542b9c00cbSConrad Meyer                     "WARNING: The maximum dictionary size %u is too large "
6552b9c00cbSConrad Meyer                     "compared to the source size %u! "
6562b9c00cbSConrad Meyer                     "size(source)/size(dictionary) = %f, but it should be >= "
6572b9c00cbSConrad Meyer                     "10! This may lead to a subpar dictionary! We recommend "
6589cbefe25SConrad Meyer                     "training on sources at least 10x, and preferably 100x "
6599cbefe25SConrad Meyer                     "the size of the dictionary! \n", (U32)maxDictSize,
6602b9c00cbSConrad Meyer                     (U32)nbDmers, ratio);
6612b9c00cbSConrad Meyer }
6622b9c00cbSConrad Meyer 
COVER_computeEpochs(U32 maxDictSize,U32 nbDmers,U32 k,U32 passes)6632b9c00cbSConrad Meyer COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
6642b9c00cbSConrad Meyer                                        U32 nbDmers, U32 k, U32 passes)
6652b9c00cbSConrad Meyer {
6662b9c00cbSConrad Meyer   const U32 minEpochSize = k * 10;
6672b9c00cbSConrad Meyer   COVER_epoch_info_t epochs;
6682b9c00cbSConrad Meyer   epochs.num = MAX(1, maxDictSize / k / passes);
6692b9c00cbSConrad Meyer   epochs.size = nbDmers / epochs.num;
6702b9c00cbSConrad Meyer   if (epochs.size >= minEpochSize) {
6712b9c00cbSConrad Meyer       assert(epochs.size * epochs.num <= nbDmers);
6722b9c00cbSConrad Meyer       return epochs;
6732b9c00cbSConrad Meyer   }
6742b9c00cbSConrad Meyer   epochs.size = MIN(minEpochSize, nbDmers);
6752b9c00cbSConrad Meyer   epochs.num = nbDmers / epochs.size;
6762b9c00cbSConrad Meyer   assert(epochs.size * epochs.num <= nbDmers);
6772b9c00cbSConrad Meyer   return epochs;
6782b9c00cbSConrad Meyer }
6792b9c00cbSConrad Meyer 
6800c16b537SWarner Losh /**
6810c16b537SWarner Losh  * Given the prepared context build the dictionary.
6820c16b537SWarner Losh  */
COVER_buildDictionary(const COVER_ctx_t * ctx,U32 * freqs,COVER_map_t * activeDmers,void * dictBuffer,size_t dictBufferCapacity,ZDICT_cover_params_t parameters)6830c16b537SWarner Losh static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
6840c16b537SWarner Losh                                     COVER_map_t *activeDmers, void *dictBuffer,
6850c16b537SWarner Losh                                     size_t dictBufferCapacity,
6860c16b537SWarner Losh                                     ZDICT_cover_params_t parameters) {
6870c16b537SWarner Losh   BYTE *const dict = (BYTE *)dictBuffer;
6880c16b537SWarner Losh   size_t tail = dictBufferCapacity;
6892b9c00cbSConrad Meyer   /* Divide the data into epochs. We will select one segment from each epoch. */
6902b9c00cbSConrad Meyer   const COVER_epoch_info_t epochs = COVER_computeEpochs(
6912b9c00cbSConrad Meyer       (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
6922b9c00cbSConrad Meyer   const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
6932b9c00cbSConrad Meyer   size_t zeroScoreRun = 0;
6940c16b537SWarner Losh   size_t epoch;
695a0483764SConrad Meyer   DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
6962b9c00cbSConrad Meyer                 (U32)epochs.num, (U32)epochs.size);
6970c16b537SWarner Losh   /* Loop through the epochs until there are no more segments or the dictionary
6980c16b537SWarner Losh    * is full.
6990c16b537SWarner Losh    */
7002b9c00cbSConrad Meyer   for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
7012b9c00cbSConrad Meyer     const U32 epochBegin = (U32)(epoch * epochs.size);
7022b9c00cbSConrad Meyer     const U32 epochEnd = epochBegin + epochs.size;
7030c16b537SWarner Losh     size_t segmentSize;
7040c16b537SWarner Losh     /* Select a segment */
7050c16b537SWarner Losh     COVER_segment_t segment = COVER_selectSegment(
7060c16b537SWarner Losh         ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
7072b9c00cbSConrad Meyer     /* If the segment covers no dmers, then we are out of content.
7082b9c00cbSConrad Meyer      * There may be new content in other epochs, for continue for some time.
7092b9c00cbSConrad Meyer      */
7100c16b537SWarner Losh     if (segment.score == 0) {
7112b9c00cbSConrad Meyer       if (++zeroScoreRun >= maxZeroScoreRun) {
7120c16b537SWarner Losh           break;
7130c16b537SWarner Losh       }
7142b9c00cbSConrad Meyer       continue;
7152b9c00cbSConrad Meyer     }
7162b9c00cbSConrad Meyer     zeroScoreRun = 0;
7170c16b537SWarner Losh     /* Trim the segment if necessary and if it is too small then we are done */
7180c16b537SWarner Losh     segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
7190c16b537SWarner Losh     if (segmentSize < parameters.d) {
7200c16b537SWarner Losh       break;
7210c16b537SWarner Losh     }
7220c16b537SWarner Losh     /* We fill the dictionary from the back to allow the best segments to be
7230c16b537SWarner Losh      * referenced with the smallest offsets.
7240c16b537SWarner Losh      */
7250c16b537SWarner Losh     tail -= segmentSize;
7260c16b537SWarner Losh     memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
7270c16b537SWarner Losh     DISPLAYUPDATE(
7280c16b537SWarner Losh         2, "\r%u%%       ",
729a0483764SConrad Meyer         (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
7300c16b537SWarner Losh   }
7310c16b537SWarner Losh   DISPLAYLEVEL(2, "\r%79s\r", "");
7320c16b537SWarner Losh   return tail;
7330c16b537SWarner Losh }
7340c16b537SWarner Losh 
ZDICT_trainFromBuffer_cover(void * dictBuffer,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_cover_params_t parameters)7350c16b537SWarner Losh ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
73619fcbaf1SConrad Meyer     void *dictBuffer, size_t dictBufferCapacity,
73719fcbaf1SConrad Meyer     const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
73819fcbaf1SConrad Meyer     ZDICT_cover_params_t parameters)
73919fcbaf1SConrad Meyer {
7400c16b537SWarner Losh   BYTE* const dict = (BYTE*)dictBuffer;
7410c16b537SWarner Losh   COVER_ctx_t ctx;
7420c16b537SWarner Losh   COVER_map_t activeDmers;
7430f743729SConrad Meyer   parameters.splitPoint = 1.0;
74419fcbaf1SConrad Meyer   /* Initialize global data */
7455ff13fbcSAllan Jude   g_displayLevel = (int)parameters.zParams.notificationLevel;
7460c16b537SWarner Losh   /* Checks */
7470c16b537SWarner Losh   if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
7480c16b537SWarner Losh     DISPLAYLEVEL(1, "Cover parameters incorrect\n");
7494d3f1eafSConrad Meyer     return ERROR(parameter_outOfBound);
7500c16b537SWarner Losh   }
7510c16b537SWarner Losh   if (nbSamples == 0) {
7520c16b537SWarner Losh     DISPLAYLEVEL(1, "Cover must have at least one input file\n");
7534d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
7540c16b537SWarner Losh   }
7550c16b537SWarner Losh   if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
7560c16b537SWarner Losh     DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
7570c16b537SWarner Losh                  ZDICT_DICTSIZE_MIN);
7580c16b537SWarner Losh     return ERROR(dstSize_tooSmall);
7590c16b537SWarner Losh   }
7600c16b537SWarner Losh   /* Initialize context and activeDmers */
7614d3f1eafSConrad Meyer   {
7624d3f1eafSConrad Meyer     size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
7634d3f1eafSConrad Meyer                       parameters.d, parameters.splitPoint);
7644d3f1eafSConrad Meyer     if (ZSTD_isError(initVal)) {
7654d3f1eafSConrad Meyer       return initVal;
7664d3f1eafSConrad Meyer     }
7670c16b537SWarner Losh   }
7682b9c00cbSConrad Meyer   COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
7690c16b537SWarner Losh   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
7700c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
7710c16b537SWarner Losh     COVER_ctx_destroy(&ctx);
7724d3f1eafSConrad Meyer     return ERROR(memory_allocation);
7730c16b537SWarner Losh   }
7740c16b537SWarner Losh 
7750c16b537SWarner Losh   DISPLAYLEVEL(2, "Building dictionary\n");
7760c16b537SWarner Losh   {
7770c16b537SWarner Losh     const size_t tail =
7780c16b537SWarner Losh         COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
7790c16b537SWarner Losh                               dictBufferCapacity, parameters);
7800c16b537SWarner Losh     const size_t dictionarySize = ZDICT_finalizeDictionary(
7810c16b537SWarner Losh         dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
7820c16b537SWarner Losh         samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
7830c16b537SWarner Losh     if (!ZSTD_isError(dictionarySize)) {
7840c16b537SWarner Losh       DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
785a0483764SConrad Meyer                    (unsigned)dictionarySize);
7860c16b537SWarner Losh     }
7870c16b537SWarner Losh     COVER_ctx_destroy(&ctx);
7880c16b537SWarner Losh     COVER_map_destroy(&activeDmers);
7890c16b537SWarner Losh     return dictionarySize;
7900c16b537SWarner Losh   }
7910c16b537SWarner Losh }
7920c16b537SWarner Losh 
7930f743729SConrad Meyer 
7940f743729SConrad Meyer 
COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,const size_t * samplesSizes,const BYTE * samples,size_t * offsets,size_t nbTrainSamples,size_t nbSamples,BYTE * const dict,size_t dictBufferCapacity)7950f743729SConrad Meyer size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
7960f743729SConrad Meyer                                     const size_t *samplesSizes, const BYTE *samples,
7970f743729SConrad Meyer                                     size_t *offsets,
7980f743729SConrad Meyer                                     size_t nbTrainSamples, size_t nbSamples,
7990f743729SConrad Meyer                                     BYTE *const dict, size_t dictBufferCapacity) {
8000f743729SConrad Meyer   size_t totalCompressedSize = ERROR(GENERIC);
8010f743729SConrad Meyer   /* Pointers */
8020f743729SConrad Meyer   ZSTD_CCtx *cctx;
8030f743729SConrad Meyer   ZSTD_CDict *cdict;
8040f743729SConrad Meyer   void *dst;
8050f743729SConrad Meyer   /* Local variables */
8060f743729SConrad Meyer   size_t dstCapacity;
8070f743729SConrad Meyer   size_t i;
8080f743729SConrad Meyer   /* Allocate dst with enough space to compress the maximum sized sample */
8090f743729SConrad Meyer   {
8100f743729SConrad Meyer     size_t maxSampleSize = 0;
8110f743729SConrad Meyer     i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
8120f743729SConrad Meyer     for (; i < nbSamples; ++i) {
8130f743729SConrad Meyer       maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
8140f743729SConrad Meyer     }
8150f743729SConrad Meyer     dstCapacity = ZSTD_compressBound(maxSampleSize);
8160f743729SConrad Meyer     dst = malloc(dstCapacity);
8170f743729SConrad Meyer   }
8180f743729SConrad Meyer   /* Create the cctx and cdict */
8190f743729SConrad Meyer   cctx = ZSTD_createCCtx();
8200f743729SConrad Meyer   cdict = ZSTD_createCDict(dict, dictBufferCapacity,
8210f743729SConrad Meyer                            parameters.zParams.compressionLevel);
8220f743729SConrad Meyer   if (!dst || !cctx || !cdict) {
8230f743729SConrad Meyer     goto _compressCleanup;
8240f743729SConrad Meyer   }
8250f743729SConrad Meyer   /* Compress each sample and sum their sizes (or error) */
8260f743729SConrad Meyer   totalCompressedSize = dictBufferCapacity;
8270f743729SConrad Meyer   i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
8280f743729SConrad Meyer   for (; i < nbSamples; ++i) {
8290f743729SConrad Meyer     const size_t size = ZSTD_compress_usingCDict(
8300f743729SConrad Meyer         cctx, dst, dstCapacity, samples + offsets[i],
8310f743729SConrad Meyer         samplesSizes[i], cdict);
8320f743729SConrad Meyer     if (ZSTD_isError(size)) {
8334d3f1eafSConrad Meyer       totalCompressedSize = size;
8340f743729SConrad Meyer       goto _compressCleanup;
8350f743729SConrad Meyer     }
8360f743729SConrad Meyer     totalCompressedSize += size;
8370f743729SConrad Meyer   }
8380f743729SConrad Meyer _compressCleanup:
8390f743729SConrad Meyer   ZSTD_freeCCtx(cctx);
8400f743729SConrad Meyer   ZSTD_freeCDict(cdict);
8410f743729SConrad Meyer   if (dst) {
8420f743729SConrad Meyer     free(dst);
8430f743729SConrad Meyer   }
8440f743729SConrad Meyer   return totalCompressedSize;
8450f743729SConrad Meyer }
8460f743729SConrad Meyer 
8470c16b537SWarner Losh 
8480c16b537SWarner Losh /**
8490c16b537SWarner Losh  * Initialize the `COVER_best_t`.
8500c16b537SWarner Losh  */
COVER_best_init(COVER_best_t * best)8510f743729SConrad Meyer void COVER_best_init(COVER_best_t *best) {
8520c16b537SWarner Losh   if (best==NULL) return; /* compatible with init on NULL */
8530c16b537SWarner Losh   (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
8540c16b537SWarner Losh   (void)ZSTD_pthread_cond_init(&best->cond, NULL);
8550c16b537SWarner Losh   best->liveJobs = 0;
8560c16b537SWarner Losh   best->dict = NULL;
8570c16b537SWarner Losh   best->dictSize = 0;
8580c16b537SWarner Losh   best->compressedSize = (size_t)-1;
8590c16b537SWarner Losh   memset(&best->parameters, 0, sizeof(best->parameters));
8600c16b537SWarner Losh }
8610c16b537SWarner Losh 
8620c16b537SWarner Losh /**
8630c16b537SWarner Losh  * Wait until liveJobs == 0.
8640c16b537SWarner Losh  */
COVER_best_wait(COVER_best_t * best)8650f743729SConrad Meyer void COVER_best_wait(COVER_best_t *best) {
8660c16b537SWarner Losh   if (!best) {
8670c16b537SWarner Losh     return;
8680c16b537SWarner Losh   }
8690c16b537SWarner Losh   ZSTD_pthread_mutex_lock(&best->mutex);
8700c16b537SWarner Losh   while (best->liveJobs != 0) {
8710c16b537SWarner Losh     ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
8720c16b537SWarner Losh   }
8730c16b537SWarner Losh   ZSTD_pthread_mutex_unlock(&best->mutex);
8740c16b537SWarner Losh }
8750c16b537SWarner Losh 
8760c16b537SWarner Losh /**
8770c16b537SWarner Losh  * Call COVER_best_wait() and then destroy the COVER_best_t.
8780c16b537SWarner Losh  */
COVER_best_destroy(COVER_best_t * best)8790f743729SConrad Meyer void COVER_best_destroy(COVER_best_t *best) {
8800c16b537SWarner Losh   if (!best) {
8810c16b537SWarner Losh     return;
8820c16b537SWarner Losh   }
8830c16b537SWarner Losh   COVER_best_wait(best);
8840c16b537SWarner Losh   if (best->dict) {
8850c16b537SWarner Losh     free(best->dict);
8860c16b537SWarner Losh   }
8870c16b537SWarner Losh   ZSTD_pthread_mutex_destroy(&best->mutex);
8880c16b537SWarner Losh   ZSTD_pthread_cond_destroy(&best->cond);
8890c16b537SWarner Losh }
8900c16b537SWarner Losh 
8910c16b537SWarner Losh /**
8920c16b537SWarner Losh  * Called when a thread is about to be launched.
8930c16b537SWarner Losh  * Increments liveJobs.
8940c16b537SWarner Losh  */
COVER_best_start(COVER_best_t * best)8950f743729SConrad Meyer void COVER_best_start(COVER_best_t *best) {
8960c16b537SWarner Losh   if (!best) {
8970c16b537SWarner Losh     return;
8980c16b537SWarner Losh   }
8990c16b537SWarner Losh   ZSTD_pthread_mutex_lock(&best->mutex);
9000c16b537SWarner Losh   ++best->liveJobs;
9010c16b537SWarner Losh   ZSTD_pthread_mutex_unlock(&best->mutex);
9020c16b537SWarner Losh }
9030c16b537SWarner Losh 
9040c16b537SWarner Losh /**
9050c16b537SWarner Losh  * Called when a thread finishes executing, both on error or success.
9060c16b537SWarner Losh  * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
9070c16b537SWarner Losh  * If this dictionary is the best so far save it and its parameters.
9080c16b537SWarner Losh  */
COVER_best_finish(COVER_best_t * best,ZDICT_cover_params_t parameters,COVER_dictSelection_t selection)9094d3f1eafSConrad Meyer void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
9104d3f1eafSConrad Meyer                               COVER_dictSelection_t selection) {
9114d3f1eafSConrad Meyer   void* dict = selection.dictContent;
9124d3f1eafSConrad Meyer   size_t compressedSize = selection.totalCompressedSize;
9134d3f1eafSConrad Meyer   size_t dictSize = selection.dictSize;
9140c16b537SWarner Losh   if (!best) {
9150c16b537SWarner Losh     return;
9160c16b537SWarner Losh   }
9170c16b537SWarner Losh   {
9180c16b537SWarner Losh     size_t liveJobs;
9190c16b537SWarner Losh     ZSTD_pthread_mutex_lock(&best->mutex);
9200c16b537SWarner Losh     --best->liveJobs;
9210c16b537SWarner Losh     liveJobs = best->liveJobs;
9220c16b537SWarner Losh     /* If the new dictionary is better */
9230c16b537SWarner Losh     if (compressedSize < best->compressedSize) {
9240c16b537SWarner Losh       /* Allocate space if necessary */
9250c16b537SWarner Losh       if (!best->dict || best->dictSize < dictSize) {
9260c16b537SWarner Losh         if (best->dict) {
9270c16b537SWarner Losh           free(best->dict);
9280c16b537SWarner Losh         }
9290c16b537SWarner Losh         best->dict = malloc(dictSize);
9300c16b537SWarner Losh         if (!best->dict) {
9310c16b537SWarner Losh           best->compressedSize = ERROR(GENERIC);
9320c16b537SWarner Losh           best->dictSize = 0;
933a0483764SConrad Meyer           ZSTD_pthread_cond_signal(&best->cond);
934a0483764SConrad Meyer           ZSTD_pthread_mutex_unlock(&best->mutex);
9350c16b537SWarner Losh           return;
9360c16b537SWarner Losh         }
9370c16b537SWarner Losh       }
9380c16b537SWarner Losh       /* Save the dictionary, parameters, and size */
9399cbefe25SConrad Meyer       if (dict) {
9400c16b537SWarner Losh         memcpy(best->dict, dict, dictSize);
9410c16b537SWarner Losh         best->dictSize = dictSize;
9420c16b537SWarner Losh         best->parameters = parameters;
9430c16b537SWarner Losh         best->compressedSize = compressedSize;
9440c16b537SWarner Losh       }
9459cbefe25SConrad Meyer     }
9460c16b537SWarner Losh     if (liveJobs == 0) {
9470c16b537SWarner Losh       ZSTD_pthread_cond_broadcast(&best->cond);
9480c16b537SWarner Losh     }
9490f743729SConrad Meyer     ZSTD_pthread_mutex_unlock(&best->mutex);
9500c16b537SWarner Losh   }
9510c16b537SWarner Losh }
9520c16b537SWarner Losh 
COVER_dictSelectionError(size_t error)9534d3f1eafSConrad Meyer COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
9544d3f1eafSConrad Meyer     COVER_dictSelection_t selection = { NULL, 0, error };
9554d3f1eafSConrad Meyer     return selection;
9564d3f1eafSConrad Meyer }
9574d3f1eafSConrad Meyer 
COVER_dictSelectionIsError(COVER_dictSelection_t selection)9584d3f1eafSConrad Meyer unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
9594d3f1eafSConrad Meyer   return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
9604d3f1eafSConrad Meyer }
9614d3f1eafSConrad Meyer 
COVER_dictSelectionFree(COVER_dictSelection_t selection)9624d3f1eafSConrad Meyer void COVER_dictSelectionFree(COVER_dictSelection_t selection){
9634d3f1eafSConrad Meyer   free(selection.dictContent);
9644d3f1eafSConrad Meyer }
9654d3f1eafSConrad Meyer 
COVER_selectDict(BYTE * customDictContent,size_t dictBufferCapacity,size_t dictContentSize,const BYTE * samplesBuffer,const size_t * samplesSizes,unsigned nbFinalizeSamples,size_t nbCheckSamples,size_t nbSamples,ZDICT_cover_params_t params,size_t * offsets,size_t totalCompressedSize)966f7cd7fe5SConrad Meyer COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
9674d3f1eafSConrad Meyer         size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
9684d3f1eafSConrad Meyer         size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
9694d3f1eafSConrad Meyer 
9704d3f1eafSConrad Meyer   size_t largestDict = 0;
9714d3f1eafSConrad Meyer   size_t largestCompressed = 0;
9724d3f1eafSConrad Meyer   BYTE* customDictContentEnd = customDictContent + dictContentSize;
9734d3f1eafSConrad Meyer 
974f7cd7fe5SConrad Meyer   BYTE * largestDictbuffer = (BYTE *)malloc(dictBufferCapacity);
975f7cd7fe5SConrad Meyer   BYTE * candidateDictBuffer = (BYTE *)malloc(dictBufferCapacity);
9764d3f1eafSConrad Meyer   double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
9774d3f1eafSConrad Meyer 
9784d3f1eafSConrad Meyer   if (!largestDictbuffer || !candidateDictBuffer) {
9794d3f1eafSConrad Meyer     free(largestDictbuffer);
9804d3f1eafSConrad Meyer     free(candidateDictBuffer);
9814d3f1eafSConrad Meyer     return COVER_dictSelectionError(dictContentSize);
9824d3f1eafSConrad Meyer   }
9834d3f1eafSConrad Meyer 
9844d3f1eafSConrad Meyer   /* Initial dictionary size and compressed size */
9854d3f1eafSConrad Meyer   memcpy(largestDictbuffer, customDictContent, dictContentSize);
9864d3f1eafSConrad Meyer   dictContentSize = ZDICT_finalizeDictionary(
987f7cd7fe5SConrad Meyer     largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,
9884d3f1eafSConrad Meyer     samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
9894d3f1eafSConrad Meyer 
9904d3f1eafSConrad Meyer   if (ZDICT_isError(dictContentSize)) {
9914d3f1eafSConrad Meyer     free(largestDictbuffer);
9924d3f1eafSConrad Meyer     free(candidateDictBuffer);
9934d3f1eafSConrad Meyer     return COVER_dictSelectionError(dictContentSize);
9944d3f1eafSConrad Meyer   }
9954d3f1eafSConrad Meyer 
9964d3f1eafSConrad Meyer   totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
9974d3f1eafSConrad Meyer                                                        samplesBuffer, offsets,
9984d3f1eafSConrad Meyer                                                        nbCheckSamples, nbSamples,
9994d3f1eafSConrad Meyer                                                        largestDictbuffer, dictContentSize);
10004d3f1eafSConrad Meyer 
10014d3f1eafSConrad Meyer   if (ZSTD_isError(totalCompressedSize)) {
10024d3f1eafSConrad Meyer     free(largestDictbuffer);
10034d3f1eafSConrad Meyer     free(candidateDictBuffer);
10044d3f1eafSConrad Meyer     return COVER_dictSelectionError(totalCompressedSize);
10054d3f1eafSConrad Meyer   }
10064d3f1eafSConrad Meyer 
10074d3f1eafSConrad Meyer   if (params.shrinkDict == 0) {
10084d3f1eafSConrad Meyer     COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
10094d3f1eafSConrad Meyer     free(candidateDictBuffer);
10104d3f1eafSConrad Meyer     return selection;
10114d3f1eafSConrad Meyer   }
10124d3f1eafSConrad Meyer 
10134d3f1eafSConrad Meyer   largestDict = dictContentSize;
10144d3f1eafSConrad Meyer   largestCompressed = totalCompressedSize;
10154d3f1eafSConrad Meyer   dictContentSize = ZDICT_DICTSIZE_MIN;
10164d3f1eafSConrad Meyer 
10174d3f1eafSConrad Meyer   /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
10184d3f1eafSConrad Meyer   while (dictContentSize < largestDict) {
10194d3f1eafSConrad Meyer     memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
10204d3f1eafSConrad Meyer     dictContentSize = ZDICT_finalizeDictionary(
1021f7cd7fe5SConrad Meyer       candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,
10224d3f1eafSConrad Meyer       samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
10234d3f1eafSConrad Meyer 
10244d3f1eafSConrad Meyer     if (ZDICT_isError(dictContentSize)) {
10254d3f1eafSConrad Meyer       free(largestDictbuffer);
10264d3f1eafSConrad Meyer       free(candidateDictBuffer);
10274d3f1eafSConrad Meyer       return COVER_dictSelectionError(dictContentSize);
10284d3f1eafSConrad Meyer 
10294d3f1eafSConrad Meyer     }
10304d3f1eafSConrad Meyer 
10314d3f1eafSConrad Meyer     totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
10324d3f1eafSConrad Meyer                                                          samplesBuffer, offsets,
10334d3f1eafSConrad Meyer                                                          nbCheckSamples, nbSamples,
10344d3f1eafSConrad Meyer                                                          candidateDictBuffer, dictContentSize);
10354d3f1eafSConrad Meyer 
10364d3f1eafSConrad Meyer     if (ZSTD_isError(totalCompressedSize)) {
10374d3f1eafSConrad Meyer       free(largestDictbuffer);
10384d3f1eafSConrad Meyer       free(candidateDictBuffer);
10394d3f1eafSConrad Meyer       return COVER_dictSelectionError(totalCompressedSize);
10404d3f1eafSConrad Meyer     }
10414d3f1eafSConrad Meyer 
10424d3f1eafSConrad Meyer     if (totalCompressedSize <= largestCompressed * regressionTolerance) {
10434d3f1eafSConrad Meyer       COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
10444d3f1eafSConrad Meyer       free(largestDictbuffer);
10454d3f1eafSConrad Meyer       return selection;
10464d3f1eafSConrad Meyer     }
10474d3f1eafSConrad Meyer     dictContentSize *= 2;
10484d3f1eafSConrad Meyer   }
10494d3f1eafSConrad Meyer   dictContentSize = largestDict;
10504d3f1eafSConrad Meyer   totalCompressedSize = largestCompressed;
10514d3f1eafSConrad Meyer   {
10524d3f1eafSConrad Meyer     COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
10534d3f1eafSConrad Meyer     free(candidateDictBuffer);
10544d3f1eafSConrad Meyer     return selection;
10554d3f1eafSConrad Meyer   }
10564d3f1eafSConrad Meyer }
10574d3f1eafSConrad Meyer 
10580c16b537SWarner Losh /**
10590c16b537SWarner Losh  * Parameters for COVER_tryParameters().
10600c16b537SWarner Losh  */
10610c16b537SWarner Losh typedef struct COVER_tryParameters_data_s {
10620c16b537SWarner Losh   const COVER_ctx_t *ctx;
10630c16b537SWarner Losh   COVER_best_t *best;
10640c16b537SWarner Losh   size_t dictBufferCapacity;
10650c16b537SWarner Losh   ZDICT_cover_params_t parameters;
10660c16b537SWarner Losh } COVER_tryParameters_data_t;
10670c16b537SWarner Losh 
10680c16b537SWarner Losh /**
10690f743729SConrad Meyer  * Tries a set of parameters and updates the COVER_best_t with the results.
10700c16b537SWarner Losh  * This function is thread safe if zstd is compiled with multithreaded support.
10710c16b537SWarner Losh  * It takes its parameters as an *OWNING* opaque pointer to support threading.
10720c16b537SWarner Losh  */
COVER_tryParameters(void * opaque)10735ff13fbcSAllan Jude static void COVER_tryParameters(void *opaque)
10745ff13fbcSAllan Jude {
10750c16b537SWarner Losh   /* Save parameters as local variables */
10760c16b537SWarner Losh   COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque;
10770c16b537SWarner Losh   const COVER_ctx_t *const ctx = data->ctx;
10780c16b537SWarner Losh   const ZDICT_cover_params_t parameters = data->parameters;
10790c16b537SWarner Losh   size_t dictBufferCapacity = data->dictBufferCapacity;
10800c16b537SWarner Losh   size_t totalCompressedSize = ERROR(GENERIC);
10810c16b537SWarner Losh   /* Allocate space for hash table, dict, and freqs */
10820c16b537SWarner Losh   COVER_map_t activeDmers;
10835ff13fbcSAllan Jude   BYTE* const dict = (BYTE*)malloc(dictBufferCapacity);
10844d3f1eafSConrad Meyer   COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
10855ff13fbcSAllan Jude   U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32));
10860c16b537SWarner Losh   if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
10870c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
10880c16b537SWarner Losh     goto _cleanup;
10890c16b537SWarner Losh   }
10900c16b537SWarner Losh   if (!dict || !freqs) {
10910c16b537SWarner Losh     DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
10920c16b537SWarner Losh     goto _cleanup;
10930c16b537SWarner Losh   }
10940c16b537SWarner Losh   /* Copy the frequencies because we need to modify them */
10950c16b537SWarner Losh   memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
10960c16b537SWarner Losh   /* Build the dictionary */
10970c16b537SWarner Losh   {
10980c16b537SWarner Losh     const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
10990c16b537SWarner Losh                                               dictBufferCapacity, parameters);
1100f7cd7fe5SConrad Meyer     selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
11014d3f1eafSConrad Meyer         ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
11024d3f1eafSConrad Meyer         totalCompressedSize);
11034d3f1eafSConrad Meyer 
11044d3f1eafSConrad Meyer     if (COVER_dictSelectionIsError(selection)) {
11054d3f1eafSConrad Meyer       DISPLAYLEVEL(1, "Failed to select dictionary\n");
11060c16b537SWarner Losh       goto _cleanup;
11070c16b537SWarner Losh     }
11080c16b537SWarner Losh   }
11090c16b537SWarner Losh _cleanup:
11104d3f1eafSConrad Meyer   free(dict);
11114d3f1eafSConrad Meyer   COVER_best_finish(data->best, parameters, selection);
11120c16b537SWarner Losh   free(data);
11130c16b537SWarner Losh   COVER_map_destroy(&activeDmers);
11144d3f1eafSConrad Meyer   COVER_dictSelectionFree(selection);
11150c16b537SWarner Losh   free(freqs);
11160c16b537SWarner Losh }
11170c16b537SWarner Losh 
ZDICT_optimizeTrainFromBuffer_cover(void * dictBuffer,size_t dictBufferCapacity,const void * samplesBuffer,const size_t * samplesSizes,unsigned nbSamples,ZDICT_cover_params_t * parameters)11180c16b537SWarner Losh ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
11190c16b537SWarner Losh     void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer,
11200c16b537SWarner Losh     const size_t* samplesSizes, unsigned nbSamples,
11215ff13fbcSAllan Jude     ZDICT_cover_params_t* parameters)
11225ff13fbcSAllan Jude {
11230c16b537SWarner Losh   /* constants */
11240c16b537SWarner Losh   const unsigned nbThreads = parameters->nbThreads;
11250f743729SConrad Meyer   const double splitPoint =
1126f7cd7fe5SConrad Meyer       parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
11270c16b537SWarner Losh   const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
11280c16b537SWarner Losh   const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
11290c16b537SWarner Losh   const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
11300c16b537SWarner Losh   const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
11310c16b537SWarner Losh   const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
11320c16b537SWarner Losh   const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
11330c16b537SWarner Losh   const unsigned kIterations =
11340c16b537SWarner Losh       (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
11354d3f1eafSConrad Meyer   const unsigned shrinkDict = 0;
11360c16b537SWarner Losh   /* Local variables */
11370c16b537SWarner Losh   const int displayLevel = parameters->zParams.notificationLevel;
11380c16b537SWarner Losh   unsigned iteration = 1;
11390c16b537SWarner Losh   unsigned d;
11400c16b537SWarner Losh   unsigned k;
11410c16b537SWarner Losh   COVER_best_t best;
11420c16b537SWarner Losh   POOL_ctx *pool = NULL;
11432b9c00cbSConrad Meyer   int warned = 0;
114419fcbaf1SConrad Meyer 
11450c16b537SWarner Losh   /* Checks */
11460f743729SConrad Meyer   if (splitPoint <= 0 || splitPoint > 1) {
11470f743729SConrad Meyer     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
11484d3f1eafSConrad Meyer     return ERROR(parameter_outOfBound);
11490f743729SConrad Meyer   }
11500c16b537SWarner Losh   if (kMinK < kMaxD || kMaxK < kMinK) {
11510c16b537SWarner Losh     LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
11524d3f1eafSConrad Meyer     return ERROR(parameter_outOfBound);
11530c16b537SWarner Losh   }
11540c16b537SWarner Losh   if (nbSamples == 0) {
11550c16b537SWarner Losh     DISPLAYLEVEL(1, "Cover must have at least one input file\n");
11564d3f1eafSConrad Meyer     return ERROR(srcSize_wrong);
11570c16b537SWarner Losh   }
11580c16b537SWarner Losh   if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
11590c16b537SWarner Losh     DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
11600c16b537SWarner Losh                  ZDICT_DICTSIZE_MIN);
11610c16b537SWarner Losh     return ERROR(dstSize_tooSmall);
11620c16b537SWarner Losh   }
11630c16b537SWarner Losh   if (nbThreads > 1) {
11640c16b537SWarner Losh     pool = POOL_create(nbThreads, 1);
11650c16b537SWarner Losh     if (!pool) {
11660c16b537SWarner Losh       return ERROR(memory_allocation);
11670c16b537SWarner Losh     }
11680c16b537SWarner Losh   }
11690c16b537SWarner Losh   /* Initialization */
11700c16b537SWarner Losh   COVER_best_init(&best);
11710c16b537SWarner Losh   /* Turn down global display level to clean up display at level 2 and below */
11720c16b537SWarner Losh   g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
11730c16b537SWarner Losh   /* Loop through d first because each new value needs a new context */
11740c16b537SWarner Losh   LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
11750c16b537SWarner Losh                     kIterations);
11760c16b537SWarner Losh   for (d = kMinD; d <= kMaxD; d += 2) {
11770c16b537SWarner Losh     /* Initialize the context for this value of d */
11780c16b537SWarner Losh     COVER_ctx_t ctx;
11790c16b537SWarner Losh     LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
11804d3f1eafSConrad Meyer     {
11814d3f1eafSConrad Meyer       const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
11824d3f1eafSConrad Meyer       if (ZSTD_isError(initVal)) {
11830c16b537SWarner Losh         LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
11840c16b537SWarner Losh         COVER_best_destroy(&best);
11850c16b537SWarner Losh         POOL_free(pool);
11864d3f1eafSConrad Meyer         return initVal;
11874d3f1eafSConrad Meyer       }
11880c16b537SWarner Losh     }
11892b9c00cbSConrad Meyer     if (!warned) {
11902b9c00cbSConrad Meyer       COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
11912b9c00cbSConrad Meyer       warned = 1;
11922b9c00cbSConrad Meyer     }
11930c16b537SWarner Losh     /* Loop through k reusing the same context */
11940c16b537SWarner Losh     for (k = kMinK; k <= kMaxK; k += kStepSize) {
11950c16b537SWarner Losh       /* Prepare the arguments */
11960c16b537SWarner Losh       COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
11970c16b537SWarner Losh           sizeof(COVER_tryParameters_data_t));
11980c16b537SWarner Losh       LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
11990c16b537SWarner Losh       if (!data) {
12000c16b537SWarner Losh         LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
12010c16b537SWarner Losh         COVER_best_destroy(&best);
12020c16b537SWarner Losh         COVER_ctx_destroy(&ctx);
12030c16b537SWarner Losh         POOL_free(pool);
12044d3f1eafSConrad Meyer         return ERROR(memory_allocation);
12050c16b537SWarner Losh       }
12060c16b537SWarner Losh       data->ctx = &ctx;
12070c16b537SWarner Losh       data->best = &best;
12080c16b537SWarner Losh       data->dictBufferCapacity = dictBufferCapacity;
12090c16b537SWarner Losh       data->parameters = *parameters;
12100c16b537SWarner Losh       data->parameters.k = k;
12110c16b537SWarner Losh       data->parameters.d = d;
12120f743729SConrad Meyer       data->parameters.splitPoint = splitPoint;
12130c16b537SWarner Losh       data->parameters.steps = kSteps;
12144d3f1eafSConrad Meyer       data->parameters.shrinkDict = shrinkDict;
12150c16b537SWarner Losh       data->parameters.zParams.notificationLevel = g_displayLevel;
12160c16b537SWarner Losh       /* Check the parameters */
12170c16b537SWarner Losh       if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
12180c16b537SWarner Losh         DISPLAYLEVEL(1, "Cover parameters incorrect\n");
12190c16b537SWarner Losh         free(data);
12200c16b537SWarner Losh         continue;
12210c16b537SWarner Losh       }
12220c16b537SWarner Losh       /* Call the function and pass ownership of data to it */
12230c16b537SWarner Losh       COVER_best_start(&best);
12240c16b537SWarner Losh       if (pool) {
12250c16b537SWarner Losh         POOL_add(pool, &COVER_tryParameters, data);
12260c16b537SWarner Losh       } else {
12270c16b537SWarner Losh         COVER_tryParameters(data);
12280c16b537SWarner Losh       }
12290c16b537SWarner Losh       /* Print status */
12300c16b537SWarner Losh       LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%%       ",
1231a0483764SConrad Meyer                          (unsigned)((iteration * 100) / kIterations));
12320c16b537SWarner Losh       ++iteration;
12330c16b537SWarner Losh     }
12340c16b537SWarner Losh     COVER_best_wait(&best);
12350c16b537SWarner Losh     COVER_ctx_destroy(&ctx);
12360c16b537SWarner Losh   }
12370c16b537SWarner Losh   LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
12380c16b537SWarner Losh   /* Fill the output buffer and parameters with output of the best parameters */
12390c16b537SWarner Losh   {
12400c16b537SWarner Losh     const size_t dictSize = best.dictSize;
12410c16b537SWarner Losh     if (ZSTD_isError(best.compressedSize)) {
12420c16b537SWarner Losh       const size_t compressedSize = best.compressedSize;
12430c16b537SWarner Losh       COVER_best_destroy(&best);
12440c16b537SWarner Losh       POOL_free(pool);
12450c16b537SWarner Losh       return compressedSize;
12460c16b537SWarner Losh     }
12470c16b537SWarner Losh     *parameters = best.parameters;
12480c16b537SWarner Losh     memcpy(dictBuffer, best.dict, dictSize);
12490c16b537SWarner Losh     COVER_best_destroy(&best);
12500c16b537SWarner Losh     POOL_free(pool);
12510c16b537SWarner Losh     return dictSize;
12520c16b537SWarner Losh   }
12530c16b537SWarner Losh }
1254