xref: /freebsd/sys/contrib/zstd/programs/dibio.c (revision 5ff13fbc)
10c16b537SWarner Losh /*
25ff13fbcSAllan Jude  * Copyright (c) Yann Collet, Facebook, Inc.
30c16b537SWarner Losh  * All rights reserved.
40c16b537SWarner Losh  *
50c16b537SWarner Losh  * This source code is licensed under both the BSD-style license (found in the
60c16b537SWarner Losh  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
70c16b537SWarner Losh  * in the COPYING file in the root directory of this source tree).
80c16b537SWarner Losh  * You may select, at your option, one of the above-listed licenses.
90c16b537SWarner Losh  */
100c16b537SWarner Losh 
110c16b537SWarner Losh 
120c16b537SWarner Losh 
130c16b537SWarner Losh /* **************************************
140c16b537SWarner Losh *  Compiler Warnings
150c16b537SWarner Losh ****************************************/
160c16b537SWarner Losh #ifdef _MSC_VER
170c16b537SWarner Losh #  pragma warning(disable : 4127)    /* disable: C4127: conditional expression is constant */
180c16b537SWarner Losh #endif
190c16b537SWarner Losh 
200c16b537SWarner Losh 
210c16b537SWarner Losh /*-*************************************
220c16b537SWarner Losh *  Includes
230c16b537SWarner Losh ***************************************/
240c16b537SWarner Losh #include "platform.h"       /* Large Files support */
250c16b537SWarner Losh #include "util.h"           /* UTIL_getFileSize, UTIL_getTotalFileSize */
260c16b537SWarner Losh #include <stdlib.h>         /* malloc, free */
270c16b537SWarner Losh #include <string.h>         /* memset */
280c16b537SWarner Losh #include <stdio.h>          /* fprintf, fopen, ftello64 */
290c16b537SWarner Losh #include <errno.h>          /* errno */
300f743729SConrad Meyer #include <assert.h>
310c16b537SWarner Losh 
322b9c00cbSConrad Meyer #include "timefn.h"         /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
3337f1f268SConrad Meyer #include "../lib/common/mem.h"  /* read */
340c16b537SWarner Losh #include "dibio.h"
350c16b537SWarner Losh 
360c16b537SWarner Losh 
370c16b537SWarner Losh /*-*************************************
380c16b537SWarner Losh *  Constants
390c16b537SWarner Losh ***************************************/
/* Postfix size multipliers : `128 KB` expands to `128 *(1 <<10)`. */
#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)   /* unsigned, so `2 GB` is evaluated in unsigned arithmetic */

/* Largest amount of data taken from one file as a single sample. */
#define SAMPLESIZE_MAX (128 KB)
/* Rough estimations : memory cost to analyze 1 byte of sample, per trainer. */
#define MEMMULT 11             /* legacy trainer */
#define COVER_MEMMULT 9        /* cover trainer */
#define FASTCOVER_MEMMULT 1    /* fastCover trainer */
/* Upper bound applied by DiB_findMaxMem() on the amount of memory it probes :
 * 2 GB - 64 MB when size_t is 4 bytes wide, otherwise 512 MB << sizeof(size_t)
 * (128 GB when size_t is 8 bytes wide). */
static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));

/* Nb of noise bytes written right after the loaded samples (guard band). */
#define NOISELENGTH 32
/* Training dataset is limited to 2 GB. */
#define MAX_SAMPLES_SIZE (2 GB)
520c16b537SWarner Losh 
530c16b537SWarner Losh 
540c16b537SWarner Losh /*-*************************************
550c16b537SWarner Losh *  Console display
560c16b537SWarner Losh ***************************************/
570c16b537SWarner Losh #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
580c16b537SWarner Losh #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
590c16b537SWarner Losh 
60052d3c12SConrad Meyer static const U64 g_refreshRate = SEC_TO_MICRO / 6;
61052d3c12SConrad Meyer static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
620c16b537SWarner Losh 
63052d3c12SConrad Meyer #define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
64052d3c12SConrad Meyer             if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
65052d3c12SConrad Meyer             { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
66052d3c12SConrad Meyer             if (displayLevel>=4) fflush(stderr); } } }
670c16b537SWarner Losh 
680c16b537SWarner Losh /*-*************************************
690c16b537SWarner Losh *  Exceptions
700c16b537SWarner Losh ***************************************/
#ifndef DEBUG
#  define DEBUG 0
#endif
/* DEBUGOUTPUT() : message on stderr, only when DEBUG is non-zero.
 * Note : the expansion already ends with a semicolon. */
#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
/* EXM_THROW() : reports a fatal error on stderr, then terminates the process
 * with `error` as exit code. It never returns to the caller.
 * The expansion is a plain brace block, so it is valid with or without
 * a trailing semicolon at the call site. */
#define EXM_THROW(error, ...)                                             \
{                                                                         \
    DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
    DISPLAY("Error %i : ", error);                                        \
    DISPLAY(__VA_ARGS__);                                                 \
    DISPLAY("\n");                                                        \
    exit(error);                                                          \
}
830c16b537SWarner Losh 
840c16b537SWarner Losh 
850c16b537SWarner Losh /* ********************************************************
860c16b537SWarner Losh *  Helper functions
870c16b537SWarner Losh **********************************************************/
/* MIN() : smallest of two values. Each argument may be evaluated twice,
 * so do not pass expressions with side effects. */
#undef MIN
#define MIN(a,b)    (((a) < (b)) ? (a) : (b))
900c16b537SWarner Losh 
915ff13fbcSAllan Jude /**
925ff13fbcSAllan Jude   Returns the size of a file.
935ff13fbcSAllan Jude   If error returns -1.
945ff13fbcSAllan Jude */
DiB_getFileSize(const char * fileName)955ff13fbcSAllan Jude static S64 DiB_getFileSize (const char * fileName)
965ff13fbcSAllan Jude {
975ff13fbcSAllan Jude     U64 const fileSize = UTIL_getFileSize(fileName);
985ff13fbcSAllan Jude     return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
995ff13fbcSAllan Jude }
1000c16b537SWarner Losh 
1010c16b537SWarner Losh /* ********************************************************
1020c16b537SWarner Losh *  File related operations
1030c16b537SWarner Losh **********************************************************/
1040c16b537SWarner Losh /** DiB_loadFiles() :
1050c16b537SWarner Losh  *  load samples from files listed in fileNamesTable into buffer.
1060c16b537SWarner Losh  *  works even if buffer is too small to load all samples.
1070c16b537SWarner Losh  *  Also provides the size of each sample into sampleSizes table
1080c16b537SWarner Losh  *  which must be sized correctly, using DiB_fileStats().
1090c16b537SWarner Losh  * @return : nb of samples effectively loaded into `buffer`
1100c16b537SWarner Losh  * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
1110c16b537SWarner Losh  *  sampleSizes is filled with the size of each sample.
1120c16b537SWarner Losh  */
DiB_loadFiles(void * buffer,size_t * bufferSizePtr,size_t * sampleSizes,int sstSize,const char ** fileNamesTable,int nbFiles,size_t targetChunkSize,int displayLevel)1135ff13fbcSAllan Jude static int DiB_loadFiles(
1145ff13fbcSAllan Jude     void* buffer, size_t* bufferSizePtr,
1155ff13fbcSAllan Jude     size_t* sampleSizes, int sstSize,
1165ff13fbcSAllan Jude     const char** fileNamesTable, int nbFiles,
1175ff13fbcSAllan Jude     size_t targetChunkSize, int displayLevel )
1180c16b537SWarner Losh {
1190c16b537SWarner Losh     char* const buff = (char*)buffer;
1205ff13fbcSAllan Jude     size_t totalDataLoaded = 0;
1215ff13fbcSAllan Jude     int nbSamplesLoaded = 0;
1225ff13fbcSAllan Jude     int fileIndex = 0;
1235ff13fbcSAllan Jude     FILE * f = NULL;
1240c16b537SWarner Losh 
1255ff13fbcSAllan Jude     assert(targetChunkSize <= SAMPLESIZE_MAX);
1265ff13fbcSAllan Jude 
1275ff13fbcSAllan Jude     while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
1285ff13fbcSAllan Jude         size_t fileDataLoaded;
1295ff13fbcSAllan Jude         S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
1305ff13fbcSAllan Jude         if (fileSize <= 0) /* skip if zero-size or file error */
1315ff13fbcSAllan Jude             continue;
1325ff13fbcSAllan Jude 
1335ff13fbcSAllan Jude         f = fopen( fileNamesTable[fileIndex], "rb");
1345ff13fbcSAllan Jude         if (f == NULL)
1355ff13fbcSAllan Jude             EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
1365ff13fbcSAllan Jude         DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[fileIndex]);
1375ff13fbcSAllan Jude 
1385ff13fbcSAllan Jude         /* Load the first chunk of data from the file */
1395ff13fbcSAllan Jude         fileDataLoaded = targetChunkSize > 0 ?
1405ff13fbcSAllan Jude                             (size_t)MIN(fileSize, (S64)targetChunkSize) :
1415ff13fbcSAllan Jude                             (size_t)MIN(fileSize, SAMPLESIZE_MAX );
1425ff13fbcSAllan Jude         if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
1430c16b537SWarner Losh             break;
1445ff13fbcSAllan Jude         if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
1455ff13fbcSAllan Jude             EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
1465ff13fbcSAllan Jude         sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
1475ff13fbcSAllan Jude         totalDataLoaded += fileDataLoaded;
1485ff13fbcSAllan Jude 
1495ff13fbcSAllan Jude         /* If file-chunking is enabled, load the rest of the file as more samples */
1505ff13fbcSAllan Jude         if (targetChunkSize > 0) {
1515ff13fbcSAllan Jude             while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
1525ff13fbcSAllan Jude                 size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
1535ff13fbcSAllan Jude                 if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
1545ff13fbcSAllan Jude                     break;
1555ff13fbcSAllan Jude 
1565ff13fbcSAllan Jude                 if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
1575ff13fbcSAllan Jude                     EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
1585ff13fbcSAllan Jude                 sampleSizes[nbSamplesLoaded++] = chunkSize;
1595ff13fbcSAllan Jude                 totalDataLoaded += chunkSize;
1605ff13fbcSAllan Jude                 fileDataLoaded += chunkSize;
1610c16b537SWarner Losh             }
1625ff13fbcSAllan Jude         }
1635ff13fbcSAllan Jude         fileIndex += 1;
1645ff13fbcSAllan Jude         fclose(f); f = NULL;
1655ff13fbcSAllan Jude     }
1665ff13fbcSAllan Jude     if (f != NULL)
1670c16b537SWarner Losh         fclose(f);
1685ff13fbcSAllan Jude 
1690c16b537SWarner Losh     DISPLAYLEVEL(2, "\r%79s\r", "");
1705ff13fbcSAllan Jude     DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
1715ff13fbcSAllan Jude         (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
1725ff13fbcSAllan Jude     *bufferSizePtr = totalDataLoaded;
1735ff13fbcSAllan Jude     return nbSamplesLoaded;
1740c16b537SWarner Losh }
1750c16b537SWarner Losh 
1760c16b537SWarner Losh #define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
DiB_rand(U32 * src)1770c16b537SWarner Losh static U32 DiB_rand(U32* src)
1780c16b537SWarner Losh {
1790c16b537SWarner Losh     static const U32 prime1 = 2654435761U;
1800c16b537SWarner Losh     static const U32 prime2 = 2246822519U;
1810c16b537SWarner Losh     U32 rand32 = *src;
1820c16b537SWarner Losh     rand32 *= prime1;
1830c16b537SWarner Losh     rand32 ^= prime2;
1840c16b537SWarner Losh     rand32  = DiB_rotl32(rand32, 13);
1850c16b537SWarner Losh     *src = rand32;
1860c16b537SWarner Losh     return rand32 >> 5;
1870c16b537SWarner Losh }
1880c16b537SWarner Losh 
1890c16b537SWarner Losh /* DiB_shuffle() :
1900c16b537SWarner Losh  * shuffle a table of file names in a semi-random way
1910c16b537SWarner Losh  * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
1920c16b537SWarner Losh  * it will load random elements from it, instead of just the first ones. */
DiB_shuffle(const char ** fileNamesTable,unsigned nbFiles)1930c16b537SWarner Losh static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
1940c16b537SWarner Losh     U32 seed = 0xFD2FB528;
1950c16b537SWarner Losh     unsigned i;
1960f743729SConrad Meyer     assert(nbFiles >= 1);
1970c16b537SWarner Losh     for (i = nbFiles - 1; i > 0; --i) {
1980c16b537SWarner Losh         unsigned const j = DiB_rand(&seed) % (i + 1);
1990c16b537SWarner Losh         const char* const tmp = fileNamesTable[j];
2000c16b537SWarner Losh         fileNamesTable[j] = fileNamesTable[i];
2010c16b537SWarner Losh         fileNamesTable[i] = tmp;
2020c16b537SWarner Losh     }
2030c16b537SWarner Losh }
2040c16b537SWarner Losh 
2050c16b537SWarner Losh 
2060c16b537SWarner Losh /*-********************************************************
2070c16b537SWarner Losh *  Dictionary training functions
2080c16b537SWarner Losh **********************************************************/
DiB_findMaxMem(unsigned long long requiredMem)2090c16b537SWarner Losh static size_t DiB_findMaxMem(unsigned long long requiredMem)
2100c16b537SWarner Losh {
2110c16b537SWarner Losh     size_t const step = 8 MB;
2120c16b537SWarner Losh     void* testmem = NULL;
2130c16b537SWarner Losh 
2140c16b537SWarner Losh     requiredMem = (((requiredMem >> 23) + 1) << 23);
2150c16b537SWarner Losh     requiredMem += step;
2160c16b537SWarner Losh     if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
2170c16b537SWarner Losh 
2180c16b537SWarner Losh     while (!testmem) {
2190c16b537SWarner Losh         testmem = malloc((size_t)requiredMem);
2200c16b537SWarner Losh         requiredMem -= step;
2210c16b537SWarner Losh     }
2220c16b537SWarner Losh 
2230c16b537SWarner Losh     free(testmem);
2240c16b537SWarner Losh     return (size_t)requiredMem;
2250c16b537SWarner Losh }
2260c16b537SWarner Losh 
2270c16b537SWarner Losh 
/* DiB_fillNoise() :
 * Fill the first `length` bytes of `buffer` with a fixed pseudo-random pattern.
 * The sequence only depends on `length` : it is the same on every call.
 * Nothing is written when `length` is 0. */
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const prime1 = 2654435761U;
    unsigned const prime2 = 2246822519U;
    unsigned char* dst = (unsigned char*)buffer;
    size_t remaining = length;
    unsigned state = prime1;

    while (remaining > 0) {
        state *= prime2;
        *dst = (unsigned char)(state >> 21);
        dst++;
        remaining--;
    }
}
2400c16b537SWarner Losh 
2410c16b537SWarner Losh 
/* DiB_saveDict() :
 * Write the `buffSize` bytes of `buff` into file `dictFileName`,
 * which is created, or truncated if it already exists.
 * The process is terminated (EXM_THROW) when the file cannot be opened,
 * fully written, or closed. */
static void DiB_saveDict(const char* dictFileName,
                         const void* buff, size_t buffSize)
{
    FILE* const out = fopen(dictFileName, "wb");
    if (out == NULL) {
        EXM_THROW(3, "cannot open %s ", dictFileName);
    }

    if (fwrite(buff, 1, buffSize, out) != buffSize) {
        EXM_THROW(4, "%s : write error", dictFileName);
    }

    /* fclose() flushes pending data : its result must be checked too */
    if (fclose(out) != 0) {
        EXM_THROW(5, "%s : flush error", dictFileName);
    }
}
2540c16b537SWarner Losh 
2550c16b537SWarner Losh typedef struct {
2565ff13fbcSAllan Jude     S64 totalSizeToLoad;
2575ff13fbcSAllan Jude     int nbSamples;
2585ff13fbcSAllan Jude     int oneSampleTooLarge;
2590c16b537SWarner Losh } fileStats;
2600c16b537SWarner Losh 
2610c16b537SWarner Losh /*! DiB_fileStats() :
2620c16b537SWarner Losh  *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
2630c16b537SWarner Losh  *  provides the amount of data to be loaded and the resulting nb of samples.
2640c16b537SWarner Losh  *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
2650c16b537SWarner Losh  */
DiB_fileStats(const char ** fileNamesTable,int nbFiles,size_t chunkSize,int displayLevel)2665ff13fbcSAllan Jude static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
2670c16b537SWarner Losh {
2680c16b537SWarner Losh     fileStats fs;
2695ff13fbcSAllan Jude     int n;
2700c16b537SWarner Losh     memset(&fs, 0, sizeof(fs));
2715ff13fbcSAllan Jude 
2725ff13fbcSAllan Jude     // We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX
2735ff13fbcSAllan Jude     assert( chunkSize <= SAMPLESIZE_MAX );
2745ff13fbcSAllan Jude 
2750c16b537SWarner Losh     for (n=0; n<nbFiles; n++) {
2765ff13fbcSAllan Jude       S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
2775ff13fbcSAllan Jude       // TODO: is there a minimum sample size? What if the file is 1-byte?
2785ff13fbcSAllan Jude       if (fileSize == 0) {
2795ff13fbcSAllan Jude         DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
2805ff13fbcSAllan Jude         continue;
2810c16b537SWarner Losh       }
2825ff13fbcSAllan Jude 
2835ff13fbcSAllan Jude       /* the case where we are breaking up files in sample chunks */
2845ff13fbcSAllan Jude       if (chunkSize > 0)
2855ff13fbcSAllan Jude       {
2865ff13fbcSAllan Jude         // TODO: is there a minimum sample size? Can we have a 1-byte sample?
2875ff13fbcSAllan Jude         fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
2885ff13fbcSAllan Jude         fs.totalSizeToLoad += fileSize;
2895ff13fbcSAllan Jude       }
2905ff13fbcSAllan Jude       else {
2915ff13fbcSAllan Jude       /* the case where one file is one sample */
2925ff13fbcSAllan Jude         if (fileSize > SAMPLESIZE_MAX) {
2935ff13fbcSAllan Jude           /* flag excessively large sample files */
2945ff13fbcSAllan Jude           fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
2955ff13fbcSAllan Jude 
2965ff13fbcSAllan Jude           /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
2975ff13fbcSAllan Jude           DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
2985ff13fbcSAllan Jude               fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
2995ff13fbcSAllan Jude         }
3005ff13fbcSAllan Jude         fs.nbSamples += 1;
3015ff13fbcSAllan Jude         fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
3025ff13fbcSAllan Jude       }
3035ff13fbcSAllan Jude     }
3045ff13fbcSAllan Jude     DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
3050c16b537SWarner Losh     return fs;
3060c16b537SWarner Losh }
3070c16b537SWarner Losh 
/*! DiB_trainFromFiles() :
 *  Train a dictionary from a set of sample files, and save it into `dictFileName`.
 *
 *  The trainer is selected by the first non-NULL parameter set, in this order :
 *  `params` (legacy), `coverParams` (cover), `fastCoverParams` (fastCover).
 *  The same parameter set provides the notification level.
 *  Note : `fileNamesTable` gets shuffled in place.
 *
 * @param dictFileName    destination file for the dictionary
 * @param maxDictSize     maximum size of the dictionary, in bytes
 * @param fileNamesTable  names of the sample files
 * @param nbFiles         nb of entries in `fileNamesTable`
 * @param chunkSize       when != 0, files are cut into samples of this size ;
 *                        when 0, each file is one sample
 * @param params          legacy trainer parameters, or NULL
 * @param coverParams     cover trainer parameters, or NULL
 * @param fastCoverParams fastCover trainer parameters ; must not be NULL
 *                        when both `params` and `coverParams` are NULL
 * @param optimize        when non-zero, cover and fastCover trainers run their
 *                        parameter-optimizing variant (ignored by legacy trainer)
 * @param memLimit        when != 0, additional limit on the amount of sample data loaded, in bytes
 * @return : 0 on success, 1 if dictionary training failed.
 *  Other errors (not enough memory, less than 5 samples, file errors)
 *  terminate the process through EXM_THROW().
 */
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
    fileStats fs;
    size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
    int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
    size_t loadedSize; /* total data loaded in srcBuffer for all samples */
    void* srcBuffer /* contiguous buffer with training data/samples */;
    void* const dictBuffer = malloc(maxDictSize);
    int result = 0;

    int const displayLevel = params ? params->zParams.notificationLevel :
        coverParams ? coverParams->zParams.notificationLevel :
        fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    if (nbFiles > 1) {
        /* a list of 0 or 1 file has nothing to shuffle ;
         * DiB_shuffle() must not be invoked on an empty list */
        DiB_shuffle(fileNamesTable, (unsigned)nbFiles);
    }

    /* Figure out how much sample data to load with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

    {
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT:
                            FASTCOVER_MEMMULT;
        size_t const maxMem =  DiB_findMaxMem((unsigned long long)(fs.totalSizeToLoad * memMult)) / memMult;
        /* malloc(0) is allowed to return NULL :
         * always request at least one entry, so that an empty sample set
         * is reported as "nb of samples too low" rather than as a memory error */
        size_t const sstEntries = (fs.nbSamples > 0) ? (size_t)fs.nbSamples : 1;
        /* Limit the size of the training data to the free memory */
        /* Limit the size of the training data to 2GB */
        /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        if (memLimit != 0) {
            DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
                (unsigned)(memLimit / (1 MB)));
            loadedSize = (size_t)MIN(loadedSize, memLimit);
        }
        /* NOISELENGTH extra bytes : room for the guard band of the legacy trainer */
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(sstEntries * sizeof(size_t));
    }

    /* Checks */
    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer)) {
        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
    }
    if (fs.oneSampleTooLarge) {
        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", (unsigned)SAMPLESIZE_MAX);
    }
    if (fs.nbSamples < 5) {
        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
    }

    /* init */
    if ((S64)loadedSize < fs.totalSizeToLoad) {
        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
            (unsigned)(fs.totalSizeToLoad / (1 MB)),
            (unsigned)(loadedSize / (1 MB)));
    }

    /* Load input buffer ; loadedSize is updated with the amount really loaded */
    nbSamplesLoaded = DiB_loadFiles(
        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
        nbFiles, chunkSize, displayLevel);

    {   size_t dictSize;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
                                                             srcBuffer, sampleSizes, nbSamplesLoaded,
                                                             coverParams);
              if (!ZDICT_isError(dictSize)) {
                  /* report the parameters selected by the optimizer */
                  unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
                  DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
                              coverParams->steps, splitPercentage);
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
                                                     sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else {
            assert(fastCoverParams != NULL);
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
                                                              srcBuffer, sampleSizes, nbSamplesLoaded,
                                                              fastCoverParams);
              if (!ZDICT_isError(dictSize)) {
                /* report the parameters selected by the optimizer */
                unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
                DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
                            fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
                            fastCoverParams->accel);
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
                                                        sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
            result = 1;
            goto _cleanup;
        }
        /* save dict */
        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
        DiB_saveDict(dictFileName, dictBuffer, dictSize);
    }

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(sampleSizes);
    free(dictBuffer);
    return result;
}
436