1package compress 2 3import ( 4 "crypto/rand" 5 "encoding/base32" 6 "io/ioutil" 7 "strconv" 8 "strings" 9 "testing" 10 11 "github.com/klauspost/compress/flate" 12 "github.com/klauspost/compress/gzip" 13) 14 15func BenchmarkEstimate(b *testing.B) { 16 b.ReportAllocs() 17 // (predictable, low entropy distibution) 18 b.Run("zeroes-5k", func(b *testing.B) { 19 var testData = make([]byte, 5000) 20 b.SetBytes(int64(len(testData))) 21 b.ResetTimer() 22 for i := 0; i < b.N; i++ { 23 Estimate(testData) 24 } 25 b.Log(Estimate(testData)) 26 }) 27 28 // (predictable, high entropy distibution) 29 b.Run("predictable-5k", func(b *testing.B) { 30 var testData = make([]byte, 5000) 31 for i := range testData { 32 testData[i] = byte(float64(i) / float64(len(testData)) * 256) 33 } 34 b.SetBytes(int64(len(testData))) 35 b.ResetTimer() 36 for i := 0; i < b.N; i++ { 37 Estimate(testData) 38 } 39 b.Log(Estimate(testData)) 40 }) 41 42 // (not predictable, high entropy distibution) 43 b.Run("random-500b", func(b *testing.B) { 44 var testData = make([]byte, 500) 45 rand.Read(testData) 46 b.SetBytes(int64(len(testData))) 47 b.ResetTimer() 48 for i := 0; i < b.N; i++ { 49 Estimate(testData) 50 } 51 b.Log(Estimate(testData)) 52 }) 53 54 // (not predictable, high entropy distibution) 55 b.Run("random-5k", func(b *testing.B) { 56 var testData = make([]byte, 5000) 57 rand.Read(testData) 58 b.SetBytes(int64(len(testData))) 59 b.ResetTimer() 60 for i := 0; i < b.N; i++ { 61 Estimate(testData) 62 } 63 b.Log(Estimate(testData)) 64 }) 65 66 // (not predictable, high entropy distibution) 67 b.Run("random-50k", func(b *testing.B) { 68 var testData = make([]byte, 50000) 69 rand.Read(testData) 70 b.SetBytes(int64(len(testData))) 71 b.ResetTimer() 72 for i := 0; i < b.N; i++ { 73 Estimate(testData) 74 } 75 b.Log(Estimate(testData)) 76 }) 77 78 // (not predictable, high entropy distibution) 79 b.Run("random-500k", func(b *testing.B) { 80 var testData = make([]byte, 500000) 81 rand.Read(testData) 82 b.SetBytes(int64(len(testData))) 83 b.ResetTimer() 84 for i := 0; i < b.N; i++ { 85 Estimate(testData) 86 } 87 b.Log(Estimate(testData)) 88 }) 89 90 // (not predictable, medium entropy distibution) 91 b.Run("base-32-5k", func(b *testing.B) { 92 var testData = make([]byte, 5000) 93 rand.Read(testData) 94 s := base32.StdEncoding.EncodeToString(testData) 95 testData = []byte(s) 96 testData = testData[:5000] 97 b.SetBytes(int64(len(testData))) 98 b.ResetTimer() 99 for i := 0; i < b.N; i++ { 100 Estimate(testData) 101 } 102 b.Log(Estimate(testData)) 103 }) 104 // (medium predictable, medium entropy distibution) 105 b.Run("text", func(b *testing.B) { 106 var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks. 107This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process: 108With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths. 109As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic. 110With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?). 111Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks. 112The attacker then compresses that chunk using the compression algorithm. 113The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets. 114IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk. 115It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup. 116AS always, a paranoid and highly unscientific stream of consciousness. 117Thoughts?`) 118 testData = append(testData, testData...) 119 testData = append(testData, testData...) 120 b.SetBytes(int64(len(testData))) 121 b.ResetTimer() 122 for i := 0; i < b.N; i++ { 123 Estimate(testData) 124 } 125 b.Log(Estimate(testData)) 126 }) 127} 128 129func BenchmarkSnannonEntropyBits(b *testing.B) { 130 b.ReportAllocs() 131 // (predictable, low entropy distibution) 132 b.Run("zeroes-5k", func(b *testing.B) { 133 var testData = make([]byte, 5000) 134 b.SetBytes(int64(len(testData))) 135 b.ResetTimer() 136 for i := 0; i < b.N; i++ { 137 ShannonEntropyBits(testData) 138 } 139 b.Log(ShannonEntropyBits(testData)) 140 }) 141 142 // (predictable, high entropy distibution) 143 b.Run("predictable-5k", func(b *testing.B) { 144 var testData = make([]byte, 5000) 145 for i := range testData { 146 testData[i] = byte(float64(i) / float64(len(testData)) * 256) 147 } 148 b.SetBytes(int64(len(testData))) 149 b.ResetTimer() 150 for i := 0; i < b.N; i++ { 151 ShannonEntropyBits(testData) 152 } 153 b.Log(ShannonEntropyBits(testData)) 154 }) 155 156 // (not predictable, high entropy distibution) 157 b.Run("random-500b", func(b *testing.B) { 158 var testData = make([]byte, 500) 159 rand.Read(testData) 160 b.SetBytes(int64(len(testData))) 161 b.ResetTimer() 162 for i := 0; i < b.N; i++ { 163 ShannonEntropyBits(testData) 164 } 165 b.Log(ShannonEntropyBits(testData)) 166 }) 167 168 // (not predictable, high entropy distibution) 169 b.Run("random-5k", func(b *testing.B) { 170 var testData = make([]byte, 5000) 171 rand.Read(testData) 172 b.SetBytes(int64(len(testData))) 173 b.ResetTimer() 174 for i := 0; i < b.N; i++ { 175 ShannonEntropyBits(testData) 176 } 177 b.Log(ShannonEntropyBits(testData)) 178 }) 179 180 // (not predictable, high entropy distibution) 181 b.Run("random-50k", func(b *testing.B) { 182 var testData = make([]byte, 50000) 183 rand.Read(testData) 184 b.SetBytes(int64(len(testData))) 185 b.ResetTimer() 186 for i := 0; i < b.N; i++ { 187 ShannonEntropyBits(testData) 188 } 189 b.Log(ShannonEntropyBits(testData)) 190 }) 191 192 // (not predictable, high entropy distibution) 193 b.Run("random-500k", func(b *testing.B) { 194 var testData = make([]byte, 500000) 195 rand.Read(testData) 196 b.SetBytes(int64(len(testData))) 197 b.ResetTimer() 198 for i := 0; i < b.N; i++ { 199 ShannonEntropyBits(testData) 200 } 201 b.Log(ShannonEntropyBits(testData)) 202 }) 203 204 // (not predictable, medium entropy distibution) 205 b.Run("base-32-5k", func(b *testing.B) { 206 var testData = make([]byte, 5000) 207 rand.Read(testData) 208 s := base32.StdEncoding.EncodeToString(testData) 209 testData = []byte(s) 210 testData = testData[:5000] 211 b.SetBytes(int64(len(testData))) 212 b.ResetTimer() 213 for i := 0; i < b.N; i++ { 214 ShannonEntropyBits(testData) 215 } 216 b.Log(ShannonEntropyBits(testData)) 217 }) 218 // (medium predictable, medium entropy distibution) 219 b.Run("text", func(b *testing.B) { 220 var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks. 221This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process: 222With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths. 223As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic. 224With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?). 225Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks. 226The attacker then compresses that chunk using the compression algorithm. 227The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets. 228IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk. 229It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup. 230AS always, a paranoid and highly unscientific stream of consciousness. 231Thoughts?`) 232 testData = append(testData, testData...) 233 testData = append(testData, testData...) 234 b.SetBytes(int64(len(testData))) 235 b.ResetTimer() 236 for i := 0; i < b.N; i++ { 237 ShannonEntropyBits(testData) 238 } 239 b.Log(ShannonEntropyBits(testData)) 240 }) 241} 242 243func BenchmarkCompressAllocations(b *testing.B) { 244 payload := []byte(strings.Repeat("Tiny payload", 20)) 245 for j := -2; j <= 9; j++ { 246 b.Run("level("+strconv.Itoa(j)+")", func(b *testing.B) { 247 b.Run("flate", func(b *testing.B) { 248 b.ReportAllocs() 249 250 for i := 0; i < b.N; i++ { 251 w, err := flate.NewWriter(ioutil.Discard, j) 252 if err != nil { 253 b.Fatal(err) 254 } 255 w.Write(payload) 256 w.Close() 257 } 258 }) 259 b.Run("gzip", func(b *testing.B) { 260 b.ReportAllocs() 261 262 for i := 0; i < b.N; i++ { 263 w, err := gzip.NewWriterLevel(ioutil.Discard, j) 264 if err != nil { 265 b.Fatal(err) 266 } 267 w.Write(payload) 268 w.Close() 269 } 270 }) 271 }) 272 } 273} 274 275func BenchmarkCompressAllocationsSingle(b *testing.B) { 276 payload := []byte(strings.Repeat("Tiny payload", 20)) 277 const level = 2 278 b.Run("flate", func(b *testing.B) { 279 b.ReportAllocs() 280 281 for i := 0; i < b.N; i++ { 282 w, err := flate.NewWriter(ioutil.Discard, level) 283 if err != nil { 284 b.Fatal(err) 285 } 286 w.Write(payload) 287 w.Close() 288 } 289 }) 290 b.Run("gzip", func(b *testing.B) { 291 b.ReportAllocs() 292 293 for i := 0; i < b.N; i++ { 294 w, err := gzip.NewWriterLevel(ioutil.Discard, level) 295 if err != nil { 296 b.Fatal(err) 297 } 298 w.Write(payload) 299 w.Close() 300 } 301 }) 302} 303