// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"

#include <assert.h>
#include <string.h>

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif

#if defined(__AVX__) || defined(__SSSE3__)
#define SIMD_SSE
#endif

#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
#undef SIMD_SSE
#define SIMD_AVX
#endif

#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#define SIMD_FALLBACK
#include <intrin.h> // __cpuid
#endif

#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif

// WebAssembly SIMD implementation requires a few bleeding edge intrinsics that are only available in Chrome Canary
#if defined(__wasm_simd128__) && defined(__wasm_unimplemented_simd128__)
#define SIMD_WASM
#endif

#ifdef SIMD_SSE
#include <tmmintrin.h>
#endif

#ifdef SIMD_AVX
#include <immintrin.h>
#endif

#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#ifdef SIMD_WASM
#include <wasm_simd128.h>
#endif

#ifndef TRACE
#define TRACE 0
#endif

#if TRACE
#include <stdio.h>
#endif

#ifdef SIMD_WASM
#define wasm_v32x4_splat(v, i) wasm_v8x16_shuffle(v, v, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3, 4 * i, 4 * i + 1, 4 * i + 2, 4 * i + 3)
#define wasm_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
#define wasm_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
#define wasm_unpacklo_v16x8(a, b) wasm_v8x16_shuffle(a, b, 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23)
#define wasm_unpackhi_v16x8(a, b) wasm_v8x16_shuffle(a, b, 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31)
#endif

namespace meshopt
{

const unsigned char kVertexHeader = 0xa0;

const size_t kVertexBlockSizeBytes = 8192;
const size_t kVertexBlockMaxSize = 256;
const size_t kByteGroupSize = 16;
const size_t kTailMaxSize = 32;

static size_t getVertexBlockSize(size_t vertex_size)
{
	// make sure the entire block fits into the scratch buffer
	size_t result = kVertexBlockSizeBytes / vertex_size;

	// align to byte group size; we encode each byte as a byte group
	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
	result &= ~(kByteGroupSize - 1);

	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
}
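
// Worked example (illustrative comment, not upstream code): for vertex_size == 20,
// 8192 / 20 == 409 vertices fit in the scratch buffer; rounding down to a multiple
// of kByteGroupSize gives 400, which is then clamped to kVertexBlockMaxSize == 256.
// For vertex_size == 48, 8192 / 48 == 170 -> 160, so a block holds 160 vertices.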

inline unsigned char zigzag8(unsigned char v)
{
	return ((signed char)(v) >> 7) ^ (v << 1);
}

inline unsigned char unzigzag8(unsigned char v)
{
	return -(v & 1) ^ (v >> 1);
}
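
// Worked example of the zigzag mapping (illustrative comment, not upstream code):
// deltas 0, -1, 1, -2, 2, ... map to 0, 1, 2, 3, 4, ..., so small signed deltas of
// either sign become small unsigned bytes that pack into few bits. For instance,
// zigzag8(0xff) == 1 (delta -1) and unzigzag8(1) == 0xff round-trips it.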

#if TRACE
struct Stats
{
	size_t size;
	size_t header;
	size_t bitg[4];
	size_t bitb[4];
};

Stats* bytestats;
Stats vertexstats[256];
#endif

static bool encodeBytesGroupZero(const unsigned char* buffer)
{
	for (size_t i = 0; i < kByteGroupSize; ++i)
		if (buffer[i])
			return false;

	return true;
}

static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);

	if (bits == 8)
		return kByteGroupSize;

	size_t result = kByteGroupSize * bits / 8;

	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; ++i)
		result += buffer[i] >= sentinel;

	return result;
}
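
// Cost model example (illustrative comment, not upstream code): with bits == 2 the
// fixed portion is 16 * 2 / 8 == 4 bytes, and every value >= 3 (the 2-bit sentinel)
// adds one escape byte. A group of 16 values where 2 of them are >= 3 thus measures
// 4 + 2 == 6 bytes, versus 16 bytes for the verbatim bits == 8 encoding.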

static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return data;

	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}

	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);

	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;

		for (size_t k = 0; k < byte_size; ++k)
		{
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];

			byte <<= bits;
			byte |= enc;
		}

		*data++ = byte;
	}

	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		if (buffer[i] >= sentinel)
		{
			*data++ = buffer[i];
		}
	}

	return data;
}
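
// Packing example (illustrative comment, not upstream code): with bits == 2 the
// values {1, 0, 3, 2} pack MSB-first into one byte as 0b01001110 == 0x4e. Since 3
// is the sentinel, the original value (which may be 3 or anything larger) is also
// appended verbatim after the fixed portion; the decoder substitutes escape bytes
// from that tail whenever it reads a sentinel.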

static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	memset(header, 0, header_size);

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		int best_bits = 8;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);

		for (int bits = 1; bits < 8; bits *= 2)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits);

			if (size < best_size)
			{
				best_bits = bits;
				best_size = size;
			}
		}

		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
		assert((1 << bitslog2) == best_bits);

		size_t header_offset = i / kByteGroupSize;

		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);

		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);

		assert(data + best_size == next);
		data = next;

#if TRACE > 1
		bytestats->bitg[bitslog2]++;
		bytestats->bitb[bitslog2] += best_size;
#endif
	}

#if TRACE > 1
	bytestats->header += header_size;
#endif

	return data;
}
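
// Header layout example (illustrative comment, not upstream code): each group
// stores its 2-bit bitslog2 selector packed four to a header byte, so group i
// lives in bits (i % 4) * 2 of header[i / 4]. A 256-byte buffer has 16 groups and
// therefore a (16 + 3) / 4 == 4 byte header; a header byte of 0b11100100 means
// groups 0..3 used 1, 2, 4 and 8 bits per value respectively.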

static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);

	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));

	for (size_t k = 0; k < vertex_size; ++k)
	{
		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);

			p = vertex_data[vertex_offset];

			vertex_offset += vertex_size;
		}

#if TRACE
		const unsigned char* olddata = data;
		bytestats = &vertexstats[k];
#endif

		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
		if (!data)
			return 0;

#if TRACE
		bytestats = 0;
		vertexstats[k].size += data - olddata;
#endif
	}

	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}

#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)

	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bitslog2)
	{
	case 0:
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		data_var = data + 4;

		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);

		return data_var;
	case 2:
		data_var = data + 8;

		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);

		return data_var;
	case 3:
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}

#undef READ
#undef NEXT
}
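
// Decode walk-through (illustrative comment, not upstream code): for bitslog2 == 1
// the fixed portion is 4 bytes of four 2-bit values each, and data_var points past
// it at the escape bytes. NEXT(2) peels the top 2 bits of the current byte; a value
// of 0b11 (the sentinel) is replaced with *data_var, and data_var only advances
// when a sentinel is seen, mirroring the layout produced by encodeBytesGroup.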

static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroup(data, buffer + i, bitslog2);
	}

	return data;
}

static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; ++k)
	{
		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
		if (!data)
			return 0;

		size_t vertex_offset = k;

		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			unsigned char v = unzigzag8(buffer[i]) + p;

			transposed[vertex_offset] = v;
			p = v;

			vertex_offset += vertex_size;
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];

static bool decodeBytesGroupBuildTables()
{
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char shuffle[8];
		unsigned char count = 0;

		for (int i = 0; i < 8; ++i)
		{
			int maski = (mask >> i) & 1;
			shuffle[i] = maski ? count : 0x80;
			count += (unsigned char)(maski);
		}

		memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
		kDecodeBytesGroupCount[mask] = count;
	}

	return true;
}
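
// Table contents example (illustrative comment, not upstream code): for
// mask == 0b00000101 the escaped lanes are 0 and 2, so kDecodeBytesGroupShuffle[5]
// is {0, 0x80, 1, 0x80, 0x80, 0x80, 0x80, 0x80} and kDecodeBytesGroupCount[5] == 2.
// The 0x80 entries zero their lane under pshufb/vtbl, and the count tells the next
// half-group how far to advance into the escape-byte stream.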

static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif

#ifdef SIMD_SSE
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);

	__m128i sm1r = _mm_add_epi8(sm1, sm1off);

	return _mm_unpacklo_epi64(sm0, sm1r);
}
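
// Offset example (illustrative comment, not upstream code): the second half-group's
// shuffle indices are biased by kDecodeBytesGroupCount[mask0] because both halves
// pull escape bytes from one contiguous stream; if the first half consumed 3
// escapes, the second half's table entries {0, 1, ...} become {3, 4, ...}.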

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	{
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif

		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));

		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_AVX
static const __m128i decodeBytesGroupConfig[] = {
    _mm_set1_epi8(3),
    _mm_set1_epi8(15),
    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
};
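
// Config notes (illustrative comment, not upstream code; see decodeBytesGroupSimd
// below): entries 0 and 1 are the 2-bit and 4-bit sentinels, while entries 2 and 3
// are _mm_multishift_epi64_epi8 controls giving, for each destination byte lane,
// the bit offset of its packed field within the selector qword. For example, lane 0
// of the 2-bit control reads bits 6..7 of byte 0, matching the MSB-first packing
// in encodeBytesGroup.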

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	case 2:
	{
		const unsigned char* skip = data + (bitslog2 << 2);

		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));

		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];

		__m128i selw = _mm_shuffle_epi32(selb, 0x44);
		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);

		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest);

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return skip + _mm_popcnt_u32(mask16);
	}

	case 3:
	{
		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_NEON
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);

	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);

	return vcombine_u8(r0, r1);
}

static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};

	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
	uint8x16_t masked = vandq_u8(mask, byte_mask);

#ifdef __aarch64__
	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
	mask0 = vaddv_u8(vget_low_u8(masked));
	mask1 = vaddv_u8(vget_high_u8(masked));
#else
	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
	uint8x8_t sum3 = vpadd_u8(sum2, sum2);

	mask0 = vget_lane_u8(sum3, 0);
	mask1 = vget_lane_u8(sum3, 1);
#endif
}
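
// Movemask example (illustrative comment, not upstream code): each lane of a vceqq
// result is 0x00 or 0xff, so ANDing with {1, 2, 4, ..., 128} leaves one distinct
// bit per set lane; the horizontal sum of each 8-lane half then reassembles the
// same 8-bit mask that _mm_movemask_epi8 would produce on SSE.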

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		uint8x16_t result = vdupq_n_u8(0);

		vst1q_u8(buffer, result);

		return data;
	}

	case 1:
	{
		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);

		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		uint8x16_t result = vld1q_u8(data);

		vst1q_u8(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#ifdef SIMD_WASM
static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	// TODO: 8b buffer overrun - should we use splat or extend buffers?
	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);

	// TODO: we should use v8x16_load_splat
	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

	v128_t sm1r = wasm_i8x16_add(sm1, sm1off);

	return wasm_v8x16_shuffle(sm0, sm1r, 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23);
}

static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
{
	uint64_t mbits = 0x8040201008040201ull;

	uint64_t m0_8 = wasm_i64x2_extract_lane(mask, 0) & mbits;
	uint64_t m1_8 = wasm_i64x2_extract_lane(mask, 1) & mbits;

	uint32_t m0_4 = m0_8 | (m0_8 >> 32);
	uint32_t m1_4 = m1_8 | (m1_8 >> 32);

	uint16_t m0_2 = m0_4 | (m0_4 >> 16);
	uint16_t m1_2 = m1_4 | (m1_4 >> 16);

	mask0 = m0_2 | (m0_2 >> 8);
	mask1 = m1_2 | (m1_2 >> 8);
}
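
// Bit-folding example (illustrative comment, not upstream code): mbits keeps bit i
// of byte lane i (0x01, 0x02, ..., 0x80 per byte), so after masking, all eight kept
// bits occupy distinct positions within the 64-bit lane; the three OR-and-shift
// folds (64 -> 32 -> 16 -> 8) then collapse them into a single movemask byte.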

static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bitslog2)
	{
	case 0:
	{
		v128_t result = wasm_i8x16_splat(0);

		wasm_v128_store(buffer, result);

		return data;
	}

	case 1:
	{
		// TODO: test 4b load splat
		v128_t sel2 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 4);

		v128_t sel22 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
		v128_t sel2222 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));

		if (!wasm_i8x16_any_true(mask))
		{
			wasm_v128_store(buffer, sel);

			return data + 4;
		}

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		// TODO: test or/andnot
		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		// TODO: test 8b load splat
		v128_t sel4 = wasm_v128_load(data);
		v128_t rest = wasm_v128_load(data + 8);

		v128_t sel44 = wasm_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));

		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));

		if (!wasm_i8x16_any_true(mask))
		{
			wasm_v128_store(buffer, sel);

			return data + 8;
		}

		unsigned char mask0, mask1;
		wasmMoveMask(mask, mask0, mask1);

		v128_t shuf = decodeShuffleMask(mask0, mask1);

		// TODO: test or/andnot
		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);

		wasm_v128_store(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		v128_t result = wasm_v128_load(data);

		wasm_v128_store(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
		return data;
	}
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX)
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	__m128i t0 = _mm_unpacklo_epi8(x0, x1);
	__m128i t1 = _mm_unpackhi_epi8(x0, x1);
	__m128i t2 = _mm_unpacklo_epi8(x2, x3);
	__m128i t3 = _mm_unpackhi_epi8(x2, x3);

	x0 = _mm_unpacklo_epi16(t0, t2);
	x1 = _mm_unpackhi_epi16(t0, t2);
	x2 = _mm_unpacklo_epi16(t1, t3);
	x3 = _mm_unpackhi_epi16(t1, t3);
}
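
// Transpose sketch (illustrative comment, not upstream code): x0..x3 hold 16
// decoded bytes each for 4 consecutive vertex byte positions; two rounds of
// unpacking interleave them so that every 32-bit lane of the outputs is one
// vertex's 4-byte chunk, ready for the per-vertex stores in decodeVertexBlockSimd.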

static __m128i unzigzag8(__m128i v)
{
	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));

	return _mm_xor_si128(xl, xr);
}
#endif

#ifdef SIMD_NEON
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);

	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));

	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}

static uint8x16_t unzigzag8(uint8x16_t v)
{
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);

	return veorq_u8(xl, xr);
}
#endif

#ifdef SIMD_WASM
static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
{
	v128_t t0 = wasm_unpacklo_v8x16(x0, x1);
	v128_t t1 = wasm_unpackhi_v8x16(x0, x1);
	v128_t t2 = wasm_unpacklo_v8x16(x2, x3);
	v128_t t3 = wasm_unpackhi_v8x16(x2, x3);

	x0 = wasm_unpacklo_v16x8(t0, t2);
	x1 = wasm_unpackhi_v16x8(t0, t2);
	x2 = wasm_unpacklo_v16x8(t1, t3);
	x3 = wasm_unpackhi_v16x8(t1, t3);
}

static v128_t unzigzag8(v128_t v)
{
	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
	v128_t xr = wasm_u8x16_shr(v, 1);

	return wasm_v128_xor(xl, xr);
}
#endif

#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);
	assert(kByteGroupSize == 16);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	size_t i = 0;

	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=32b
	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kTailMaxSize * 4; i += kByteGroupSize * 4)
	{
		size_t header_offset = i / kByteGroupSize;
		unsigned char header_byte = header[header_offset / 4];

		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
	}

	// slow-path: process remaining groups
	for (; i < buffer_size; i += kByteGroupSize)
	{
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
	}

	return data;
}

static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		for (size_t j = 0; j < 4; ++j)
		{
			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
			if (!data)
				return 0;
		}

#if defined(SIMD_SSE) || defined(SIMD_AVX)
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

#ifdef SIMD_WASM
#define TEMP v128_t
#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) // TODO: use wasm_v32x4_load_splat to avoid buffer overrun
#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = wasm_v32x4_splat(r##i, 0), t1 = wasm_v32x4_splat(r##i, 1), t2 = wasm_v32x4_splat(r##i, 2), t3 = wasm_v32x4_splat(r##i, 3)
#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
#endif

		PREP();

		unsigned char* savep = transposed + k;

		for (size_t j = 0; j < vertex_count_aligned; j += 16)
		{
			LOAD(0);
			LOAD(1);
			LOAD(2);
			LOAD(3);

			r0 = unzigzag8(r0);
			r1 = unzigzag8(r1);
			r2 = unzigzag8(r2);
			r3 = unzigzag8(r3);

			transpose8(r0, r1, r2, r3);

			TEMP t0, t1, t2, t3;

			GRP4(0);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(1);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(2);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(3);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
#endif

} // namespace meshopt

size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

#if TRACE
	memset(vertexstats, 0, sizeof(vertexstats));
#endif

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return 0;

	*data++ = kVertexHeader;

	unsigned char last_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(last_vertex, vertex_data, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) < tail_size)
		return 0;

	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
	if (vertex_size < kTailMaxSize)
	{
		memset(data, 0, kTailMaxSize - vertex_size);
		data += kTailMaxSize - vertex_size;
	}

	memcpy(data, vertex_data, vertex_size);
	data += vertex_size;

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

#if TRACE
	size_t total_size = data - buffer;

	for (size_t k = 0; k < vertex_size; ++k)
	{
		const Stats& vsk = vertexstats[k];

		printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);

#if TRACE > 1
		printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
		       int(vsk.header),
		       int(vsk.bitg[0]), int(vsk.bitb[0]),
		       int(vsk.bitg[1]), int(vsk.bitb[1]),
		       int(vsk.bitg[2]), int(vsk.bitb[2]),
		       int(vsk.bitg[3]), int(vsk.bitb[3]));
#endif

		printf("\n");
	}
#endif

	return data - buffer;
}

size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);
	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;

	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
	size_t vertex_block_data_size = vertex_block_size;

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
}
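
// Worked example (illustrative comment, not upstream code): for vertex_count == 1000
// and vertex_size == 16, the block size is 256 vertices, so vertex_block_count == 4;
// each byte stream stores a (256 / 16 + 3) / 4 == 4 byte header plus up to 256 data
// bytes, giving a bound of 1 + 4 * 16 * (4 + 256) + 32 == 16673 bytes.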

int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	int cpuinfo[4] = {};
	__cpuid(cpuinfo, 1);
	decode = (cpuinfo[2] & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_WASM)
	// TODO: workaround for https://github.com/emscripten-core/emscripten/issues/9767
	if (!gDecodeBytesGroupInitialized)
		gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	assert(gDecodeBytesGroupInitialized);
	(void)gDecodeBytesGroupInitialized;
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return -2;

	if (*data++ != kVertexHeader)
		return -1;

	unsigned char last_vertex[256];
	memcpy(last_vertex, data_end - vertex_size, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) != tail_size)
		return -3;

	return 0;
}
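
// Round-trip usage sketch (illustrative comment, not part of the library source;
// Vertex is a hypothetical POD type whose size is a multiple of 4 and at most 256,
// per the asserts above):
//
//   std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertex_count, sizeof(Vertex)));
//   vbuf.resize(meshopt_encodeVertexBuffer(vbuf.data(), vbuf.size(), vertices, vertex_count, sizeof(Vertex)));
//
//   std::vector<Vertex> decoded(vertex_count);
//   int res = meshopt_decodeVertexBuffer(decoded.data(), vertex_count, sizeof(Vertex), vbuf.data(), vbuf.size());
//   assert(res == 0); // 0 on success; negative values indicate malformed or truncated input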