// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build s390x

#include "textflag.h"

// Vector register range containing CRC-32 constants.
//
// vectorizedBody fills CONST_PERM_LE2BE..CONST_CRC_POLY (V9..V14) from the
// constant pool with a single VLM, so the register order below has to match
// the order of the 16-byte entries in the DATA tables.

#define CONST_PERM_LE2BE V9
#define CONST_R2R1 V10
#define CONST_R4R3 V11
#define CONST_R5 V12
#define CONST_RU_POLY V13
#define CONST_CRC_POLY V14

// The CRC-32 constant block contains reduction constants to fold and
// process particular chunks of the input data stream in parallel.
//
// Note that the constant definitions below are extended in order to compute
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
// The rightmost doubleword can be 0 to prevent contribution to the result or
// can be multiplied by 1 to perform an XOR without the need for a separate
// VECTOR EXCLUSIVE OR instruction.
//
// The polynomials used are bit-reflected:
//
//	IEEE:       P'(x) = 0x0edb88320
//	Castagnoli: P'(x) = 0x082f63b78
//
// Each pool is laid out as six consecutive 16-byte entries, one per constant
// register, in the order V9..V14.

// IEEE polynomial constants
DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908  // LE-to-BE mask
DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1

// The symbol is declared as 144 bytes; the DATA directives above initialise
// offsets 0..95, which is the range the VLM in vectorizedBody reads.
GLOBL ·crcleconskp(SB), RODATA, $144

// Castagnoli polynomial constants
DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908  // LE-to-BE mask
DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1

// Same layout and declared size as the IEEE pool above: 144 bytes declared,
// offsets 0..95 initialised.
GLOBL ·crccleconskp(SB), RODATA, $144

// func hasVectorFacility() bool
//
// Reports whether the vector instructions used by vectorizedBody can be
// executed. Two checks are made:
//
//  1. The facility list is stored into a 24-byte stack buffer with STFLE and
//     bit 0x40 of the byte at offset 16 of that buffer is tested.
//     NOTE(review): with most-significant-bit-first numbering this is
//     facility bit 129 - confirm against the z/Architecture facility list.
//  2. A vector element is written and read back (VLEIB/VLGVB) and the value
//     is compared with what was written.
//
// The result is true only if both checks pass.
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
	MOVD  $x-24(SP), R1       // R1 = address of the 24-byte scratch buffer
	XC    $24, 0(R1), 0(R1)   // clear the storage
	MOVD  $2, R0              // R0 is the number of doublewords to store, minus 1
	WORD  $0xB2B01000         // STFLE 0(R1)
	XOR   R0, R0              // reset the value of R0 to zero after using it as an STFLE operand
	MOVBZ z-8(SP), R1         // byte at offset 16 of the buffer (first byte of the third doubleword)
	AND   $0x40, R1           // isolate the facility bit being tested
	BEQ   novector            // bit clear: facility not reported

vectorinstalled:
	// Fall-through path (nothing branches to this label).
	// check if the vector instruction has been enabled
	VLEIB  $0, $0xF, V16          // write 0xF into byte element 0 of V16
	VLGVB  $0, V16, R1            // read byte element 0 of V16 back into R1
	CMPBNE R1, $0xF, novector     // value did not round-trip: treat as unavailable
	MOVB   $1, ret+0(FP)          // have vx
	RET

novector:
	MOVB $0, ret+0(FP) // no vx
	RET

// The CRC-32 function(s) use these calling conventions:
//
// Parameters:
//
//	R2:	Initial CRC value, typically ~0; and final CRC (return) value.
//	R3:	Input buffer pointer, performance might be improved if the
//		buffer is on a doubleword boundary.
//	R4:	Length of the buffer, must be 64 bytes or greater.
//		Input is consumed in 64-byte and then 16-byte chunks only; any
//		trailing (length mod 16) bytes are never read.
//
// Register usage:
//
//	R5:	CRC-32 constant pool base pointer.
//	V0:	Initial CRC value and intermediate constants and results.
//	V1..V4:	Data for CRC computation.
//	V5..V8:	Next data chunks that are fetched from the input buffer.
//
//	V9..V14: CRC-32 constants.
// func vectorizedIEEE(crc uint32, p []byte) uint32
//
// Loads the arguments into R2..R4, points R5 at the IEEE constant pool and
// branches (not calls) to vectorizedBody, which stores the result and returns.
TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
	MOVWZ crc+0(FP), R2     // R2 stores the CRC value
	MOVD  p+8(FP), R3       // data pointer
	MOVD  p_len+16(FP), R4  // len(p)

	// R5: CRC-32 constant pool base pointer (IEEE table)
	MOVD $·crcleconskp(SB), R5
	BR   vectorizedBody<>(SB)

// func vectorizedCastagnoli(crc uint32, p []byte) uint32
//
// Identical to vectorizedIEEE except that R5 points at the Castagnoli
// constant pool.
TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
	MOVWZ crc+0(FP), R2     // R2 stores the CRC value
	MOVD  p+8(FP), R3       // data pointer
	MOVD  p_len+16(FP), R4  // len(p)

	// R5: CRC-32 constant pool base pointer; the constants are used to reduce the CRC
	MOVD $·crccleconskp(SB), R5
	BR   vectorizedBody<>(SB)

// vectorizedBody is the shared CRC-32 kernel.
//
// Inputs (set up by the wrappers above):
//
//	R2: initial CRC value
//	R3: pointer to the input data
//	R4: input length in bytes; lengths below 64 take the crash path
//	R5: base address of the constant pool to use
//
// Output: the computed CRC is written to ret + 32(FP) before returning.
//
// It is entered with BR from the wrappers and both sides declare a $0 frame,
// so the FP-relative store at the end is expected to address the wrapper's
// result slot.
//
// Only whole 16-byte chunks are folded: once fewer than 16 bytes remain the
// code goes to final_fold and the input pointer is not used again.
TEXT vectorizedBody<>(SB), NOSPLIT, $0
	XOR $0xffffffff, R2                         // NOTW R2
	VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY // load all six constant vectors (V9..V14) from the pool

	// Load the initial CRC value into the rightmost word of V0
	VZERO V0
	VLVGF $3, R2, V0

	// Crash if the input size is less than 64-bytes.
	CMP R4, $64
	BLT crash

	// Load a 64-byte data chunk and XOR with CRC
	VLM 0(R3), V1, V4 // 64-bytes into V1..V4

	// Apply the LE-to-BE permutation mask to each 16-byte chunk
	// (the CRC operation is in the bit-reflected domain)
	VPERM V1, V1, CONST_PERM_LE2BE, V1
	VPERM V2, V2, CONST_PERM_LE2BE, V2
	VPERM V3, V3, CONST_PERM_LE2BE, V3
	VPERM V4, V4, CONST_PERM_LE2BE, V4

	VX  V0, V1, V1 // V1 ^= CRC
	ADD $64, R3    // BUF = BUF + 64
	ADD $(-64), R4 // LEN = LEN - 64

	// Check remaining buffer size and jump to proper folding method
	CMP R4, $64
	BLT less_than_64bytes

fold_64bytes_loop:
	// Invariant on entry: at least 64 unread bytes remain at R3.
	// Load the next 64-byte data chunk into V5 to V8
	VLM   0(R3), V5, V8
	VPERM V5, V5, CONST_PERM_LE2BE, V5
	VPERM V6, V6, CONST_PERM_LE2BE, V6
	VPERM V7, V7, CONST_PERM_LE2BE, V7
	VPERM V8, V8, CONST_PERM_LE2BE, V8

	// Perform a GF(2) multiplication of the doublewords in V1 with
	// the R2 and R1 reduction constants in CONST_R2R1. The intermediate
	// result is then folded (accumulated) with the next data chunk in V5
	// and stored in V1. Repeat this step for the register contents
	// in V2, V3, and V4 respectively.

	VGFMAG CONST_R2R1, V1, V5, V1
	VGFMAG CONST_R2R1, V2, V6, V2
	VGFMAG CONST_R2R1, V3, V7, V3
	VGFMAG CONST_R2R1, V4, V8, V4

	// Adjust buffer pointer and length for next loop
	ADD $64, R3    // BUF = BUF + 64
	ADD $(-64), R4 // LEN = LEN - 64

	CMP R4, $64
	BGE fold_64bytes_loop

less_than_64bytes:
	// Fold V1 to V4 into a single 128-bit value in V1
	VGFMAG CONST_R4R3, V1, V2, V1
	VGFMAG CONST_R4R3, V1, V3, V1
	VGFMAG CONST_R4R3, V1, V4, V1

	// Check whether at least one more 16-byte chunk remains to be folded
	CMP R4, $16
	BLT final_fold

fold_16bytes_loop:
	// Invariant on entry: at least 16 unread bytes remain at R3.
	VL    0(R3), V2                    // Load next data chunk
	VPERM V2, V2, CONST_PERM_LE2BE, V2

	VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk

	// Adjust buffer pointer and size for folding next data chunk
	ADD $16, R3
	ADD $-16, R4

	// Process remaining data chunks
	CMP R4, $16
	BGE fold_16bytes_loop

final_fold:
	// From here on V9 (CONST_PERM_LE2BE) is reused as the shift-count
	// operand for VSRLB; no VPERM is executed after this point, so the
	// permutation mask is no longer needed.
	//
	// NOTE(review): 0x40 in byte element 7 presumably selects a shift by
	// 8 bytes (0x20 below is annotated as a shift by a word) - confirm
	// against the VSRLB definition.
	VLEIB $7, $0x40, V9
	VSRLB V9, CONST_R4R3, V0 // V0 = CONST_R4R3 shifted right by the amount in V9
	VLEIG $0, $1, V0         // set doubleword element 0 of V0 to 1

	// Multiply V1 by V0 in GF(2); the doubleword multiplied by 1 is in
	// effect XORed into the other product (see the note on the constant
	// block at the top of the file).
	VGFMG V0, V1, V1

	VLEIB  $7, $0x20, V9        // Shift by words
	VSRLB  V9, V1, V2           // Store remaining bits in V2
	VUPLLF V1, V1               // Split rightmost doubleword
	VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2

	// The input values to the Barrett reduction are the degree-63 polynomial
	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
	// constant u. The Barrett reduction result is the CRC value of R(x) mod
	// P(x).
	//
	// The Barrett reduction algorithm is defined as:
	//
	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
	//    3. C(x)  = R(x) XOR T2(x) mod x^32
	//
	// Note: To compensate the division by x^32, use the vector unpack
	// instruction to move the leftmost word into the leftmost doubleword
	// of the vector register. The rightmost doubleword is multiplied
	// with zero to not contribute to the intermediate results.

	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
	VUPLLF V1, V2
	VGFMG  CONST_RU_POLY, V2, V2

	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY
	// with T1(x) in V2 and XOR the intermediate result, T2(x), with the
	// value in V1. The final result is read from word element 2 of V2.

	VUPLLF V2, V2
	VGFMAG CONST_CRC_POLY, V2, V1, V2

done:
	VLGVF $2, V2, R2         // extract word element 2 of V2 into R2
	XOR   $0xffffffff, R2    // NOTW R2
	MOVWZ R2, ret + 32(FP)   // store the 32-bit result
	RET

crash:
	// Deliberate fault: stores through R0, which is expected to hold zero.
	// There is no RET after this instruction.
	MOVD $0, (R0) // input size is less than 64-bytes