1 /* C implementation of performance sensitive functions. */
2 
3 #define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdint.h> /* uint8_t, uint32_t, uint64_t */
#include <string.h> /* memcpy */
6 
7 #if __ARM_NEON
8 #include <arm_neon.h>
9 #elif __SSE2__
10 #include <emmintrin.h>
11 #endif
12 
13 static const Py_ssize_t MASK_LEN = 4;
14 
15 /* Similar to PyBytes_AsStringAndSize, but accepts more types */
16 
17 static int
_PyBytesLike_AsStringAndSize(PyObject * obj,PyObject ** tmp,char ** buffer,Py_ssize_t * length)18 _PyBytesLike_AsStringAndSize(PyObject *obj, PyObject **tmp, char **buffer, Py_ssize_t *length)
19 {
20     // This supports bytes, bytearrays, and memoryview objects,
21     // which are common data structures for handling byte streams.
22     // websockets.framing.prepare_data() returns only these types.
23     // If *tmp isn't NULL, the caller gets a new reference.
24     if (PyBytes_Check(obj))
25     {
26         *tmp = NULL;
27         *buffer = PyBytes_AS_STRING(obj);
28         *length = PyBytes_GET_SIZE(obj);
29     }
30     else if (PyByteArray_Check(obj))
31     {
32         *tmp = NULL;
33         *buffer = PyByteArray_AS_STRING(obj);
34         *length = PyByteArray_GET_SIZE(obj);
35     }
36     else if (PyMemoryView_Check(obj))
37     {
38         *tmp = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C');
39         if (*tmp == NULL)
40         {
41             return -1;
42         }
43         Py_buffer *mv_buf;
44         mv_buf = PyMemoryView_GET_BUFFER(*tmp);
45         *buffer = mv_buf->buf;
46         *length = mv_buf->len;
47     }
48     else
49     {
50         PyErr_Format(
51             PyExc_TypeError,
52             "expected a bytes-like object, %.200s found",
53             Py_TYPE(obj)->tp_name);
54         return -1;
55     }
56 
57     return 0;
58 }
59 
60 /* C implementation of websockets.utils.apply_mask */
61 
62 static PyObject *
apply_mask(PyObject * self,PyObject * args,PyObject * kwds)63 apply_mask(PyObject *self, PyObject *args, PyObject *kwds)
64 {
65 
66     // In order to support various bytes-like types, accept any Python object.
67 
68     static char *kwlist[] = {"data", "mask", NULL};
69     PyObject *input_obj;
70     PyObject *mask_obj;
71 
72     // A pointer to a char * + length will be extracted from the data and mask
73     // arguments, possibly via a Py_buffer.
74 
75     PyObject *input_tmp = NULL;
76     char *input;
77     Py_ssize_t input_len;
78     PyObject *mask_tmp = NULL;
79     char *mask;
80     Py_ssize_t mask_len;
81 
82     // Initialize a PyBytesObject then get a pointer to the underlying char *
83     // in order to avoid an extra memory copy in PyBytes_FromStringAndSize.
84 
85     PyObject *result = NULL;
86     char *output;
87 
88     // Other variables.
89 
90     Py_ssize_t i = 0;
91 
92     // Parse inputs.
93 
94     if (!PyArg_ParseTupleAndKeywords(
95             args, kwds, "OO", kwlist, &input_obj, &mask_obj))
96     {
97         goto exit;
98     }
99 
100     if (_PyBytesLike_AsStringAndSize(input_obj, &input_tmp, &input, &input_len) == -1)
101     {
102         goto exit;
103     }
104 
105     if (_PyBytesLike_AsStringAndSize(mask_obj, &mask_tmp, &mask, &mask_len) == -1)
106     {
107         goto exit;
108     }
109 
110     if (mask_len != MASK_LEN)
111     {
112         PyErr_SetString(PyExc_ValueError, "mask must contain 4 bytes");
113         goto exit;
114     }
115 
116     // Create output.
117 
118     result = PyBytes_FromStringAndSize(NULL, input_len);
119     if (result == NULL)
120     {
121         goto exit;
122     }
123 
124     // Since we juste created result, we don't need error checks.
125     output = PyBytes_AS_STRING(result);
126 
127     // Perform the masking operation.
128 
129     // Apparently GCC cannot figure out the following optimizations by itself.
130 
131     // We need a new scope for MSVC 2010 (non C99 friendly)
132     {
133 #if __ARM_NEON
134 
135         // With NEON support, XOR by blocks of 16 bytes = 128 bits.
136 
137         Py_ssize_t input_len_128 = input_len & ~15;
138         uint8x16_t mask_128 = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t *)mask));
139 
140         for (; i < input_len_128; i += 16)
141         {
142             uint8x16_t in_128 = vld1q_u8((uint8_t *)(input + i));
143             uint8x16_t out_128 = veorq_u8(in_128, mask_128);
144             vst1q_u8((uint8_t *)(output + i), out_128);
145         }
146 
147 #elif __SSE2__
148 
149         // With SSE2 support, XOR by blocks of 16 bytes = 128 bits.
150 
151         // Since we cannot control the 16-bytes alignment of input and output
152         // buffers, we rely on loadu/storeu rather than load/store.
153 
154         Py_ssize_t input_len_128 = input_len & ~15;
155         __m128i mask_128 = _mm_set1_epi32(*(uint32_t *)mask);
156 
157         for (; i < input_len_128; i += 16)
158         {
159             __m128i in_128 = _mm_loadu_si128((__m128i *)(input + i));
160             __m128i out_128 = _mm_xor_si128(in_128, mask_128);
161             _mm_storeu_si128((__m128i *)(output + i), out_128);
162         }
163 
164 #else
165 
166         // Without SSE2 support, XOR by blocks of 8 bytes = 64 bits.
167 
168         // We assume the memory allocator aligns everything on 8 bytes boundaries.
169 
170         Py_ssize_t input_len_64 = input_len & ~7;
171         uint32_t mask_32 = *(uint32_t *)mask;
172         uint64_t mask_64 = ((uint64_t)mask_32 << 32) | (uint64_t)mask_32;
173 
174         for (; i < input_len_64; i += 8)
175         {
176             *(uint64_t *)(output + i) = *(uint64_t *)(input + i) ^ mask_64;
177         }
178 
179 #endif
180     }
181 
182     // XOR the remainder of the input byte by byte.
183 
184     for (; i < input_len; i++)
185     {
186         output[i] = input[i] ^ mask[i & (MASK_LEN - 1)];
187     }
188 
189 exit:
190     Py_XDECREF(input_tmp);
191     Py_XDECREF(mask_tmp);
192     return result;
193 
194 }
195 
196 static PyMethodDef speedups_methods[] = {
197     {
198         "apply_mask",
199         (PyCFunction)apply_mask,
200         METH_VARARGS | METH_KEYWORDS,
201         "Apply masking to the data of a WebSocket message.",
202     },
203     {NULL, NULL, 0, NULL},      /* Sentinel */
204 };
205 
206 static struct PyModuleDef speedups_module = {
207     PyModuleDef_HEAD_INIT,
208     "websocket.speedups",       /* m_name */
209     "C implementation of performance sensitive functions.",
210                                 /* m_doc */
211     -1,                         /* m_size */
212     speedups_methods,           /* m_methods */
213     NULL,
214     NULL,
215     NULL,
216     NULL
217 };
218 
219 PyMODINIT_FUNC
PyInit_speedups(void)220 PyInit_speedups(void)
221 {
222     return PyModule_Create(&speedups_module);
223 }
224