1 /* C implementation of performance sensitive functions. */
2
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdint.h> /* uint32_t, uint64_t */
#include <string.h> /* memcpy */

#if __SSE2__
#include <emmintrin.h>
#endif
10
11 static const Py_ssize_t MASK_LEN = 4;
12
13 /* Similar to PyBytes_AsStringAndSize, but accepts more types */
14
15 static int
_PyBytesLike_AsStringAndSize(PyObject * obj,char ** buffer,Py_ssize_t * length)16 _PyBytesLike_AsStringAndSize(PyObject *obj, char **buffer, Py_ssize_t *length)
17 {
18 // This supports bytes, bytearrays, and C-contiguous memoryview objects,
19 // which are the most useful data structures for handling byte streams.
20 // websockets.framing.prepare_data() returns only values of these types.
21 // Any object implementing the buffer protocol could be supported, however
22 // that would require allocation or copying memory, which is expensive.
23 if (PyBytes_Check(obj))
24 {
25 *buffer = PyBytes_AS_STRING(obj);
26 *length = PyBytes_GET_SIZE(obj);
27 }
28 else if (PyByteArray_Check(obj))
29 {
30 *buffer = PyByteArray_AS_STRING(obj);
31 *length = PyByteArray_GET_SIZE(obj);
32 }
33 else if (PyMemoryView_Check(obj))
34 {
35 Py_buffer *mv_buf;
36 mv_buf = PyMemoryView_GET_BUFFER(obj);
37 if (PyBuffer_IsContiguous(mv_buf, 'C'))
38 {
39 *buffer = mv_buf->buf;
40 *length = mv_buf->len;
41 }
42 else
43 {
44 PyErr_Format(
45 PyExc_TypeError,
46 "expected a contiguous memoryview");
47 return -1;
48 }
49 }
50 else
51 {
52 PyErr_Format(
53 PyExc_TypeError,
54 "expected a bytes-like object, %.200s found",
55 Py_TYPE(obj)->tp_name);
56 return -1;
57 }
58
59 return 0;
60 }
61
62 /* C implementation of websockets.utils.apply_mask */
63
64 static PyObject *
apply_mask(PyObject * self,PyObject * args,PyObject * kwds)65 apply_mask(PyObject *self, PyObject *args, PyObject *kwds)
66 {
67
68 // In order to support various bytes-like types, accept any Python object.
69
70 static char *kwlist[] = {"data", "mask", NULL};
71 PyObject *input_obj;
72 PyObject *mask_obj;
73
74 // A pointer to a char * + length will be extracted from the data and mask
75 // arguments, possibly via a Py_buffer.
76
77 char *input;
78 Py_ssize_t input_len;
79 char *mask;
80 Py_ssize_t mask_len;
81
82 // Initialize a PyBytesObject then get a pointer to the underlying char *
83 // in order to avoid an extra memory copy in PyBytes_FromStringAndSize.
84
85 PyObject *result;
86 char *output;
87
88 // Other variables.
89
90 Py_ssize_t i = 0;
91
92 // Parse inputs.
93
94 if (!PyArg_ParseTupleAndKeywords(
95 args, kwds, "OO", kwlist, &input_obj, &mask_obj))
96 {
97 return NULL;
98 }
99
100 if (_PyBytesLike_AsStringAndSize(input_obj, &input, &input_len) == -1)
101 {
102 return NULL;
103 }
104
105 if (_PyBytesLike_AsStringAndSize(mask_obj, &mask, &mask_len) == -1)
106 {
107 return NULL;
108 }
109
110 if (mask_len != MASK_LEN)
111 {
112 PyErr_SetString(PyExc_ValueError, "mask must contain 4 bytes");
113 return NULL;
114 }
115
116 // Create output.
117
118 result = PyBytes_FromStringAndSize(NULL, input_len);
119 if (result == NULL)
120 {
121 return NULL;
122 }
123
124 // Since we juste created result, we don't need error checks.
125 output = PyBytes_AS_STRING(result);
126
127 // Perform the masking operation.
128
129 // Apparently GCC cannot figure out the following optimizations by itself.
130
131 // We need a new scope for MSVC 2010 (non C99 friendly)
132 {
133 #if __SSE2__
134
135 // With SSE2 support, XOR by blocks of 16 bytes = 128 bits.
136
137 // Since we cannot control the 16-bytes alignment of input and output
138 // buffers, we rely on loadu/storeu rather than load/store.
139
140 Py_ssize_t input_len_128 = input_len & ~15;
141 __m128i mask_128 = _mm_set1_epi32(*(uint32_t *)mask);
142
143 for (; i < input_len_128; i += 16)
144 {
145 __m128i in_128 = _mm_loadu_si128((__m128i *)(input + i));
146 __m128i out_128 = _mm_xor_si128(in_128, mask_128);
147 _mm_storeu_si128((__m128i *)(output + i), out_128);
148 }
149
150 #else
151
152 // Without SSE2 support, XOR by blocks of 8 bytes = 64 bits.
153
154 // We assume the memory allocator aligns everything on 8 bytes boundaries.
155
156 Py_ssize_t input_len_64 = input_len & ~7;
157 uint32_t mask_32 = *(uint32_t *)mask;
158 uint64_t mask_64 = ((uint64_t)mask_32 << 32) | (uint64_t)mask_32;
159
160 for (; i < input_len_64; i += 8)
161 {
162 *(uint64_t *)(output + i) = *(uint64_t *)(input + i) ^ mask_64;
163 }
164
165 #endif
166 }
167
168 // XOR the remainder of the input byte by byte.
169
170 for (; i < input_len; i++)
171 {
172 output[i] = input[i] ^ mask[i & (MASK_LEN - 1)];
173 }
174
175 return result;
176
177 }
178
179 static PyMethodDef speedups_methods[] = {
180 {
181 "apply_mask",
182 (PyCFunction)apply_mask,
183 METH_VARARGS | METH_KEYWORDS,
184 "Apply masking to websocket message.",
185 },
186 {NULL, NULL, 0, NULL}, /* Sentinel */
187 };
188
189 static struct PyModuleDef speedups_module = {
190 PyModuleDef_HEAD_INIT,
191 "websocket.speedups", /* m_name */
192 "C implementation of performance sensitive functions.",
193 /* m_doc */
194 -1, /* m_size */
195 speedups_methods, /* m_methods */
196 NULL,
197 NULL,
198 NULL,
199 NULL
200 };
201
202 PyMODINIT_FUNC
PyInit_speedups(void)203 PyInit_speedups(void)
204 {
205 return PyModule_Create(&speedups_module);
206 }
207