# Copyright 2021, Kay Hayen, mailto:kay.hayen@gmail.com
#
# Part of "Nuitka", an optimizing Python compiler that is compatible and
# integrates with CPython, but also works on its own.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
""" C string encoding

This contains the code to create string literals for C to represent the given
values.
"""

import codecs
import re


def _identifierEncode(c):
    """Nuitka handler to encode unicode to ASCII identifiers for C compiler.

    Args:
        c: The encode error raised by the codec; its "object", "start" and
            "end" attributes describe the unencodable part of the input.

    Returns:
        Tuple of the replacement text and the position to resume encoding at.
    """
    # The codec may report a whole run of unencodable characters in a single
    # call, so every character of "start:end" needs a replacement, not just
    # the last one.
    replacement = "".join(
        "$%02x$" % ord(char) for char in c.object[c.start : c.end]
    )

    return replacement, c.end


codecs.register_error("c_identifier", _identifierEncode)


def _encodePythonStringToC(value):
    """Encode a string, so that it gives a C string literal.

    This doesn't handle limits.

    Args:
        value: The bytes to represent, must be of type "bytes" exactly.

    Returns:
        The text of a double quoted C string literal (possibly several
        adjacent literals) that represents the given bytes.
    """
    assert type(value) is bytes, type(value)

    parts = []

    # Tracks if the previously emitted item was an octal escape, as a digit
    # following it would otherwise be consumed as part of that escape.
    octal = False

    for c in value:
        # Iterating bytes gives 1 character strings on Python2, but integers
        # on Python3.
        if str is bytes:
            cv = ord(c)
        else:
            cv = c

        # Characters special to C strings (including "?" to avoid trigraphs)
        # and everything not printable ASCII become octal escapes, that
        # includes DEL (127), which is a control character.
        if c in b'\\\t\r\n"?' or not 32 <= cv < 127:
            parts.append(r"\%o" % cv)

            octal = True
        else:
            if octal and c in b"0123456789":
                # Close and reopen the literal, so the digit does not become
                # part of the preceding octal escape.
                parts.append('" "')

            parts.append(chr(cv))

            octal = False

    return '"%s"' % "".join(parts)


def encodePythonStringToC(value):
    """Encode a string, so that it gives a C string literal.

    Args:
        value: The bytes to represent.

    Returns:
        The text of one or more C string literals separated by spaces, which
        the C compiler concatenates to the given value.
    """

    # Not all compilers allow arbitrary large C strings, therefore split it up
    # into chunks. That changes nothing to the meanings, but is easier on the
    # parser. Currently only MSVC is known to have this issue, but the
    # workaround can be used universally.
    chunk_size = 16000

    # Empty values still need to produce a single (empty) literal.
    if not value:
        return _encodePythonStringToC(value)

    return " ".join(
        _encodePythonStringToC(value[start : start + chunk_size])
        for start in range(0, len(value), chunk_size)
    )


def encodePythonIdentifierToC(value):
    """Encode an identifier from a given Python string.

    Args:
        value: The Python name, may contain dots and non-ASCII characters.

    Returns:
        The name with "." replaced by "$" and every other character outside
        of "a-zA-Z0-9_" replaced by "$$<ordinal>$".
    """

    # Python identifiers allow almost all characters except a very
    # few, much more than C identifiers support. This attempts to
    # be bi-directional, so we can reverse it.

    def r(match):
        c = match.group()

        if c == ".":
            return "$"

        return "$$%d$" % ord(c)

    # The pattern matches single characters only, so one pass over the whole
    # value is sufficient.
    return re.sub("[^a-zA-Z0-9_]", r, value)