1 /* melder_readtext.cpp 2 * 3 * Copyright (C) 2008,2010-2012,2014-2020 Paul Boersma 4 * 5 * This code is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or (at 8 * your option) any later version. 9 * 10 * This code is distributed in the hope that it will be useful, but 11 * WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 * See the GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this work. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 #include "melder.h" 20 #include "../kar/UnicodeData.h" 21 22 char32 MelderReadText_getChar (MelderReadText me) { 23 if (my string32) { 24 if (* my readPointer32 == U'\0') 25 return U'\0'; 26 return * my readPointer32 ++; 27 } else { 28 if (* my readPointer8 == '\0') return U'\0'; 29 if (my input8Encoding == kMelder_textInputEncoding::UTF8) { 30 char32 kar1 = (char32) (char8) * my readPointer8 ++; 31 if (kar1 <= 0x00'007F) { 32 return kar1; 33 } else if (kar1 <= 0x00'00DF) { 34 char32 kar2 = (char32) (char8) * my readPointer8 ++; 35 return ((kar1 & 0x00'001F) << 6) | (kar2 & 0x00'003F); 36 } else if (kar1 <= 0x00'00EF) { 37 char32 kar2 = (char32) (char8) * my readPointer8 ++; 38 char32 kar3 = (char32) (char8) * my readPointer8 ++; 39 return ((kar1 & 0x00'000F) << 12) | ((kar2 & 0x00'003F) << 6) | (kar3 & 0x00'003F); 40 } else if (kar1 <= 0x00'00F4) { 41 char32 kar2 = (char32) (char8) * my readPointer8 ++; 42 char32 kar3 = (char32) (char8) * my readPointer8 ++; 43 char32 kar4 = (char32) (char8) * my readPointer8 ++; 44 return ((kar1 & 0x00'0007) << 18) | ((kar2 & 0x00'003F) << 12) | ((kar3 & 0x00'003F) << 6) | (kar4 & 0x00'003F); 45 } else { 46 return UNICODE_REPLACEMENT_CHARACTER; 47 } 48 } else if (my input8Encoding == kMelder_textInputEncoding::MACROMAN) { 49 return Melder_decodeMacRoman [(char8) * my readPointer8 ++]; 50 } else if (my input8Encoding == kMelder_textInputEncoding::WINDOWS_LATIN1) { 51 return Melder_decodeWindowsLatin1 [(char8) * my readPointer8 ++]; 52 } else { 53 /* Unknown encoding. */ 54 return (char32) (char8) * my readPointer8 ++; 55 } 56 } 57 } 58 59 mutablestring32 MelderReadText_readLine (MelderReadText me) { 60 if (my string32) { 61 Melder_assert (my readPointer32); dumpFunctionProfile(StringRef FName,raw_ostream & OS)62 Melder_assert (! my readPointer8); 63 if (*my readPointer32 == U'\0') // tried to read past end of file 64 return nullptr; 65 char32 *result = my readPointer32; 66 char32 *newline = str32chr (result, U'\n'); 67 if (newline) { dump(raw_ostream & OS)68 *newline = U'\0'; 69 my readPointer32 = newline + 1; 70 } else { 71 my readPointer32 += str32len (result); 72 } 73 return result; 74 } else { 75 Melder_assert (my string8); 76 Melder_assert (! my readPointer32); 77 Melder_assert (my readPointer8); 78 if (*my readPointer8 == '\0') // tried to read past end of file 79 return nullptr; ParseHead(const StringRef & Input,StringRef & FName,uint64_t & NumSamples,uint64_t & NumHeadSamples)80 char *result8 = my readPointer8; 81 char *newline = strchr (result8, '\n'); 82 if (newline) { 83 *newline = '\0'; 84 my readPointer8 = newline + 1; 85 } else { 86 my readPointer8 += strlen (result8); 87 } 88 static char32 *text32 = nullptr; 89 static int64 size = 0; 90 int64 sizeNeeded = (int64) strlen (result8) + 1; 91 if (sizeNeeded > size) { 92 Melder_free (text32); 93 text32 = Melder_malloc_f (char32, sizeNeeded + 100); 94 size = sizeNeeded + 100; isOffsetLegal(unsigned L)95 } 96 Melder_8to32_inplace (result8, text32, my input8Encoding); 97 return text32; 98 } 99 } 100 101 int64 MelderReadText_getNumberOfLines (MelderReadText me) { 102 int64 n = 0; 103 if (my string32) { parseMetadata(const StringRef & Input,uint64_t & FunctionHash,uint32_t & Attributes)104 char32 *p = & my string32 [0]; 105 for (; *p != U'\0'; p ++) { 106 if (*p == U'\n') 107 n ++; 108 } 109 if (p - & my string32 [0] > 1 && p [-1] != U'\n') 110 n ++; 111 } else { 112 char *p = & my string8 [0]; 113 for (; *p != '\0'; p ++) { 114 if (*p == '\n') 115 n ++; 116 } 117 if (p - & my string8 [0] > 1 && p [-1] != '\n') 118 n ++; 119 } 120 return n; 121 } 122 123 conststring32 MelderReadText_getLineNumber (MelderReadText me) { 124 int64 result = 1; 125 if (my string32) { 126 char32 *p = & my string32 [0]; 127 while (my readPointer32 - p > 0) { 128 if (*p == U'\0' || *p == U'\n') 129 result ++; 130 p ++; 131 } 132 } else { 133 char *p = & my string8 [0]; 134 while (my readPointer8 - p > 0) { 135 if (*p == '\0' || *p == '\n') 136 result ++; ParseLine(const StringRef & Input,LineType & LineTy,uint32_t & Depth,uint64_t & NumSamples,uint32_t & LineOffset,uint32_t & Discriminator,StringRef & CalleeName,DenseMap<StringRef,uint64_t> & TargetCountMap,uint64_t & FunctionHash,uint32_t & Attributes)137 p ++; 138 } 139 } 140 return Melder_integer (result); 141 } 142 143 autoMelderReadText MelderReadText_createFromFile (MelderFile file) { 144 autoMelderReadText me = std::make_unique <structMelderReadText> (); 145 my string32 = MelderFile_readText (file, & my string8); 146 if (my string32) { 147 my readPointer32 = & my string32 [0]; 148 } else { 149 Melder_assert (my string8); 150 my readPointer8 = & my string8 [0]; 151 my input8Encoding = Melder_getInputEncoding (); 152 if (my input8Encoding == kMelder_textInputEncoding::UTF8 || 153 my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1 || 154 my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1 || 155 my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN) 156 { 157 if (Melder_str8IsValidUtf8 (my string8.get())) { 158 my input8Encoding = kMelder_textInputEncoding::UTF8; 159 } else if (my input8Encoding == kMelder_textInputEncoding::UTF8) { 160 Melder_throw (U"Text is not valid UTF-8; please try a different text input encoding."); 161 } else if (my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1) { 162 my input8Encoding = kMelder_textInputEncoding::ISO_LATIN1; 163 } else if (my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1) { 164 my input8Encoding = kMelder_textInputEncoding::WINDOWS_LATIN1; 165 } else if (my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN) { 166 my input8Encoding = kMelder_textInputEncoding::MACROMAN; 167 } 168 } 169 } 170 return me; 171 } 172 173 /* End of file melder_readtext.cpp */ 174