1 /* melder_readtext.cpp
2  *
3  * Copyright (C) 2008,2010-2012,2014-2020 Paul Boersma
4  *
5  * This code is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or (at
8  * your option) any later version.
9  *
10  * This code is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13  * See the GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this work. If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "melder.h"
20 #include "../kar/UnicodeData.h"
21 
22 char32 MelderReadText_getChar (MelderReadText me) {
23 	if (my string32) {
24 		if (* my readPointer32 == U'\0')
25 			return U'\0';
26 		return * my readPointer32 ++;
27 	} else {
28 		if (* my readPointer8 == '\0') return U'\0';
29 		if (my input8Encoding == kMelder_textInputEncoding::UTF8) {
30 			char32 kar1 = (char32) (char8) * my readPointer8 ++;
31 			if (kar1 <= 0x00'007F) {
32 				return kar1;
33 			} else if (kar1 <= 0x00'00DF) {
34 				char32 kar2 = (char32) (char8) * my readPointer8 ++;
35 				return ((kar1 & 0x00'001F) << 6) | (kar2 & 0x00'003F);
36 			} else if (kar1 <= 0x00'00EF) {
37 				char32 kar2 = (char32) (char8) * my readPointer8 ++;
38 				char32 kar3 = (char32) (char8) * my readPointer8 ++;
39 				return ((kar1 & 0x00'000F) << 12) | ((kar2 & 0x00'003F) << 6) | (kar3 & 0x00'003F);
40 			} else if (kar1 <= 0x00'00F4) {
41 				char32 kar2 = (char32) (char8) * my readPointer8 ++;
42 				char32 kar3 = (char32) (char8) * my readPointer8 ++;
43 				char32 kar4 = (char32) (char8) * my readPointer8 ++;
44 				return ((kar1 & 0x00'0007) << 18) | ((kar2 & 0x00'003F) << 12) | ((kar3 & 0x00'003F) << 6) | (kar4 & 0x00'003F);
45 			} else {
46 				return UNICODE_REPLACEMENT_CHARACTER;
47 			}
48 		} else if (my input8Encoding == kMelder_textInputEncoding::MACROMAN) {
49 			return Melder_decodeMacRoman [(char8) * my readPointer8 ++];
50 		} else if (my input8Encoding == kMelder_textInputEncoding::WINDOWS_LATIN1) {
51 			return Melder_decodeWindowsLatin1 [(char8) * my readPointer8 ++];
52 		} else {
53 			/* Unknown encoding. */
54 			return (char32) (char8) * my readPointer8 ++;
55 		}
56 	}
57 }
58 
59 mutablestring32 MelderReadText_readLine (MelderReadText me) {
60 	if (my string32) {
61 		Melder_assert (my readPointer32);
dumpFunctionProfile(StringRef FName,raw_ostream & OS)62 		Melder_assert (! my readPointer8);
63 		if (*my readPointer32 == U'\0')   // tried to read past end of file
64 			return nullptr;
65 		char32 *result = my readPointer32;
66 		char32 *newline = str32chr (result, U'\n');
67 		if (newline) {
dump(raw_ostream & OS)68 			*newline = U'\0';
69 			my readPointer32 = newline + 1;
70 		} else {
71 			my readPointer32 += str32len (result);
72 		}
73 		return result;
74 	} else {
75 		Melder_assert (my string8);
76 		Melder_assert (! my readPointer32);
77 		Melder_assert (my readPointer8);
78 		if (*my readPointer8 == '\0')   // tried to read past end of file
79 			return nullptr;
ParseHead(const StringRef & Input,StringRef & FName,uint64_t & NumSamples,uint64_t & NumHeadSamples)80 		char *result8 = my readPointer8;
81 		char *newline = strchr (result8, '\n');
82 		if (newline) {
83 			*newline = '\0';
84 			my readPointer8 = newline + 1;
85 		} else {
86 			my readPointer8 += strlen (result8);
87 		}
88 		static char32 *text32 = nullptr;
89 		static int64 size = 0;
90 		int64 sizeNeeded = (int64) strlen (result8) + 1;
91 		if (sizeNeeded > size) {
92 			Melder_free (text32);
93 			text32 = Melder_malloc_f (char32, sizeNeeded + 100);
94 			size = sizeNeeded + 100;
isOffsetLegal(unsigned L)95 		}
96 		Melder_8to32_inplace (result8, text32, my input8Encoding);
97 		return text32;
98 	}
99 }
100 
101 int64 MelderReadText_getNumberOfLines (MelderReadText me) {
102 	int64 n = 0;
103 	if (my string32) {
parseMetadata(const StringRef & Input,uint64_t & FunctionHash,uint32_t & Attributes)104 		char32 *p = & my string32 [0];
105 		for (; *p != U'\0'; p ++) {
106 			if (*p == U'\n')
107 				n ++;
108 		}
109 		if (p - & my string32 [0] > 1 && p [-1] != U'\n')
110 			n ++;
111 	} else {
112 		char *p = & my string8 [0];
113 		for (; *p != '\0'; p ++) {
114 			if (*p == '\n')
115 				n ++;
116 		}
117 		if (p - & my string8 [0] > 1 && p [-1] != '\n')
118 			n ++;
119 	}
120 	return n;
121 }
122 
123 conststring32 MelderReadText_getLineNumber (MelderReadText me) {
124 	int64 result = 1;
125 	if (my string32) {
126 		char32 *p = & my string32 [0];
127 		while (my readPointer32 - p > 0) {
128 			if (*p == U'\0' || *p == U'\n')
129 				result ++;
130 			p ++;
131 		}
132 	} else {
133 		char *p = & my string8 [0];
134 		while (my readPointer8 - p > 0) {
135 			if (*p == '\0' || *p == '\n')
136 				result ++;
ParseLine(const StringRef & Input,LineType & LineTy,uint32_t & Depth,uint64_t & NumSamples,uint32_t & LineOffset,uint32_t & Discriminator,StringRef & CalleeName,DenseMap<StringRef,uint64_t> & TargetCountMap,uint64_t & FunctionHash,uint32_t & Attributes)137 			p ++;
138 		}
139 	}
140 	return Melder_integer (result);
141 }
142 
143 autoMelderReadText MelderReadText_createFromFile (MelderFile file) {
144 	autoMelderReadText me = std::make_unique <structMelderReadText> ();
145 	my string32 = MelderFile_readText (file, & my string8);
146 	if (my string32) {
147 		my readPointer32 = & my string32 [0];
148 	} else {
149 		Melder_assert (my string8);
150 		my readPointer8 = & my string8 [0];
151 		my input8Encoding = Melder_getInputEncoding ();
152 		if (my input8Encoding == kMelder_textInputEncoding::UTF8 ||
153 			my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1 ||
154 			my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1 ||
155 			my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN)
156 		{
157 			if (Melder_str8IsValidUtf8 (my string8.get())) {
158 				my input8Encoding = kMelder_textInputEncoding::UTF8;
159 			} else if (my input8Encoding == kMelder_textInputEncoding::UTF8) {
160 				Melder_throw (U"Text is not valid UTF-8; please try a different text input encoding.");
161 			} else if (my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_ISO_LATIN1) {
162 				my input8Encoding = kMelder_textInputEncoding::ISO_LATIN1;
163 			} else if (my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_WINDOWS_LATIN1) {
164 				my input8Encoding = kMelder_textInputEncoding::WINDOWS_LATIN1;
165 			} else if (my input8Encoding == kMelder_textInputEncoding::UTF8_THEN_MACROMAN) {
166 				my input8Encoding = kMelder_textInputEncoding::MACROMAN;
167 			}
168 		}
169 	}
170 	return me;
171 }
172 
173 /* End of file melder_readtext.cpp */
174