xref: /reactos/base/applications/notepad/text.c (revision 8a978a17)
1 /*
2  *  Notepad (text.c)
3  *
4  *  Copyright 1998,99 Marcel Baur <mbaur@g26.ethz.ch>
5  *  Copyright 2002 Sylvain Petreolle <spetreolle@yahoo.fr>
6  *  Copyright 2002 Andriy Palamarchuk
7  *  Copyright 2019 Katayama Hirofumi MZ <katayama.hirofumi.mz@gmail.com>
8  *
9  * This library is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * This library is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with this library; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
22  */
23 
24 #include "notepad.h"
25 
26 static BOOL Append(LPWSTR *ppszText, DWORD *pdwTextLen, LPCWSTR pszAppendText, DWORD dwAppendLen)
27 {
28     LPWSTR pszNewText;
29 
30     if (dwAppendLen > 0)
31     {
32         if (*ppszText)
33         {
34             pszNewText = (LPWSTR) HeapReAlloc(GetProcessHeap(), 0, *ppszText, (*pdwTextLen + dwAppendLen) * sizeof(WCHAR));
35         }
36         else
37         {
38             pszNewText = (LPWSTR) HeapAlloc(GetProcessHeap(), 0, dwAppendLen * sizeof(WCHAR));
39         }
40 
41         if (!pszNewText)
42             return FALSE;
43 
44         memcpy(pszNewText + *pdwTextLen, pszAppendText, dwAppendLen * sizeof(WCHAR));
45         *ppszText = pszNewText;
46         *pdwTextLen += dwAppendLen;
47     }
48     return TRUE;
49 }
50 
51 BOOL IsTextNonZeroASCII(const void *pText, DWORD dwSize)
52 {
53     const signed char *pBytes = pText;
54     while (dwSize-- > 0)
55     {
56         if (*pBytes <= 0)
57             return FALSE;
58 
59         ++pBytes;
60     }
61     return TRUE;
62 }
63 
64 ENCODING AnalyzeEncoding(const char *pBytes, DWORD dwSize)
65 {
66     INT flags = IS_TEXT_UNICODE_STATISTICS;
67 
68     if (dwSize <= 1)
69         return ENCODING_ANSI;
70 
71     if (IsTextNonZeroASCII(pBytes, dwSize))
72     {
73         return ENCODING_ANSI;
74     }
75 
76     if (IsTextUnicode(pBytes, dwSize, &flags))
77     {
78         return ENCODING_UTF16LE;
79     }
80 
81     if ((flags & IS_TEXT_UNICODE_REVERSE_MASK) && !(flags & IS_TEXT_UNICODE_ILLEGAL_CHARS))
82     {
83         return ENCODING_UTF16BE;
84     }
85 
86     /* is it UTF-8? */
87     if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pBytes, dwSize, NULL, 0))
88     {
89         return ENCODING_UTF8;
90     }
91 
92     return ENCODING_ANSI;
93 }
94 
95 BOOL
96 ReadText(HANDLE hFile, LPWSTR *ppszText, DWORD *pdwTextLen, ENCODING *pencFile, int *piEoln)
97 {
98     DWORD dwSize;
99     LPBYTE pBytes = NULL;
100     LPWSTR pszText;
101     LPWSTR pszAllocText = NULL;
102     DWORD dwPos, i;
103     DWORD dwCharCount;
104     BOOL bSuccess = FALSE;
105     BYTE b = 0;
106     ENCODING encFile = ENCODING_ANSI;
107     int iCodePage = 0;
108     WCHAR szCrlf[2] = {'\r', '\n'};
109     DWORD adwEolnCount[3] = {0, 0, 0};
110 
111     *ppszText = NULL;
112     *pdwTextLen = 0;
113 
114     dwSize = GetFileSize(hFile, NULL);
115     if (dwSize == INVALID_FILE_SIZE)
116         goto done;
117 
118     pBytes = HeapAlloc(GetProcessHeap(), 0, dwSize + 2);
119     if (!pBytes)
120         goto done;
121 
122     if (!ReadFile(hFile, pBytes, dwSize, &dwSize, NULL))
123         goto done;
124     dwPos = 0;
125 
126     /* Make sure that there is a NUL character at the end, in any encoding */
127     pBytes[dwSize + 0] = '\0';
128     pBytes[dwSize + 1] = '\0';
129 
130     /* Look for Byte Order Marks */
131     if ((dwSize >= 2) && (pBytes[0] == 0xFF) && (pBytes[1] == 0xFE))
132     {
133         encFile = ENCODING_UTF16LE;
134         dwPos += 2;
135     }
136     else if ((dwSize >= 2) && (pBytes[0] == 0xFE) && (pBytes[1] == 0xFF))
137     {
138         encFile = ENCODING_UTF16BE;
139         dwPos += 2;
140     }
141     else if ((dwSize >= 3) && (pBytes[0] == 0xEF) && (pBytes[1] == 0xBB) && (pBytes[2] == 0xBF))
142     {
143         encFile = ENCODING_UTF8;
144         dwPos += 3;
145     }
146     else
147     {
148         encFile = AnalyzeEncoding((const char *)pBytes, dwSize);
149     }
150 
151     switch(encFile)
152     {
153     case ENCODING_UTF16BE:
154         for (i = dwPos; i < dwSize-1; i += 2)
155         {
156             b = pBytes[i+0];
157             pBytes[i+0] = pBytes[i+1];
158             pBytes[i+1] = b;
159         }
160         /* fall through */
161 
162     case ENCODING_UTF16LE:
163         pszText = (LPWSTR) &pBytes[dwPos];
164         dwCharCount = (dwSize - dwPos) / sizeof(WCHAR);
165         break;
166 
167     case ENCODING_ANSI:
168     case ENCODING_UTF8:
169         if (encFile == ENCODING_ANSI)
170             iCodePage = CP_ACP;
171         else if (encFile == ENCODING_UTF8)
172             iCodePage = CP_UTF8;
173 
174         if ((dwSize - dwPos) > 0)
175         {
176             dwCharCount = MultiByteToWideChar(iCodePage, 0, (LPCSTR)&pBytes[dwPos], dwSize - dwPos, NULL, 0);
177             if (dwCharCount == 0)
178                 goto done;
179         }
180         else
181         {
182             /* special case for files with no characters (other than BOMs) */
183             dwCharCount = 0;
184         }
185 
186         pszAllocText = (LPWSTR) HeapAlloc(GetProcessHeap(), 0, (dwCharCount + 1) * sizeof(WCHAR));
187         if (!pszAllocText)
188             goto done;
189 
190         if ((dwSize - dwPos) > 0)
191         {
192             if (!MultiByteToWideChar(iCodePage, 0, (LPCSTR)&pBytes[dwPos], dwSize - dwPos, pszAllocText, dwCharCount))
193                 goto done;
194         }
195 
196         pszAllocText[dwCharCount] = '\0';
197         pszText = pszAllocText;
198         break;
199     DEFAULT_UNREACHABLE;
200     }
201 
202     dwPos = 0;
203     for (i = 0; i < dwCharCount; i++)
204     {
205         switch(pszText[i])
206         {
207         case '\r':
208             if ((i < dwCharCount-1) && (pszText[i+1] == '\n'))
209             {
210                 i++;
211                 adwEolnCount[EOLN_CRLF]++;
212                 break;
213             }
214             /* fall through */
215 
216         case '\n':
217             if (!Append(ppszText, pdwTextLen, &pszText[dwPos], i - dwPos))
218                 return FALSE;
219             if (!Append(ppszText, pdwTextLen, szCrlf, ARRAY_SIZE(szCrlf)))
220                 return FALSE;
221             dwPos = i + 1;
222 
223             if (pszText[i] == '\r')
224                 adwEolnCount[EOLN_CR]++;
225             else
226                 adwEolnCount[EOLN_LF]++;
227             break;
228 
229         case '\0':
230             pszText[i] = ' ';
231             break;
232         }
233     }
234 
235     if (!*ppszText && (pszText == pszAllocText))
236     {
237         /* special case; don't need to reallocate */
238         *ppszText = pszAllocText;
239         *pdwTextLen = dwCharCount;
240         pszAllocText = NULL;
241     }
242     else
243     {
244         /* append last remaining text */
245         if (!Append(ppszText, pdwTextLen, &pszText[dwPos], i - dwPos + 1))
246             return FALSE;
247     }
248 
249     /* chose which eoln to use */
250     *piEoln = EOLN_CRLF;
251     if (adwEolnCount[EOLN_LF] > adwEolnCount[*piEoln])
252         *piEoln = EOLN_LF;
253     if (adwEolnCount[EOLN_CR] > adwEolnCount[*piEoln])
254         *piEoln = EOLN_CR;
255     *pencFile = encFile;
256 
257     bSuccess = TRUE;
258 
259 done:
260     if (pBytes)
261         HeapFree(GetProcessHeap(), 0, pBytes);
262     if (pszAllocText)
263         HeapFree(GetProcessHeap(), 0, pszAllocText);
264 
265     if (!bSuccess && *ppszText)
266     {
267         HeapFree(GetProcessHeap(), 0, *ppszText);
268         *ppszText = NULL;
269         *pdwTextLen = 0;
270     }
271     return bSuccess;
272 }
273 
274 static BOOL WriteEncodedText(HANDLE hFile, LPCWSTR pszText, DWORD dwTextLen, ENCODING encFile)
275 {
276     LPBYTE pBytes = NULL;
277     LPBYTE pAllocBuffer = NULL;
278     DWORD dwPos = 0;
279     DWORD dwByteCount;
280     BYTE buffer[1024];
281     UINT iCodePage = 0;
282     DWORD dwDummy, i;
283     BOOL bSuccess = FALSE;
284     int iBufferSize, iRequiredBytes;
285     BYTE b;
286 
287     while(dwPos < dwTextLen)
288     {
289         switch(encFile)
290         {
291             case ENCODING_UTF16LE:
292                 pBytes = (LPBYTE) &pszText[dwPos];
293                 dwByteCount = (dwTextLen - dwPos) * sizeof(WCHAR);
294                 dwPos = dwTextLen;
295                 break;
296 
297             case ENCODING_UTF16BE:
298                 dwByteCount = (dwTextLen - dwPos) * sizeof(WCHAR);
299                 if (dwByteCount > sizeof(buffer))
300                     dwByteCount = sizeof(buffer);
301 
302                 memcpy(buffer, &pszText[dwPos], dwByteCount);
303                 for (i = 0; i < dwByteCount; i += 2)
304                 {
305                     b = buffer[i+0];
306                     buffer[i+0] = buffer[i+1];
307                     buffer[i+1] = b;
308                 }
309                 pBytes = (LPBYTE) &buffer[dwPos];
310                 dwPos += dwByteCount / sizeof(WCHAR);
311                 break;
312 
313             case ENCODING_ANSI:
314             case ENCODING_UTF8:
315                 if (encFile == ENCODING_ANSI)
316                     iCodePage = CP_ACP;
317                 else if (encFile == ENCODING_UTF8)
318                     iCodePage = CP_UTF8;
319 
320                 iRequiredBytes = WideCharToMultiByte(iCodePage, 0, &pszText[dwPos], dwTextLen - dwPos, NULL, 0, NULL, NULL);
321                 if (iRequiredBytes <= 0)
322                 {
323                     goto done;
324                 }
325                 else if (iRequiredBytes < sizeof(buffer))
326                 {
327                     pBytes = buffer;
328                     iBufferSize = sizeof(buffer);
329                 }
330                 else
331                 {
332                     pAllocBuffer = (LPBYTE) HeapAlloc(GetProcessHeap(), 0, iRequiredBytes);
333                     if (!pAllocBuffer)
334                         return FALSE;
335                     pBytes = pAllocBuffer;
336                     iBufferSize = iRequiredBytes;
337                 }
338 
339                 dwByteCount = WideCharToMultiByte(iCodePage, 0, &pszText[dwPos], dwTextLen - dwPos, (LPSTR) pBytes, iBufferSize, NULL, NULL);
340                 if (!dwByteCount)
341                     goto done;
342 
343                 dwPos = dwTextLen;
344                 break;
345 
346             default:
347                 goto done;
348         }
349 
350         if (!WriteFile(hFile, pBytes, dwByteCount, &dwDummy, NULL))
351             goto done;
352 
353         /* free the buffer, if we have allocated one */
354         if (pAllocBuffer)
355         {
356             HeapFree(GetProcessHeap(), 0, pAllocBuffer);
357             pAllocBuffer = NULL;
358         }
359     }
360     bSuccess = TRUE;
361 
362 done:
363     if (pAllocBuffer)
364         HeapFree(GetProcessHeap(), 0, pAllocBuffer);
365     return bSuccess;
366 }
367 
368 BOOL WriteText(HANDLE hFile, LPCWSTR pszText, DWORD dwTextLen, ENCODING encFile, int iEoln)
369 {
370     WCHAR wcBom;
371     LPCWSTR pszLF = L"\n";
372     DWORD dwPos, dwNext;
373 
374     /* Write the proper byte order marks if not ANSI */
375     if (encFile != ENCODING_ANSI)
376     {
377         wcBom = 0xFEFF;
378         if (!WriteEncodedText(hFile, &wcBom, 1, encFile))
379             return FALSE;
380     }
381 
382     dwPos = 0;
383 
384     /* pszText eoln are always \r\n */
385 
386     do
387     {
388         /* Find the next eoln */
389         dwNext = dwPos;
390         while(dwNext < dwTextLen)
391         {
392             if (pszText[dwNext] == '\r' && pszText[dwNext + 1] == '\n')
393                 break;
394             dwNext++;
395         }
396 
397         if (dwNext != dwTextLen)
398         {
399             switch (iEoln)
400             {
401             case EOLN_LF:
402                 /* Write text (without eoln) */
403                 if (!WriteEncodedText(hFile, &pszText[dwPos], dwNext - dwPos, encFile))
404                     return FALSE;
405                 /* Write eoln */
406                 if (!WriteEncodedText(hFile, pszLF, 1, encFile))
407                     return FALSE;
408                 break;
409             case EOLN_CR:
410                 /* Write text (including \r as eoln) */
411                 if (!WriteEncodedText(hFile, &pszText[dwPos], dwNext - dwPos + 1, encFile))
412                     return FALSE;
413                 break;
414             case EOLN_CRLF:
415                 /* Write text (including \r\n as eoln) */
416                 if (!WriteEncodedText(hFile, &pszText[dwPos], dwNext - dwPos + 2, encFile))
417                     return FALSE;
418                 break;
419             default:
420                 return FALSE;
421             }
422         }
423         else
424         {
425             /* Write text (without eoln, since this is the end of the file) */
426             if (!WriteEncodedText(hFile, &pszText[dwPos], dwNext - dwPos, encFile))
427                 return FALSE;
428         }
429 
430         /* Skip \r\n */
431         dwPos = dwNext + 2;
432     }
433     while (dwPos < dwTextLen);
434 
435     return TRUE;
436 }
437