1 unit JcfUnicodeFiles;
2
3 {(*}
4 (*------------------------------------------------------------------------------
5 Delphi Code formatter source code
6
7 The Original Code is JcfUnicodeFiles, released March 2007.
8 The Initial Developer of the Original Code is Anthony Steele.
9 Portions created by Anthony Steele are Copyright (C) 2007 Anthony Steele.
10 All Rights Reserved.
11 Contributor(s): Anthony Steele.
12
13 The contents of this file are subject to the Mozilla Public License Version 1.1
14 (the "License"). you may not use this file except in compliance with the License.
15 You may obtain a copy of the License at http://www.mozilla.org/NPL/
16
17 Software distributed under the License is distributed on an "AS IS" basis,
18 WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 See the License for the specific language governing rights and limitations
20 under the License.
21
22 Alternatively, the contents of this file may be used under the terms of
23 the GNU General Public License Version 2 or later (the "GPL")
24 See http://www.gnu.org/licenses/gpl.html
25 ------------------------------------------------------------------------------*)
26 {*)}
27
28 {$I JcfGlobal.inc}
29
30 interface
31
32 uses
33 Classes, SysUtils, Dialogs;
34
35 type
36 TFileContentType = (eUnknown, e8Bit, eUtf8,
37 eUtf16LittleEndian, eUtf16BigEndian,
38 eUtf32LittleEndian, eUtf32BigEndian);
39
TypeOfTextFilenull40 function TypeOfTextFile(const psFileName: string): TFileContentType;
41
42 procedure ReadTextFile(const psFileName: string; out psContents: String;
43 out peContentType: TFileContentType);
44
45 procedure WriteTextFile(const psFileName: string; const psContents: String;
46 const peContentType: TFileContentType);
47
48
49 implementation
50
51 const
52 // byte order markers (BOM)
53 // these are found at the start of the file
54
55 /// 3 bytes for UTF-8
56 Utf8Marker12 = $BBEF;
57 Utf8Marker1 = $EF;
58 Utf8Marker2 = $BB;
59 Utf8Marker3 = $BF;
60
61 // 4 bytes for UTF-16. Big or little-endian
62 Utf16LittleEndianMarker = $FEFF;
63 Utf16BigEndianMarker = $FFFE;
64
65 // 4 bytes for utf-32. Big or little-endian
66 Utf32LittleEndianMarker1 = $FEFF;
67 Utf32LittleEndianMarker2 = $0000;
68
69 Utf32BigEndianMarker1 = $0000;
70 Utf32BigEndianMarker2 = $FFFE;
71
72 //MaxAnsiChar = 127;
73
ReadFileHeadernull74 function ReadFileHeader(const pcFileStream: TFileStream): TFileContentType;
75 var
76 word1: word;
77 word2: word;
78 byte3: byte;
79 begin
80 // small files are ansi
81 if pcFileStream.Size < 4 then
82 begin
83 Result := e8Bit;
84 exit;
85 end;
86
87 // read the first 4 bytes
88 pcFileStream.Seek(0, soFromBeginning);
89
90 word1:=0;
91 word2:=0;
92 pcFileStream.Read(word1, SizeOf(word));
93 pcFileStream.Read(word2, SizeOf(word));
94
95 byte3 := byte(word2);
96
97 if (word1 = Utf32LittleEndianMarker1) and (word2 = Utf32LittleEndianMarker2) then
98 begin
99 Result := eUtf32LittleEndian;
100 end
101 else if (word1 = Utf32BigEndianMarker1) and (word2 = Utf32BigEndianMarker2) then
102 begin
103 Result := eUtf32BigEndian;
104 end
105 else if (word1 = Utf8Marker12) and (byte3 = Utf8Marker3) then
106 begin
107 Result := eUtf8;
108 end
109 else if (word1 = Utf16LittleEndianMarker) then
110 begin
111 Result := eUtf16LittleEndian;
112 end
113 else if (word1 = Utf16BigEndianMarker) then
114 begin
115 Result := eUtf16BigEndian;
116 end
117 else
118 begin
119 Result := e8Bit;
120 end;
121
122 end;
123
TypeOfTextFilenull124 function TypeOfTextFile(const psFileName: string): TFileContentType;
125 var
126 fs: TFileStream;
127 begin
128 {open file}
129 fs := TFileStream.Create(psFileName, fmOpenRead);
130 try
131 Result := ReadFileHeader(fs);
132 finally
133 // close the file
134 fs.Free;
135 end;
136 end;
137
138 procedure ReadPastFileHeader(const pcFileStream: TFileStream; const peContentType: TFileContentType);
139 var
140 liOffsetBytes: integer;
141 begin
142 case peContentType of
143 e8Bit:
144 liOffsetBytes := 0;
145 eUtf8:
146 liOffsetBytes := 3;
147 eUtf16LittleEndian, eUtf16BigEndian:
148 liOffsetBytes := 2;
149 eUtf32LittleEndian, eUtf32BigEndian:
150 liOffsetBytes := 4;
151 else
152 raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
153 end;
154
155 pcFileStream.Seek(liOffsetBytes, soFromBeginning);
156 end;
157
158 { this is one of the few cases where 'AnsiString' must be used }
Read8BitFilenull159 function Read8BitFile(const pcFileStream: TFileStream): String;
160 var
161 liBytesRemaining: integer;
162 lsContents8bit: AnsiString;
163 begin
164 liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
165 // read the bytes into a string
166 SetLength(lsContents8bit, liBytesRemaining);
167 if pcFileStream.Size > 0 then
168 begin
169 pcFileStream.ReadBuffer(lsContents8bit[1], liBytesRemaining);
170 end;
171 Result := lsContents8bit;
172 end;
173
174
ReadUtf8Filenull175 function ReadUtf8File(const pcFileStream: TFileStream): String;
176 var
177 liBytesRemaining: integer;
178 lsContents: AnsiString;
179 begin
180 liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
181 // read the bytes into a string
182 SetLength(lsContents, liBytesRemaining);
183 if pcFileStream.Size > 0 then
184 begin
185 pcFileStream.ReadBuffer(lsContents[1], liBytesRemaining);
186 end;
187 Result := lsContents;
188 end;
189
190
Read16BitFilenull191 function Read16BitFile(const pcFileStream: TFileStream; const pbBigEndian: boolean): WideString;
192 var
193 liBytesRemaining: integer;
194 liLoop: integer;
195 lsWideContents: WideString;
196 begin
197 // read it
198 liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
199 SetLength(lsWideContents, liBytesRemaining div 2);
200 pcFileStream.Read(lsWideContents[1], liBytesRemaining);
201
202 if pbBigEndian then
203 begin
204 // swap the bytes
205 for liLoop := 1 to Length(lsWideContents) do
206 lsWideContents[liLoop] := WideChar(Swap(word(lsWideContents[liLoop])));
207 end;
208
209 Result := lsWideContents;
210 end;
211
SwapWordsnull212 function SwapWords(const value: UCS4Char): UCS4Char;
213 var
214 hi: word;
215 lo: word;
216 begin
217 // split into 16-bit words
218 hi := value shr 16;
219 lo := (value and $0000FFFF); // Prevent Range check error by converting to a word
220
221 hi := Swap(hi);
222 lo := Swap(lo);
223
224 // recombine
225 Result := (lo shl 16) + hi;
226 end;
227
Read32BitFilenull228 function Read32BitFile(const pcFileStream: TFileStream; pbBigEndian: boolean): WideString;
229 var
230 liBytesRemaining: integer;
231 charsRemaining: integer;
232 ucs4Chars: UCS4String;
233 liLoop: integer;
234 begin
235 liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
236 charsRemaining := liBytesRemaining div 4;
237
238 SetLength(ucs4Chars, charsRemaining);
239 pcFileStream.Read(ucs4Chars[0], liBytesRemaining);
240
241 if pbBigEndian then
242 begin
243 // swap the bytes
244 for liLoop := 0 to charsRemaining - 1 do
245 ucs4Chars[liLoop] := SwapWords(ucs4Chars[liLoop]);
246 end;
247
248 Result := UCS4StringToWideString(ucs4Chars);
249 end;
250
251 { read in a text file,
252 the file can contain 8-bit or 16-bit chars
253 code is much adapted from a sample by Mike Shkolnik
254 in nntp://borland.public.delphi.rtl.general
255 Re: Read UNICODE/ANSI/ASCII Text File to String
256 at: Jan 23 2006, 12:17
257 found at http://delphi.newswhat.com/geoxml/forumhistorythread?groupname=borland.public.delphi.rtl.general&messageid=43d485bf$1@newsgroups.borland.com
258 }
259 procedure ReadTextFile(const psFileName: string; out psContents: String;
260 out peContentType: TFileContentType);
261 var
262 fs: TFileStream;
263 begin
264 psContents := '';
265 peContentType := eUnknown;
266
267 {open file}
268 fs := TFileStream.Create(psFileName, fmOpenRead);
269 try
270 peContentType := ReadFileHeader(fs);
271
272 ReadPastFileHeader(fs, peContentType);
273
274 case peContentType of
275 e8Bit:
276 psContents := Read8BitFile(fs);
277
278 eUtf8:
279 psContents := ReadUtf8File(fs);
280
281 eUtf16LittleEndian, eUtf16BigEndian:
282 psContents := {%H-}Read16BitFile(fs, peContentType = eUtf16BigEndian);
283
284 eUtf32LittleEndian, eUtf32BigEndian:
285 psContents := {%H-}Read32BitFile(fs, peContentType = eUtf32BigEndian);
286 else
287 raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
288
289 end;
290 finally
291 // close the file
292 fs.Free;
293 end;
294 end;
295
296 { this is one of the few cases when "AnsiString" must be used }
297 procedure Write8BitFile(const pcFileStream: TFileStream; const psContents: String);
298 var
299 Len: integer;
300 lsContents: AnsiString;
301 begin
302 lsContents := psContents;
303 Len := Length(lsContents);
304 if Len > 0 then
305 pcFileStream.WriteBuffer(lsContents[1], Len);
306 end;
307
308 { this is one of the few cases when "AnsiString" must be used }
309 procedure WriteUtf8File(const pcFileStream: TFileStream; const psContents: String);
310 var
311 Len: integer;
312 lsContents: AnsiString;
313 utf8Header: array [0..2] of byte;
314 begin
315 lsContents := psContents;
316 Len := Length(lsContents);
317 // write the BOM
318 utf8Header[0] := Utf8Marker1;
319 utf8Header[1] := Utf8Marker2;
320 utf8Header[2] := Utf8Marker3;
321 pcFileStream.WriteBuffer(utf8Header[0], 3);
322
323 if Len > 0 then
324 pcFileStream.WriteBuffer(lsContents[1], Len);
325 end;
326
327 procedure Write16BitFile(const pcFileStream: TFileStream;
328 const psContents: WideString; const pbBigEndian: boolean);
329 var
330 Len: integer;
331 liLoop: integer;
332 wChar: word;
333 begin
334 Len := Length(psContents);
335
336 if Len > 0 then
337 begin
338 if pbBigEndian then
339 begin
340 // write the BOM
341 wChar := Utf16BigEndianMarker;
342 pcFileStream.WriteBuffer(wChar, 2);
343
344 for liLoop := 1 to Len do
345 begin
346 wChar := Swap(word(psContents[liLoop]));
347 pcFileStream.WriteBuffer(wChar, 2);
348 end;
349 end
350 else
351 begin
352 // write the BOM
353 wChar := Utf16LittleEndianMarker;
354 pcFileStream.WriteBuffer(wChar, 2);
355
356 pcFileStream.WriteBuffer(psContents[1], Len * 2);
357 end;
358 end;
359 end;
360
361 procedure Write32BitFile(const pcFileStream: TFileStream;
362 const psContents: WideString; const pbBigEndian: boolean);
363 var
364 Len: integer;
365 liLoop: integer;
366 lsUcs4String: UCS4String;
367 lcUcs4Char: UCS4Char;
368 wChar: word;
369 begin
370 Len := Length(psContents);
371
372 if Len > 0 then
373 begin
374 lsUcs4String := WideStringToUCS4String(psContents);
375
376 if pbBigEndian then
377 begin
378 // write the BOM
379 wChar := Utf32BigEndianMarker1;
380 pcFileStream.WriteBuffer(wChar, 2);
381 wChar := Utf32BigEndianMarker2;
382 pcFileStream.WriteBuffer(wChar, 2);
383
384 for liLoop := 0 to Len do
385 begin
386 lcUcs4Char := SwapWords(lsUcs4String[liLoop]);
387 pcFileStream.WriteBuffer(lcUcs4Char, 4);
388 end;
389 end
390 else
391 begin
392 // write the BOM
393 wChar := Utf32LittleEndianMarker1;
394 pcFileStream.WriteBuffer(wChar, 2);
395 wChar := Utf32LittleEndianMarker2;
396 pcFileStream.WriteBuffer(wChar, 2);
397
398 // an array not a real string, indexed from zero
399 pcFileStream.WriteBuffer(lsUcs4String[0], (Len + 1) * 4);
400 end;
401 end;
402 end;
403
404 procedure WriteTextFile(const psFileName: string; const psContents: String;
405 const peContentType: TFileContentType);
406 var
407 fs: TFileStream;
408 begin
409 fs := TFileStream.Create(psFileName, fmCreate);
410 try
411
412 case peContentType of
413 e8Bit:
414 begin
415 Write8BitFile(fs, psContents);
416 end;
417
418 eUtf8:
419 begin
420 WriteUtf8File(fs, psContents);
421 end;
422
423 eUtf16LittleEndian, eUtf16BigEndian:
424 begin
425 Write16BitFile(fs, psContents{%H-}, peContentType = eUtf16BigEndian);
426 end;
427
428 eUtf32LittleEndian, eUtf32BigEndian:
429 begin
430 Write32BitFile(fs, psContents{%H-}, peContentType = eUtf32BigEndian);
431 end;
432
433 else
434 raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
435
436 end;
437
438 finally
439 fs.Free;
440 end;
441 end;
442
443 end.
444