1 unit JcfUnicodeFiles;
2
3 {(*}
4 (*------------------------------------------------------------------------------
5 Delphi Code formatter source code
6
7 The Original Code is JcfUnicodeFiles, released March 2007.
8 The Initial Developer of the Original Code is Anthony Steele.
9 Portions created by Anthony Steele are Copyright (C) 2007 Anthony Steele.
10 All Rights Reserved.
11 Contributor(s): Anthony Steele.
12
13 The contents of this file are subject to the Mozilla Public License Version 1.1
14 (the "License"). you may not use this file except in compliance with the License.
15 You may obtain a copy of the License at http://www.mozilla.org/NPL/
16
17 Software distributed under the License is distributed on an "AS IS" basis,
18 WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 See the License for the specific language governing rights and limitations
20 under the License.
21
22 Alternatively, the contents of this file may be used under the terms of
23 the GNU General Public License Version 2 or later (the "GPL")
24 See http://www.gnu.org/licenses/gpl.html
25 ------------------------------------------------------------------------------*)
26 {*)}
27
28 {$I JcfGlobal.inc}
29
30 interface
31
32 type
33 TFileContentType = (eUnknown, e8Bit, eUtf8,
34 eUtf16LittleEndian, eUtf16BigEndian,
35 eUtf32LittleEndian, eUtf32BigEndian);
36
TypeOfTextFilenull37 function TypeOfTextFile(const psFileName: string): TFileContentType;
38
39 procedure ReadTextFile(const psFileName: string; out psContents: String;
40 out peContentType: TFileContentType);
41
42 procedure WriteTextFile(const psFileName: string; const psContents: String;
43 const peContentType: TFileContentType);
44
45
46 implementation
47
48 uses
49 { delphi }
50 Classes, SysUtils, Dialogs;
51
52 const
53 // byte order markers (BOM)
54 // these are found at the start of the file
55
56 /// 3 bytes for UTF-8
57 Utf8Marker12 = $BBEF;
58 Utf8Marker1 = $EF;
59 Utf8Marker2 = $BB;
60 Utf8Marker3 = $BF;
61
62 // 4 bytes for UTF-16. Big or little-endian
63 Utf16LittleEndianMarker = $FEFF;
64 Utf16BigEndianMarker = $FFFE;
65
66 // 4 bytes for utf-32. Big or little-endian
67 Utf32LittleEndianMarker1 = $FEFF;
68 Utf32LittleEndianMarker2 = $0000;
69
70 Utf32BigEndianMarker1 = $0000;
71 Utf32BigEndianMarker2 = $FFFE;
72
73 //MaxAnsiChar = 127;
74
ReadFileHeadernull75 function ReadFileHeader(const pcFileStream: TFileStream): TFileContentType;
76 var
77 word1: word;
78 word2: word;
79 byte3: byte;
80 begin
81 // small files are ansi
82 if pcFileStream.Size < 4 then
83 begin
84 Result := e8Bit;
85 exit;
86 end;
87
88 // read the first 4 bytes
89 pcFileStream.Seek(0, soFromBeginning);
90
91 word1:=0;
92 word2:=0;
93 pcFileStream.Read(word1, SizeOf(word));
94 pcFileStream.Read(word2, SizeOf(word));
95
96 byte3 := byte(word2);
97
98 if (word1 = Utf32LittleEndianMarker1) and (word2 = Utf32LittleEndianMarker2) then
99 begin
100 Result := eUtf32LittleEndian;
101 end
102 else if (word1 = Utf32BigEndianMarker1) and (word2 = Utf32BigEndianMarker2) then
103 begin
104 Result := eUtf32BigEndian;
105 end
106 else if (word1 = Utf8Marker12) and (byte3 = Utf8Marker3) then
107 begin
108 Result := eUtf8;
109 end
110 else if (word1 = Utf16LittleEndianMarker) then
111 begin
112 Result := eUtf16LittleEndian;
113 end
114 else if (word1 = Utf16BigEndianMarker) then
115 begin
116 Result := eUtf16BigEndian;
117 end
118 else
119 begin
120 Result := e8Bit;
121 end;
122
123 end;
124
TypeOfTextFilenull125 function TypeOfTextFile(const psFileName: string): TFileContentType;
126 var
127 fs: TFileStream;
128 begin
129 {open file}
130 fs := TFileStream.Create(psFileName, fmOpenRead);
131 try
132 Result := ReadFileHeader(fs);
133 finally
134 // close the file
135 fs.Free;
136 end;
137 end;
138
139 procedure ReadPastFileHeader(const pcFileStream: TFileStream; const peContentType: TFileContentType);
140 var
141 liOffsetBytes: integer;
142 begin
143 case peContentType of
144 e8Bit:
145 liOffsetBytes := 0;
146 eUtf8:
147 liOffsetBytes := 3;
148 eUtf16LittleEndian, eUtf16BigEndian:
149 liOffsetBytes := 2;
150 eUtf32LittleEndian, eUtf32BigEndian:
151 liOffsetBytes := 4;
152 else
153 raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
154 end;
155
156 pcFileStream.Seek(liOffsetBytes, soFromBeginning);
157 end;
158
159 { this is one of the few cases where 'AnsiString' must be used }
Read8BitFilenull160 function Read8BitFile(const pcFileStream: TFileStream): String;
161 var
162 liBytesRemaining: integer;
163 lsContents8bit: AnsiString;
164 begin
165 liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
166 // read the bytes into a string
167 SetLength(lsContents8bit, liBytesRemaining);
168 if pcFileStream.Size > 0 then
169 begin
170 pcFileStream.ReadBuffer(lsContents8bit[1], liBytesRemaining);
171 end;
172 Result := lsContents8bit;
173 end;
174
175
ReadUtf8Filenull176 function ReadUtf8File(const pcFileStream: TFileStream): String;
177 var
178 liBytesRemaining: integer;
179 lsContents: AnsiString;
180 begin
181 liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
182 // read the bytes into a string
183 SetLength(lsContents, liBytesRemaining);
184 if pcFileStream.Size > 0 then
185 begin
186 pcFileStream.ReadBuffer(lsContents[1], liBytesRemaining);
187 end;
188 Result := lsContents;
189 end;
190
191
Read16BitFilenull192 function Read16BitFile(const pcFileStream: TFileStream; const pbBigEndian: boolean): WideString;
193 var
194 liBytesRemaining: integer;
195 liLoop: integer;
196 lsWideContents: WideString;
197 begin
198 // read it
199 liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
200 SetLength(lsWideContents, liBytesRemaining div 2);
201 pcFileStream.Read(lsWideContents[1], liBytesRemaining);
202
203 if pbBigEndian then
204 begin
205 // swap the bytes
206 for liLoop := 1 to Length(lsWideContents) do
207 lsWideContents[liLoop] := WideChar(Swap(word(lsWideContents[liLoop])));
208 end;
209
210 Result := lsWideContents;
211 end;
212
SwapWordsnull213 function SwapWords(const value: UCS4Char): UCS4Char;
214 var
215 hi: word;
216 lo: word;
217 begin
218 // split into 16-bit words
219 hi := value shr 16;
220 lo := (value and $0000FFFF); // Prevent Range check error by converting to a word
221
222 hi := Swap(hi);
223 lo := Swap(lo);
224
225 // recombine
226 Result := (lo shl 16) + hi;
227 end;
228
Read32BitFilenull229 function Read32BitFile(const pcFileStream: TFileStream; pbBigEndian: boolean): WideString;
230 var
231 liBytesRemaining: integer;
232 charsRemaining: integer;
233 ucs4Chars: UCS4String;
234 liLoop: integer;
235 begin
236 liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
237 charsRemaining := liBytesRemaining div 4;
238
239 SetLength(ucs4Chars, charsRemaining);
240 pcFileStream.Read(ucs4Chars[0], liBytesRemaining);
241
242 if pbBigEndian then
243 begin
244 // swap the bytes
245 for liLoop := 0 to charsRemaining - 1 do
246 ucs4Chars[liLoop] := SwapWords(ucs4Chars[liLoop]);
247 end;
248
249 Result := UCS4StringToWideString(ucs4Chars);
250 end;
251
252 { read in a text file,
253 the file can contain 8-bit or 16-bit chars
254 code is much adapted from a sample by Mike Shkolnik
255 in nntp://borland.public.delphi.rtl.general
256 Re: Read UNICODE/ANSI/ASCII Text File to String
257 at: Jan 23 2006, 12:17
258 found at http://delphi.newswhat.com/geoxml/forumhistorythread?groupname=borland.public.delphi.rtl.general&messageid=43d485bf$1@newsgroups.borland.com
259 }
260 procedure ReadTextFile(const psFileName: string; out psContents: String;
261 out peContentType: TFileContentType);
262 var
263 fs: TFileStream;
264 begin
265 psContents := '';
266 peContentType := eUnknown;
267
268 {open file}
269 fs := TFileStream.Create(psFileName, fmOpenRead);
270 try
271 peContentType := ReadFileHeader(fs);
272
273 ReadPastFileHeader(fs, peContentType);
274
275 case peContentType of
276 e8Bit:
277 psContents := Read8BitFile(fs);
278
279 eUtf8:
280 psContents := ReadUtf8File(fs);
281
282 eUtf16LittleEndian, eUtf16BigEndian:
283 psContents := {%H-}Read16BitFile(fs, peContentType = eUtf16BigEndian);
284
285 eUtf32LittleEndian, eUtf32BigEndian:
286 psContents := {%H-}Read32BitFile(fs, peContentType = eUtf32BigEndian);
287 else
288 raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
289
290 end;
291 finally
292 // close the file
293 fs.Free;
294 end;
295 end;
296
297 { this is one of the few cases when "AnsiString" must be used }
298 procedure Write8BitFile(const pcFileStream: TFileStream; const psContents: String);
299 var
300 Len: integer;
301 lsContents: AnsiString;
302 begin
303 lsContents := psContents;
304 Len := Length(lsContents);
305 if Len > 0 then
306 pcFileStream.WriteBuffer(lsContents[1], Len);
307 end;
308
309 { this is one of the few cases when "AnsiString" must be used }
310 procedure WriteUtf8File(const pcFileStream: TFileStream; const psContents: String);
311 var
312 Len: integer;
313 lsContents: AnsiString;
314 utf8Header: array [0..2] of byte;
315 begin
316 lsContents := psContents;
317 Len := Length(lsContents);
318 // write the BOM
319 utf8Header[0] := Utf8Marker1;
320 utf8Header[1] := Utf8Marker2;
321 utf8Header[2] := Utf8Marker3;
322 pcFileStream.WriteBuffer(utf8Header[0], 3);
323
324 if Len > 0 then
325 pcFileStream.WriteBuffer(lsContents[1], Len);
326 end;
327
328 procedure Write16BitFile(const pcFileStream: TFileStream;
329 const psContents: WideString; const pbBigEndian: boolean);
330 var
331 Len: integer;
332 liLoop: integer;
333 wChar: word;
334 begin
335 Len := Length(psContents);
336
337 if Len > 0 then
338 begin
339 if pbBigEndian then
340 begin
341 // write the BOM
342 wChar := Utf16BigEndianMarker;
343 pcFileStream.WriteBuffer(wChar, 2);
344
345 for liLoop := 1 to Len do
346 begin
347 wChar := Swap(word(psContents[liLoop]));
348 pcFileStream.WriteBuffer(wChar, 2);
349 end;
350 end
351 else
352 begin
353 // write the BOM
354 wChar := Utf16LittleEndianMarker;
355 pcFileStream.WriteBuffer(wChar, 2);
356
357 pcFileStream.WriteBuffer(psContents[1], Len * 2);
358 end;
359 end;
360 end;
361
362 procedure Write32BitFile(const pcFileStream: TFileStream;
363 const psContents: WideString; const pbBigEndian: boolean);
364 var
365 Len: integer;
366 liLoop: integer;
367 lsUcs4String: UCS4String;
368 lcUcs4Char: UCS4Char;
369 wChar: word;
370 begin
371 Len := Length(psContents);
372
373 if Len > 0 then
374 begin
375 lsUcs4String := WideStringToUCS4String(psContents);
376
377 if pbBigEndian then
378 begin
379 // write the BOM
380 wChar := Utf32BigEndianMarker1;
381 pcFileStream.WriteBuffer(wChar, 2);
382 wChar := Utf32BigEndianMarker2;
383 pcFileStream.WriteBuffer(wChar, 2);
384
385 for liLoop := 0 to Len do
386 begin
387 lcUcs4Char := SwapWords(lsUcs4String[liLoop]);
388 pcFileStream.WriteBuffer(lcUcs4Char, 4);
389 end;
390 end
391 else
392 begin
393 // write the BOM
394 wChar := Utf32LittleEndianMarker1;
395 pcFileStream.WriteBuffer(wChar, 2);
396 wChar := Utf32LittleEndianMarker2;
397 pcFileStream.WriteBuffer(wChar, 2);
398
399 // an array not a real string, indexed from zero
400 pcFileStream.WriteBuffer(lsUcs4String[0], (Len + 1) * 4);
401 end;
402 end;
403 end;
404
405 procedure WriteTextFile(const psFileName: string; const psContents: String;
406 const peContentType: TFileContentType);
407 var
408 fs: TFileStream;
409 begin
410 fs := TFileStream.Create(psFileName, fmCreate);
411 try
412
413 case peContentType of
414 e8Bit:
415 begin
416 Write8BitFile(fs, psContents);
417 end;
418
419 eUtf8:
420 begin
421 WriteUtf8File(fs, psContents);
422 end;
423
424 eUtf16LittleEndian, eUtf16BigEndian:
425 begin
426 Write16BitFile(fs, psContents{%H-}, peContentType = eUtf16BigEndian);
427 end;
428
429 eUtf32LittleEndian, eUtf32BigEndian:
430 begin
431 Write32BitFile(fs, psContents{%H-}, peContentType = eUtf32BigEndian);
432 end;
433
434 else
435 raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
436
437 end;
438
439 finally
440 fs.Free;
441 end;
442 end;
443
444 end.
445