1 unit JcfUnicodeFiles;
2 
3 {(*}
4 (*------------------------------------------------------------------------------
5  Delphi Code formatter source code
6 
7 The Original Code is JcfUnicodeFiles, released March 2007.
8 The Initial Developer of the Original Code is Anthony Steele.
9 Portions created by Anthony Steele are Copyright (C) 2007 Anthony Steele.
10 All Rights Reserved.
11 Contributor(s): Anthony Steele.
12 
13 The contents of this file are subject to the Mozilla Public License Version 1.1
14 (the "License"). you may not use this file except in compliance with the License.
15 You may obtain a copy of the License at http://www.mozilla.org/NPL/
16 
17 Software distributed under the License is distributed on an "AS IS" basis,
18 WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 See the License for the specific language governing rights and limitations
20 under the License.
21 
22 Alternatively, the contents of this file may be used under the terms of
23 the GNU General Public License Version 2 or later (the "GPL")
24 See http://www.gnu.org/licenses/gpl.html
25 ------------------------------------------------------------------------------*)
26 {*)}
27 
28 {$I JcfGlobal.inc}
29 
30 interface
31 
32 uses
33   Classes, SysUtils, Dialogs;
34 
35 type
36   TFileContentType = (eUnknown, e8Bit, eUtf8,
37     eUtf16LittleEndian, eUtf16BigEndian,
38     eUtf32LittleEndian, eUtf32BigEndian);
39 
TypeOfTextFilenull40 function TypeOfTextFile(const psFileName: string): TFileContentType;
41 
42 procedure ReadTextFile(const psFileName: string; out psContents: String;
43   out peContentType: TFileContentType);
44 
45 procedure WriteTextFile(const psFileName: string; const psContents: String;
46   const peContentType: TFileContentType);
47 
48 
49 implementation
50 
51 const
52   // byte order markers (BOM)
53   // these are found at the start of the file
54 
55   /// 3 bytes for UTF-8
56   Utf8Marker12 = $BBEF;
57   Utf8Marker1 = $EF;
58   Utf8Marker2 = $BB;
59   Utf8Marker3  = $BF;
60 
61   // 4 bytes for UTF-16. Big or little-endian
62   Utf16LittleEndianMarker = $FEFF;
63   Utf16BigEndianMarker = $FFFE;
64 
65   // 4 bytes for utf-32. Big or little-endian
66   Utf32LittleEndianMarker1 = $FEFF;
67   Utf32LittleEndianMarker2 = $0000;
68 
69   Utf32BigEndianMarker1 = $0000;
70   Utf32BigEndianMarker2 = $FFFE;
71 
72   //MaxAnsiChar = 127;
73 
ReadFileHeadernull74 function ReadFileHeader(const pcFileStream: TFileStream): TFileContentType;
75 var
76   word1: word;
77   word2: word;
78   byte3: byte;
79 begin
80   // small files are ansi
81   if pcFileStream.Size < 4 then
82   begin
83     Result := e8Bit;
84     exit;
85   end;
86 
87   // read the first 4 bytes
88   pcFileStream.Seek(0, soFromBeginning);
89 
90   word1:=0;
91   word2:=0;
92   pcFileStream.Read(word1, SizeOf(word));
93   pcFileStream.Read(word2, SizeOf(word));
94 
95   byte3 := byte(word2);
96 
97   if (word1 = Utf32LittleEndianMarker1) and (word2 = Utf32LittleEndianMarker2) then
98   begin
99     Result := eUtf32LittleEndian;
100   end
101   else if (word1 = Utf32BigEndianMarker1) and (word2 = Utf32BigEndianMarker2) then
102   begin
103     Result := eUtf32BigEndian;
104   end
105   else if (word1 = Utf8Marker12) and (byte3 = Utf8Marker3) then
106   begin
107     Result := eUtf8;
108   end
109   else if (word1 = Utf16LittleEndianMarker) then
110   begin
111     Result := eUtf16LittleEndian;
112   end
113   else if (word1 = Utf16BigEndianMarker) then
114   begin
115     Result := eUtf16BigEndian;
116   end
117   else
118   begin
119     Result := e8Bit;
120   end;
121 
122 end;
123 
TypeOfTextFilenull124 function TypeOfTextFile(const psFileName: string): TFileContentType;
125 var
126   fs: TFileStream;
127 begin
128   {open file}
129   fs := TFileStream.Create(psFileName, fmOpenRead);
130   try
131     Result := ReadFileHeader(fs);
132   finally
133     // close the file
134     fs.Free;
135   end;
136 end;
137 
138 procedure ReadPastFileHeader(const pcFileStream: TFileStream; const peContentType: TFileContentType);
139 var
140   liOffsetBytes: integer;
141 begin
142   case peContentType of
143     e8Bit:
144       liOffsetBytes := 0;
145     eUtf8:
146       liOffsetBytes := 3;
147     eUtf16LittleEndian, eUtf16BigEndian:
148       liOffsetBytes := 2;
149     eUtf32LittleEndian, eUtf32BigEndian:
150       liOffsetBytes := 4;
151     else
152       raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
153   end;
154 
155   pcFileStream.Seek(liOffsetBytes, soFromBeginning);
156 end;
157 
158 { this is one of the few cases where 'AnsiString' must be used }
Read8BitFilenull159 function Read8BitFile(const pcFileStream: TFileStream): String;
160 var
161   liBytesRemaining: integer;
162   lsContents8bit: AnsiString;
163 begin
164   liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
165   // read the bytes into a string
166   SetLength(lsContents8bit, liBytesRemaining);
167   if pcFileStream.Size > 0 then
168   begin
169     pcFileStream.ReadBuffer(lsContents8bit[1], liBytesRemaining);
170   end;
171   Result := lsContents8bit;
172 end;
173 
174 
ReadUtf8Filenull175 function ReadUtf8File(const pcFileStream: TFileStream): String;
176 var
177   liBytesRemaining: integer;
178   lsContents: AnsiString;
179 begin
180   liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
181   // read the bytes into a string
182   SetLength(lsContents, liBytesRemaining);
183   if pcFileStream.Size > 0 then
184   begin
185     pcFileStream.ReadBuffer(lsContents[1], liBytesRemaining);
186   end;
187   Result := lsContents;
188 end;
189 
190 
Read16BitFilenull191 function Read16BitFile(const pcFileStream: TFileStream; const pbBigEndian: boolean): WideString;
192 var
193   liBytesRemaining: integer;
194   liLoop: integer;
195   lsWideContents: WideString;
196 begin
197   // read it
198   liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
199   SetLength(lsWideContents, liBytesRemaining div 2);
200   pcFileStream.Read(lsWideContents[1], liBytesRemaining);
201 
202   if pbBigEndian then
203   begin
204     // swap the bytes
205     for liLoop := 1 to Length(lsWideContents) do
206       lsWideContents[liLoop] := WideChar(Swap(word(lsWideContents[liLoop])));
207   end;
208 
209   Result := lsWideContents;
210 end;
211 
SwapWordsnull212 function SwapWords(const value: UCS4Char): UCS4Char;
213 var
214   hi: word;
215   lo: word;
216 begin
217   // split into 16-bit words
218   hi := value shr 16;
219   lo := (value and $0000FFFF); // Prevent Range check error by converting to a word
220 
221   hi := Swap(hi);
222   lo := Swap(lo);
223 
224   // recombine
225   Result := (lo shl 16) + hi;
226 end;
227 
Read32BitFilenull228 function Read32BitFile(const pcFileStream: TFileStream; pbBigEndian: boolean): WideString;
229 var
230   liBytesRemaining: integer;
231   charsRemaining: integer;
232   ucs4Chars: UCS4String;
233   liLoop: integer;
234 begin
235   liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
236   charsRemaining := liBytesRemaining div 4;
237 
238   SetLength(ucs4Chars, charsRemaining);
239   pcFileStream.Read(ucs4Chars[0], liBytesRemaining);
240 
241   if pbBigEndian then
242   begin
243     // swap the bytes
244     for liLoop := 0 to charsRemaining - 1 do
245       ucs4Chars[liLoop] := SwapWords(ucs4Chars[liLoop]);
246   end;
247 
248   Result := UCS4StringToWideString(ucs4Chars);
249 end;
250 
251 { read in a text file,
252   the file can contain 8-bit or 16-bit chars
253   code is much adapted from a sample by Mike Shkolnik
254   in nntp://borland.public.delphi.rtl.general
255   Re: Read UNICODE/ANSI/ASCII Text File to String
256   at: Jan 23 2006, 12:17
257   found at http://delphi.newswhat.com/geoxml/forumhistorythread?groupname=borland.public.delphi.rtl.general&messageid=43d485bf$1@newsgroups.borland.com
258 }
259 procedure ReadTextFile(const psFileName: string; out psContents: String;
260   out peContentType: TFileContentType);
261 var
262   fs: TFileStream;
263 begin
264   psContents    := '';
265   peContentType := eUnknown;
266 
267   {open file}
268   fs := TFileStream.Create(psFileName, fmOpenRead);
269   try
270     peContentType := ReadFileHeader(fs);
271 
272     ReadPastFileHeader(fs, peContentType);
273 
274     case peContentType of
275       e8Bit:
276         psContents := Read8BitFile(fs);
277 
278       eUtf8:
279         psContents := ReadUtf8File(fs);
280 
281       eUtf16LittleEndian, eUtf16BigEndian:
282         psContents := {%H-}Read16BitFile(fs, peContentType = eUtf16BigEndian);
283 
284       eUtf32LittleEndian, eUtf32BigEndian:
285         psContents := {%H-}Read32BitFile(fs, peContentType = eUtf32BigEndian);
286       else
287         raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
288 
289     end;
290   finally
291     // close the file
292     fs.Free;
293   end;
294 end;
295 
296 { this is one of the few cases when "AnsiString" must be used }
297 procedure Write8BitFile(const pcFileStream: TFileStream; const psContents: String);
298 var
299   Len:    integer;
300   lsContents: AnsiString;
301 begin
302   lsContents := psContents;
303   Len := Length(lsContents);
304   if Len > 0 then
305     pcFileStream.WriteBuffer(lsContents[1], Len);
306 end;
307 
308 { this is one of the few cases when "AnsiString" must be used }
309 procedure WriteUtf8File(const pcFileStream: TFileStream; const psContents: String);
310 var
311   Len:    integer;
312   lsContents: AnsiString;
313   utf8Header: array [0..2] of byte;
314 begin
315   lsContents := psContents;
316   Len := Length(lsContents);
317   // write the BOM
318   utf8Header[0] := Utf8Marker1;
319   utf8Header[1] := Utf8Marker2;
320   utf8Header[2] := Utf8Marker3;
321   pcFileStream.WriteBuffer(utf8Header[0], 3);
322 
323   if Len > 0 then
324     pcFileStream.WriteBuffer(lsContents[1], Len);
325 end;
326 
327 procedure Write16BitFile(const pcFileStream: TFileStream;
328   const psContents: WideString; const pbBigEndian: boolean);
329 var
330   Len:    integer;
331   liLoop: integer;
332   wChar:  word;
333 begin
334   Len := Length(psContents);
335 
336   if Len > 0 then
337   begin
338     if pbBigEndian then
339     begin
340       // write the BOM
341       wChar := Utf16BigEndianMarker;
342       pcFileStream.WriteBuffer(wChar, 2);
343 
344       for liLoop := 1 to Len do
345       begin
346         wChar := Swap(word(psContents[liLoop]));
347         pcFileStream.WriteBuffer(wChar, 2);
348       end;
349     end
350     else
351     begin
352       // write the BOM
353       wChar := Utf16LittleEndianMarker;
354       pcFileStream.WriteBuffer(wChar, 2);
355 
356       pcFileStream.WriteBuffer(psContents[1], Len * 2);
357     end;
358   end;
359 end;
360 
361 procedure Write32BitFile(const pcFileStream: TFileStream;
362   const psContents: WideString; const pbBigEndian: boolean);
363 var
364   Len:    integer;
365   liLoop: integer;
366   lsUcs4String: UCS4String;
367   lcUcs4Char: UCS4Char;
368   wChar: word;
369 begin
370   Len := Length(psContents);
371 
372   if Len > 0 then
373   begin
374     lsUcs4String := WideStringToUCS4String(psContents);
375 
376     if pbBigEndian then
377     begin
378       // write the BOM
379       wChar := Utf32BigEndianMarker1;
380       pcFileStream.WriteBuffer(wChar, 2);
381       wChar := Utf32BigEndianMarker2;
382       pcFileStream.WriteBuffer(wChar, 2);
383 
384       for liLoop := 0 to Len do
385       begin
386         lcUcs4Char := SwapWords(lsUcs4String[liLoop]);
387         pcFileStream.WriteBuffer(lcUcs4Char, 4);
388       end;
389     end
390     else
391     begin
392       // write the BOM
393       wChar := Utf32LittleEndianMarker1;
394       pcFileStream.WriteBuffer(wChar, 2);
395       wChar := Utf32LittleEndianMarker2;
396       pcFileStream.WriteBuffer(wChar, 2);
397 
398       // an array not a real string, indexed from zero
399       pcFileStream.WriteBuffer(lsUcs4String[0], (Len + 1) * 4);
400     end;
401   end;
402 end;
403 
404 procedure WriteTextFile(const psFileName: string; const psContents: String;
405   const peContentType: TFileContentType);
406 var
407   fs:     TFileStream;
408  begin
409   fs := TFileStream.Create(psFileName, fmCreate);
410   try
411 
412    case peContentType of
413      e8Bit:
414      begin
415        Write8BitFile(fs, psContents);
416      end;
417 
418      eUtf8:
419      begin
420        WriteUtf8File(fs, psContents);
421      end;
422 
423      eUtf16LittleEndian, eUtf16BigEndian:
424      begin
425        Write16BitFile(fs, psContents{%H-}, peContentType = eUtf16BigEndian);
426      end;
427 
428      eUtf32LittleEndian, eUtf32BigEndian:
429      begin
430        Write32BitFile(fs, psContents{%H-}, peContentType = eUtf32BigEndian);
431      end;
432 
433      else
434        raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
435 
436    end;
437 
438   finally
439     fs.Free;
440   end;
441 end;
442 
443 end.
444