1 unit JcfUnicodeFiles;
2 
3 {(*}
4 (*------------------------------------------------------------------------------
5  Delphi Code formatter source code
6 
7 The Original Code is JcfUnicodeFiles, released March 2007.
8 The Initial Developer of the Original Code is Anthony Steele.
9 Portions created by Anthony Steele are Copyright (C) 2007 Anthony Steele.
10 All Rights Reserved.
11 Contributor(s): Anthony Steele.
12 
13 The contents of this file are subject to the Mozilla Public License Version 1.1
14 (the "License"). you may not use this file except in compliance with the License.
15 You may obtain a copy of the License at http://www.mozilla.org/NPL/
16 
17 Software distributed under the License is distributed on an "AS IS" basis,
18 WITHOUT WARRANTY OF ANY KIND, either express or implied.
19 See the License for the specific language governing rights and limitations
20 under the License.
21 
22 Alternatively, the contents of this file may be used under the terms of
23 the GNU General Public License Version 2 or later (the "GPL")
24 See http://www.gnu.org/licenses/gpl.html
25 ------------------------------------------------------------------------------*)
26 {*)}
27 
28 {$I JcfGlobal.inc}
29 
30 interface
31 
32 type
33   TFileContentType = (eUnknown, e8Bit, eUtf8,
34     eUtf16LittleEndian, eUtf16BigEndian,
35     eUtf32LittleEndian, eUtf32BigEndian);
36 
TypeOfTextFilenull37 function TypeOfTextFile(const psFileName: string): TFileContentType;
38 
39 procedure ReadTextFile(const psFileName: string; out psContents: String;
40   out peContentType: TFileContentType);
41 
42 procedure WriteTextFile(const psFileName: string; const psContents: String;
43   const peContentType: TFileContentType);
44 
45 
46 implementation
47 
48 uses
49   { delphi }
50   Classes, SysUtils, Dialogs;
51 
52 const
53   // byte order markers (BOM)
54   // these are found at the start of the file
55 
56   /// 3 bytes for UTF-8
57   Utf8Marker12 = $BBEF;
58   Utf8Marker1 = $EF;
59   Utf8Marker2 = $BB;
60   Utf8Marker3  = $BF;
61 
62   // 4 bytes for UTF-16. Big or little-endian
63   Utf16LittleEndianMarker = $FEFF;
64   Utf16BigEndianMarker = $FFFE;
65 
66   // 4 bytes for utf-32. Big or little-endian
67   Utf32LittleEndianMarker1 = $FEFF;
68   Utf32LittleEndianMarker2 = $0000;
69 
70   Utf32BigEndianMarker1 = $0000;
71   Utf32BigEndianMarker2 = $FFFE;
72 
73   //MaxAnsiChar = 127;
74 
ReadFileHeadernull75 function ReadFileHeader(const pcFileStream: TFileStream): TFileContentType;
76 var
77   word1: word;
78   word2: word;
79   byte3: byte;
80 begin
81   // small files are ansi
82   if pcFileStream.Size < 4 then
83   begin
84     Result := e8Bit;
85     exit;
86   end;
87 
88   // read the first 4 bytes
89   pcFileStream.Seek(0, soFromBeginning);
90 
91   word1:=0;
92   word2:=0;
93   pcFileStream.Read(word1, SizeOf(word));
94   pcFileStream.Read(word2, SizeOf(word));
95 
96   byte3 := byte(word2);
97 
98   if (word1 = Utf32LittleEndianMarker1) and (word2 = Utf32LittleEndianMarker2) then
99   begin
100     Result := eUtf32LittleEndian;
101   end
102   else if (word1 = Utf32BigEndianMarker1) and (word2 = Utf32BigEndianMarker2) then
103   begin
104     Result := eUtf32BigEndian;
105   end
106   else if (word1 = Utf8Marker12) and (byte3 = Utf8Marker3) then
107   begin
108     Result := eUtf8;
109   end
110   else if (word1 = Utf16LittleEndianMarker) then
111   begin
112     Result := eUtf16LittleEndian;
113   end
114   else if (word1 = Utf16BigEndianMarker) then
115   begin
116     Result := eUtf16BigEndian;
117   end
118   else
119   begin
120     Result := e8Bit;
121   end;
122 
123 end;
124 
TypeOfTextFilenull125 function TypeOfTextFile(const psFileName: string): TFileContentType;
126 var
127   fs: TFileStream;
128 begin
129   {open file}
130   fs := TFileStream.Create(psFileName, fmOpenRead);
131   try
132     Result := ReadFileHeader(fs);
133   finally
134     // close the file
135     fs.Free;
136   end;
137 end;
138 
139 procedure ReadPastFileHeader(const pcFileStream: TFileStream; const peContentType: TFileContentType);
140 var
141   liOffsetBytes: integer;
142 begin
143   case peContentType of
144     e8Bit:
145       liOffsetBytes := 0;
146     eUtf8:
147       liOffsetBytes := 3;
148     eUtf16LittleEndian, eUtf16BigEndian:
149       liOffsetBytes := 2;
150     eUtf32LittleEndian, eUtf32BigEndian:
151       liOffsetBytes := 4;
152     else
153       raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
154   end;
155 
156   pcFileStream.Seek(liOffsetBytes, soFromBeginning);
157 end;
158 
159 { this is one of the few cases where 'AnsiString' must be used }
Read8BitFilenull160 function Read8BitFile(const pcFileStream: TFileStream): String;
161 var
162   liBytesRemaining: integer;
163   lsContents8bit: AnsiString;
164 begin
165   liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
166   // read the bytes into a string
167   SetLength(lsContents8bit, liBytesRemaining);
168   if pcFileStream.Size > 0 then
169   begin
170     pcFileStream.ReadBuffer(lsContents8bit[1], liBytesRemaining);
171   end;
172   Result := lsContents8bit;
173 end;
174 
175 
ReadUtf8Filenull176 function ReadUtf8File(const pcFileStream: TFileStream): String;
177 var
178   liBytesRemaining: integer;
179   lsContents: AnsiString;
180 begin
181   liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
182   // read the bytes into a string
183   SetLength(lsContents, liBytesRemaining);
184   if pcFileStream.Size > 0 then
185   begin
186     pcFileStream.ReadBuffer(lsContents[1], liBytesRemaining);
187   end;
188   Result := lsContents;
189 end;
190 
191 
Read16BitFilenull192 function Read16BitFile(const pcFileStream: TFileStream; const pbBigEndian: boolean): WideString;
193 var
194   liBytesRemaining: integer;
195   liLoop: integer;
196   lsWideContents: WideString;
197 begin
198   // read it
199   liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
200   SetLength(lsWideContents, liBytesRemaining div 2);
201   pcFileStream.Read(lsWideContents[1], liBytesRemaining);
202 
203   if pbBigEndian then
204   begin
205     // swap the bytes
206     for liLoop := 1 to Length(lsWideContents) do
207       lsWideContents[liLoop] := WideChar(Swap(word(lsWideContents[liLoop])));
208   end;
209 
210   Result := lsWideContents;
211 end;
212 
SwapWordsnull213 function SwapWords(const value: UCS4Char): UCS4Char;
214 var
215   hi: word;
216   lo: word;
217 begin
218   // split into 16-bit words
219   hi := value shr 16;
220   lo := (value and $0000FFFF); // Prevent Range check error by converting to a word
221 
222   hi := Swap(hi);
223   lo := Swap(lo);
224 
225   // recombine
226   Result := (lo shl 16) + hi;
227 end;
228 
Read32BitFilenull229 function Read32BitFile(const pcFileStream: TFileStream; pbBigEndian: boolean): WideString;
230 var
231   liBytesRemaining: integer;
232   charsRemaining: integer;
233   ucs4Chars: UCS4String;
234   liLoop: integer;
235 begin
236   liBytesRemaining := pcFileStream.Size - pcFileStream.Position;
237   charsRemaining := liBytesRemaining div 4;
238 
239   SetLength(ucs4Chars, charsRemaining);
240   pcFileStream.Read(ucs4Chars[0], liBytesRemaining);
241 
242   if pbBigEndian then
243   begin
244     // swap the bytes
245     for liLoop := 0 to charsRemaining - 1 do
246       ucs4Chars[liLoop] := SwapWords(ucs4Chars[liLoop]);
247   end;
248 
249   Result := UCS4StringToWideString(ucs4Chars);
250 end;
251 
252 { read in a text file,
253   the file can contain 8-bit or 16-bit chars
254   code is much adapted from a sample by Mike Shkolnik
255   in nntp://borland.public.delphi.rtl.general
256   Re: Read UNICODE/ANSI/ASCII Text File to String
257   at: Jan 23 2006, 12:17
258   found at http://delphi.newswhat.com/geoxml/forumhistorythread?groupname=borland.public.delphi.rtl.general&messageid=43d485bf$1@newsgroups.borland.com
259 }
260 procedure ReadTextFile(const psFileName: string; out psContents: String;
261   out peContentType: TFileContentType);
262 var
263   fs: TFileStream;
264 begin
265   psContents    := '';
266   peContentType := eUnknown;
267 
268   {open file}
269   fs := TFileStream.Create(psFileName, fmOpenRead);
270   try
271     peContentType := ReadFileHeader(fs);
272 
273     ReadPastFileHeader(fs, peContentType);
274 
275     case peContentType of
276       e8Bit:
277         psContents := Read8BitFile(fs);
278 
279       eUtf8:
280         psContents := ReadUtf8File(fs);
281 
282       eUtf16LittleEndian, eUtf16BigEndian:
283         psContents := {%H-}Read16BitFile(fs, peContentType = eUtf16BigEndian);
284 
285       eUtf32LittleEndian, eUtf32BigEndian:
286         psContents := {%H-}Read32BitFile(fs, peContentType = eUtf32BigEndian);
287       else
288         raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
289 
290     end;
291   finally
292     // close the file
293     fs.Free;
294   end;
295 end;
296 
297 { this is one of the few cases when "AnsiString" must be used }
298 procedure Write8BitFile(const pcFileStream: TFileStream; const psContents: String);
299 var
300   Len:    integer;
301   lsContents: AnsiString;
302 begin
303   lsContents := psContents;
304   Len := Length(lsContents);
305   if Len > 0 then
306     pcFileStream.WriteBuffer(lsContents[1], Len);
307 end;
308 
309 { this is one of the few cases when "AnsiString" must be used }
310 procedure WriteUtf8File(const pcFileStream: TFileStream; const psContents: String);
311 var
312   Len:    integer;
313   lsContents: AnsiString;
314   utf8Header: array [0..2] of byte;
315 begin
316   lsContents := psContents;
317   Len := Length(lsContents);
318   // write the BOM
319   utf8Header[0] := Utf8Marker1;
320   utf8Header[1] := Utf8Marker2;
321   utf8Header[2] := Utf8Marker3;
322   pcFileStream.WriteBuffer(utf8Header[0], 3);
323 
324   if Len > 0 then
325     pcFileStream.WriteBuffer(lsContents[1], Len);
326 end;
327 
328 procedure Write16BitFile(const pcFileStream: TFileStream;
329   const psContents: WideString; const pbBigEndian: boolean);
330 var
331   Len:    integer;
332   liLoop: integer;
333   wChar:  word;
334 begin
335   Len := Length(psContents);
336 
337   if Len > 0 then
338   begin
339     if pbBigEndian then
340     begin
341       // write the BOM
342       wChar := Utf16BigEndianMarker;
343       pcFileStream.WriteBuffer(wChar, 2);
344 
345       for liLoop := 1 to Len do
346       begin
347         wChar := Swap(word(psContents[liLoop]));
348         pcFileStream.WriteBuffer(wChar, 2);
349       end;
350     end
351     else
352     begin
353       // write the BOM
354       wChar := Utf16LittleEndianMarker;
355       pcFileStream.WriteBuffer(wChar, 2);
356 
357       pcFileStream.WriteBuffer(psContents[1], Len * 2);
358     end;
359   end;
360 end;
361 
362 procedure Write32BitFile(const pcFileStream: TFileStream;
363   const psContents: WideString; const pbBigEndian: boolean);
364 var
365   Len:    integer;
366   liLoop: integer;
367   lsUcs4String: UCS4String;
368   lcUcs4Char: UCS4Char;
369   wChar: word;
370 begin
371   Len := Length(psContents);
372 
373   if Len > 0 then
374   begin
375     lsUcs4String := WideStringToUCS4String(psContents);
376 
377     if pbBigEndian then
378     begin
379       // write the BOM
380       wChar := Utf32BigEndianMarker1;
381       pcFileStream.WriteBuffer(wChar, 2);
382       wChar := Utf32BigEndianMarker2;
383       pcFileStream.WriteBuffer(wChar, 2);
384 
385       for liLoop := 0 to Len do
386       begin
387         lcUcs4Char := SwapWords(lsUcs4String[liLoop]);
388         pcFileStream.WriteBuffer(lcUcs4Char, 4);
389       end;
390     end
391     else
392     begin
393       // write the BOM
394       wChar := Utf32LittleEndianMarker1;
395       pcFileStream.WriteBuffer(wChar, 2);
396       wChar := Utf32LittleEndianMarker2;
397       pcFileStream.WriteBuffer(wChar, 2);
398 
399       // an array not a real string, indexed from zero
400       pcFileStream.WriteBuffer(lsUcs4String[0], (Len + 1) * 4);
401     end;
402   end;
403 end;
404 
405 procedure WriteTextFile(const psFileName: string; const psContents: String;
406   const peContentType: TFileContentType);
407 var
408   fs:     TFileStream;
409  begin
410   fs := TFileStream.Create(psFileName, fmCreate);
411   try
412 
413    case peContentType of
414      e8Bit:
415      begin
416        Write8BitFile(fs, psContents);
417      end;
418 
419      eUtf8:
420      begin
421        WriteUtf8File(fs, psContents);
422      end;
423 
424      eUtf16LittleEndian, eUtf16BigEndian:
425      begin
426        Write16BitFile(fs, psContents{%H-}, peContentType = eUtf16BigEndian);
427      end;
428 
429      eUtf32LittleEndian, eUtf32BigEndian:
430      begin
431        Write32BitFile(fs, psContents{%H-}, peContentType = eUtf32BigEndian);
432      end;
433 
434      else
435        raise Exception.Create('Unknown file content type: ' + IntToStr(Ord(peContentType)));
436 
437    end;
438 
439   finally
440     fs.Free;
441   end;
442 end;
443 
444 end.
445