1 /**********************************************************************
2 
3    Audacity: A Digital Audio Editor
4    Audacity(R) is copyright (c) 1999-2010 Audacity Team.
5    License: GPL v2.  See License.txt.
6 
7    ProjectSerializer.cpp
8 
9 *******************************************************************//**
10 
11 \class ProjectSerializer
12 \brief a class used to (de)serialize the project catalog
13 
14 *//********************************************************************/
15 
16 
17 #include "ProjectSerializer.h"
18 
19 #include <algorithm>
20 #include <cstdint>
21 #include <mutex>
22 #include <wx/ustring.h>
23 #include <codecvt>
24 #include <locale>
25 #include <deque>
26 
27 #include <wx/log.h>
28 
29 #include "BufferedStreamReader.h"
30 
31 ///
32 /// ProjectSerializer class
33 ///
34 
35 // Simple "binary xml" format used exclusively for project documents.
36 //
37 // It is not intended that the user view or modify the file.
38 //
39 // It IS intended that very little work be done during auto save, so numbers
40 // and strings are written in their native format.  They will be converted
41 // during recovery.
42 //
43 // The file has 3 main sections:
44 //
45 //    character size    1 (UTF-8), 2 (UTF-16) or 4 (UTF-32)
46 //    name dictionary   dictionary of all names used in the document
47 //    data fields       the "encoded" XML document
48 //
49 // If a subtree is added, it will be preceded with FT_Push to tell the decoder
50 // to preserve the active dictionary.  The decoder will then restore the
51 // dictionary when an FT_Pop is encountered.  Nesting is unlimited.
52 //
53 // To save space, each name (attribute or element) encountered is stored in
54 // the name dictionary and replaced with the assigned 2-byte identifier.
55 //
56 // All strings are in native unicode format, 2-byte or 4-byte.
57 //
// All name "lengths" are written as 2-byte unsigned values, but names are
// limited to 32767 bytes long.
59 // All string/data "lengths" are 4-byte signed.
60 
// Tags identifying each field of the binary document.  A tag is written as a
// single byte in front of its payload, so the numeric values below are part
// of the file format and are spelled out explicitly.
enum FieldTypes
{
   FT_CharSize = 0,  // type, value (one byte, no ID)
   FT_StartTag = 1,  // type, ID
   FT_EndTag   = 2,  // type, ID
   FT_String   = 3,  // type, ID, string length, string
   FT_Int      = 4,  // type, ID, value
   FT_Bool     = 5,  // type, ID, value
   FT_Long     = 6,  // type, ID, value
   FT_LongLong = 7,  // type, ID, value
   FT_SizeT    = 8,  // type, ID, value
   FT_Float    = 9,  // type, ID, value, digits
   FT_Double   = 10, // type, ID, value, digits
   FT_Data     = 11, // type, string length, string
   FT_Raw      = 12, // type, string length, string
   FT_Push     = 13, // type only
   FT_Pop      = 14, // type only
   FT_Name     = 15  // type, ID, name length, name
};
80 
81 // Static so that the dict can be reused each time.
82 //
83 // If entries get added later, like when an envelope node (for example)
84 // is written and then the envelope is later removed, the dict will still
85 // contain the envelope name, but that's not a problem.
86 
87 NameMap ProjectSerializer::mNames;
88 MemoryStream ProjectSerializer::mDict;
89 
FailureMessage(const FilePath &)90 TranslatableString ProjectSerializer::FailureMessage( const FilePath &/*filePath*/ )
91 {
92    return
93 XO("This recovery file was saved by Audacity 2.3.0 or before.\n"
94    "You need to run that version of Audacity to recover the project." );
95 }
96 
97 namespace
98 {
99 // Aliases for the FIXED-WIDTH integer types that are used in the file
100 // format.
101 
102 // Chosen so that among the four build types (32 bit Windows, 64
103 // bit Windows, 64 bit Mac clang, Linux g++) presently done (3.0.0
104 // development), we use the narrowest width of the type on any of them, so
105 // that anything saved on one build will be read back identically on all
106 // builds. (Although this means that very large values on some systems might
107 // be saved and then read back with loss.)
108 
109 // In fact the only types for which this matters are long (only 32 bits on
110 // 32 and 64 bit Windows) and size_t (only 32 bits on 32 bit Windows).
111 
// 16 bit: dictionary identifiers and name lengths
using UShort = std::uint16_t;

// 32 bit: int values, plus the types whose native width differs per platform
using Int = std::int32_t;
using Long = std::int32_t;   // To save long values
using ULong = std::uint32_t; // To save size_t values

// 64 bit: long long values
using LongLong = std::int64_t;
119 
120 // Detect this computer's endianness
// Reports whether the lowest-addressed byte of a 32-bit integer holds its
// least significant bits.  Other integer widths are assumed to follow suit.
bool IsLittleEndian()
{
   const std::uint32_t probe = 1u;
   const unsigned char firstByte =
      *reinterpret_cast<const unsigned char*>(&probe);

   return firstByte != 0;
}
127 // In C++20 this could be
128 // constexpr bool IsLittleEndian = (std::endian::native == std::endian::little);
129 // static_assert( IsLittleEndian || (std::endian::native == std::endian::big),
130 //    "Oh no!  I'm mixed-endian!" );
131 
132 // Functions that can read and write native integer types to a canonicalized
133 // little-endian file format.  (We don't bother to do the same for floating
134 // point numbers.)
135 
136 // Write native little-endian to little-endian file format
137 template <typename Number>
WriteLittleEndian(MemoryStream & out,Number value)138 void WriteLittleEndian(MemoryStream& out, Number value)
139 {
140    out.AppendData(&value, sizeof(value));
141 }
142 
143 // Write native big-endian to little-endian file format
WriteBigEndian(MemoryStream & out,Number value)144 template <typename Number> void WriteBigEndian(MemoryStream& out, Number value)
145 {
146    auto begin = static_cast<unsigned char*>(static_cast<void*>(&value));
147    std::reverse(begin, begin + sizeof(value));
148    out.AppendData(&value, sizeof(value));
149 }
150 
151 // Read little-endian file format to native little-endian
ReadLittleEndian(BufferedStreamReader & in)152 template <typename Number> Number ReadLittleEndian(BufferedStreamReader& in)
153 {
154    Number result;
155    in.ReadValue(result);
156    return result;
157 }
158 
159 // Read little-endian file format to native big-endian
ReadBigEndian(BufferedStreamReader & in)160 template <typename Number> Number ReadBigEndian(BufferedStreamReader& in)
161 {
162    Number result;
163    in.ReadValue(result);
164    auto begin = static_cast<unsigned char*>(static_cast<void*>(&result));
165    std::reverse(begin, begin + sizeof(result));
166    return result;
167 }
168 
169 // Choose between implementations!
170 static const auto WriteUShort =
171    IsLittleEndian() ? &WriteLittleEndian<UShort> : &WriteBigEndian<UShort>;
172 static const auto WriteInt =
173    IsLittleEndian() ? &WriteLittleEndian<Int> : &WriteBigEndian<Int>;
174 static const auto WriteLong =
175    IsLittleEndian() ? &WriteLittleEndian<Long> : &WriteBigEndian<Long>;
176 static const auto WriteULong =
177    IsLittleEndian() ? &WriteLittleEndian<ULong> : &WriteBigEndian<ULong>;
178 static const auto WriteLongLong =
179    IsLittleEndian() ? &WriteLittleEndian<LongLong> : &WriteBigEndian<LongLong>;
180 
181 static const auto ReadUShort =
182    IsLittleEndian() ? &ReadLittleEndian<UShort> : &ReadBigEndian<UShort>;
183 static const auto ReadInt =
184    IsLittleEndian() ? &ReadLittleEndian<Int> : &ReadBigEndian<Int>;
185 static const auto ReadLong =
186    IsLittleEndian() ? &ReadLittleEndian<Long> : &ReadBigEndian<Long>;
187 static const auto ReadULong =
188    IsLittleEndian() ? &ReadLittleEndian<ULong> : &ReadBigEndian<ULong>;
189 static const auto ReadLongLong =
190    IsLittleEndian() ? &ReadLittleEndian<LongLong> : &ReadBigEndian<LongLong>;
191 
192 // Functions to read and write certain lengths -- maybe we will change
193 // our choices for widths or signedness?
194 
195 using Length = Int; // Instead, as wide as size_t?
196 static const auto WriteLength = WriteInt;
197 static const auto ReadLength = ReadInt;
198 
199 using Digits = Int; // Instead, just an unsigned char?
200 static const auto WriteDigits = WriteInt;
201 static const auto ReadDigits = ReadInt;
202 
203 class XMLTagHandlerAdapter final
204 {
205 public:
XMLTagHandlerAdapter(XMLTagHandler * handler)206    explicit XMLTagHandlerAdapter(XMLTagHandler* handler) noexcept
207        : mBaseHandler(handler)
208    {
209    }
210 
EmitStartTag(const std::string_view & name)211    void EmitStartTag(const std::string_view& name)
212    {
213       if (mInTag)
214          EmitStartTag();
215 
216       mCurrentTagName = name;
217       mInTag = true;
218    }
219 
EndTag(const std::string_view & name)220    void EndTag(const std::string_view& name)
221    {
222       if (mInTag)
223          EmitStartTag();
224 
225       if (XMLTagHandler* const handler = mHandlers.back())
226          handler->HandleXMLEndTag(name);
227 
228       mHandlers.pop_back();
229    }
230 
WriteAttr(const std::string_view & name,std::string value)231    void WriteAttr(const std::string_view& name, std::string value)
232    {
233       assert(mInTag);
234 
235       if (!mInTag)
236          return;
237 
238       mAttributes.emplace_back(name, CacheString(std::move(value)));
239    }
240 
WriteAttr(const std::string_view & name,T value)241    template <typename T> void WriteAttr(const std::string_view& name, T value)
242    {
243       assert(mInTag);
244 
245       if (!mInTag)
246          return;
247 
248       mAttributes.emplace_back(name, XMLAttributeValueView(value));
249    }
250 
WriteData(std::string value)251    void WriteData(std::string value)
252    {
253       if (mInTag)
254          EmitStartTag();
255 
256       if (XMLTagHandler* const handler = mHandlers.back())
257          handler->HandleXMLContent(CacheString(std::move(value)));
258    }
259 
WriteRaw(std::string)260    void WriteRaw(std::string)
261    {
262       // This method is intentionally left empty.
263       // The only data that is serialized by FT_Raw
264       // is the boilerplate code like <?xml > and <!DOCTYPE>
265       // which are ignored
266    }
267 
Finalize()268    bool Finalize()
269    {
270       if (mInTag)
271       {
272          EmitStartTag();
273          EndTag(mCurrentTagName);
274       }
275 
276       return mBaseHandler != nullptr;
277    }
278 
279 private:
EmitStartTag()280    void EmitStartTag()
281    {
282       if (mHandlers.empty())
283       {
284          mHandlers.push_back(mBaseHandler);
285       }
286       else
287       {
288          if (XMLTagHandler* const handler = mHandlers.back())
289             mHandlers.push_back(handler->HandleXMLChild(mCurrentTagName));
290          else
291             mHandlers.push_back(NULL);
292       }
293 
294       if (XMLTagHandler*& handler = mHandlers.back())
295       {
296          if (!handler->HandleXMLTag(mCurrentTagName, mAttributes))
297          {
298             handler = nullptr;
299 
300             if (mHandlers.size() == 1)
301                mBaseHandler = nullptr;
302          }
303       }
304 
305       mStringsCache.clear();
306       mAttributes.clear();
307       mInTag = false;
308    }
309 
CacheString(std::string string)310    std::string_view CacheString(std::string string)
311    {
312       mStringsCache.emplace_back(std::move(string));
313       return mStringsCache.back();
314    }
315 
316    XMLTagHandler* mBaseHandler;
317 
318    std::vector<XMLTagHandler*> mHandlers;
319 
320    std::string_view mCurrentTagName;
321 
322    std::deque<std::string> mStringsCache;
323    AttributesList mAttributes;
324 
325    bool mInTag { false };
326 };
327 
328 // template<typename BaseCharType>
329 // std::string FastStringConvertFromAscii(const BaseCharType* begin, const BaseCharType* end)
330 // {
331 //
332 // }
333 
// Converts a buffer of BaseCharType code units to a UTF-8 std::string.
//
// bytes       start of the buffer
// bytesCount  size of the buffer in bytes (not in characters); values that
//             are zero or negative yield an empty string
//
// Text made only of code units below 0x7f is copied unit by unit without
// going through a conversion facet.  Anything else is converted: 2-byte
// units are treated as UTF-16 (so surrogate pairs become one 4-byte UTF-8
// sequence), other widths as one code point per unit.
//
// std::wstring_convert::to_bytes throws std::range_error for input it
// cannot convert.
template<typename BaseCharType>
std::string FastStringConvert(const void* bytes, int bytesCount)
{
   constexpr int charSize = sizeof(BaseCharType);

   assert(bytesCount % charSize == 0);

   if (bytesCount <= 0)
      return {};

   const auto begin = static_cast<const BaseCharType*>(bytes);
   const auto end = begin + bytesCount / charSize;

   const bool isAscii = std::all_of(
      begin, end,
      [](BaseCharType c)
      { return static_cast<std::make_unsigned_t<BaseCharType>>(c) < 0x7f; });

   if (isAscii)
      return std::string(begin, end);

   // codecvt_utf8<char16_t> only understands UCS-2 and fails on surrogate
   // pairs; UTF-16 input needs codecvt_utf8_utf16.
   using Facet = std::conditional_t<
      charSize == 2,
      std::codecvt_utf8_utf16<BaseCharType>,
      std::codecvt_utf8<BaseCharType>>;

   return std::wstring_convert<Facet, BaseCharType>().to_bytes(begin, end);
}
355 } // namespace
356 
ProjectSerializer(size_t allocSize)357 ProjectSerializer::ProjectSerializer(size_t allocSize)
358 {
359    static std::once_flag flag;
360    std::call_once(flag, []{
361       // Just once per run, store header information in the unique static
362       // dictionary that will be written into each project that is saved.
363       // Store the size of "wxStringCharType" so we can convert during recovery
364       // in case the file is used on a system with a different character size.
365       char size = sizeof(wxStringCharType);
366       mDict.AppendByte(FT_CharSize);
367       mDict.AppendData(&size, 1);
368    });
369 
370    mDictChanged = false;
371 }
372 
~ProjectSerializer()373 ProjectSerializer::~ProjectSerializer()
374 {
375 }
376 
StartTag(const wxString & name)377 void ProjectSerializer::StartTag(const wxString & name)
378 {
379    mBuffer.AppendByte(FT_StartTag);
380    WriteName(name);
381 }
382 
EndTag(const wxString & name)383 void ProjectSerializer::EndTag(const wxString & name)
384 {
385    mBuffer.AppendByte(FT_EndTag);
386    WriteName(name);
387 }
388 
WriteAttr(const wxString & name,const wxChar * value)389 void ProjectSerializer::WriteAttr(const wxString & name, const wxChar *value)
390 {
391    WriteAttr(name, wxString(value));
392 }
393 
WriteAttr(const wxString & name,const wxString & value)394 void ProjectSerializer::WriteAttr(const wxString & name, const wxString & value)
395 {
396    mBuffer.AppendByte(FT_String);
397    WriteName(name);
398 
399    const Length len = value.length() * sizeof(wxStringCharType);
400    WriteLength( mBuffer, len );
401    mBuffer.AppendData(value.wx_str(), len);
402 }
403 
WriteAttr(const wxString & name,int value)404 void ProjectSerializer::WriteAttr(const wxString & name, int value)
405 {
406    mBuffer.AppendByte(FT_Int);
407    WriteName(name);
408 
409    WriteInt( mBuffer, value );
410 }
411 
WriteAttr(const wxString & name,bool value)412 void ProjectSerializer::WriteAttr(const wxString & name, bool value)
413 {
414    mBuffer.AppendByte(FT_Bool);
415    WriteName(name);
416 
417    mBuffer.AppendByte(value);
418 }
419 
WriteAttr(const wxString & name,long value)420 void ProjectSerializer::WriteAttr(const wxString & name, long value)
421 {
422    mBuffer.AppendByte(FT_Long);
423    WriteName(name);
424 
425    WriteLong( mBuffer, value );
426 }
427 
WriteAttr(const wxString & name,long long value)428 void ProjectSerializer::WriteAttr(const wxString & name, long long value)
429 {
430    mBuffer.AppendByte(FT_LongLong);
431    WriteName(name);
432 
433    WriteLongLong( mBuffer, value );
434 }
435 
WriteAttr(const wxString & name,size_t value)436 void ProjectSerializer::WriteAttr(const wxString & name, size_t value)
437 {
438    mBuffer.AppendByte(FT_SizeT);
439    WriteName(name);
440 
441    WriteULong( mBuffer, value );
442 }
443 
WriteAttr(const wxString & name,float value,int digits)444 void ProjectSerializer::WriteAttr(const wxString & name, float value, int digits)
445 {
446    mBuffer.AppendByte(FT_Float);
447    WriteName(name);
448 
449    mBuffer.AppendData(&value, sizeof(value));
450    WriteDigits( mBuffer, digits );
451 }
452 
WriteAttr(const wxString & name,double value,int digits)453 void ProjectSerializer::WriteAttr(const wxString & name, double value, int digits)
454 {
455    mBuffer.AppendByte(FT_Double);
456    WriteName(name);
457 
458    mBuffer.AppendData(&value, sizeof(value));
459    WriteDigits( mBuffer, digits );
460 }
461 
WriteData(const wxString & value)462 void ProjectSerializer::WriteData(const wxString & value)
463 {
464    mBuffer.AppendByte(FT_Data);
465 
466    Length len = value.length() * sizeof(wxStringCharType);
467    WriteLength( mBuffer, len );
468    mBuffer.AppendData(value.wx_str(), len);
469 }
470 
Write(const wxString & value)471 void ProjectSerializer::Write(const wxString & value)
472 {
473    mBuffer.AppendByte(FT_Raw);
474    Length len = value.length() * sizeof(wxStringCharType);
475    WriteLength( mBuffer, len );
476    mBuffer.AppendData(value.wx_str(), len);
477 }
478 
WriteName(const wxString & name)479 void ProjectSerializer::WriteName(const wxString & name)
480 {
481    wxASSERT(name.length() * sizeof(wxStringCharType) <= SHRT_MAX);
482    UShort id;
483 
484    auto nameiter = mNames.find(name);
485    if (nameiter != mNames.end())
486    {
487       id = nameiter->second;
488    }
489    else
490    {
491       // mNames is static.  This appends each name to static mDict only once
492       // in each run.
493       UShort len = name.length() * sizeof(wxStringCharType);
494 
495       id = mNames.size();
496       mNames[name] = id;
497 
498       mDict.AppendByte(FT_Name);
499       WriteUShort( mDict, id );
500       WriteUShort( mDict, len );
501       mDict.AppendData(name.wx_str(), len);
502 
503       mDictChanged = true;
504    }
505 
506    WriteUShort( mBuffer, id );
507 }
508 
GetDict() const509 const MemoryStream &ProjectSerializer::GetDict() const
510 {
511    return mDict;
512 }
513 
GetData() const514 const MemoryStream& ProjectSerializer::GetData() const
515 {
516    return mBuffer;
517 }
518 
IsEmpty() const519 bool ProjectSerializer::IsEmpty() const
520 {
521    return mBuffer.GetSize() == 0;
522 }
523 
DictChanged() const524 bool ProjectSerializer::DictChanged() const
525 {
526    return mDictChanged;
527 }
528 
529 // See ProjectFileIO::LoadProject() for explanation of the blockids arg
Decode(BufferedStreamReader & in,XMLTagHandler * handler)530 bool ProjectSerializer::Decode(BufferedStreamReader& in, XMLTagHandler* handler)
531 {
532    if (handler == nullptr)
533       return false;
534 
535    XMLTagHandlerAdapter adapter(handler);
536 
537    std::vector<char> bytes;
538    IdMap mIds;
539    std::vector<IdMap> mIdStack;
540    char mCharSize = 0;
541 
542    mIds.clear();
543 
544    struct Error{}; // exception type for short-range try/catch
545    auto Lookup = [&mIds]( UShort id ) -> std::string_view
546    {
547       auto iter = mIds.find( id );
548       if (iter == mIds.end())
549       {
550          throw Error{};
551       }
552 
553       return iter->second;
554    };
555 
556    int64_t stringsCount = 0;
557    int64_t stringsLength = 0;
558 
559    auto ReadString = [&mCharSize, &in, &bytes, &stringsCount, &stringsLength](int len) -> std::string
560    {
561       bytes.reserve( len );
562       auto data = bytes.data();
563       in.Read( data, len );
564 
565       stringsCount++;
566       stringsLength += len;
567 
568       switch (mCharSize)
569       {
570          case 1:
571             return std::string(bytes.data(), len);
572 
573          case 2:
574             return FastStringConvert<char16_t>(bytes.data(), len);
575 
576          case 4:
577             return FastStringConvert<char32_t>(bytes.data(), len);
578 
579          default:
580             wxASSERT_MSG(false, wxT("Characters size not 1, 2, or 4"));
581          break;
582       }
583 
584       return {};
585    };
586 
587    try
588    {
589       while (!in.Eof())
590       {
591          UShort id;
592 
593          switch (in.GetC())
594          {
595             case FT_Push:
596             {
597                mIdStack.push_back(mIds);
598                mIds.clear();
599             }
600             break;
601 
602             case FT_Pop:
603             {
604                mIds = mIdStack.back();
605                mIdStack.pop_back();
606             }
607             break;
608 
609             case FT_Name:
610             {
611                id = ReadUShort( in );
612                auto len = ReadUShort( in );
613                mIds[id] = ReadString(len);
614             }
615             break;
616 
617             case FT_StartTag:
618             {
619                id = ReadUShort( in );
620 
621                adapter.EmitStartTag(Lookup(id));
622             }
623             break;
624 
625             case FT_EndTag:
626             {
627                id = ReadUShort( in );
628 
629                adapter.EndTag(Lookup(id));
630             }
631             break;
632 
633             case FT_String:
634             {
635                id = ReadUShort( in );
636                int len = ReadLength( in );
637 
638                adapter.WriteAttr(Lookup(id), ReadString(len));
639             }
640             break;
641 
642             case FT_Float:
643             {
644                float val;
645 
646                id = ReadUShort( in );
647                in.Read(&val, sizeof(val));
648                /* int dig = */ReadDigits(in);
649 
650                adapter.WriteAttr(Lookup(id), val);
651             }
652             break;
653 
654             case FT_Double:
655             {
656                double val;
657 
658                id = ReadUShort( in );
659                in.Read(&val, sizeof(val));
660                /*int dig = */ReadDigits(in);
661 
662                adapter.WriteAttr(Lookup(id), val);
663             }
664             break;
665 
666             case FT_Int:
667             {
668                id = ReadUShort( in );
669                int val = ReadInt( in );
670 
671                adapter.WriteAttr(Lookup(id), val);
672             }
673             break;
674 
675             case FT_Bool:
676             {
677                unsigned char val;
678 
679                id = ReadUShort( in );
680                in.Read(&val, 1);
681 
682                adapter.WriteAttr(Lookup(id), val);
683             }
684             break;
685 
686             case FT_Long:
687             {
688                id = ReadUShort( in );
689                long val = ReadLong( in );
690 
691                adapter.WriteAttr(Lookup(id), val);
692             }
693             break;
694 
695             case FT_LongLong:
696             {
697                id = ReadUShort( in );
698                long long val = ReadLongLong( in );
699                adapter.WriteAttr(Lookup(id), val);
700             }
701             break;
702 
703             case FT_SizeT:
704             {
705                id = ReadUShort( in );
706                size_t val = ReadULong( in );
707 
708                adapter.WriteAttr(Lookup(id), val);
709             }
710             break;
711 
712             case FT_Data:
713             {
714                int len = ReadLength( in );
715                adapter.WriteData(ReadString(len));
716             }
717             break;
718 
719             case FT_Raw:
720             {
721                int len = ReadLength( in );
722                adapter.WriteRaw(ReadString(len));
723             }
724             break;
725 
726             case FT_CharSize:
727             {
728                in.Read(&mCharSize, 1);
729             }
730             break;
731 
732             default:
733                wxASSERT(true);
734             break;
735          }
736       }
737    }
738    catch( const Error& )
739    {
740       // Document was corrupt, or platform differences in size or endianness
741       // were not well canonicalized
742       return false;
743    }
744 
745    wxLogInfo(
746       "Loaded %lld string %f Kb in size", stringsCount, stringsLength / 1024.0);
747 
748    return adapter.Finalize();
749 }
750