1 // © 2020 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // uniquecharstr.h
5 // created: 2020sep01 Frank Yung-Fong Tang
6 
7 #ifndef __UNIQUECHARSTR_H__
8 #define __UNIQUECHARSTR_H__
9 
10 #include "charstr.h"
11 #include "uassert.h"
12 #include "uhash.h"
13 
14 U_NAMESPACE_BEGIN
15 
16 /**
17  * Stores NUL-terminated strings with duplicate elimination.
18  * Checks for unique UTF-16 string pointers and converts to invariant characters.
19  *
20  * Intended to be stack-allocated. Add strings, get a unique number for each,
21  * freeze the object, get a char * pointer for each string,
22  * call orphanCharStrings() to capture the string storage, and let this object go out of scope.
23  */
24 class UniqueCharStrings {
25 public:
UniqueCharStrings(UErrorCode & errorCode)26     UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) {
27         // Note: We hash on string contents but store stable char16_t * pointers.
28         // If the strings are stored in resource bundles which should be built with
29         // duplicate elimination, then we should be able to hash on just the pointer values.
30         uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode);
31         if (U_FAILURE(errorCode)) { return; }
32         strings = new CharString();
33         if (strings == nullptr) {
34             errorCode = U_MEMORY_ALLOCATION_ERROR;
35         }
36     }
~UniqueCharStrings()37     ~UniqueCharStrings() {
38         uhash_close(&map);
39         delete strings;
40     }
41 
42     /** Returns/orphans the CharString that contains all strings. */
orphanCharStrings()43     CharString *orphanCharStrings() {
44         CharString *result = strings;
45         strings = nullptr;
46         return result;
47     }
48 
49     /**
50      * Adds a string and returns a unique number for it.
51      * The string's buffer contents must not change, nor move around in memory,
52      * while this UniqueCharStrings is in use.
53      * The string contents must be NUL-terminated exactly at s.length().
54      *
55      * Best used with read-only-alias UnicodeString objects that point to
56      * stable storage, such as strings returned by resource bundle functions.
57      */
add(const UnicodeString & s,UErrorCode & errorCode)58     int32_t add(const UnicodeString &s, UErrorCode &errorCode) {
59         if (U_FAILURE(errorCode)) { return 0; }
60         if (isFrozen) {
61             errorCode = U_NO_WRITE_PERMISSION;
62             return 0;
63         }
64         // The string points into the resource bundle.
65         const char16_t *p = s.getBuffer();
66         int32_t oldIndex = uhash_geti(&map, p);
67         if (oldIndex != 0) {  // found duplicate
68             return oldIndex;
69         }
70         // Explicit NUL terminator for the previous string.
71         // The strings object is also terminated with one implicit NUL.
72         strings->append(0, errorCode);
73         int32_t newIndex = strings->length();
74         strings->appendInvariantChars(s, errorCode);
75         uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode);
76         return newIndex;
77     }
78 
freeze()79     void freeze() { isFrozen = true; }
80 
81     /**
82      * Returns a string pointer for its unique number, if this object is frozen.
83      * Otherwise nullptr.
84      */
get(int32_t i)85     const char *get(int32_t i) const {
86         U_ASSERT(isFrozen);
87         return isFrozen && i > 0 ? strings->data() + i : nullptr;
88     }
89 
90 private:
91     UHashtable map;
92     CharString *strings;
93     bool isFrozen = false;
94 };
95 
96 U_NAMESPACE_END
97 
98 #endif  // __UNIQUECHARSTR_H__
99