1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/utf_offset_string_conversions.h"
6 
7 #include <stdint.h>
8 
9 #include <algorithm>
10 #include <memory>
11 
12 #include "base/logging.h"
13 #include "base/strings/string_piece.h"
14 #include "base/strings/utf_string_conversion_utils.h"
15 
16 namespace base {
17 
Adjustment(size_t original_offset,size_t original_length,size_t output_length)18 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
19                                        size_t original_length,
20                                        size_t output_length)
21     : original_offset(original_offset),
22       original_length(original_length),
23       output_length(output_length) {}
24 
25 // static
AdjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_adjustment,size_t limit)26 void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
27                                    std::vector<size_t>* offsets_for_adjustment,
28                                    size_t limit) {
29   DCHECK(offsets_for_adjustment);
30   for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
31        i != offsets_for_adjustment->end(); ++i)
32     AdjustOffset(adjustments, &(*i), limit);
33 }
34 
35 // static
AdjustOffset(const Adjustments & adjustments,size_t * offset,size_t limit)36 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
37                                   size_t* offset,
38                                   size_t limit) {
39   DCHECK(offset);
40   if (*offset == string16::npos)
41     return;
42   int adjustment = 0;
43   for (Adjustments::const_iterator i = adjustments.begin();
44        i != adjustments.end(); ++i) {
45     if (*offset <= i->original_offset)
46       break;
47     if (*offset < (i->original_offset + i->original_length)) {
48       *offset = string16::npos;
49       return;
50     }
51     adjustment += static_cast<int>(i->original_length - i->output_length);
52   }
53   *offset -= adjustment;
54 
55   if (*offset > limit)
56     *offset = string16::npos;
57 }
58 
59 // static
UnadjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_unadjustment)60 void OffsetAdjuster::UnadjustOffsets(
61     const Adjustments& adjustments,
62     std::vector<size_t>* offsets_for_unadjustment) {
63   if (!offsets_for_unadjustment || adjustments.empty())
64     return;
65   for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
66        i != offsets_for_unadjustment->end(); ++i)
67     UnadjustOffset(adjustments, &(*i));
68 }
69 
70 // static
UnadjustOffset(const Adjustments & adjustments,size_t * offset)71 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
72                                     size_t* offset) {
73   if (*offset == string16::npos)
74     return;
75   int adjustment = 0;
76   for (Adjustments::const_iterator i = adjustments.begin();
77        i != adjustments.end(); ++i) {
78     if (*offset + adjustment <= i->original_offset)
79       break;
80     adjustment += static_cast<int>(i->original_length - i->output_length);
81     if ((*offset + adjustment) < (i->original_offset + i->original_length)) {
82       *offset = string16::npos;
83       return;
84     }
85   }
86   *offset += adjustment;
87 }
88 
89 // static
MergeSequentialAdjustments(const Adjustments & first_adjustments,Adjustments * adjustments_on_adjusted_string)90 void OffsetAdjuster::MergeSequentialAdjustments(
91     const Adjustments& first_adjustments,
92     Adjustments* adjustments_on_adjusted_string) {
93   Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
94   Adjustments::const_iterator first_iter = first_adjustments.begin();
95   // Simultaneously iterate over all |adjustments_on_adjusted_string| and
96   // |first_adjustments|, adding adjustments to or correcting the adjustments
97   // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the
98   // current number of characters collapsed by |first_adjustments| up to this
99   // point.  |currently_collapsing| keeps track of the number of characters
100   // collapsed by |first_adjustments| into the current |adjusted_iter|'s
101   // length.  These are characters that will change |shift| as soon as we're
102   // done processing the current |adjusted_iter|; they are not yet reflected in
103   // |shift|.
104   size_t shift = 0;
105   size_t currently_collapsing = 0;
106   while (adjusted_iter != adjustments_on_adjusted_string->end()) {
107     if ((first_iter == first_adjustments.end()) ||
108         ((adjusted_iter->original_offset + shift +
109           adjusted_iter->original_length) <= first_iter->original_offset)) {
110       // Entire |adjusted_iter| (accounting for its shift and including its
111       // whole original length) comes before |first_iter|.
112       //
113       // Correct the offset at |adjusted_iter| and move onto the next
114       // adjustment that needs revising.
115       adjusted_iter->original_offset += shift;
116       shift += currently_collapsing;
117       currently_collapsing = 0;
118       ++adjusted_iter;
119     } else if ((adjusted_iter->original_offset + shift) >
120                first_iter->original_offset) {
121       // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
122 
123       // It's not possible for the adjustments to overlap.  (It shouldn't
124       // be possible that we have an |adjusted_iter->original_offset| that,
125       // when adjusted by the computed |shift|, is in the middle of
126       // |first_iter|'s output's length.  After all, that would mean the
127       // current adjustment_on_adjusted_string somehow points to an offset
128       // that was supposed to have been eliminated by the first set of
129       // adjustments.)
130       DCHECK_LE(first_iter->original_offset + first_iter->output_length,
131                 adjusted_iter->original_offset + shift);
132 
133       // Add the |first_adjustment_iter| to the full set of adjustments while
134       // making sure |adjusted_iter| continues pointing to the same element.
135       // We do this by inserting the |first_adjustment_iter| right before
136       // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
137       // the following element.
138       shift += first_iter->original_length - first_iter->output_length;
139       adjusted_iter =
140           adjustments_on_adjusted_string->insert(adjusted_iter, *first_iter);
141       ++adjusted_iter;
142       ++first_iter;
143     } else {
144       // The first adjustment adjusted something that then got further adjusted
145       // by the second set of adjustments.  In other words, |first_iter| points
146       // to something in the range covered by |adjusted_iter|'s length (after
147       // accounting for |shift|).  Precisely,
148       //   adjusted_iter->original_offset + shift
149       //   <=
150       //   first_iter->original_offset
151       //   <=
152       //   adjusted_iter->original_offset + shift +
153       //       adjusted_iter->original_length
154 
155       // Modify the current |adjusted_iter| to include whatever collapsing
156       // happened in |first_iter|, then advance to the next |first_adjustments|
157       // because we dealt with the current one.
158       const int collapse = static_cast<int>(first_iter->original_length) -
159                            static_cast<int>(first_iter->output_length);
160       // This function does not know how to deal with a string that expands and
161       // then gets modified, only strings that collapse and then get modified.
162       DCHECK_GT(collapse, 0);
163       adjusted_iter->original_length += collapse;
164       currently_collapsing += collapse;
165       ++first_iter;
166     }
167   }
168   DCHECK_EQ(0u, currently_collapsing);
169   if (first_iter != first_adjustments.end()) {
170     // Only first adjustments are left.  These do not need to be modified.
171     // (Their offsets are already correct with respect to the original string.)
172     // Append them all.
173     DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
174     adjustments_on_adjusted_string->insert(
175         adjustments_on_adjusted_string->end(), first_iter,
176         first_adjustments.end());
177   }
178 }
179 
180 // Converts the given source Unicode character type to the given destination
181 // Unicode character type as a STL string. The given input buffer and size
182 // determine the source, and the given output STL string will be replaced by
183 // the result.  If non-NULL, |adjustments| is set to reflect the all the
184 // alterations to the string that are not one-character-to-one-character.
185 // It will always be sorted by increasing offset.
186 template <typename SrcChar, typename DestStdString>
ConvertUnicode(const SrcChar * src,size_t src_len,DestStdString * output,OffsetAdjuster::Adjustments * adjustments)187 bool ConvertUnicode(const SrcChar* src,
188                     size_t src_len,
189                     DestStdString* output,
190                     OffsetAdjuster::Adjustments* adjustments) {
191   if (adjustments)
192     adjustments->clear();
193   // ICU requires 32-bit numbers.
194   bool success = true;
195   int32_t src_len32 = static_cast<int32_t>(src_len);
196   for (int32_t i = 0; i < src_len32; i++) {
197     uint32_t code_point;
198     size_t original_i = i;
199     size_t chars_written = 0;
200     if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
201       chars_written = WriteUnicodeCharacter(code_point, output);
202     } else {
203       chars_written = WriteUnicodeCharacter(0xFFFD, output);
204       success = false;
205     }
206 
207     // Only bother writing an adjustment if this modification changed the
208     // length of this character.
209     // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
210     // character read, not after it (so that incrementing it in the loop
211     // increment will place it at the right location), so we need to account
212     // for that in determining the amount that was read.
213     if (adjustments && ((i - original_i + 1) != chars_written)) {
214       adjustments->push_back(OffsetAdjuster::Adjustment(
215           original_i, i - original_i + 1, chars_written));
216     }
217   }
218   return success;
219 }
220 
UTF8ToUTF16WithAdjustments(const char * src,size_t src_len,string16 * output,base::OffsetAdjuster::Adjustments * adjustments)221 bool UTF8ToUTF16WithAdjustments(
222     const char* src,
223     size_t src_len,
224     string16* output,
225     base::OffsetAdjuster::Adjustments* adjustments) {
226   PrepareForUTF16Or32Output(src, src_len, output);
227   return ConvertUnicode(src, src_len, output, adjustments);
228 }
229 
UTF8ToUTF16WithAdjustments(const base::StringPiece & utf8,base::OffsetAdjuster::Adjustments * adjustments)230 string16 UTF8ToUTF16WithAdjustments(
231     const base::StringPiece& utf8,
232     base::OffsetAdjuster::Adjustments* adjustments) {
233   string16 result;
234   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
235   return result;
236 }
237 
UTF8ToUTF16AndAdjustOffsets(const base::StringPiece & utf8,std::vector<size_t> * offsets_for_adjustment)238 string16 UTF8ToUTF16AndAdjustOffsets(
239     const base::StringPiece& utf8,
240     std::vector<size_t>* offsets_for_adjustment) {
241   for (size_t& offset : *offsets_for_adjustment) {
242     if (offset > utf8.length())
243       offset = string16::npos;
244   }
245   OffsetAdjuster::Adjustments adjustments;
246   string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
247   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
248   return result;
249 }
250 
UTF16ToUTF8AndAdjustOffsets(const base::StringPiece16 & utf16,std::vector<size_t> * offsets_for_adjustment)251 std::string UTF16ToUTF8AndAdjustOffsets(
252     const base::StringPiece16& utf16,
253     std::vector<size_t>* offsets_for_adjustment) {
254   for (size_t& offset : *offsets_for_adjustment) {
255     if (offset > utf16.length())
256       offset = string16::npos;
257   }
258   std::string result;
259   PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
260   OffsetAdjuster::Adjustments adjustments;
261   ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
262   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
263   return result;
264 }
265 
266 }  // namespace base
267