1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/utf_offset_string_conversions.h"
6 
7 #include <stdint.h>
8 
9 #include <algorithm>
10 #include <memory>
11 
12 #include "base/check_op.h"
13 #include "base/strings/string_piece.h"
14 #include "base/strings/utf_string_conversion_utils.h"
15 
16 namespace base {
17 
Adjustment(size_t original_offset,size_t original_length,size_t output_length)18 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
19                                        size_t original_length,
20                                        size_t output_length)
21     : original_offset(original_offset),
22       original_length(original_length),
23       output_length(output_length) {
24 }
25 
26 // static
AdjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_adjustment,size_t limit)27 void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
28                                    std::vector<size_t>* offsets_for_adjustment,
29                                    size_t limit) {
30   DCHECK(offsets_for_adjustment);
31   for (auto& i : *offsets_for_adjustment)
32     AdjustOffset(adjustments, &i, limit);
33 }
34 
35 // static
AdjustOffset(const Adjustments & adjustments,size_t * offset,size_t limit)36 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
37                                   size_t* offset,
38                                   size_t limit) {
39   DCHECK(offset);
40   if (*offset == string16::npos)
41     return;
42   int adjustment = 0;
43   for (const auto& i : adjustments) {
44     if (*offset <= i.original_offset)
45       break;
46     if (*offset < (i.original_offset + i.original_length)) {
47       *offset = string16::npos;
48       return;
49     }
50     adjustment += static_cast<int>(i.original_length - i.output_length);
51   }
52   *offset -= adjustment;
53 
54   if (*offset > limit)
55     *offset = string16::npos;
56 }
57 
58 // static
UnadjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_unadjustment)59 void OffsetAdjuster::UnadjustOffsets(
60     const Adjustments& adjustments,
61     std::vector<size_t>* offsets_for_unadjustment) {
62   if (!offsets_for_unadjustment || adjustments.empty())
63     return;
64   for (auto& i : *offsets_for_unadjustment)
65     UnadjustOffset(adjustments, &i);
66 }
67 
68 // static
UnadjustOffset(const Adjustments & adjustments,size_t * offset)69 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
70                                     size_t* offset) {
71   if (*offset == string16::npos)
72     return;
73   int adjustment = 0;
74   for (const auto& i : adjustments) {
75     if (*offset + adjustment <= i.original_offset)
76       break;
77     adjustment += static_cast<int>(i.original_length - i.output_length);
78     if ((*offset + adjustment) < (i.original_offset + i.original_length)) {
79       *offset = string16::npos;
80       return;
81     }
82   }
83   *offset += adjustment;
84 }
85 
86 // static
MergeSequentialAdjustments(const Adjustments & first_adjustments,Adjustments * adjustments_on_adjusted_string)87 void OffsetAdjuster::MergeSequentialAdjustments(
88     const Adjustments& first_adjustments,
89     Adjustments* adjustments_on_adjusted_string) {
90   auto adjusted_iter = adjustments_on_adjusted_string->begin();
91   auto first_iter = first_adjustments.begin();
92   // Simultaneously iterate over all |adjustments_on_adjusted_string| and
93   // |first_adjustments|, pushing adjustments at the end of
94   // |adjustments_builder| as we go.  |shift| keeps track of the current number
95   // of characters collapsed by |first_adjustments| up to this point.
96   // |currently_collapsing| keeps track of the number of characters collapsed by
97   // |first_adjustments| into the current |adjusted_iter|'s length.  These are
98   // characters that will change |shift| as soon as we're done processing the
99   // current |adjusted_iter|; they are not yet reflected in |shift|.
100   size_t shift = 0;
101   size_t currently_collapsing = 0;
102   // While we *could* update |adjustments_on_adjusted_string| in place by
103   // inserting new adjustments into the middle, we would be repeatedly calling
104   // |std::vector::insert|. That would cost O(n) time per insert, relative to
105   // distance from end of the string.  By instead allocating
106   // |adjustments_builder| and calling |std::vector::push_back|, we only pay
107   // amortized constant time per push. We are trading space for time.
108   Adjustments adjustments_builder;
109   while (adjusted_iter != adjustments_on_adjusted_string->end()) {
110     if ((first_iter == first_adjustments.end()) ||
111         ((adjusted_iter->original_offset + shift +
112           adjusted_iter->original_length) <= first_iter->original_offset)) {
113       // Entire |adjusted_iter| (accounting for its shift and including its
114       // whole original length) comes before |first_iter|.
115       //
116       // Correct the offset at |adjusted_iter| and move onto the next
117       // adjustment that needs revising.
118       adjusted_iter->original_offset += shift;
119       shift += currently_collapsing;
120       currently_collapsing = 0;
121       adjustments_builder.push_back(*adjusted_iter);
122       ++adjusted_iter;
123     } else if ((adjusted_iter->original_offset + shift) >
124                first_iter->original_offset) {
125       // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
126 
127       // It's not possible for the adjustments to overlap.  (It shouldn't
128       // be possible that we have an |adjusted_iter->original_offset| that,
129       // when adjusted by the computed |shift|, is in the middle of
130       // |first_iter|'s output's length.  After all, that would mean the
131       // current adjustment_on_adjusted_string somehow points to an offset
132       // that was supposed to have been eliminated by the first set of
133       // adjustments.)
134       DCHECK_LE(first_iter->original_offset + first_iter->output_length,
135                 adjusted_iter->original_offset + shift);
136 
137       // Add the |first_iter| to the full set of adjustments.
138       shift += first_iter->original_length - first_iter->output_length;
139       adjustments_builder.push_back(*first_iter);
140       ++first_iter;
141     } else {
142       // The first adjustment adjusted something that then got further adjusted
143       // by the second set of adjustments.  In other words, |first_iter| points
144       // to something in the range covered by |adjusted_iter|'s length (after
145       // accounting for |shift|).  Precisely,
146       //   adjusted_iter->original_offset + shift
147       //   <=
148       //   first_iter->original_offset
149       //   <=
150       //   adjusted_iter->original_offset + shift +
151       //       adjusted_iter->original_length
152 
153       // Modify the current |adjusted_iter| to include whatever collapsing
154       // happened in |first_iter|, then advance to the next |first_adjustments|
155       // because we dealt with the current one.
156       const int collapse = static_cast<int>(first_iter->original_length) -
157           static_cast<int>(first_iter->output_length);
158       // This function does not know how to deal with a string that expands and
159       // then gets modified, only strings that collapse and then get modified.
160       DCHECK_GT(collapse, 0);
161       adjusted_iter->original_length += collapse;
162       currently_collapsing += collapse;
163       ++first_iter;
164     }
165   }
166   DCHECK_EQ(0u, currently_collapsing);
167   if (first_iter != first_adjustments.end()) {
168     // Only first adjustments are left.  These do not need to be modified.
169     // (Their offsets are already correct with respect to the original string.)
170     // Append them all.
171     DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
172     adjustments_builder.insert(adjustments_builder.end(), first_iter,
173                                first_adjustments.end());
174   }
175   *adjustments_on_adjusted_string = std::move(adjustments_builder);
176 }
177 
178 // Converts the given source Unicode character type to the given destination
179 // Unicode character type as a STL string. The given input buffer and size
180 // determine the source, and the given output STL string will be replaced by
181 // the result.  If non-NULL, |adjustments| is set to reflect the all the
182 // alterations to the string that are not one-character-to-one-character.
183 // It will always be sorted by increasing offset.
184 template<typename SrcChar, typename DestStdString>
ConvertUnicode(const SrcChar * src,size_t src_len,DestStdString * output,OffsetAdjuster::Adjustments * adjustments)185 bool ConvertUnicode(const SrcChar* src,
186                     size_t src_len,
187                     DestStdString* output,
188                     OffsetAdjuster::Adjustments* adjustments) {
189   if (adjustments)
190     adjustments->clear();
191   // ICU requires 32-bit numbers.
192   bool success = true;
193   int32_t src_len32 = static_cast<int32_t>(src_len);
194   for (int32_t i = 0; i < src_len32; i++) {
195     uint32_t code_point;
196     size_t original_i = i;
197     size_t chars_written = 0;
198     if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
199       chars_written = WriteUnicodeCharacter(code_point, output);
200     } else {
201       chars_written = WriteUnicodeCharacter(0xFFFD, output);
202       success = false;
203     }
204 
205     // Only bother writing an adjustment if this modification changed the
206     // length of this character.
207     // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
208     // character read, not after it (so that incrementing it in the loop
209     // increment will place it at the right location), so we need to account
210     // for that in determining the amount that was read.
211     if (adjustments && ((i - original_i + 1) != chars_written)) {
212       adjustments->push_back(OffsetAdjuster::Adjustment(
213           original_i, i - original_i + 1, chars_written));
214     }
215   }
216   return success;
217 }
218 
UTF8ToUTF16WithAdjustments(const char * src,size_t src_len,string16 * output,base::OffsetAdjuster::Adjustments * adjustments)219 bool UTF8ToUTF16WithAdjustments(
220     const char* src,
221     size_t src_len,
222     string16* output,
223     base::OffsetAdjuster::Adjustments* adjustments) {
224   PrepareForUTF16Or32Output(src, src_len, output);
225   return ConvertUnicode(src, src_len, output, adjustments);
226 }
227 
UTF8ToUTF16WithAdjustments(const base::StringPiece & utf8,base::OffsetAdjuster::Adjustments * adjustments)228 string16 UTF8ToUTF16WithAdjustments(
229     const base::StringPiece& utf8,
230     base::OffsetAdjuster::Adjustments* adjustments) {
231   string16 result;
232   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
233   return result;
234 }
235 
UTF8ToUTF16AndAdjustOffsets(const base::StringPiece & utf8,std::vector<size_t> * offsets_for_adjustment)236 string16 UTF8ToUTF16AndAdjustOffsets(
237     const base::StringPiece& utf8,
238     std::vector<size_t>* offsets_for_adjustment) {
239   for (size_t& offset : *offsets_for_adjustment) {
240     if (offset > utf8.length())
241       offset = string16::npos;
242   }
243   OffsetAdjuster::Adjustments adjustments;
244   string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
245   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
246   return result;
247 }
248 
UTF16ToUTF8AndAdjustOffsets(const base::StringPiece16 & utf16,std::vector<size_t> * offsets_for_adjustment)249 std::string UTF16ToUTF8AndAdjustOffsets(
250     const base::StringPiece16& utf16,
251     std::vector<size_t>* offsets_for_adjustment) {
252   for (size_t& offset : *offsets_for_adjustment) {
253     if (offset > utf16.length())
254       offset = string16::npos;
255   }
256   std::string result;
257   PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
258   OffsetAdjuster::Adjustments adjustments;
259   ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
260   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
261   return result;
262 }
263 
264 }  // namespace base
265