/* -*- mode: C++; tab-width: 4; c-basic-offset: 4; -*- */ // UT_Stringbuf.cpp // Copyright (C) 2001 Mike Nordell // Copyright (c) 2007 Hubert Figuiere // // This class is free software; you can redistribute it and/or // modify it under the terms of the GNU General Public License // as published by the Free Software Foundation; either version 2 // of the License, or (at your option) any later version. // // This class is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA // 02110-1301 USA. // #include #include #include #include #include #include #include "ut_string.h" #include "ut_stringbuf.h" #include "ut_unicode.h" #include "ut_string_class.h" #include "ut_assert.h" #include "ut_debugmsg.h" // these classes keep zero terminated strings. // if size() != 0, capacity() is always at least size() + 1. ////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// // // UTF-8 string: encoding is *always* UTF-8 // //////////////////////////////////////////////////////////////////////// UT_UTF8Stringbuf::UT_UTF8Stringbuf () : m_psz(0), m_pEnd(0), m_strlen(0), m_buflen(0) { // } UT_UTF8Stringbuf::UT_UTF8Stringbuf (const UT_UTF8Stringbuf & rhs) : m_psz(0), m_pEnd(0), m_strlen(0), m_buflen(0) { append (rhs); } UT_UTF8Stringbuf::UT_UTF8Stringbuf (const char * sz, size_t n /* == 0 => null-termination */) : m_psz(0), m_pEnd(0), m_strlen(0), m_buflen(0) { append (sz, n); } UT_UTF8Stringbuf::~UT_UTF8Stringbuf () { clear (); } void UT_UTF8Stringbuf::operator=(const UT_UTF8Stringbuf & rhs) { m_pEnd = m_psz; m_strlen = 0; append (rhs); } void UT_UTF8Stringbuf::assign (const char * sz, size_t n /* == 0 => null-termination */) { m_pEnd = m_psz; m_strlen = 0; append (sz, n); } // returns 0 if invalid, or if end of string, i.e. 0 // technically it could differentiate, since UCS-4 is only 31-bit, but... UT_UTF8Stringbuf::UCS4Char UT_UTF8Stringbuf::charCode (const char * str) { if ( str == 0) return 0; if (*str == 0) return 0; const char * p = str; if ((*p & 0x80) == 0x00) // plain us-ascii part of latin-1 { return (UCS4Char) (*p); } UCS4Char ret_code = 0; int bytesInSequence = 0; int bytesExpectedInSequence = 0; while (*p) { // 'continuing' octets: if ((*p & 0xc0) == 0x80) // trailing byte in multi-byte sequence { if (bytesInSequence == 0) break; bytesInSequence++; ret_code = (ret_code << 6) | (UCS4Char) (*p & 0x3f); if (bytesInSequence == bytesExpectedInSequence) break; p++; continue; } if (bytesInSequence) break; bytesInSequence++; /* 4,5,6-byte sequences may require > 2 bytes in UCS-4 */ if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence { bytesExpectedInSequence = 6; ret_code = (UCS4Char) (*p & 0x01); p++; continue; } if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence { bytesExpectedInSequence = 5; ret_code = (UCS4Char) (*p & 0x03); p++; continue; } if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence { bytesExpectedInSequence = 4; ret_code = (UCS4Char) (*p & 0x07); p++; continue; } /* 1,2,3-byte sequences do not require > 2 bytes in UCS-4 */ if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence { bytesExpectedInSequence = 3; ret_code = (UCS4Char) (*p & 0x0f); p++; continue; } if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence { bytesExpectedInSequence = 2; ret_code = (UCS4Char) (*p & 0x1f); p++; continue; } ret_code = 0; break; // invalid byte - not UTF-8 } if (bytesInSequence != bytesExpectedInSequence) ret_code = 0; return ret_code; } void UT_UTF8Stringbuf::append (const char * sz, size_t n /* == 0 => null-termination */) { if (sz == 0) return; if (!grow ((n?n:strlen(sz)) + 1)) return; const char * p = sz; char buf[6]; int bytesInSequence = 0; int bytesExpectedInSequence = 0; size_t np = 0; while ((!n && *p) || (np < n)) { if ((*p & 0x80) == 0x00) // plain us-ascii part of latin-1 { if (bytesInSequence) break; *m_pEnd++ = *p; *m_pEnd = 0; m_strlen++; p++; np++; continue; } // 'continuing' octets: if ((*p & 0xc0) == 0x80) // trailing byte in multi-byte sequence { if (bytesInSequence == 0) break; buf[bytesInSequence++] = *p; if (bytesInSequence == bytesExpectedInSequence) { for (int b = 0; b < bytesInSequence; b++) *m_pEnd++ = buf[b]; *m_pEnd = 0; m_strlen++; bytesInSequence = 0; bytesExpectedInSequence = 0; } p++; np++; continue; } if (bytesInSequence) break; buf[bytesInSequence++] = *p; /* 4,5,6-byte sequences may require > 2 bytes in UCS-4 */ if ((*p & 0xfe) == 0xfc) // lead byte in 6-byte sequence { bytesExpectedInSequence = 6; p++; np++; continue; } if ((*p & 0xfc) == 0xf8) // lead byte in 5-byte sequence { bytesExpectedInSequence = 5; p++; np++; continue; } if ((*p & 0xf8) == 0xf0) // lead byte in 4-byte sequence { bytesExpectedInSequence = 4; p++; np++; continue; } /* 1,2,3-byte sequences do not require > 2 bytes in UCS-4 */ if ((*p & 0xf0) == 0xe0) // lead byte in 3-byte sequence { bytesExpectedInSequence = 3; p++; np++; continue; } if ((*p & 0xe0) == 0xc0) // lead byte in 2-byte sequence { bytesExpectedInSequence = 2; p++; np++; continue; } break; // invalid byte - not UTF-8 } } void UT_UTF8Stringbuf::append (const UT_UTF8Stringbuf & rhs) { if (grow (rhs.byteLength () + 1)) { memcpy (m_pEnd, rhs.data (), rhs.byteLength ()); m_strlen += rhs.utf8Length (); m_pEnd = m_pEnd + rhs.byteLength (); *m_pEnd = 0; } } void UT_UTF8Stringbuf::appendUCS4 (const UT_UCS4Char * sz, size_t n /* == 0 => null-termination */) { size_t bytelength = 0; size_t i; if (!sz || (!n && !*sz)) return; /* The vast majority of calls to appendUCS4 pass in 1 for n, so we can halve the number of calls to g_unichar_to_utf8 (in most cases) by caching the first byte length. */ int iCache = 0; for (i = 0; (i < n) || (n == 0); i++) { if((0 == sz[i]) && (0 == n)) break; int seql = UT_Unicode::UTF8_ByteLength (sz[i]); if(i == 0) iCache = seql; if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // end-of-string? bytelength += static_cast(seql); } if(bytelength == 0) return; if (!grow (bytelength + 1)) return; for (i = 0; (i < n) || (n == 0); i++) { if((0 == sz[i]) && (0 == n)) break; int seql; if(i == 0) seql = iCache; else seql = UT_Unicode::UTF8_ByteLength (sz[i]); if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // end-of-string? UT_Unicode::UCS4_to_UTF8 (m_pEnd, bytelength, sz[i]); m_strlen++; } *m_pEnd = 0; } void UT_UTF8Stringbuf::appendUCS2 (const UT_UCS2Char * sz, size_t n /* == 0 => null-termination */) { size_t bytelength = 0; size_t i; for (i = 0; (i < n) || (n == 0); i++) { if (sz[i]==0 && n==0) break; int seql = UT_Unicode::UTF8_ByteLength ((UT_UCS4Char)sz[i]); if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // end-of-string? bytelength += static_cast(seql); } if (!grow (bytelength + 1)) return; for (i = 0; (i < n) || (n == 0); i++) { if (sz[i]==0 && n==0) break; int seql = UT_Unicode::UTF8_ByteLength ((UT_UCS4Char)sz[i]); if (seql < 0) continue; // not UCS-4 !! if (seql == 0) break; // end-of-string? UT_Unicode::UCS4_to_UTF8 (m_pEnd, bytelength, (UT_UCS4Char)sz[i]); m_strlen++; } *m_pEnd = 0; } /* replaces with in the current string */ void UT_UTF8Stringbuf::escape (const UT_UTF8String & utf8_str1, const UT_UTF8String & utf8_str2) { size_t diff = 0; size_t len1 = utf8_str1.byteLength (); size_t len2 = utf8_str2.byteLength (); const char * str1 = utf8_str1.utf8_str (); const char * str2 = utf8_str2.utf8_str (); if (len2 > len1) { diff = len2 - len1; size_t incr = 0; char * ptr = m_psz; while (ptr + len1 <= m_pEnd) { if (memcmp (ptr, str1, len1) == 0) { incr += diff; ptr += len1; } else { ++ptr; } } if (!grow (incr)) return; } else { diff = len1 - len2; } char * ptr = m_psz; while (ptr + len1 <= m_pEnd) { if (memcmp (ptr, str1, len1) == 0) { if (diff) { if (len2 > len1) { memmove (ptr + diff, ptr, m_pEnd - ptr + 1); m_pEnd += diff; } else { memmove (ptr, ptr + diff, m_pEnd - (ptr + diff) + 1); m_pEnd -= diff; } } memcpy (ptr, str2, len2); ptr += len2; m_strlen += utf8_str2.length () - utf8_str1.length (); } else { ++ptr; } } } /* FIXME -- these functions assume that &, <, > and " cannot appear in * multi-byte utf8 sequence -- I do not think that holds * * Also, the decode function should handle other & tokens * * Should use glib to traverse these strings */ void UT_UTF8Stringbuf::decodeXML () { if (!m_psz) return; size_t shrink = 0; char * p_src = m_psz; char * p_dst = m_psz; while (p_src < m_pEnd && *p_src) { if(*p_src == '&') { if (!strncmp (p_src+1, "amp;", 4)) { *p_dst++ = '&'; p_src += 5; shrink += 4; continue; } else if (!strncmp (p_src+1, "lt;", 3)) { *p_dst++ = '<'; p_src += 4; shrink += 3; continue; } else if (!strncmp (p_src+1, "gt;", 3)) { *p_dst++ = '>'; p_src += 4; shrink += 3; continue; } else if (!strncmp (p_src+1, "quot;", 5)) { *p_dst++ = '"'; p_src += 6; shrink += 5; continue; } } *p_dst = *p_src; p_dst++; p_src++; } *p_dst = 0; m_pEnd -= shrink; } /* escapes '<', '>', '\"' and '&' in the current string */ void UT_UTF8Stringbuf::escapeXML () { size_t incr = 0; char * ptr = m_psz; while (ptr < m_pEnd) { if ((*ptr == '<') || (*ptr == '>')) incr += 3; else if (*ptr == '&') incr += 4; else if (*ptr == '"') incr += 5; ptr++; } bool bInsert = grow (incr); ptr = m_psz; while (ptr < m_pEnd) { if (*ptr == '<') { if (bInsert) { *ptr++ = '&'; insert (ptr, "lt;", 3); } else *ptr++ = '?'; } else if (*ptr == '>') { if (bInsert) { *ptr++ = '&'; insert (ptr, "gt;", 3); } else *ptr++ = '?'; } else if (*ptr == '&') { if (bInsert) { *ptr++ = '&'; insert (ptr, "amp;", 4); } else *ptr++ = '?'; } else if (*ptr == '"') { if (bInsert) { *ptr++ = '&'; insert (ptr, "quot;", 5); } else *ptr++ = '?'; } else ptr++; } } /* this function escapes the string to provide for conformity with http://www.w3.org/TR/xlink/#link-locators, section 5.4 Just use libxml and hope for the best. */ void UT_UTF8Stringbuf::escapeURL () { if(!m_psz || !*m_psz) return; xmlChar * uri = xmlURIEscape(BAD_CAST m_psz); if(uri) { assign((gchar*)uri); xmlFree(uri); } } /* decode %xx encoded characters */ static UT_uint32 s_charCode_to_hexval(UT_UCS4Char c) { if(c >= 0x30 && c <= 0x39) return c - 0x30; else if(c >= 0x41 && c <= 0x46) return c - 0x41 + 10; else if(c >= 0x61 && c <= 0x66) return c - 0x61 + 10; UT_return_val_if_fail( UT_SHOULD_NOT_HAPPEN, 0 ); } void UT_UTF8Stringbuf::decodeURL() { if(!m_psz || !*m_psz) return; char * buff = (char*)g_try_malloc(byteLength() + 1); UT_return_if_fail( buff ); buff[0] = 0; UTF8Iterator J(this); const char * ptr = J.current(); UT_UCS4Char c = charCode(J.current()); char utf8cache[7]; utf8cache[6] = 0; UT_uint32 iCachePos = 0; UT_uint32 iCacheNeeded = 0; while (c != 0) { if(c == '%') { J.advance(); UT_UCS4Char b1 = charCode(J.current()); J.advance(); UT_UCS4Char b2 = charCode(J.current()); J.advance(); if(isalnum(b1) && isalnum(b2)) { b1 = s_charCode_to_hexval(b1); b2 = s_charCode_to_hexval(b2); UT_UCS4Char code = ((b1 << 4)& 0xf0) | (b2 & 0x0f); if(iCacheNeeded == 0) { // we start new utf8 sequence in the cache if ((code & 0x80) == 0) iCacheNeeded = 1; else if ((code & 0xe0) == 0xc0) iCacheNeeded = 2; else if ((code & 0xf0) == 0xe0) iCacheNeeded = 3; else if ((code & 0xf8) == 0xf0) iCacheNeeded = 4; else if ((code & 0xfc) == 0xf8) iCacheNeeded = 5; else if ((code & 0xfe) == 0xfc) iCacheNeeded = 6; utf8cache[0] = (char) code; utf8cache[iCacheNeeded] = 0; // make sure the sequence will be terminated iCachePos++; } else { // append to our cache utf8cache[iCachePos++] = (char) code; } if(iCacheNeeded == 0 && (code >= 0x7f && code <= 0xff)) { // the present character is not a valid start of utf8 sequence -- // this is almost certainly a character from the extended ASCII set // which was encoded directly according to the RFC 1738 scheme, we // just append it size_t iLenBuff = strlen(buff); size_t iLenLeft = byteLength() - iLenBuff; char * p = buff + iLenBuff; UT_Unicode::UCS4_to_UTF8(p, iLenLeft, code); // we need to null-terminate *p = 0; } if(iCacheNeeded && iCacheNeeded <= iCachePos) { UT_ASSERT_HARMLESS( iCacheNeeded == iCachePos ); // append the cache to our buffer UT_uint32 iLenBuff = strlen(buff); char * p = buff + iLenBuff; strcat(p, utf8cache); iCacheNeeded = iCachePos = 0; } } else { // this should not happen in encoded url and so we will ignore this token; // if we are in the middle of utf8 sequence; we will reset it iCacheNeeded = iCachePos = 0; } } else { J.advance(); // advance here, for the sake of the else clause below if(iCacheNeeded > iCachePos) { // we are processing a utf sequence, so just append this byte to our cache utf8cache[iCachePos++] = (char) c; } else { const char * p = J.current(); UT_uint32 iLen = p ? p - ptr : strlen(ptr); strncat(buff, ptr, iLen); } } ptr = J.current(); c = charCode(J.current()); } assign(buff); g_free(buff); } /* translates the current string to MIME "quoted-printable" format */ void UT_UTF8Stringbuf::escapeMIME () { static const char hex[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' }; static const char * s_eol = "=\r\n"; if (m_strlen == 0) return; size_t bytes = 0; char * ptr = m_psz; while (*ptr) { char c = *ptr++; unsigned char u = static_cast(c); if ((c == '\r') || (c == '\n') || (c == '=') || (u & 0x80)) bytes += 2; } if (bytes) { if (!grow (bytes)) return; char * pOld = m_pEnd; char * pNew = m_pEnd + bytes; while (pOld >= m_psz) { char c = *pOld--; unsigned char u = static_cast(c); if ((u & 0x80) || (c == '\r') || (c == '\n') || (c == '=')) { *pNew-- = hex[ u & 0x0f]; *pNew-- = hex[(u >> 4) & 0x0f]; *pNew-- = '='; } else *pNew-- = c; } m_pEnd += bytes; m_strlen = m_pEnd - m_psz; } size_t length = 0; ptr = m_psz; while (true) { if (*ptr == 0) { if (length) { size_t offset = ptr - m_psz; if (grow (3)) { ptr = m_psz + offset; insert (ptr, s_eol, 3); } } break; } if (length >= 70) { size_t offset = ptr - m_psz; if (grow (3)) { ptr = m_psz + offset; insert (ptr, s_eol, 3); } length = 0; } if (*ptr == '=') { ptr += 3; length += 3; } else { ptr++; length++; } } } UT_UTF8Stringbuf * UT_UTF8Stringbuf::lowerCase () { if(!byteLength()) return NULL; UT_UTF8Stringbuf * n = new UT_UTF8Stringbuf(); UT_return_val_if_fail(n, NULL); UTF8Iterator s(this); UT_UCS4Char c = charCode(s.current()); while(c) { UT_UCS4Char l = UT_UCS4_tolower(c); n->appendUCS4(&l,1); c = charCode(s.advance()); } return n; } void UT_UTF8Stringbuf::clear () { if (m_psz) g_free (m_psz); m_psz = 0; m_pEnd = 0; m_strlen = 0; m_buflen = 0; } void UT_UTF8Stringbuf::insert (char *& ptr, const char * str, size_t utf8length) { if ( str == 0) return; if (*str == 0) return; if ((ptr < m_psz) || (ptr > m_pEnd)) return; char * orig_buf = m_psz; char * orig_ptr = ptr; size_t length = static_cast(strlen(str)); if (!grow (length)) return; ptr = m_psz + (orig_ptr - orig_buf); memmove (ptr + length, ptr, (m_pEnd - ptr) + 1); memcpy (ptr, str, length); ptr += length; m_pEnd += length; m_strlen += utf8length; } void UT_UTF8Stringbuf::reserve(size_t n) { grow(n); } bool UT_UTF8Stringbuf::grow (size_t length) { if (length + 1 <= (m_buflen - (m_pEnd - m_psz))) return true; if (m_psz == 0) { if (length == 0) return true; m_psz = static_cast(g_try_malloc(length)); if (m_psz == 0) return false; m_strlen = 0; m_buflen = length; m_pEnd = m_psz; *m_pEnd = 0; return true; } size_t new_length = length + (m_pEnd - m_psz) + 1; size_t end_offset = m_pEnd - m_psz; char * more = static_cast(g_try_realloc(static_cast(m_psz), new_length)); if (more == 0) return false; m_psz = more; m_pEnd = m_psz + end_offset; m_buflen = new_length; return true; } UT_UTF8Stringbuf::UTF8Iterator::UTF8Iterator (const UT_UTF8Stringbuf * strbuf) : m_strbuf(strbuf), m_utfbuf(0), m_utfptr(0) { sync (); } UT_UTF8Stringbuf::UTF8Iterator::~UTF8Iterator () { // } void UT_UTF8Stringbuf::UTF8Iterator::operator=(const char * position) { if (!sync ()) return; if (static_cast(position- m_utfbuf) > m_strbuf->byteLength ()) { m_utfptr = m_utfbuf + m_strbuf->byteLength (); } else { m_utfptr = position; } } const char * UT_UTF8Stringbuf::UTF8Iterator::current () { if (!sync ()) return 0; if ((*m_utfptr & 0xc0) == 0x80) return 0; // oops - a 'continuing' byte return m_utfptr; } const char * UT_UTF8Stringbuf::UTF8Iterator::start () { if (!sync ()) return 0; return m_utfbuf; } const char * UT_UTF8Stringbuf::UTF8Iterator::end () { if (!sync ()) return 0; return m_utfbuf + m_strbuf->byteLength (); } const char * UT_UTF8Stringbuf::UTF8Iterator::advance () { if (!sync ()) return 0; if (*m_utfptr == 0) return 0; do { m_utfptr++; } while ((*m_utfptr & 0xc0) == 0x80); // a 'continuing' byte return m_utfptr; } const char * UT_UTF8Stringbuf::UTF8Iterator::retreat () { if (!sync ()) return 0; if (m_utfptr == m_utfbuf) return 0; do { m_utfptr--; } while ((*m_utfptr & 0xc0) == 0x80); // a 'continuing' byte return m_utfptr; } // returns false only if there is no string data bool UT_UTF8Stringbuf::UTF8Iterator::sync () { if (m_strbuf == 0) return false; const char * utf8_buffer = m_strbuf->data (); if (utf8_buffer == 0) { m_utfbuf = 0; m_utfptr = 0; return false; } size_t utf8_length = m_strbuf->byteLength (); /* note that this doesn't guarantee that m_utfptr points to the * start of UTF-8 char sequence */ if (static_cast(m_utfptr- m_utfbuf) > utf8_length) { m_utfptr = utf8_buffer + utf8_length; } else { m_utfptr = utf8_buffer + (m_utfptr - m_utfbuf); } m_utfbuf = utf8_buffer; return true; }