335 lines
No EOL
9.7 KiB
C++
335 lines
No EOL
9.7 KiB
C++
/*
 * Convert a UTF-16 string to UTF-8, mapping indices to provide low-complexity
 * range and index lookups.
 *
 * Copyright 2010 Rasmus Andersson. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
|
|
|
|
#include "convert_utf16.h"
|
|
#include "core/array.h"
|
|
#include "core/variant.h"
|
|
#include "core/int_types.h"
|
|
#include "modules/debug/debug.h"
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Macros extracted from icu/unicode/utf16.h
|
|
|
|
/**
 * Test whether a code unit is a lead (high) surrogate, U+D800..U+DBFF.
 *
 * Clears the low 10 bits of the value and compares what is left with 0xd800.
 * The argument is expanded exactly once.
 *
 * @param c 16-bit code unit
 * @return non-zero (TRUE) if c is a lead surrogate, zero (FALSE) otherwise
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (((c) & 0xfffffc00) == 0xd800)
|
|
|
|
/**
 * Test whether a code unit is a trail (low) surrogate, U+DC00..U+DFFF.
 *
 * Clears the low 10 bits of the value and compares what is left with 0xdc00.
 * The argument is expanded exactly once.
 *
 * @param c 16-bit code unit
 * @return non-zero (TRUE) if c is a trail surrogate, zero (FALSE) otherwise
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (((c) & 0xfffffc00) == 0xdc00)
|
|
|
|
/**
 * Constant subtracted by U16_GET_SUPPLEMENTARY when combining a surrogate
 * pair: (0xd800 << 10) + 0xdc00 - 0x10000, i.e. 0x35fdc00.
 *
 * @internal
 */
#define U16_SURROGATE_OFFSET ((0xd800 << 10UL) + 0xdc00 - 0x10000)
|
|
|
|
/**
 * Combine a lead and a trail surrogate into the supplementary code point
 * (U+10000..U+10ffff) they encode.
 *
 * Both arguments are converted to uint32_t before the arithmetic. No
 * validation is performed: the result is undefined if the inputs are not
 * actually a lead and a trail surrogate.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
	(((uint32_t)(lead) << 10UL) + (uint32_t)(trail) - U16_SURROGATE_OFFSET)
|
|
|
|
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well, without any bounds check.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead
 * surrogate; if that unit is the last one in the buffer, the macro reads
 * one element past the end.
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as a single
 * statement (e.g. as the body of an unbraced if/else); terminate the
 * invocation with a semicolon. The arguments s, i and c are expanded more
 * than once, so they must not have side effects.
 *
 * @param s const UChar * string
 * @param i string offset (modifiable lvalue)
 * @param c output uint32_t variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) do { \
	(c) = (s)[(i)++]; \
	if (U16_IS_LEAD(c)) { \
		(c) = U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
	} \
} while (0)
|
|
|
|
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well, provided it lies before length.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as a single
 * statement (e.g. as the body of an unbraced if/else); terminate the
 * invocation with a semicolon. The arguments s, i, length and c are expanded
 * more than once, so they must not have side effects.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length (modifiable lvalue)
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) do { \
	(c) = (s)[(i)++]; \
	if (U16_IS_LEAD(c)) { \
		/* Candidate trail unit; only consumed when it really is a trail surrogate. */ \
		uint16_t u16_next_trail_; \
		if ((i) < (length) && U16_IS_TRAIL(u16_next_trail_ = (s)[(i)])) { \
			++(i); \
			(c) = U16_GET_SUPPLEMENTARY((c), u16_next_trail_); \
		} \
	} \
} while (0)
|
|
|
|
// end of icu/unicode/utf16.h
|
|
// ----------------------------------------------------------------------------
|
|
|
|
// convert_utf16_to_utf8 based on HUTF8MappedUTF16String.mm
|
|
|
|
/**
 * Transcode a buffer of UTF-16 code units into UTF-8 and return it as a String.
 *
 * Each input element is decoded with U16_NEXT (bounds-checked; an unpaired
 * surrogate is passed through as its own code point), encoded as 1-4 UTF-8
 * bytes into a temporary NUL-terminated buffer, and the buffer is then handed
 * to String::utf8().
 *
 * NOTE(review): u16_buf is declared as uint8_t *, so every indexed element is
 * a single byte (0..255), not a 16-bit unit. With that type the surrogate
 * branch of U16_NEXT can never trigger and only the 1- and 2-byte encodings
 * are reachable. If callers really pass UTF-16 data, the parameter type (and
 * whether u16_len counts bytes or 16-bit units) needs to be confirmed against
 * the header and the call sites.
 *
 * @param u16_buf input elements; a NULL pointer yields an empty String
 * @param u16_len number of elements readable through u16_buf
 * @return the String built by String::utf8() from the encoded bytes
 */
String convert_utf16_to_utf8(uint8_t * u16_buf, size_t u16_len)
{
	// Nothing to transcode.
	if (u16_buf == NULL || u16_len == 0)
	{
		return String();
	}

	// Sized for 4 UTF-8 bytes per input element plus the terminating NUL,
	// which is at least as large as anything the loop below can write.
	char *u8buf = new char[u16_len * 4 + 1];
	size_t u8_len = 0;

	for (size_t u16i = 0; u16i < u16_len; )
	{
		// Read 1-2 elements forming one 32-bit code point; u16i is advanced
		// by the macro and never moves past u16_len.
		uint32_t u32c = 0;
		U16_NEXT(u16_buf, u16i, u16_len, u32c);

		// Append u32c to u8buf (1-4 bytes).
		if (u32c <= 0x7f)
		{
			u8buf[u8_len++] = (char)u32c;
		}
		else if (u32c <= 0x7ff)
		{
			u8buf[u8_len++] = (char)((u32c >> 6) | 0xc0);
			u8buf[u8_len++] = (char)((u32c & 0x3f) | 0x80);
		}
		else if (u32c <= 0xffff)
		{
			u8buf[u8_len++] = (char)((u32c >> 12) | 0xe0);
			u8buf[u8_len++] = (char)(((u32c >> 6) & 0x3f) | 0x80);
			u8buf[u8_len++] = (char)((u32c & 0x3f) | 0x80);
		}
		else
		{
			u8buf[u8_len++] = (char)((u32c >> 18) | 0xf0);
			u8buf[u8_len++] = (char)(((u32c >> 12) & 0x3f) | 0x80);
			u8buf[u8_len++] = (char)(((u32c >> 6) & 0x3f) | 0x80);
			u8buf[u8_len++] = (char)((u32c & 0x3f) | 0x80);
		}
	}
	u8buf[u8_len] = 0;

	String out = String::utf8(u8buf);
	delete [] u8buf;

	return out;
}
|
|
|
|
/**
 * Encode a String as UTF-16 code units in a newly allocated buffer.
 *
 * The String is first converted with String::utf8(); every element of the
 * resulting CharString (indices 0 .. size()-1) is decoded into code points,
 * which are then written out as one UTF-16 unit (<= U+FFFF) or a surrogate
 * pair (above U+FFFF).
 *
 * The length stored in *u16_len is ONE LESS than the number of units written
 * to the buffer. Presumably the last unit is the NUL terminator that
 * CharString::size() accounts for -- confirm against CharString. When nothing
 * is decoded at all, a single zero unit is allocated and *u16_len is 0, so the
 * same convention holds and the length can never wrap around.
 *
 * On malformed input ("not a UTF-8 string" is reported through ERR_PRINT)
 * *u16_len is set to 0 and *u16_buf is left untouched.
 *
 * In DEBUG_ENABLED builds, a NULL u16_len / u16_buf or a non-NULL *u16_buf is
 * reported through ERR_PRINT and a string literal is thrown. In other builds a
 * NULL u16_len or u16_buf makes the function return without doing anything.
 *
 * @param input   text to convert
 * @param u16_buf out: receives a buffer allocated with new uint16_t[];
 *                presumably the caller releases it with delete[] -- confirm
 *                against the call sites
 * @param u16_len out: number of units written minus one (see above)
 */
void convert_utf8_to_utf16(String input, uint16_t ** u16_buf, size_t * u16_len)
{
	Array unicode;
	CharString utf8 = input.utf8();
	int i = 0;

#ifdef DEBUG_ENABLED
	if ( u16_len == NULL )
	{
		ERR_PRINT("Bad ptr for u16_len");
		throw "Bad ptr for u16_len";
	}
	if ( u16_buf == NULL )
	{
		ERR_PRINT("Bad ptr for u16_buf");
		throw "Bad ptr for u16_buf";
	}
	if ( *u16_buf != NULL )
	{
		ERR_PRINT("Bad ptr u16_buf not empty");
		throw "Bad ptr u16_buf not empty";
	}
#endif

	// Both pointers are dereferenced unconditionally below, so they must be
	// rejected even when the debug checks above are compiled out.
	if (u16_len == NULL || u16_buf == NULL)
	{
		return;
	}

	// Pass 1: decode the UTF-8 bytes into code points.
	while (i < utf8.size())
	{
		uint32_t uni = 0;
		size_t todo = 0; // number of continuation bytes still expected
		unsigned char ch = (unsigned char) utf8[i++];

		if (ch <= 0x7F)
		{
			uni = ch;
			todo = 0;
		}
		else if (ch >= 0xC0 && ch <= 0xDF)
		{
			uni = ch & 0x1F;
			todo = 1;
		}
		else if (ch >= 0xE0 && ch <= 0xEF)
		{
			uni = ch & 0x0F;
			todo = 2;
		}
		else if (ch >= 0xF0 && ch <= 0xF7)
		{
			uni = ch & 0x07;
			todo = 3;
		}
		else
		{
			// Stray continuation byte (0x80..0xBF) or invalid lead (0xF8..0xFF).
			ERR_PRINT("not a UTF-8 string");
			*u16_len = 0;
			return;
		}

		for (size_t j = 0; j < todo; ++j)
		{
			// Sequence truncated by the end of the input.
			if (i == utf8.size())
			{
				ERR_PRINT("not a UTF-8 string");
				*u16_len = 0;
				return;
			}
			ch = (unsigned char) utf8[i++];
			// Every following byte must be of the form 10xxxxxx.
			if (ch < 0x80 || ch > 0xBF)
			{
				ERR_PRINT("not a UTF-8 string");
				*u16_len = 0;
				return;
			}
			uni <<= 6;
			uni += ch & 0x3F;
		}

		// Surrogate code points and values above U+10FFFF are rejected.
		if ((uni >= 0xD800 && uni <= 0xDFFF) || uni > 0x10FFFF)
		{
			ERR_PRINT("not a UTF-8 string");
			*u16_len = 0;
			return;
		}

		Variant data_pushed(uni);
		unicode.push_back(data_pushed);
	}

	// Pass 2: count the UTF-16 units needed (2 for code points above U+FFFF).
	size_t unit_count = 0;
	for (i = 0; i < unicode.size(); ++i)
	{
		unsigned long uni = unicode[i];
		unit_count += (uni <= 0xFFFF) ? 1 : 2;
	}

	// Nothing decoded: hand back a single zero unit with length 0 instead of
	// letting "count - 1" wrap around to SIZE_MAX on a zero-sized allocation.
	if (unit_count == 0)
	{
		*u16_buf = new uint16_t[1];
		(*u16_buf)[0] = 0;
		*u16_len = 0;
		return;
	}

	*u16_buf = new uint16_t[unit_count];
	// Reported length excludes the last unit written.
	*u16_len = unit_count - 1;

	// Pass 3: emit the UTF-16 units.
	size_t ii = 0;
	DBG_PRINT("array (max:" + itos(*u16_len));
	for (i = 0; i < unicode.size(); ++i)
	{
		unsigned long uni = unicode[i];
		DBG_PRINT("array (max:" + itos(*u16_len) + " / i:" + itos(i) + " / current:" + itos(ii) + ")");
		if (uni <= 0xFFFF)
		{
			(*u16_buf)[ii++] = (uint16_t)uni;
		}
		else
		{
			// Split into a lead/trail surrogate pair.
			uni -= 0x10000;
			(*u16_buf)[ii++] = (uint16_t)((uni >> 10) + 0xD800);
			(*u16_buf)[ii++] = (uint16_t)((uni & 0x3FF) + 0xDC00);
		}
	}

#ifdef DEBUG_ENABLED
	// Sanity check: index of the last unit written must equal the reported length.
	ii--;
	if(ii != *u16_len)
	{
		ERR_PRINT("Out of array (max:" + itos(*u16_len) + " / current:" + itos(ii) + ")");
		throw "Out of array";
	}
#endif
}