// test-client-godot/modules/bitstream/convert_utf16.cpp
// (listing metadata: 335 lines, no EOL, 9.7 KiB, C++)


/*
* Convert a UTF-16 string to UTF-8, mapping indices to provide low-complexity
* range and index lookups.
*
* Copyright 2010 Rasmus Andersson. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "convert_utf16.h"
#include "core/array.h"
#include "core/variant.h"
#include "core/int_types.h"
#include "modules/debug/debug.h"
// ----------------------------------------------------------------------------
// Macros extracted from icu/unicode/utf16.h
/**
 * Test whether a code unit is a lead (high) surrogate, i.e. lies in
 * U+D800..U+DBFF.
 * The low 10 bits are masked away and the remaining bits are compared
 * against 0xD800, so values above 0xFFFF never match.
 * @param c 16-bit code unit
 * @return non-zero (TRUE) for a lead surrogate, zero (FALSE) otherwise
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) ( ((c) & 0xFFFFFC00u) == 0xD800u )
/**
 * Test whether a code unit is a trail (low) surrogate, i.e. lies in
 * U+DC00..U+DFFF.
 * The low 10 bits are masked away and the remaining bits are compared
 * against 0xDC00, so values above 0xFFFF never match.
 * @param c 16-bit code unit
 * @return non-zero (TRUE) for a trail surrogate, zero (FALSE) otherwise
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) ( ((c) & 0xFFFFFC00u) == 0xDC00u )
/**
 * Constant subtracted by U16_GET_SUPPLEMENTARY: the value obtained by
 * combining the smallest lead surrogate (0xD800) and the smallest trail
 * surrogate (0xDC00), minus 0x10000. Evaluates to 0x35fdc00.
 * @internal
 */
#define U16_SURROGATE_OFFSET ( ((0xD800 << 10) + 0xDC00) - 0x10000 )
/**
 * Combine a surrogate pair into the supplementary code point
 * (U+10000..U+10ffff) it encodes.
 * Both arguments are widened to uint32_t before the arithmetic. No check is
 * made that they really are surrogates; the result is undefined if they are
 * not a lead and a trail surrogate.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ( (uint32_t)(trail) + ((uint32_t)(lead) << 10) - U16_SURROGATE_OFFSET )
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16: no bounds are checked.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following unit as the trail surrogate without verifying it.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead
 * surrogate (the next unit is consumed regardless, possibly past the end of
 * the string).
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else; call it
 * with a trailing semicolon.
 *
 * @param s const UChar * string
 * @param i string offset (modified)
 * @param c output uint32_t variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) do { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
} while (0)
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well, provided one exists before
 * the end of the string.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point and only one unit is consumed.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely in an unbraced if/else; call it
 * with a trailing semicolon. The temporary holding the candidate trail unit
 * deliberately avoids a double-underscore name, which is reserved to the
 * implementation.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length (modified)
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) do { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        uint16_t u16_next_trail_; \
        if((i)<(length) && U16_IS_TRAIL(u16_next_trail_=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), u16_next_trail_); \
        } \
    } \
} while (0)
// end of icu/unicode/utf16.h
// ----------------------------------------------------------------------------
// convert_utf16_to_utf8 based on HUTF8MappedUTF16String.mm
/**
 * Build a String from a buffer of code units by re-encoding them as UTF-8.
 *
 * Each element of u16_buf is fetched with U16_NEXT_UNSAFE, the resulting
 * code point is written as 1-4 UTF-8 bytes into a temporary buffer, and the
 * NUL-terminated buffer is handed to String::utf8().
 *
 * NOTE(review): u16_buf is declared as uint8_t*, so elements are read one
 * byte at a time. A single byte can never satisfy U16_IS_LEAD, so through
 * this signature every input byte is treated as one code point (0x00..0xFF)
 * and the surrogate-pair, 3-byte and 4-byte branches look unreachable.
 * Confirm with the callers whether a uint16_t buffer was intended.
 *
 * NOTE(review): the bytes are passed to String::utf8() as a C string without
 * a length, so presumably a zero element in the input ends the resulting
 * String early - confirm against String::utf8().
 *
 * @param u16_buf input buffer; at least u16_len elements must be readable
 * @param u16_len number of elements of u16_buf to convert
 * @return the String produced by String::utf8() from the encoded bytes
 */
String convert_utf16_to_utf8(uint8_t * u16_buf, size_t u16_len)
{
    // Worst case is 4 UTF-8 bytes per input unit, plus the terminating NUL.
    char *u8buf = new char[u16_len*4+1];
    size_t u8_len = 0;

    for (size_t u16i = 0; u16i < u16_len; )
    {
        // Retrieve 1-2 units forming one 32-bit code point; advances u16i.
        // U16_NEXT would be the bounds-checked alternative; the unchecked
        // variant is kept from the original implementation.
        uint32_t u32c = 0;
        U16_NEXT_UNSAFE(u16_buf, u16i, u32c);

        // Append u32c to u8buf (1-4 bytes)
        if (u32c <= 0x7f)
        {
            u8buf[u8_len++] = (uint8_t)u32c;
        }
        else if (u32c <= 0x7ff)
        {
            u8buf[u8_len++] = (uint8_t)((u32c>>6)|0xc0);
            u8buf[u8_len++] = (uint8_t)((u32c&0x3f)|0x80);
        }
        else if (u32c <= 0xffff)
        {
            u8buf[u8_len++] = (uint8_t)((u32c>>12)|0xe0);
            u8buf[u8_len++] = (uint8_t)(((u32c>>6)&0x3f)|0x80);
            u8buf[u8_len++] = (uint8_t)((u32c&0x3f)|0x80);
        }
        else
        {
            u8buf[u8_len++] = (uint8_t)((u32c>>18)|0xf0);
            u8buf[u8_len++] = (uint8_t)(((u32c>>12)&0x3f)|0x80);
            u8buf[u8_len++] = (uint8_t)(((u32c>>6)&0x3f)|0x80);
            u8buf[u8_len++] = (uint8_t)((u32c&0x3f)|0x80);
        }
    }

    // String::utf8() is given only the pointer, so terminate the buffer.
    u8buf[u8_len] = 0;
    String out = String::utf8(u8buf);
    delete [] u8buf;
    return out;
}
/**
 * Encode a String as UTF-16 code units in a freshly allocated buffer.
 *
 * The string is serialised with String::utf8(); its bytes are decoded into
 * code points, which are then written as one unit (<= U+FFFF) or a
 * surrogate pair (above U+FFFF).
 *
 * Decoding rejects: continuation bytes (0x80..0xBF) or bytes above 0xF7 in
 * lead position, truncated sequences, non-continuation bytes inside a
 * sequence, surrogate code points (U+D800..U+DFFF) and values above
 * U+10FFFF. On rejection an error is printed, *u16_len is set to 0 and
 * *u16_buf is left untouched.
 *
 * On success *u16_buf points to an array allocated with new[] (the caller
 * releases it with delete[]) and *u16_len is the number of units written
 * minus one.
 * NOTE(review): the decode loop walks utf8.size() bytes and the reported
 * length excludes the last unit written; presumably CharString::size()
 * counts the terminating NUL, so the buffer ends with a 0 unit that is not
 * part of the length - confirm against CharString.
 *
 * If nothing at all is decoded (utf8.size() == 0), the buffer holds a single
 * 0 unit and *u16_len is 0. Previously this case decremented a zero size_t
 * and reported SIZE_MAX alongside a zero-length allocation.
 *
 * In DEBUG_ENABLED builds a const char* is thrown when u16_len or u16_buf is
 * NULL, when *u16_buf is not NULL on entry, or when the number of units
 * written disagrees with the number counted.
 *
 * @param input   string to convert
 * @param u16_buf out: receives the allocated buffer; *u16_buf must be NULL
 *                on entry (checked in debug builds only)
 * @param u16_len out: receives the unit count as described above
 */
void convert_utf8_to_utf16(String input, uint16_t ** u16_buf, size_t * u16_len)
{
    Array unicode;
    CharString utf8 = input.utf8();
    int i = 0;
#ifdef DEBUG_ENABLED
    if ( u16_len == NULL )
    {
        ERR_PRINT("Bad ptr for u16_len");
        throw "Bad ptr for u16_len";
    }
    if ( u16_buf == NULL )
    {
        ERR_PRINT("Bad ptr for u16_buf");
        throw "Bad ptr for u16_buf";
    }
    if ( *u16_buf != NULL )
    {
        ERR_PRINT("Bad ptr u16_buf not empty");
        throw "Bad ptr u16_buf not empty";
    }
#endif
    // Pass 1: decode the UTF-8 bytes into code points.
    while (i < utf8.size())
    {
        uint32_t uni;
        size_t todo; // number of continuation bytes still expected
        unsigned char ch = (unsigned char) utf8[i++];
        if (ch <= 0x7F)
        {
            uni = ch;
            todo = 0;
        }
        else if (ch >= 0xC0 && ch <= 0xDF)
        {
            uni = ch&0x1F;
            todo = 1;
        }
        else if (ch >= 0xE0 && ch <= 0xEF)
        {
            uni = ch&0x0F;
            todo = 2;
        }
        else if (ch >= 0xF0 && ch <= 0xF7)
        {
            uni = ch&0x07;
            todo = 3;
        }
        else
        {
            // 0x80..0xBF (continuation byte in lead position) or 0xF8..0xFF
            ERR_PRINT("not a UTF-8 string");
            *u16_len = 0;
            return;
        }
        for (size_t j = 0; j < todo; ++j)
        {
            if (i == utf8.size())
            {
                // sequence cut short by the end of the data
                ERR_PRINT("not a UTF-8 string");
                *u16_len = 0;
                return;
            }
            ch = (unsigned char) utf8[i++];
            if (ch < 0x80 || ch > 0xBF)
            {
                ERR_PRINT("not a UTF-8 string");
                *u16_len = 0;
                return;
            }
            uni <<= 6;
            uni += ch & 0x3F;
        }
        if ((uni >= 0xD800 && uni <= 0xDFFF) || uni > 0x10FFFF)
        {
            // surrogate code point or beyond the Unicode range
            ERR_PRINT("not a UTF-8 string");
            *u16_len = 0;
            return;
        }
        Variant data_pushed(uni);
        unicode.push_back(data_pushed);
    }

    // Pass 2: count the UTF-16 units needed (2 for code points above U+FFFF).
    size_t units = 0;
    for (i = 0; i < unicode.size(); ++i)
    {
        unsigned long uni = unicode[i];
        units += (uni <= 0xFFFF) ? 1 : 2;
    }

    if (units == 0)
    {
        // Nothing decoded: return a lone 0 unit instead of letting the
        // "minus one" below wrap around.
        *u16_buf = new uint16_t[1];
        (*u16_buf)[0] = 0;
        *u16_len = 0;
        return;
    }

    *u16_buf = new uint16_t[units];
    *u16_len = units - 1; // last unit is not counted (see NOTE above)

    // Pass 3: write the units.
    size_t ii = 0;
    DBG_PRINT("array (max:" + itos(*u16_len));
    for (i = 0; i < unicode.size(); ++i)
    {
        unsigned long uni = unicode[i];
        DBG_PRINT("array (max:" + itos(*u16_len) + " / i:" + itos(i) + " / current:" + itos(ii) + ")");
        if (uni <= 0xFFFF)
        {
            (*u16_buf)[ii++] = (uint16_t)uni;
        }
        else
        {
            uni -= 0x10000;
            (*u16_buf)[ii++] = (uint16_t)((uni >> 10) + 0xD800);
            (*u16_buf)[ii++] = (uint16_t)((uni & 0x3FF) + 0xDC00);
        }
    }
#ifdef DEBUG_ENABLED
    // Sanity check: the write pass must have produced exactly `units` units.
    if(ii != units)
    {
        ERR_PRINT("Out of array (max:" + itos(*u16_len) + " / current:" + itos(ii - 1) + ")");
        throw "Out of array";
    }
#endif
}