/*
Copyright (c) 2016-2021 Timur Gafarov

Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

/**
 * UTF-16 decoder and encoder
 *
 * Copyright: Timur Gafarov 2016-2021.
 * License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors: Timur Gafarov, Roman Chistokhodov
 */
module dlib.text.utf16;

import core.stdc.stdio;
import dlib.core.memory;
import dlib.container.array;
import dlib.text.utf8;
import dlib.text.utils;
import dlib.text.common;

/// First code unit of the high (leading) surrogate range, 0xD800..0xDBFF
enum ushort UTF16_HI_SURROGATE = 0xD800;
/// First code unit of the low (trailing) surrogate range, 0xDC00..0xDFFF
enum ushort UTF16_LO_SURROGATE = 0xDC00;
enum ushort UTF16_BOM_LE = 0xfeff;
enum ushort UTF16_BOM_BE = 0xfffe;

/**
 * UTF-16 LE decoder to use with dlib.text.encodings.transcode
 *
 * The input is interpreted as a sequence of little-endian 16-bit code units
 * regardless of the byte order of the host.
 */
struct UTF16LEDecoder
{
    public:

    /// Input string. Set it before decoding
    string input;

    /// Current index in an input string
    size_t index = 0;

    /// Current character index
    int character = 0;

    /**
     * Decode next character.
     *
     * Surrogate pairs are combined into a single code point. An unpaired
     * surrogate or a trailing odd byte is reported as an error; the offending
     * code unit is consumed, so decoding may be continued after it.
     *
     * Returns: decoded code point, or DECODE_ERROR if error occurred,
     * or DECODE_END if input has no more characters.
     */
    int decodeNext()
    {
        if (index >= input.length)
            return index == input.length ? DECODE_END : DECODE_ERROR;

        // A single trailing byte cannot form a code unit. Step past the end
        // so that eos() becomes true and later calls keep reporting the error.
        if (input.length - index < 2)
        {
            index += 2;
            return DECODE_ERROR;
        }

        uint unit = unitAt(index);
        index += 2;

        // Low surrogate that is not preceded by a high surrogate.
        if ((unit & 0xFC00) == UTF16_LO_SURROGATE)
            return DECODE_ERROR;

        if ((unit & 0xFC00) == UTF16_HI_SURROGATE)
        {
            // index <= input.length here, so the subtraction cannot wrap.
            if (input.length - index < 2)
                return DECODE_ERROR;

            uint low = unitAt(index);

            // Not a low surrogate: leave it in place for the next call.
            if ((low & 0xFC00) != UTF16_LO_SURROGATE)
                return DECODE_ERROR;

            index += 2;
            character++;
            return cast(int)(0x10000 + (((unit & 0x3FF) << 10) | (low & 0x3FF)));
        }

        character++;
        return cast(int)unit;
    }

    /**
     * Check if decoder is in the end of input.
     */
    bool eos()
    {
        return (index >= input.length);
    }

    /**
     * Range interface.
     *
     * Assigns s to input and returns a forward range of decoded characters
     * that works on a copy of this decoder. The range becomes empty at the
     * end of input or at the first decoding error. index and character are
     * not reset by this call.
     */
    auto decode(string s)
    {
        input = s;

        static struct ByDchar
        {
            private:
            UTF16LEDecoder _decoder;
            dchar _lastRead;

            public:
            this(UTF16LEDecoder decoder)
            {
                _decoder = decoder;
                _lastRead = cast(dchar)_decoder.decodeNext();
            }

            bool empty()
            {
                return _lastRead == DECODE_END || _lastRead == DECODE_ERROR;
            }

            dchar front()
            {
                return _lastRead;
            }

            void popFront()
            {
                _lastRead = cast(dchar)_decoder.decodeNext();
            }

            auto save()
            {
                return this;
            }
        }

        return ByDchar(this);
    }

    /// ditto
    auto decode()
    {
        return decode(input);
    }

    // Reads the little-endian code unit stored at bytes i and i + 1.
    // The caller guarantees that both bytes are inside input.
    private uint unitAt(size_t i)
    {
        return cast(uint)input[i] | (cast(uint)input[i + 1] << 8);
    }
}

/**
 * UTF-16 LE encoder to use with dlib.text.encodings.transcode
 */
struct UTF16LEEncoder
{
    /**
     * Encodes a Unicode code point to UTF-16 LE into user-provided buffer.
     *
     * Bytes are written in little-endian order regardless of the byte order
     * of the host. Code points above 0xFFFF are written as a surrogate pair.
     *
     * Params:
     *   ch = code point to encode
     *   buffer = destination, at least 2 bytes long (4 for ch > 0xFFFF)
     *
     * Returns number of bytes written, or 0 at error (ch is above 0x10FFFF
     * or buffer is too small).
     */
    size_t encode(uint ch, char[] buffer)
    {
        if (ch > 0x10FFFF)
            return 0;

        if (ch > 0xFFFF)
        {
            if (buffer.length < 4)
                return 0;

            // 20-bit offset into the supplementary planes: the upper 10 bits
            // go to the high surrogate, the lower 10 bits to the low one.
            uint offset = ch - 0x10000;
            uint hi = UTF16_HI_SURROGATE | (offset >> 10);
            uint lo = UTF16_LO_SURROGATE | (offset & 0x3FF);
            buffer[0] = cast(char)(hi & 0xFF);
            buffer[1] = cast(char)(hi >> 8);
            buffer[2] = cast(char)(lo & 0xFF);
            buffer[3] = cast(char)(lo >> 8);
            return 4;
        }

        if (buffer.length < 2)
            return 0;

        buffer[0] = cast(char)(ch & 0xFF);
        buffer[1] = cast(char)(ch >> 8);
        return 2;
    }
}

/**
 * Converts UTF-8 to UTF-16
 * Will be deprecated soon, use transcode!(UTF8Decoder, UTF16LEEncoder) instead
 *
 * Params:
 *   s = UTF-8 input
 *   nullTerm = append a terminating zero code unit to the result
 *
 * Returns: newly allocated array of UTF-16 code units in host byte order,
 * or an empty array if the input could not be decoded.
 */
wchar[] convertUTF8toUTF16(string s, bool nullTerm = false)
{
    Array!wchar array;
    wchar[] output;

    UTF8Decoder dec = UTF8Decoder(s);

    while (!dec.eos)
    {
        int code = dec.decodeNext();

        // Anything that is not a code point (error sentinel, any other
        // negative value, or a value beyond the Unicode range) is a failure.
        if (code == UTF8_ERROR || code < 0 || code > 0x10FFFF)
        {
            array.free();
            return output;
        }

        uint ch = cast(uint)code;

        if (ch > 0xFFFF)
        {
            // Split ch up into a surrogate pair as it is over 16 bits long.
            uint offset = ch - 0x10000;
            array.append(cast(wchar)(UTF16_HI_SURROGATE | (offset >> 10)));
            array.append(cast(wchar)(UTF16_LO_SURROGATE | (offset & 0x3FF)));
        }
        else
        {
            array.append(cast(wchar)ch);
        }
    }

    if (nullTerm)
    {
        array.append(0);
    }

    output = copy(array.data);
    array.free();
    return output;
}

/**
 * Converts UTF-16 zero-terminated string to UTF-8
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one code point and written as a 4-byte sequence. An unpaired surrogate is
 * written as a 3-byte sequence of its own value.
 *
 * Params:
 *   s = pointer to zero-terminated UTF-16 code units; null is treated as an
 *       empty string
 *   nullTerm = append a terminating zero byte to the result
 *
 * Returns: newly allocated array of UTF-8 bytes.
 */
char[] convertUTF16ztoUTF8(wchar* s, bool nullTerm = false)
{
    Array!char array;
    char[] output;

    if (s !is null)
    {
        wchar* utf16 = s;

        while (*utf16)
        {
            uint cp = *utf16;
            utf16++;

            // Peeking at *utf16 is safe: cp was not the terminator, so the
            // next unit is at worst the terminating zero.
            if ((cp & 0xFC00) == UTF16_HI_SURROGATE &&
                (*utf16 & 0xFC00) == UTF16_LO_SURROGATE)
            {
                cp = 0x10000 + (((cp & 0x3FF) << 10) | (*utf16 & 0x3FF));
                utf16++;
            }

            if (cp < 0x80)
            {
                array.append(cast(char)cp);
            }
            else if (cp < 0x0800)
            {
                array.append(cast(char)((cp >> 6 & 0x1F) | 0xC0));
                array.append(cast(char)((cp >> 0 & 0x3F) | 0x80));
            }
            else if (cp < 0x010000)
            {
                array.append(cast(char)((cp >> 12 & 0x0F) | 0xE0));
                array.append(cast(char)((cp >> 6 & 0x3F) | 0x80));
                array.append(cast(char)((cp >> 0 & 0x3F) | 0x80));
            }
            else
            {
                array.append(cast(char)((cp >> 18 & 0x07) | 0xF0));
                array.append(cast(char)((cp >> 12 & 0x3F) | 0x80));
                array.append(cast(char)((cp >> 6 & 0x3F) | 0x80));
                array.append(cast(char)((cp >> 0 & 0x3F) | 0x80));
            }
        }
    }

    if (nullTerm)
    {
        array.append(0);
    }

    output = copy(array.data);
    array.free();
    return output;
}