dlib.text.utf16 source code

1 /*
2 Copyright (c) 2016-2021 Timur Gafarov
3 
4 Boost Software License - Version 1.0 - August 17th, 2003
5 
6 Permission is hereby granted, free of charge, to any person or organization
7 obtaining a copy of the software and accompanying documentation covered by
8 this license (the "Software") to use, reproduce, display, distribute,
9 execute, and transmit the Software, and to prepare derivative works of the
10 Software, and to permit third-parties to whom the Software is furnished to
11 do so, all subject to the following:
12 
13 The copyright notices in the Software and this entire statement, including
14 the above license grant, this restriction and the following disclaimer,
15 must be included in all copies of the Software, in whole or in part, and
16 all derivative works of the Software, unless such copies or derivative
17 works are solely in the form of machine-executable object code generated by
18 a source language processor.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
23 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
24 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
25 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.
27 */
28 
29 /**
30  * UTF-16 decoder and encoder
31  *
32  * Copyright: Timur Gafarov 2016-2021.
33  * License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0).
34  * Authors: Timur Gafarov, Roman Chistokhodov
35  */
36 module dlib.text.utf16;
37 
38 import core.stdc.stdio;
39 import dlib.core.memory;
40 import dlib.container.array;
41 import dlib.text.utf8;
42 import dlib.text.utils;
43 import dlib.text.common;
44 
/// 16-bit constants shared by the UTF-16 routines in this module.
enum : ushort
{
    /// First code unit of the high (leading) surrogate range.
    UTF16_HI_SURROGATE = 0xD800,

    /// First code unit of the low (trailing) surrogate range.
    UTF16_LO_SURROGATE = 0xDC00,

    /// Byte order mark value tagged as little-endian.
    UTF16_BOM_LE = 0xFEFF,

    /// Byte order mark value tagged as big-endian (0xFEFF with its bytes swapped).
    UTF16_BOM_BE = 0xFFFE
}
49 
/**
 * UTF-16 LE decoder to use with dlib.text.encodings.transcode.
 *
 * The input is treated as a sequence of bytes holding little-endian
 * 16-bit code units. Code units are assembled byte by byte, so the result
 * does not depend on the host byte order or on the alignment of the input.
 */
struct UTF16LEDecoder
{
    public:

    /// Input string. Set it before decoding
    string input;

    /// Current index in an input string
    size_t index = 0;

    /// Current character index
    int character = 0;

    /**
     * Decode next character.
     *
     * Surrogate pairs are combined into a single code point.
     *
     * Returns: decoded code point, or DECODE_END if input has no more
     * characters, or DECODE_ERROR if an error occurred: the input ends in
     * the middle of a code unit or of a surrogate pair, or a surrogate
     * is not part of a valid high/low pair.
     */
    int decodeNext()
    {
        if (index >= input.length)
            return index == input.length ? DECODE_END : DECODE_ERROR;

        // A trailing single byte cannot form a code unit. Skip it so that
        // eos() becomes true and polling loops terminate.
        if (input.length - index < 2)
        {
            index = input.length;
            return DECODE_ERROR;
        }

        character++;
        immutable uint first = unitAt(index);
        index += 2;

        // Anything outside the surrogate range is a complete code point.
        if (first < UTF16_HI_SURROGATE || first > LAST_SURROGATE)
            return cast(int)first;

        // A low surrogate that was not preceded by a high surrogate.
        if (first >= UTF16_LO_SURROGATE)
            return DECODE_ERROR;

        // High surrogate: a low surrogate must follow.
        if (input.length - index < 2)
        {
            index = input.length;
            return DECODE_ERROR;
        }

        immutable uint second = unitAt(index);

        // The offending unit is left unconsumed so that the next call
        // decodes it on its own.
        if (second < UTF16_LO_SURROGATE || second > LAST_SURROGATE)
            return DECODE_ERROR;

        index += 2;
        return cast(int)(0x10000
            + ((first - UTF16_HI_SURROGATE) << 10)
            + (second - UTF16_LO_SURROGATE));
    }

    /**
     * Check if decoder is in the end of input.
     */
    bool eos()
    {
        return (index >= input.length);
    }

    /**
     * Range interface.
     *
     * Stores s as the input and returns a range of decoded code points that
     * works on a copy of this decoder. The current index and character
     * counters are not reset. Iteration stops at the end of input or at the
     * first decoding error.
     */
    auto decode(string s)
    {
        input = s;

        static struct ByDchar
        {
            private:
            UTF16LEDecoder _decoder;
            dchar _lastRead;

            public:
            this(UTF16LEDecoder decoder) {
                _decoder = decoder;
                _lastRead = cast(dchar)_decoder.decodeNext();
            }

            bool empty() {
                return _lastRead == DECODE_END || _lastRead == DECODE_ERROR;
            }

            dchar front() {
                return _lastRead;
            }

            void popFront() {
                _lastRead = cast(dchar)_decoder.decodeNext();
            }

            auto save() {
                return this;
            }
        }

        return ByDchar(this);
    }

    /// ditto
    auto decode()
    {
        return decode(input);
    }

    private:

    /// Last code unit of the surrogate range.
    enum uint LAST_SURROGATE = 0xDFFF;

    /// Assembles the little-endian code unit stored at input[pos .. pos + 2].
    uint unitAt(size_t pos) const
    {
        return cast(uint)cast(ubyte)input[pos]
            | (cast(uint)cast(ubyte)input[pos + 1] << 8);
    }
}
134 
/**
 * UTF-16 LE encoder to use with dlib.text.encodings.transcode
 */
struct UTF16LEEncoder
{
    /**
     * Encodes a Unicode code point to UTF-16 LE into user-provided buffer.
     *
     * Code points above 0xFFFF are written as a surrogate pair (4 bytes),
     * everything else as a single code unit (2 bytes). Bytes are stored
     * explicitly in little-endian order, independent of the host byte order.
     *
     * Params:
     *   ch = code point to encode
     *   buffer = destination; needs room for 2 bytes, or 4 bytes when ch > 0xFFFF
     *
     * Returns: number of bytes written, or 0 at error (ch is greater than
     * 0x10FFFF, or buffer is too small). Nothing is written on error.
     */
    size_t encode(uint ch, char[] buffer)
    {
        // Values past the last Unicode code point have no UTF-16 form.
        if (ch > 0x10FFFF)
            return 0;

        if (ch > 0xFFFF)
        {
            if (buffer.length < 4)
                return 0;

            // 20-bit offset into the supplementary planes, split 10/10.
            immutable uint offset = ch - 0x10000;
            putUnit(buffer, 0, UTF16_HI_SURROGATE | (offset >> 10));
            putUnit(buffer, 2, UTF16_LO_SURROGATE | (offset & 0x3FF));
            return 4;
        }

        if (buffer.length < 2)
            return 0;

        putUnit(buffer, 0, ch);
        return 2;
    }

    /// Stores the low 16 bits of unit at buffer[pos .. pos + 2], low byte first.
    private static void putUnit(char[] buffer, size_t pos, uint unit)
    {
        buffer[pos] = cast(char)(unit & 0xFF);
        buffer[pos + 1] = cast(char)((unit >> 8) & 0xFF);
    }
}
163 
/**
 * Converts UTF-8 to UTF-16.
 * Will be deprecated soon, use transcode!(UTF8Decoder, UTF16LEEncoder) instead
 *
 * Params:
 *   s = UTF-8 input
 *   nullTerm = when true, a zero code unit is appended after the text
 *
 * Returns: array of UTF-16 code units produced by copy(), or an empty
 * (null) array if the decoder reports UTF8_ERROR.
 */
wchar[] convertUTF8toUTF16(string s, bool nullTerm = false)
{
    Array!wchar units;
    UTF8Decoder decoder = UTF8Decoder(s);

    for (;;)
    {
        if (decoder.eos)
            break;

        immutable int decoded = decoder.decodeNext();
        if (decoded == UTF8_ERROR)
        {
            // Drop whatever was collected so far and report failure.
            units.free();
            return null;
        }

        immutable dchar codePoint = cast(dchar)decoded;
        if (codePoint <= 0xFFFF)
        {
            units.append(cast(wchar)codePoint);
            continue;
        }

        // Does not fit into 16 bits: emit a high/low surrogate pair.
        immutable uint plane = (codePoint >> 16) & 0x1F;
        immutable uint low16 = codePoint & 0xFFFF;
        immutable uint high = UTF16_HI_SURROGATE | ((plane - 1) << 6) | (low16 >> 10);
        immutable uint low = UTF16_LO_SURROGATE | (low16 & 0x3FF);
        units.append(cast(wchar)high);
        units.append(cast(wchar)low);
    }

    if (nullTerm)
        units.append(0);

    // Hand out a standalone copy, then release the temporary storage.
    wchar[] result = copy(units.data);
    units.free();
    return result;
}
211 
/**
 * Converts UTF-16 zero-terminated string to UTF-8.
 *
 * Surrogate pairs are combined into one code point and written as a
 * 4-byte sequence. A surrogate that is not part of a valid pair is replaced
 * with U+FFFD (REPLACEMENT CHARACTER), so the output is always well-formed
 * UTF-8.
 *
 * Params:
 *   s = pointer to zero-terminated UTF-16 code units; null is treated as an
 *       empty string
 *   nullTerm = when true, a zero byte is appended after the text
 *
 * Returns: array of UTF-8 bytes produced by copy().
 */
char[] convertUTF16ztoUTF8(wchar* s, bool nullTerm = false)
{
    enum uint LAST_SURROGATE = 0xDFFF;
    enum uint REPLACEMENT = 0xFFFD;

    Array!char array;

    if (s !is null)
    {
        wchar* utf16 = s;

        while (*utf16 != 0)
        {
            uint code = *utf16;
            utf16++;

            if (code >= UTF16_HI_SURROGATE && code < UTF16_LO_SURROGATE)
            {
                // Peeking is safe: the terminator has not been passed yet.
                immutable uint next = *utf16;
                if (next >= UTF16_LO_SURROGATE && next <= LAST_SURROGATE)
                {
                    code = 0x10000
                        + ((code - UTF16_HI_SURROGATE) << 10)
                        + (next - UTF16_LO_SURROGATE);
                    utf16++;
                }
                else
                {
                    // High surrogate without its partner.
                    code = REPLACEMENT;
                }
            }
            else if (code >= UTF16_LO_SURROGATE && code <= LAST_SURROGATE)
            {
                // Low surrogate without a preceding high surrogate.
                code = REPLACEMENT;
            }

            if (code < 0x80)
            {
                array.append(cast(char)code);
            }
            else if (code < 0x0800)
            {
                array.append(cast(char)(((code >> 6) & 0x1F) | 0xC0));
                array.append(cast(char)((code & 0x3F) | 0x80));
            }
            else if (code < 0x010000)
            {
                array.append(cast(char)(((code >> 12) & 0x0F) | 0xE0));
                array.append(cast(char)(((code >> 6) & 0x3F) | 0x80));
                array.append(cast(char)((code & 0x3F) | 0x80));
            }
            else
            {
                // Only reachable for combined surrogate pairs (<= 0x10FFFF).
                array.append(cast(char)(((code >> 18) & 0x07) | 0xF0));
                array.append(cast(char)(((code >> 12) & 0x3F) | 0x80));
                array.append(cast(char)(((code >> 6) & 0x3F) | 0x80));
                array.append(cast(char)((code & 0x3F) | 0x80));
            }
        }
    }

    if (nullTerm)
    {
        array.append(0);
    }

    char[] output = copy(array.data);
    array.free();
    return output;
}