1 /*
2 Copyright (c) 2015-2021 Timur Gafarov
3 
4 Boost Software License - Version 1.0 - August 17th, 2003
5 
6 Permission is hereby granted, free of charge, to any person or organization
7 obtaining a copy of the software and accompanying documentation covered by
8 this license (the "Software") to use, reproduce, display, distribute,
9 execute, and transmit the Software, and to prepare derivative works of the
10 Software, and to permit third-parties to whom the Software is furnished to
11 do so, all subject to the following:
12 
13 The copyright notices in the Software and this entire statement, including
14 the above license grant, this restriction and the following disclaimer,
15 must be included in all copies of the Software, in whole or in part, and
16 all derivative works of the Software, unless such copies or derivative
17 works are solely in the form of machine-executable object code generated by
18 a source language processor.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
23 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
24 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
25 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.
27 */
28 
29 /**
30  * UTF-8 encoder and decoder
31  *
32  * Copyright: Timur Gafarov 2015-2021.
33  * License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0).
34  * Authors: Timur Gafarov, Roman Chistokhodov
35  */
36 module dlib.text.utf8;
37 
38 import dlib.text.common;
39 
/// Constant returned by UTF8Decoder at the end of the string.
41 enum UTF8_END = DECODE_END;
42 
/// Constant returned by UTF8Decoder when an error occurs.
44 enum UTF8_ERROR = DECODE_ERROR;
45 
46 /**
47  * UTF-8 decoder to use with dlib.text.encodings.transcode
48  */
struct UTF8Decoder
{
    public:

    /// Input string. Set it before decoding
    string input;

    /// Offset (in bytes) of the next unread byte of the input string
    size_t index = 0;

    /// Character counter; decodeNext increments it every time it starts
    /// reading a character, including characters that turn out to be malformed
    int character = 0;

    private:

    /// Reads the next input byte (0..255) and advances index,
    /// or returns UTF8_END without advancing when the input is exhausted.
    int get()
    {
        if (index >= input.length)
            return UTF8_END;
        auto c = input[index] & 0xFF;
        index++;
        return c;
    }

    /// Reads the next byte via get() and returns its 6 payload bits if it has
    /// the continuation pattern 10xxxxxx; otherwise returns UTF8_ERROR.
    /// Note that a rejected byte has already been consumed by get().
    int cont()
    {
        int c = get();
        return ((c & 0xC0) == 0x80) ? (c & 0x3F): UTF8_ERROR;
    }

    public:
    /**
     * Decode next character.
     *
     * Overlong encodings, UTF-16 surrogates (55296 to 57343) and values
     * above 1114111 (0x10FFFF) are reported as errors.
     *
     * Returns: decoded code point, or UTF8_ERROR if an error occurred,
     * or UTF8_END if input has no more characters.
     */
    int decodeNext()
    {
        int c;  // the first byte of the character
        int r;  // the result

        if (index >= input.length)
            return index == input.length ? UTF8_END : UTF8_ERROR;

        character++;
        c = get();

        // Zero continuation (0 to 127)
        if ((c & 0x80) == 0)
            return c;

        // The `>= 0` checks below rely on cont() signalling failure with a
        // negative value (assumes UTF8_ERROR is negative - defined in dlib.text.common).

        // One continuation (128 to 2047)
        if ((c & 0xE0) == 0xC0)
        {
            int c1 = cont();
            if (c1 >= 0)
            {
                r = ((c & 0x1F) << 6) | c1;
                // Anything below 128 is an overlong encoding
                return r >= 128 ? r : UTF8_ERROR;
            }
        }
        // Two continuation (2048 to 55295 and 57344 to 65535)
        else if ((c & 0xF0) == 0xE0)
        {
            int c1 = cont();
            int c2 = cont();
            if ((c1 | c2) >= 0)
            {
                r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
                // Reject overlong encodings and the surrogate range
                return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR;
            }
        }
        // Three continuation (65536 to 1114111)
        else if ((c & 0xF8) == 0xF0)
        {
            int c1 = cont();
            int c2 = cont();
            int c3 = cont();
            if ((c1 | c2 | c3) >= 0)
            {
                r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
                // Reject overlong encodings (below 65536) and values
                // beyond the last Unicode code point (1114111)
                return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR;
            }
        }

        return UTF8_ERROR;
    }

    /**
     * Check if decoder is in the end of input.
     */
    bool eos()
    {
        return (index >= input.length);
    }

    /**
     * Range interface.
     *
     * Stores s as the decoder's input and returns a forward range of dchar
     * that works on a copy of this decoder. The range becomes empty at the
     * end of input or at the first decoding error.
     */
    auto decode(string s)
    {
        input = s;

        static struct ByDchar
        {
            private:

            UTF8Decoder _decoder;
            dchar _lastRead;

            public:

            this(UTF8Decoder decoder)
            {
                _decoder = decoder;
                // Prime the range so that front is valid immediately
                _lastRead = cast(dchar)_decoder.decodeNext();
            }

            bool empty()
            {
                return _lastRead == UTF8_END || _lastRead == UTF8_ERROR;
            }

            dchar front()
            {
                return _lastRead;
            }

            void popFront()
            {
                _lastRead = cast(dchar)_decoder.decodeNext();
            }

            // The range holds its own decoder by value, so a plain copy
            // is an independent snapshot
            auto save() {
                return this;
            }
        }

        return ByDchar(this);
    }

    /// ditto
    auto decode()
    {
        return decode(input);
    }

    ///
    unittest
    {
        auto decoder = UTF8Decoder("Eng 日本語 Кир ©€");
        import std.algorithm: equal;
        assert(equal(decoder.decode(), "Eng 日本語 Кир ©€"d));

        auto range = decoder.decode();
        auto saved = range.save;

        range.popFront();
        range.popFront();
        range.popFront();
        range.popFront();
        range.popFront();

        assert(equal(range, "本語 Кир ©€"d));
        assert(equal(saved, "Eng 日本語 Кир ©€"d));
    }

    unittest
    {
        // Overlong 4-byte encoding of U+0000
        auto overlong = UTF8Decoder("\xF0\x80\x80\x80");
        assert(overlong.decodeNext() == UTF8_ERROR);

        // Would decode to 0x110000, one past the last code point
        auto tooLarge = UTF8Decoder("\xF4\x90\x80\x80");
        assert(tooLarge.decodeNext() == UTF8_ERROR);

        // Boundaries of the valid 4-byte range
        auto lowest = UTF8Decoder("\xF0\x90\x80\x80");
        assert(lowest.decodeNext() == 0x10000);
        auto highest = UTF8Decoder("\xF4\x8F\xBF\xBF");
        assert(highest.decodeNext() == 0x10FFFF);
    }
}
213 
214 ///
unittest
{
    // A string mixing 1-, 2-, 3- and 4-byte sequences is decoded
    // code point by code point, then the decoder reports the end of input.
    {
        auto dec = UTF8Decoder("Eng 日本語 Кир ©€\xF0\x90\x8D\x88");
        foreach (dchar expected; "Eng 日本語 Кир ©€\U00010348"d)
        {
            assert(dec.decodeNext() == expected);
        }
        assert(dec.decodeNext() == UTF8_END);
        assert(dec.get() == UTF8_END);
        assert(dec.eos());
    }

    // Cutting the last byte off the final character yields an error
    // after the intact characters have been decoded.
    {
        string text = "日本語";
        auto dec = UTF8Decoder(text[0 .. $ - 1]);
        foreach (dchar expected; "日本"d)
        {
            assert(dec.decodeNext() == expected);
        }
        assert(dec.decodeNext() == UTF8_ERROR);
    }
}
245 
246 /**
247  * UTF-8 encoder to use with dlib.text.encodings.transcode
248  */
struct UTF8Encoder
{
    /**
     * Encodes a Unicode code point to UTF-8 into user-provided buffer.
     *
     * Params:
     *   c = code point to encode
     *   buffer = destination buffer; 4 bytes are enough for any code point
     *
     * Returns: number of bytes written (1 to 4), or 0 at error. It is an error
     * if c is a UTF-16 surrogate (0xD800 to 0xDFFF), if c is greater than
     * 0x10FFFF, or if buffer is too small for the encoded sequence. Nothing is
     * written to buffer when an error is reported.
     */
    size_t encode(uint c, char[] buffer)
    {
        // Determine the sequence length first so that the buffer capacity
        // can be verified before any byte is written
        size_t len;
        if (c <= 0x7F)
        {
            // Plain ASCII
            len = 1;
        }
        else if (c <= 0x07FF)
        {
            len = 2;
        }
        else if (c <= 0xFFFF)
        {
            // Surrogates are not valid scalar values and have no UTF-8 form
            if (c >= 0xD800 && c <= 0xDFFF)
                return 0;
            len = 3;
        }
        else if (c <= 0x10FFFF)
        {
            len = 4;
        }
        else
        {
            // Beyond the Unicode code space
            return 0;
        }

        if (buffer.length < len)
            return 0;

        switch (len)
        {
            case 1:
                buffer[0] = cast(char)c;
                break;
            case 2:
                buffer[0] = cast(char)(((c >> 6) & 0x1F) | 0xC0);
                buffer[1] = cast(char)(((c >> 0) & 0x3F) | 0x80);
                break;
            case 3:
                buffer[0] = cast(char)(((c >> 12) & 0x0F) | 0xE0);
                buffer[1] = cast(char)(((c >>  6) & 0x3F) | 0x80);
                buffer[2] = cast(char)(((c >>  0) & 0x3F) | 0x80);
                break;
            default:
                buffer[0] = cast(char)(((c >> 18) & 0x07) | 0xF0);
                buffer[1] = cast(char)(((c >> 12) & 0x3F) | 0x80);
                buffer[2] = cast(char)(((c >>  6) & 0x3F) | 0x80);
                buffer[3] = cast(char)(((c >>  0) & 0x3F) | 0x80);
                break;
        }

        return len;
    }
}