/*
Copyright (c) 2015-2021 Timur Gafarov

Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

/**
 * UTF-8 encoder and decoder
 *
 * Copyright: Timur Gafarov 2015-2021.
 * License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors: Timur Gafarov, Roman Chistokhodov
 */
module dlib.text.utf8;

import dlib.text.common;

/// Constant to return from UTF8Decoder on the end of string.
enum UTF8_END = DECODE_END;

/// Constant to return from UTF8Decoder when error occurs.
enum UTF8_ERROR = DECODE_ERROR;

/**
 * UTF-8 decoder to use with dlib.text.encodings.transcode
 */
struct UTF8Decoder
{
    public:

    /// Input string. Set it before decoding
    string input;

    /// Current index in an input string
    size_t index = 0;

    /// Current character index
    int character = 0;

    private:

    /*
     * Reads the byte at the current index and advances the index.
     * Returns the byte value (0..255), or UTF8_END when the index is at
     * or past the end of the input.
     */
    int get()
    {
        if (index >= input.length)
            return UTF8_END;
        auto c = input[index] & 0xFF;
        index++;
        return c;
    }

    /*
     * Reads one continuation byte (bit pattern 10xxxxxx).
     * Returns its 6 payload bits, or UTF8_ERROR when the next byte is not
     * a continuation byte or the input is exhausted. The byte is consumed
     * in either case because get() always advances.
     */
    int cont()
    {
        int c = get();
        return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR;
    }

    public:

    /**
     * Decode next character.
     * Returns: decoded code point, or UTF8_ERROR if error occurred,
     * or UTF8_END if input has no more characters.
     *
     * Overlong encodings, UTF-16 surrogates (55296 to 57343) and values
     * above 1114111 (U+10FFFF) are reported as UTF8_ERROR.
     */
    int decodeNext()
    {
        int c; // the first byte of the character
        int r; // the result

        if (index >= input.length)
            return index == input.length ? UTF8_END : UTF8_ERROR;

        character++;
        c = get();

        // Zero continuation (0 to 127)
        if ((c & 0x80) == 0)
            return c;

        // NOTE: the `>= 0` checks below assume that the error/end constants
        // from dlib.text.common are negative, so OR-ing them with anything
        // stays negative.

        // One continuation (128 to 2047)
        if ((c & 0xE0) == 0xC0)
        {
            int c1 = cont();
            if (c1 >= 0)
            {
                r = ((c & 0x1F) << 6) | c1;
                // Values below 128 would be an overlong encoding
                return r >= 128 ? r : UTF8_ERROR;
            }
        }
        // Two continuation (2048 to 55295 and 57344 to 65535)
        else if ((c & 0xF0) == 0xE0)
        {
            int c1 = cont();
            int c2 = cont();
            if ((c1 | c2) >= 0)
            {
                r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
                // Reject overlong encodings and surrogate code points
                return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR;
            }
        }
        // Three continuation (65536 to 1114111)
        else if ((c & 0xF8) == 0xF0)
        {
            int c1 = cont();
            int c2 = cont();
            int c3 = cont();
            if ((c1 | c2 | c3) >= 0)
            {
                r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
                // Reject overlong encodings (below 65536) and values
                // beyond the last Unicode code point (above 1114111)
                return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR;
            }
        }

        return UTF8_ERROR;
    }

    /**
     * Check if decoder is in the end of input.
     */
    bool eos()
    {
        return (index >= input.length);
    }

    /**
     * Range interface.
     *
     * Stores `s` as the new input and returns a forward range of code
     * points that works on a copy of this decoder. The current `index`
     * and `character` values are not reset by this call.
     * The range becomes empty on the end of input or on the first
     * decoding error.
     */
    auto decode(string s)
    {
        input = s;

        static struct ByDchar
        {
            private:

            UTF8Decoder _decoder;
            dchar _lastRead;

            public:

            /// Takes a copy of the decoder and reads the first code point.
            this(UTF8Decoder decoder)
            {
                _decoder = decoder;
                _lastRead = cast(dchar)_decoder.decodeNext();
            }

            /// True when the last read hit the end of input or an error.
            bool empty()
            {
                return _lastRead == UTF8_END || _lastRead == UTF8_ERROR;
            }

            /// The most recently decoded code point.
            dchar front()
            {
                return _lastRead;
            }

            /// Decodes the next code point.
            void popFront()
            {
                _lastRead = cast(dchar)_decoder.decodeNext();
            }

            /// Returns a copy of the range, including its decoder state.
            auto save()
            {
                return this;
            }
        }

        return ByDchar(this);
    }

    /// ditto
    auto decode()
    {
        return decode(input);
    }

    ///
    unittest
    {
        auto decoder = UTF8Decoder("Eng 日本語 Кир ©€");
        import std.algorithm: equal;
        assert(equal(decoder.decode(), "Eng 日本語 Кир ©€"d));

        auto range = decoder.decode();
        auto saved = range.save;

        range.popFront();
        range.popFront();
        range.popFront();
        range.popFront();
        range.popFront();

        assert(equal(range, "本語 Кир ©€"d));
        assert(equal(saved, "Eng 日本語 Кир ©€"d));
    }
}

///
unittest
{
    {
        auto decoder = UTF8Decoder("Eng 日本語 Кир ©€\xF0\x90\x8D\x88");
        assert(decoder.decodeNext() == 'E');
        assert(decoder.decodeNext() == 'n');
        assert(decoder.decodeNext() == 'g');
        assert(decoder.decodeNext() == ' ');
        assert(decoder.decodeNext() == '日');
        assert(decoder.decodeNext() == '本');
        assert(decoder.decodeNext() == '語');
        assert(decoder.decodeNext() == ' ');
        assert(decoder.decodeNext() == 'К');
        assert(decoder.decodeNext() == 'и');
        assert(decoder.decodeNext() == 'р');
        assert(decoder.decodeNext() == ' ');
        assert(decoder.decodeNext() == '©');
        assert(decoder.decodeNext() == '€');
        assert(decoder.decodeNext() == 0x10348);
        assert(decoder.decodeNext() == UTF8_END);
        assert(decoder.get() == UTF8_END);
        assert(decoder.eos());
    }
    {
        auto decoder = UTF8Decoder("日本語"[0..$-1]);
        assert(decoder.decodeNext() == '日');
        assert(decoder.decodeNext() == '本');
        assert(decoder.decodeNext() == UTF8_ERROR);
    }
    {
        // Overlong four-byte encoding of U+0000 must be rejected
        auto decoder = UTF8Decoder("\xF0\x80\x80\x80");
        assert(decoder.decodeNext() == UTF8_ERROR);
    }
    {
        // U+110000 is beyond the last valid code point
        auto decoder = UTF8Decoder("\xF4\x90\x80\x80");
        assert(decoder.decodeNext() == UTF8_ERROR);
    }
    {
        // U+10FFFF is the last valid code point
        auto decoder = UTF8Decoder("\xF4\x8F\xBF\xBF");
        assert(decoder.decodeNext() == 0x10FFFF);
        assert(decoder.decodeNext() == UTF8_END);
    }
}

/**
 * UTF-8 encoder to use with dlib.text.encodings.transcode
 */
struct UTF8Encoder
{
    /**
     * Encodes a Unicode code point to UTF-8 into user-provided buffer.
     * Returns number of bytes written, or 0 at error.
     *
     * An error is reported when `c` is a UTF-16 surrogate
     * (0xD800 to 0xDFFF), when `c` is above 0x10FFFF, or when `buffer`
     * is too short to hold the encoded sequence. Nothing is written
     * in the error case.
     */
    size_t encode(uint c, char[] buffer)
    {
        if (c <= 0x7F)
        {
            // Plain ASCII
            if (buffer.length < 1)
                return 0;
            buffer[0] = cast(char)c;
            return 1;
        }
        else if (c <= 0x07FF)
        {
            // 2-byte unicode
            if (buffer.length < 2)
                return 0;
            buffer[0] = cast(char)(((c >> 6) & 0x1F) | 0xC0);
            buffer[1] = cast(char)(((c >> 0) & 0x3F) | 0x80);
            return 2;
        }
        else if (c <= 0xFFFF)
        {
            // Surrogates are not valid scalar values and have no
            // well-formed UTF-8 representation
            if (c >= 0xD800 && c <= 0xDFFF)
                return 0;

            // 3-byte unicode
            if (buffer.length < 3)
                return 0;
            buffer[0] = cast(char)(((c >> 12) & 0x0F) | 0xE0);
            buffer[1] = cast(char)(((c >> 6) & 0x3F) | 0x80);
            buffer[2] = cast(char)(((c >> 0) & 0x3F) | 0x80);
            return 3;
        }
        else if (c <= 0x10FFFF)
        {
            // 4-byte unicode
            if (buffer.length < 4)
                return 0;
            buffer[0] = cast(char)(((c >> 18) & 0x07) | 0xF0);
            buffer[1] = cast(char)(((c >> 12) & 0x3F) | 0x80);
            buffer[2] = cast(char)(((c >> 6) & 0x3F) | 0x80);
            buffer[3] = cast(char)(((c >> 0) & 0x3F) | 0x80);
            return 4;
        }
        else
        {
            // error
            return 0;
        }
    }
}

///
unittest
{
    UTF8Encoder encoder;
    char[4] buffer;

    assert(encoder.encode('A', buffer[]) == 1);
    assert(buffer[0..1] == "A");
    assert(encoder.encode(0xA9, buffer[]) == 2);
    assert(buffer[0..2] == "©");
    assert(encoder.encode(0x20AC, buffer[]) == 3);
    assert(buffer[0..3] == "€");
    assert(encoder.encode(0x10348, buffer[]) == 4);
    assert(buffer[0..4] == "\xF0\x90\x8D\x88");

    // Surrogates and out-of-range values are errors
    assert(encoder.encode(0xD800, buffer[]) == 0);
    assert(encoder.encode(0xDFFF, buffer[]) == 0);
    assert(encoder.encode(0x110000, buffer[]) == 0);

    // A buffer that is too short is an error, not an out-of-bounds write
    assert(encoder.encode(0x20AC, buffer[0..2]) == 0);
}