1 /* 2 Copyright (c) 2016-2021 Timur Gafarov 3 4 Boost Software License - Version 1.0 - August 17th, 2003 5 6 Permission is hereby granted, free of charge, to any person or organization 7 obtaining a copy of the software and accompanying documentation covered by 8 this license (the "Software") to use, reproduce, display, distribute, 9 execute, and transmit the Software, and to prepare derivative works of the 10 Software, and to permit third-parties to whom the Software is furnished to 11 do so, all subject to the following: 12 13 The copyright notices in the Software and this entire statement, including 14 the above license grant, this restriction and the following disclaimer, 15 must be included in all copies of the Software, in whole or in part, and 16 all derivative works of the Software, unless such copies or derivative 17 works are solely in the form of machine-executable object code generated by 18 a source language processor. 19 20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 23 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 24 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 25 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 DEALINGS IN THE SOFTWARE. 27 */ 28 29 /** 30 * General-purpose non-allocating lexical analyzer. 31 * 32 * Description: 33 * Breaks the input string to a stream of lexemes according to a given delimiter dictionary. 34 * Delimiters are symbols that separate sequences of characters (e.g. operators). 35 * Lexemes are slices of the input string. 36 * Assumes UTF-8 input. 37 * Treats \r\n as a single \n. 38 * 39 * Copyright: Timur Gafarov 2016-2021. 40 * License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0). 41 * Authors: Timur Gafarov, Eugene Wissner, Roman Chistokhodov, ijet 42 */ 43 module dlib.text.lexer; 44 45 import std.ascii; 46 import std.range.interfaces; 47 import dlib.text.utf8; 48 49 /** 50 * Lexical analyzer class 51 */ 52 class Lexer: InputRange!string 53 { 54 protected: 55 string input; 56 string[] delims; 57 58 UTF8Decoder dec; 59 uint index; 60 dchar current; 61 uint currentSize; 62 63 public: 64 bool ignoreWhitespaces = false; 65 bool ignoreNewlines = false; 66 67 this(string input, string[] delims) 68 { 69 this.input = input; 70 this.delims = delims; 71 this.dec.input = this.input; 72 this.index = 0; 73 advance(); 74 } 75 76 uint position() @property 77 { 78 return pos(); 79 } 80 81 string getLexeme() 82 { 83 string res = ""; 84 int tStart = -1; 85 while(true) 86 { 87 dchar c = current; 88 if (c == UTF8_END || c == UTF8_ERROR) 89 { 90 if (tStart > -1) 91 { 92 res = input[tStart..pos]; 93 tStart = -1; 94 break; 95 } 96 else 97 { 98 res = ""; 99 break; 100 } 101 } 102 103 if (isNewline(c)) 104 { 105 if (tStart > -1) 106 { 107 res = input[tStart..pos]; 108 tStart = -1; 109 break; 110 } 111 else 112 { 113 if (c == '\r') 114 { 115 advance(); 116 c = current; 117 if (c == '\n') 118 advance(); 119 } 120 else 121 { 122 advance(); 123 } 124 125 if (ignoreNewlines) 126 { 127 continue; 128 } 129 else 130 { 131 res = "\n"; 132 break; 133 } 134 } 135 } 136 137 if (isWhitespace(c)) 138 { 139 if (tStart > -1) 140 { 141 res = input[tStart..pos]; 142 tStart = -1; 143 break; 144 } 145 else 146 { 147 advance(); 148 if (ignoreWhitespaces) 149 { 150 continue; 151 } 152 else 153 { 154 res = " "; 155 break; 156 } 157 } 158 } 159 160 string d = consumeDelimiter(); 161 if (d.length) 162 { 163 if (tStart > -1) 164 { 165 res = input[tStart..pos]; 166 } 167 else 168 { 169 res = d; 170 forwardJump(d.length); 171 } 172 173 break; 174 } 175 else 176 { 177 if (tStart == -1) 178 { 179 tStart = pos; 180 } 181 advance(); 182 } 183 } 184 185 return res; 186 } 187 188 protected: 189 190 uint pos() 191 { 192 return index-currentSize; 193 } 194 195 void advance() 196 { 197 current = dec.decodeNext(); 198 currentSize = cast(uint)dec.index - index; 199 index += currentSize; 200 } 201 202 void forwardJump(size_t numChars) 203 { 204 for(size_t i = 0; i < numChars; i++) 205 { 206 advance(); 207 } 208 } 209 210 bool forwardCompare(string str) 211 { 212 UTF8Decoder dec2 = UTF8Decoder(str); 213 size_t oldIndex = dec.index; 214 215 bool res = true; 216 217 int c1 = current; 218 int c2 = dec2.decodeNext(); 219 do 220 { 221 if (c2 == UTF8_END || c2 == UTF8_ERROR) 222 break; 223 224 if (c1 == UTF8_END && c2 == UTF8_END) 225 { 226 res = true; 227 break; 228 } 229 230 if (c1 != UTF8_END && c1 != UTF8_ERROR && 231 c2 != UTF8_END && c2 != UTF8_ERROR) 232 { 233 if (c1 != c2) 234 { 235 res = false; 236 break; 237 } 238 } 239 else 240 { 241 res = false; 242 break; 243 } 244 245 c1 = dec.decodeNext(); 246 c2 = dec2.decodeNext(); 247 } 248 while(c2 != UTF8_END && c2 != UTF8_ERROR); 249 250 dec.index = oldIndex; 251 252 return res; 253 } 254 255 bool isWhitespace(dchar c) 256 { 257 foreach(w; std.ascii.whitespace) 258 { 259 if (c == w) 260 { 261 return true; 262 } 263 } 264 return false; 265 } 266 267 bool isNewline(dchar c) 268 { 269 return (c == '\n' || c == '\r'); 270 } 271 272 string consumeDelimiter() 273 { 274 size_t bestLen = 0; 275 string bestStr = ""; 276 foreach(d; delims) 277 { 278 if (forwardCompare(d)) 279 { 280 if (d.length > bestLen) 281 { 282 bestLen = d.length; 283 bestStr = input[pos..pos+d.length]; 284 } 285 } 286 } 287 return bestStr; 288 } 289 290 // Range interface 291 292 private: 293 294 string _front; 295 296 public: 297 298 bool empty() 299 { 300 return _front.length == 0; 301 } 302 303 string front() 304 { 305 return _front; 306 } 307 308 void popFront() 309 { 310 _front = getLexeme(); 311 } 312 313 string moveFront() 314 { 315 _front = getLexeme(); 316 return _front; 317 } 318 319 int opApply(scope int delegate(string) dg) 320 { 321 int result = 0; 322 323 while(true) 324 { 325 string lexeme = getLexeme(); 326 327 if (!lexeme.length) 328 break; 329 330 result = dg(lexeme); 331 if (result) 332 break; 333 } 334 335 return result; 336 } 337 338 int opApply(scope int delegate(size_t, string) dg) 339 { 340 int result = 0; 341 size_t i = 0; 342 343 while(true) 344 { 345 string lexeme = getLexeme(); 346 347 if (!lexeme.length) 348 break; 349 350 result = dg(i, lexeme); 351 if (result) 352 break; 353 354 i++; 355 } 356 357 return result; 358 } 359 } 360 361 /// 362 unittest 363 { 364 import std.array: array; 365 import std.range: iota; 366 367 string[] delims = ["(", ")", ";", " ", "{", "}", ".", "\n", "\r", "=", "++", "<"]; 368 auto input = "for (int i=0; i<arr.length; ++i)\r\n{doThing();}\n"; 369 auto lexer = new Lexer(input, delims); 370 371 string[] arr; 372 while(true) { 373 auto lexeme = lexer.getLexeme(); 374 if(lexeme.length == 0) { 375 break; 376 } 377 arr ~= lexeme; 378 } 379 auto reference = [ 380 "for", " ", "(", "int", " ", "i", "=", "0", ";", " ", "i", "<", 381 "arr", ".", "length", ";", " ", "++", "i", ")", "\n", "{", "doThing", 382 "(", ")", ";", "}", "\n" ]; 383 assert(arr == reference); 384 385 lexer = new Lexer(input, delims); 386 arr = array(lexer); 387 assert(arr == reference); 388 389 input = ""; 390 lexer = new Lexer(input, delims); 391 assert(lexer.getLexeme().length == 0); 392 }