1 /*
2 Copyright (c) 2016-2021 Timur Gafarov
3 
4 Boost Software License - Version 1.0 - August 17th, 2003
5 
6 Permission is hereby granted, free of charge, to any person or organization
7 obtaining a copy of the software and accompanying documentation covered by
8 this license (the "Software") to use, reproduce, display, distribute,
9 execute, and transmit the Software, and to prepare derivative works of the
10 Software, and to permit third-parties to whom the Software is furnished to
11 do so, all subject to the following:
12 
13 The copyright notices in the Software and this entire statement, including
14 the above license grant, this restriction and the following disclaimer,
15 must be included in all copies of the Software, in whole or in part, and
16 all derivative works of the Software, unless such copies or derivative
17 works are solely in the form of machine-executable object code generated by
18 a source language processor.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
23 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
24 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
25 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.
27 */
28 
29 /**
30  * General-purpose non-allocating lexical analyzer.
31  *
32  * Description:
33  * Breaks the input string to a stream of lexemes according to a given delimiter dictionary.
34  * Delimiters are symbols that separate sequences of characters (e.g. operators).
35  * Lexemes are slices of the input string.
36  * Assumes UTF-8 input.
37  * Treats \r\n as a single \n.
38  *
39  * Copyright: Timur Gafarov 2016-2021.
40  * License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0).
41  * Authors: Timur Gafarov, Eugene Wissner, Roman Chistokhodov, ijet
42  */
43 module dlib.text.lexer;
44 
45 import std.ascii;
46 import std.range.interfaces;
47 import dlib.text.utf8;
48 
49 /**
50  * Lexical analyzer class
51  */
52 class Lexer: InputRange!string
53 {
54     protected:
55     string input;
56     string[] delims;
57 
58     UTF8Decoder dec;
59     uint index;
60     dchar current;
61     uint currentSize;
62 
63     public:
64     bool ignoreWhitespaces = false;
65     bool ignoreNewlines = false;
66 
67     this(string input, string[] delims)
68     {
69         this.input = input;
70         this.delims = delims;
71         this.dec.input = this.input;
72         this.index = 0;
73         advance();
74     }
75     
76     uint position() @property
77     {
78         return pos();
79     }
80 
81     string getLexeme()
82     {
83         string res = "";
84         int tStart = -1;
85         while(true)
86         {
87             dchar c = current;
88             if (c == UTF8_END || c == UTF8_ERROR)
89             {
90                 if (tStart > -1)
91                 {
92                     res = input[tStart..pos];
93                     tStart = -1;
94                     break;
95                 }
96                 else
97                 {
98                     res = "";
99                     break;
100                 }
101             }
102 
103             if (isNewline(c)) 
104             {
105                 if (tStart > -1)
106                 {
107                     res = input[tStart..pos];
108                     tStart = -1;
109                     break;
110                 }
111                 else
112                 {
113                     if (c == '\r')
114                     {
115                         advance();
116                         c = current;
117                         if (c == '\n')
118                             advance();
119                     }
120                     else
121                     {
122                         advance();
123                     }
124 
125                     if (ignoreNewlines)
126                     {
127                         continue;
128                     }
129                     else
130                     {
131                         res = "\n";
132                         break;
133                     }
134                 }
135             }
136 
137             if (isWhitespace(c))
138             {
139                 if (tStart > -1)
140                 {
141                     res = input[tStart..pos];
142                     tStart = -1;
143                     break;
144                 }
145                 else
146                 {
147                     advance();
148                     if (ignoreWhitespaces)
149                     {
150                         continue;
151                     }
152                     else
153                     {
154                         res = " ";
155                         break;
156                     }
157                 }
158             }
159 
160             string d = consumeDelimiter();
161             if (d.length)
162             {
163                 if (tStart > -1)
164                 {
165                     res = input[tStart..pos];
166                 }
167                 else
168                 {
169                     res = d;
170                     forwardJump(d.length);
171                 }
172 
173                 break;
174             }
175             else
176             {
177                 if (tStart == -1)
178                 {
179                     tStart = pos;
180                 }
181                 advance();
182             }
183         }
184 
185         return res;
186     }
187 
188     protected:
189 
190     uint pos()
191     {
192         return index-currentSize;
193     }
194 
195     void advance()
196     {
197         current = dec.decodeNext();
198         currentSize = cast(uint)dec.index - index;
199         index += currentSize;
200     }
201 
202     void forwardJump(size_t numChars)
203     {
204         for(size_t i = 0; i < numChars; i++)
205         {
206             advance();
207         }
208     }
209 
210     bool forwardCompare(string str)
211     {
212         UTF8Decoder dec2 = UTF8Decoder(str);
213         size_t oldIndex = dec.index;
214 
215         bool res = true;
216 
217         int c1 = current;
218         int c2 = dec2.decodeNext();
219         do
220         {
221             if (c2 == UTF8_END || c2 == UTF8_ERROR)
222                 break;
223 
224             if (c1 == UTF8_END && c2 == UTF8_END)
225             {
226                 res = true;
227                 break;
228             }
229 
230             if (c1 != UTF8_END && c1 != UTF8_ERROR &&
231                 c2 != UTF8_END && c2 != UTF8_ERROR)
232             {
233                 if (c1 != c2)
234                 {
235                     res = false;
236                     break;
237                 }
238             }
239             else
240             {
241                 res = false;
242                 break;
243             }
244 
245             c1 = dec.decodeNext();
246             c2 = dec2.decodeNext();
247         }
248         while(c2 != UTF8_END && c2 != UTF8_ERROR);
249 
250         dec.index = oldIndex;
251 
252         return res;
253     }
254 
255     bool isWhitespace(dchar c)
256     {
257         foreach(w; std.ascii.whitespace)
258         {
259             if (c == w)
260             {
261                 return true;
262             }
263         }
264         return false;
265     }
266 
267     bool isNewline(dchar c)
268     {
269         return (c == '\n' || c == '\r');
270     }
271 
272     string consumeDelimiter()
273     {
274         size_t bestLen = 0;
275         string bestStr = "";
276         foreach(d; delims)
277         {
278             if (forwardCompare(d))
279             {
280                 if (d.length > bestLen)
281                 {
282                     bestLen = d.length;
283                     bestStr = input[pos..pos+d.length];
284                 }
285             }
286         }
287         return bestStr;
288     }
289 
290     // Range interface
291 
292     private:
293 
294     string _front;
295 
296     public:
297 
298     bool empty()
299     {
300         return _front.length == 0;
301     }
302 
303     string front()
304     {
305         return _front;
306     }
307 
308     void popFront()
309     {
310         _front = getLexeme();
311     }
312 
313     string moveFront()
314     {
315         _front = getLexeme();
316         return _front;
317     }
318 
319     int opApply(scope int delegate(string) dg)
320     {
321         int result = 0;
322 
323         while(true)
324         {
325             string lexeme = getLexeme();
326 
327             if (!lexeme.length)
328                 break;
329 
330             result = dg(lexeme);
331             if (result)
332                 break;
333         }
334 
335         return result;
336     }
337 
338     int opApply(scope int delegate(size_t, string) dg)
339     {
340         int result = 0;
341         size_t i = 0;
342 
343         while(true)
344         {
345             string lexeme = getLexeme();
346 
347             if (!lexeme.length)
348                 break;
349 
350             result = dg(i, lexeme);
351             if (result)
352                 break;
353 
354             i++;
355         }
356 
357         return result;
358     }
359 }
360 
361 ///
362 unittest
363 {
364     import std.array: array;
365     import std.range: iota;
366     
367     string[] delims = ["(", ")", ";", " ", "{", "}", ".", "\n", "\r", "=", "++", "<"];
368     auto input = "for (int i=0; i<arr.length; ++i)\r\n{doThing();}\n";
369     auto lexer = new Lexer(input, delims);
370 
371     string[] arr;
372     while(true) {
373         auto lexeme = lexer.getLexeme();
374         if(lexeme.length == 0) {
375             break;
376         }
377         arr ~= lexeme;
378     }
379     auto reference = [
380         "for", " ", "(", "int", " ", "i", "=", "0", ";", " ", "i", "<",
381         "arr", ".", "length", ";", " ", "++", "i", ")", "\n", "{", "doThing",
382         "(", ")", ";", "}", "\n" ];
383     assert(arr == reference);
384 
385     lexer = new Lexer(input, delims);
386     arr = array(lexer);
387     assert(arr == reference);
388 
389     input = "";
390     lexer = new Lexer(input, delims);
391     assert(lexer.getLexeme().length == 0);
392 }