1 /*
2 Copyright (c) 2015-2021 Timur Gafarov
3 
4 Boost Software License - Version 1.0 - August 17th, 2003
5 
6 Permission is hereby granted, free of charge, to any person or organization
7 obtaining a copy of the software and accompanying documentation covered by
8 this license (the "Software") to use, reproduce, display, distribute,
9 execute, and transmit the Software, and to prepare derivative works of the
10 Software, and to permit third-parties to whom the Software is furnished to
11 do so, all subject to the following:
12 
13 The copyright notices in the Software and this entire statement, including
14 the above license grant, this restriction and the following disclaimer,
15 must be included in all copies of the Software, in whole or in part, and
16 all derivative works of the Software, unless such copies or derivative
17 works are solely in the form of machine-executable object code generated by
18 a source language processor.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
23 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
24 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
25 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.
27 */
28 
29 /**
30  GC-free parser for a subset of XML.
31 
32  Description:
33  Has the following limitations:
34  - supports only ASCII and UTF-8 encodings
35  - doesn't support DOCTYPE and some other special tags
36 
37  Copyright: Timur Gafarov 2015-2021.
38  License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0).
39  Authors: Timur Gafarov
40  */
41 module dlib.serialization.xml;
42 
43 import std.stdio;
44 import std.conv;
45 import dlib.core.memory;
46 import dlib.core.compound;
47 import dlib.container.array;
48 import dlib.container.dict;
49 import dlib.container.stack;
50 import dlib.text.lexer;
51 import dlib.text.utils;
52 
/// Delimiter lexemes handed to the Lexer created in parseXMLUnmanaged.
/// The list contains tag brackets, comment/CDATA markers, quotes, the
/// assignment sign and the two whitespace characters (space and newline)
/// that the parser handles explicitly.
/// Note: the double quote is listed twice.
string[] xmlDelims =
[
    "<", ">", "</", "/>", "=", "<?", "?>", "\"",
    "<!--", "-->", "<![CDATA[", "]]>",
    "\"", "'", " ", "\n",
];
59 
/// Kind of token that parseXMLUnmanaged expects to see next
/// (the parser keeps one value of this type in its `expect` state variable).
enum XmlToken
{
    /// Between tags: text content or one of "<", "</", "<?" is accepted.
    TagOpen,
    /// Inside a tag after its name or after an attribute value:
    /// ">", "/>", "?>" or the next attribute name is accepted.
    TagClose,
    /// Right after "<", "</" or "<?": a tag name is expected.
    TagName,
    /// Right after an attribute name: "=" is expected.
    Assignment,
    /// Right after "=": the opening double quote is expected.
    Quote,
    /// Inside a quoted attribute value, until the closing double quote.
    PropValue
}
69 
/// Empty string used in this module as the name of anonymous nodes
/// (the document root and text nodes) and as the empty detail argument
/// of some parse error messages.
string emptyStr;
71 
/**
 * Encodes a code point as UTF-8 and concatenates it to a string.
 *
 * Code points that cannot be represented in well-formed UTF-8
 * (surrogates U+D800..U+DFFF and values above U+10FFFF) are replaced
 * with U+FFFD REPLACEMENT CHARACTER instead of being emitted as
 * malformed byte sequences.
 *
 * Params:
 *   s  = string to append to (left untouched)
 *   ch = code point to encode
 *
 * Returns:
 *   The result of `catStr(s, encoded)`, where `encoded` is the 1 to 4 byte
 *   UTF-8 sequence for `ch`. XmlNode.appendText in this module treats the
 *   result as a separate allocation that must be released with `Delete`.
 */
string appendChar(string s, dchar ch)
{
    // Lead-byte marker bits, indexed by the length of the sequence.
    static immutable ubyte[5] firstByteMark = [0x00, 0x00, 0xC0, 0xE0, 0xF0];

    // Surrogates and out-of-range values have no valid UTF-8 encoding.
    if (ch > 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF))
        ch = '\uFFFD';

    uint code = ch;

    size_t bytesToWrite;
    if (code < 0x80) bytesToWrite = 1;
    else if (code < 0x800) bytesToWrite = 2;
    else if (code < 0x10000) bytesToWrite = 3;
    else bytesToWrite = 4;

    char[4] chars;

    // Fill continuation bytes from the end: each carries the low 6 bits
    // of the remaining value, tagged with the 10xxxxxx marker.
    size_t pos = bytesToWrite;
    while (pos > 1)
    {
        --pos;
        chars[pos] = cast(char)((code | 0x80) & 0xBF);
        code >>= 6;
    }

    // Whatever is left goes into the lead byte together with the length marker.
    chars[0] = cast(char)(code | firstByteMark[bytesToWrite]);

    return catStr(s, cast(string)chars[0..bytesToWrite]);
}
99 
/**
 * Node of the XML tree built by parseXMLUnmanaged.
 *
 * A node has a tag name (empty for anonymous/text nodes), its own text,
 * a dictionary of attributes and a list of child nodes. The destructor
 * calls `Delete` on `text`, `name`, every attribute key and value, the
 * attribute dictionary and every child, so these are expected to be
 * allocations that `Delete` can release.
 */
class XmlNode
{
    XmlNode parent;
    Array!XmlNode children;
    string name;
    string text;
    Dict!(string, string) properties;

    /**
     * Creates a node and, when `parent` is given, registers the new node
     * in the parent's list of children.
     */
    this(string name, XmlNode parent = null)
    {
        this.properties = New!(Dict!(string, string));
        this.name = name;
        this.parent = parent;
        if (parent !is null)
            parent.addChild(this);
    }

    /// Releases the node's strings, its attributes and, recursively, all children.
    ~this()
    {
        if (text.length)
            Delete(text);
        if (name.length)
            Delete(name);

        foreach(key, value; properties)
        {
            Delete(key);
            Delete(value);
        }
        Delete(properties);

        foreach(child; children)
            Delete(child);
        children.free();
    }

    /// Returns the first direct child whose name equals `tag`, or null if there is none.
    XmlNode firstChildByTag(string tag)
    {
        foreach(child; children)
        {
            if (child.name == tag)
                return child;
        }

        return null;
    }

    /// Appends `node` to the list of children (does not touch `node.parent`).
    void addChild(XmlNode node)
    {
        children.append(node);
    }

    /// Appends the UTF-8 encoding of `c` to `text`, releasing the previous text buffer.
    void appendText(dchar c)
    {
        string extended = appendChar(text, c);
        if (text.length)
            Delete(text);
        text = extended;
    }

    /**
     * Concatenates this node's text with the text of all descendants
     * (depth-first, in child order).
     *
     * Returns: a copy made with `immutableCopy`; print() in this class
     * releases a non-empty result with `Delete`.
     */
    string getTextUnmanaged()
    {
        Array!char buffer;
        buffer.append(text);

        foreach(child; children)
        {
            string childText = child.getTextUnmanaged();
            if (!childText.length)
                continue;

            buffer.append(childText);
            Delete(childText);
        }

        string result = immutableCopy(cast(string)buffer.data);
        buffer.free();
        return result;
    }

    /// Writes every attribute as "key = value" to stdout, one per line, prefixed with `indent`.
    void printProperties(dstring indent = "")
    {
        if (!properties.length)
            return;

        foreach(key, value; properties)
            writeln(indent, key, " = ", value);
    }

    /// Recursively dumps attributes, child tags and their text to stdout.
    // Warning! Causes GC allocation!
    void print(dstring indent = "")
    {
        printProperties(indent);

        foreach(child; children)
        {
            string tagName = child.name;
            writeln(indent, "tag: ", tagName.length ? tagName : "<anonymous>");

            string content = child.getTextUnmanaged();
            if (content.length)
            {
                writeln(indent, "text: ", content);
                Delete(content);
            }

            // Each nesting level is indented by one extra space.
            child.print(indent ~ " ");
        }
    }
}
217 
/// Returns the value of attribute `name` of `node`,
/// or an empty string when the node has no such attribute.
string prop(XmlNode node, string name)
{
    return (name in node.properties) ? node.properties[name] : "";
}
225 
/**
 * Parsed XML document: an anonymous root node that holds the top-level
 * content and an optional prolog node, which parseXMLUnmanaged creates
 * for the `<?xml ... ?>` declaration.
 */
class XmlDocument
{
    XmlNode prolog = null;
    XmlNode root;

    /// Creates a document with an empty, unnamed root node.
    this()
    {
        root = New!XmlNode(emptyStr);
    }

    /// Releases the root node and, if one was created, the prolog node.
    ~this()
    {
        Delete(root);
        if (prolog !is null)
            Delete(prolog);
    }
}
243 
/*
 * Parses the digits of a decimal character reference (the part after "&#").
 * Parsing stops at ';' or at the end of input. Returns the code point,
 * or -1 if there are no digits, a non-digit character is met,
 * or the value exceeds 0x10FFFF. Never throws.
 */
private int decimalCharacterCode(string input)
{
    int res = 0;
    bool hasDigits = false;
    foreach(c; input)
    {
        if (c == ';')
            break;
        if (c < '0' || c > '9')
            return -1;
        res = res * 10 + (c - '0');
        // Checking after every digit also keeps `res` far from int overflow.
        if (res > 0x10FFFF)
            return -1;
        hasDigits = true;
    }
    return hasDigits ? res : -1;
}

/**
 * Parses an XML document from a string.
 *
 * The input is split into lexemes with a Lexer configured with xmlDelims,
 * and a small state machine (see XmlToken) builds a tree of XmlNode objects.
 * Text content, CDATA content and character references ("&#NN;" / "&#xHH;")
 * become anonymous child nodes; runs of whitespace between tags are
 * collapsed to a single " " node.
 *
 * Params:
 *   text = XML source
 *
 * Returns:
 *   A document allocated with `New` (release it with `Delete`),
 *   or null if a parse error was met. Parse errors are reported
 *   to stdout via writefln.
 */
XmlDocument parseXMLUnmanaged(string text)
{
    XmlDocument doc = New!XmlDocument();
    Lexer lex = New!Lexer(text, xmlDelims);
    Stack!XmlNode nodeStack;

    nodeStack.push(doc.root);

    XmlToken expect = XmlToken.TagOpen;

    bool tagOpening = false;            // the tag name being read belongs to an opening tag
    bool xmlPrologDeclaration = false;  // inside <? ... ?>
    bool comment = false;               // inside <!-- ... -->
    bool cdata = false;                 // inside <![CDATA[ ... ]]>
    bool lastCharWasWhitespace = false; // used to collapse whitespace in attribute values

    string tmpPropName;
    Array!char tmpPropValue;

    bool finished = false;
    bool failed = false;

    // Reports a parse error and marks the whole parse as failed.
    void error(string message, string t)
    {
        writefln("XML parse error: %s \"%s\"", message, t);
        failed = true;
    }

    // Reports an unexpected token and stops the main loop.
    void unexpected(string t)
    {
        error("Unexpected token ", t);
        finished = true;
    }

    // Attaches an anonymous node holding a copy of `content` to the current node.
    void addTextNode(string content)
    {
        XmlNode node = New!XmlNode(emptyStr, nodeStack.top);
        node.text = immutableCopy(content);
    }

    string token;
    while(!finished)
    {
        token = lex.getLexeme();

        if (!token.length)
            break;

        switch(token)
        {
            case "<![CDATA[":
                if (comment) break;
                cdata = true;
                break;

            case "]]>":
                if (comment) break;
                if (cdata)
                    cdata = false;
                else
                    unexpected(token);
                break;

            case "<!--":
                // Inside CDATA, markup is plain text.
                if (cdata)
                    addTextNode(token);
                else
                    comment = true;
                break;

            case "-->":
                if (cdata)
                    addTextNode(token);
                else if (comment)
                    comment = false;
                else
                    unexpected(token);
                break;

            case "<":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagOpen)
                {
                    expect = XmlToken.TagName;
                    tagOpening = true;
                }
                else
                    unexpected(token);
                break;

            case ">":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagClose && !xmlPrologDeclaration)
                    expect = XmlToken.TagOpen;
                else
                    unexpected(token);
                break;

            case "</":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagOpen)
                    expect = XmlToken.TagName;
                break;

            case "/>":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagClose && !xmlPrologDeclaration)
                {
                    // Self-closing tag: the node is complete.
                    expect = XmlToken.TagOpen;
                    nodeStack.pop();
                }
                else
                    unexpected(token);
                break;

            case "<?":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagOpen)
                {
                    expect = XmlToken.TagName;
                    xmlPrologDeclaration = true;
                    tagOpening = true;
                }
                break;

            case "?>":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagClose && xmlPrologDeclaration)
                {
                    expect = XmlToken.TagOpen;
                    xmlPrologDeclaration = false;
                    nodeStack.pop();
                }
                break;

            case "=":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.Assignment)
                    expect = XmlToken.Quote;
                else if (expect == XmlToken.TagOpen)
                    addTextNode(token); // "=" in text content
                else
                    unexpected(token);
                break;

            case "\"":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.Quote)
                    expect = XmlToken.PropValue;
                else if (expect == XmlToken.PropValue)
                {
                    // Closing quote: store the accumulated attribute.
                    expect = XmlToken.TagClose;
                    nodeStack.top.properties[immutableCopy(tmpPropName)] = immutableCopy(cast(string)tmpPropValue.data);
                    tmpPropValue.free();
                }
                else if (expect == XmlToken.TagOpen)
                    addTextNode(token); // a double quote is legal in text content
                else
                    unexpected(token);
                break;

            default:
                if (comment) break;
                if (cdata)
                {
                    addTextNode(token);
                    break;
                }

                if (token != " " && token != "\n")
                    lastCharWasWhitespace = false;

                if (token == " " || token == "\n")
                {
                    if (expect == XmlToken.TagOpen)
                    {
                        // Collapse whitespace: skip it at the very start of
                        // an element and right after another whitespace.
                        if (nodeStack.top.children.length)
                        {
                            if (nodeStack.top.children.data[$-1].text == " ")
                                break;
                        }
                        else if (!nodeStack.top.text.length)
                            break;
                        else if (nodeStack.top.text[$-1] == ' ')
                            break;

                        addTextNode(" ");
                    }
                    else if (expect == XmlToken.PropValue)
                    {
                        if (!lastCharWasWhitespace)
                        {
                            tmpPropValue.append(' ');
                            lastCharWasWhitespace = true;
                        }
                    }
                }
                else if (expect == XmlToken.TagName)
                {
                    expect = XmlToken.TagClose;
                    if (xmlPrologDeclaration)
                    {
                        if (tagOpening)
                        {
                            if (doc.prolog is null)
                            {
                                if (token == "xml")
                                {
                                    doc.prolog = New!XmlNode(immutableCopy(token));
                                    nodeStack.push(doc.prolog);
                                    tagOpening = false;
                                }
                                else
                                {
                                    error("Illegal XML prolog", emptyStr);
                                    finished = true;
                                }
                            }
                            else
                            {
                                error("More than one XML prolog is not allowed", emptyStr);
                                finished = true;
                            }
                        }
                        else
                        {
                            nodeStack.pop();
                        }
                    }
                    else if (tagOpening)
                    {
                        XmlNode node = New!XmlNode(immutableCopy(token), nodeStack.top);
                        nodeStack.push(node);
                        tagOpening = false;
                    }
                    else
                    {
                        // Closing tag: must match the element being closed.
                        if (token == nodeStack.top.name)
                            nodeStack.pop();
                        else
                        {
                            error("Mismatched tag", emptyStr);
                            finished = true;
                        }
                    }
                }
                else if (expect == XmlToken.TagOpen)
                {
                    XmlNode node = New!XmlNode(emptyStr, nodeStack.top);

                    // The length is checked first so that a lone "&"
                    // is never indexed out of bounds.
                    if (token.length > 2 && token[0] == '&' && token[1] == '#')
                    {
                        // Numeric character reference: "&#NN;" or "&#xHH;".
                        dchar c = '?';
                        int code;
                        if (token[2] == 'x')
                            code = hexCharacterCode(token[3..$]);
                        else
                            code = decimalCharacterCode(token[2..$]);

                        if (code < 0 || code > 0x10FFFF)
                        {
                            error("Failed to parse character reference ", token);
                            finished = true;
                        }
                        else
                            c = cast(dchar)code;

                        node.appendText(c);
                    }
                    else
                        node.text = immutableCopy(token);
                }
                else if (expect == XmlToken.TagClose)
                {
                    // A word inside a tag after its name is an attribute name.
                    expect = XmlToken.Assignment;

                    if (tmpPropName.length)
                        Delete(tmpPropName);
                    tmpPropName = immutableCopy(token);
                }
                else if (expect == XmlToken.PropValue)
                {
                    tmpPropValue.append(token);
                }
                else
                    unexpected(token);
                break;
        }
    }

    if (tmpPropName.length)
        Delete(tmpPropName);
    tmpPropValue.free();

    nodeStack.free();
    Delete(lex);

    if (failed)
    {
        Delete(doc);
        doc = null;
    }

    return doc;
}
625 
/**
 * Parses the hexadecimal digits of a character reference
 * (the part after "&#x").
 *
 * Parsing stops at the first ';' or at the end of input.
 *
 * Params:
 *   input = hexadecimal digits, optionally followed by ';'
 *
 * Returns:
 *   The code point, or -1 if the input contains a character that is
 *   neither a hex digit nor ';', contains no digits at all, or encodes
 *   a value above 0x10FFFF (the largest Unicode code point).
 */
int hexCharacterCode(string input)
{
    enum int maxCodePoint = 0x10FFFF;

    int res = 0;
    bool hasDigits = false;

    foreach(c; input)
    {
        int digit;
        switch(c)
        {
            case '0': .. case '9':
                digit = c - '0';
                break;
            case 'a': .. case 'f':
                digit = c - 'a' + 0xA;
                break;
            case 'A': .. case 'F':
                digit = c - 'A' + 0xA;
                break;
            case ';':
                return hasDigits ? res : -1;
            default:
                return -1;
        }

        res = res * 0x10 + digit;

        // Rejecting oversized values after every digit also guarantees
        // that `res` can never overflow on long inputs.
        if (res > maxCodePoint)
            return -1;

        hasDigits = true;
    }

    return hasDigits ? res : -1;
}