/*
Copyright (c) 2015-2021 Timur Gafarov

Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:

The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/

/**
GC-free parser for a subset of XML.

Description:
Has the following limitations:
- supports only ASCII and UTF-8 encodings
- doesn't support DOCTYPE and some other special tags

Copyright: Timur Gafarov 2015-2021.
License: $(LINK2 boost.org/LICENSE_1_0.txt, Boost License 1.0).
Authors: Timur Gafarov
*/
module dlib.serialization.xml;

import std.stdio;
import std.conv;
import dlib.core.memory;
import dlib.core.compound;
import dlib.container.array;
import dlib.container.dict;
import dlib.container.stack;
import dlib.text.lexer;
import dlib.text.utils;

/// Delimiters handed to the lexer; every entry is returned as a separate token.
string[] xmlDelims =
[
    "<", ">", "</", "/>", "=", "<?", "?>", "\"",
    "<!--", "-->", "<![CDATA[", "]]>",
    "\"", "'", " ", "\n",
];

/// Kind of token the parser state machine expects next.
enum XmlToken
{
    TagOpen,
    TagClose,
    TagName,
    Assignment,
    Quote,
    PropValue
}

/// Shared empty string, used as the name of anonymous (text) nodes.
string emptyStr;

/**
 * Encodes ch as UTF-8 and returns s with the encoded bytes appended.
 *
 * The result is produced by catStr; s itself is not modified or freed here.
 * NOTE(review): catStr presumably allocates a new unmanaged string that the
 * caller must release with Delete - confirm against dlib.text.utils.
 */
string appendChar(string s, dchar ch)
{
    // Lead-byte markers indexed by the length of the encoded sequence.
    char[7] firstByteMark = [0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC];

    char[4] chars;
    uint byteMask = 0xBF;
    uint byteMark = 0x80;

    uint bytesToWrite = 0;
    if (ch < 0x80) bytesToWrite = 1;
    else if (ch < 0x800) bytesToWrite = 2;
    else if (ch < 0x10000) bytesToWrite = 3;
    else bytesToWrite = 4;

    // The sequence is written back to front: continuation bytes first,
    // six payload bits each, then the lead byte.
    char* target = chars.ptr;
    target += bytesToWrite;
    switch (bytesToWrite)
    {
        case 4: *--target = cast(char)((ch | byteMark) & byteMask); ch >>= 6; goto case 3;
        case 3: *--target = cast(char)((ch | byteMark) & byteMask); ch >>= 6; goto case 2;
        case 2: *--target = cast(char)((ch | byteMark) & byteMask); ch >>= 6; goto case 1;
        case 1: *--target = cast(char)(ch | firstByteMark[bytesToWrite]); break;
        default: break;
    }

    return catStr(s, cast(string)chars[0..bytesToWrite]);
}

/**
 * Node of the document tree.
 *
 * Elements carry a tag name and properties (attributes); text fragments are
 * stored as child nodes with an empty name and a non-empty text.
 * The destructor releases name, text, all property keys and values, and all
 * child nodes, so those strings must have been allocated in a way that is
 * compatible with Delete.
 */
class XmlNode
{
    XmlNode parent;
    Array!XmlNode children;
    string name;
    string text;
    Dict!(string, string) properties;

    /// Creates a node and, when parent is given, registers it as a child of parent.
    this(string name, XmlNode parent = null)
    {
        this.name = name;
        this.parent = parent;
        if (parent !is null)
        {
            parent.addChild(this);
        }
        this.properties = New!(Dict!(string, string));
    }

    ~this()
    {
        if (text.length)
            Delete(text);
        if (name.length)
            Delete(name);
        foreach(k, v; properties)
        {
            Delete(k);
            Delete(v);
        }
        Delete(properties);
        foreach(c; children)
        {
            Delete(c);
        }
        children.free();
    }

    /// Returns the first direct child whose name equals tag, or null if there is none.
    XmlNode firstChildByTag(string tag)
    {
        XmlNode res = null;
        foreach(c; children)
        {
            if (c.name == tag)
            {
                res = c;
                break;
            }
        }

        return res;
    }

    /// Appends node to the list of children. Does not touch node.parent.
    void addChild(XmlNode node)
    {
        children.append(node);
    }

    /// Appends the UTF-8 encoding of c to text, releasing the previous text buffer.
    void appendText(dchar c)
    {
        string newText = appendChar(text, c);
        if (text.length)
            Delete(text);
        text = newText;
    }

    /**
     * Returns the text of this node followed by the text of all descendants,
     * in document order. The result is a fresh copy made with immutableCopy;
     * callers in this module release it with Delete when it is non-empty.
     */
    string getTextUnmanaged()
    {
        Array!char res;
        res.append(text);
        foreach(n; children)
        {
            string t = n.getTextUnmanaged();
            if (t.length)
            {
                res.append(t);
                Delete(t);
            }
        }
        string output = immutableCopy(cast(string)res.data);
        res.free();
        return output;
    }

    /// Writes every property to stdout as "key = value", prefixed with indent.
    void printProperties(dstring indent = "")
    {
        if (properties.length)
        {
            foreach(k, v; properties)
                writeln(indent, k, " = ", v);
        }
    }

    // Warning! Causes GC allocation!
    /// Recursively dumps properties, child tags and their text to stdout.
    void print(dstring indent = "")
    {
        printProperties(indent);

        foreach(n; children)
        {
            auto nm = n.name;
            if (nm.length)
                writeln(indent, "tag: ", nm);
            else
                writeln(indent, "tag: <anonymous>");

            string txt = n.getTextUnmanaged();
            if (txt.length)
            {
                writeln(indent, "text: ", txt);
                Delete(txt);
            }

            n.print(indent ~ " ");
        }
    }
}

/// Returns the value of property name of node, or an empty string when it is absent.
string prop(XmlNode node, string name)
{
    if (name in node.properties)
        return node.properties[name];
    else
        return "";
}

/**
 * Parsed document: an anonymous root node that holds the top-level content
 * and, when the input has one, the "xml" prolog node.
 * Both nodes are released by the destructor.
 */
class XmlDocument
{
    XmlNode prolog = null;
    XmlNode root;

    this()
    {
        root = New!XmlNode(emptyStr);
    }

    ~this()
    {
        Delete(root);
        if (prolog)
            Delete(prolog);
    }
}

/**
 * Parses text into an XmlDocument.
 *
 * Returns: the document, which the caller must release with Delete, or null
 * when a parse error occurred. Parse errors are reported on stdout.
 *
 * Numeric character references ("&#NN;" and "&#xHH;") are decoded only when
 * they form a token of their own; a malformed or out-of-range reference is
 * a parse error.
 */
XmlDocument parseXMLUnmanaged(string text)
{
    XmlDocument doc = New!XmlDocument();
    Lexer lex = New!Lexer(text, xmlDelims);
    Stack!XmlNode nodeStack;

    nodeStack.push(doc.root);

    XmlToken expect = XmlToken.TagOpen;

    bool tagOpening = false;
    bool xmlPrologDeclaration = false;
    bool comment = false;
    bool cdata = false;
    bool lastCharWasWhitespace = false;

    string tmpPropName;
    Array!char tmpPropValue;

    bool finished = false;
    bool failed = false;

    // Reports a parse error and stops the main loop.
    void error(string message, string t)
    {
        writefln("XML parse error: %s \"%s\"", message, t);
        failed = true;
        finished = true;
    }

    // Adds an anonymous child holding a copy of s to the node on top of the stack.
    void addTextNode(string s)
    {
        XmlNode node = New!XmlNode(emptyStr, nodeStack.top);
        node.text = immutableCopy(s);
    }

    string token;
    while(!finished)
    {
        token = lex.getLexeme();

        if (!token.length)
            break;

        // Inside a comment everything is skipped until "-->";
        // inside CDATA every token except "]]>" is stored verbatim as text.
        switch(token)
        {
            case "<![CDATA[":
                if (comment) break;
                cdata = true;
                break;

            case "]]>":
                if (comment) break;
                if (cdata)
                    cdata = false;
                else
                    error("Unexpected token ", token);
                break;

            case "<!--":
                if (cdata)
                    addTextNode(token);
                else
                    comment = true;
                break;

            case "-->":
                if (cdata)
                    addTextNode(token);
                else if (comment)
                    comment = false;
                else
                    error("Unexpected token ", token);
                break;

            case "<":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagOpen)
                {
                    expect = XmlToken.TagName;
                    tagOpening = true;
                }
                else
                    error("Unexpected token ", token);
                break;

            case ">":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagClose && !xmlPrologDeclaration)
                    expect = XmlToken.TagOpen;
                else
                    error("Unexpected token ", token);
                break;

            case "</":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagOpen)
                    expect = XmlToken.TagName;
                break;

            case "/>":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagClose && !xmlPrologDeclaration)
                {
                    // Self-closing tag: the element is complete.
                    expect = XmlToken.TagOpen;
                    nodeStack.pop();
                }
                else
                    error("Unexpected token ", token);
                break;

            case "<?":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagOpen)
                {
                    expect = XmlToken.TagName;
                    xmlPrologDeclaration = true;
                    tagOpening = true;
                }
                break;

            case "?>":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.TagClose && xmlPrologDeclaration)
                {
                    expect = XmlToken.TagOpen;
                    xmlPrologDeclaration = false;
                    nodeStack.pop();
                }
                break;

            case "=":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.Assignment)
                    expect = XmlToken.Quote;
                else if (expect == XmlToken.TagOpen)
                    addTextNode(token); // a plain '=' in element text
                else
                    error("Unexpected token ", token);
                break;

            case "\"":
                if (comment) break;
                if (cdata)
                    addTextNode(token);
                else if (expect == XmlToken.Quote)
                    expect = XmlToken.PropValue;
                else if (expect == XmlToken.PropValue)
                {
                    // Closing quote: commit the accumulated attribute.
                    expect = XmlToken.TagClose;
                    nodeStack.top.properties[immutableCopy(tmpPropName)] = immutableCopy(cast(string)tmpPropValue.data);
                    tmpPropValue.free();
                }
                else
                    error("Unexpected token ", token);
                break;

            default:
                if (comment) break;
                if (cdata)
                {
                    addTextNode(token);
                    break;
                }

                bool isWhitespace = (token == " " || token == "\n");

                if (!isWhitespace)
                    lastCharWasWhitespace = false;

                if (isWhitespace)
                {
                    if (expect == XmlToken.TagOpen)
                    {
                        // Collapse runs of whitespace in element content
                        // and drop leading whitespace.
                        if (nodeStack.top.children.length)
                        {
                            if (nodeStack.top.children.data[$-1].text == " ")
                                break;
                        }
                        else if (!nodeStack.top.text.length)
                            break;
                        else if (nodeStack.top.text[$-1] == ' ')
                            break;

                        addTextNode(" ");
                    }
                    else if (expect == XmlToken.PropValue)
                    {
                        // Collapse runs of whitespace inside attribute values.
                        if (!lastCharWasWhitespace)
                        {
                            tmpPropValue.append(' ');
                            lastCharWasWhitespace = true;
                        }
                    }
                }
                else if (expect == XmlToken.TagName)
                {
                    expect = XmlToken.TagClose;
                    if (xmlPrologDeclaration)
                    {
                        if (tagOpening)
                        {
                            if (doc.prolog is null)
                            {
                                if (token == "xml")
                                {
                                    doc.prolog = New!XmlNode(immutableCopy(token));
                                    nodeStack.push(doc.prolog);
                                    tagOpening = false;
                                }
                                else
                                    error("Illegal XML prolog", emptyStr);
                            }
                            else
                                error("More than one XML prolog is not allowed", emptyStr);
                        }
                        else
                        {
                            nodeStack.pop();
                        }
                    }
                    else if (tagOpening)
                    {
                        XmlNode node = New!XmlNode(immutableCopy(token), nodeStack.top);
                        nodeStack.push(node);
                        tagOpening = false;
                    }
                    else
                    {
                        // Closing tag: must match the element being closed.
                        if (token == nodeStack.top.name)
                            nodeStack.pop();
                        else
                            error("Mismatched tag", emptyStr);
                    }
                }
                else if (expect == XmlToken.TagOpen)
                {
                    // Element text. The length is checked before indexing so
                    // that a lone "&" or "&#" is kept as literal text.
                    if (token.length > 2 && token[0] == '&' && token[1] == '#')
                    {
                        int code;
                        if (token[2] == 'x')
                            code = hexCharacterCode(token[3..$]);
                        else
                            code = decCharacterCode(token[2..$]);

                        if (code < 0)
                            error("Failed to parse character reference ", token);
                        else
                        {
                            XmlNode node = New!XmlNode(emptyStr, nodeStack.top);
                            node.appendText(cast(dchar)code);
                        }
                    }
                    else
                        addTextNode(token);
                }
                else if (expect == XmlToken.TagClose)
                {
                    // A word inside a tag after its name is an attribute name.
                    expect = XmlToken.Assignment;

                    if (tmpPropName.length)
                        Delete(tmpPropName);
                    tmpPropName = immutableCopy(token);
                }
                else if (expect == XmlToken.PropValue)
                {
                    tmpPropValue.append(token);
                }
                else
                    error("Unexpected token ", token);
                break;
        }
    }

    if (tmpPropName.length)
        Delete(tmpPropName);
    tmpPropValue.free();

    nodeStack.free();
    Delete(lex);

    if (failed)
    {
        Delete(doc);
        doc = null;
    }

    return doc;
}

/**
 * Parses the hexadecimal digits of a character reference.
 *
 * Parsing stops at the first ';' or at the end of input.
 * Returns: the code point, or -1 when input contains a character that is
 * neither a hex digit nor ';', contains no digits at all, or denotes a value
 * above 0x10FFFF.
 */
int hexCharacterCode(string input)
{
    enum int maxCodePoint = 0x10FFFF;

    int res = 0;
    bool hasDigits = false;
    foreach(c; input)
    {
        int digit;
        switch(c)
        {
            case '0': .. case '9':
                digit = c - '0';
                break;
            case 'a': .. case 'f':
                digit = c - 'a' + 0xA;
                break;
            case 'A': .. case 'F':
                digit = c - 'A' + 0xA;
                break;
            case ';':
                return hasDigits ? res : -1;
            default:
                return -1;
        }

        // res never exceeds maxCodePoint before this line, so the
        // multiplication cannot overflow an int.
        res = res * 0x10 + digit;
        if (res > maxCodePoint)
            return -1;
        hasDigits = true;
    }
    return hasDigits ? res : -1;
}

/**
 * Parses the decimal digits of a character reference.
 *
 * Parsing stops at the first ';' or at the end of input.
 * Returns: the code point, or -1 when input contains a character that is
 * neither a decimal digit nor ';', contains no digits at all, or denotes a
 * value above 0x10FFFF.
 */
private int decCharacterCode(string input)
{
    enum int maxCodePoint = 0x10FFFF;

    int res = 0;
    bool hasDigits = false;
    foreach(c; input)
    {
        if (c == ';')
            break;
        if (c < '0' || c > '9')
            return -1;

        res = res * 10 + (c - '0');
        if (res > maxCodePoint)
            return -1;
        hasDigits = true;
    }
    return hasDigits ? res : -1;
}