/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <ae@cy.md>
 *   Simon Arlott
 */

module ae.utils.xml.lite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.xml.common;
import ae.utils.xml.entities;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed
private struct StringStream
{
	string s;        /// The input being parsed.
	size_t position; /// Offset of the next character to be read.

	@disable this();
	@disable this(this);
	this(string s)
	{
		// Dirty precaution: keep a few terminator characters in memory right
		// after the visible slice. All readers in this module are explicitly
		// bounds-checked, so this is only a secondary safety net.
		enum ditch = "'\">\0\0\0\0\0";
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Consume and return the next character.
	/// Throws: `XmlParseException` if the end of the input was reached.
	char read()
	{
		if (position >= s.length)
			throw new XmlParseException("Unexpected end of input");
		return s[position++];
	}

	/// Total length of the input.
	@property size_t size() { return s.length; }
}

// ************************************************************************

/// The type of an `XmlNode`.
enum XmlNodeType
{
	None    , /// Initial value. Never created during parsing.
	Root    , /// The root node. Contains top-level nodes as children.
	Node    , /// XML tag.
	Comment , /// XML comment.
	Meta    , /// XML processing instruction.
	DocType , /// XML doctype declaration.
	CData   , /// CDATA node.
	Text    , /// Text node.
	Raw     , /// Never created during parsing. Programs can put raw XML fragments in `Raw` nodes to emit it as-is.
}

/// Type used to hold a tag node's attributes.
alias XmlAttributes = OrderedMap!(string, string);

/// An XML node.
class XmlNode
{
	string tag;               /// The tag name, or the contents for text / comment / CDATA nodes.
	XmlAttributes attributes; /// Tag attributes.
	XmlNode parent;           /// Parent node.
	XmlNode[] children;       /// Children nodes.
	XmlNodeType type;         /// Node type.
	/// Start and end offset within the input.
	ulong startPos, endPos;

	private this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }

	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Create a new node.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set an attribute with the given value.
	/// Returns: this node, to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Add a child node, making this node its parent.
	/// Returns: this node (not the child), to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Return XML string.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Return pretty-printed XML string (with indentation).
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Write to an `XmlWriter`.
	/// A node of type `None` cannot be written (assertion failure).
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// A tag holding a single text node is written with the
					// formatter disabled, and re-enabled afterwards.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				output.text(tag);
				return;
			case XmlNodeType.Raw:
				output.startLine();
				output.output.put(tag);
				output.newLine();
				return;
		}
	}

	/// Attempts to retrieve the text contents of this node.
	/// `<br>` tags are converted to newlines.
	/// Comment, processing instruction and doctype nodes yield `null`;
	/// `None` and `Raw` nodes are not supported (assertion failure).
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");
		}
	}

	/// Returns the first immediate child which is a tag and has the tag name `tag`.
	/// Returns `null` if there is no such child.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all immediate children which are a tag and have the tag name `tag`.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws an exception if no such node is found.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Like `findChildren[index]`, but throws an
	/// exception if there are not enough such nodes.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the immediate child with the given index.
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Returns the number of children nodes.
	final @property size_t length() { return children.length; }
	alias opDollar = length; /// ditto

	/// Iterates over immediate children.
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// Index with size_t (not int) so that the comparison against
		// children.length is not a mixed signed/unsigned one.
		// The length is re-read on every iteration, so children appended
		// by the delegate are visited as well.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Creates a deep copy of this node.
	/// The type, tag, attributes and (recursively) children are copied.
	/// The copy has no parent, and `startPos` / `endPos` are not carried over.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// Root node representing a parsed XML document.
class XmlDocument : XmlNode
{
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	} ///

	private this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }

	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Creates a deep copy of this document.
	/// Only the children are copied; they are re-parented to the new document.
	final @property XmlDocument dup()
	{
		auto result = new XmlDocument();
		result.children.reserve(children.length);
		// Go through addChild so that each copied child's `parent` refers to
		// the new document rather than to a temporary intermediate node.
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}

/// Configuration for parsing XML.
struct XmlParseConfig
{
static:
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; } ///
	bool preserveWhitespace(string tag) { return false; } ///
	enum optionalParameterValues = false; ///
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	]; ///

	NodeCloseMode nodeCloseMode(string tag)
	{
		return tag.isOneOf(voidElements)
			? NodeCloseMode.never
			: NodeCloseMode.always
		;
	} ///

	enum optionalParameterValues = true; ///
	bool preserveWhitespace(string tag) { return false; /*TODO*/ } ///
}

/// Parse an SGML-ish string into an XmlNode
alias parse = _parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument
alias parseDocument = _parseString!XmlDocument;

/// Parse an XML string into an XmlDocument.
alias xmlParse = parseDocument!XmlParseConfig;

private:

public // alias
template _parseString(Node)
{
	Node _parseString(Config)(string s)
	{
		auto ss = StringStream(s);
		alias f = parseStream!Node;
		return f!Config(ss);
	}
}

template parseStream(Node)
{
	Node parseStream(Config)(ref StringStream s)
	{
		auto n = new Node;
		parseInto!Config(n, s);
		return n;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
/// Top-level nodes are appended to `d` until the input is exhausted.
/// Throws: `XmlParseException` carrying the row, column and offset at which
/// parsing stopped, chained to the original error.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
		try
		{
			auto n = new XmlNode;
			parseInto!Config(n, s, null);
			d.addChild(n);
			if (!Config.preserveWhitespace(null))
				skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Translate the current stream offset to a 1-based row:column.
			auto head = s.s[0..s.position];
			auto row    = head.representation.count('\n');
			auto column = head.representation.retro.countUntil('\n');
			if (column < 0) // no newline yet - still on the first row
				column = head.length;
			throw new XmlParseException("Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				head.length,
			), e);
		}
}

/// Parse an SGML-ish StringStream into an XmlNode
/// Params:
///   node               = node to fill in (type, tag, attributes, children, positions)
///   s                  = input stream, advanced past the parsed node
///   parentTag          = tag name of the enclosing element, if any
///   preserveWhitespace = whether an ancestor already requested that whitespace be kept
/// Throws: `XmlParseException` on malformed or truncated input.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = s.read();
	else
		do
			c = s.read();
		while (isWhiteChar[c]);

	// Note: recorded after the first character of the node was consumed.
	node.startPos = s.position;
	if (c!='<') // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		text ~= c;
		while (s.position < s.size && s.s[s.position] != '<')
			text ~= s.read();
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the terminator, then strip it.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				// Accumulate until the terminator, then strip it.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // the '?' seen by peek above
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // un-read the first character of the tag name
			node.tag = readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // '>' or '/'

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					// Outer loop: repeats only when a stray close tag of an
					// implicitly-closed sibling was consumed (see `continue`).
					while (true)
					{
						// Inner loop: parse children until a close tag is next.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

/// Read one `name="value"` (or `name='value'`) attribute from `s` into `node`.
/// When `Config.optionalParameterValues` is set, a name that is not followed
/// by `=` is stored with a `null` value.
/// Throws: `XmlParseException` on a missing name, missing `=`, missing quote,
/// or unterminated value.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

/// Return the `n`-th upcoming character (1 = next) without consuming it.
/// Throws: `XmlParseException` if that character lies beyond the end of the input.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	enforce!XmlParseException(index < s.s.length, "Unexpected end of input");
	return s.s[index];
}

/// Advance `s` past any whitespace; stops at the end of the input.
void skipWhitespace(ref StringStream s)
{
	while (s.position < s.s.length && isWhiteChar[s.s[s.position]])
		s.position++;
}

/// Character class lookup tables, filled in by the module constructor below.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

/// Consume and return the longest run of word characters
/// (alphanumerics, `-`, `_`, `:`) at the current position.
/// Returns an empty string if there is none. The result is a slice of the input.
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

/// Consume one character and verify that it is `c`.
/// Throws: `XmlParseException` on mismatch or at the end of the input.
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Consume everything up to and including the next occurrence of `until`.
/// Returns: the consumed text without the terminator, as a slice of the input.
/// Throws: `XmlParseException` if `until` does not occur in the rest of the
/// input; the stream position is left unchanged in that case.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	// Never scan past the end of the input.
	while (p < end && *p != until) p++;
	if (p >= end)
		throw new XmlParseException("Unexpected end of input, expected " ~ until);
	auto len = p-start;
	s.position += len + 1;
	return start[0..len];
}

unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText, doc.toString());
}

unittest
{
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto node = new XmlNode;
		auto str = StringStream("<tag>" ~ s ~ "</tag>");
		parseInto!ParseConfig(node, str, null);
		// import std.stdio; writeln(preserve, ": ", str.s, " -> ", node.toString);
		return node.children.length ? node.children[0].tag : null;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		assert(testOne!false(tag) == strip(tag),
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ strip(tag) ~ "', got '" ~ testOne!false(tag) ~ "'");
		assert(testOne!true(tag) == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ testOne!true(tag) ~ "'");
	}
}

unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto node = new XmlNode;
	auto str = StringStream("<a><b> foo </b></a>");
	parseInto!ParseConfig(node, str, null);
	assert(node.children[0].children[0].tag == " foo ");
}

// Parsing naked tags while preserving whitespace
unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return true; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto doc = parseDocument!ParseConfig("<foo/> <bar/>\n");
	assert(doc.children.length == 4);
}

// Truncated input is reported as an XmlParseException
unittest
{
	foreach (bad; [
		"<a",
		"<a b",
		"<a b=\"c",
		"<a>",
		"<a>text",
		"<!-- oops",
		"<![CDATA[oops",
		"<!DOCTYPE x",
		"<?xml version=\"1.0\"",
	])
		assertThrown!XmlParseException(new XmlDocument(bad), bad);
}

// Duplicating a document re-parents the copied children
unittest
{
	auto doc = new XmlDocument(`<a><b/></a>`);
	auto copy = doc.dup;
	assert(copy.children.length == 1);
	assert(copy.children[0] !is doc.children[0]);
	assert(copy.children[0].parent is copy);
	assert(copy.toString() == doc.toString());
}