/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <ae@cy.md>
 *   Simon Arlott
 */

module ae.utils.xml.lite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.xml.common;
import ae.utils.xml.entities;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed
private struct StringStream
{
	string s;        /// The input being parsed.
	size_t position; /// Offset of the next character to be read.

	@disable this();
	@disable this(this);

	/// Wrap the given input string.
	this(string s)
	{
		// The input is copied with a few sentinel characters appended,
		// which are then sliced off again, so the memory immediately
		// following the slice is known.
		// NOTE(review): presumably a safety net for code reading slightly
		// past the end; the helpers in this module now check bounds.
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Return the character at the current position and advance past it.
	/// Throws: `XmlParseException` if the end of the input was reached.
	char read()
	{
		// Report truncated input as a parse error instead of indexing
		// past the end of the string.
		if (position >= s.length)
			throw new XmlParseException("Unexpected end of input");
		return s[position++];
	}

	/// Total length of the input.
	@property size_t size() { return s.length; }
}

// ************************************************************************

/// The type of an `XmlNode`.
enum XmlNodeType
{
	None    , /// Initial value. Never created during parsing.
	Root    , /// The root node. Contains top-level nodes as children.
	Node    , /// XML tag.
	Comment , /// XML comment.
	Meta    , /// XML processing instruction.
	DocType , /// XML doctype declaration.
	CData   , /// CDATA node.
	Text    , /// Text node.
	Raw     , /// Never created during parsing. Programs can put raw XML fragments in `Raw` nodes to emit it as-is.
}

/// Type used to hold a tag node's attributes.
alias XmlAttributes = OrderedMap!(string, string);

/// An XML node.
class XmlNode
{
	string tag;               /// The tag name, or the contents for text / comment / CDATA nodes.
	XmlAttributes attributes; /// Tag attributes.
	XmlNode parent;           /// Parent node.
	XmlNode[] children;       /// Children nodes.
	XmlNodeType type;         /// Node type.
	/// Start and end offset within the input.
	ulong startPos, endPos;

	private this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }

	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Create a new node.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set an attribute with the given value.
	/// Returns: this node, to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Add a child node, making this node its parent.
	/// Returns: this node (not the child), to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Return XML string.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Return pretty-printed XML string (with indentation).
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Write to an `XmlWriter`.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// A tag whose only child is a text node is written
					// with the formatter disabled for its contents.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				output.text(tag);
				return;
			case XmlNodeType.Raw:
				output.startLine();
				output.output.put(tag);
				output.newLine();
				return;
		}
	}

	/// Attempts to retrieve the text contents of this node.
	/// `<br>` tags are converted to newlines.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");
		}
	}

	/// Returns the first immediate child which is a tag and has the tag name `tag`.
	/// Returns `null` if there is no such child.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all immediate children which are a tag and have the tag name `tag`.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws an exception if no such node is found.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Like `findChildren[index]`, but throws an
	/// exception if there are not enough such nodes.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the immediate child with the given index.
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Returns the number of children nodes.
	final @property size_t length() { return children.length; }
	alias opDollar = length; /// ditto

	/// Iterates over immediate children.
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// The index has the same type as `children.length`. The length is
		// re-read on every iteration, as in a plain `for` loop.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Creates a deep copy of this node.
	/// The type, tag, attributes and (recursively) children are copied;
	/// the copy has no parent.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// Root node representing a parsed XML document.
class XmlDocument : XmlNode
{
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	} ///

	private this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }

	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Creates a deep copy of this document.
	final @property XmlDocument dup()
	{
		auto result = new XmlDocument();
		// Add the copied children through `addChild`, so that their
		// `parent` refers to the new document.
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}

/// Configuration for parsing XML.
struct XmlParseConfig
{
static:
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; } ///
	bool preserveWhitespace(string tag) { return false; } ///
	enum optionalParameterValues = false; ///
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	]; ///

	NodeCloseMode nodeCloseMode(string tag)
	{
		return tag.isOneOf(voidElements)
			? NodeCloseMode.never
			: NodeCloseMode.always
		;
	} ///

	enum optionalParameterValues = true; ///
	bool preserveWhitespace(string tag) { return false; /*TODO*/ } ///
}

/// Parse an SGML-ish string into an XmlNode
alias parse = _parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument
alias parseDocument = _parseString!XmlDocument;

/// Parse an XML string into an XmlDocument.
alias xmlParse = parseDocument!XmlParseConfig;

private:

public // alias
template _parseString(Node)
{
	Node _parseString(Config)(string s)
	{
		auto ss = StringStream(s);
		alias f = parseStream!Node;
		return f!Config(ss);
	}
}

// Creates a new `Node` and parses the stream into it using `Config`.
template parseStream(Node)
{
	Node parseStream(Config)(ref StringStream s)
	{
		auto n = new Node;
		parseInto!Config(n, s);
		return n;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
/// Top-level nodes are parsed one by one until the end of the input and
/// added as children of `d`. A parse error is re-thrown wrapped in an
/// `XmlParseException` indicating the row, column and offset reached.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
		try
		{
			auto n = new XmlNode;
			parseInto!Config(n, s, null);
			d.addChild(n);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Everything consumed so far; used to compute the position.
			auto head = s.s[0..s.position];
			auto row    = head.representation.count('\n');
			auto column = head.representation.retro.countUntil('\n');
			if (column < 0) // no newline yet: still on the first row
				column = head.length;
			throw new XmlParseException("Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				head.length,
			), e);
		}
}

/// Parse an SGML-ish StringStream into an XmlNode
/// Params:
///   node               = node to fill in
///   s                  = stream to read from
///   parentTag          = tag of the enclosing node, passed to `Config.preserveWhitespace`
///   preserveWhitespace = whether an enclosing node already requested whitespace preservation
/// Throws: `XmlParseException` on malformed or truncated input.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = s.read();
	else
		do
			c = s.read();
		while (isWhiteChar[c]);

	node.startPos = s.position;
	if (c!='<') // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (c!='<')
		{
			// Reaching the end of input here makes s.read() throw.
			text ~= c;
			c = s.read();
		}
		s.position--; // rewind to '<'
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the terminator, then strip it.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				// Accumulate until the terminator, then strip it.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read();
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // the character just read is the start of the tag name
			node.tag = readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read();

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					while (true)
					{
						// Parse children until a close tag is next.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								continue;
							}
							else
								// Always fails here, reporting the mismatched close tag.
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

// Reads one `name="value"` (or `name='value'`) attribute from the stream
// and stores it in `node.attributes`, with entities in the value decoded.
// When `Config.optionalParameterValues` is set, a name not followed by
// '=' is stored with a null value.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

// Returns the `n`-th upcoming character (1 = next) without consuming it.
// Returns '\0' if that position is at or past the end of the input, so
// that callers fall through to their regular error handling.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	return index < s.s.length ? s.s[index] : '\0';
}

// Advances the stream past any whitespace, stopping at the end of input.
void skipWhitespace(ref StringStream s)
{
	while (s.position < s.s.length && isWhiteChar[s.s[s.position]])
		s.position++;
}

// Character class lookup tables, filled in by the module constructor.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

// Reads the longest run of word characters (see `isWordChar`) at the
// current position. Returns a slice of the input, which may be empty.
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

// Reads one character, throwing an `XmlParseException` if it is not `c`.
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

// Returns the slice of input from the current position up to (excluding)
// the next occurrence of `until`, and advances the stream past it.
// Throws an `XmlParseException` if `until` does not occur before the end
// of the input.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	while (p < end && *p != until) p++;
	if (p >= end)
		throw new XmlParseException("Expected " ~ until ~ " before end of input");
	auto len = p-start;
	s.position += len + 1;
	return start[0..len];
}

unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText, doc.toString());
}

unittest
{
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto node = new XmlNode;
		auto str = StringStream("<tag>" ~ s ~ "</tag>");
		parseInto!ParseConfig(node, str, null);
		// import std.stdio; writeln(preserve, ": ", str.s, " -> ", node.toString);
		return node.children.length ? node.children[0].tag : null;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		assert(testOne!false(tag) == strip(tag),
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ strip(tag) ~ "', got '" ~ testOne!false(tag) ~ "'");
		assert(testOne!true(tag) == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ testOne!true(tag) ~ "'");
	}
}

unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto node = new XmlNode;
	auto str = StringStream("<a><b> foo </b></a>");
	parseInto!ParseConfig(node, str, null);
	assert(node.children[0].children[0].tag == " foo ");
}

// Truncated input is reported as a parse error.
unittest
{
	assertThrown!XmlParseException(new XmlDocument(`<a>text`));
	assertThrown!XmlParseException(new XmlDocument(`<a b="c`));
	assertThrown!XmlParseException(new XmlDocument(`<!-- unterminated`));
	assertThrown!XmlParseException(new XmlDocument(`<a`));
}

// Copies of a document own their children.
unittest
{
	auto doc = new XmlDocument(`<a><b/></a>`);
	auto copy = doc.dup;
	assert(copy.children.length == 1);
	assert(copy.children[0] !is doc.children[0]);
	assert(copy.children[0].tag == "a");
	assert(copy.children[0].parent is copy);
	assert(copy.children[0].children[0].parent is copy.children[0]);
}