/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <ae@cy.md>
 *   Simon Arlott
 */

module ae.utils.xml.lite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.xml.common;
import ae.utils.xml.entities;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed
private struct StringStream
{
	string s;        /// The text being parsed.
	size_t position; /// Offset in `s` of the next character to be read.

	@disable this();
	@disable this(this);

	/// Wrap `s` for parsing. The input is copied (see below).
	this(string s)
	{
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		// Concatenating and then slicing the suffix back off leaves the
		// `ditch` characters in memory directly after the visible string,
		// so code which looks at `s.ptr[position]` without a bounds check
		// (see `skipWhitespace`) stops on a non-whitespace sentinel.
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Return the character at `position` and advance past it.
	/// Throws: `XmlParseException` if there is no more input.
	char read()
	{
		// Report truncated input as a parse error instead of relying on
		// array bounds checking (which may be disabled in release builds).
		if (position >= s.length)
			throw new XmlParseException("Unexpected end of input");
		return s[position++];
	}

	/// Total length of the input.
	@property size_t size() { return s.length; }
}

// ************************************************************************

/// The type of an `XmlNode`.
enum XmlNodeType
{
	None    , /// Initial value. Never created during parsing.
	Root    , /// The root node. Contains top-level nodes as children.
	Node    , /// XML tag.
	Comment , /// XML comment.
	Meta    , /// XML processing instruction.
	DocType , /// XML doctype declaration.
	CData   , /// CDATA node.
	Text    , /// Text node.
	Raw     , /// Never created during parsing. Programs can put raw XML fragments in `Raw` nodes to emit it as-is.
}

/// Type used to hold a tag node's attributes.
alias XmlAttributes = OrderedMap!(string, string);

/// An XML node.
class XmlNode
{
	string tag; /// The tag name, or the contents for text / comment / CDATA nodes.
	XmlAttributes attributes; /// Tag attributes.
	XmlNode parent; /// Parent node.
	XmlNode[] children; /// Children nodes.
	XmlNodeType type; /// Node type.
	/// Start and end offset within the input.
	ulong startPos, endPos;

	/// Parse a single node from the stream, using `XmlParseConfig`.
	this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }
	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Create a new node.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set an attribute with the given value.
	/// Returns: this node, to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Add a child node, making this node its parent.
	/// Returns: this node (not the child), to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Return XML string.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Return pretty-printed XML string (with indentation).
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Write to an `XmlWriter`.
	/// Nodes of type `None` cannot be written (assertion failure).
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// A tag whose only child is a text node is written
					// with the formatter switched off for its contents.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				output.text(tag);
				return;
			case XmlNodeType.Raw:
				output.startLine();
				output.output.put(tag);
				output.newLine();
				return;
		}
	}

	/// Attempts to retrieve the text contents of this node.
	/// `<br>` tags are converted to newlines.
	/// Comments, processing instructions and doctypes yield `null`.
	/// `None` and `Raw` nodes are not supported (assertion failure).
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");
		}
	}

	/// Returns the first immediate child which is a tag and has the tag name `tag`.
	/// Returns `null` if there is no such child.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all immediate children which are a tag and have the tag name `tag`.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws an exception if no such node is found.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Like `findChildren[index]`, but throws an
	/// exception if there are not enough such nodes.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the immediate child with the given index.
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Returns the number of children nodes.
	final @property size_t length() { return children.length; }
	alias opDollar = length; /// ditto

	/// Iterates over immediate children.
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// The index is a size_t so that it is compared against
		// `children.length` without a signed/unsigned mismatch.
		// `children` is re-read on every iteration on purpose, so the
		// delegate observes modifications made to the array while iterating.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Creates a deep copy of this node.
	/// The type, tag, attributes and (recursively) children are copied;
	/// the copy has no parent.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// Root node representing a parsed XML document.
class XmlDocument : XmlNode
{
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	} ///

	/// Parse a whole document from the stream, using `XmlParseConfig`.
	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }

	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Creates a deep copy of this document.
	final @property XmlDocument dup()
	{
		auto result = new XmlDocument();
		// Copy the children directly into the new document, so that
		// their `parent` refers to the new document (and not to a
		// temporary intermediate node).
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}

/// Configuration for parsing XML.
struct XmlParseConfig
{
static:
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; } ///
	bool preserveWhitespace(string tag) { return false; } ///
	enum optionalParameterValues = false; ///
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	]; ///

	NodeCloseMode nodeCloseMode(string tag)
	{
		return tag.isOneOf(voidElements)
			? NodeCloseMode.never
			: NodeCloseMode.always
		;
	} ///

	enum optionalParameterValues = true; ///
	bool preserveWhitespace(string tag) { return false; /*TODO*/ } ///
}

/// Parse an SGML-ish string into an XmlNode
alias parse = parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument
alias parseDocument = parseString!XmlDocument;

/// Parse an XML string into an XmlDocument.
alias xmlParse = parseDocument!XmlParseConfig;

private:

public // alias
template parseString(Node)
{
	/// Parse `s` into a new `Node`, using the parse configuration `Config`.
	Node parseString(Config)(string s)
	{
		auto ss = StringStream(s);
		alias f = parseStream!Node;
		return f!Config(ss);
	}
}

template parseStream(Node)
{
	/// Parse from the current position of `s` into a new `Node`,
	/// using the parse configuration `Config`.
	Node parseStream(Config)(ref StringStream s)
	{
		auto n = new Node;
		parseInto!Config(n, s);
		return n;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
///
/// Top-level nodes are parsed one after another until the input is
/// exhausted, and added as children of `d`.
/// Throws: `XmlParseException` on malformed input. The exception's message
/// contains the row, column and offset reached in the input, and the
/// original exception is passed along as the second constructor argument.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
		try
		{
			auto n = new XmlNode;
			parseInto!Config(n, s, null);
			d.addChild(n);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Everything consumed so far; used to compute the position.
			auto head = s.s[0..s.position];
			auto row    = head.representation.count('\n');
			auto column = head.representation.retro.countUntil('\n');
			if (column < 0) // no newline seen yet - still on the first row
				column = head.length;
			throw new XmlParseException("Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				head.length,
			), e);
		}
}

/// Parse an SGML-ish StringStream into an XmlNode
///
/// Parses exactly one node (text, comment, CDATA, doctype, processing
/// instruction, or tag with all of its children) starting at the current
/// stream position, and stores the result in `node`.
/// Params:
///   node               = the node to fill in
///   s                  = the input stream; advanced past the parsed node
///   parentTag          = tag name of the enclosing node, passed to
///                        `Config.preserveWhitespace`
///   preserveWhitespace = whether an ancestor already requested that
///                        whitespace is preserved
/// Throws: `XmlParseException` on malformed or truncated input.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = s.read();
	else
		do
			c = s.read();
		while (isWhiteChar[c]);

	// Note: recorded after the node's first character has been consumed.
	node.startPos = s.position;
	if (c!='<')  // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (c!='<')
		{
			// s.read() throws if the input ends before the next '<'
			text ~= c;
			c = s.read();
		}
		s.position--; // rewind to '<'
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the terminator has been read, then cut it off.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				// Accumulate until the terminator has been read, then cut it off.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // consume the '?'
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // un-read the first character of the tag name
			node.tag = readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // '>' or '/'

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					// Outer loop: repeated when a close tag belonging to an
					// implicitly-closed child was consumed (see `continue`).
					while (true)
					{
						// Inner loop: parse children until a close tag is next.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

/// Read one `name="value"` (or `name='value'`) attribute at the current
/// position and store it in `node.attributes`, with entities decoded.
/// If `Config.optionalParameterValues` is set, a name which is not
/// followed by `=` is stored with a `null` value.
/// Throws: `XmlParseException` on malformed or truncated input.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

/// Return the `n`-th upcoming character (1 = the next one to be read)
/// without consuming anything.
/// Throws: `XmlParseException` if that character is beyond the end of the input.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	// Report truncated input as a parse error instead of relying on
	// array bounds checking.
	if (index >= s.s.length)
		throw new XmlParseException("Unexpected end of input");
	return s.s[index];
}

/// Advance the stream past any whitespace characters.
void skipWhitespace(ref StringStream s)
{
	// Unchecked access: at the end of the input this reads the first
	// sentinel character placed after the string by StringStream's
	// constructor, which is not whitespace.
	while (isWhiteChar[s.s.ptr[s.position]])
		s.position++;
}

/// Lookup tables indexed by character code; filled in by the module constructor.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

/// Read and return the longest run of word characters (alphanumerics,
/// `-`, `_` and `:`) at the current position. The result is a slice of
/// the input, and is empty if the next character is not a word character.
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

/// Read one character and check that it is `c`.
/// Throws: `XmlParseException` if it is not, or if the input has ended.
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Return the input from the current position up to (not including) the
/// next occurrence of `until`, and advance the stream past that occurrence.
/// The result is a slice of the input.
/// Throws: `XmlParseException` if `until` does not occur in the rest of the input.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	// Stay within the input, so that a missing terminator is reported
	// instead of being "found" in whatever follows the string in memory.
	while (p < end && *p != until) p++;
	if (p == end)
		throw new XmlParseException("Unexpected end of input while looking for " ~ until);
	auto len = p-start;
	s.position += len + 1;
	return start[0..len];
}

unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText, doc.toString());
}

unittest
{
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto node = new XmlNode;
		auto str = StringStream("<tag>" ~ s ~ "</tag>");
		parseInto!ParseConfig(node, str, null);
		// import std.stdio; writeln(preserve, ": ", str.s, " -> ", node.toString);
		return node.children.length ? node.children[0].tag : null;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		assert(testOne!false(tag) == strip(tag),
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ strip(tag) ~ "', got '" ~ testOne!false(tag) ~ "'");
		assert(testOne!true(tag) == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ testOne!true(tag) ~ "'");
	}
}

unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto node = new XmlNode;
	auto str = StringStream("<a><b> foo </b></a>");
	parseInto!ParseConfig(node, str, null);
	assert(node.children[0].children[0].tag == " foo ");
}

// Truncated input is reported as a parse error.
unittest
{
	foreach (s; ["<a>", "<a", "<!-- x", "<![CDATA[x", "<a b='c", "text", "<?xml "])
		assertThrown!XmlParseException(new XmlDocument(s));
}

// Children of a duplicated document refer to the copy as their parent.
unittest
{
	auto doc = new XmlDocument(`<a><b/></a>`);
	auto copy = doc.dup;
	assert(copy !is doc);
	assert(copy.children.length == 1);
	assert(copy.children[0] !is doc.children[0]);
	assert(copy.children[0].tag == "a");
	assert(copy.children[0].parent is copy);
	assert(copy.children[0].children[0].parent is copy.children[0]);
}