/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 *   Simon Arlott
 */

module ae.utils.xml.lite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.xml.common;
import ae.utils.xml.entities;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed
private struct StringStream
{
	/// The text being parsed.
	string s;
	/// Index into `s` of the next character to be read.
	size_t position;

	@disable this();
	@disable this(this);

	/// Wraps `s` for reading, starting at position 0.
	this(string s)
	{
		// The input is copied with a few extra bytes appended, which are then
		// sliced off again. The helpers below that access `s.ptr` directly
		// (skipWhitespace) can therefore look one byte past the logical end
		// without leaving the allocation.
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Returns the character at the current position and advances past it.
	/// Throws: XmlParseException if the end of the input has been reached,
	/// so that truncated documents are reported as parse errors.
	char read()
	{
		if (position >= s.length)
			throw new XmlParseException("Unexpected end of input");
		return s[position++];
	}

	/// Total length of the input, in bytes.
	@property size_t size() { return s.length; }
}

// ************************************************************************

/// The kind of an `XmlNode`; determines how `XmlNode.tag` is interpreted.
enum XmlNodeType
{
	None,
	Root,
	Node,
	Comment,
	Meta,
	DocType,
	CData,
	Text,
	Raw, // Never created during parsing. Programs can put raw XML fragments in `Raw` nodes to emit it as-is.
}

/// Attribute name -> value map.
alias XmlAttributes = OrderedMap!(string, string);

/// A node in the parsed tree.
class XmlNode
{
	/// Element / processing instruction name for `Node` / `Meta` nodes;
	/// the node's content for `Text`, `CData`, `Comment`, `DocType` and
	/// `Raw` nodes.
	string tag;
	XmlAttributes attributes;
	XmlNode parent;
	XmlNode[] children;
	XmlNodeType type;
	/// Stream positions recorded by the parser. Note that `startPos` is
	/// taken after the node's first character has already been consumed.
	ulong startPos, endPos;

	/// Parse a single node from the stream / string, using `XmlParseConfig`.
	this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }
	this(string s) { auto ss = StringStream(s); this(ss); } /// ditto

	/// Create an empty node of the given type.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set an attribute. Returns `this` to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Append `child`, setting its `parent` to this node.
	/// Returns `this` (not the child) to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Serialize this node and its descendants using `XmlWriter`.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serialize this node and its descendants using `PrettyXmlWriter`.
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serialize this node and its descendants to the given writer.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// An element containing only a single text node is
					// written with the formatter disabled.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				output.text(tag);
				return;
			case XmlNodeType.Raw:
				output.startLine();
				output.output.put(tag);
				output.newLine();
				return;
		}
	}

	/// Concatenated text content of this node and its descendants.
	/// `<br>` elements contribute a newline; comments, processing
	/// instructions and doctypes contribute nothing.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");
		}
	}

	/// Returns the first direct child element with the given tag,
	/// or `null` if there is none.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all direct child elements with the given tag.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws `XmlParseException` if there is no such child.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Returns the `index`-th direct child element with the given tag.
	/// Throws: XmlParseException if there are not enough such children.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the `index`-th child (of any type) by reference.
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Number of children (of any type).
	final @property size_t length() { return children.length; }
	alias opDollar = length;

	/// Iterate over all children (of any type).
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// size_t index: `children.length` is a size_t, so an `int` counter
		// would be a mixed-sign, narrower comparison.
		// The length is re-read on every iteration, as before.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Deep copy of this node (type, tag, attributes and children).
	/// The copy has no parent; `startPos` / `endPos` are not copied.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// A `Root` node holding all top-level nodes of a document.
class XmlDocument : XmlNode
{
	/// Create an empty document.
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	/// Parse a whole document from the stream / string, using `XmlParseConfig`.
	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }
	this(string s) { auto ss = StringStream(s); this(ss); } /// ditto

	/// Deep copy of this document.
	final @property XmlDocument dup()
	{
		auto result = new XmlDocument();
		// Attach the copied children through addChild, so that their
		// `parent` refers to the new document (and not to a temporary
		// intermediate node).
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}

/// Configuration for parsing XML.
struct XmlParseConfig
{
static:
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; }
	bool preserveWhitespace(string tag) { return false; }
	enum optionalParameterValues = false;
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	NodeCloseMode nodeCloseMode(string tag)
	{
		return tag.isOneOf(voidElements)
			? NodeCloseMode.never
			: NodeCloseMode.always
		;
	}

	enum optionalParameterValues = true;
	bool preserveWhitespace(string tag) { return false; /*TODO*/ }
}

/// Parse an SGML-ish string into an XmlNode
alias parse = parseString!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
alias parseDocument = parseString!XmlDocument;

alias xmlParse = parseDocument!XmlParseConfig;

private:

public // alias
template parseString(Node)
{
	Node parseString(Config)(string s)
	{
		auto ss = StringStream(s);
		alias f = parseStream!Node;
		return f!Config(ss);
	}
}

template parseStream(Node)
{
	Node parseStream(Config)(ref StringStream s)
	{
		auto n = new Node;
		parseInto!Config(n, s);
		return n;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
		try
		{
			auto n = new XmlNode;
			parseInto!Config(n, s, null);
			d.addChild(n);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Translate the current stream offset into a 1-based
			// row/column, and wrap the original error with it.
			auto head = s.s[0..s.position];
			auto row    = head.representation.count('\n');
			auto column = head.representation.retro.countUntil('\n');
			if (column < 0)
				column = head.length;
			throw new XmlParseException("Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				head.length,
			), e);
		}
}

/// Parse an SGML-ish StringStream into an XmlNode
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	char c;

	// Whitespace preservation is inherited from the ancestors
	// (via the parameter) or enabled by the direct parent's tag.
	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = s.read();
	else
		do
			c = s.read();
		while (isWhiteChar[c]);

	node.startPos = s.position;
	if (c!='<') // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (c!='<')
		{
			// End of input is detected by s.read(), which throws.
			text ~= c;
			c = s.read();
		}
		s.position--; // rewind to '<'
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // consume the '?'
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // un-read the first character of the tag name
			node.tag = readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read();

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					// Outer loop: repeats when a close tag for an
					// implicitly-closed sibling was consumed, until this
					// node's own close tag is found.
					while (true)
					{
						// Inner loop: parse children until the next close tag.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

/// Read one `name="value"` / `name='value'` attribute at the current
/// position and store it in `node.attributes`. When
/// `Config.optionalParameterValues` is set, a bare `name` is accepted
/// and stored with a `null` value.
/// Throws: XmlParseException on malformed or unterminated attributes.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

/// Returns the `n`-th upcoming character (1 = next) without consuming it.
/// Returns `'\0'` if that position is past the end of the input, which
/// matches none of the characters callers test for, so they proceed to
/// a `read` / `readWord` that reports the error.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	return index < s.s.length ? s.s[index] : '\0';
}

/// Advance the stream past any whitespace.
void skipWhitespace(ref StringStream s)
{
	// Unchecked access: at end of input this reads the first padding byte
	// that StringStream's constructor appended ('\''), which is not
	// whitespace and terminates the loop.
	while (isWhiteChar[s.s.ptr[s.position]])
		s.position++;
}

/// Lookup tables indexed by byte value; filled in by the module constructor.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

/// Consume and return the longest run of word characters
/// (alphanumerics, '-', '_', ':') at the current position.
/// Returns an empty string if there is none.
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

/// Consume one character, and throw `XmlParseException` unless it is `c`.
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Consume everything up to and including the next occurrence of `until`,
/// and return the text before it.
/// Throws: XmlParseException if `until` does not occur before the end of
/// the input.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	// Bounded scan: never run into the padding past the end of the input.
	while (p < end && *p != until) p++;
	enforce!XmlParseException(p < end, "Unexpected end of input, expected " ~ until);
	auto len = p-start;
	s.position += len + 1;
	return start[0..len];
}

unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText, doc.toString());
}

unittest
{
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto node = new XmlNode;
		auto str = StringStream("<tag>" ~ s ~ "</tag>");
		parseInto!ParseConfig(node, str, null);
		// import std.stdio; writeln(preserve, ": ", str.s, " -> ", node.toString);
		return node.children.length ? node.children[0].tag : null;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		assert(testOne!false(tag) == strip(tag),
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ strip(tag) ~ "', got '" ~ testOne!false(tag) ~ "'");
		assert(testOne!true(tag) == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ testOne!true(tag) ~ "'");
	}
}

unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto node = new XmlNode;
	auto str = StringStream("<a><b> foo </b></a>");
	parseInto!ParseConfig(node, str, null);
	assert(node.children[0].children[0].tag == " foo ");
}

// Truncated input is reported as a parse error.
unittest
{
	assertThrown!XmlParseException(new XmlNode("<a>text"));
	assertThrown!XmlParseException(new XmlNode("<a>"));
	assertThrown!XmlParseException(new XmlNode("<a href=\"x"));
	assertThrown!XmlParseException(new XmlNode("<a href"));
	assertThrown!XmlParseException(new XmlDocument("<a/><!-- unterminated"));
	assertThrown!XmlParseException(new XmlDocument("<a/><![CDATA[unterminated"));
}

// Duplicating a document re-parents the copied children.
unittest
{
	auto doc = new XmlDocument(`<a x="1"><b/></a>`);
	auto copy = doc.dup;
	assert(copy.children.length == 1);
	assert(copy.children[0] !is doc.children[0]);
	assert(copy.children[0].parent is copy);
	assert(copy.toString() == doc.toString(), copy.toString());
}