/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 *   Simon Arlott
 */

module ae.utils.xml.lite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.xml.common;
import ae.utils.xml.entities;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed
private struct StringStream
{
	/// The text being parsed.
	string s;
	/// Index into `s` of the next character to be read.
	size_t position;

	@disable this();
	@disable this(this);
	this(string s)
	{
		// The input is copied with a few extra bytes appended, and then
		// sliced back to its original length, so that the bytes directly
		// after the slice are known.
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Returns the character at `position` and advances past it.
	/// Throws: `XmlParseException` if the end of the input was reached.
	char read()
	{
		enforce!XmlParseException(position < s.length, "Unexpected end of input");
		return s[position++];
	}

	/// Total length of the text being parsed.
	@property size_t size() { return s.length; }
}

// ************************************************************************

/// The kind of an `XmlNode`.
enum XmlNodeType
{
	None,    /// Default type of a freshly-constructed node.
	Root,    /// Document root (see `XmlDocument`).
	Node,    /// Element, e.g. `<tag attr="value">...</tag>`.
	Comment, /// `<!-- ... -->`
	Meta,    /// Processing instruction, e.g. `<?xml ... ?>`.
	DocType, /// Any other `<!...>` declaration.
	CData,   /// `<![CDATA[ ... ]]>`
	Text,    /// Character data between tags.
	Raw,     /// Never created during parsing. Programs can put raw XML fragments in `Raw` nodes to emit it as-is.
}

/// Attribute collection of a node (attribute name => value).
alias XmlAttributes = OrderedMap!(string, string);

/// A node in the parsed tree.
class XmlNode
{
	/// Tag name for `Node` and `Meta` nodes;
	/// the node's contents for all other node types.
	string tag;
	XmlAttributes attributes;
	/// Set by `addChild` on the node being added.
	XmlNode parent;
	XmlNode[] children;
	XmlNodeType type;
	/// Stream positions recorded by the parser: `startPos` is taken
	/// right after the node's first character was read, and `endPos`
	/// once the node was parsed completely.
	ulong startPos, endPos;

	/// Parse a single node from the given stream / string,
	/// using the default `XmlParseConfig`.
	this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }
	this(string s) { auto ss = StringStream(s); this(ss); } /// ditto

	/// Create an empty node with the given type and tag.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set an attribute. Returns `this`, to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Append `child` to this node's children, and make this node its parent.
	/// Returns `this` (not the child), to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Serialize this node and its descendants using `XmlWriter`.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serialize this node and its descendants using `PrettyXmlWriter`.
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serialize this node and its descendants to the given writer.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// An element holding just one text node is written
					// with the formatter turned off.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				output.text(tag);
				return;
			case XmlNodeType.Raw:
				output.startLine();
				output.output.put(tag);
				output.newLine();
				return;
		}
	}

	/// The concatenated text of this node and its descendants.
	/// `Text` and `CData` nodes return their contents; `Node` and `Root`
	/// nodes return the concatenation of their children's text (preceded
	/// by a line break if the node's tag is "br"); comments, processing
	/// instructions and doctypes return `null`.
	/// Must not be called on `None` or `Raw` nodes.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");
		}
	}

	/// Returns the first direct child element with the given tag,
	/// or `null` if there is none.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all direct child elements with the given tag.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws `XmlParseException`
	/// if there is no such child.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Returns the `index`-th direct child element with the given tag.
	/// Throws: `XmlParseException` if there are not enough such children.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the `index`-th child (of any type).
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Number of children (of any type).
	final @property size_t length() { return children.length; }
	alias opDollar = length;

	/// Iterate over the children (of any type).
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// `children` is indexed anew on every iteration, so changes
		// which the delegate makes to it are observed by the loop.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Create a deep copy of this node: type, tag and attributes
	/// are copied, and children are duplicated recursively.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// Root node, holding all top-level nodes of a document as children.
class XmlDocument : XmlNode
{
	/// Create an empty document.
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	/// Parse an entire document from the given stream / string,
	/// using the default `XmlParseConfig`.
	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }
	this(string s) { auto ss = StringStream(s); this(ss); } /// ditto
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}

/// Configuration for parsing XML.
struct XmlParseConfig
{
static:
	/// How closing tags are handled for elements with the given tag.
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; }
	/// Whether whitespace is kept as-is inside elements with the given tag.
	bool preserveWhitespace(string tag) { return false; }
	/// Whether attributes may be specified without a value.
	enum optionalParameterValues = false;
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	/// Tags which are treated as `NodeCloseMode.never`.
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	NodeCloseMode nodeCloseMode(string tag)
	{
		return tag.isOneOf(voidElements)
			? NodeCloseMode.never
			: NodeCloseMode.always
		;
	}

	enum optionalParameterValues = true;
	bool preserveWhitespace(string tag) { return false; /*TODO*/ }
}

/// Parse an SGML-ish string into an XmlNode
alias parse = parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument
alias parseDocument = parseString!XmlDocument;

/// Parse a string into an XmlDocument, using the default `XmlParseConfig`.
alias xmlParse = parseDocument!XmlParseConfig;

private:

/// Parse the string `s` into a new `Node`, according to `Config`.
public // alias
template parseString(Node)
{
	Node parseString(Config)(string s)
	{
		auto ss = StringStream(s);
		alias f = parseStream!Node;
		return f!Config(ss);
	}
}

/// Parse from the stream `s` into a new `Node`, according to `Config`.
template parseStream(Node)
{
	Node parseStream(Config)(ref StringStream s)
	{
		auto n = new Node;
		parseInto!Config(n, s);
		return n;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
/// Nodes are parsed and added to `d` until the end of the stream.
/// Throws: `XmlParseException` mentioning the row, column and offset
/// at which parsing stopped, chained to the original error.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
		try
		{
			auto n = new XmlNode;
			parseInto!Config(n, s, null);
			d.addChild(n);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Describe the stream position in terms of rows and columns.
			auto head = s.s[0..s.position];
			auto row    = head.representation.count('\n');
			auto column = head.representation.retro.countUntil('\n');
			if (column < 0) // no line break so far - still on the first row
				column = head.length;
			throw new XmlParseException("Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				head.length,
			), e);
		}
}

/// Parse an SGML-ish StringStream into an XmlNode
/// Params:
///   node               = the node to fill in
///   s                  = the stream to read from
///   parentTag          = tag of the element containing this node (if any);
///                        passed to `Config.preserveWhitespace`
///   preserveWhitespace = whether whitespace preservation was already
///                        enabled by an enclosing element
/// Throws: `XmlParseException` on malformed or truncated input.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = s.read();
	else
		do
			c = s.read();
		while (isWhiteChar[c]);

	node.startPos = s.position;
	if (c!='<')  // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (c!='<')
		{
			// Reaching the end of the input is reported by s.read().
			text ~= c;
			c = s.read();
		}
		s.position--; // rewind to '<'
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // the '?' seen above
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // rewind to the first character of the tag name
			node.tag = readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read();

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					while (true)
					{
						// Parse children until reaching a close tag.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								// NOTE(review): `parent` is assigned only by addChild, which the
								// callers in this module invoke after parseInto returns, so
								// `node.parent` looks like it is always null at this point -
								// confirm whether `node` was meant here.
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								// NOTE(review): the '>' of this close tag does not appear
								// to be consumed before continuing - confirm.
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

/// Read one attribute (`name="value"` or `name='value'`) into `node.attributes`.
/// If `Config.optionalParameterValues` is set, an attribute
/// without a value is accepted, and stored with a `null` value.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

/// Returns the `n`-th upcoming character (1 being the next one),
/// without advancing the stream.
/// Throws: `XmlParseException` if that is past the end of the input.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	enforce!XmlParseException(index < s.s.length, "Unexpected end of input");
	return s.s[index];
}

/// Advance the stream to the next non-whitespace character
/// (or to the end of the input).
void skipWhitespace(ref StringStream s)
{
	while (s.position < s.s.length && isWhiteChar[s.s.ptr[s.position]])
		s.position++;
}

/// Lookup tables, indexed by character, filled in at program start-up.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

/// Read and return the (possibly empty) run of word characters
/// at the current stream position. The result is a slice of the input.
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

/// Read one character, and throw `XmlParseException` if it is not `c`.
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Read up to the next occurrence of `until`. Returns the text before it
/// (as a slice of the input), and leaves the stream positioned after it.
/// Throws: `XmlParseException` if `until` does not occur in the remaining input.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	while (p < end && *p != until) p++;
	enforce!XmlParseException(p < end, "Unexpected end of input while looking for " ~ until);
	auto len = p-start;
	s.position += len + 1; // also skip the delimiter
	return start[0..len];
}

unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText, doc.toString());
}

unittest
{
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto node = new XmlNode;
		auto str = StringStream("<tag>" ~ s ~ "</tag>");
		parseInto!ParseConfig(node, str, null);
		// import std.stdio; writeln(preserve, ": ", str.s, " -> ", node.toString);
		return node.children.length ? node.children[0].tag : null;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		assert(testOne!false(tag) == strip(tag),
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ strip(tag) ~ "', got '" ~ testOne!false(tag) ~ "'");
		assert(testOne!true(tag) == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ testOne!true(tag) ~ "'");
	}
}

unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto node = new XmlNode;
	auto str = StringStream("<a><b> foo </b></a>");
	parseInto!ParseConfig(node, str, null);
	assert(node.children[0].children[0].tag == " foo ");
}

// Truncated input must be reported as a parse error.
unittest
{
	foreach (xml; [
		"<a",
		"<a>",
		"<a>text",
		"<a b=\"c",
		"<!-- unterminated",
		"<![CDATA[x",
		"<!DOCTYPE x",
		"<?xml ",
	])
		assertThrown!XmlParseException(new XmlDocument(xml), xml);
}