/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 *   Simon Arlott
 */

module ae.utils.xmllite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.exception;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed
private struct StringStream
{
	string s;
	size_t position;

	@disable this();
	@disable this(this);
	this(string s)
	{
		// The input is copied with a few extra bytes allocated after it.
		// All readers below are bounds-checked, so this is only a
		// belt-and-braces measure.
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Return the character at the current position and advance.
	/// Throws: XmlParseException at the end of the input.
	char read()
	{
		if (position >= s.length)
			throw new XmlParseException("Unexpected end of input");
		return s[position++];
	}

	@property size_t size() { return s.length; }
}

// ************************************************************************

mixin DeclareException!q{XmlParseException};

/// The kind of a parsed node.
enum XmlNodeType
{
	None,
	Root,
	Node,
	Comment,
	Meta,
	DocType,
	CData,
	Text
}

/// A node in the parsed tree.
/// For Text, CData, Comment and DocType nodes, `tag` holds the content.
class XmlNode
{
	string tag;
	string[string] attributes;
	XmlNode parent;
	XmlNode[] children;
	XmlNodeType type;
	ulong startPos, endPos; /// Offsets into the parsed input (set by the parser).

	this(ref StringStream s) { parseInto!XmlParseConfig(this, s); }
	this(string s) { auto ss = StringStream(s); this(ss); }

	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set an attribute. Returns `this` to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Append a child and set its `parent`. Returns `this` (not the child).
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serialize this node and its descendants to `output`.
	/// Comment nodes are currently not written; CData is written as text.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				output.endAttributes();
				writeChildren();
				output.endTag(tag);
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.text(tag);
				return;
			case XmlNodeType.Comment:
				// TODO
				return;
			case XmlNodeType.CData:
				output.text(tag);
				return;
		}
	}

	/// Concatenated text content of this node and its descendants.
	/// A node whose tag is "br" contributes a leading newline.
	/// Comment, Meta and DocType nodes yield null.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
		}
	}

	/// Return the first direct child element with the given tag, or null.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Return all direct child elements with the given tag.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like findChild, but throws XmlParseException when there is no match.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Return the `index`-th direct child element with the given tag.
	/// Throws: XmlParseException when there are not enough matches.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Return the `index`-th child of any type.
	final XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	final @property size_t length() { return children.length; }

	/// Iterate over the direct children.
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// Index with size_t (not int) to match children.length.
		// The length is re-read on every iteration, as before.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Deep copy of type, tag, attributes and children.
	/// startPos/endPos are not copied.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// Root node holding all top-level nodes of a parsed document.
class XmlDocument : XmlNode
{
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }
	this(string s) { auto ss = StringStream(s); this(ss); }
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}

/// Configuration for parsing XML.
struct XmlParseConfig
{
static:
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; }
	enum optionalParameterValues = false;
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	NodeCloseMode nodeCloseMode(string tag)
	{
		return tag.isOneOf(voidElements)
			? NodeCloseMode.never
			: NodeCloseMode.always
		;
	}

	enum optionalParameterValues = true;
}

/// Parse an SGML-ish string into an XmlNode
alias parse = parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument
alias parseDocument = parseString!XmlDocument;

alias xmlParse = parseDocument!XmlParseConfig;

private:

public // alias
template parseString(Node)
{
	Node parseString(Config)(string s)
	{
		auto ss = StringStream(s);
		alias f = parseStream!Node;
		return f!Config(ss);
	}
}

template parseStream(Node)
{
	Node parseStream(Config)(ref StringStream s)
	{
		auto n = new Node;
		parseInto!Config(n, s);
		return n;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
/// Parse errors are rethrown with the row, column and offset
/// at which parsing stopped; the original error is chained.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
		try
		{
			auto n = new XmlNode;
			parseInto!Config(n, s);
			d.addChild(n);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			auto head = s.s[0..s.position];
			auto row    = head.representation.count('\n');
			auto column = head.representation.retro.countUntil('\n');
			if (column < 0) // no newline yet: still on the first row
				column = head.length;
			throw new XmlParseException("Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				head.length,
			), e);
		}
}

/// Parse an SGML-ish StringStream into an XmlNode
/// Reads exactly one node (text, comment, CDATA, doctype,
/// processing instruction, or element with its children).
/// Throws: XmlParseException on malformed or truncated input.
void parseInto(Config)(XmlNode node, ref StringStream s)
{
	node.startPos = s.position;
	char c;
	do
		c = s.read();
	while (isWhiteChar[c]);

	if (c!='<') // text node
	{
		node.type = XmlNodeType.Text;
		// Leading whitespace was skipped above; the text starts at
		// the character we have just read.
		auto textStart = s.position - 1;
		while (c!='<')
			c = s.read(); // throws XmlParseException at end of input
		s.position--; // rewind to '<'
		node.tag = decodeEntities(s.s[textStart..s.position]);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				node.tag = readUntil(s, "-->");
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				node.tag = readUntil(s, "]]>");
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read();
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			node.tag = c~readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read();

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					while (true)
					{
						// Parse children until a close tag ("</") is next.
						while (true)
						{
							skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
								node.addChild(parseNode!Config(s));
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								// NOTE(review): addChild assigns `parent` only after the
								// child has been fully parsed, so node.parent looks like
								// it is always null at this point, and the '>' of the
								// stray close tag is not consumed before `continue`.
								// No configuration in this module returns `implicit`;
								// confirm before relying on this branch.
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

/// Read one `name="value"` (or `name='value'`) attribute into `node`.
/// When Config.optionalParameterValues is set, a bare `name` is
/// accepted and stored with a null value.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

/// Return the `n`-th upcoming character (1 = next) without consuming it.
/// Throws: XmlParseException if that is past the end of the input.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	if (index >= s.s.length)
		throw new XmlParseException("Unexpected end of input");
	return s.s[index];
}

/// Advance past any whitespace; stops at the end of the input.
void skipWhitespace(ref StringStream s)
{
	while (s.position < s.s.length && isWhiteChar[s.s[s.position]])
		s.position++;
}

// Character class lookup tables, filled once at startup.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

/// Consume and return the longest run of word characters
/// (alphanumerics, '-', '_', ':'). May return an empty string.
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

/// Consume one character and require it to be `c`.
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Consume input up to and including the next `until` character;
/// return the text before it.
/// Throws: XmlParseException if `until` does not occur in the rest of the input.
string readUntil(ref StringStream s, char until)
{
	auto rest = s.s[s.position..$];
	auto len = rest.indexOf(until);
	if (len < 0)
		throw new XmlParseException("Expected " ~ until ~ " before end of input");
	s.position += len + 1;
	return rest[0..len];
}

/// Ditto, for a multi-character terminator such as "-->" or "]]>".
string readUntil(ref StringStream s, string until)
{
	auto rest = s.s[s.position..$];
	auto len = rest.indexOf(until);
	if (len < 0)
		throw new XmlParseException("Expected " ~ until ~ " before end of input");
	s.position += len + until.length;
	return rest[0..len];
}

unittest
{
	// NOTE(review): the quote characters inside the <quote> text are kept
	// exactly as found. If XmlWriter.text escapes '"', this literal needs
	// &quot; instead for the round-trip assertion to hold - confirm.
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText);
}

// Entity name -> character, and the reverse mapping.
// Both are filled once in the shared static constructor below and only read
// afterwards. entityNames is __gshared (like the character tables above) so
// that the one-time initialization is visible from every thread.
const dchar[string] entities;
__gshared /*const*/ string[dchar] entityNames;
shared static this()
{
	entities =
	[
		"quot" : '\&quot;',
		"amp" : '\&amp;',
		"lt" : '\&lt;',
		"gt" : '\&gt;',

		"OElig" : '\&OElig;',
		"oelig" : '\&oelig;',
		"Scaron" : '\&Scaron;',
		"scaron" : '\&scaron;',
		"Yuml" : '\&Yuml;',
		"circ" : '\&circ;',
		"tilde" : '\&tilde;',
		"ensp" : '\&ensp;',
		"emsp" : '\&emsp;',
		"thinsp" : '\&thinsp;',
		"zwnj" : '\&zwnj;',
		"zwj" : '\&zwj;',
		"lrm" : '\&lrm;',
		"rlm" : '\&rlm;',
		"ndash" : '\&ndash;',
		"mdash" : '\&mdash;',
		"lsquo" : '\&lsquo;',
		"rsquo" : '\&rsquo;',
		"sbquo" : '\&sbquo;',
		"ldquo" : '\&ldquo;',
		"rdquo" : '\&rdquo;',
		"bdquo" : '\&bdquo;',
		"dagger" : '\&dagger;',
		"Dagger" : '\&Dagger;',
		"permil" : '\&permil;',
		"lsaquo" : '\&lsaquo;',
		"rsaquo" : '\&rsaquo;',
		"euro" : '\&euro;',

		"nbsp" : '\&nbsp;',
		"iexcl" : '\&iexcl;',
		"cent" : '\&cent;',
		"pound" : '\&pound;',
		"curren" : '\&curren;',
		"yen" : '\&yen;',
		"brvbar" : '\&brvbar;',
		"sect" : '\&sect;',
		"uml" : '\&uml;',
		"copy" : '\&copy;',
		"ordf" : '\&ordf;',
		"laquo" : '\&laquo;',
		"not" : '\&not;',
		"shy" : '\&shy;',
		"reg" : '\&reg;',
		"macr" : '\&macr;',
		"deg" : '\&deg;',
		"plusmn" : '\&plusmn;',
		"sup2" : '\&sup2;',
		"sup3" : '\&sup3;',
		"acute" : '\&acute;',
		"micro" : '\&micro;',
		"para" : '\&para;',
		"middot" : '\&middot;',
		"cedil" : '\&cedil;',
		"sup1" : '\&sup1;',
		"ordm" : '\&ordm;',
		"raquo" : '\&raquo;',
		"frac14" : '\&frac14;',
		"frac12" : '\&frac12;',
		"frac34" : '\&frac34;',
		"iquest" : '\&iquest;',
		"Agrave" : '\&Agrave;',
		"Aacute" : '\&Aacute;',
		"Acirc" : '\&Acirc;',
		"Atilde" : '\&Atilde;',
		"Auml" : '\&Auml;',
		"Aring" : '\&Aring;',
		"AElig" : '\&AElig;',
		"Ccedil" : '\&Ccedil;',
		"Egrave" : '\&Egrave;',
		"Eacute" : '\&Eacute;',
		"Ecirc" : '\&Ecirc;',
		"Euml" : '\&Euml;',
		"Igrave" : '\&Igrave;',
		"Iacute" : '\&Iacute;',
		"Icirc" : '\&Icirc;',
		"Iuml" : '\&Iuml;',
		"ETH" : '\&ETH;',
		"Ntilde" : '\&Ntilde;',
		"Ograve" : '\&Ograve;',
		"Oacute" : '\&Oacute;',
		"Ocirc" : '\&Ocirc;',
		"Otilde" : '\&Otilde;',
		"Ouml" : '\&Ouml;',
		"times" : '\&times;',
		"Oslash" : '\&Oslash;',
		"Ugrave" : '\&Ugrave;',
		"Uacute" : '\&Uacute;',
		"Ucirc" : '\&Ucirc;',
		"Uuml" : '\&Uuml;',
		"Yacute" : '\&Yacute;',
		"THORN" : '\&THORN;',
		"szlig" : '\&szlig;',
		"agrave" : '\&agrave;',
		"aacute" : '\&aacute;',
		"acirc" : '\&acirc;',
		"atilde" : '\&atilde;',
		"auml" : '\&auml;',
		"aring" : '\&aring;',
		"aelig" : '\&aelig;',
		"ccedil" : '\&ccedil;',
		"egrave" : '\&egrave;',
		"eacute" : '\&eacute;',
		"ecirc" : '\&ecirc;',
		"euml" : '\&euml;',
		"igrave" : '\&igrave;',
		"iacute" : '\&iacute;',
		"icirc" : '\&icirc;',
		"iuml" : '\&iuml;',
		"eth" : '\&eth;',
		"ntilde" : '\&ntilde;',
		"ograve" : '\&ograve;',
		"oacute" : '\&oacute;',
		"ocirc" : '\&ocirc;',
		"otilde" : '\&otilde;',
		"ouml" : '\&ouml;',
		"divide" : '\&divide;',
		"oslash" : '\&oslash;',
		"ugrave" : '\&ugrave;',
		"uacute" : '\&uacute;',
		"ucirc" : '\&ucirc;',
		"uuml" : '\&uuml;',
		"yacute" : '\&yacute;',
		"thorn" : '\&thorn;',
		"yuml" : '\&yuml;',

		"fnof" : '\&fnof;',
		"Alpha" : '\&Alpha;',
		"Beta" : '\&Beta;',
		"Gamma" : '\&Gamma;',
		"Delta" : '\&Delta;',
		"Epsilon" : '\&Epsilon;',
		"Zeta" : '\&Zeta;',
		"Eta" : '\&Eta;',
		"Theta" : '\&Theta;',
		"Iota" : '\&Iota;',
		"Kappa" : '\&Kappa;',
		"Lambda" : '\&Lambda;',
		"Mu" : '\&Mu;',
		"Nu" : '\&Nu;',
		"Xi" : '\&Xi;',
		"Omicron" : '\&Omicron;',
		"Pi" : '\&Pi;',
		"Rho" : '\&Rho;',
		"Sigma" : '\&Sigma;',
		"Tau" : '\&Tau;',
		"Upsilon" : '\&Upsilon;',
		"Phi" : '\&Phi;',
		"Chi" : '\&Chi;',
		"Psi" : '\&Psi;',
		"Omega" : '\&Omega;',
		"alpha" : '\&alpha;',
		"beta" : '\&beta;',
		"gamma" : '\&gamma;',
		"delta" : '\&delta;',
		"epsilon" : '\&epsilon;',
		"zeta" : '\&zeta;',
		"eta" : '\&eta;',
		"theta" : '\&theta;',
		"iota" : '\&iota;',
		"kappa" : '\&kappa;',
		"lambda" : '\&lambda;',
		"mu" : '\&mu;',
		"nu" : '\&nu;',
		"xi" : '\&xi;',
		"omicron" : '\&omicron;',
		"pi" : '\&pi;',
		"rho" : '\&rho;',
		"sigmaf" : '\&sigmaf;',
		"sigma" : '\&sigma;',
		"tau" : '\&tau;',
		"upsilon" : '\&upsilon;',
		"phi" : '\&phi;',
		"chi" : '\&chi;',
		"psi" : '\&psi;',
		"omega" : '\&omega;',
		"thetasym" : '\&thetasym;',
		"upsih" : '\&upsih;',
		"piv" : '\&piv;',
		"bull" : '\&bull;',
		"hellip" : '\&hellip;',
		"prime" : '\&prime;',
		"Prime" : '\&Prime;',
		"oline" : '\&oline;',
		"frasl" : '\&frasl;',
		"weierp" : '\&weierp;',
		"image" : '\&image;',
		"real" : '\&real;',
		"trade" : '\&trade;',
		"alefsym" : '\&alefsym;',
		"larr" : '\&larr;',
		"uarr" : '\&uarr;',
		"rarr" : '\&rarr;',
		"darr" : '\&darr;',
		"harr" : '\&harr;',
		"crarr" : '\&crarr;',
		"lArr" : '\&lArr;',
		"uArr" : '\&uArr;',
		"rArr" : '\&rArr;',
		"dArr" : '\&dArr;',
		"hArr" : '\&hArr;',
		"forall" : '\&forall;',
		"part" : '\&part;',
		"exist" : '\&exist;',
		"empty" : '\&empty;',
		"nabla" : '\&nabla;',
		"isin" : '\&isin;',
		"notin" : '\&notin;',
		"ni" : '\&ni;',
		"prod" : '\&prod;',
		"sum" : '\&sum;',
		"minus" : '\&minus;',
		"lowast" : '\&lowast;',
		"radic" : '\&radic;',
		"prop" : '\&prop;',
		"infin" : '\&infin;',
		"ang" : '\&ang;',
		"and" : '\&and;',
		"or" : '\&or;',
		"cap" : '\&cap;',
		"cup" : '\&cup;',
		"int" : '\&int;',
		"there4" : '\&there4;',
		"sim" : '\&sim;',
		"cong" : '\&cong;',
		"asymp" : '\&asymp;',
		"ne" : '\&ne;',
		"equiv" : '\&equiv;',
		"le" : '\&le;',
		"ge" : '\&ge;',
		"sub" : '\&sub;',
		"sup" : '\&sup;',
		"nsub" : '\&nsub;',
		"sube" : '\&sube;',
		"supe" : '\&supe;',
		"oplus" : '\&oplus;',
		"otimes" : '\&otimes;',
		"perp" : '\&perp;',
		"sdot" : '\&sdot;',
		"lceil" : '\&lceil;',
		"rceil" : '\&rceil;',
		"lfloor" : '\&lfloor;',
		"rfloor" : '\&rfloor;',
		"loz" : '\&loz;',
		"spades" : '\&spades;',
		"clubs" : '\&clubs;',
		"hearts" : '\&hearts;',
		"diams" : '\&diams;',
		"lang" : '\&lang;',
		"rang" : '\&rang;',

		"apos" : '\''
	];
	foreach (name, c; entities)
		entityNames[c] = name;
}

import core.stdc.stdio;
import std.utf;
import ae.utils.textout;

/// Replace `<`, `>`, `"`, `'` and `&` with their named entities.
/// Returns `str` itself (no allocation) when there is nothing to encode.
public string encodeEntities(string str)
{
	foreach (i, c; str)
		if (c=='<' || c=='>' || c=='"' || c=='\'' || c=='&')
		{
			StringBuilder sb;
			sb.preallocate(str.length * 11 / 10);
			sb.put(str[0..i]);
			sb.putEncodedEntities(str[i..$]);
			return sb.get();
		}
	return str;
}

/// Write `str` to `sink`, replacing `<`, `>`, `"`, `'` and `&`
/// with their named entities.
public void putEncodedEntities(Sink, S)(ref Sink sink, S str)
{
	size_t start = 0;
	foreach (i, c; str)
		if (c=='<' || c=='>' || c=='"' || c=='\'' || c=='&')
		{
			sink.put(str[start..i], '&', entityNames[c], ';');
			start = i+1;
		}
	sink.put(str[start..$]);
}

/// Replace every character that has a named entity with that entity.
public string encodeAllEntities(string str)
{
	// TODO: optimize
	// Walk backwards so that the offsets of characters not yet
	// visited stay valid while the string is being rebuilt.
	foreach_reverse (i, dchar c; str)
	{
		auto name = c in entityNames;
		if (name)
			str = str[0..i] ~ '&' ~ *name ~ ';' ~ str[i+stride(str,i)..$];
	}
	return str;
}

import ae.utils.text;
import std.conv;

/// Replace named (`&amp;`), decimal (`&#38;`) and hexadecimal (`&#x26;`)
/// entities with the characters they denote.
/// Returns `str` itself when it contains no ampersand.
/// Throws: XmlParseException on an ampersand without a terminating
/// semicolon, or on an unknown entity name.
public string decodeEntities(string str)
{
	auto fragments = str.fastSplit('&');
	if (fragments.length <= 1)
		return str;

	// Result pieces: text, decoded char, text, decoded char, ..., text
	auto interleaved = new string[fragments.length*2 - 1];
	auto buffers = new char[4][fragments.length-1];
	interleaved[0] = fragments[0];

	foreach (n, fragment; fragments[1..$])
	{
		auto p = fragment.indexOf(';');
		enforce!XmlParseException(p>0, "Invalid entity (unescaped ampersand?)");

		dchar c;
		if (fragment[0]=='#')
		{
			// p > 0 guarantees fragment.length >= 2, so fragment[1] is valid.
			if (fragment[1]=='x')
				c = fromHex!uint(fragment[2..p]);
			else
				c = to!uint(fragment[1..p]);
		}
		else
		{
			auto pentity = fragment[0..p] in entities;
			enforce!XmlParseException(pentity, "Unknown entity: " ~ fragment[0..p]);
			c = *pentity;
		}

		interleaved[1+n*2] = cast(string) buffers[n][0..std.utf.encode(buffers[n], c)];
		interleaved[2+n*2] = fragment[p+1..$];
	}

	return interleaved.join();
}

deprecated alias decodeEntities convertEntities;

unittest
{
	assert(encodeEntities(`The <Smith & Wesson> "lock'n'load"`) == `The &lt;Smith &amp; Wesson&gt; &quot;lock&apos;n&apos;load&quot;`);
	assert(encodeAllEntities("©,€") == "&copy;,&euro;");
	assert(decodeEntities("&copy;,&euro;") == "©,€");
}