/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 *   Simon Arlott
 */

module ae.utils.xmllite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.exception;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed
private struct StringStream
{
	/// The text being parsed.
	string s;
	/// Index into `s` of the next character to be read.
	size_t position;

	@disable this();
	@disable this(this);
	this(string s)
	{
		// The input is copied with a few terminator characters appended,
		// then sliced back to its original length, so that the bytes
		// immediately after the slice are known quote / '>' / NUL characters.
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Returns the character at `position` and advances past it.
	/// Throws: XmlParseException if the input is exhausted.
	char read()
	{
		// Report truncated input as a parse error instead of letting the
		// index run off the end of the slice.
		if (position >= s.length)
			throw new XmlParseException("Unexpected end of input");
		return s[position++];
	}

	/// Total length of the input, in bytes.
	@property size_t size() { return s.length; }
}

// ************************************************************************

mixin DeclareException!q{XmlParseException};

/// The kind of content an XmlNode holds.
enum XmlNodeType
{
	None,
	Root,
	Node,
	Comment,
	Meta,
	DocType,
	CData,
	Text
}

alias XmlAttributes = OrderedMap!(string, string);

/// A node in the parsed tree. Depending on `type`, `tag` holds either the
/// element / processing-instruction name or the node's raw content
/// (text, CDATA, comment and doctype nodes).
class XmlNode
{
	string tag;
	XmlAttributes attributes;
	XmlNode parent;
	XmlNode[] children;
	XmlNodeType type;
	/// Stream positions at which parsing of this node started and ended.
	ulong startPos, endPos;

	/// Parse a single node from the stream using the default XML configuration.
	this(ref StringStream s) { parseInto!XmlParseConfig(this, s); }
	/// Parse a single node from a string using the default XML configuration.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Create an empty node of the given type and tag.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set an attribute. Returns `this` to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Append a child and set its parent. Returns `this` (not the child)
	/// to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Serialize this node and its descendants using a default XmlWriter.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serialize this node and its descendants into the given writer.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// An element whose only child is a text node is
					// written with the formatter switched off.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				// TODO (comments are currently not written)
				return;
			case XmlNodeType.CData:
				// Written via the writer's text() call, not as a CDATA section.
				output.text(tag);
				return;
		}
	}

	/// The concatenated text content of this node and its descendants.
	/// Text and CDATA nodes return their content; a node tagged "br"
	/// contributes a newline before its children's text; comments,
	/// processing instructions and doctypes return null.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
		}
	}

	/// Returns the first direct child element with the given tag, or null.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all direct child elements with the given tag.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like findChild, but throws XmlParseException when there is no match.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Returns the index'th direct child element with the given tag.
	/// Throws: XmlParseException if there are not enough such children.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the child at the given position (of any node type).
	final XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Number of direct children.
	final @property size_t length() { return children.length; }

	/// foreach support: iterates over the direct children.
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// size_t index: children.length is a size_t, so an int counter
		// would be a mixed signed/unsigned comparison.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Deep copy of type, tag, attributes and children.
	/// startPos / endPos and parent are not copied to the result.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// Root node holding every top-level node of a parsed document.
class XmlDocument : XmlNode
{
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	/// Parse a whole document from the stream using the default XML configuration.
	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }
	/// Parse a whole document from a string using the default XML configuration.
	this(string s) { auto ss = StringStream(s); this(ss); }
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}

/// Configuration for parsing XML.
struct XmlParseConfig
{
static:
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; }
	enum optionalParameterValues = false;
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	NodeCloseMode nodeCloseMode(string tag)
	{
		return tag.isOneOf(voidElements)
			? NodeCloseMode.never
			: NodeCloseMode.always
		;
	}

	enum optionalParameterValues = true;
}

/// Parse an SGML-ish string into an XmlNode
alias parse = parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument
alias parseDocument = parseString!XmlDocument;

/// Parse a string into an XmlDocument using the default XML configuration.
alias xmlParse = parseDocument!XmlParseConfig;

private:

public // alias
template parseString(Node)
{
	/// Wrap the string in a StringStream and parse it into a new Node.
	Node parseString(Config)(string s)
	{
		auto ss = StringStream(s);
		alias f = parseStream!Node;
		return f!Config(ss);
	}
}

template parseStream(Node)
{
	/// Allocate a new Node and parse the stream into it.
	Node parseStream(Config)(ref StringStream s)
	{
		auto n = new Node;
		parseInto!Config(n, s);
		return n;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
/// Every top-level node up to the end of input becomes a child of `d`.
/// Throws: XmlParseException carrying the row, column and offset of the
/// stream position at the time of failure, chained to the original error.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
		try
		{
			auto n = new XmlNode;
			parseInto!Config(n, s);
			d.addChild(n);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Derive a 1-based row/column from the text consumed so far.
			auto head = s.s[0..s.position];
			auto row    = head.representation.count('\n');
			auto column = head.representation.retro.countUntil('\n');
			if (column < 0) // no newline yet: still on the first row
				column = head.length;
			throw new XmlParseException("Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				head.length,
			), e);
		}
}

/// Parse an SGML-ish StringStream into an XmlNode
/// Reads exactly one node (text run, comment, CDATA section, doctype,
/// processing instruction, or element with all of its children) and
/// records the stream positions it spans in startPos / endPos.
/// Throws: XmlParseException on malformed input.
void parseInto(Config)(XmlNode node, ref StringStream s)
{
	node.startPos = s.position;
	char c;
	do
		c = s.read();
	while (isWhiteChar[c]);

	if (c!='<')  // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (true)
		{
			text ~= c;
			// A text run ends at the next '<' or at the end of input;
			// without this check the loop would read past the buffer.
			if (s.position >= s.size)
				break;
			c = s.read();
			if (c == '<')
			{
				s.position--; // rewind to '<'
				break;
			}
		}
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the buffer ends with the terminator,
				// then strip the terminator off.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // consume the '?'
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			node.tag = c~readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // '>' or '/'

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					// Outer loop: repeats after skipping a stray close tag
					// of an implicitly-closed element.
					while (true)
					{
						// Inner loop: parse children until a close tag.
						while (true)
						{
							skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
								node.addChild(parseNode!Config(s));
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

/// Read one `name="value"` (or `name='value'`) attribute into `node`.
/// When Config.optionalParameterValues is set, a name that is not followed
/// by '=' is stored with a null value.
/// Throws: XmlParseException on a missing name, '=' or quote.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

/// Return the n-th upcoming character (1-based) without consuming it.
/// Throws: XmlParseException if that character lies beyond the end of input.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	if (index >= s.s.length)
		throw new XmlParseException("Unexpected end of input");
	return s.s[index];
}

/// Advance the stream past any whitespace, stopping at the end of input.
void skipWhitespace(ref StringStream s)
{
	while (s.position < s.s.length && isWhiteChar[s.s[s.position]])
		s.position++;
}

/// Per-byte lookup tables, filled in by the module constructor below.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

/// Consume and return the longest run of word characters
/// (alphanumerics, '-', '_' and ':') at the current position.
/// Returns an empty string if the next character is not a word character.
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

/// Consume one character and verify that it is `c`.
/// Throws: XmlParseException if a different character was read.
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Consume everything up to and including the next occurrence of `until`,
/// returning the text before it.
/// Throws: XmlParseException if `until` does not occur in the rest of the input.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	// Bounded scan: never walk past the end of the input.
	while (p < end && *p != until) p++;
	enforce!XmlParseException(p < end, "Unexpected end of input, expected " ~ until);
	auto len = p-start;
	s.position += len + 1;
	return start[0..len];
}

unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText);
}

/// Entity name -> character, and the reverse mapping.
/// Both are filled in by the module constructor below.
const dchar[string] entities;
/*const*/ string[dchar] entityNames;
shared static this()
{
	// Values use D's named character entity escapes ('\&name;').
	entities =
	[
		"quot"     : '\&quot;',
		"amp"      : '\&amp;',
		"lt"       : '\&lt;',
		"gt"       : '\&gt;',

		"OElig"    : '\&OElig;',
		"oelig"    : '\&oelig;',
		"Scaron"   : '\&Scaron;',
		"scaron"   : '\&scaron;',
		"Yuml"     : '\&Yuml;',
		"circ"     : '\&circ;',
		"tilde"    : '\&tilde;',
		"ensp"     : '\&ensp;',
		"emsp"     : '\&emsp;',
		"thinsp"   : '\&thinsp;',
		"zwnj"     : '\&zwnj;',
		"zwj"      : '\&zwj;',
		"lrm"      : '\&lrm;',
		"rlm"      : '\&rlm;',
		"ndash"    : '\&ndash;',
		"mdash"    : '\&mdash;',
		"lsquo"    : '\&lsquo;',
		"rsquo"    : '\&rsquo;',
		"sbquo"    : '\&sbquo;',
		"ldquo"    : '\&ldquo;',
		"rdquo"    : '\&rdquo;',
		"bdquo"    : '\&bdquo;',
		"dagger"   : '\&dagger;',
		"Dagger"   : '\&Dagger;',
		"permil"   : '\&permil;',
		"lsaquo"   : '\&lsaquo;',
		"rsaquo"   : '\&rsaquo;',
		"euro"     : '\&euro;',

		"nbsp"     : '\&nbsp;',
		"iexcl"    : '\&iexcl;',
		"cent"     : '\&cent;',
		"pound"    : '\&pound;',
		"curren"   : '\&curren;',
		"yen"      : '\&yen;',
		"brvbar"   : '\&brvbar;',
		"sect"     : '\&sect;',
		"uml"      : '\&uml;',
		"copy"     : '\&copy;',
		"ordf"     : '\&ordf;',
		"laquo"    : '\&laquo;',
		"not"      : '\&not;',
		"shy"      : '\&shy;',
		"reg"      : '\&reg;',
		"macr"     : '\&macr;',
		"deg"      : '\&deg;',
		"plusmn"   : '\&plusmn;',
		"sup2"     : '\&sup2;',
		"sup3"     : '\&sup3;',
		"acute"    : '\&acute;',
		"micro"    : '\&micro;',
		"para"     : '\&para;',
		"middot"   : '\&middot;',
		"cedil"    : '\&cedil;',
		"sup1"     : '\&sup1;',
		"ordm"     : '\&ordm;',
		"raquo"    : '\&raquo;',
		"frac14"   : '\&frac14;',
		"frac12"   : '\&frac12;',
		"frac34"   : '\&frac34;',
		"iquest"   : '\&iquest;',
		"Agrave"   : '\&Agrave;',
		"Aacute"   : '\&Aacute;',
		"Acirc"    : '\&Acirc;',
		"Atilde"   : '\&Atilde;',
		"Auml"     : '\&Auml;',
		"Aring"    : '\&Aring;',
		"AElig"    : '\&AElig;',
		"Ccedil"   : '\&Ccedil;',
		"Egrave"   : '\&Egrave;',
		"Eacute"   : '\&Eacute;',
		"Ecirc"    : '\&Ecirc;',
		"Euml"     : '\&Euml;',
		"Igrave"   : '\&Igrave;',
		"Iacute"   : '\&Iacute;',
		"Icirc"    : '\&Icirc;',
		"Iuml"     : '\&Iuml;',
		"ETH"      : '\&ETH;',
		"Ntilde"   : '\&Ntilde;',
		"Ograve"   : '\&Ograve;',
		"Oacute"   : '\&Oacute;',
		"Ocirc"    : '\&Ocirc;',
		"Otilde"   : '\&Otilde;',
		"Ouml"     : '\&Ouml;',
		"times"    : '\&times;',
		"Oslash"   : '\&Oslash;',
		"Ugrave"   : '\&Ugrave;',
		"Uacute"   : '\&Uacute;',
		"Ucirc"    : '\&Ucirc;',
		"Uuml"     : '\&Uuml;',
		"Yacute"   : '\&Yacute;',
		"THORN"    : '\&THORN;',
		"szlig"    : '\&szlig;',
		"agrave"   : '\&agrave;',
		"aacute"   : '\&aacute;',
		"acirc"    : '\&acirc;',
		"atilde"   : '\&atilde;',
		"auml"     : '\&auml;',
		"aring"    : '\&aring;',
		"aelig"    : '\&aelig;',
		"ccedil"   : '\&ccedil;',
		"egrave"   : '\&egrave;',
		"eacute"   : '\&eacute;',
		"ecirc"    : '\&ecirc;',
		"euml"     : '\&euml;',
		"igrave"   : '\&igrave;',
		"iacute"   : '\&iacute;',
		"icirc"    : '\&icirc;',
		"iuml"     : '\&iuml;',
		"eth"      : '\&eth;',
		"ntilde"   : '\&ntilde;',
		"ograve"   : '\&ograve;',
		"oacute"   : '\&oacute;',
		"ocirc"    : '\&ocirc;',
		"otilde"   : '\&otilde;',
		"ouml"     : '\&ouml;',
		"divide"   : '\&divide;',
		"oslash"   : '\&oslash;',
		"ugrave"   : '\&ugrave;',
		"uacute"   : '\&uacute;',
		"ucirc"    : '\&ucirc;',
		"uuml"     : '\&uuml;',
		"yacute"   : '\&yacute;',
		"thorn"    : '\&thorn;',
		"yuml"     : '\&yuml;',

		"fnof"     : '\&fnof;',
		"Alpha"    : '\&Alpha;',
		"Beta"     : '\&Beta;',
		"Gamma"    : '\&Gamma;',
		"Delta"    : '\&Delta;',
		"Epsilon"  : '\&Epsilon;',
		"Zeta"     : '\&Zeta;',
		"Eta"      : '\&Eta;',
		"Theta"    : '\&Theta;',
		"Iota"     : '\&Iota;',
		"Kappa"    : '\&Kappa;',
		"Lambda"   : '\&Lambda;',
		"Mu"       : '\&Mu;',
		"Nu"       : '\&Nu;',
		"Xi"       : '\&Xi;',
		"Omicron"  : '\&Omicron;',
		"Pi"       : '\&Pi;',
		"Rho"      : '\&Rho;',
		"Sigma"    : '\&Sigma;',
		"Tau"      : '\&Tau;',
		"Upsilon"  : '\&Upsilon;',
		"Phi"      : '\&Phi;',
		"Chi"      : '\&Chi;',
		"Psi"      : '\&Psi;',
		"Omega"    : '\&Omega;',
		"alpha"    : '\&alpha;',
		"beta"     : '\&beta;',
		"gamma"    : '\&gamma;',
		"delta"    : '\&delta;',
		"epsilon"  : '\&epsilon;',
		"zeta"     : '\&zeta;',
		"eta"      : '\&eta;',
		"theta"    : '\&theta;',
		"iota"     : '\&iota;',
		"kappa"    : '\&kappa;',
		"lambda"   : '\&lambda;',
		"mu"       : '\&mu;',
		"nu"       : '\&nu;',
		"xi"       : '\&xi;',
		"omicron"  : '\&omicron;',
		"pi"       : '\&pi;',
		"rho"      : '\&rho;',
		"sigmaf"   : '\&sigmaf;',
		"sigma"    : '\&sigma;',
		"tau"      : '\&tau;',
		"upsilon"  : '\&upsilon;',
		"phi"      : '\&phi;',
		"chi"      : '\&chi;',
		"psi"      : '\&psi;',
		"omega"    : '\&omega;',
		"thetasym" : '\&thetasym;',
		"upsih"    : '\&upsih;',
		"piv"      : '\&piv;',
		"bull"     : '\&bull;',
		"hellip"   : '\&hellip;',
		"prime"    : '\&prime;',
		"Prime"    : '\&Prime;',
		"oline"    : '\&oline;',
		"frasl"    : '\&frasl;',
		"weierp"   : '\&weierp;',
		"image"    : '\&image;',
		"real"     : '\&real;',
		"trade"    : '\&trade;',
		"alefsym"  : '\&alefsym;',
		"larr"     : '\&larr;',
		"uarr"     : '\&uarr;',
		"rarr"     : '\&rarr;',
		"darr"     : '\&darr;',
		"harr"     : '\&harr;',
		"crarr"    : '\&crarr;',
		"lArr"     : '\&lArr;',
		"uArr"     : '\&uArr;',
		"rArr"     : '\&rArr;',
		"dArr"     : '\&dArr;',
		"hArr"     : '\&hArr;',
		"forall"   : '\&forall;',
		"part"     : '\&part;',
		"exist"    : '\&exist;',
		"empty"    : '\&empty;',
		"nabla"    : '\&nabla;',
		"isin"     : '\&isin;',
		"notin"    : '\&notin;',
		"ni"       : '\&ni;',
		"prod"     : '\&prod;',
		"sum"      : '\&sum;',
		"minus"    : '\&minus;',
		"lowast"   : '\&lowast;',
		"radic"    : '\&radic;',
		"prop"     : '\&prop;',
		"infin"    : '\&infin;',
		"ang"      : '\&ang;',
		"and"      : '\&and;',
		"or"       : '\&or;',
		"cap"      : '\&cap;',
		"cup"      : '\&cup;',
		"int"      : '\&int;',
		"there4"   : '\&there4;',
		"sim"      : '\&sim;',
		"cong"     : '\&cong;',
		"asymp"    : '\&asymp;',
		"ne"       : '\&ne;',
		"equiv"    : '\&equiv;',
		"le"       : '\&le;',
		"ge"       : '\&ge;',
		"sub"      : '\&sub;',
		"sup"      : '\&sup;',
		"nsub"     : '\&nsub;',
		"sube"     : '\&sube;',
		"supe"     : '\&supe;',
		"oplus"    : '\&oplus;',
		"otimes"   : '\&otimes;',
		"perp"     : '\&perp;',
		"sdot"     : '\&sdot;',
		"lceil"    : '\&lceil;',
		"rceil"    : '\&rceil;',
		"lfloor"   : '\&lfloor;',
		"rfloor"   : '\&rfloor;',
		"loz"      : '\&loz;',
		"spades"   : '\&spades;',
		"clubs"    : '\&clubs;',
		"hearts"   : '\&hearts;',
		"diams"    : '\&diams;',
		"lang"     : '\&lang;',
		"rang"     : '\&rang;',

		"apos"     : '\''
	];
	foreach (name, c; entities)
		entityNames[c] = name;
}

import core.stdc.stdio;
import std.utf;
import ae.utils.textout;

/// Replace the characters `<`, `>`, `"`, `'` and `&` with entity references.
/// Returns the input string itself when it contains none of them.
public string encodeEntities(string str)
{
	foreach (i, c; str)
		if (c=='<' || c=='>' || c=='"' || c=='\'' || c=='&')
		{
			// First special character found: copy the clean prefix,
			// then encode the remainder into the builder.
			StringBuilder sb;
			sb.preallocate(str.length * 11 / 10);
			sb.put(str[0..i]);
			sb.putEncodedEntities(str[i..$]);
			return sb.get();
		}
	return str;
}

/// Write `str` to `sink`, replacing `<`, `>`, `"`, `'` and `&`
/// with entity references.
public void putEncodedEntities(Sink, S)(ref Sink sink, S str)
{
	size_t start = 0;
	foreach (i, c; str)
		if (c=='<' || c=='>' || c=='"' || c=='\'' || c=='&')
		{
			sink.put(str[start..i], '&', entityNames[c], ';');
			start = i+1;
		}
	sink.put(str[start..$]);
}

/// Replace every character that has a named entity with its entity reference.
public string encodeAllEntities(string str)
{
	// TODO: optimize
	// Iterating in reverse keeps the indices of not-yet-visited
	// characters valid while the string is being rebuilt.
	foreach_reverse (i, dchar c; str)
	{
		auto name = c in entityNames;
		if (name)
			str = str[0..i] ~ '&' ~ *name ~ ';' ~ str[i+stride(str,i)..$];
	}
	return str;
}

import ae.utils.text;
import std.conv;

/// Replace named (`&name;`), decimal (`&#N;`) and hexadecimal (`&#xN;`)
/// entity references with the characters they denote.
/// Returns the input string itself when it contains no '&'.
/// Throws: XmlParseException on an '&' with no terminating ';' or on an
/// unknown entity name.
public string decodeEntities(string str)
{
	auto fragments = str.fastSplit('&');
	if (fragments.length <= 1)
		return str;

	// Result layout: literal, decoded char, literal, decoded char, ...
	auto interleaved = new string[fragments.length*2 - 1];
	auto buffers = new char[4][fragments.length-1];
	interleaved[0] = fragments[0];

	foreach (n, fragment; fragments[1..$])
	{
		auto p = fragment.indexOf(';');
		enforce!XmlParseException(p>0, "Invalid entity (unescaped ampersand?)");

		dchar c;
		if (fragment[0]=='#')
		{
			if (fragment[1]=='x')
				c = fromHex!uint(fragment[2..p]);
			else
				c = to!uint(fragment[1..p]);
		}
		else
		{
			auto pentity = fragment[0..p] in entities;
			enforce!XmlParseException(pentity, "Unknown entity: " ~ fragment[0..p]);
			c = *pentity;
		}

		interleaved[1+n*2] = cast(string) buffers[n][0..std.utf.encode(buffers[n], c)];
		interleaved[2+n*2] = fragment[p+1..$];
	}

	return interleaved.join();
}

deprecated alias decodeEntities convertEntities;

unittest
{
	assert(encodeEntities(`The <Smith & Wesson> "lock'n'load"`) == `The &lt;Smith &amp; Wesson&gt; &quot;lock&apos;n&apos;load&quot;`);
	assert(encodeAllEntities("©,€") == "&copy;,&euro;");
	assert(decodeEntities("&copy;,&euro;") == "©,€");
}