/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 *   Simon Arlott
 */

module ae.utils.xmllite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.exception;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed
private struct StringStream
{
	/// The text being parsed.
	string s;
	/// Index of the next character to be read from `s`.
	size_t position;

	@disable this();
	@disable this(this);

	/// Wraps `s`. The input is copied with a few extra bytes appended and
	/// then sliced back to its original length, so that the memory right
	/// after the slice holds known, non-whitespace bytes. `skipWhitespace`
	/// relies on this when it reads through `s.ptr` without a bounds check.
	this(string s)
	{
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Returns the next character and advances `position`.
	/// Throws: `XmlParseException` if the whole input has been consumed.
	char read()
	{
		// Report truncated input as a parse error instead of
		// indexing past the end of the string.
		if (position >= s.length)
			throw new XmlParseException("Unexpected end of input");
		return s[position++];
	}

	/// Total length of the input.
	@property size_t size() { return s.length; }
}

// ************************************************************************

mixin DeclareException!q{XmlParseException};

/// The kind of content an `XmlNode` represents.
enum XmlNodeType
{
	None,
	Root,
	Node,
	Comment,
	Meta,
	DocType,
	CData,
	Text
}

/// A node of the parsed document tree.
class XmlNode
{
	/// Tag name for `Node` / `Meta` nodes; for `Text`, `CData`, `Comment`
	/// and `DocType` nodes, this holds the node's content instead.
	string tag;
	OrderedMap!(string, string) attributes;
	XmlNode parent;
	XmlNode[] children;
	XmlNodeType type;
	/// Stream positions at which parsing of this node started / ended.
	ulong startPos, endPos;

	/// Parse a single node from the stream, using `XmlParseConfig`.
	this(ref StringStream s) { parseInto!XmlParseConfig(this, s); }
	/// Parse a single node from the start of the string, using `XmlParseConfig`.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Create a node of the given type and tag without parsing anything.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set an attribute. Returns `this`, to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Append `child` and set its `parent` to this node.
	/// Returns `this` (not the child), to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Serialize this node (and its children) using the default `XmlWriter`.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serialize this node (and its children) to the given writer.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// An element whose only child is text is written with
					// the writer's formatter disabled, then re-enabled.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.text(tag);
				return;
			case XmlNodeType.Comment:
				// TODO (comments are currently not written out)
				return;
			case XmlNodeType.CData:
				// Written through the same call as ordinary text.
				output.text(tag);
				return;
		}
	}

	/// The text content of this node: the content itself for `Text` and
	/// `CData` nodes, the concatenated text of all children for `Node` and
	/// `Root` (with a leading newline if the tag is "br"), and `null` for
	/// comments, processing instructions and doctypes.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
		}
	}

	/// Returns the first direct child element with the given tag,
	/// or `null` if there is none.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all direct child elements with the given tag.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws `XmlParseException` if there is no such child.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Returns the `index`-th direct child element with the given tag.
	/// Throws: `XmlParseException` if there are not enough such children.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the `index`-th child (of any type).
	final XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Number of direct children.
	final @property size_t length() { return children.length; }

	/// Iterate over the direct children with `foreach`.
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// The length is re-read on every iteration, and the index has the
		// same type as the array length.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Returns a deep copy of this node: type, tag and attributes are
	/// copied, and children are duplicated recursively. The copy has no
	/// parent, and `startPos` / `endPos` are not carried over.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// Root node of a document; its children are the top-level nodes.
class XmlDocument : XmlNode
{
	/// Create an empty document.
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	/// Parse all top-level nodes from the stream, using `XmlParseConfig`.
	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }
	/// Parse all top-level nodes from the string, using `XmlParseConfig`.
	this(string s) { auto ss = StringStream(s); this(ss); }
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}

/// Configuration for parsing XML.
struct XmlParseConfig
{
static:
	/// Every tag must be explicitly closed.
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; }
	/// Attributes must always have a value.
	enum optionalParameterValues = false;
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	/// `never` for the tags listed in `voidElements`, `always` otherwise.
	NodeCloseMode nodeCloseMode(string tag)
	{
		return tag.isOneOf(voidElements)
			? NodeCloseMode.never
			: NodeCloseMode.always
		;
	}

	/// Attributes may appear without a value (stored as `null`).
	enum optionalParameterValues = true;
}

/// Parse an SGML-ish string into an XmlNode
alias parse = parseString!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
alias parseDocument = parseString!XmlDocument;

/// Parse a string into an XmlDocument using `XmlParseConfig`.
alias xmlParse = parseDocument!XmlParseConfig;

private:

public // alias
template parseString(Node)
{
	/// Parse the string `s` into a new `Node` using the given `Config`.
	Node parseString(Config)(string s)
	{
		auto ss = StringStream(s);
		alias f = parseStream!Node;
		return f!Config(ss);
	}
}

template parseStream(Node)
{
	/// Parse from the stream into a new `Node` using the given `Config`.
	Node parseStream(Config)(ref StringStream s)
	{
		auto n = new Node;
		parseInto!Config(n, s);
		return n;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
/// Throws: `XmlParseException` carrying the row, column and offset at
/// which parsing stopped, chained to the original parse error.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
		try
		{
			auto n = new XmlNode;
			parseInto!Config(n, s);
			d.addChild(n);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Everything consumed so far; used to compute the error location.
			auto head = s.s[0..s.position];
			auto row    = head.representation.count('\n');
			auto column = head.representation.retro.countUntil('\n');
			if (column < 0) // no newline seen yet - still on the first row
				column = head.length;
			throw new XmlParseException("Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				head.length,
			), e);
		}
}

/// Parse an SGML-ish StringStream into an XmlNode
/// Leading whitespace is skipped, then exactly one node (text, comment,
/// CDATA, doctype, processing instruction, or element with its children)
/// is read into `node`.
/// Throws: `XmlParseException` on malformed or truncated input.
void parseInto(Config)(XmlNode node, ref StringStream s)
{
	node.startPos = s.position;
	char c;
	do
		c = s.read();
	while (isWhiteChar[c]);

	if (c!='<')  // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (true)
		{
			text ~= c;
			if (s.position >= s.size)
				break; // the text runs up to the end of the input
			c = s.read();
			if (c == '<')
			{
				s.position--; // rewind to '<'
				break;
			}
		}
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the terminator is seen, then strip it.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				// Accumulate until the terminator is seen, then strip it.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // the '?' seen by peek above
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			node.tag = c~readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // '>' or '/'

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					while (true)
					{
						// Read children until a close tag ("</") is next.
						while (true)
						{
							skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
								node.addChild(parseNode!Config(s));
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

/// Read one `name="value"` (or `name='value'`) attribute into `node`.
/// When `Config.optionalParameterValues` is set, a name not followed by
/// '=' is stored with a `null` value.
/// Throws: `XmlParseException` on a missing name, missing quote, or
/// unterminated value.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

/// Returns the `n`-th upcoming character (1 = next) without consuming it.
/// Throws: `XmlParseException` if that position is past the end of the input.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	// Report truncated input as a parse error instead of
	// indexing past the end of the string.
	if (index >= s.s.length)
		throw new XmlParseException("Unexpected end of input");
	return s.s[index];
}

/// Advance the stream past any whitespace.
void skipWhitespace(ref StringStream s)
{
	// Unchecked read through .ptr: at the end of the input this reads the
	// first byte appended by StringStream's constructor (a single quote),
	// which is not whitespace, so the loop stops there.
	while (isWhiteChar[s.s.ptr[s.position]])
		s.position++;
}

/// Lookup tables indexed by character code, filled in by the module constructor.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

/// Consume and return the longest run of word characters
/// (alphanumerics, '-', '_' and ':') at the current position.
/// Returns an empty string if there is none.
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

/// Consume one character and check that it is `c`.
/// Throws: `XmlParseException` if it is not (or if the input has ended).
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Consume characters up to and including the next occurrence of `until`,
/// and return the text before it.
/// Throws: `XmlParseException` if `until` does not occur in the rest of the input.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	// Stay within the input, like readWord does.
	while (p < end && *p != until) p++;
	enforce!XmlParseException(p < end, "Unexpected end of input while looking for %s".format(until));
	auto len = p-start;
	s.position += len + 1;
	return start[0..len];
}

unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, &quot;I want a programming language in which I need only say what I want done,&quot; give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText);
}

unittest
{
	// Truncated input is reported as a parse error.
	assertThrown!XmlParseException(new XmlDocument(`<a>`));
	assertThrown!XmlParseException(new XmlDocument(`<a b="c`));

	// A text node may run up to the end of the input.
	auto doc = new XmlDocument(`<a/>tail`);
	assert(doc.children.length == 2);
	assert(doc.children[1].type == XmlNodeType.Text);
	assert(doc.children[1].text == "tail");
}

/// Entity name -> character.
const dchar[string] entities;
/// Character -> entity name (reverse of `entities`).
/*const*/ string[dchar] entityNames;
shared static this()
{
	entities =
	[
		"quot" : '\&quot;',
		"amp" : '\&amp;',
		"lt" : '\&lt;',
		"gt" : '\&gt;',

		"OElig" : '\&OElig;',
		"oelig" : '\&oelig;',
		"Scaron" : '\&Scaron;',
		"scaron" : '\&scaron;',
		"Yuml" : '\&Yuml;',
		"circ" : '\&circ;',
		"tilde" : '\&tilde;',
		"ensp" : '\&ensp;',
		"emsp" : '\&emsp;',
		"thinsp" : '\&thinsp;',
		"zwnj" : '\&zwnj;',
		"zwj" : '\&zwj;',
		"lrm" : '\&lrm;',
		"rlm" : '\&rlm;',
		"ndash" : '\&ndash;',
		"mdash" : '\&mdash;',
		"lsquo" : '\&lsquo;',
		"rsquo" : '\&rsquo;',
		"sbquo" : '\&sbquo;',
		"ldquo" : '\&ldquo;',
		"rdquo" : '\&rdquo;',
		"bdquo" : '\&bdquo;',
		"dagger" : '\&dagger;',
		"Dagger" : '\&Dagger;',
		"permil" : '\&permil;',
		"lsaquo" : '\&lsaquo;',
		"rsaquo" : '\&rsaquo;',
		"euro" : '\&euro;',

		"nbsp" : '\&nbsp;',
		"iexcl" : '\&iexcl;',
		"cent" : '\&cent;',
		"pound" : '\&pound;',
		"curren" : '\&curren;',
		"yen" : '\&yen;',
		"brvbar" : '\&brvbar;',
		"sect" : '\&sect;',
		"uml" : '\&uml;',
		"copy" : '\&copy;',
		"ordf" : '\&ordf;',
		"laquo" : '\&laquo;',
		"not" : '\&not;',
		"shy" : '\&shy;',
		"reg" : '\&reg;',
		"macr" : '\&macr;',
		"deg" : '\&deg;',
		"plusmn" : '\&plusmn;',
		"sup2" : '\&sup2;',
		"sup3" : '\&sup3;',
		"acute" : '\&acute;',
		"micro" : '\&micro;',
		"para" : '\&para;',
		"middot" : '\&middot;',
		"cedil" : '\&cedil;',
		"sup1" : '\&sup1;',
		"ordm" : '\&ordm;',
		"raquo" : '\&raquo;',
		"frac14" : '\&frac14;',
		"frac12" : '\&frac12;',
		"frac34" : '\&frac34;',
		"iquest" : '\&iquest;',
		"Agrave" : '\&Agrave;',
		"Aacute" : '\&Aacute;',
		"Acirc" : '\&Acirc;',
		"Atilde" : '\&Atilde;',
		"Auml" : '\&Auml;',
		"Aring" : '\&Aring;',
		"AElig" : '\&AElig;',
		"Ccedil" : '\&Ccedil;',
		"Egrave" : '\&Egrave;',
		"Eacute" : '\&Eacute;',
		"Ecirc" : '\&Ecirc;',
		"Euml" : '\&Euml;',
		"Igrave" : '\&Igrave;',
		"Iacute" : '\&Iacute;',
		"Icirc" : '\&Icirc;',
		"Iuml" : '\&Iuml;',
		"ETH" : '\&ETH;',
		"Ntilde" : '\&Ntilde;',
		"Ograve" : '\&Ograve;',
		"Oacute" : '\&Oacute;',
		"Ocirc" : '\&Ocirc;',
		"Otilde" : '\&Otilde;',
		"Ouml" : '\&Ouml;',
		"times" : '\&times;',
		"Oslash" : '\&Oslash;',
		"Ugrave" : '\&Ugrave;',
		"Uacute" : '\&Uacute;',
		"Ucirc" : '\&Ucirc;',
		"Uuml" : '\&Uuml;',
		"Yacute" : '\&Yacute;',
		"THORN" : '\&THORN;',
		"szlig" : '\&szlig;',
		"agrave" : '\&agrave;',
		"aacute" : '\&aacute;',
		"acirc" : '\&acirc;',
		"atilde" : '\&atilde;',
		"auml" : '\&auml;',
		"aring" : '\&aring;',
		"aelig" : '\&aelig;',
		"ccedil" : '\&ccedil;',
		"egrave" : '\&egrave;',
		"eacute" : '\&eacute;',
		"ecirc" : '\&ecirc;',
		"euml" : '\&euml;',
		"igrave" : '\&igrave;',
		"iacute" : '\&iacute;',
		"icirc" : '\&icirc;',
		"iuml" : '\&iuml;',
		"eth" : '\&eth;',
		"ntilde" : '\&ntilde;',
		"ograve" : '\&ograve;',
		"oacute" : '\&oacute;',
		"ocirc" : '\&ocirc;',
		"otilde" : '\&otilde;',
		"ouml" : '\&ouml;',
		"divide" : '\&divide;',
		"oslash" : '\&oslash;',
		"ugrave" : '\&ugrave;',
		"uacute" : '\&uacute;',
		"ucirc" : '\&ucirc;',
		"uuml" : '\&uuml;',
		"yacute" : '\&yacute;',
		"thorn" : '\&thorn;',
		"yuml" : '\&yuml;',

		"fnof" : '\&fnof;',
		"Alpha" : '\&Alpha;',
		"Beta" : '\&Beta;',
		"Gamma" : '\&Gamma;',
		"Delta" : '\&Delta;',
		"Epsilon" : '\&Epsilon;',
		"Zeta" : '\&Zeta;',
		"Eta" : '\&Eta;',
		"Theta" : '\&Theta;',
		"Iota" : '\&Iota;',
		"Kappa" : '\&Kappa;',
		"Lambda" : '\&Lambda;',
		"Mu" : '\&Mu;',
		"Nu" : '\&Nu;',
		"Xi" : '\&Xi;',
		"Omicron" : '\&Omicron;',
		"Pi" : '\&Pi;',
		"Rho" : '\&Rho;',
		"Sigma" : '\&Sigma;',
		"Tau" : '\&Tau;',
		"Upsilon" : '\&Upsilon;',
		"Phi" : '\&Phi;',
		"Chi" : '\&Chi;',
		"Psi" : '\&Psi;',
		"Omega" : '\&Omega;',
		"alpha" : '\&alpha;',
		"beta" : '\&beta;',
		"gamma" : '\&gamma;',
		"delta" : '\&delta;',
		"epsilon" : '\&epsilon;',
		"zeta" : '\&zeta;',
		"eta" : '\&eta;',
		"theta" : '\&theta;',
		"iota" : '\&iota;',
		"kappa" : '\&kappa;',
		"lambda" : '\&lambda;',
		"mu" : '\&mu;',
		"nu" : '\&nu;',
		"xi" : '\&xi;',
		"omicron" : '\&omicron;',
		"pi" : '\&pi;',
		"rho" : '\&rho;',
		"sigmaf" : '\&sigmaf;',
		"sigma" : '\&sigma;',
		"tau" : '\&tau;',
		"upsilon" : '\&upsilon;',
		"phi" : '\&phi;',
		"chi" : '\&chi;',
		"psi" : '\&psi;',
		"omega" : '\&omega;',
		"thetasym" : '\&thetasym;',
		"upsih" : '\&upsih;',
		"piv" : '\&piv;',
		"bull" : '\&bull;',
		"hellip" : '\&hellip;',
		"prime" : '\&prime;',
		"Prime" : '\&Prime;',
		"oline" : '\&oline;',
		"frasl" : '\&frasl;',
		"weierp" : '\&weierp;',
		"image" : '\&image;',
		"real" : '\&real;',
		"trade" : '\&trade;',
		"alefsym" : '\&alefsym;',
		"larr" : '\&larr;',
		"uarr" : '\&uarr;',
		"rarr" : '\&rarr;',
		"darr" : '\&darr;',
		"harr" : '\&harr;',
		"crarr" : '\&crarr;',
		"lArr" : '\&lArr;',
		"uArr" : '\&uArr;',
		"rArr" : '\&rArr;',
		"dArr" : '\&dArr;',
		"hArr" : '\&hArr;',
		"forall" : '\&forall;',
		"part" : '\&part;',
		"exist" : '\&exist;',
		"empty" : '\&empty;',
		"nabla" : '\&nabla;',
		"isin" : '\&isin;',
		"notin" : '\&notin;',
		"ni" : '\&ni;',
		"prod" : '\&prod;',
		"sum" : '\&sum;',
		"minus" : '\&minus;',
		"lowast" : '\&lowast;',
		"radic" : '\&radic;',
		"prop" : '\&prop;',
		"infin" : '\&infin;',
		"ang" : '\&ang;',
		"and" : '\&and;',
		"or" : '\&or;',
		"cap" : '\&cap;',
		"cup" : '\&cup;',
		"int" : '\&int;',
		"there4" : '\&there4;',
		"sim" : '\&sim;',
		"cong" : '\&cong;',
		"asymp" : '\&asymp;',
		"ne" : '\&ne;',
		"equiv" : '\&equiv;',
		"le" : '\&le;',
		"ge" : '\&ge;',
		"sub" : '\&sub;',
		"sup" : '\&sup;',
		"nsub" : '\&nsub;',
		"sube" : '\&sube;',
		"supe" : '\&supe;',
		"oplus" : '\&oplus;',
		"otimes" : '\&otimes;',
		"perp" : '\&perp;',
		"sdot" : '\&sdot;',
		"lceil" : '\&lceil;',
		"rceil" : '\&rceil;',
		"lfloor" : '\&lfloor;',
		"rfloor" : '\&rfloor;',
		"loz" : '\&loz;',
		"spades" : '\&spades;',
		"clubs" : '\&clubs;',
		"hearts" : '\&hearts;',
		"diams" : '\&diams;',
		"lang" : '\&lang;',
		"rang" : '\&rang;',

		"apos" : '\''
	];
	foreach (name, c; entities)
		entityNames[c] = name;
}

import core.stdc.stdio;
import std.utf;
import ae.utils.textout;

/// Replace the five markup characters (`<`, `>`, `"`, `'`, `&`) in `str`
/// with entity references. Returns `str` itself when it contains none.
public string encodeEntities(string str)
{
	foreach (i, c; str)
		if (c=='<' || c=='>' || c=='"' || c=='\'' || c=='&')
		{
			// First character needing escaping found: copy the clean
			// prefix as-is and encode the remainder.
			StringBuilder sb;
			sb.preallocate(str.length * 11 / 10);
			sb.put(str[0..i]);
			sb.putEncodedEntities(str[i..$]);
			return sb.get();
		}
	return str;
}

/// Write `str` to `sink`, replacing the five markup characters
/// (`<`, `>`, `"`, `'`, `&`) with entity references.
public void putEncodedEntities(Sink, S)(ref Sink sink, S str)
{
	size_t start = 0; // start of the pending run of unescaped text
	foreach (i, c; str)
		if (c=='<' || c=='>' || c=='"' || c=='\'' || c=='&')
		{
			sink.put(str[start..i], '&', entityNames[c], ';');
			start = i+1;
		}
	sink.put(str[start..$]);
}

/// Replace every character of `str` that has a name in `entityNames`
/// with the corresponding entity reference.
public string encodeAllEntities(string str)
{
	// TODO: optimize
	// Iterating in reverse keeps the indices of the not yet visited
	// (earlier) characters valid while `str` is being rebuilt.
	foreach_reverse (i, dchar c; str)
	{
		auto name = c in entityNames;
		if (name)
			str = str[0..i] ~ '&' ~ *name ~ ';' ~ str[i+stride(str,i)..$];
	}
	return str;
}

import ae.utils.text;
import std.conv;

/// Replace named (`&name;`), decimal (`&#N;`) and hexadecimal (`&#xN;`)
/// entity references in `str` with the characters they denote.
/// Returns `str` itself when it contains no ampersand.
/// Throws: `XmlParseException` on an ampersand not followed by a
/// `;`-terminated entity, or on an unknown entity name.
public string decodeEntities(string str)
{
	auto fragments = str.fastSplit('&');
	if (fragments.length <= 1)
		return str;

	// Result pieces: text, decoded entity, text, decoded entity, ..., text
	auto interleaved = new string[fragments.length*2 - 1];
	// One UTF-8 encoding buffer per decoded entity.
	auto buffers = new char[4][fragments.length-1];
	interleaved[0] = fragments[0];

	foreach (n, fragment; fragments[1..$])
	{
		auto p = fragment.indexOf(';');
		enforce!XmlParseException(p>0, "Invalid entity (unescaped ampersand?)");

		dchar c;
		if (fragment[0]=='#')
		{
			if (fragment[1]=='x')
				c = fromHex!uint(fragment[2..p]);
			else
				c = to!uint(fragment[1..p]);
		}
		else
		{
			auto pentity = fragment[0..p] in entities;
			enforce!XmlParseException(pentity, "Unknown entity: " ~ fragment[0..p]);
			c = *pentity;
		}

		interleaved[1+n*2] = cast(string) buffers[n][0..std.utf.encode(buffers[n], c)];
		interleaved[2+n*2] = fragment[p+1..$];
	}

	return interleaved.join();
}

deprecated alias decodeEntities convertEntities;

unittest
{
	assert(encodeEntities(`The <Smith & Wesson> "lock'n'load"`) == `The &lt;Smith &amp; Wesson&gt; &quot;lock&apos;n&apos;load&quot;`);
	assert(encodeAllEntities("©,€") == "&copy;,&euro;");
	assert(decodeEntities("&copy;,&euro;") == "©,€");
}