/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 *   Simon Arlott
 */

module ae.utils.xmllite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.exception;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed.
/// Wraps a string together with a read cursor. It cannot be
/// default-constructed or copied, so it is always passed by `ref`.
private struct StringStream
{
	string s;        /// The text being read.
	size_t position; /// Index of the next character `read` will return.

	@disable this();
	@disable this(this);

	/// Wraps `s`. The text is copied with a few padding bytes appended and
	/// then sliced back to its original length, so the bytes immediately
	/// following `this.s` in memory are the padding (a single quote, a double
	/// quote, `>` and NULs) rather than unrelated data.
	this(string s)
	{
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Returns the character at `position` and advances past it.
	char read() { return s[position++]; }

	/// Length of the wrapped text (the padding is not included).
	@property size_t size() { return s.length; }
}

// ************************************************************************

mixin DeclareException!q{XmlParseException};

/// The kind of an `XmlNode`.
enum XmlNodeType
{
	None,    /// Default for a freshly constructed node; cannot be serialized.
	Root,    /// Document root (see `XmlDocument`); only its children are written.
	Node,    /// Regular element: `<tag ...>...</tag>` or `<tag .../>`.
	Comment, /// `<!-- ... -->`; the comment text is kept in `tag`.
	Meta,    /// Processing instruction: `<?tag ...?>`.
	DocType, /// Any other `<!...>` declaration; its text is kept in `tag`.
	CData,   /// `<![CDATA[ ... ]]>`; the content is kept in `tag`.
	Text     /// Character data; the text is kept in `tag`.
}

/// Attribute collection of a node (name -> value).
alias XmlAttributes = OrderedMap!(string, string);

/// A node of the parsed tree. Depending on `type`, `tag` holds either the
/// element / processing-instruction name or the node's textual content.
class XmlNode
{
	string tag;               /// Element name, or content for non-element nodes.
	XmlAttributes attributes; /// Attributes (filled in for Node and Meta nodes).
	XmlNode parent;           /// Set by `addChild` of the owning node.
	XmlNode[] children;       /// Child nodes, in order of addition.
	XmlNodeType type;         /// What kind of node this is.
	ulong startPos, endPos;   /// Stream positions recorded by `parseInto`.

	/// Parses a single node from the stream using `XmlParseConfig`.
	this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }

	/// Parses a single node from the given text using `XmlParseConfig`.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Creates an empty node of the given type and tag without parsing.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Sets (or replaces) an attribute. Returns `this` to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Appends `child`, making this node its parent.
	/// Returns `this` (not the child) to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Serializes this node and its subtree using `XmlWriter`.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serializes this node and its subtree using `PrettyXmlWriter`.
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Writes this node and its subtree to `output`.
	/// A node whose type is `XmlNodeType.None` trips an assertion.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// An element whose only child is a text node is written
					// with the formatter switched off for its contents.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				// Written through the same call as a Text node.
				output.text(tag);
				return;
		}
	}

	/// Textual content of this node.
	/// Text and CData nodes yield their content; Node and Root nodes yield
	/// the concatenated text of their children (an element named "br"
	/// contributes a leading newline); Comment, Meta and DocType yield null.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
		}
	}

	/// Returns the first child element named `tag`, or null if there is none.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all child elements named `tag`, in document order.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws XmlParseException when no child matches.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Returns the `index`-th child element named `tag`.
	/// Throws XmlParseException when there are not enough such children.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the `index`-th child (of any type) by reference.
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Number of children (of any type).
	final @property size_t length() { return children.length; }
	alias opDollar = length;

	/// `foreach` support: iterates over the children by reference.
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// The index is a size_t to match children.length; a signed 32-bit
		// counter would be compared against an unsigned length.
		// The length is re-read on every iteration on purpose, so children
		// appended by the delegate are visited too.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Deep copy of this node: type, tag, attributes and (recursively)
	/// children are duplicated. `parent`, `startPos` and `endPos` of the
	/// returned node itself are left at their defaults.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// Root node of a parsed document; its children are the top-level nodes.
class XmlDocument : XmlNode
{
	/// Creates an empty document.
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	/// Parses all nodes remaining in the stream using `XmlParseConfig`.
	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }

	/// Parses the given text using `XmlParseConfig`.
	this(string s) { auto ss = StringStream(s); this(ss); }
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}

/// Configuration for parsing XML.
struct XmlParseConfig
{
static:
	/// Every element requires an explicit (or self-) close.
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; }

	/// Whitespace around text is never preserved.
	bool preserveWhitespace(string tag) { return false; }

	/// Attributes must always have a value.
	enum optionalParameterValues = false;
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	/// Tag names this configuration treats as void elements.
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	/// Void elements are never closed; everything else always is.
	NodeCloseMode nodeCloseMode(string tag)
	{
		return tag.isOneOf(voidElements)
			? NodeCloseMode.never
			: NodeCloseMode.always
		;
	}

	/// Attributes may appear without a value (e.g. `<input disabled>`).
	enum optionalParameterValues = true;
	bool preserveWhitespace(string tag) { return false; /*TODO*/ }
}

/// Parse an SGML-ish string into an XmlNode
alias parse = parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument
alias parseDocument = parseString!XmlDocument;

/// Parse a string into an XmlDocument using `XmlParseConfig`.
alias xmlParse = parseDocument!XmlParseConfig;

private:

/// Two-stage template: `parseString!Node` selects the result type,
/// the inner `Config` parameter selects the parse configuration.
public // alias
template parseString(Node)
{
	Node parseString(Config)(string s)
	{
		auto ss = StringStream(s);
		alias f = parseStream!Node;
		return f!Config(ss);
	}
}

/// Same as `parseString`, but reads from an existing stream.
template parseStream(Node)
{
	Node parseStream(Config)(ref StringStream s)
	{
		auto n = new Node;
		parseInto!Config(n, s);
		return n;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument.
/// Nodes are parsed and appended to `d` until the stream is exhausted.
/// An XmlParseException raised while parsing is rethrown wrapped in another
/// XmlParseException whose message gives the 1-based row and column and
/// the offset at which parsing stopped.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
		try
		{
			auto n = new XmlNode;
			parseInto!Config(n, s, null);
			d.addChild(n);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			auto head = s.s[0..s.position];
			auto row    = head.representation.count('\n');
			// Distance back to the previous newline; negative when the
			// error is on the first line.
			auto column = head.representation.retro.countUntil('\n');
			if (column < 0)
				column = head.length;
			throw new XmlParseException("Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				head.length,
			), e);
		}
}

/// Parse an SGML-ish StringStream into an XmlNode.
/// Reads exactly one node (text, comment, CDATA, doctype-like declaration,
/// processing instruction, or element with its whole subtree) into `node`.
/// Params:
///   node               = node to fill in
///   s                  = stream to read from
///   parentTag          = tag of the enclosing element (null at top level),
///                        passed to `Config.preserveWhitespace`
///   preserveWhitespace = whether an enclosing element already requested
///                        whitespace preservation
/// Throws: XmlParseException on malformed input, including input that ends
/// in the middle of a construct. A text node may be terminated by the end
/// of the input.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = readChar(s);
	else
		do
			c = readChar(s);
		while (isWhiteChar[c]);

	node.startPos = s.position;
	if (c!='<') // text node
	{
		node.type = XmlNodeType.Text;
		// `c` was already consumed, so the text starts one before the
		// cursor. Scan up to (not including) the next '<', or to the end
		// of the input if there is none.
		auto start = s.position - 1;
		while (s.position < s.size && s.s[s.position] != '<')
			s.position++;
		string text = s.s[start .. s.position];
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = readChar(s);
		if (c=='!')
		{
			c = readChar(s);
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				node.tag = readUntilTerminator(s, "-->");
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				node.tag = readUntilTerminator(s, "]]>");
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = readChar(s);
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = readChar(s); // the '?'
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			node.tag = c~readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = readChar(s);

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					while (true)
					{
						// Parse children until a close tag ("</") is next.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								// NOTE(review): `node.parent` is assigned by addChild,
								// which the parsers in this module call only after
								// parseInto returns, so `parent` looks like it is
								// always null at this point. The '>' of the stray
								// close tag is also not consumed before `continue`.
								// Confirm the intended behaviour of this branch.
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

/// Reads one `name`, `name=value` pair into `node.attributes`.
/// The value must be enclosed in single or double quotes; entities in it
/// are decoded. When `Config.optionalParameterValues` is set, a name that
/// is not followed by '=' is stored with a null value.
/// Throws: XmlParseException if no name is present, the '=' or the opening
/// quote is missing, or the closing quote is never found.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = readChar(s);
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

/// Consumes and returns the next character.
/// Throws: XmlParseException if the stream is exhausted.
char readChar(ref StringStream s)
{
	enforce!XmlParseException(s.position < s.size, "Unexpected end of input");
	return s.read();
}

/// Returns the `n`-th upcoming character (1 = next) without consuming
/// anything, or '\0' when that position lies beyond the end of the input.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	return index < s.s.length ? s.s[index] : '\0';
}

/// Advances the cursor past any whitespace; stops at the end of the input.
void skipWhitespace(ref StringStream s)
{
	while (s.position < s.s.length && isWhiteChar[s.s.ptr[s.position]])
		s.position++;
}

/// Lookup tables indexed by character code, filled in by the module
/// constructor below.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		// Characters accepted in tag and attribute names.
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

/// Consumes the longest run of word characters at the cursor and returns
/// it as a slice of the input (empty if the next character is not one).
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

/// Consumes one character and verifies that it is `c`.
/// Throws: XmlParseException on a mismatch or at the end of the input.
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = readChar(s);
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Consumes input up to and including the next `until` character and
/// returns the text before it as a slice of the input.
/// Throws: XmlParseException if `until` does not occur in the rest of the
/// input.
string readUntil(ref StringStream s, char until)
{
	auto start = s.position;
	auto end = start;
	while (end < s.s.length && s.s[end] != until)
		end++;
	enforce!XmlParseException(end < s.s.length, "Unexpected end of input, expected " ~ until);
	s.position = end + 1;
	return s.s[start .. end];
}

/// Consumes input up to and including the first occurrence of `terminator`
/// and returns the text before it as a slice of the input.
/// Throws: XmlParseException if `terminator` does not occur in the rest of
/// the input.
string readUntilTerminator(ref StringStream s, string terminator)
{
	auto rest = s.s[s.position .. $];
	auto p = std.string.indexOf(rest, terminator);
	enforce!XmlParseException(p >= 0, "Unexpected end of input, expected " ~ terminator);
	s.position += p + terminator.length;
	return rest[0 .. p];
}

unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText, doc.toString());
}

unittest
{
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto node = new XmlNode;
		auto str = StringStream("<tag>" ~ s ~ "</tag>");
		parseInto!ParseConfig(node, str, null);
		// import std.stdio; writeln(preserve, ": ", str.s, " -> ", node.toString);
		return node.children.length ? node.children[0].tag : null;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a a ", " ", ""])
	{
		assert(testOne!false(tag) == strip(tag),
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ strip(tag) ~ "', got '" ~ testOne!false(tag) ~ "'");
		assert(testOne!true(tag) == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ testOne!true(tag) ~ "'");
	}
}

unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto node = new XmlNode;
	auto str = StringStream("<a><b> foo </b></a>");
	parseInto!ParseConfig(node, str, null);
	assert(node.children[0].children[0].tag == " foo ");
}

unittest
{
	// Truncated input is reported as a parse error.
	assertThrown!XmlParseException(new XmlNode("<a>"));
	assertThrown!XmlParseException(new XmlNode("<a b='c"));
	assertThrown!XmlParseException(new XmlNode("<!-- unterminated"));
	assertThrown!XmlParseException(new XmlDocument("<a><b></b>"));

	// A text node may be terminated by the end of the input.
	auto doc = new XmlDocument("<a/>tail");
	assert(doc.children.length == 2);
	assert(doc.children[1].type == XmlNodeType.Text);
	assert(doc.children[1].tag == "tail");
}

/// Entity name -> character, and the reverse mapping. Both tables are
/// filled in by the module constructor below.
const dchar[string] entities;
/*const*/ string[dchar] entityNames; /// ditto
shared static this()
{
	entities =
	[
		"quot" : '\&quot;',
		"amp" : '\&amp;',
		"lt" : '\&lt;',
		"gt" : '\&gt;',

		"OElig" : '\&OElig;',
		"oelig" : '\&oelig;',
		"Scaron" : '\&Scaron;',
		"scaron" : '\&scaron;',
		"Yuml" : '\&Yuml;',
		"circ" : '\&circ;',
		"tilde" : '\&tilde;',
		"ensp" : '\&ensp;',
		"emsp" : '\&emsp;',
		"thinsp" : '\&thinsp;',
		"zwnj" : '\&zwnj;',
		"zwj" : '\&zwj;',
		"lrm" : '\&lrm;',
		"rlm" : '\&rlm;',
		"ndash" : '\&ndash;',
		"mdash" : '\&mdash;',
		"lsquo" : '\&lsquo;',
		"rsquo" : '\&rsquo;',
		"sbquo" : '\&sbquo;',
		"ldquo" : '\&ldquo;',
		"rdquo" : '\&rdquo;',
		"bdquo" : '\&bdquo;',
		"dagger" : '\&dagger;',
		"Dagger" : '\&Dagger;',
		"permil" : '\&permil;',
		"lsaquo" : '\&lsaquo;',
		"rsaquo" : '\&rsaquo;',
		"euro" : '\&euro;',

		"nbsp" : '\&nbsp;',
		"iexcl" : '\&iexcl;',
		"cent" : '\&cent;',
		"pound" : '\&pound;',
		"curren" : '\&curren;',
		"yen" : '\&yen;',
		"brvbar" : '\&brvbar;',
		"sect" : '\&sect;',
		"uml" : '\&uml;',
		"copy" : '\&copy;',
		"ordf" : '\&ordf;',
		"laquo" : '\&laquo;',
		"not" : '\&not;',
		"shy" : '\&shy;',
		"reg" : '\&reg;',
		"macr" : '\&macr;',
		"deg" : '\&deg;',
		"plusmn" : '\&plusmn;',
		"sup2" : '\&sup2;',
		"sup3" : '\&sup3;',
		"acute" : '\&acute;',
		"micro" : '\&micro;',
		"para" : '\&para;',
		"middot" : '\&middot;',
		"cedil" : '\&cedil;',
		"sup1" : '\&sup1;',
		"ordm" : '\&ordm;',
		"raquo" : '\&raquo;',
		"frac14" : '\&frac14;',
		"frac12" : '\&frac12;',
		"frac34" : '\&frac34;',
		"iquest" : '\&iquest;',
		"Agrave" : '\&Agrave;',
		"Aacute" : '\&Aacute;',
		"Acirc" : '\&Acirc;',
		"Atilde" : '\&Atilde;',
		"Auml" : '\&Auml;',
		"Aring" : '\&Aring;',
		"AElig" : '\&AElig;',
		"Ccedil" : '\&Ccedil;',
		"Egrave" : '\&Egrave;',
		"Eacute" : '\&Eacute;',
		"Ecirc" : '\&Ecirc;',
		"Euml" : '\&Euml;',
		"Igrave" : '\&Igrave;',
		"Iacute" : '\&Iacute;',
		"Icirc" : '\&Icirc;',
		"Iuml" : '\&Iuml;',
		"ETH" : '\&ETH;',
		"Ntilde" : '\&Ntilde;',
		"Ograve" : '\&Ograve;',
		"Oacute" : '\&Oacute;',
		"Ocirc" : '\&Ocirc;',
		"Otilde" : '\&Otilde;',
		"Ouml" : '\&Ouml;',
		"times" : '\&times;',
		"Oslash" : '\&Oslash;',
		"Ugrave" : '\&Ugrave;',
		"Uacute" : '\&Uacute;',
		"Ucirc" : '\&Ucirc;',
		"Uuml" : '\&Uuml;',
		"Yacute" : '\&Yacute;',
		"THORN" : '\&THORN;',
		"szlig" : '\&szlig;',
		"agrave" : '\&agrave;',
		"aacute" : '\&aacute;',
		"acirc" : '\&acirc;',
		"atilde" : '\&atilde;',
		"auml" : '\&auml;',
		"aring" : '\&aring;',
		"aelig" : '\&aelig;',
		"ccedil" : '\&ccedil;',
		"egrave" : '\&egrave;',
		"eacute" : '\&eacute;',
		"ecirc" : '\&ecirc;',
		"euml" : '\&euml;',
		"igrave" : '\&igrave;',
		"iacute" : '\&iacute;',
		"icirc" : '\&icirc;',
		"iuml" : '\&iuml;',
		"eth" : '\&eth;',
		"ntilde" : '\&ntilde;',
		"ograve" : '\&ograve;',
		"oacute" : '\&oacute;',
		"ocirc" : '\&ocirc;',
		"otilde" : '\&otilde;',
		"ouml" : '\&ouml;',
		"divide" : '\&divide;',
		"oslash" : '\&oslash;',
		"ugrave" : '\&ugrave;',
		"uacute" : '\&uacute;',
		"ucirc" : '\&ucirc;',
		"uuml" : '\&uuml;',
		"yacute" : '\&yacute;',
		"thorn" : '\&thorn;',
		"yuml" : '\&yuml;',

		"fnof" : '\&fnof;',
		"Alpha" : '\&Alpha;',
		"Beta" : '\&Beta;',
		"Gamma" : '\&Gamma;',
		"Delta" : '\&Delta;',
		"Epsilon" : '\&Epsilon;',
		"Zeta" : '\&Zeta;',
		"Eta" : '\&Eta;',
		"Theta" : '\&Theta;',
		"Iota" : '\&Iota;',
		"Kappa" : '\&Kappa;',
		"Lambda" : '\&Lambda;',
		"Mu" : '\&Mu;',
		"Nu" : '\&Nu;',
		"Xi" : '\&Xi;',
		"Omicron" : '\&Omicron;',
		"Pi" : '\&Pi;',
		"Rho" : '\&Rho;',
		"Sigma" : '\&Sigma;',
		"Tau" : '\&Tau;',
		"Upsilon" : '\&Upsilon;',
		"Phi" : '\&Phi;',
		"Chi" : '\&Chi;',
		"Psi" : '\&Psi;',
		"Omega" : '\&Omega;',
		"alpha" : '\&alpha;',
		"beta" : '\&beta;',
		"gamma" : '\&gamma;',
		"delta" : '\&delta;',
		"epsilon" : '\&epsilon;',
		"zeta" : '\&zeta;',
		"eta" : '\&eta;',
		"theta" : '\&theta;',
		"iota" : '\&iota;',
		"kappa" : '\&kappa;',
		"lambda" : '\&lambda;',
		"mu" : '\&mu;',
		"nu" : '\&nu;',
		"xi" : '\&xi;',
		"omicron" : '\&omicron;',
		"pi" : '\&pi;',
		"rho" : '\&rho;',
		"sigmaf" : '\&sigmaf;',
		"sigma" : '\&sigma;',
		"tau" : '\&tau;',
		"upsilon" : '\&upsilon;',
		"phi" : '\&phi;',
		"chi" : '\&chi;',
		"psi" : '\&psi;',
		"omega" : '\&omega;',
		"thetasym" : '\&thetasym;',
		"upsih" : '\&upsih;',
		"piv" : '\&piv;',
		"bull" : '\&bull;',
		"hellip" : '\&hellip;',
		"prime" : '\&prime;',
		"Prime" : '\&Prime;',
		"oline" : '\&oline;',
		"frasl" : '\&frasl;',
		"weierp" : '\&weierp;',
		"image" : '\&image;',
		"real" : '\&real;',
		"trade" : '\&trade;',
		"alefsym" : '\&alefsym;',
		"larr" : '\&larr;',
		"uarr" : '\&uarr;',
		"rarr" : '\&rarr;',
		"darr" : '\&darr;',
		"harr" : '\&harr;',
		"crarr" : '\&crarr;',
		"lArr" : '\&lArr;',
		"uArr" : '\&uArr;',
		"rArr" : '\&rArr;',
		"dArr" : '\&dArr;',
		"hArr" : '\&hArr;',
		"forall" : '\&forall;',
		"part" : '\&part;',
		"exist" : '\&exist;',
		"empty" : '\&empty;',
		"nabla" : '\&nabla;',
		"isin" : '\&isin;',
		"notin" : '\&notin;',
		"ni" : '\&ni;',
		"prod" : '\&prod;',
		"sum" : '\&sum;',
		"minus" : '\&minus;',
		"lowast" : '\&lowast;',
		"radic" : '\&radic;',
		"prop" : '\&prop;',
		"infin" : '\&infin;',
		"ang" : '\&ang;',
		"and" : '\&and;',
		"or" : '\&or;',
		"cap" : '\&cap;',
		"cup" : '\&cup;',
		"int" : '\&int;',
		"there4" : '\&there4;',
		"sim" : '\&sim;',
		"cong" : '\&cong;',
		"asymp" : '\&asymp;',
		"ne" : '\&ne;',
		"equiv" : '\&equiv;',
		"le" : '\&le;',
		"ge" : '\&ge;',
		"sub" : '\&sub;',
		"sup" : '\&sup;',
		"nsub" : '\&nsub;',
		"sube" : '\&sube;',
		"supe" : '\&supe;',
		"oplus" : '\&oplus;',
		"otimes" : '\&otimes;',
		"perp" : '\&perp;',
		"sdot" : '\&sdot;',
		"lceil" : '\&lceil;',
		"rceil" : '\&rceil;',
		"lfloor" : '\&lfloor;',
		"rfloor" : '\&rfloor;',
		"loz" : '\&loz;',
		"spades" : '\&spades;',
		"clubs" : '\&clubs;',
		"hearts" : '\&hearts;',
		"diams" : '\&diams;',
		"lang" : '\&lang;',
		"rang" : '\&rang;',

		"apos" : '\''
	];
	foreach (name, c; entities)
		entityNames[c] = name;
}

import core.stdc.stdio;
import std.utf;
import ae.utils.textout;

/// Returns `str` with every character for which `pred` is true replaced by
/// its named entity. When nothing needs replacing, `str` itself is returned
/// without allocating. With `unicode` set the string is decoded into code
/// points before testing; otherwise it is examined one `char` at a time.
/*private*/ public string encodeEntitiesImpl(bool unicode, alias pred)(string str)
{
	size_t i = 0;
	while (i < str.length)
	{
		size_t o = i;
		static if (unicode)
			dchar c = decode(str, i);
		else
			char c = str[i++];

		if (pred(c))
		{
			// First character needing encoding found: copy the untouched
			// prefix and let the sink-based implementation do the rest.
			StringBuilder sb;
			sb.preallocate(str.length * 11 / 10);
			sb.put(str[0..o]);
			sb.putEncodedEntitiesImpl!(unicode, pred)(str[o..$]);
			return sb.get();
		}
	}
	return str;
}

/// Writes `str` to `sink`, replacing every character for which `pred` is
/// true by `&name;`, where the name is looked up in `entityNames`.
/*private*/ public template putEncodedEntitiesImpl(bool unicode, alias pred)
{
	void putEncodedEntitiesImpl(Sink, S)(ref Sink sink, S str)
	{
		size_t start = 0, i = 0;
		while (i < str.length)
		{
			size_t o = i;
			static if (unicode)
				dchar c = decode(str, i);
			else
				char c = str[i++];

			if (pred(c))
			{
				// Flush the pending literal run, then the entity.
				sink.put(str[start..o], '&', entityNames[c], ';');
				start = i;
			}
		}
		sink.put(str[start..$]);
	}
}

/// Encodes the five characters `<`, `>`, `"`, `'` and `&` as entities.
public alias encodeEntities = encodeEntitiesImpl!(false, (char c) => c=='<' || c=='>' || c=='"' || c=='\'' || c=='&');
/// Sink-based variant of `encodeEntities`.
public alias putEncodedEntities = putEncodedEntitiesImpl!(false, (char c) => c=='<' || c=='>' || c=='"' || c=='\'' || c=='&');

/// Replaces every character that has an entry in `entityNames` by its
/// named entity.
public string encodeAllEntities(string str)
{
	// TODO: optimize
	// Iterating in reverse keeps the indices of not-yet-visited characters
	// valid while the tail of `str` is being rewritten.
	foreach_reverse (i, dchar c; str)
	{
		auto name = c in entityNames;
		if (name)
			str = str[0..i] ~ '&' ~ *name ~ ';' ~ str[i+stride(str,i)..$];
	}
	return str;
}

import ae.utils.text;
import std.conv;

/// Replaces named (`&name;`), decimal (`&#NNN;`) and hexadecimal
/// (`&#xHHH;`) character references in `str` by the characters they denote.
/// A string without '&' is returned as is.
/// Throws: XmlParseException when an '&' is not followed by a non-empty
/// reference terminated by ';', or when a named entity is unknown.
/// NOTE(review): malformed numeric references presumably surface as the
/// exceptions of the conversion / UTF-encoding routines rather than as
/// XmlParseException - confirm.
public string decodeEntities(string str)
{
	auto fragments = str.fastSplit('&');
	if (fragments.length <= 1)
		return str;

	// Result pieces: literal text alternating with decoded characters.
	auto interleaved = new string[fragments.length*2 - 1];
	// One 4-byte buffer per reference, to hold its UTF-8 encoding.
	auto buffers = new char[4][fragments.length-1];
	interleaved[0] = fragments[0];

	foreach (n, fragment; fragments[1..$])
	{
		auto p = fragment.indexOf(';');
		enforce!XmlParseException(p>0, "Invalid entity (unescaped ampersand?)");

		dchar c;
		if (fragment[0]=='#')
		{
			if (fragment[1]=='x')
				c = fromHex!uint(fragment[2..p]);
			else
				c = to!uint(fragment[1..p]);
		}
		else
		{
			auto pentity = fragment[0..p] in entities;
			enforce!XmlParseException(pentity, "Unknown entity: " ~ fragment[0..p]);
			c = *pentity;
		}

		interleaved[1+n*2] = cast(string) buffers[n][0..std.utf.encode(buffers[n], c)];
		interleaved[2+n*2] = fragment[p+1..$];
	}

	return interleaved.join();
}

deprecated alias decodeEntities convertEntities;

unittest
{
	assert(encodeEntities(`The <Smith & Wesson> "lock'n'load"`) == `The &lt;Smith &amp; Wesson&gt; &quot;lock&apos;n&apos;load&quot;`);
	assert(encodeAllEntities("©,€") == "&copy;,&euro;");
	assert(decodeEntities("&copy;,&euro;") == "©,€");
}