/**
 * Light read-only XML library
 * May be deprecated in the future.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 *   Simon Arlott
 */

module ae.utils.xmllite;

// TODO: better/safer handling of malformed XML

import std.string;
import std.ascii;
import std.exception;

import ae.utils.array;
import ae.utils.exception;
import ae.utils.xmlwriter;

// ************************************************************************

/// std.stream.Stream-like type with bonus speed
private struct StringStream
{
	/// The text being parsed.
	string s;
	/// Index (in code units) of the next character `read` will return.
	size_t position;

	@disable this();
	@disable this(this);
	this(string s)
	{
		// Appending the sentinel and slicing it back off leaves `'">` and
		// NULs in memory directly after the visible string. Presumably this
		// is so that the raw-pointer scans elsewhere in this module run
		// into a terminator rather than into unallocated memory.
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Returns the next character and advances `position`.
	/// Throws: XmlParseException at end of input. `position` is left
	/// untouched in that case, so `s[0..position]` stays a valid slice
	/// for callers that report the error location.
	char read()
	{
		if (position >= s.length)
			throw new XmlParseException("Unexpected end of input");
		return s[position++];
	}

	/// Total length of the input, in code units.
	@property size_t size() { return s.length; }
}

// ************************************************************************

mixin DeclareException!q{XmlParseException};

/// The kind of content an `XmlNode` holds.
enum XmlNodeType
{
	None,
	Root,
	Node,
	Comment,
	Meta,
	DocType,
	CData,
	Text
}

alias XmlAttributes = OrderedMap!(string, string);

/// A node of a parsed (or manually built) XML tree.
class XmlNode
{
	/// Element / processing-instruction name for `Node` and `Meta`;
	/// the raw content for `Text`, `CData`, `Comment` and `DocType`.
	string tag;
	XmlAttributes attributes;
	XmlNode parent;
	XmlNode[] children;
	XmlNodeType type;
	/// Stream positions recorded by `parseInto` while parsing this node.
	ulong startPos, endPos;

	/// Parse a single node from the stream using `XmlParseConfig`.
	this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }
	/// Parse a single node from a string using `XmlParseConfig`.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Create a node without parsing anything.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set (or overwrite) an attribute. Returns `this` for chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Append `child` and make this node its parent.
	/// Returns `this` (not the child) for chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Serialize this node and its descendants with a default `XmlWriter`.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serialize this node and its descendants into `output`.
	/// Note that the template parameter shadows the module-level
	/// `XmlWriter` type inside this function.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// An element holding a single text node is written on one
					// line: formatting is switched off around its content.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				// Written through the same call as Text nodes.
				output.text(tag);
				return;
		}
	}

	/// The textual content of this node.
	/// `Text` / `CData` nodes yield their content; `Node` / `Root` yield
	/// the concatenated text of all children (with a leading newline when
	/// the tag is "br"); comments, processing instructions and doctypes
	/// yield `null`.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
		}
	}

	/// First direct child element named `tag`, or `null` if there is none.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// All direct child elements named `tag`, in document order.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws `XmlParseException` when no such child exists.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// The `index`-th direct child element named `tag`.
	/// Throws: XmlParseException when there are not enough such children.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// The `index`-th child of any type.
	final XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Number of direct children.
	final @property size_t length() { return children.length; }

	/// `foreach` support: iterates over the direct children.
	int opApply(int delegate(ref XmlNode) dg)
	{
		// size_t index: `children.length` is a size_t, an int counter would
		// be compared with mixed signedness. The length is re-read on every
		// pass, as before.
		for (size_t i = 0; i < children.length; i++)
		{
			if (auto result = dg(children[i]))
				return result;
		}
		return 0;
	}

	/// Deep copy of this subtree (type, tag, attributes and children).
	/// `parent`, `startPos` and `endPos` are not copied to the new root.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}

/// A root node holding every top-level node of a document.
class XmlDocument : XmlNode
{
	/// Create an empty document.
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	/// Parse a whole document from the stream using `XmlParseConfig`.
	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }
	/// Parse a whole document from a string using `XmlParseConfig`.
	this(string s) { auto ss = StringStream(s); this(ss); }
}

/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// The element needs an explicit closing tag or must be
	/// self-closing; leaving it unclosed is a parse error.
	/// Every XML tag behaves this way.
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// A closing tag may be omitted, in which case it is implied.
	/// Such elements therefore cannot carry content, and a closing
	/// tag, when present, has to directly follow the opening tag.
	implicit,

	/// A void element: it is closed implicitly as soon as it is
	/// opened, and writing a closing tag for it is always an error.
	/// Useful for strict parsing of HTML5 void elements.
	never,
}

/// Parser configuration for plain XML.
struct XmlParseConfig
{
	/// Attributes always need a value.
	enum optionalParameterValues = false;

	/// Every tag has to be closed explicitly.
	static NodeCloseMode nodeCloseMode(string tag)
	{
		return NodeCloseMode.always;
	}

	/// Whitespace is never preserved.
	static bool preserveWhitespace(string tag)
	{
		return false;
	}
}

/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	/// Tags that are treated as void elements.
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	/// Void elements are never closed; every other tag must be closed.
	NodeCloseMode nodeCloseMode(string tag)
	{
		if (tag.isOneOf(voidElements))
			return NodeCloseMode.never;
		return NodeCloseMode.always;
	}

	/// Attributes may appear without a value.
	enum optionalParameterValues = true;

	bool preserveWhitespace(string tag) { return false; /*TODO*/ }
}

/// Parse an SGML-ish string into an XmlNode
alias parse = parseString!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument
alias parseDocument = parseString!XmlDocument;

alias xmlParse = parseDocument!XmlParseConfig;

private:

/// Wraps a string in a StringStream and parses it into a new `Node`.
public // alias
template parseString(Node)
{
	Node parseString(Config)(string s)
	{
		auto stream = StringStream(s);
		// Two-step instantiation: pick the node type first, then the config.
		alias streamParser = parseStream!Node;
		return streamParser!Config(stream);
	}
}

/// Allocates a new `Node` and parses the stream into it.
template parseStream(Node)
{
	Node parseStream(Config)(ref StringStream s)
	{
		auto result = new Node;
		parseInto!Config(result, s);
		return result;
	}
}

alias parseNode = parseStream!XmlNode;

/// Parse an SGML-ish StringStream into an XmlDocument.
/// Top-level nodes are parsed one after another until the input is
/// exhausted. A parse error is re-thrown wrapped in an XmlParseException
/// that carries the row, column and offset reached in the input.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	try
	{
		while (s.position < s.size)
		{
			auto topLevel = new XmlNode;
			parseInto!Config(topLevel, s, null);
			d.addChild(topLevel);
			skipWhitespace(s);
		}
	}
	catch (XmlParseException e)
	{
		import std.algorithm.searching;
		import std.range : retro;

		// Everything consumed so far; counting is done on raw bytes.
		auto consumed = s.s[0..s.position];
		auto bytes = consumed.representation;
		auto row = bytes.count('\n');
		// Bytes since the last newline, or the whole prefix on the first line.
		auto column = bytes.retro.countUntil('\n');
		if (column < 0)
			column = consumed.length;
		throw new XmlParseException("Error at %d:%d (offset %d)".format(
			1 + row,
			1 + column,
			consumed.length,
		), e);
	}
}

/// Parse an SGML-ish StringStream into an XmlNode
/// Params:
///   node               = node to fill in (type, tag, attributes, children, positions)
///   s                  = input stream, advanced past the parsed node
///   parentTag          = tag of the enclosing element, passed to `Config.preserveWhitespace`
///   preserveWhitespace = whether an ancestor already requested whitespace preservation
/// Throws: XmlParseException on malformed input detected by the parser.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = s.read();
	else
		do
			c = s.read();
		while (isWhiteChar[c]);

	node.startPos = s.position;
	if (c!='<') // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (c!='<')
		{
			// TODO: check for EOF
			text ~= c;
			c = s.read();
		}
		s.position--; // rewind to '<'
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the terminator shows up, then cut it off.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // the '?' seen by peek
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			node.tag = c~readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // '>' or '/'

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					while (true)
					{
						// Parse children until a close tag ("</") is next.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								// NOTE(review): the parsers in this file attach a node to
								// its parent only after parseInto returns, so `node.parent`
								// looks like it is still null at this point - confirm that
								// this branch is reachable as intended.
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								// NOTE(review): the '>' of this close tag has not been
								// consumed before continuing - verify.
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}

private:

/// Parse one `name="value"` (or `name='value'`) attribute into `node`.
/// With `Config.optionalParameterValues`, a bare `name` is stored with a
/// `null` value.
/// Throws: XmlParseException on a missing name, '=' or quote.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}

/// Look at the `n`-th upcoming character (1 = next) without consuming it.
/// Throws: XmlParseException when that character lies beyond the end of
/// the input (previously this surfaced as a range violation).
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	if (index >= s.s.length)
		throw new XmlParseException("Unexpected end of input");
	return s.s[index];
}

/// Advance the stream past any whitespace.
void skipWhitespace(ref StringStream s)
{
	// Unchecked access: at the end of the input this reads the byte stored
	// just past the string.
	while (isWhiteChar[s.s.ptr[s.position]])
		s.position++;
}

/// Lookup tables indexed by character code, filled in at module start-up.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}

/// Consume and return the longest run of word characters (alphanumerics,
/// '-', '_' and ':') at the current position; may be empty.
string readWord(ref StringStream stream)
{
	auto start = stream.s.ptr + stream.position;
	auto end = stream.s.ptr + stream.s.length;
	auto p = start;
	while (p < end && isWordChar[*p])
		p++;
	auto len = p-start;
	stream.position += len;
	return start[0..len];
}

/// Consume one character and require it to be `c`.
/// Throws: XmlParseException on mismatch.
void expect(ref StringStream s, char c)
{
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Consume everything up to and including the next `until` character and
/// return the text before it.
/// Throws: XmlParseException when `until` does not occur in the remaining
/// input. The scan is bounded by the end of the string, so it can no
/// longer return bytes from beyond the input or push `position` past it.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	while (p < end && *p != until) p++;
	enforce!XmlParseException(p < end, "Unterminated value, expected " ~ until);
	auto len = p-start;
	s.position += len + 1;
	return start[0..len];
}

unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, &quot;I want a programming language in which I need only say what I want done,&quot; give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText, doc.toString());
}

unittest
{
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto node = new XmlNode;
		auto str = StringStream("<tag>" ~ s ~ "</tag>");
		parseInto!ParseConfig(node, str, null);
		// import std.stdio; writeln(preserve, ": ", str.s, " -> ", node.toString);
		return node.children.length ? node.children[0].tag : null;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a a ", " ", ""])
	{
		assert(testOne!false(tag) == strip(tag),
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ strip(tag) ~ "', got '" ~ testOne!false(tag) ~ "'");
		assert(testOne!true(tag) == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ testOne!true(tag) ~ "'");
	}
}

unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto node = new XmlNode;
	auto str = StringStream("<a><b> foo </b></a>");
	parseInto!ParseConfig(node, str, null);
	assert(node.children[0].children[0].tag == " foo ");
}

/// Entity name -> character, and the reverse mapping.
/// Both are filled in by the module constructor below.
const dchar[string] entities;
/*const*/ string[dchar] entityNames;
shared static this()
{
	// The values are written with D's named character entity escapes
	// (`\&name;`), so each entry is the character its own key names.
	entities =
	[
		"quot" : '\&quot;',
		"amp" : '\&amp;',
		"lt" : '\&lt;',
		"gt" : '\&gt;',

		"OElig" : '\&OElig;',
		"oelig" : '\&oelig;',
		"Scaron" : '\&Scaron;',
		"scaron" : '\&scaron;',
		"Yuml" : '\&Yuml;',
		"circ" : '\&circ;',
		"tilde" : '\&tilde;',
		"ensp" : '\&ensp;',
		"emsp" : '\&emsp;',
		"thinsp" : '\&thinsp;',
		"zwnj" : '\&zwnj;',
		"zwj" : '\&zwj;',
		"lrm" : '\&lrm;',
		"rlm" : '\&rlm;',
		"ndash" : '\&ndash;',
		"mdash" : '\&mdash;',
		"lsquo" : '\&lsquo;',
		"rsquo" : '\&rsquo;',
		"sbquo" : '\&sbquo;',
		"ldquo" : '\&ldquo;',
		"rdquo" : '\&rdquo;',
		"bdquo" : '\&bdquo;',
		"dagger" : '\&dagger;',
		"Dagger" : '\&Dagger;',
		"permil" : '\&permil;',
		"lsaquo" : '\&lsaquo;',
		"rsaquo" : '\&rsaquo;',
		"euro" : '\&euro;',

		"nbsp" : '\&nbsp;',
		"iexcl" : '\&iexcl;',
		"cent" : '\&cent;',
		"pound" : '\&pound;',
		"curren" : '\&curren;',
		"yen" : '\&yen;',
		"brvbar" : '\&brvbar;',
		"sect" : '\&sect;',
		"uml" : '\&uml;',
		"copy" : '\&copy;',
		"ordf" : '\&ordf;',
		"laquo" : '\&laquo;',
		"not" : '\&not;',
		"shy" : '\&shy;',
		"reg" : '\&reg;',
		"macr" : '\&macr;',
		"deg" : '\&deg;',
		"plusmn" : '\&plusmn;',
		"sup2" : '\&sup2;',
		"sup3" : '\&sup3;',
		"acute" : '\&acute;',
		"micro" : '\&micro;',
		"para" : '\&para;',
		"middot" : '\&middot;',
		"cedil" : '\&cedil;',
		"sup1" : '\&sup1;',
		"ordm" : '\&ordm;',
		"raquo" : '\&raquo;',
		"frac14" : '\&frac14;',
		"frac12" : '\&frac12;',
		"frac34" : '\&frac34;',
		"iquest" : '\&iquest;',
		"Agrave" : '\&Agrave;',
		"Aacute" : '\&Aacute;',
		"Acirc" : '\&Acirc;',
		"Atilde" : '\&Atilde;',
		"Auml" : '\&Auml;',
		"Aring" : '\&Aring;',
		"AElig" : '\&AElig;',
		"Ccedil" : '\&Ccedil;',
		"Egrave" : '\&Egrave;',
		"Eacute" : '\&Eacute;',
		"Ecirc" : '\&Ecirc;',
		"Euml" : '\&Euml;',
		"Igrave" : '\&Igrave;',
		"Iacute" : '\&Iacute;',
		"Icirc" : '\&Icirc;',
		"Iuml" : '\&Iuml;',
		"ETH" : '\&ETH;',
		"Ntilde" : '\&Ntilde;',
		"Ograve" : '\&Ograve;',
		"Oacute" : '\&Oacute;',
		"Ocirc" : '\&Ocirc;',
		"Otilde" : '\&Otilde;',
		"Ouml" : '\&Ouml;',
		"times" : '\&times;',
		"Oslash" : '\&Oslash;',
		"Ugrave" : '\&Ugrave;',
		"Uacute" : '\&Uacute;',
		"Ucirc" : '\&Ucirc;',
		"Uuml" : '\&Uuml;',
		"Yacute" : '\&Yacute;',
		"THORN" : '\&THORN;',
		"szlig" : '\&szlig;',
		"agrave" : '\&agrave;',
		"aacute" : '\&aacute;',
		"acirc" : '\&acirc;',
		"atilde" : '\&atilde;',
		"auml" : '\&auml;',
		"aring" : '\&aring;',
		"aelig" : '\&aelig;',
		"ccedil" : '\&ccedil;',
		"egrave" : '\&egrave;',
		"eacute" : '\&eacute;',
		"ecirc" : '\&ecirc;',
		"euml" : '\&euml;',
		"igrave" : '\&igrave;',
		"iacute" : '\&iacute;',
		"icirc" : '\&icirc;',
		"iuml" : '\&iuml;',
		"eth" : '\&eth;',
		"ntilde" : '\&ntilde;',
		"ograve" : '\&ograve;',
		"oacute" : '\&oacute;',
		"ocirc" : '\&ocirc;',
		"otilde" : '\&otilde;',
		"ouml" : '\&ouml;',
		"divide" : '\&divide;',
		"oslash" : '\&oslash;',
		"ugrave" : '\&ugrave;',
		"uacute" : '\&uacute;',
		"ucirc" : '\&ucirc;',
		"uuml" : '\&uuml;',
		"yacute" : '\&yacute;',
		"thorn" : '\&thorn;',
		"yuml" : '\&yuml;',

		"fnof" : '\&fnof;',
		"Alpha" : '\&Alpha;',
		"Beta" : '\&Beta;',
		"Gamma" : '\&Gamma;',
		"Delta" : '\&Delta;',
		"Epsilon" : '\&Epsilon;',
		"Zeta" : '\&Zeta;',
		"Eta" : '\&Eta;',
		"Theta" : '\&Theta;',
		"Iota" : '\&Iota;',
		"Kappa" : '\&Kappa;',
		"Lambda" : '\&Lambda;',
		"Mu" : '\&Mu;',
		"Nu" : '\&Nu;',
		"Xi" : '\&Xi;',
		"Omicron" : '\&Omicron;',
		"Pi" : '\&Pi;',
		"Rho" : '\&Rho;',
		"Sigma" : '\&Sigma;',
		"Tau" : '\&Tau;',
		"Upsilon" : '\&Upsilon;',
		"Phi" : '\&Phi;',
		"Chi" : '\&Chi;',
		"Psi" : '\&Psi;',
		"Omega" : '\&Omega;',
		"alpha" : '\&alpha;',
		"beta" : '\&beta;',
		"gamma" : '\&gamma;',
		"delta" : '\&delta;',
		"epsilon" : '\&epsilon;',
		"zeta" : '\&zeta;',
		"eta" : '\&eta;',
		"theta" : '\&theta;',
		"iota" : '\&iota;',
		"kappa" : '\&kappa;',
		"lambda" : '\&lambda;',
		"mu" : '\&mu;',
		"nu" : '\&nu;',
		"xi" : '\&xi;',
		"omicron" : '\&omicron;',
		"pi" : '\&pi;',
		"rho" : '\&rho;',
		"sigmaf" : '\&sigmaf;',
		"sigma" : '\&sigma;',
		"tau" : '\&tau;',
		"upsilon" : '\&upsilon;',
		"phi" : '\&phi;',
		"chi" : '\&chi;',
		"psi" : '\&psi;',
		"omega" : '\&omega;',
		"thetasym" : '\&thetasym;',
		"upsih" : '\&upsih;',
		"piv" : '\&piv;',
		"bull" : '\&bull;',
		"hellip" : '\&hellip;',
		"prime" : '\&prime;',
		"Prime" : '\&Prime;',
		"oline" : '\&oline;',
		"frasl" : '\&frasl;',
		"weierp" : '\&weierp;',
		"image" : '\&image;',
		"real" : '\&real;',
		"trade" : '\&trade;',
		"alefsym" : '\&alefsym;',
		"larr" : '\&larr;',
		"uarr" : '\&uarr;',
		"rarr" : '\&rarr;',
		"darr" : '\&darr;',
		"harr" : '\&harr;',
		"crarr" : '\&crarr;',
		"lArr" : '\&lArr;',
		"uArr" : '\&uArr;',
		"rArr" : '\&rArr;',
		"dArr" : '\&dArr;',
		"hArr" : '\&hArr;',
		"forall" : '\&forall;',
		"part" : '\&part;',
		"exist" : '\&exist;',
		"empty" : '\&empty;',
		"nabla" : '\&nabla;',
		"isin" : '\&isin;',
		"notin" : '\&notin;',
		"ni" : '\&ni;',
		"prod" : '\&prod;',
		"sum" : '\&sum;',
		"minus" : '\&minus;',
		"lowast" : '\&lowast;',
		"radic" : '\&radic;',
		"prop" : '\&prop;',
		"infin" : '\&infin;',
		"ang" : '\&ang;',
		"and" : '\&and;',
		"or" : '\&or;',
		"cap" : '\&cap;',
		"cup" : '\&cup;',
		"int" : '\&int;',
		"there4" : '\&there4;',
		"sim" : '\&sim;',
		"cong" : '\&cong;',
		"asymp" : '\&asymp;',
		"ne" : '\&ne;',
		"equiv" : '\&equiv;',
		"le" : '\&le;',
		"ge" : '\&ge;',
		"sub" : '\&sub;',
		"sup" : '\&sup;',
		"nsub" : '\&nsub;',
		"sube" : '\&sube;',
		"supe" : '\&supe;',
		"oplus" : '\&oplus;',
		"otimes" : '\&otimes;',
		"perp" : '\&perp;',
		"sdot" : '\&sdot;',
		"lceil" : '\&lceil;',
		"rceil" : '\&rceil;',
		"lfloor" : '\&lfloor;',
		"rfloor" : '\&rfloor;',
		"loz" : '\&loz;',
		"spades" : '\&spades;',
		"clubs" : '\&clubs;',
		"hearts" : '\&hearts;',
		"diams" : '\&diams;',
		"lang" : '\&lang;',
		"rang" : '\&rang;',

		"apos" : '\''
	];
	foreach (name, c; entities)
		entityNames[c] = name;
}

import core.stdc.stdio;
import std.utf;
import ae.utils.textout;

/// Replace `<`, `>`, `"`, `'` and `&` with their named entities.
/// Returns the input string itself when nothing needs encoding.
public string encodeEntities(string str)
{
	foreach (i, c; str)
		if (c=='<' || c=='>' || c=='"' || c=='\'' || c=='&')
		{
			// First special character found: copy the clean prefix and
			// encode the remainder into a builder.
			StringBuilder sb;
			sb.preallocate(str.length * 11 / 10);
			sb.put(str[0..i]);
			sb.putEncodedEntities(str[i..$]);
			return sb.get();
		}
	return str;
}

/// Write `str` to `sink`, replacing `<`, `>`, `"`, `'` and `&` with their
/// named entities.
public void putEncodedEntities(Sink, S)(ref Sink sink, S str)
{
	size_t start = 0;
	foreach (i, c; str)
		if (c=='<' || c=='>' || c=='"' || c=='\'' || c=='&')
		{
			sink.put(str[start..i], '&', entityNames[c], ';');
			start = i+1;
		}
	sink.put(str[start..$]);
}

/// Replace every character that has a name in `entityNames` with its
/// named entity.
public string encodeAllEntities(string str)
{
	// TODO: optimize
	// Walking backwards keeps the indices of the not-yet-visited part valid
	// while the string is being rebuilt.
	foreach_reverse (i, dchar c; str)
	{
		auto name = c in entityNames;
		if (name)
			str = str[0..i] ~ '&' ~ *name ~ ';' ~ str[i+stride(str,i)..$];
	}
	return str;
}

import ae.utils.text;
import std.conv;

/// Replace named (`&amp;`), decimal (`&#38;`) and hexadecimal (`&#x26;`)
/// entities in `str` with the characters they stand for.
/// Returns the input string itself when it contains no '&'.
/// Throws: XmlParseException on an unterminated, unknown or malformed
/// entity, including numeric entities without digits or outside the valid
/// Unicode range.
public string decodeEntities(string str)
{
	auto fragments = str.fastSplit('&');
	if (fragments.length <= 1)
		return str;

	// Result pieces: text, decoded char, text, decoded char, ...
	auto interleaved = new string[fragments.length*2 - 1];
	// One 4-byte buffer per entity for its UTF-8 encoding.
	auto buffers = new char[4][fragments.length-1];
	interleaved[0] = fragments[0];

	foreach (n, fragment; fragments[1..$])
	{
		auto p = fragment.indexOf(';');
		enforce!XmlParseException(p>0, "Invalid entity (unescaped ampersand?)");

		dchar c;
		if (fragment[0]=='#')
		{
			// p > 0 guarantees fragment[1] exists (it is ';' for "&#;").
			auto isHex = fragment[1]=='x';
			auto digits = fragment[(isHex ? 2 : 1)..p];
			enforce!XmlParseException(digits.length, "Invalid numeric entity: &" ~ fragment[0..p] ~ ";");

			uint code;
			try
			{
				if (isHex)
					code = fromHex!uint(digits);
				else
					code = to!uint(digits);
			}
			catch (Exception e)
			{
				// Report conversion failures as parse errors, like every
				// other malformed-input condition in this function.
				throw new XmlParseException("Invalid numeric entity: &" ~ fragment[0..p] ~ ";", e);
			}
			// Reject surrogates and values above U+10FFFF here, before
			// the UTF-8 encoding step below.
			enforce!XmlParseException(isValidDchar(cast(dchar)code), "Invalid character code in entity: &" ~ fragment[0..p] ~ ";");
			c = cast(dchar)code;
		}
		else
		{
			auto pentity = fragment[0..p] in entities;
			enforce!XmlParseException(pentity, "Unknown entity: " ~ fragment[0..p]);
			c = *pentity;
		}

		interleaved[1+n*2] = cast(string) buffers[n][0..std.utf.encode(buffers[n], c)];
		interleaved[2+n*2] = fragment[p+1..$];
	}

	return interleaved.join();
}

deprecated alias decodeEntities convertEntities;

unittest
{
	assert(encodeEntities(`The <Smith & Wesson> "lock'n'load"`) == `The &lt;Smith &amp; Wesson&gt; &quot;lock&apos;n&apos;load&quot;`);
	assert(encodeAllEntities("©,€") == "&copy;,&euro;");
	assert(decodeEntities("&copy;,&euro;") == "©,€");
}