/**
 * Light read-only XML library
 * Soon to be deprecated.
 * See other XML modules for better implementations.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 *   Simon Arlott
 */

module ae.utils.xmllite;

// TODO: better/safer handling of malformed XML

import std.stream;
import std.string;
import std.ascii;
import std.exception;

import ae.utils.xmlwriter;

// ************************************************************************

/// Stream-like type with bonus speed.
/// Exposes the subset of the Stream interface used by the parser
/// (read, seekCur, position, size) on top of an in-memory string.
private struct StringStream
{
    string s;        /// The text being parsed.
    size_t position; /// Index of the next character to be read.

    this(string s)
    {
        // The padding is appended and then sliced off again, so it stays in
        // memory right after the visible data. The pointer-based helpers
        // (which skip bounds checks) then hit a non-whitespace character
        // when they look one element past the end.
        enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
        this.s = (s ~ ditch)[0..$-ditch.length];
    }

    /// Reads one character and advances the position.
    void read(out char c) { c = s[position++]; }
    /// Moves the position by offset characters (may be negative).
    void seekCur(sizediff_t offset) { position += offset; }
    /// Length of the visible data (padding excluded).
    @property size_t size() { return s.length; }
}

// ************************************************************************

/// Kinds of nodes produced by the parser.
enum XmlNodeType
{
    Root,
    Node,
    Comment,
    Meta,
    DocType,
    Text
}

/// A node of the parsed XML tree.
/// The meaning of tag depends on type: element name for Node, target name
/// for Meta, decoded text for Text, raw contents for Comment and DocType.
class XmlNode
{
    string tag;
    string[string] attributes;
    XmlNode[] children;
    XmlNodeType type;
    /// Stream positions recorded when parsing of this node started / ended.
    ulong startPos, endPos;

    /// Parse a single node from the given source.
    this(Stream s) { parse(s); }
    this(StringStream* s) { parse(s); } /// ditto
    this(string s) { this(new StringStream(s)); } /// ditto

    /// Parses one node (and, for elements, all of its children) from s.
    /// Throws: Exception on the malformed constructs that are detected
    /// (invalid tag, unexpected close tag, unexpected character).
    private final void parse(S)(S s)
    {
        startPos = s.position;
        char c;
        // Skip leading whitespace; c ends up holding the first real character.
        do
            s.read(c);
        while (isWhiteChar[c]);

        if (c!='<') // text node
        {
            type = XmlNodeType.Text;
            string text;
            bool atEnd = false;
            while (c!='<')
            {
                text ~= c;
                // Stop cleanly when the input ends inside a text run
                // instead of reading past the end of the source.
                if (s.position >= s.size)
                {
                    atEnd = true;
                    break;
                }
                s.read(c);
            }
            if (!atEnd)
                s.seekCur(-1); // rewind to '<'
            tag = decodeEntities(text);
            //tag = tag.strip();
        }
        else
        {
            s.read(c);
            if (c=='!')
            {
                s.read(c);
                if (c == '-') // comment
                {
                    expect(s, '-');
                    type = XmlNodeType.Comment;
                    // Accumulate until the terminator has been read,
                    // then strip the terminator from the stored text.
                    do
                    {
                        s.read(c);
                        tag ~= c;
                    } while (tag.length<3 || tag[$-3..$] != "-->");
                    tag = tag[0..$-3];
                }
                else // doctype, etc.
                {
                    type = XmlNodeType.DocType;
                    // Everything between "<!" and the next '>' is kept verbatim.
                    while (c != '>')
                    {
                        tag ~= c;
                        s.read(c);
                    }
                }
            }
            else
            if (c=='?')
            {
                type = XmlNodeType.Meta;
                tag = readWord(s);
                if (tag.length==0) throw new Exception("Invalid tag");
                while (true)
                {
                    skipWhitespace(s);
                    if (peek(s)=='?')
                        break;
                    readAttribute(s);
                }
                s.read(c); // consume the '?'
                expect(s, '>');
            }
            else
            if (c=='/')
                throw new Exception("Unexpected close tag");
            else
            {
                type = XmlNodeType.Node;
                tag = c~readWord(s);
                while (true)
                {
                    skipWhitespace(s);
                    c = peek(s);
                    if (c=='>' || c=='/')
                        break;
                    readAttribute(s);
                }
                s.read(c);
                if (c=='>')
                {
                    // Element with content: read children until "</" is seen.
                    while (true)
                    {
                        skipWhitespace(s);
                        if (peek(s)=='<' && peek(s, 2)=='/')
                            break;
                        try
                            children ~= new XmlNode(s);
                        catch (Exception e)
                            throw new Exception("Error while processing child of "~tag, e);
                    }
                    // The closing tag must repeat the opening tag name exactly.
                    expect(s, '<');
                    expect(s, '/');
                    foreach (tc; tag)
                        expect(s, tc);
                    expect(s, '>');
                }
                else
                    expect(s, '>'); // self-closing element: "/>"
            }
        }
        endPos = s.position;
    }

    /// Create an empty node of the given type without parsing anything.
    this(XmlNodeType type, string tag = null)
    {
        this.type = type;
        this.tag = tag;
    }

    /// Sets (or overwrites) an attribute. Returns this node, for chaining.
    XmlNode addAttribute(string name, string value)
    {
        attributes[name] = value;
        return this;
    }

    /// Appends a child. Returns this node, for chaining.
    XmlNode addChild(XmlNode child)
    {
        children ~= child;
        return this;
    }

    /// Serializes this node and its subtree using XmlWriter.
    override string toString() const
    {
        XmlWriter writer;
        writeTo(writer);
        return writer.output.get();
    }

    /// Writes this node and its subtree to the given writer.
    /// Node types without a case below (such as Comment) produce no output.
    final void writeTo(XmlWriter)(ref XmlWriter output) const
    {
        void writeChildren()
        {
            foreach (child; children)
                child.writeTo(output);
        }

        void writeAttributes()
        {
            foreach (key, value; attributes)
                output.addAttribute(key, value);
        }

        switch(type)
        {
            case XmlNodeType.Root:
                writeChildren();
                return;
            case XmlNodeType.Node:
                output.startTagWithAttributes(tag);
                writeAttributes();
                output.endAttributes();
                writeChildren();
                output.endTag(tag);
                return;
            case XmlNodeType.Meta:
                assert(children.length == 0);
                output.startPI(tag);
                writeAttributes();
                output.endPI();
                return;
            case XmlNodeType.DocType:
                assert(children.length == 0);
                output.doctype(tag);
                return;
            case XmlNodeType.Text:
                output.text(tag);
                return;
            default:
                return;
        }
    }

    /// Text content: the node's own text for Text nodes, the concatenated
    /// text of all children for Node and Root, null for everything else.
    @property string text()
    {
        switch(type)
        {
            case XmlNodeType.Text:
                return tag;
            case XmlNodeType.Node:
            case XmlNodeType.Root:
                string childrenText;
                foreach (child; children)
                    childrenText ~= child.text();
                return childrenText;
            default:
                return null;
        }
    }

    /// Returns the first child element with the given tag, or null.
    final XmlNode findChild(string tag)
    {
        foreach (child; children)
            if (child.type == XmlNodeType.Node && child.tag == tag)
                return child;
        return null;
    }

    /// Returns all child elements with the given tag, in document order.
    final XmlNode[] findChildren(string tag)
    {
        XmlNode[] result;
        foreach (child; children)
            if (child.type == XmlNodeType.Node && child.tag == tag)
                result ~= child;
        return result;
    }

    /// Like findChild, but throws an Exception when there is no such child.
    final XmlNode opIndex(string tag)
    {
        auto node = findChild(tag);
        if (node is null)
            throw new Exception("No such child: " ~ tag);
        return node;
    }

    /// Returns the index-th child element with the given tag.
    /// Throws: Exception when there are not enough such children.
    final XmlNode opIndex(string tag, size_t index)
    {
        auto nodes = findChildren(tag);
        if (index >= nodes.length)
            throw new Exception(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
        return nodes[index];
    }

    /// Returns the child at the given position (any node type).
    final XmlNode opIndex(size_t index)
    {
        return children[index];
    }

    /// Number of direct children.
    final @property size_t length() { return children.length; }

    /// foreach support: iterates over the direct children.
    int opApply(int delegate(ref XmlNode) dg)
    {
        int result = 0;

        // size_t index: children.length is a size_t, an int counter
        // would be compared against it with mixed signedness.
        for (size_t i = 0; i < children.length; i++)
        {
            result = dg(children[i]);
            if (result)
                break;
        }
        return result;
    }

    /// Deep copy of type, tag, attributes and children.
    /// startPos / endPos are not copied.
    final @property XmlNode dup()
    {
        auto result = new XmlNode(type, tag);
        result.attributes = attributes.dup;
        result.children.length = children.length;
        foreach (i, child; children)
            result.children[i] = child.dup;
        return result;
    }

private:
    /// Parses one name="value" (or name='value') pair and stores it,
    /// with entities in the value decoded, in attributes.
    final void readAttribute(S)(S s)
    {
        string name = readWord(s);
        if (name.length==0) throw new Exception("Invalid attribute");
        skipWhitespace(s);
        expect(s, '=');
        skipWhitespace(s);
        char delim;
        s.read(delim);
        if (delim != '\'' && delim != '"')
            throw new Exception("Expected ' or \"");
        string value = readUntil(s, delim);
        attributes[name] = decodeEntities(value);
    }
}

/// Root of a parsed document; its children are the top-level nodes.
class XmlDocument : XmlNode
{
    /// Create an empty document.
    this()
    {
        super(XmlNodeType.Root);
        tag = "<Root>";
    }

    /// Parse a whole document from the given source.
    this(Stream s) { this(); parse(s); }
    this(StringStream* s) { this(); parse(s); } /// ditto
    this(string s) { this(new StringStream(s)); } /// ditto

    /// Reads top-level nodes until the end of the source is reached.
    /// Throws: Exception carrying the current position, chained to the
    /// original error.
    final void parse(S)(S s)
    {
        skipWhitespace(s);
        while (s.position < s.size)
            try
            {
                children ~= new XmlNode(s);
                skipWhitespace(s);
            }
            catch (Exception e)
                throw new Exception(format("Error at %d", s.position), e);
    }
}

/// Convenience wrapper around the XmlDocument constructors.
XmlDocument xmlParse(T)(T source) { return new XmlDocument(source); }

private:

/// Returns the n-th upcoming character (1 = next) without consuming input.
char peek(Stream s, int n=1)
{
    char c;
    for (int i=0; i<n; i++)
        s.read(c);
    s.seekCur(-n);
    return c;
}

/// ditto
char peek(StringStream* s, int n=1)
{
    return s.s[s.position + n - 1];
}

/// Advances past whitespace; stops at the first other character or at EOF.
void skipWhitespace(Stream s)
{
    char c;
    do
    {
        if (s.position==s.size)
            return;
        s.read(c);
    }
    while (isWhiteChar[c]);
    s.seekCur(-1);
}

/// ditto
void skipWhitespace(StringStream* s)
{
    // Unchecked pointer read: at the end of the data this lands on the
    // padding kept after the string by the StringStream constructor.
    while (isWhiteChar[s.s.ptr[s.position]])
        s.position++;
}

/// Lookup tables indexed by character code, filled in at module start-up.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
    foreach (c; 0..256)
    {
        isWhiteChar[c] = isWhite(c);
        isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
    }
}

/// Reads the longest run of word characters (see isWordChar), leaving the
/// position at the first character that is not part of the word.
string readWord(Stream s)
{
    char c;
    string result;
    while (true)
    {
        s.read(c);
        if (!isWordChar[c])
            break;
        result ~= c;
    }
    s.seekCur(-1);
    return result;
}

/// ditto
/// This overload returns a slice of the source instead of a copy.
string readWord(StringStream* stream)
{
    auto start = stream.s.ptr + stream.position;
    auto end = stream.s.ptr + stream.s.length;
    auto p = start;
    while (p < end && isWordChar[*p])
        p++;
    auto len = p-start;
    stream.position += len;
    return start[0..len];
}

/// Reads one character and throws unless it is c.
void expect(S)(S s, char c)
{
    char c2;
    s.read(c2);
    enforce(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}

/// Reads up to the next occurrence of until, which is consumed but not
/// included in the result.
string readUntil(Stream s, char until)
{
    string value;
    while (true)
    {
        char c;
        s.read(c);
        if (c==until)
            return value;
        value ~= c;
    }
}

/// ditto
/// This overload returns a slice of the source instead of a copy.
/// Throws: Exception if the data ends before until is found.
string readUntil(StringStream* s, char until)
{
    auto start = s.s.ptr + s.position;
    auto end = s.s.ptr + s.s.length;
    auto p = start;
    // Bounded scan: never run past the visible data, so the result cannot
    // contain padding bytes and position cannot end up beyond the end.
    while (p < end && *p != until)
        p++;
    enforce(p < end, "Unexpected end of input, expected " ~ until);
    auto len = p-start;
    s.position += len + 1;
    return start[0..len];
}

unittest
{
    enum xmlText =
        `<?xml version="1.0" encoding="UTF-8"?>` ~
        `<quotes>` ~
            `<quote author="Alan Perlis">` ~
                `When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
            `</quote>` ~
        `</quotes>`;
    auto doc = new XmlDocument(new MemoryStream(xmlText.dup));
    assert(doc.toString() == xmlText);
    doc = new XmlDocument(xmlText);
    assert(doc.toString() == xmlText);
}

/// Entity name -> character, and the reverse mapping.
const dchar[string] entities;
/*const*/ string[dchar] entityNames;
static this()
{
    // Characters are spelled as ASCII literals or \u escapes so that the
    // table does not depend on how this file is encoded or transported.
    entities =
    [
        "quot"[]: cast(dchar) '"',
        "amp"   : '&'     ,
        "lt"    : '<'     ,
        "gt"    : '>'     ,
        "circ"  : '\u02C6',
        "tilde" : '\u02DC',
        "nbsp"  : '\u00A0',
        "ensp"  : '\u2002',
        "emsp"  : '\u2003',
        "thinsp": '\u2009',
        "ndash" : '\u2013',
        "mdash" : '\u2014',
        "lsquo" : '\u2018',
        "rsquo" : '\u2019',
        "sbquo" : '\u201A',
        "ldquo" : '\u201C',
        "rdquo" : '\u201D',
        "bdquo" : '\u201E',
        "dagger": '\u2020',
        "Dagger": '\u2021',
        "permil": '\u2030',
        "laquo" : '\u00AB',
        "raquo" : '\u00BB',
        "lsaquo": '\u2039',
        "rsaquo": '\u203A',
        "euro"  : '\u20AC',
        "copy"  : '\u00A9',
        "reg"   : '\u00AE',
        "apos"  : '\''
    ];
    foreach (name, c; entities)
        entityNames[c] = name;
}

import std.utf;
import core.stdc.stdio;

/// Replaces the five XML special characters (< > " ' &) with entity
/// references. All other characters are left untouched.
public string encodeEntities(string str)
{
    // TODO: optimize
    // Walk backwards so that indices of not-yet-visited characters stay valid
    // while the string grows.
    foreach_reverse (i, c; str)
        if (c=='<' || c=='>' || c=='"' || c=='\'' || c=='&')
            str = str[0..i] ~ '&' ~ entityNames[c] ~ ';' ~ str[i+1..$];
    return str;
}

/// Replaces every character that has a name in entityNames with its
/// entity reference.
public string encodeAllEntities(string str)
{
    // TODO: optimize
    foreach_reverse (i, dchar c; str)
    {
        auto name = c in entityNames;
        if (name)
            str = str[0..i] ~ '&' ~ *name ~ ';' ~ str[i+stride(str,i)..$];
    }
    return str;
}

import ae.utils.text;
import std.conv;

/// Replaces named (&name;), decimal (&#N;) and hexadecimal (&#xN;) entity
/// references with the characters they denote.
/// Returns: str itself when it contains no '&'.
/// Throws: Exception on an '&' without a following ';' or on an unknown
/// entity name.
public string decodeEntities(string str)
{
    auto fragments = str.fastSplit('&');
    if (fragments.length <= 1)
        return str;

    // Result pieces: text, decoded char, text, decoded char, ..., text.
    auto interleaved = new string[fragments.length*2 - 1];
    // One 4-byte scratch buffer per entity to hold its UTF-8 encoding.
    auto buffers = new char[4][fragments.length-1];
    interleaved[0] = fragments[0];

    foreach (n, fragment; fragments[1..$])
    {
        auto p = fragment.indexOf(';');
        enforce(p>0, "Invalid entity (unescaped ampersand?)");

        dchar c;
        if (fragment[0]=='#')
        {
            if (fragment[1]=='x')
                c = fromHex!uint(fragment[2..p]);
            else
                c = to!uint(fragment[1..p]);
        }
        else
        {
            auto pentity = fragment[0..p] in entities;
            enforce(pentity, "Unknown entity: " ~ fragment[0..p]);
            c = *pentity;
        }

        interleaved[1+n*2] = cast(string) buffers[n][0..std.utf.encode(buffers[n], c)];
        interleaved[2+n*2] = fragment[p+1..$];
    }

    return interleaved.join();
}

deprecated alias decodeEntities convertEntities;

unittest
{
    // "\u00A9,\u20AC" is the copyright sign, a comma and the euro sign.
    assert(encodeAllEntities("\u00A9,\u20AC") == "&copy;,&euro;");
    assert(decodeEntities("&copy;,&euro;") == "\u00A9,\u20AC");
}