1 /**
2  * Light read-only XML library
3  * May be deprecated in the future.
4  * See other XML modules for better implementations.
5  *
6  * License:
7  *   This Source Code Form is subject to the terms of
8  *   the Mozilla Public License, v. 2.0. If a copy of
9  *   the MPL was not distributed with this file, You
10  *   can obtain one at http://mozilla.org/MPL/2.0/.
11  *
12  * Authors:
13  *   Vladimir Panteleev <ae@cy.md>
14  *   Simon Arlott
15  */
16 
17 module ae.utils.xml.lite;
18 
19 // TODO: better/safer handling of malformed XML
20 
21 import std.string;
22 import std.ascii;
23 import std.exception;
24 
25 import ae.utils.array;
26 import ae.utils.xml.common;
27 import ae.utils.xml.entities;
28 import ae.utils.xmlwriter;
29 
30 // ************************************************************************
31 
/// Minimal forward-only character reader over a string, in the
/// spirit of std.stream.Stream but without its overhead.
private struct StringStream
{
	string s;
	size_t position;

	@disable this();
	@disable this(this);

	/// Wraps `s` for reading, starting at offset 0.
	this(string s)
	{
		// Dirty precaution: the stored slice is cut out of a fresh copy
		// which has these guard bytes sitting right behind it in memory.
		enum guard = "'\">\0\0\0\0\0";
		auto padded = s ~ guard;
		this.s = padded[0 .. padded.length - guard.length];
	}

	/// Returns the character at the current position and steps past it.
	char read()
	{
		auto index = position++;
		return s[index];
	}

	/// Total number of characters in the wrapped string.
	@property size_t size()
	{
		return s.length;
	}
}
49 
50 // ************************************************************************
51 
/// The type of an `XmlNode`.
enum XmlNodeType
{
	/// Initial value. Never created during parsing.
	None,
	/// The root node. Contains top-level nodes as children.
	Root,
	/// XML tag.
	Node,
	/// XML comment.
	Comment,
	/// XML processing instruction.
	Meta,
	/// XML doctype declaration.
	DocType,
	/// CDATA node.
	CData,
	/// Text node.
	Text,
	/// Never created during parsing. Programs can put raw XML
	/// fragments in `Raw` nodes to emit them as-is.
	Raw,
}
65 
/// Type used to hold a tag node's attributes: a map from attribute
/// name to attribute value, both stored as strings.
alias XmlAttributes = OrderedMap!(string, string);
68 
/// An XML node.
/// A node is a tag, a piece of text, a comment, a processing
/// instruction, a doctype declaration, a CDATA section, a raw
/// fragment, or the root of a document - see `XmlNodeType`.
class XmlNode
{
	string tag; /// The tag name, or the contents for text / comment / CDATA nodes.
	XmlAttributes attributes; /// Tag attributes.
	XmlNode parent; /// Parent node.
	XmlNode[] children; /// Children nodes.
	XmlNodeType type; /// Node type.
	/// Start and end offset within the input.
	ulong startPos, endPos;

	// Parse one node from the stream using the default XML configuration.
	private this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }

	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Create a new node.
	/// Params:
	///   type = the type of the new node
	///   tag  = the tag name (or the contents, for node types which store
	///          their contents in `tag`)
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set an attribute with the given value.
	/// Returns: this node, to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Add a child node, making this node its parent.
	/// Returns: this node (not the child), to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Return XML string.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Return pretty-printed XML string (with indentation).
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Write this node and all of its descendants to an `XmlWriter`.
	/// Nodes of type `XmlNodeType.None` cannot be written (assertion failure).
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				// The root itself produces no markup, only its children do.
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// A tag holding nothing but a single text node is written
					// with the formatter switched off for its contents.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				// CDATA contents are written through the writer's `text` method.
				output.text(tag);
				return;
			case XmlNodeType.Raw:
				// Raw fragments bypass the writer and go straight to its output.
				output.startLine();
				output.output.put(tag);
				output.newLine();
				return;
		}
	}

	/// Attempts to retrieve the text contents of this node.
	/// `<br>` tags are converted to newlines.
	/// Text and CDATA nodes yield their contents; tags and the root yield
	/// the concatenated text of their children; comments, processing
	/// instructions and doctypes yield `null`.
	/// `None` and `Raw` nodes are not supported (assertion failure).
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");
		}
	}

	/// Returns the first immediate child which is a tag and has the tag name `tag`.
	/// Returns `null` if there is no such child.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all immediate children which are a tag and have the tag name `tag`.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws an exception if no such node is found.
	/// Throws: `XmlParseException` if there is no matching child.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Like `findChildren[index]`, but throws an
	/// exception if there are not enough such nodes.
	/// Throws: `XmlParseException` if `index` is out of range.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the immediate child with the given index.
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Returns the number of children nodes.
	final @property size_t length() { return children.length; }
	alias opDollar = length; /// ditto

	/// Iterates over immediate children.
	/// Iteration stops as soon as the delegate returns a non-zero value,
	/// which is then returned to the caller.
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// The index is a size_t so that it has the same type as
		// `children.length`. The length is re-read on every pass, so
		// children appended by the delegate are visited as well.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Creates a deep copy of this node.
	/// The type, tag, attributes and (recursively) the children are copied.
	/// The copy has no parent, and `startPos` / `endPos` are not carried over.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}
296 
/// Root node representing a parsed XML document.
class XmlDocument : XmlNode
{
	/// Create an empty document: a `Root` node with the tag `<Root>`.
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	// Parse all top-level nodes from the stream using the default XML configuration.
	private this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }

	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Creates a deep copy of this document.
	/// Every top-level node is deep-copied and attached to the new
	/// document, so that the `parent` of each copied child is the new
	/// document itself.
	final @property XmlDocument dup()
	{
		auto result = new XmlDocument();
		result.children.reserve(children.length);
		// Go through addChild rather than borrowing the children array of
		// a temporary copy: addChild is what sets `parent`, and it must
		// point at `result`, not at a discarded intermediate node.
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}
319 
/// The logic for how to handle a node's closing tags.
/// A parse configuration selects one of these modes per tag name
/// through its `nodeCloseMode` function.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error, and so is the
	/// self-closing form (`<tag/>`).
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}
351 
/// Configuration for parsing XML.
struct XmlParseConfig
{
	/// Every tag must be closed explicitly (or be self-closing).
	static NodeCloseMode nodeCloseMode(string tag)
	{
		return NodeCloseMode.always;
	}

	/// Whitespace is not preserved under any tag.
	static bool preserveWhitespace(string tag)
	{
		return false;
	}

	/// Attributes must always be given a value.
	enum optionalParameterValues = false;
}
360 
/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
	/// Tag names which are treated as void elements.
	static immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	/// Void elements are never closed; everything else always is.
	static NodeCloseMode nodeCloseMode(string tag)
	{
		if (tag.isOneOf(voidElements))
			return NodeCloseMode.never;
		return NodeCloseMode.always;
	}

	/// Attributes may appear without a value.
	enum optionalParameterValues = true;

	/// Whitespace is currently not preserved under any tag.
	static bool preserveWhitespace(string tag)
	{
		return false; /*TODO*/
	}
}
386 
/// Parse an SGML-ish string into an XmlNode.
/// The parse configuration is passed as a template argument,
/// e.g. `parse!XmlParseConfig(s)`.
alias parse = _parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument.
/// The parse configuration is passed as a template argument,
/// e.g. `parseDocument!Html5StrictParseConfig(s)`.
alias parseDocument = _parseString!XmlDocument;

/// Parse an XML string into an XmlDocument,
/// using the default XML configuration (`XmlParseConfig`).
alias xmlParse = parseDocument!XmlParseConfig;
395 
396 private:
397 
public // alias
template _parseString(Node)
{
	/// Parses the string `s` into a new `Node`, following the rules of `Config`.
	Node _parseString(Config)(string s)
	{
		// The two template arguments cannot be chained in one expression,
		// so the first instantiation goes through an alias.
		alias parseWithConfig = parseStream!Node;
		auto stream = StringStream(s);
		return parseWithConfig!Config(stream);
	}
}
408 
template parseStream(Node)
{
	/// Creates a new `Node` and fills it by parsing from the
	/// stream `s`, following the rules of `Config`.
	Node parseStream(Config)(ref StringStream s)
	{
		auto parsed = new Node;
		parseInto!Config(parsed, s);
		return parsed;
	}
}
418 
419 alias parseNode = parseStream!XmlNode;
420 
/// Parse an SGML-ish StringStream into an XmlDocument.
/// Top-level nodes are parsed one after another until the input is
/// exhausted, and each is added to `d` as a child.
/// A parse error is rethrown wrapped in another `XmlParseException`
/// which reports the row, column and offset the stream had reached.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
	{
		try
		{
			auto topLevel = new XmlNode;
			parseInto!Config(topLevel, s, null);
			d.addChild(topLevel);
			if (!Config.preserveWhitespace(null))
				skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Everything consumed so far; the position is derived from it.
			auto consumed = s.s[0..s.position];
			auto bytes = consumed.representation;
			auto row    = bytes.count('\n');
			auto column = bytes.retro.countUntil('\n');
			if (column < 0) // no newline yet: still on the first row
				column = consumed.length;
			auto message = format("Error at %d:%d (offset %d)",
				1 + row,
				1 + column,
				consumed.length,
			);
			throw new XmlParseException(message, e);
		}
	}
}
451 
/// Parse an SGML-ish StringStream into an XmlNode.
/// Reads exactly one node (text, comment, CDATA section, doctype,
/// processing instruction, or tag with all of its descendants) from
/// the current position of `s` and stores it in `node`.
/// Params:
///   node               = the node to fill in
///   s                  = the stream to read from
///   parentTag          = tag name of the enclosing element, if any; passed
///                        to `Config.preserveWhitespace`
///   preserveWhitespace = whether an enclosing element has already requested
///                        that whitespace be preserved
/// Throws: `XmlParseException` on the malformed input which is detected.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	char c;

	// Once enabled by an ancestor, whitespace preservation stays enabled.
	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = s.read();
	else
		do
			c = s.read();
		while (isWhiteChar[c]);

	// Note: the first character of the node has already been consumed here.
	node.startPos = s.position;
	if (c!='<')  // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		text ~= c;
		while (s.position < s.size && s.s[s.position] != '<')
			text ~= s.read();
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = s.read();
		if (c=='!')
		{
			c = s.read();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the terminator shows up, then cut it off.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				// Accumulate until the terminator shows up, then cut it off.
				do
				{
					c = s.read();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = s.read();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read(); // the '?' which was just peeked
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // un-read the first character of the tag name
			node.tag = readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = s.read();

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					// Outer loop: runs until this node's own close tag is found.
					while (true)
					{
						// Inner loop: parse children until the next close tag.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								// An implicitly-closed element was parsed as a
								// childless child of `node`, so its optional close
								// tag shows up here, among `node`'s children. It
								// must directly follow that element, which means
								// the element must be `node`'s most recent child.
								// (`node.parent` cannot be consulted: it is only
								// set by addChild, after `node` is fully parsed.)
								enforce!XmlParseException(node.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								auto last = node.children[$-1];
								enforce!XmlParseException(last.type == XmlNodeType.Node && last.tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								// Consume the rest of the close tag before
								// going back to parsing `node`'s children.
								expect(s, '>');
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}
615 
616 private:
617 
/// Reads one attribute (`name="value"` or `name='value'`) from the
/// stream and stores it in `node.attributes`, with entities decoded.
/// When `Config.optionalParameterValues` is set, a name which is not
/// followed by `=` is stored with a `null` value.
/// Throws: `XmlParseException` if there is no attribute name, or if
/// the value is not quoted.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string attrName = readWord(s);
	if (!attrName.length)
		throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		// Valueless attribute
		if (peek(s) != '=')
		{
			node.attributes[attrName] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);

	char quote = s.read();
	bool isQuote = quote == '\'' || quote == '"';
	if (!isQuote)
		throw new XmlParseException("Expected ' or \", not %s".format(quote));

	string rawValue = readUntil(s, quote);
	node.attributes[attrName] = decodeEntities(rawValue);
}
642 
/// Returns the `n`-th upcoming character (1 being the next one to be
/// read) without advancing the stream.
char peek(ref StringStream s, int n=1)
{
	auto index = s.position + n - 1;
	return s.s[index];
}
647 
/// Advances the stream past any whitespace at the current position.
void skipWhitespace(ref StringStream s)
{
	// Reads go through the raw pointer, so they are not bounds-checked.
	auto chars = s.s.ptr;
	auto cursor = s.position;
	while (isWhiteChar[chars[cursor]])
		cursor++;
	s.position = cursor;
}
653 
/// Character class lookup tables, indexed by character code.
/// Filled in once by the module constructor below.
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (code; 0..256)
	{
		// Word characters are ASCII letters and digits plus '-', '_' and ':'.
		bool isNamePunctuation = code=='-' || code=='_' || code==':';
		isWordChar[code] = isNamePunctuation || isAlphaNum(code);
		isWhiteChar[code] = isWhite(code);
	}
}
664 
/// Reads the longest run of word characters (see `isWordChar`) at the
/// current position and advances the stream past it.
/// Returns: the word, as a slice of the input; empty if the current
/// character is not a word character.
string readWord(ref StringStream stream)
{
	auto chars = stream.s.ptr;
	auto limit = stream.s.length;
	auto first = stream.position;
	auto cursor = first;
	while (cursor < limit && isWordChar[chars[cursor]])
		cursor++;
	stream.position = cursor;
	return chars[first .. cursor];
}
676 
/// Reads one character from the stream and checks that it is `c`.
/// Throws: `XmlParseException` if a different character was read.
void expect(ref StringStream s, char c)
{
	char actual = s.read();
	enforce!XmlParseException(actual==c, "Expected " ~ c ~ ", got " ~ actual);
}
683 
/// Reads characters up to (but not including) the next occurrence of
/// `until`, and advances the stream past that occurrence.
/// Returns: the characters before `until`, as a slice of the input.
/// Throws: `XmlParseException` if `until` does not occur in the rest
/// of the input.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	// The scan uses raw pointers, so it has to be bounded explicitly:
	// otherwise a missing terminator would make it run past the input.
	while (p < end && *p != until)
		p++;
	enforce!XmlParseException(p < end, "Unterminated value, expected closing " ~ until);
	auto len = p-start;
	s.position += len + 1; // also skip the terminator itself
	return start[0..len];
}
693 
// Round trip: parsing a document and writing it back yields the input.
unittest
{
	enum source =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto document = new XmlDocument(source);
	auto rendered = document.toString();
	assert(rendered == source, rendered);
}
706 
// Whitespace around text is stripped or kept depending on the configuration.
unittest
{
	// Parses <tag>s</tag> and returns the contents of the first child, if any.
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto stream = StringStream("<tag>" ~ s ~ "</tag>");
		auto parsed = new XmlNode;
		parseInto!ParseConfig(parsed, stream, null);
		// import std.stdio; writeln(preserve, ": ", stream.s, " -> ", parsed.toString);
		if (parsed.children.length)
			return parsed.children[0].tag;
		return null;
	}

	foreach (sample; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		auto trimmed = strip(sample);

		auto strippedResult = testOne!false(sample);
		assert(strippedResult == trimmed,
			"Parsing <tag>" ~ sample ~ "</tag> while not preserving whitespace, expecting '" ~ trimmed ~ "', got '" ~ strippedResult ~ "'");

		auto preservedResult = testOne!true(sample);
		assert(preservedResult == sample,
			"Parsing <tag>" ~ sample ~ "</tag> while preserving whitespace, expecting '" ~ sample ~ "', got '" ~ preservedResult ~ "'");
	}
}
733 
// Whitespace preservation requested for a tag also applies to its descendants.
unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto stream = StringStream("<a><b> foo </b></a>");
	auto outer = new XmlNode;
	parseInto!ParseConfig(outer, stream, null);
	auto inner = outer.children[0];
	assert(inner.children[0].tag == " foo ");
}
748 
// Parsing naked tags while preserving whitespace:
// the whitespace between and after the tags becomes text nodes.
unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return true; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto document = parseDocument!ParseConfig("<foo/> <bar/>\n");
	auto topLevelCount = document.children.length;
	assert(topLevelCount == 4);
}