1 /**
2  * Light read-only XML library
3  * May be deprecated in the future.
4  * See other XML modules for better implementations.
5  *
6  * License:
7  *   This Source Code Form is subject to the terms of
8  *   the Mozilla Public License, v. 2.0. If a copy of
9  *   the MPL was not distributed with this file, You
10  *   can obtain one at http://mozilla.org/MPL/2.0/.
11  *
12  * Authors:
13  *   Vladimir Panteleev <ae@cy.md>
14  *   Simon Arlott
15  */
16 
17 module ae.utils.xml.lite;
18 
19 // TODO: better/safer handling of malformed XML
20 
21 import std.string;
22 import std.ascii;
23 import std.exception;
24 
25 import ae.utils.array;
26 import ae.utils.xml.common;
27 import ae.utils.xml.entities;
28 import ae.utils.xmlwriter;
29 
30 // ************************************************************************
31 
32 /// std.stream.Stream-like type with bonus speed
/// Minimal forward-only reader over a string
/// (std.stream.Stream-like type with bonus speed).
private struct StringStream
{
	string s;        /// The text being read.
	size_t position; /// Index into `s` of the next character `read` returns.

	@disable this();
	@disable this(this);

	/// Wrap `s`. The stored slice has the same contents and length as `s`,
	/// but is taken from a fresh copy which has a few extra bytes
	/// (quotes, `>`, NULs) allocated right after it.
	this(string s)
	{
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		auto padded = s ~ ditch;
		this.s = padded[0 .. $ - ditch.length];
	}

	/// Return the character at `position` and advance by one.
	char read()
	{
		// Advance first, then index, exactly like `s[position++]`.
		immutable index = position++;
		return s[index];
	}

	/// Total number of characters in the input.
	@property size_t size()
	{
		return s.length;
	}
}
49 
50 // ************************************************************************
51 
/// The type of an `XmlNode`.
enum XmlNodeType
{
	/// Initial value. Never created during parsing.
	None,

	/// The root node. Contains top-level nodes as children.
	Root,

	/// XML tag.
	Node,

	/// XML comment.
	Comment,

	/// XML processing instruction.
	Meta,

	/// XML doctype declaration.
	DocType,

	/// CDATA node.
	CData,

	/// Text node.
	Text,

	/// Never created during parsing. Programs can put raw XML
	/// fragments in `Raw` nodes to emit it as-is.
	Raw,
}
65 
/// Type used to hold a tag node's attributes:
/// a map from attribute name to attribute value.
alias XmlAttributes = OrderedMap!(string, string);
68 
/// An XML node.
class XmlNode
{
	/// The tag name, or the contents for text / comment / CDATA nodes.
	string tag;
	/// Tag attributes.
	XmlAttributes attributes;
	/// Parent node.
	XmlNode parent;
	/// Children nodes.
	XmlNode[] children;
	/// Node type.
	XmlNodeType type;
	/// Start offset within the input.
	ulong startPos;
	/// End offset within the input.
	ulong endPos;

	/// Parse a single node from the stream, using `XmlParseConfig`.
	private this(ref StringStream s)
	{
		parseInto!XmlParseConfig(this, s, null);
	}

	/// Create and parse from input.
	this(string s)
	{
		auto ss = StringStream(s);
		this(ss);
	}

	/// Create a new node.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.tag = tag;
		this.type = type;
	}

	/// Set an attribute with the given value.
	/// Returns this node, so calls can be chained.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Add a child node, making this node its parent.
	/// Returns this node (not the child), so calls can be chained.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Return XML string.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Return pretty-printed XML string (with indentation).
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Write to an `XmlWriter`.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		// Serialize every child, in order, to the same writer.
		void emitChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		// Emit this node's attributes into the currently open tag.
		void emitAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);

			case XmlNodeType.Root:
				emitChildren();
				return;

			case XmlNodeType.Node:
			{
				output.startTagWithAttributes(tag);
				emitAttributes();

				// No children: emit the combined "end attributes and tag" form.
				if (!children.length)
				{
					output.endAttributesAndTag();
					return;
				}

				// A tag holding just one text node is written with the
				// formatter switched off for its contents.
				immutable bool textOnly =
					children.length == 1 && children[0].type == XmlNodeType.Text;
				if (textOnly)
					output.formatter.enabled = false;
				output.endAttributes();
				emitChildren();
				output.endTag(tag);
				if (textOnly)
				{
					output.formatter.enabled = true;
					output.newLine();
				}
				return;
			}

			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				emitAttributes();
				output.endPI();
				return;

			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;

			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;

			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;

			case XmlNodeType.CData:
				// CDATA contents are written through the writer's `text` method.
				output.text(tag);
				return;

			case XmlNodeType.Raw:
				output.startLine();
				output.output.put(tag);
				output.newLine();
				return;
		}
	}

	/// Attempts to retrieve the text contents of this node.
	/// `<br>` tags are converted to newlines.
	/// Comments, processing instructions and doctypes yield `null`.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);

			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");

			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;

			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;

			case XmlNodeType.Root:
			case XmlNodeType.Node:
			{
				// A "br" tag contributes a newline ahead of whatever it contains.
				string collected = (tag == "br") ? "\n" : null;
				foreach (child; children)
					collected ~= child.text;
				return collected;
			}
		}
	}

	/// Returns the first immediate child which is a tag and has the tag name `tag`.
	/// Returns `null` if there is no such child.
	final XmlNode findChild(string tag)
	{
		foreach (candidate; children)
		{
			if (candidate.type != XmlNodeType.Node)
				continue;
			if (candidate.tag == tag)
				return candidate;
		}
		return null;
	}

	/// Returns all immediate children which are a tag and have the tag name `tag`.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] matches;
		foreach (candidate; children)
		{
			if (candidate.type != XmlNodeType.Node)
				continue;
			if (candidate.tag == tag)
				matches ~= candidate;
		}
		return matches;
	}

	/// Like `findChild`, but throws an exception if no such node is found.
	final XmlNode opIndex(string tag)
	{
		if (auto found = findChild(tag))
			return found;
		throw new XmlParseException("No such child: " ~ tag);
	}

	/// Like `findChildren[index]`, but throws an
	/// exception if there are not enough such nodes.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto matches = findChildren(tag);
		if (index < matches.length)
			return matches[index];
		throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, matches.length));
	}

	/// Returns the immediate child with the given index.
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Returns the number of children nodes.
	final @property size_t length()
	{
		return children.length;
	}
	alias opDollar = length; /// ditto

	/// Iterates over immediate children.
	/// Stops as soon as the delegate returns non-zero, and returns that value.
	int opApply(int delegate(ref XmlNode) dg)
	{
		// The length is re-read on every step, so children appended
		// by the delegate are visited as well.
		size_t index = 0;
		while (index < children.length)
		{
			if (auto stop = dg(children[index]))
				return stop;
			index++;
		}
		return 0;
	}

	/// Creates a deep copy of this node.
	/// Type, tag, attributes and (recursively) children are copied;
	/// the copy has no parent.
	final @property XmlNode dup()
	{
		auto copy = new XmlNode(type, tag);
		copy.attributes = attributes.dup;
		copy.children.reserve(children.length);
		foreach (original; children)
			copy.addChild(original.dup);
		return copy;
	}
}
296 
/// Root node representing a parsed XML document.
class XmlDocument : XmlNode
{
	/// Create an empty document: a `Root` node with the placeholder tag `<Root>`.
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	/// Parse a whole document (all top-level nodes) from the stream,
	/// using `XmlParseConfig`.
	private this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }

	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Creates a deep copy of this document.
	/// Every top-level node is deep-copied and attached to the new
	/// document, so the copies' `parent` is the returned document.
	final @property XmlDocument dup()
	{
		auto result = new XmlDocument();
		result.children.reserve(children.length);
		// Go through addChild rather than taking the child array of a
		// temporary copy: that would leave each child's `parent`
		// pointing at the discarded temporary instead of `result`.
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}
319 
/// The logic for how to handle a node's closing tags.
enum NodeCloseMode
{
	/// An explicit closing tag (or a self-closing tag) is mandatory
	/// for this element; leaving the tag unclosed is a parse error.
	/// In XML, all tags are "always".
	always,

/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/

	/// A closing tag may be omitted, in which case it is implied.
	/// Consequently such elements cannot have any content, and a
	/// closing tag, if present, must be adjacent to the open tag.
	implicit,

	/// The element is void: it is always implicitly closed right
	/// after opening and must never have a closing tag.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}
351 
/// Configuration for parsing XML.
struct XmlParseConfig
{
	/// Every tag must be closed explicitly (or be self-closing).
	static NodeCloseMode nodeCloseMode(string tag)
	{
		return NodeCloseMode.always;
	}

	/// Whitespace is not preserved for any tag.
	static bool preserveWhitespace(string tag)
	{
		return false;
	}

	/// Attributes must always have a value.
	enum optionalParameterValues = false;
}
360 
/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
	/// Tag names treated as void elements.
	static immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	/// `never` for the tags listed in `voidElements`, `always` for the rest.
	static NodeCloseMode nodeCloseMode(string tag)
	{
		if (tag.isOneOf(voidElements))
			return NodeCloseMode.never;
		return NodeCloseMode.always;
	}

	/// Attributes may appear without a value.
	enum optionalParameterValues = true;

	/// Whitespace is currently not preserved for any tag.
	static bool preserveWhitespace(string tag)
	{
		return false; /*TODO*/
	}
}
386 
/// Parse an SGML-ish string into an XmlNode.
/// Takes the parse configuration as a template argument,
/// e.g. `parse!XmlParseConfig(s)`.
alias parse = _parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument.
/// Takes the parse configuration as a template argument,
/// e.g. `parseDocument!XmlParseConfig(s)`.
alias parseDocument = _parseString!XmlDocument;

/// Parse an XML string into an XmlDocument
/// (`parseDocument` with `XmlParseConfig`).
alias xmlParse = parseDocument!XmlParseConfig;
395 
396 private:
397 
/// Two-stage template behind the public `parse` / `parseDocument` aliases:
/// the outer stage fixes the result type, the inner one the configuration.
public // alias
template _parseString(Node)
{
	/// Parse `s` into a newly created `Node` according to `Config`.
	Node _parseString(Config)(string s)
	{
		// Bind the node type first, then instantiate
		// the result with the configuration.
		alias parseAs = parseStream!Node;
		auto stream = StringStream(s);
		return parseAs!Config(stream);
	}
}
408 
/// Two-stage template: the outer stage fixes the node class to create,
/// the inner one the parse configuration.
template parseStream(Node)
{
	/// Create a `Node` and fill it by parsing from the stream.
	Node parseStream(Config)(ref StringStream s)
	{
		Node result = new Node;
		parseInto!Config(result, s);
		return result;
	}
}

/// `parseStream` producing a plain `XmlNode`.
alias parseNode = parseStream!XmlNode;
420 
/// Parse an SGML-ish StringStream into an XmlDocument
///
/// Top-level nodes are parsed one after another until the input is
/// exhausted, and appended to `d`. A parse error is re-thrown wrapped
/// in an `XmlParseException` which carries the row, column and offset
/// of the stream position at the time of the failure.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
	{
		try
		{
			auto topLevel = new XmlNode;
			parseInto!Config(topLevel, s, null);
			d.addChild(topLevel);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Everything up to the current position, as raw bytes.
			auto consumed = s.s[0..s.position].representation;
			immutable lineBreaks = consumed.count('\n');
			// Distance back to the previous line break;
			// negative if there is none, i.e. still on the first line.
			auto sinceLineBreak = consumed.retro.countUntil('\n');
			if (sinceLineBreak < 0)
				sinceLineBreak = consumed.length;
			throw new XmlParseException(format("Error at %d:%d (offset %d)",
				1 + lineBreaks,
				1 + sinceLineBreak,
				consumed.length,
			), e);
		}
	}
}
450 
/// Parse an SGML-ish StringStream into an XmlNode
///
/// Reads exactly one node (text, comment, CDATA, doctype, processing
/// instruction, or an element together with all of its children) from
/// `s` and stores the result in `node`.
///
/// Params:
///   node               = the node to fill in
///   s                  = the input; left positioned right after the parsed node
///   parentTag          = tag name of the enclosing element, if any; passed to
///                        `Config.preserveWhitespace`
///   preserveWhitespace = whether whitespace is already being preserved
///                        by an enclosing element
///
/// Throws: `XmlParseException` on malformed input, including input which
/// ends in the middle of a node.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	// Consume one character. Running out of input is reported as a
	// parse error instead of indexing past the end of the string.
	char next()
	{
		if (s.position >= s.size)
			throw new XmlParseException("Unexpected end of input");
		return s.read();
	}

	// Look at the n-th upcoming character (1-based) without consuming
	// it, with the same end-of-input check as `next`.
	char lookahead(int n = 1)
	{
		if (s.position + n > s.size)
			throw new XmlParseException("Unexpected end of input");
		return peek(s, n);
	}

	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = next();
	else
		do
			c = next();
		while (isWhiteChar[c]);

	// Note: recorded after the node's first character was consumed.
	node.startPos = s.position;
	if (c!='<')  // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (c!='<')
		{
			text ~= c;
			c = next(); // throws if the input ends before the next tag
		}
		s.position--; // rewind to '<'
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
	}
	else
	{
		c = next();
		if (c=='!')
		{
			c = next();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the text read so far ends in "-->".
				do
				{
					c = next();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				// Accumulate until the text read so far ends in "]]>".
				do
				{
					c = next();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				// Everything between "<!" and the first '>' becomes the tag.
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = next();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (lookahead()=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = next(); // the '?'
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // un-read the first character of the tag name
			node.tag = readWord(s);
			// Same requirement as for processing instructions above.
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				c = lookahead();
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = next(); // '>' or '/'

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					while (true)
					{
						// Parse children until a close tag comes up.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (lookahead()=='<' && lookahead(2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								// An explicit close tag for an implicitly-closed
								// element. That element was parsed as a child of
								// `node` (`node` itself is not attached to its
								// own parent until parsing it has finished), so
								// it must be the child added last.
								enforce!XmlParseException(node.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								auto last = node.children[$-1];
								enforce!XmlParseException(last.type == XmlNodeType.Node && last.tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								// Finish consuming the close tag, then carry
								// on with the remaining children of `node`.
								expect(s, '>');
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}
618 
619 private:
620 
/// Read one `name="value"` (or `name='value'`) attribute from the
/// stream and store it in `node.attributes`, with entities in the
/// value decoded. Whitespace is allowed around the `=`.
/// When `Config.optionalParameterValues` is set, a name which is not
/// followed by `=` is stored with a `null` value.
/// Throws `XmlParseException` if there is no name at the current
/// position, or if the value is not quoted.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string attrName = readWord(s);
	if (!attrName.length)
		throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		// Value-less attribute
		if (peek(s) != '=')
		{
			node.attributes[attrName] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);

	// The value extends up to the next occurrence of its opening quote.
	char quote = s.read();
	if (!(quote == '\'' || quote == '"'))
		throw new XmlParseException("Expected ' or \", not %s".format(quote));
	node.attributes[attrName] = decodeEntities(readUntil(s, quote));
}
645 
/// Return the `n`-th upcoming character (1-based; `n == 1` is the
/// character the next `read` would return) without consuming it.
/// Looking past the end of the input yields `'\0'`, which matches
/// none of the characters the parser compares against, instead of
/// indexing outside of the string.
char peek(ref StringStream s, int n=1)
{
	immutable index = s.position + n - 1;
	return index < s.s.length ? s.s[index] : '\0';
}
650 
/// Advance the stream past any whitespace at the current position.
void skipWhitespace(ref StringStream s)
{
	// Unchecked pointer access: this may look at the byte just past
	// the end of `s.s`. StringStream's constructor allocates padding
	// there, starting with a non-whitespace character.
	auto text = s.s.ptr;
	for (; isWhiteChar[text[s.position]]; s.position++)
	{
	}
}
656 
/// Per-byte classification tables, filled in once by the module constructor.
/// `isWhiteChar[c]` tells whether byte `c` is ASCII whitespace.
__gshared bool[256] isWhiteChar;
/// `isWordChar[c]` tells whether byte `c` may appear in a tag or attribute
/// name: ASCII letters and digits, plus `-`, `_` and `:`.
__gshared bool[256] isWordChar;

shared static this()
{
	foreach (c; 0..256)
	{
		immutable bool namePunctuation = c==':' || c=='_' || c=='-';
		isWordChar[c] = namePunctuation || isAlphaNum(c);
		isWhiteChar[c] = isWhite(c);
	}
}
667 
/// Consume and return the longest run of word characters (see
/// `isWordChar`) at the current position. The result is empty, and
/// nothing is consumed, if there is no word character there.
string readWord(ref StringStream stream)
{
	auto base = stream.s.ptr;
	auto first = base + stream.position;
	auto limit = base + stream.s.length;

	// Never scans beyond the end of the input.
	auto cursor = first;
	for (; cursor < limit; cursor++)
		if (!isWordChar[*cursor])
			break;

	auto count = cursor - first;
	stream.position += count;
	return first[0..count];
}
679 
/// Consume one character and require it to be `c`.
/// Throws `XmlParseException` if a different character is found,
/// or if the input has already been exhausted.
void expect(ref StringStream s, char c)
{
	// Check for end of input first, so that truncated documents are
	// reported as parse errors rather than as out-of-bounds reads.
	enforce!XmlParseException(s.position < s.size, "Expected " ~ c ~ ", got end of input");
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}
686 
/// Return the input from the current position up to (not including)
/// the next occurrence of `until`, and move the stream position past
/// that occurrence.
/// Throws `XmlParseException` if `until` does not occur in the rest of
/// the input. The search stops at the end of the input, so neither the
/// returned slice nor the new position can extend past it.
string readUntil(ref StringStream s, char until)
{
	immutable start = s.position;
	auto end = start;
	while (end < s.s.length && s.s[end] != until)
		end++;
	enforce!XmlParseException(end < s.s.length, "Unterminated value, expected closing " ~ until);
	s.position = end + 1; // skip the delimiter as well
	return s.s[start .. end];
}
696 
// Round trip: parsing a document and writing it back
// gives the original text.
unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto parsed = new XmlDocument(xmlText);
	immutable rendered = parsed.toString();
	assert(rendered == xmlText, rendered);
}
709 
// Text inside a tag is stripped unless the
// configuration asks for whitespace to be preserved.
unittest
{
	// Parse "<tag>" ~ s ~ "</tag>" and return the contents of the first
	// child (or null if there is none), preserving whitespace or not.
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
			static NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			static bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto wrapper = new XmlNode;
		auto stream = StringStream("<tag>" ~ s ~ "</tag>");
		parseInto!ParseConfig(wrapper, stream, null);
		if (!wrapper.children.length)
			return null;
		return wrapper.children[0].tag;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		immutable stripped = strip(tag);
		assert(testOne!false(tag) == stripped,
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ stripped ~ "', got '" ~ testOne!false(tag) ~ "'");
		assert(testOne!true(tag) == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ testOne!true(tag) ~ "'");
	}
}
736 
// Whitespace preservation requested for a tag
// also applies to the tags nested inside it.
unittest
{
	static struct ParseConfig
	{
		static NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		static bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto outer = new XmlNode;
	auto stream = StringStream("<a><b> foo </b></a>");
	parseInto!ParseConfig(outer, stream, null);
	auto inner = outer.children[0];
	assert(inner.children[0].tag == " foo ");
}