1 /**
2  * Light read-only XML library
3  * May be deprecated in the future.
4  * See other XML modules for better implementations.
5  *
6  * License:
7  *   This Source Code Form is subject to the terms of
8  *   the Mozilla Public License, v. 2.0. If a copy of
9  *   the MPL was not distributed with this file, You
10  *   can obtain one at http://mozilla.org/MPL/2.0/.
11  *
12  * Authors:
13  *   Vladimir Panteleev <ae@cy.md>
14  *   Simon Arlott
15  */
16 
17 module ae.utils.xml.lite;
18 
19 // TODO: better/safer handling of malformed XML
20 
21 import std..string;
22 import std.ascii;
23 import std.exception;
24 
25 import ae.utils.array;
26 import ae.utils.xml.common;
27 import ae.utils.xml.entities;
28 import ae.utils.xmlwriter;
29 
30 // ************************************************************************
31 
/// Minimal std.stream.Stream-like reader over an in-memory string,
/// kept deliberately simple for speed.
private struct StringStream
{
	string s;        // The text being read.
	size_t position; // Index of the character the next `read` returns.

	@disable this();
	@disable this(this);

	/// Wrap `s` for reading, starting at offset 0.
	this(string s)
	{
		// Dirty precaution: copy the input with a few terminator characters
		// appended and then slice them off again, so the memory right after
		// the visible text holds quote / '>' / NUL bytes. Code in this module
		// that reads through `s.ptr` without bounds checks lands on those.
		enum ditch = "'\">\0\0\0\0\0";
		auto padded = s ~ ditch;
		this.s = padded[0 .. padded.length - ditch.length];
	}

	/// Return the character at `position` and advance past it.
	char read()
	{
		// Advance first, exactly like `s[position++]`.
		immutable at = position++;
		return s[at];
	}

	/// Length of the text (padding not included).
	@property size_t size()
	{
		return s.length;
	}
}
49 
50 // ************************************************************************
51 
/// Identifies what kind of node an `XmlNode` is.
enum XmlNodeType
{
	/// Initial value. Never created during parsing.
	None,

	/// The root node. Contains top-level nodes as children.
	Root,

	/// XML tag.
	Node,

	/// XML comment.
	Comment,

	/// XML processing instruction.
	Meta,

	/// XML doctype declaration.
	DocType,

	/// CDATA node.
	CData,

	/// Text node.
	Text,

	/// Never created during parsing. Programs can put raw XML
	/// fragments in `Raw` nodes to emit them as-is.
	Raw,
}
65 
/// Type used to hold a tag node's attributes: maps attribute name to attribute value.
/// NOTE(review): presumably `OrderedMap` keeps entries in insertion order — confirm in `ae.utils.array`.
alias XmlAttributes = OrderedMap!(string, string);
68 
/// A single node of an XML tree.
class XmlNode
{
	string tag; /// Tag name for tag nodes; the contents for text / comment / CDATA nodes.
	XmlAttributes attributes; /// Attributes of this tag.
	XmlNode parent; /// The node this one was added to via `addChild`.
	XmlNode[] children; /// Child nodes, in order.
	XmlNodeType type; /// What kind of node this is.
	/// Start and end offset within the input, as recorded by the parser.
	ulong startPos, endPos;

	// Parse a single node from the stream, using the default XML configuration.
	this(ref StringStream s)
	{
		parseInto!XmlParseConfig(this, s, null);
	}

	/// Create a node by parsing `s`.
	this(string s)
	{
		auto stream = StringStream(s);
		this(stream);
	}

	/// Create a new, empty node of the given type and tag.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.tag = tag;
		this.type = type;
	}

	/// Set attribute `name` to `value`. Returns `this` to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Append `child` to this node's children and make this node its parent.
	/// Returns `this` (not the child) to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		children ~= child;
		child.parent = this;
		return this;
	}

	/// Serialize this node to an XML string.
	override string toString() const
	{
		XmlWriter sink;
		writeTo(sink);
		return sink.output.get();
	}

	/// Serialize this node to a pretty-printed XML string (with indentation).
	string toPrettyString() const
	{
		PrettyXmlWriter sink;
		writeTo(sink);
		return sink.output.get();
	}

	/// Serialize this node (and its subtree) to the given `XmlWriter`.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void emitChildren()
		{
			foreach (node; children)
				node.writeTo(output);
		}

		void emitAttributes()
		{
			foreach (attrName, attrValue; attributes)
				output.addAttribute(attrName, attrValue);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);

			case XmlNodeType.Root:
				emitChildren();
				break;

			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				emitAttributes();
				if (children.length == 0)
				{
					output.endAttributesAndTag();
					break;
				}
				// A tag whose only child is a text node is written with the
				// formatter switched off, then followed by an explicit newline.
				immutable loneText = children.length == 1 && children[0].type == XmlNodeType.Text;
				if (loneText)
					output.formatter.enabled = false;
				output.endAttributes();
				emitChildren();
				output.endTag(tag);
				if (loneText)
				{
					output.formatter.enabled = true;
					output.newLine();
				}
				break;

			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				emitAttributes();
				output.endPI();
				break;

			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				break;

			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				break;

			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				break;

			case XmlNodeType.CData:
				// CDATA content is written through the writer's `text` call.
				output.text(tag);
				break;

			case XmlNodeType.Raw:
				// Raw fragments bypass the writer and go straight to its output.
				output.startLine();
				output.output.put(tag);
				output.newLine();
				break;
		}
	}

	/// Attempts to retrieve the text contents of this node.
	/// Text and CDATA nodes yield their contents; tags and the root yield
	/// the concatenated text of their children, with a `br` tag contributing
	/// a newline; comments, processing instructions and doctypes yield `null`.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);

			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");

			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;

			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;

			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string collected = tag == "br" ? "\n" : null;
				foreach (node; children)
					collected ~= node.text();
				return collected;
		}
	}

	/// Returns the first immediate child which is a tag named `tag`,
	/// or `null` if there is none.
	final XmlNode findChild(string tag)
	{
		foreach (candidate; children)
		{
			if (candidate.type != XmlNodeType.Node || candidate.tag != tag)
				continue;
			return candidate;
		}
		return null;
	}

	/// Returns all immediate children which are tags named `tag`.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] matches;
		foreach (candidate; children)
		{
			if (candidate.type != XmlNodeType.Node || candidate.tag != tag)
				continue;
			matches ~= candidate;
		}
		return matches;
	}

	/// Like `findChild`, but throws an `XmlParseException` if no such node is found.
	final XmlNode opIndex(string tag)
	{
		if (auto found = findChild(tag))
			return found;
		throw new XmlParseException("No such child: " ~ tag);
	}

	/// Like `findChildren(tag)[index]`, but throws an `XmlParseException`
	/// if there are not enough such nodes.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto matches = findChildren(tag);
		if (index < matches.length)
			return matches[index];
		throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, matches.length));
	}

	/// Returns (by reference) the immediate child with the given index.
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Returns the number of children nodes.
	final @property size_t length()
	{
		return children.length;
	}
	alias opDollar = length; /// ditto

	/// Iterates over immediate children; stops early when the delegate
	/// returns non-zero and passes that value on.
	int opApply(int delegate(ref XmlNode) dg)
	{
		// The length is re-read on every pass, so children appended by the
		// delegate during iteration are visited too.
		for (int i = 0; i < children.length; i++)
			if (auto stop = dg(children[i]))
				return stop;
		return 0;
	}

	/// Creates a deep copy of this node: type, tag, attributes and
	/// (recursively) children. `parent`, `startPos` and `endPos` of the
	/// returned node are left at their defaults.
	final @property XmlNode dup()
	{
		auto copy = new XmlNode(type, tag);
		copy.attributes = attributes.dup;
		copy.children.reserve(children.length);
		foreach (node; children)
			copy.addChild(node.dup);
		return copy;
	}
}
295 
/// Root node representing a parsed XML document.
class XmlDocument : XmlNode
{
	/// Create an empty document: a `Root` node with the placeholder tag `<Root>`.
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	// Parse every top-level node from the stream into this document,
	// using the default XML configuration.
	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }

	/// Create and parse from input.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Creates a deep copy of this document.
	/// The copied top-level nodes have the returned document as their `parent`.
	final @property XmlDocument dup()
	{
		auto result = new XmlDocument();
		result.children.reserve(children.length);
		// Re-add each copied child through addChild so that its `parent`
		// refers to the new document rather than to a temporary node.
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}
318 
/// How the parser treats closing tags for a given element.
enum NodeCloseMode
{
	/// The element must always be closed explicitly, either with
	/// a closing tag or as a self-closing tag. Leaving it unclosed
	/// is a parse error.
	/// In XML, all tags are "always".
	always,
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Closing tags are optional and implied when absent.
	/// Such elements therefore cannot have any content, and a
	/// closing tag, if present, must directly follow the opening tag.
	implicit,

	/// The element is void: it is implicitly closed right after
	/// opening and must never have a closing tag (a closing tag
	/// is always an error).
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}
350 
/// Parse configuration for plain XML.
struct XmlParseConfig
{
static:
	/// Every tag must be closed explicitly.
	NodeCloseMode nodeCloseMode(string tag)
	{
		return NodeCloseMode.always;
	}

	/// Whitespace is not preserved for any tag.
	bool preserveWhitespace(string tag)
	{
		return false;
	}

	/// Attributes must always have a value.
	enum optionalParameterValues = false;
}
359 
/// Parse configuration for strict HTML5.
/// Void tags must never be closed, and every other tag
/// must always be closed explicitly.
/// Attribute values, when present, must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	/// Tag names treated as void elements.
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	/// `never` for the void elements listed above, `always` for everything else.
	NodeCloseMode nodeCloseMode(string tag)
	{
		if (tag.isOneOf(voidElements))
			return NodeCloseMode.never;
		return NodeCloseMode.always;
	}

	/// Attributes may appear without a value.
	enum optionalParameterValues = true;

	/// Whitespace is currently not preserved for any tag.
	bool preserveWhitespace(string tag)
	{
		return false; /*TODO*/
	}
}
385 
/// Parse an SGML-ish string into an XmlNode.
/// Takes the parse configuration as a template argument, e.g. `parse!XmlParseConfig(text)`.
alias parse = parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument.
/// Takes the parse configuration as a template argument, e.g. `parseDocument!XmlParseConfig(text)`.
alias parseDocument = parseString!XmlDocument;

/// Parse an XML string into an XmlDocument, using the default `XmlParseConfig`.
alias xmlParse = parseDocument!XmlParseConfig;
394 
395 private:
396 
// Two-step template: first pick the result type (`Node`), then the
// configuration (`Config`). Public so the aliases above can expose it.
public // alias
template parseString(Node)
{
	/// Parse the string `s` into a new `Node` using the rules in `Config`.
	Node parseString(Config)(string s)
	{
		auto stream = StringStream(s);
		// Chained `!` instantiation is not allowed, hence the alias.
		alias streamParser = parseStream!Node;
		return streamParser!Config(stream);
	}
}
407 
// Two-step template: first pick the result type (`Node`), then the
// configuration (`Config`).
template parseStream(Node)
{
	// Allocate a new `Node` and fill it by parsing from the stream `s`.
	Node parseStream(Config)(ref StringStream s)
	{
		auto result = new Node;
		parseInto!Config(result, s);
		return result;
	}
}

// Stream parser producing a plain XmlNode.
alias parseNode = parseStream!XmlNode;
419 
/// Parse an SGML-ish StringStream into an XmlDocument.
/// Every top-level node up to the end of the input is parsed and added
/// to `d` as a child. Whitespace between top-level nodes is skipped.
///
/// Throws: `XmlParseException` carrying the 1-based row and column and the
/// byte offset of the stream position at the time of failure, chained to
/// the original exception.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
	{
		try
		{
			auto topLevel = new XmlNode;
			parseInto!Config(topLevel, s, null);
			d.addChild(topLevel);
			skipWhitespace(s);
		}
		catch (XmlParseException cause)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Everything consumed so far; used to locate the failure.
			auto consumed = s.s[0..s.position];
			auto bytes = consumed.representation;
			auto row = bytes.count('\n');
			// Distance back to the previous newline, or the whole prefix
			// when the failure is on the first line.
			auto column = bytes.retro.countUntil('\n');
			if (column < 0)
				column = consumed.length;
			auto message = "Error at %d:%d (offset %d)".format(
				1 + row,
				1 + column,
				consumed.length,
			);
			throw new XmlParseException(message, cause);
		}
	}
}
449 
/// Parse one node from an SGML-ish StringStream into an XmlNode.
/// For tags, the attributes and all nested children up to and including
/// the matching close tag are parsed as well.
///
/// Params:
///   node               = Node to fill in. Its `type`, `tag`, `startPos` and `endPos`
///                        are set, plus `attributes` / `children` where applicable.
///   s                  = Input stream; on success it is left positioned right after the node.
///   parentTag          = Tag of the enclosing element, passed to `Config.preserveWhitespace`.
///   preserveWhitespace = When true, leading / trailing whitespace of text is kept.
///                        It is also forced on when `Config.preserveWhitespace(parentTag)` is true,
///                        and is inherited by all nested children.
///
/// Throws: `XmlParseException` on malformed input, including when the input
/// ends while this function is still reading a node.
///
/// Note: `startPos` is recorded after the node's first character has been consumed.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	// Read one character, reporting truncated input as a parse error
	// instead of running off the end of the stream.
	char next()
	{
		if (s.position >= s.size)
			throw new XmlParseException("Unexpected end of input");
		return s.read();
	}

	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = next();
	else
		do
			c = next();
		while (isWhiteChar[c]);

	node.startPos = s.position;
	if (c!='<')  // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (c!='<')
		{
			text ~= c;
			c = next();
		}
		s.position--; // rewind to '<'
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = next();
		if (c=='!')
		{
			c = next();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the terminator has been read, then cut it off.
				do
				{
					c = next();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				// Accumulate until the terminator has been read, then cut it off.
				do
				{
					c = next();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = next();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = next(); // consume the '?' seen by peek
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // the character just read is the start of the tag name
			node.tag = readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = next(); // consume the '>' or '/' seen by peek

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					// Outer loop: repeated once per close tag encountered.
					// Inner loop: parse children until a close tag is next.
					while (true)
					{
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							if (peek(s)=='<' && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								// NOTE(review): `node.parent` is only assigned by `addChild`,
								// which the callers in this file invoke after this function
								// returns — confirm that checking the parent here is intended.
								auto parent = node.parent;
								enforce!XmlParseException(parent, "Top-level close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								enforce!XmlParseException(parent.children[$-1].tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}
617 
618 private:
619 
// Read one `name="value"` (or `name='value'`) attribute from the stream and
// store it in `node.attributes`, with entities in the value decoded.
// When `Config.optionalParameterValues` is set, a name not followed by '='
// is stored with a null value.
// Throws XmlParseException if the name is empty or the value is not quoted.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	auto attrName = readWord(s);
	if (!attrName.length)
		throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		// Value-less attribute
		if (peek(s) != '=')
		{
			node.attributes[attrName] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);

	// The opening quote also determines the closing one.
	char quote = s.read();
	if (!(quote == '\'' || quote == '"'))
		throw new XmlParseException("Expected ' or \", not %s".format(quote));

	auto raw = readUntil(s, quote);
	node.attributes[attrName] = decodeEntities(raw);
}
644 
// Return the n-th upcoming character (1 = the next one) without consuming it.
// Throws XmlParseException if that character lies beyond the end of the input.
char peek(ref StringStream s, int n=1)
{
	immutable index = s.position + n - 1;
	// Report truncated input as a parse error rather than an array bounds failure.
	if (index >= s.s.length)
		throw new XmlParseException("Unexpected end of input");
	return s.s[index];
}
649 
// Advance the stream past any whitespace at the current position.
// The read goes through the raw pointer (no bounds check); at the end of
// the text it lands on the padding StringStream's constructor places there.
void skipWhitespace(ref StringStream s)
{
	auto data = s.s.ptr;
	auto cursor = s.position;
	while (isWhiteChar[data[cursor]])
		cursor++;
	s.position = cursor;
}
655 
// Per-byte lookup tables, filled in once by the module constructor below:
// isWhiteChar[b] - b is ASCII whitespace;
// isWordChar[b]  - b may appear in a tag / attribute name
//                  (ASCII letters and digits, '-', '_' and ':').
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (code; 0..256)
	{
		immutable namePunctuation = code=='-' || code=='_' || code==':';
		isWordChar[code] = namePunctuation || isAlphaNum(code);
		isWhiteChar[code] = isWhite(code);
	}
}
666 
// Consume the longest run of word characters (see isWordChar) at the current
// position and return it as a slice of the input. Returns an empty string,
// without advancing, when the next character is not a word character.
string readWord(ref StringStream stream)
{
	auto data = stream.s.ptr;
	immutable limit = stream.s.length;
	immutable begin = stream.position;

	auto cursor = begin;
	while (cursor < limit && isWordChar[data[cursor]])
		cursor++;

	stream.position = cursor;
	// Slice through the pointer, as the scan above is already bounded.
	return data[begin .. cursor];
}
678 
// Consume the next character and verify that it is `c`.
// Throws XmlParseException if a different character is found,
// or if the input has already ended.
void expect(ref StringStream s, char c)
{
	// Report truncated input as a parse error rather than an array bounds failure.
	if (s.position >= s.size)
		throw new XmlParseException("Expected " ~ c ~ ", got end of input");
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}
685 
// Return the text from the current position up to (not including) the next
// occurrence of `until`, and leave the stream positioned just after that
// occurrence. The result is a slice of the input.
// Throws XmlParseException if `until` does not occur before the end of the input.
string readUntil(ref StringStream s, char until)
{
	immutable limit = s.s.length;
	immutable start = s.position;

	// Bounded scan: never look beyond the end of the text.
	auto end = start;
	while (end < limit && s.s[end] != until)
		end++;
	if (end >= limit)
		throw new XmlParseException("Unexpected end of input while looking for " ~ until);

	s.position = end + 1; // skip the delimiter itself
	return s.s[start .. end];
}
695 
// Round trip: parsing a document and serializing it again
// must reproduce the input exactly.
unittest
{
	enum source =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto parsed = new XmlDocument(source);
	auto rendered = parsed.toString();
	assert(rendered == source, rendered);
}
708 
// Whitespace handling of text content, with and without preservation.
unittest
{
	// Parse `<tag>` ~ s ~ `</tag>` and return the first child's `tag`
	// (i.e. the text), or null when the element has no children.
	string testOne(bool preserve)(string s)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto stream = StringStream("<tag>" ~ s ~ "</tag>");
		auto root = new XmlNode;
		parseInto!ParseConfig(root, stream, null);
		// import std.stdio; writeln(preserve, ": ", stream.s, " -> ", root.toString);
		if (root.children.length == 0)
			return null;
		return root.children[0].tag;
	}

	foreach (content; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		auto trimmed = strip(content);

		auto withoutPreserve = testOne!false(content);
		assert(withoutPreserve == trimmed,
			"Parsing <tag>" ~ content ~ "</tag> while not preserving whitespace, expecting '" ~ trimmed ~ "', got '" ~ withoutPreserve ~ "'");

		auto withPreserve = testOne!true(content);
		assert(withPreserve == content,
			"Parsing <tag>" ~ content ~ "</tag> while preserving whitespace, expecting '" ~ content ~ "', got '" ~ withPreserve ~ "'");
	}
}
735 
// Whitespace preservation requested for a tag also applies
// to text nested deeper inside it.
unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto stream = StringStream("<a><b> foo </b></a>");
	auto root = new XmlNode;
	parseInto!ParseConfig(root, stream, null);

	auto inner = root.children[0];
	auto textNode = inner.children[0];
	assert(textNode.tag == " foo ");
}