1 /**
2  * Light read-only XML library
3  * May be deprecated in the future.
4  * See other XML modules for better implementations.
5  *
6  * License:
7  *   This Source Code Form is subject to the terms of
8  *   the Mozilla Public License, v. 2.0. If a copy of
9  *   the MPL was not distributed with this file, You
10  *   can obtain one at http://mozilla.org/MPL/2.0/.
11  *
12  * Authors:
13  *   Vladimir Panteleev <vladimir@thecybershadow.net>
14  *   Simon Arlott
15  */
16 
17 module ae.utils.xml.lite;
18 
19 // TODO: better/safer handling of malformed XML
20 
21 import std.string;
22 import std.ascii;
23 import std.exception;
24 
25 import ae.utils.array;
26 import ae.utils.xml.common;
27 import ae.utils.xml.entities;
28 import ae.utils.xmlwriter;
29 
30 // ************************************************************************
31 
/// Minimal in-memory character stream over a string
/// (std.stream.Stream-like type with bonus speed).
private struct StringStream
{
	/// The text being read.
	string s;
	/// Index into `s` of the next character `read` will return.
	size_t position;

	@disable this();
	@disable this(this);

	/// Wrap `s` for reading, starting at position 0.
	this(string s)
	{
		// Dirty precaution: the stored slice is a copy of the input that is
		// immediately followed in memory by these extra characters, so code
		// that scans through `s.ptr` without bounds checks meets a quote,
		// a '>' or a NUL just past the end of the text.
		enum ditch = "'\">\0\0\0\0\0";
		auto padded = s ~ ditch;
		this.s = padded[0 .. s.length];
	}

	/// Return the character at the current position and advance past it.
	char read()
	{
		immutable current = position;
		position = current + 1;
		return s[current];
	}

	/// Number of characters in the stream (the padding is not counted).
	@property size_t size()
	{
		return s.length;
	}
}
49 
50 // ************************************************************************
51 
/// The kind of content an `XmlNode` holds.
enum XmlNodeType
{
	None,    /// Default for a node constructed without a type; cannot be written out or queried for text.
	Root,    /// Container for top-level nodes (used by `XmlDocument`); only its children are written.
	Node,    /// A regular element with a tag name, attributes and children.
	Comment, /// A `<!-- ... -->` comment; `tag` holds the comment body.
	Meta,    /// A `<? ... ?>` processing instruction; `tag` holds its name.
	DocType, /// Any other `<! ... >` declaration; `tag` holds the text between `<!` and `>`.
	CData,   /// A `<![CDATA[ ... ]]>` section; `tag` holds its contents.
	Text,    /// A run of character data; `tag` holds the entity-decoded text.
	Raw, // Never created during parsing. Programs can put raw XML fragments in `Raw` nodes to emit it as-is.
}

/// Attribute name/value pairs of a node.
alias XmlAttributes = OrderedMap!(string, string);
66 
/// A node of an XML tree: an element, text run, comment, processing
/// instruction, doctype, CDATA section, raw fragment, or a root container.
class XmlNode
{
	/// Element / processing-instruction name for `Node` and `Meta` nodes;
	/// for the other node types, the node's textual content.
	string tag;
	/// Attributes; written out for `Node` and `Meta` nodes.
	XmlAttributes attributes;
	/// The node this one was attached to with `addChild`, or null.
	XmlNode parent;
	/// Child nodes, in the order they were added.
	XmlNode[] children;
	/// What kind of node this is.
	XmlNodeType type;
	/// Stream positions recorded by the parser around this node.
	ulong startPos, endPos;

	/// Parse a single node from `s` using the default `XmlParseConfig`.
	this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }
	/// Parse a single node from the string `s` using the default `XmlParseConfig`.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Create an empty node of the given type and tag.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Set attribute `name` to `value`. Returns this node, to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Append `child` to this node's children and make this node its parent.
	/// Returns this node (not the child), to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Serialize this node and its descendants with `XmlWriter`.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serialize this node and its descendants with `PrettyXmlWriter`.
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Write this node and its descendants to `output`.
	/// Note that `XmlWriter` here is a template parameter, so any writer
	/// type providing the methods used below is accepted.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				// The root itself produces no markup
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// An element holding a single text node is kept on one line:
					// formatting is switched off while its content is written.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				// CDATA content is written through the writer's regular text output
				output.text(tag);
				return;
			case XmlNodeType.Raw:
				// Emitted verbatim, bypassing the writer's text handling
				output.startLine();
				output.output.put(tag);
				output.newLine();
				return;
		}
	}

	/// The text content of this node.
	/// For `Text` and `CData` nodes this is the node's own content; for
	/// `Node` and `Root` it is the concatenation of the children's text,
	/// preceded by a newline when the tag is "br". `Comment`, `Meta` and
	/// `DocType` nodes yield null. Must not be used on `None` or `Raw` nodes.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");
		}
	}

	/// Return the first direct child element named `tag`, or null if there is none.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Return all direct child elements named `tag` (possibly none).
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like `findChild`, but throws `XmlParseException` when no such child exists.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Return the `index`-th direct child element named `tag`.
	/// Throws `XmlParseException` when there are not enough such children.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Return the `index`-th child (of any node type) by reference.
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Number of children.
	final @property size_t length() { return children.length; }
	alias opDollar = length;

	/// Iterate over the children with `foreach`.
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// Index with size_t, the type of `children.length`, rather than a
		// 32-bit signed int. The length is re-read on every iteration, so
		// children added by the delegate are visited too.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Return a deep copy of this node: type, tag and attributes are copied
	/// and every child is duplicated recursively. The copy has no parent,
	/// and its `startPos`/`endPos` are left at their defaults.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}
273 
/// A whole document: a `Root` node whose children are the document's
/// top-level nodes (declaration, doctype, root element, ...).
class XmlDocument : XmlNode
{
	/// Create an empty document.
	this()
	{
		super(XmlNodeType.Root);
		tag = "<Root>";
	}

	/// Parse every top-level node from `s` using the default `XmlParseConfig`.
	this(ref StringStream s) { this(); parseInto!XmlParseConfig(this, s); }
	/// Parse every top-level node from the string `s` using the default `XmlParseConfig`.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Return a deep copy of this document.
	final @property XmlDocument dup()
	{
		auto result = new XmlDocument();
		// Attach each copied child through addChild so that its `parent`
		// refers to the new document. Taking the children array of a
		// temporary `super.dup()` node would leave every copied child
		// pointing at that discarded temporary instead.
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}
292 
/// The logic for how to handle a node's closing tags.
/// A parse configuration reports one of these per tag name through its
/// `nodeCloseMode` function.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
// NOTE: the `optional` mode below is commented out; it is not a member
// of this enum and the parser has no handling for it.
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}
324 
/// Configuration for parsing XML.
/// This is the configuration used by the `XmlNode` / `XmlDocument`
/// constructors and by `xmlParse`.
struct XmlParseConfig
{
static:
	/// Every element needs an explicit close tag or a self-closing tag.
	NodeCloseMode nodeCloseMode(string tag) { return NodeCloseMode.always; }
	/// Whitespace around text is never preserved, whatever the enclosing tag.
	bool preserveWhitespace(string tag) { return false; }
	/// Every attribute must have a value: `name="value"` or `name='value'`.
	enum optionalParameterValues = false;
}
333 
/// Configuration for strict parsing of HTML5.
/// All void tags must never be closed, and all
/// non-void tags must always be explicitly closed.
/// Attributes must still be quoted like in XML.
struct Html5StrictParseConfig
{
static:
	/// Tag names treated as void elements (never closed).
	immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	/// `never` for the void elements listed above, `always` for everything else.
	NodeCloseMode nodeCloseMode(string tag)
	{
		if (tag.isOneOf(voidElements))
			return NodeCloseMode.never;
		return NodeCloseMode.always;
	}

	/// Attributes may appear without a value (e.g. `<input disabled>`).
	enum optionalParameterValues = true;

	/// Whitespace is currently never preserved.
	bool preserveWhitespace(string tag)
	{
		return false; /*TODO*/
	}
}
359 
/// Parse an SGML-ish string into an XmlNode.
/// Instantiate with a parse configuration: `parse!Config(text)`.
alias parse = parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument.
/// Instantiate with a parse configuration: `parseDocument!Config(text)`.
alias parseDocument = parseString!XmlDocument;

/// Parse a string into an XmlDocument using the default `XmlParseConfig`.
alias xmlParse = parseDocument!XmlParseConfig;
367 
368 private:
369 
public // alias
template parseString(Node)
{
	/// Parse the string `s` into a new `Node`, following the rules of `Config`.
	Node parseString(Config)(string s)
	{
		// The intermediate alias lets the two template argument lists
		// (node type, then configuration) be applied one after the other.
		alias streamParser = parseStream!Node;
		auto stream = StringStream(s);
		return streamParser!Config(stream);
	}
}
380 
template parseStream(Node)
{
	/// Allocate a new `Node` and fill it by parsing from `s`
	/// according to `Config`.
	Node parseStream(Config)(ref StringStream s)
	{
		auto parsed = new Node;
		parseInto!Config(parsed, s);
		return parsed;
	}
}

/// Stream parser producing a plain `XmlNode`; instantiate with a configuration.
alias parseNode = parseStream!XmlNode;
392 
/// Parse an SGML-ish StringStream into an XmlDocument
///
/// Every top-level node found in `s` is appended to `d`. A parse error is
/// rethrown wrapped in another `XmlParseException` whose message gives the
/// line, column and offset the stream had reached.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	while (s.position < s.size)
	{
		try
		{
			auto topLevel = new XmlNode;
			parseInto!Config(topLevel, s, null);
			d.addChild(topLevel);
			skipWhitespace(s);
		}
		catch (XmlParseException e)
		{
			import std.algorithm.searching;
			import std.range : retro;

			// Everything consumed so far; the error location is derived from it
			auto consumed = s.s[0..s.position];
			auto bytes = consumed.representation;
			auto row = bytes.count('\n');
			// Distance back to the previous line break, or to the very
			// start of the input when still on the first line
			auto column = bytes.retro.countUntil('\n');
			if (column < 0)
				column = consumed.length;
			throw new XmlParseException(
				"Error at %d:%d (offset %d)".format(1 + row, 1 + column, consumed.length),
				e);
		}
	}
}
422 
/// Parse an SGML-ish StringStream into an XmlNode
///
/// Reads exactly one node (text, comment, CDATA, doctype, processing
/// instruction, or element with all of its children) starting at the
/// current stream position, and stores the result in `node`.
///
/// Params:
///   node               = node to fill in (type, tag, attributes, children, positions)
///   s                  = input stream; left positioned after the parsed node
///   parentTag          = tag of the enclosing element, passed to `Config.preserveWhitespace`
///   preserveWhitespace = true when an enclosing element already requested
///                        whitespace preservation
///
/// Throws: `XmlParseException` on malformed input. Running out of input
/// while this function reads or peeks a character is reported the same way,
/// instead of reading out of bounds.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	// Report "ran out of input" as a parse error rather than as an
	// out-of-range read.
	void requireInput()
	{
		enforce!XmlParseException(s.position < s.size, "Unexpected end of input");
	}

	// Bounds-checked replacement for s.read().
	char readChar()
	{
		requireInput();
		return s.read();
	}

	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = readChar();
	else
		do
			c = readChar();
		while (isWhiteChar[c]);

	node.startPos = s.position;
	if (c!='<')  // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (c!='<')
		{
			text ~= c;
			c = readChar(); // fails cleanly if the text runs to the end of input
		}
		s.position--; // rewind to '<'
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = readChar();
		if (c=='!')
		{
			c = readChar();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the terminator has been read, then cut it off
				do
				{
					c = readChar();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				// Accumulate until the terminator has been read, then cut it off
				do
				{
					c = readChar();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = readChar();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				requireInput();
				if (peek(s)=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = readChar(); // the '?' seen by peek above
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // the character just read is the first of the tag name
			node.tag = readWord(s);
			while (true)
			{
				skipWhitespace(s);
				requireInput();
				c = peek(s);
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = readChar(); // '>' or '/'

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					// Outer loop: repeats when a stray close tag of an
					// implicitly-closed child has been consumed.
					while (true)
					{
						// Inner loop: read children until a close tag is next.
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							enforce!XmlParseException(s.position < s.size,
								"Unexpected end of input inside <%s>".format(node.tag));
							// Only look at the second character if there is one;
							// a lone trailing '<' is reported by the child parse.
							if (peek(s)=='<' && s.position + 1 < s.size && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								// An implicitly-closed element returns right after its
								// open tag, so its optional close tag is encountered
								// here, while parsing the enclosing element. The element
								// it closes must therefore be this node's most recent
								// child. (This node's own `parent` is not set until the
								// caller attaches it, so it cannot be consulted here.)
								enforce!XmlParseException(node.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								auto previous = node.children[$-1];
								enforce!XmlParseException(previous.type == XmlNodeType.Node && previous.tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								// Consume the rest of the close tag before
								// going back to reading children.
								expect(s, '>');
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}
590 
591 private:
592 
/// Read one attribute (`name`, `name="value"` or `name='value'`) from `s`
/// and store it in `node.attributes`. Values are entity-decoded. A bare
/// `name` is accepted only when `Config.optionalParameterValues` is true,
/// and is stored with a null value.
///
/// Throws: `XmlParseException` if no attribute name is present, the value
/// is not quoted, or the input ends before the attribute is complete.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string name = readWord(s);
	if (name.length==0) throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);
	// The peek / expect below would otherwise read past the end of the input
	if (s.position >= s.size)
		throw new XmlParseException("Unexpected end of input after attribute name " ~ name);

	static if (Config.optionalParameterValues)
	{
		if (peek(s) != '=')
		{
			node.attributes[name] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);
	if (s.position >= s.size)
		throw new XmlParseException("Unexpected end of input in value of attribute " ~ name);
	char delim;
	delim = s.read();
	if (delim != '\'' && delim != '"')
		throw new XmlParseException("Expected ' or \", not %s".format(delim));
	string value = readUntil(s, delim);
	node.attributes[name] = decodeEntities(value);
}
617 
/// Return the `n`-th upcoming character (1 = the next one to be read)
/// without advancing the stream.
char peek(ref StringStream s, int n=1)
{
	immutable index = s.position + n - 1;
	return s.s[index];
}
622 
/// Advance the stream past any whitespace at the current position.
/// Characters are read through the raw pointer, i.e. without bounds checks.
void skipWhitespace(ref StringStream s)
{
	auto base = s.s.ptr;
	while (isWhiteChar[base[s.position]])
		++s.position;
}
628 
/// Lookup tables indexed by character value:
/// `isWhiteChar[c]` is true for whitespace characters, `isWordChar[c]` for
/// characters allowed in names (alphanumerics, '-', '_' and ':').
__gshared bool[256] isWhiteChar, isWordChar;

/// Fill the lookup tables once, at program start.
shared static this()
{
	foreach (c; 0..256)
	{
		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = c=='-' || c=='_' || c==':' || isAlphaNum(c);
	}
}
639 
/// Read the longest run of word characters (see `isWordChar`) at the
/// current position and advance past it. Returns a slice of the input,
/// which is empty when the next character is not a word character or the
/// input is exhausted.
string readWord(ref StringStream stream)
{
	auto first = stream.s.ptr + stream.position;
	auto limit = stream.s.ptr + stream.s.length;
	auto cursor = first;
	for (; cursor < limit && isWordChar[*cursor]; cursor++) {}
	immutable count = cursor - first;
	stream.position += count;
	return first[0..count];
}
651 
/// Consume one character from `s` and verify that it is `c`.
///
/// Throws: `XmlParseException` if the next character differs from `c`, or
/// if the input has already been fully consumed.
void expect(ref StringStream s, char c)
{
	// Report a truncated document as a parse error instead of reading past the end
	enforce!XmlParseException(s.position < s.size, "Expected " ~ c ~ ", got end of input");
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}
658 
/// Read characters up to (not including) the next occurrence of `until`,
/// and advance the stream past that terminator. Returns a slice of the input.
///
/// Throws: `XmlParseException` if `until` does not occur in the rest of
/// the input.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	// Never scan beyond the end of the input: an unbounded scan would
	// return a slice, and leave a position, past the end of the text.
	while (p < end && *p != until) p++;
	if (p >= end)
		throw new XmlParseException("Expected " ~ until ~ " before end of input");
	auto len = p-start;
	s.position += len + 1; // also skip the terminator
	return start[0..len];
}
668 
// Round trip: parsing a small document and writing it back with toString
// must reproduce the original text exactly.
unittest
{
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText, doc.toString());
}
681 
// Whitespace handling: with preservation off, text inside an element is
// stripped on both sides; with preservation on, it is kept verbatim.
unittest
{
	// Parse "<tag>" ~ s ~ "</tag>" and return the content of the first
	// child, or null when the element has no children.
	string testOne(bool preserve)(string s)
	{
		// Same as XmlParseConfig, except for the whitespace policy
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto node = new XmlNode;
		auto str = StringStream("<tag>" ~ s ~ "</tag>");
		parseInto!ParseConfig(node, str, null);
		// import std.stdio; writeln(preserve, ": ", str.s, " -> ", node.toString);
		return node.children.length ? node.children[0].tag : null;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		assert(testOne!false(tag) == strip(tag),
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ strip(tag) ~ "', got '" ~ testOne!false(tag) ~ "'");
		assert(testOne!true(tag) == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ testOne!true(tag) ~ "'");
	}
}
708 
// Whitespace preservation requested for an element ("a") must also apply
// to text nested deeper inside it (here, inside its child "b").
unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto node = new XmlNode;
	auto str = StringStream("<a><b> foo </b></a>");
	parseInto!ParseConfig(node, str, null);
	assert(node.children[0].children[0].tag == " foo ");
}