1 /**
2  * Light read-only XML library
3  * May be deprecated in the future.
4  * See other XML modules for better implementations.
5  *
6  * License:
7  *   This Source Code Form is subject to the terms of
8  *   the Mozilla Public License, v. 2.0. If a copy of
9  *   the MPL was not distributed with this file, You
10  *   can obtain one at http://mozilla.org/MPL/2.0/.
11  *
12  * Authors:
13  *   Vladimir Panteleev <vladimir@thecybershadow.net>
14  *   Simon Arlott
15  */
16 
17 module ae.utils.xml.lite;
18 
19 // TODO: better/safer handling of malformed XML
20 
21 import std.string;
22 import std.ascii;
23 import std.exception;
24 
25 import ae.utils.array;
26 import ae.utils.xml.common;
27 import ae.utils.xml.entities;
28 import ae.utils.xmlwriter;
29 
30 // ************************************************************************
31 
/// Minimal in-memory character stream over a `string`.
/// Stands in for a std.stream.Stream-like type, trading generality for speed.
private struct StringStream
{
	/// The text being read.
	string s;
	/// Index into `s` of the next character `read` will return.
	size_t position;

	@disable this();
	@disable this(this);

	/// Wraps `s` for reading, starting at position 0.
	///
	/// The text is copied into a new buffer followed by a few sentinel
	/// characters (both quote kinds, `>` and NULs), and only the original
	/// part is exposed through `this.s`. Code that scans through raw
	/// pointers and runs slightly past the end therefore hits a sentinel
	/// instead of unrelated memory.
	this(string s)
	{
		enum ditch = "'\">\0\0\0\0\0"; // Dirty precaution
		auto padded = s ~ ditch;
		this.s = padded[0 .. padded.length - ditch.length];
	}

	/// Returns the character at `position` and advances past it.
	/// Indexing is a normal (bounds-checked) array access.
	char read()
	{
		const at = position++;
		return s[at];
	}

	/// Total number of characters in the stream (not the number remaining).
	@property size_t size()
	{
		return s.length;
	}
}
49 
50 // ************************************************************************
51 
/// The kind of an `XmlNode`. Determines how the node's `tag` field is
/// interpreted and how the node is serialized.
enum XmlNodeType
{
	/// Not yet assigned. Default for a freshly constructed node;
	/// serializing or extracting text from such a node is an assertion failure.
	None,
	/// Container for top-level nodes (used by `XmlDocument`); serialized as just its children.
	Root,
	/// An element: `tag` is the element name, with attributes and children.
	Node,
	/// A `<!-- ... -->` comment; `tag` holds the comment body.
	Comment,
	/// A `<? ... ?>` processing instruction; `tag` holds its name, with attributes.
	Meta,
	/// Any other `<! ... >` declaration (doctype, etc.); `tag` holds its contents.
	DocType,
	/// A `<![CDATA[ ... ]]>` section; `tag` holds the contents.
	CData,
	/// Character data; `tag` holds the entity-decoded text.
	Text,
	Raw, // Never created during parsing. Programs can put raw XML fragments in `Raw` nodes to emit it as-is.
}
64 
/// Attribute storage for a node: a string-to-string `OrderedMap`.
alias XmlAttributes = OrderedMap!(string, string);
66 
/// A node of an XML tree, either produced by the parser or built by hand.
class XmlNode
{
	/// For `Node` and `Meta` nodes, the element / processing-instruction name.
	/// For every other node type, the node's textual content
	/// (text, comment body, CDATA contents, doctype contents, raw XML).
	string tag;
	/// Attributes of this node (meaningful for `Node` and `Meta` nodes).
	XmlAttributes attributes;
	/// The node this one was added to with `addChild`, or `null`.
	XmlNode parent;
	/// Child nodes, in document order.
	XmlNode[] children;
	/// What kind of node this is.
	XmlNodeType type;
	/// Stream offsets recorded by the parser around this node.
	/// Left at 0 for nodes that were not produced by parsing.
	ulong startPos, endPos;

	/// Parses a single node from the stream using the default `XmlParseConfig`.
	this(ref StringStream s) { parseInto!XmlParseConfig(this, s, null); }
	/// Parses a single node from the string using the default `XmlParseConfig`.
	this(string s) { auto ss = StringStream(s); this(ss); }

	/// Creates an empty node of the given type and tag.
	this(XmlNodeType type = XmlNodeType.None, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Sets attribute `name` to `value`. Returns `this` to allow chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Appends `child` and sets its `parent` to this node.
	/// Returns `this` (not the child) to allow chaining.
	XmlNode addChild(XmlNode child)
	{
		child.parent = this;
		children ~= child;
		return this;
	}

	/// Serializes this node and its descendants using `XmlWriter`.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Serializes this node and its descendants using `PrettyXmlWriter`.
	string toPrettyString() const
	{
		PrettyXmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Writes this node and its descendants to the given writer.
	/// Asserts if the node's type is `None`.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Root:
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				if (children.length)
				{
					// An element holding a single text node is written on one
					// line: formatting is switched off around its contents.
					bool oneLine = children.length == 1 && children[0].type == XmlNodeType.Text;
					if (oneLine)
						output.formatter.enabled = false;
					output.endAttributes();
					writeChildren();
					output.endTag(tag);
					if (oneLine)
					{
						output.formatter.enabled = true;
						output.newLine();
					}
				}
				else
					output.endAttributesAndTag();
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.startLine();
				output.text(tag);
				output.newLine();
				return;
			case XmlNodeType.Comment:
				output.startLine();
				output.comment(tag);
				return;
			case XmlNodeType.CData:
				// CDATA contents are emitted through the writer's text() call.
				output.text(tag);
				return;
			case XmlNodeType.Raw:
				// Raw nodes bypass the writer's API and go straight to its output.
				output.startLine();
				output.output.put(tag);
				output.newLine();
				return;
		}
	}

	/// The textual content of this node.
	///
	/// Text and CDATA nodes yield their content. Elements and roots yield
	/// the concatenation of their children's text, preceded by a newline
	/// when the node's tag is "br". Comments, processing instructions and
	/// doctypes yield `null`. `None` and `Raw` nodes are an assertion failure.
	@property string text()
	{
		final switch (type)
		{
			case XmlNodeType.None:
				assert(false);
			case XmlNodeType.Text:
			case XmlNodeType.CData:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string result;
				if (tag == "br")
					result = "\n";
				foreach (child; children)
					result ~= child.text();
				return result;
			case XmlNodeType.Comment:
			case XmlNodeType.Meta:
			case XmlNodeType.DocType:
				return null;
			case XmlNodeType.Raw:
				assert(false, "Can't extract text from Raw nodes");
		}
	}

	/// Returns the first direct child element named `tag`, or `null` if there is none.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all direct child elements named `tag`, in document order.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Returns the first direct child element named `tag`.
	/// Throws: `XmlParseException` if there is no such child.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new XmlParseException("No such child: " ~ tag);
		return node;
	}

	/// Returns the `index`-th (0-based) direct child element named `tag`.
	/// Throws: `XmlParseException` if fewer than `index + 1` such children exist.
	final XmlNode opIndex(string tag, size_t index)
	{
		// Count matches in place rather than building an array of all of
		// them just to pick one.
		size_t seen;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
			{
				if (seen == index)
					return child;
				seen++;
			}
		// Every child was visited, so `seen` is the total number of matches.
		throw new XmlParseException(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, seen));
	}

	/// Returns a reference to the child at position `index` (any node type).
	final ref XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Number of direct children.
	final @property size_t length() { return children.length; }
	alias opDollar = length;

	/// Iterates over the direct children (`foreach (child; node)`).
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// The length is re-read on every iteration, so children appended or
		// removed by the delegate are taken into account.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Returns a deep copy of this node: type, tag, attributes and
	/// (recursively) children. The copy has no parent, and `startPos` /
	/// `endPos` are not carried over.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.reserve(children.length);
		foreach (child; children)
			result.addChild(child.dup);
		return result;
	}
}
273 
/// Root of a parsed document: a `Root`-typed node tagged "<Root>" whose
/// children are the document's top-level nodes.
class XmlDocument : XmlNode
{
	/// Creates an empty document.
	this()
	{
		super(XmlNodeType.Root, "<Root>");
	}

	/// Parses the whole stream into this document using `XmlParseConfig`.
	this(ref StringStream s)
	{
		this();
		parseInto!XmlParseConfig(this, s);
	}

	/// Parses the whole string into this document using `XmlParseConfig`.
	this(string s)
	{
		auto stream = StringStream(s);
		this(stream);
	}
}
285 
/// The logic for how to handle a node's closing tags.
/// A parse configuration returns one of these for each tag name
/// from its `nodeCloseMode` function.
enum NodeCloseMode
{
	/// This element must always have an explicit closing tag
	/// (or a self-closing tag). An unclosed tag will lead to
	/// a parse error.
	/// In XML, all tags are "always".
	always,
// Currently disabled (commented out):
/*
	/// Close tags are optional. When an element with a tag is
	/// encountered directly under an element with the same tag,
	/// it is assumed that the first element is closed before
	/// the second, so the two are siblings, not parent/child.
	/// Thus, `<p>a<p>b</p>` is parsed as `<p>a</p><p>b</p>`,
	/// not `<p>a<p>b</p></p>`, however `<p>a<div><p>b</div>` is
	/// still parsed as `<p>a<div><p>b</p></div></p>`.
	/// This mode can be used for relaxed HTML parsing.
	optional,
*/
	/// Close tags are optional, but are implied when absent.
	/// As a result, these elements cannot have any content,
	/// and any close tags must be adjacent to the open tag.
	implicit,

	/// This element is void and must never have a closing tag.
	/// It is always implicitly closed right after opening.
	/// A close tag is always an error.
	/// This mode can be used for strict parsing of HTML5 void
	/// elements.
	never,
}
317 
/// Default parse configuration, for XML: every element needs an explicit
/// (or self-closing) close tag, whitespace around text is not preserved,
/// and every attribute must have a value.
struct XmlParseConfig
{
	/// Close-tag policy for elements named `tag`.
	static NodeCloseMode nodeCloseMode(string tag)
	{
		return NodeCloseMode.always;
	}

	/// Whether whitespace inside elements named `tag` is kept verbatim.
	static bool preserveWhitespace(string tag)
	{
		return false;
	}

	/// Whether attributes may appear without `=value`.
	enum optionalParameterValues = false;
}
326 
/// Parse configuration for strict HTML5.
/// Void elements must never be closed, and every other element must be
/// closed explicitly. Attribute values, when present, must still be
/// quoted as in XML.
struct Html5StrictParseConfig
{
	/// Names of the elements treated as void.
	static immutable voidElements = [
		"area"   , "base"  , "br"   , "col" ,
		"command", "embed" , "hr"   , "img" ,
		"input"  , "keygen", "link" , "meta",
		"param"  , "source", "track", "wbr" ,
	];

	/// `never` for void elements, `always` for everything else.
	static NodeCloseMode nodeCloseMode(string tag)
	{
		if (tag.isOneOf(voidElements))
			return NodeCloseMode.never;
		return NodeCloseMode.always;
	}

	/// Whether whitespace inside elements named `tag` is kept verbatim.
	static bool preserveWhitespace(string tag) { return false; /*TODO*/ }

	/// Attributes may appear without `=value`.
	enum optionalParameterValues = true;
}
352 
/// Parse an SGML-ish string into an XmlNode.
/// Takes the parse configuration as a template argument,
/// e.g. `parse!XmlParseConfig(s)`.
alias parse = parseString!XmlNode;

/// Parse an SGML-ish string into an XmlDocument.
/// Takes the parse configuration as a template argument,
/// e.g. `parseDocument!XmlParseConfig(s)`.
alias parseDocument = parseString!XmlDocument;

/// Parse a string into an XmlDocument using the default `XmlParseConfig`.
alias xmlParse = parseDocument!XmlParseConfig;
360 
361 private:
362 
public // alias
template parseString(Node)
{
	/// Parses the string `s` into a newly created `Node`
	/// according to the parse configuration `Config`.
	Node parseString(Config)(string s)
	{
		// Two-step instantiation: first bind the node type, then the config.
		alias parseFrom = parseStream!Node;
		auto stream = StringStream(s);
		return parseFrom!Config(stream);
	}
}
373 
template parseStream(Node)
{
	/// Creates a new `Node` and fills it by parsing from the stream `s`
	/// according to the parse configuration `Config`.
	Node parseStream(Config)(ref StringStream s)
	{
		Node result = new Node;
		parseInto!Config(result, s);
		return result;
	}
}

/// Parses a single `XmlNode` from a stream; takes the configuration as a template argument.
alias parseNode = parseStream!XmlNode;
385 
/// Parse an SGML-ish StringStream into an XmlDocument.
///
/// Top-level nodes are read one after another until the stream is
/// exhausted, and each is appended to `d`.
///
/// Throws: `XmlParseException` carrying the line, column and offset the
/// stream had reached, chained to the original parse error.
void parseInto(Config)(XmlDocument d, ref StringStream s)
{
	skipWhitespace(s);
	try
	{
		while (s.position < s.size)
		{
			auto topLevel = new XmlNode;
			parseInto!Config(topLevel, s, null);
			d.addChild(topLevel);
			skipWhitespace(s);
		}
	}
	catch (XmlParseException e)
	{
		import std.algorithm.searching : count, countUntil;
		import std.range : retro;

		// Everything consumed so far; its length is the error offset.
		auto consumed = s.s[0..s.position];
		auto bytes    = consumed.representation;
		auto row      = bytes.count('\n');
		// Distance back to the previous newline, or to the start of input.
		auto column   = bytes.retro.countUntil('\n');
		if (column < 0)
			column = consumed.length;
		throw new XmlParseException("Error at %d:%d (offset %d)".format(
			1 + row,
			1 + column,
			consumed.length,
		), e);
	}
}
415 
/// Parse an SGML-ish StringStream into an XmlNode.
///
/// Reads exactly one node (text, comment, CDATA section, doctype-like
/// declaration, processing instruction, or element with all of its
/// descendants) starting at the stream's current position, and stores
/// the result in `node`.
///
/// Params:
///   node               = the node to fill in
///   s                  = the stream to read from; left positioned just after the node
///                        (for text nodes, at the `<` that ended the text)
///   parentTag          = tag of the enclosing element, passed to `Config.preserveWhitespace`
///   preserveWhitespace = whether an enclosing element already requested
///                        that whitespace be kept verbatim
///
/// Throws: `XmlParseException` on malformed input, including input that
/// ends in the middle of the node.
void parseInto(Config)(XmlNode node, ref StringStream s, string parentTag = null, bool preserveWhitespace = false)
{
	// Consumes and returns the next character. Truncated input is reported
	// as a parse error instead of indexing past the end of the buffer.
	char next()
	{
		enforce!XmlParseException(s.position < s.size, "Unexpected end of input");
		return s.read();
	}

	// Returns the next character without consuming it, with the same
	// end-of-input handling as next().
	char peekNext()
	{
		enforce!XmlParseException(s.position < s.size, "Unexpected end of input");
		return peek(s);
	}

	char c;

	preserveWhitespace |= Config.preserveWhitespace(parentTag);
	if (preserveWhitespace)
		c = next();
	else
		do
			c = next();
		while (isWhiteChar[c]);

	// Note: recorded after the node's first character has been consumed.
	node.startPos = s.position;
	if (c!='<')  // text node
	{
		node.type = XmlNodeType.Text;
		string text;
		while (c!='<')
		{
			text ~= c;
			c = next();
		}
		s.position--; // rewind to '<'
		if (!preserveWhitespace)
			while (text.length && isWhiteChar[text[$-1]])
				text = text[0..$-1];
		node.tag = decodeEntities(text);
		//tag = tag.strip();
	}
	else
	{
		c = next();
		if (c=='!')
		{
			c = next();
			if (c == '-') // comment
			{
				expect(s, '-');
				node.type = XmlNodeType.Comment;
				string tag;
				// Accumulate until the text ends with the terminator, then cut it off.
				do
				{
					c = next();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "-->");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else
			if (c == '[') // CDATA
			{
				foreach (x; "CDATA[")
					expect(s, x);
				node.type = XmlNodeType.CData;
				string tag;
				// Accumulate until the text ends with the terminator, then cut it off.
				do
				{
					c = next();
					tag ~= c;
				} while (tag.length<3 || tag[$-3..$] != "]]>");
				tag = tag[0..$-3];
				node.tag = tag;
			}
			else // doctype, etc.
			{
				node.type = XmlNodeType.DocType;
				while (c != '>')
				{
					node.tag ~= c;
					c = next();
				}
			}
		}
		else
		if (c=='?')
		{
			node.type = XmlNodeType.Meta;
			node.tag = readWord(s);
			if (node.tag.length==0) throw new XmlParseException("Invalid tag");
			while (true)
			{
				skipWhitespace(s);
				if (peekNext()=='?')
					break;
				readAttribute!Config(node, s);
			}
			c = next(); // the '?'
			expect(s, '>');
		}
		else
		if (c=='/')
			throw new XmlParseException("Unexpected close tag");
		else
		{
			node.type = XmlNodeType.Node;
			s.position--; // un-read the first character of the tag name
			node.tag = readWord(s);
			while (true)
			{
				skipWhitespace(s);
				c = peekNext();
				if (c=='>' || c=='/')
					break;
				readAttribute!Config(node, s);
			}
			c = next(); // '>' or '/'

			auto closeMode = Config.nodeCloseMode(node.tag);
			if (closeMode == NodeCloseMode.never)
				enforce!XmlParseException(c=='>', "Self-closing void tag <%s>".format(node.tag));
			else
			if (closeMode == NodeCloseMode.implicit)
			{
				if (c == '/')
					expect(s, '>');
			}
			else
			{
				if (c=='>')
				{
					// Outer loop: repeats after a close tag that belonged to an
					// implicitly-closed child. Inner loop: reads children until
					// the next close tag.
					while (true)
					{
						while (true)
						{
							if (!preserveWhitespace && !Config.preserveWhitespace(node.tag))
								skipWhitespace(s);
							// A lone '<' at the very end of input is left for the
							// child parser, which reports the truncation.
							if (peekNext()=='<' && s.position + 1 < s.size && peek(s, 2)=='/')
								break;
							try
							{
								auto child = new XmlNode;
								parseInto!Config(child, s, node.tag, preserveWhitespace);
								node.addChild(child);
							}
							catch (XmlParseException e)
								throw new XmlParseException("Error while processing child of "~node.tag, e);
						}
						expect(s, '<');
						expect(s, '/');
						auto word = readWord(s);
						if (word != node.tag)
						{
							auto closeMode2 = Config.nodeCloseMode(word);
							if (closeMode2 == NodeCloseMode.implicit)
							{
								// An implicitly-closed element never reads its own
								// close tag, so an optional one shows up here, among
								// this node's children. It is only valid directly
								// after the element it closes, i.e. when that element
								// is this node's most recent child.
								enforce!XmlParseException(node.children.length, "First-child close tag for implicitly-closed node </%s>".format(word));
								auto last = node.children[$-1];
								enforce!XmlParseException(last.type == XmlNodeType.Node && last.tag == word, "Non-empty implicitly-closed node <%s>".format(word));
								expect(s, '>'); // finish consuming the stray close tag
								continue;
							}
							else
								enforce!XmlParseException(word == node.tag, "Expected </%s>, not </%s>".format(node.tag, word));
						}
						expect(s, '>');
						break;
					}
				}
				else // '/'
					expect(s, '>');
			}
		}
	}
	node.endPos = s.position;
}
583 
584 private:
585 
/// Reads one `name="value"` (or `name='value'`) pair from the stream and
/// stores it, entity-decoded, in `node.attributes`.
///
/// When `Config.optionalParameterValues` is set, a name that is not
/// followed by `=` is stored with a `null` value.
///
/// Throws: `XmlParseException` if no attribute name is found, `=` is
/// missing, or the value does not start with a quote character.
void readAttribute(Config)(XmlNode node, ref StringStream s)
{
	string attrName = readWord(s);
	if (!attrName.length)
		throw new XmlParseException("Invalid attribute");
	skipWhitespace(s);

	static if (Config.optionalParameterValues)
	{
		// Value-less attribute, e.g. HTML's <input disabled>.
		const hasValue = peek(s) == '=';
		if (!hasValue)
		{
			node.attributes[attrName] = null;
			return;
		}
	}

	expect(s, '=');
	skipWhitespace(s);

	// Whichever quote opens the value must also close it.
	char quote = s.read();
	if (!(quote == '\'' || quote == '"'))
		throw new XmlParseException("Expected ' or \", not %s".format(quote));

	string rawValue = readUntil(s, quote);
	node.attributes[attrName] = decodeEntities(rawValue);
}
610 
/// Returns the `n`-th upcoming character (1 = the next one to be read)
/// without advancing the stream.
char peek(ref StringStream s, int n=1)
{
	const index = s.position + n - 1;
	return s.s[index];
}
615 
/// Advances the stream past any whitespace at the current position.
/// Stops at the first non-whitespace character or at the end of the text.
void skipWhitespace(ref StringStream s)
{
	// The explicit bound keeps the unchecked pointer read inside the text,
	// whatever lies beyond it and wherever `position` currently points.
	while (s.position < s.s.length && isWhiteChar[s.s.ptr[s.position]])
		s.position++;
}
621 
/// Character-class lookup tables indexed by character code, filled in
/// once by the module constructor below.
/// `isWhiteChar`: ASCII whitespace. `isWordChar`: characters allowed in
/// tag and attribute names (ASCII letters and digits, `-`, `_`, `:`).
__gshared bool[256] isWhiteChar, isWordChar;

shared static this()
{
	foreach (code; 0..256)
	{
		const isNamePunctuation = code=='-' || code=='_' || code==':';
		isWordChar[code] = isNamePunctuation || isAlphaNum(code);
		isWhiteChar[code] = isWhite(code);
	}
}
632 
/// Reads the longest run of word characters (see `isWordChar`) at the
/// current position, advances the stream past it, and returns it as a
/// slice of the stream's text. Returns an empty string, without
/// advancing, if the next character is not a word character.
string readWord(ref StringStream stream)
{
	immutable(char)* first = stream.s.ptr + stream.position;
	immutable(char)* limit = stream.s.ptr + stream.s.length;

	auto cursor = first;
	for (; cursor < limit; cursor++)
		if (!isWordChar[*cursor])
			break;

	auto wordLength = cursor - first;
	stream.position += wordLength;
	return first[0..wordLength];
}
644 
/// Consumes the next character and checks that it is `c`.
///
/// Throws: `XmlParseException` if the next character is different, or
/// if the stream has no characters left.
void expect(ref StringStream s, char c)
{
	// Report truncated input as a parse error rather than indexing past the end.
	enforce!XmlParseException(s.position < s.size, "Expected " ~ c ~ ", got end of input");
	char c2;
	c2 = s.read();
	enforce!XmlParseException(c==c2, "Expected " ~ c ~ ", got " ~ c2);
}
651 
/// Reads characters up to (not including) the next occurrence of
/// `until`, advances the stream past that delimiter, and returns the
/// characters read as a slice of the stream's text.
///
/// Throws: `XmlParseException` if `until` does not occur in the rest of the text.
string readUntil(ref StringStream s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	// Bounded scan: without the limit, a missing delimiter would carry
	// the scan (and the returned slice) beyond the end of the text.
	while (p < end && *p != until) p++;
	if (p >= end)
		throw new XmlParseException("Unterminated value, expected closing " ~ until);
	auto len = p-start;
	s.position += len + 1; // also skip the delimiter
	return start[0..len];
}
661 
// Round trip: a small document must serialize back to exactly its source text.
unittest
{
	enum source =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, "I want a programming language in which I need only say what I want done," give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto document = new XmlDocument(source);
	auto serialized = document.toString();
	assert(serialized == source, serialized);
}
674 
// Whitespace handling of element content, with whitespace preservation
// switched globally off and on through the parse configuration.
unittest
{
	// Parses <tag>content</tag> and returns the text of its first child
	// (null when there are no children).
	string firstChildText(bool preserve)(string content)
	{
		static struct ParseConfig
		{
		static:
			NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
			bool preserveWhitespace(string tag) { return preserve; }
			enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
		}
		auto stream = StringStream("<tag>" ~ content ~ "</tag>");
		auto parsed = new XmlNode;
		parseInto!ParseConfig(parsed, stream, null);
		// import std.stdio; writeln(preserve, ": ", stream.s, " -> ", parsed.toString);
		if (parsed.children.length)
			return parsed.children[0].tag;
		return null;
	}

	foreach (tag; ["a", " a", "a ", " a ", " a  a ", " ", ""])
	{
		string stripped = firstChildText!false(tag);
		assert(stripped == strip(tag),
			"Parsing <tag>" ~ tag ~ "</tag> while not preserving whitespace, expecting '" ~ strip(tag) ~ "', got '" ~ stripped ~ "'");

		string verbatim = firstChildText!true(tag);
		assert(verbatim == tag,
			"Parsing <tag>" ~ tag ~ "</tag> while preserving whitespace, expecting '" ~ tag ~ "', got '" ~ verbatim ~ "'");
	}
}
701 
// Whitespace preservation requested for an element ("a") must also apply
// to text inside its descendants ("b").
unittest
{
	static struct ParseConfig
	{
	static:
		NodeCloseMode nodeCloseMode(string tag) { return XmlParseConfig.nodeCloseMode(tag); }
		bool preserveWhitespace(string tag) { return tag == "a"; }
		enum optionalParameterValues = XmlParseConfig.optionalParameterValues;
	}
	auto stream = StringStream("<a><b> foo </b></a>");
	auto outer = new XmlNode;
	parseInto!ParseConfig(outer, stream, null);

	auto inner = outer.children[0];
	assert(inner.children[0].tag == " foo ");
}