1 /**
2  * Light read-only XML library
3  * Soon to be deprecated.
4  * See other XML modules for better implementations.
5  *
6  * License:
7  *   This Source Code Form is subject to the terms of
8  *   the Mozilla Public License, v. 2.0. If a copy of
9  *   the MPL was not distributed with this file, You
10  *   can obtain one at http://mozilla.org/MPL/2.0/.
11  *
12  * Authors:
13  *   Vladimir Panteleev <vladimir@thecybershadow.net>
14  *   Simon Arlott
15  */
16 
17 module ae.utils.xmllite;
18 
19 // TODO: better/safer handling of malformed XML
20 
21 import std.stream;
22 import std.string;
23 import std.ascii;
24 import std.exception;
25 
26 import ae.utils.xmlwriter;
27 
28 // ************************************************************************
29 
/// Minimal in-memory stand-in for std.stream.Stream. It offers only the
/// members the parser touches (read, seekCur, position, size), as plain
/// struct members.
private struct StringStream
{
	/// Text being read.
	string s;
	/// Index into `s` of the next character `read` will return.
	size_t position;

	/// Wraps a copy of `s`, positioned at its first character.
	this(string s)
	{
		// Dirty precaution: the text is copied with a few extra characters
		// (both quote kinds, '>' and NULs) placed directly after it, then
		// sliced back to its real length. Helpers in this module that scan
		// through `.ptr` without bounds checks therefore find readable
		// memory, and a stop character, just past the end.
		enum ditch = "'\">\0\0\0\0\0";
		this.s = (s ~ ditch)[0..$-ditch.length];
	}

	/// Stores the next character in `c` and advances past it.
	/// Throws: Exception if the end of the text has already been reached.
	void read(out char c)
	{
		// An explicit check (rather than relying on array bounds checking)
		// reports truncated input as an ordinary Exception, and keeps
		// working when bounds checks are compiled out.
		enforce(position < s.length, "Unexpected end of input");
		c = s[position++];
	}

	/// Moves the read position by `offset` characters (negative rewinds).
	void seekCur(sizediff_t offset) { position += offset; }

	/// Total number of characters in the text.
	@property size_t size() { return s.length; }
}
46 
47 // ************************************************************************
48 
/// Kinds of node that can appear in an XmlNode tree.
enum XmlNodeType
{
	/// Top of the tree; the type XmlDocument gives itself.
	Root,
	/// Ordinary element: `<tag ...>...</tag>` or `<tag ... />`.
	Node,
	/// Comment: `<!-- ... -->`.
	Comment,
	/// Processing instruction / XML declaration: `<?name ... ?>`.
	Meta,
	/// Any other `<! ... >` construct (doctype, etc.).
	DocType,
	/// Character data between tags.
	Text,
}
58 
/// One node of an XML tree, either produced by the parser or built by hand.
///
/// How `tag` is used depends on `type`: for Node and Meta it is the
/// element / processing-instruction name, while for Text, Comment and
/// DocType nodes it holds the node's content.
class XmlNode
{
	/// Name (Node, Meta) or content (Text, Comment, DocType) of this node.
	string tag;
	/// Attribute values by name; values read by the parser are entity-decoded.
	string[string] attributes;
	/// Child nodes, in document order.
	XmlNode[] children;
	/// Kind of node; decides how `tag` is interpreted.
	XmlNodeType type;
	/// Stream positions where parsing of this node started and finished.
	/// Only the parser sets these.
	ulong startPos, endPos;

	/// Parses a single node (with all of its descendants) from `s`,
	/// starting at the stream's current position.
	this(Stream        s) { parse(s); }
	/// ditto
	this(StringStream* s) { parse(s); }
	/// ditto
	this(string s) { this(new StringStream(s)); }

	/// Reads one node from `s` into this object. Child elements are parsed
	/// recursively by constructing further XmlNode objects from the same
	/// stream.
	/// Throws: Exception on a stray closing tag, an invalid tag or attribute,
	/// or an unexpected character; errors inside children are re-thrown
	/// wrapped with the name of the enclosing tag.
	private final void parse(S)(S s)
	{
		startPos = s.position;
		char c;
		// Skip leading whitespace; c ends up holding the first other character.
		do
			s.read(c);
		while (isWhiteChar[c]);

		if (c!='<')  // text node
		{
			type = XmlNodeType.Text;
			string text;
			while (c!='<')
			{
				// TODO: check for EOF (nothing in this loop stops at the
				// end of the input; that is left to s.read)
				text ~= c;
				s.read(c);
			}
			s.seekCur(-1); // rewind to '<'
			tag = decodeEntities(text);
			//tag = tag.strip();
		}
		else
		{
			s.read(c);
			if (c=='!')
			{
				s.read(c);
				if (c == '-') // comment
				{
					expect(s, '-');
					type = XmlNodeType.Comment;
					// Accumulate until the text ends in "-->", then cut
					// that terminator off again.
					do
					{
						s.read(c);
						tag ~= c;
					} while (tag.length<3 || tag[$-3..$] != "-->");
					tag = tag[0..$-3];
				}
				else // doctype, etc.
				{
					type = XmlNodeType.DocType;
					// Everything after "<!" up to (not including) the
					// first '>' becomes the content.
					while (c != '>')
					{
						tag ~= c;
						s.read(c);
					}
				}
			}
			else
			if (c=='?')
			{
				type = XmlNodeType.Meta;
				tag = readWord(s);
				if (tag.length==0) throw new Exception("Invalid tag");
				// Attributes up to the closing "?>".
				while (true)
				{
					skipWhitespace(s);
					if (peek(s)=='?')
						break;
					readAttribute(s);
				}
				s.read(c); // consume the '?'
				expect(s, '>');
			}
			else
			if (c=='/')
				throw new Exception("Unexpected close tag");
			else
			{
				type = XmlNodeType.Node;
				// c is already the first character of the name.
				tag = c~readWord(s);
				// Attributes up to '>' (has content) or '/' (self-closing).
				while (true)
				{
					skipWhitespace(s);
					c = peek(s);
					if (c=='>' || c=='/')
						break;
					readAttribute(s);
				}
				s.read(c);
				if (c=='>')
				{
					// Children until this element's closing "</" is seen.
					while (true)
					{
						skipWhitespace(s);
						if (peek(s)=='<' && peek(s, 2)=='/')
							break;
						try
							children ~= new XmlNode(s);
						catch (Exception e)
							throw new Exception("Error while processing child of "~tag, e);
					}
					// The closing tag must repeat the name exactly and be
					// followed immediately by '>'.
					expect(s, '<');
					expect(s, '/');
					foreach (tc; tag)
						expect(s, tc);
					expect(s, '>');
				}
				else
					expect(s, '>'); // self-closing: '/' must be followed by '>'
			}
		}
		endPos = s.position;
	}

	/// Creates an empty node of the given type without parsing anything.
	this(XmlNodeType type, string tag = null)
	{
		this.type = type;
		this.tag = tag;
	}

	/// Sets (or overwrites) an attribute. Returns this node, for chaining.
	XmlNode addAttribute(string name, string value)
	{
		attributes[name] = value;
		return this;
	}

	/// Appends a child node. Returns this node (not the child), for chaining.
	XmlNode addChild(XmlNode child)
	{
		children ~= child;
		return this;
	}

	/// Serializes this node and its descendants using an XmlWriter.
	override string toString() const
	{
		XmlWriter writer;
		writeTo(writer);
		return writer.output.get();
	}

	/// Writes this node and its descendants to `output`.
	/// Comment nodes (and any type not listed below) produce no output.
	final void writeTo(XmlWriter)(ref XmlWriter output) const
	{
		void writeChildren()
		{
			foreach (child; children)
				child.writeTo(output);
		}

		void writeAttributes()
		{
			foreach (key, value; attributes)
				output.addAttribute(key, value);
		}

		switch(type)
		{
			case XmlNodeType.Root:
				// The root has no markup of its own.
				writeChildren();
				return;
			case XmlNodeType.Node:
				output.startTagWithAttributes(tag);
				writeAttributes();
				output.endAttributes();
				writeChildren();
				output.endTag(tag);
				return;
			case XmlNodeType.Meta:
				assert(children.length == 0);
				output.startPI(tag);
				writeAttributes();
				output.endPI();
				return;
			case XmlNodeType.DocType:
				assert(children.length == 0);
				output.doctype(tag);
				return;
			case XmlNodeType.Text:
				output.text(tag);
				return;
			default:
				return;
		}
	}

	/// Text content of this node: the node's own content for Text nodes,
	/// the concatenated text of all children for Node and Root, and null
	/// for every other type.
	@property string text()
	{
		switch(type)
		{
			case XmlNodeType.Text:
				return tag;
			case XmlNodeType.Node:
			case XmlNodeType.Root:
				string childrenText;
				foreach (child; children)
					childrenText ~= child.text();
				return childrenText;
			default:
				return null;
		}
	}

	/// Returns the first direct child element named `tag`, or null if
	/// there is none.
	final XmlNode findChild(string tag)
	{
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				return child;
		return null;
	}

	/// Returns all direct child elements named `tag`, in document order.
	final XmlNode[] findChildren(string tag)
	{
		XmlNode[] result;
		foreach (child; children)
			if (child.type == XmlNodeType.Node && child.tag == tag)
				result ~= child;
		return result;
	}

	/// Like findChild, but throws an Exception instead of returning null.
	final XmlNode opIndex(string tag)
	{
		auto node = findChild(tag);
		if (node is null)
			throw new Exception("No such child: " ~ tag);
		return node;
	}

	/// Returns the `index`-th (0-based) direct child element named `tag`.
	/// Throws: Exception if there are not that many matching children.
	final XmlNode opIndex(string tag, size_t index)
	{
		auto nodes = findChildren(tag);
		if (index >= nodes.length)
			throw new Exception(format("Can't get node with tag %s and index %d, there are only %d children with that tag", tag, index, nodes.length));
		return nodes[index];
	}

	/// Returns the `index`-th child of any type.
	final XmlNode opIndex(size_t index)
	{
		return children[index];
	}

	/// Number of direct children.
	final @property size_t length() { return children.length; }

	/// foreach support: visits the direct children in order, stopping
	/// early when the loop body asks to (non-zero delegate result).
	int opApply(int delegate(ref XmlNode) dg)
	{
		int result = 0;

		// size_t counter: children.length is a size_t, so an int index
		// would mix signed and unsigned operands in the comparison.
		for (size_t i = 0; i < children.length; i++)
		{
			result = dg(children[i]);
			if (result)
				break;
		}
		return result;
	}

	/// Deep copy: type, tag and attributes are copied and every child is
	/// duplicated recursively. startPos/endPos are not carried over.
	final @property XmlNode dup()
	{
		auto result = new XmlNode(type, tag);
		result.attributes = attributes.dup;
		result.children.length = children.length;
		foreach (i, child; children)
			result.children[i] = child.dup;
		return result;
	}

private:
	/// Parses one `name = "value"` (or single-quoted) pair from `s` and
	/// stores the entity-decoded value in `attributes`.
	/// Throws: Exception if no name is found, '=' is missing, or the value
	/// is not quoted.
	final void readAttribute(S)(S s)
	{
		string name = readWord(s);
		if (name.length==0) throw new Exception("Invalid attribute");
		skipWhitespace(s);
		expect(s, '=');
		skipWhitespace(s);
		char delim;
		s.read(delim);
		if (delim != '\'' && delim != '"')
			throw new Exception("Expected ' or \"");
		// The value runs up to the matching quote character.
		string value = readUntil(s, delim);
		attributes[name] = decodeEntities(value);
	}
}
342 
/// Root of an XML tree: a node of type Root whose children are the
/// top-level nodes of the document.
class XmlDocument : XmlNode
{
	/// Creates an empty document.
	this()
	{
		super(XmlNodeType.Root, "<Root>");
	}

	/// Creates a document and parses all of `s` into it.
	this(Stream s)
	{
		this();
		parse(s);
	}

	/// ditto
	this(StringStream* s)
	{
		this();
		parse(s);
	}

	/// ditto
	this(string s)
	{
		this(new StringStream(s));
	}

	/// Parses top-level nodes from `s` until the end of the stream and
	/// appends them to `children`. Whitespace between nodes is skipped.
	/// Throws: Exception carrying the stream position, with the original
	/// failure chained to it, when a node cannot be parsed.
	final void parse(S)(S s)
	{
		skipWhitespace(s);
		while (s.position < s.size)
		{
			try
			{
				auto node = new XmlNode(s);
				children ~= node;
				skipWhitespace(s);
			}
			catch (Exception e)
			{
				throw new Exception(format("Error at %d", s.position), e);
			}
		}
	}
}
368 
/// Convenience wrapper: builds an XmlDocument from `source`, which may be
/// anything an XmlDocument constructor accepts.
XmlDocument xmlParse(T)(T source)
{
	auto document = new XmlDocument(source);
	return document;
}
370 
371 private:
372 
/// Returns the `n`-th upcoming character of `s` (1 = the very next one)
/// by reading `n` characters and then seeking back by `n`.
char peek(Stream s, int n=1)
{
	char ch;
	int remaining = n;
	while (remaining-- > 0)
		s.read(ch);
	s.seekCur(-n); // undo the reads
	return ch;
}
381 
/// Returns the `n`-th upcoming character of `s` (1 = the very next one)
/// without moving the read position.
/// Throws: Exception if that character lies beyond the end of the text.
char peek(StringStream* s, int n=1)
{
	immutable index = s.position + n - 1;
	// Report truncated input as an ordinary Exception instead of depending
	// on array bounds checking, which may be compiled out.
	enforce(index < s.s.length, "Unexpected end of input");
	return s.s[index];
}
386 
/// Advances `s` past any whitespace, leaving it positioned on the first
/// non-whitespace character, or at the end of the stream if only
/// whitespace remained.
void skipWhitespace(Stream s)
{
	for (;;)
	{
		if (s.position==s.size)
			return; // end of stream: nothing was read that needs putting back
		char ch;
		s.read(ch);
		if (!isWhiteChar[ch])
			break;
	}
	s.seekCur(-1); // put the non-whitespace character back
}
399 
/// Advances `s` past any whitespace.
void skipWhitespace(StringStream* s)
{
	// Indexes through .ptr, so there is no bounds check here: the loop
	// relies on a non-whitespace character being readable at or after the
	// end of the text (StringStream's constructor pads the text for this).
	auto text = s.s.ptr;
	auto pos = s.position;
	while (isWhiteChar[text[pos]])
		pos++;
	s.position = pos;
}
405 
/// Lookup tables indexed by character value: whether the character is
/// whitespace, and whether it may appear in a tag or attribute name.
__gshared bool[256] isWhiteChar, isWordChar;

/// Fills both lookup tables once, at program start.
shared static this()
{
	foreach (c; 0..256)
	{
		// Punctuation accepted in names in addition to letters and digits.
		immutable namePunctuation = c=='-' || c=='_' || c==':';

		isWhiteChar[c] = isWhite(c);
		isWordChar[c] = namePunctuation || isAlphaNum(c);
	}
}
416 
/// Reads the longest run of name characters (see isWordChar) from `s`.
/// The stream is left on the first character that is not part of the
/// name; the result is empty if the very next character is not one.
string readWord(Stream s)
{
	string word;
	char ch;
	for (s.read(ch); isWordChar[ch]; s.read(ch))
		word ~= ch;
	s.seekCur(-1); // put back the character that ended the word
	return word;
}
431 
/// Reads the longest run of name characters (see isWordChar) from
/// `stream`, stopping at the end of the text at the latest. The result is
/// a slice of the stream's own text, not a copy.
string readWord(StringStream* stream)
{
	auto first = stream.s.ptr + stream.position;
	auto limit = stream.s.ptr + stream.s.length;

	auto cursor = first;
	for (; cursor < limit; cursor++)
		if (!isWordChar[*cursor])
			break;

	auto count = cursor - first;
	stream.position += count;
	return first[0..count];
}
443 
/// Consumes one character from `s` and checks that it is `c`.
/// Throws: Exception naming both the expected and the actual character
/// when they differ.
void expect(S)(S s, char c)
{
	char actual;
	s.read(actual);
	if (actual != c)
		throw new Exception("Expected " ~ c ~ ", got " ~ actual);
}
450 
/// Reads characters from `s` up to the next occurrence of `until`.
/// The delimiter is consumed but not included in the result.
string readUntil(Stream s, char until)
{
	string collected;
	char ch;
	for (s.read(ch); ch != until; s.read(ch))
		collected ~= ch;
	return collected;
}
463 
/// Reads characters from `s` up to the next occurrence of `until`.
/// The delimiter is consumed but not included in the result, which is a
/// slice of the stream's own text rather than a copy.
/// Throws: Exception if `until` does not occur before the end of the text.
string readUntil(StringStream* s, char until)
{
	auto start = s.s.ptr + s.position;
	auto end = s.s.ptr + s.s.length;
	auto p = start;
	// Stop at the end of the text: an unbounded scan would run into
	// whatever memory follows it, return a slice reaching past the text,
	// and leave the position beyond the stream's size.
	while (p < end && *p != until)
		p++;
	enforce(p < end, "Unexpected end of input while looking for " ~ until);
	auto len = p-start;
	s.position += len + 1; // + 1 to step over the delimiter
	return start[0..len];
}
473 
// Round trip: parsing a document and writing it back out must reproduce
// the input exactly, for both the Stream and the string code paths.
unittest
{
	// Pieces are joined with an explicit ~; relying on adjacent string
	// literals being concatenated implicitly is no longer accepted by D.
	enum xmlText =
		`<?xml version="1.0" encoding="UTF-8"?>` ~
		`<quotes>` ~
			`<quote author="Alan Perlis">` ~
				`When someone says, &quot;I want a programming language in which I need only say what I want done,&quot; give him a lollipop.` ~
			`</quote>` ~
		`</quotes>`;
	auto doc = new XmlDocument(new MemoryStream(xmlText.dup));
	assert(doc.toString() == xmlText);
	doc = new XmlDocument(xmlText);
	assert(doc.toString() == xmlText);
}
488 
/// Character for each supported entity name (the name is the part between
/// '&' and ';').
const dchar[string] entities;
/// Reverse of `entities`: entity name for each character that has one.
/*const*/ string[dchar] entityNames;

/// Fills `entities` and derives `entityNames` from it.
static this()
{
	entities =
	[
		// The five entities predefined by XML. The [] on the first key
		// makes it a string slice instead of a fixed-size char array.
		"quot"[]: '\&quot;'  ,
		"amp"   : '\&amp;'   ,
		"lt"    : '\&lt;'    ,
		"gt"    : '\&gt;'    ,
		"apos"  : '\''       ,

		// Additional named entities.
		"circ"  : '\&circ;'  ,
		"tilde" : '\&tilde;' ,
		"nbsp"  : '\&nbsp;'  ,
		"ensp"  : '\&ensp;'  ,
		"emsp"  : '\&emsp;'  ,
		"thinsp": '\&thinsp;',
		"ndash" : '\&ndash;' ,
		"mdash" : '\&mdash;' ,
		"lsquo" : '\&lsquo;' ,
		"rsquo" : '\&rsquo;' ,
		"sbquo" : '\&sbquo;' ,
		"ldquo" : '\&ldquo;' ,
		"rdquo" : '\&rdquo;' ,
		"bdquo" : '\&bdquo;' ,
		"dagger": '\&dagger;',
		"Dagger": '\&Dagger;',
		"permil": '\&permil;',
		"laquo" : '\&laquo;' ,
		"raquo" : '\&raquo;' ,
		"lsaquo": '\&lsaquo;',
		"rsaquo": '\&rsaquo;',
		"euro"  : '\&euro;'  ,
		"copy"  : '\&copy;'  ,
		"reg"   : '\&reg;'
	];

	// Build the character -> name direction from the table above.
	foreach (entityName, character; entities)
	{
		entityNames[character] = entityName;
	}
}
528 
529 import std.utf;
530 import core.stdc.stdio;
531 
/// Escapes the five characters that are special in XML (`<`, `>`, `"`,
/// `'` and `&`) as `&lt;`, `&gt;`, `&quot;`, `&apos;` and `&amp;`.
/// Returns: the escaped text, or `str` itself (no copy) when it contains
/// none of those characters.
public string encodeEntities(string str)
{
	string result;
	size_t copied = 0; // everything before this index is already in result

	// Single forward pass: copy each untouched stretch once, instead of
	// rebuilding the whole string for every character that is replaced.
	foreach (i, char c; str)
	{
		string replacement;
		switch (c)
		{
			case '<':  replacement = "&lt;";   break;
			case '>':  replacement = "&gt;";   break;
			case '"':  replacement = "&quot;"; break;
			case '\'': replacement = "&apos;"; break;
			case '&':  replacement = "&amp;";  break;
			default:   continue; // ordinary character: next loop iteration
		}
		result ~= str[copied..i];
		result ~= replacement;
		copied = i + 1;
	}

	// copied only moves off 0 when something was replaced.
	if (copied == 0)
		return str;
	result ~= str[copied..$];
	return result;
}
540 
/// Replaces every character that has an entry in `entityNames` with the
/// corresponding `&name;` reference.
/// Returns: the escaped text, or `str` itself (no copy) when no character
/// needed replacing.
public string encodeAllEntities(string str)
{
	string result;
	size_t copied = 0; // everything before this index is already in result

	// Single forward pass: copy each untouched stretch once, instead of
	// rebuilding the whole string for every character that is replaced.
	// Iterating as dchar decodes UTF-8; i is the index of the character's
	// first code unit.
	foreach (i, dchar c; str)
	{
		auto name = c in entityNames;
		if (!name)
			continue;
		result ~= str[copied..i];
		result ~= '&';
		result ~= *name;
		result ~= ';';
		// Skip all code units of the replaced character.
		copied = i + stride(str, i);
	}

	// copied only moves off 0 when something was replaced.
	if (copied == 0)
		return str;
	result ~= str[copied..$];
	return result;
}
552 
553 import ae.utils.text;
554 import std.conv;
555 
/// Replaces entity references in `str` with the characters they stand for.
/// Handles named entities found in `entities`, decimal references
/// (`&#NNN;`) and hexadecimal references (`&#xHHH;`).
/// Returns: the decoded text, or `str` itself when splitting on '&'
/// yields at most one piece.
/// Throws: Exception when an '&' is not followed by a non-empty reference
/// ending in ';', or when a named entity is unknown.
public string decodeEntities(string str)
{
	auto pieces = str.fastSplit('&');
	if (pieces.length <= 1)
		return str;

	// Every piece after the first starts with an entity reference.
	immutable referenceCount = pieces.length - 1;
	// One UTF-8 scratch buffer per decoded character.
	auto encoded = new char[4][referenceCount];
	// Output alternates: plain text, decoded character, plain text, ...
	auto output = new string[pieces.length*2 - 1];
	output[0] = pieces[0];

	foreach (idx, piece; pieces[1..$])
	{
		auto semicolon = piece.indexOf(';');
		enforce(semicolon>0, "Invalid entity (unescaped ampersand?)");

		dchar decoded;
		if (piece[0]!='#')
		{
			// Named entity
			auto reference = piece[0..semicolon];
			auto known = reference in entities;
			enforce(known, "Unknown entity: " ~ reference);
			decoded = *known;
		}
		else
		if (piece[1]=='x')
			decoded = fromHex!uint(piece[2..semicolon]); // &#xHHH;
		else
			decoded = to!uint(piece[1..semicolon]); // &#NNN;

		auto encodedLength = std.utf.encode(encoded[idx], decoded);
		output[1+idx*2] = cast(string) encoded[idx][0..encodedLength];
		output[2+idx*2] = piece[semicolon+1..$];
	}

	return output.join();
}

/// Old name of decodeEntities, kept for existing callers.
deprecated alias decodeEntities convertEntities;
594 
// Named entities outside the basic XML five must survive a round trip
// through encodeAllEntities and decodeEntities.
unittest
{
	enum raw = "©,€";
	enum escaped = "&copy;,&euro;";

	assert(encodeAllEntities(raw) == escaped);
	assert(decodeEntities(escaped) == raw);
}