1 /**
2  * SAX-like XML parser
3  * WORK IN PROGRESS.
4  *
5  * License:
6  *   This Source Code Form is subject to the terms of
7  *   the Mozilla Public License, v. 2.0. If a copy of
8  *   the MPL was not distributed with this file, You
9  *   can obtain one at http://mozilla.org/MPL/2.0/.
10  *
11  * Authors:
12  *   Vladimir Panteleev <vladimir@thecybershadow.net>
13  */
14 
15 module ae.utils.xmlparser;
16 
17 import std.exception;
18 import std.functional;
19 import std.range;
20 import std.string;
21 import std.traits;
22 
23 import ae.utils.range;
24 
25 /// Does not allocate (except for exceptions).
26 /// No XML nesting state.
27 /// Does not check for premature stream end, paired tags, etc.
28 ///
29 /// INPUT is an input range which needs to support the following
30 /// additional properties:
31 ///   .ptr - returns a type usable with ptrSlice (used to save
32 ///          the position in INPUT, then later take a slice
33 ///          from that position until an end position).
34 /// WARNING: Using a narrow D string type for INPUT will result
35 /// in wasteful UTF decoding (due to std.array.front returning a
36 /// dchar).
37 ///
38 /// OUTPUT accepts strings with the XML entities still encoded,
39 /// to allow for lazy decoding.
40 
41 // TODO: namespaces, CDATA
42 
/// Streaming (SAX-like) XML tokenizer.
///
/// Reads characters from `input` and reports what it finds by calling
/// methods on `output`: startDocument, text, directive,
/// startProcessingInstruction, attribute, endProcessingInstruction,
/// startTag, endAttributes, endAttributesAndTag, endTag and endDocument.
/// Text, directive and attribute values are passed wrapped by xmlString,
/// i.e. with XML entities still encoded.
///
/// No nesting state is kept and premature end of input is not checked for.
struct XmlParser(INPUT, OUTPUT)
{
	/// Character source. Must be an input range which also provides `.ptr`.
	INPUT input;
	/// Receiver of the parse events.
	OUTPUT output;

	alias typeof(input.front) C;  /// Element type of INPUT, as returned by front.
	alias std.traits.Unqual!C U;  /// C with type qualifiers removed (mutable scratch type).

	/// Parse the entire input, emitting events to `output`.
	/// Throws: Exception (via enforce) when an expected character is missing.
	void run()
	{
		output.startDocument();
		skipWhitespace();

		while (!input.empty)
		{
			if (input.front != '<')  // text node
				output.text(xmlString(readWhile!q{c != '<'}()));
			else
			{
				input.popFront(); // '<'
				if (input.front=='!')
				{
					input.popFront();
					if (input.front == '-') // comment
					{
						input.popFront();
						expect('-');
						// Slide a three-character window over the input
						// until it holds the comment terminator "-->".
						U c0, c1, c2;
						do
						{
							c0=c1; c1=c2; c2=input.front;
							input.popFront();
						} while (c0 != '-' || c1 != '-' || c2 != '>');
					}
					else // doctype, etc.
					{
						output.directive(xmlString(readWhile!q{c != '>'}()));
						// Consume the closing '>' so that it does not leak
						// into the following text node.
						if (!input.empty)
							input.popFront();
					}
				}
				else
				if (input.front=='?')
				{
					input.popFront();
					output.startProcessingInstruction(readWord());
					while (!input.empty)
					{
						skipWhitespace();
						if (input.front=='?')
							break;
						readAttribute();
					}
					input.popFront(); // '?'
					expect('>');
					output.endProcessingInstruction();
				}
				else
				if (input.front=='/')
				{
					input.popFront();
					output.endTag(readWord());
					expect('>');
				}
				else
				{
					output.startTag(readWord());
					while (!input.empty)
					{
						skipWhitespace();
						if (input.front=='>' || input.front=='/')
							break;
						readAttribute();
					}
					// NOTE(review): for a self-closing tag both endAttributes
					// and endAttributesAndTag are emitted, in this order -
					// confirm this is what OUTPUT implementations expect.
					output.endAttributes();
					if (input.front == '/')
					{
						input.popFront();
						output.endAttributesAndTag();
						expect('>');
					}
					else
						input.popFront(); // '>'
				}
			}
			skipWhitespace();
		}

		output.endDocument();
	}

private:
	/// Read one `name = "value"` (or single-quoted) pair and report it
	/// via output.attribute. The value is the text strictly between the
	/// quotes, still entity-encoded.
	void readAttribute()
	{
		auto name = readWord();
		skipWhitespace();
		expect('=');
		skipWhitespace();
		auto delim = input.front;
		enforce(delim == '\'' || delim == '"', format("Bad attribute delimiter. Expected ' or \", got %s", delim));
		input.popFront(); // opening delimiter
		auto value = delim == '"' ? readWhile!q{c != '"'}() : readWhile!q{c != '\''}();
		expect(delim); // closing delimiter
		output.attribute(name, xmlString(value));
	}

	/// Require the next character to be `c` and consume it.
	/// Throws: Exception if the next character differs.
	void expect(C c)
	{
		enforce(input.front == c, format("Expected %s, got %s", c, input.front));
		input.popFront();
	}

	/// Advance while COND holds and return the slice of input that was
	/// skipped over (built with ptrSlice from the saved start position).
	auto readWhile(alias COND)()
	{
		auto start = input.ptr;
		skipWhile!COND();
		return ptrSlice(start, input.ptr);
	}

	/// Advance while input is not empty and COND (a predicate over the
	/// character, named `c` when given as a string) holds for the front.
	void skipWhile(alias COND)()
	{
		alias unaryFun!(COND, "c") cond;
		while (!input.empty && cond(input.front))
			input.popFront();
	}

	alias skipWhile!xmlIsWhite skipWhitespace;  /// Skip a run of whitespace characters.
	alias readWhile!xmlIsWord  readWord;        /// Read a run of word (name) characters.
}
166 
/// The type of a slice (using ptrSlice) of an input range used in XmlParser.
/// Params:
///   INPUT = the input range type; must provide a `.ptr` property whose
///           values are accepted by ptrSlice.
template SliceType(INPUT)
{
	// Uses the template parameter INPUT; the previous body referred to an
	// undefined symbol `T`, so the template could never be instantiated.
	alias typeof(ptrSlice(INPUT.init.ptr, INPUT.init.ptr)) SliceType;
}
172 
unittest
{
	// Compilation smoke test only: the receiver below accepts any event
	// with any arguments and does nothing with it.
	static struct NullReceiver
	{
		void opDispatch(string EVENT, ARGS...)(ARGS args) { }
	}

	// A plain string is accepted as INPUT, but should be avoided in real
	// use because it will do UTF-8 decoding.
	XmlParser!(string, NullReceiver) decodingParser;

	// An input type with more sensible performance.
	XmlParser!(FastArrayRange!(immutable(char)), NullReceiver) codeUnitParser;
}
187 
188 // ***************************************************************************
189 
/// Wrapper marking a string (a slice of the XmlParser input stream) whose
/// XML entities have not been decoded yet.
struct XmlString(S)
{
	/// The text exactly as it appeared in the input, entities included.
	S encoded;
}

/// Wrap `s` in an XmlString of the matching type, without copying or
/// decoding it.
XmlString!S xmlString(S)(S s)
{
	return typeof(return)(s);
}
198 
199 /+
200 import std.traits;
201 
202 static import ae.utils.xmllite;
203 
204 XmlString!S getXmlEncodedString(S)(S s)
205 	if (isSomeString!S)
206 {
207 	XmlString!S xmls;
208 	xmls.encoded = ae.utils.xmllite.encodeEntities(s);
209 	return xmls;
210 }
211 
212 X getXmlEncodedString(X)(X x)
213 	if (is(X S : XmlString!S))
214 {
215 	return x;
216 }
217 
218 auto getXmlDecodedString(X)(X x)
219 	if (is(X S : XmlString!S))
220 {
221 	return ae.utils.xmllite.decodeEntities(x.encoded);
222 }
223 
224 S getXmlDecodedString(S)(S s)
225 	if (isSomeString!S)
226 {
227 	return s;
228 }
229 
230 unittest
231 {
232 	auto s0 = "<";
233 	auto s1 = s0.getXmlDecodedString();
234 	assert(s0 is s1);
235 	auto x0 = s0.getXmlEncodedString();
236 	assert(x0.encoded == "&lt;");
237 	auto x1 = x0.getXmlEncodedString();
238 	assert(x0.encoded is x1.encoded);
239 	auto s2 = x0.getXmlDecodedString();
240 	assert(s0 == s2);
241 }
242 +/
243 
244 // ***************************************************************************
245 
/// Generate a fast table lookup function, which compiles to a single lookup
/// for index types whose every value is a valid index into TABLE, and to an
/// additional range check + default value for all other index types.
/// Params:
///   TABLE   = array of bool indexed by character code.
///   DEFAULT = result for any character that falls outside TABLE.
private template fastLookup(alias TABLE, bool DEFAULT)
{
	bool fastLookup(C)(C c) @trusted pure nothrow
	{
		// The range check is required whenever C can hold a value that is
		// not a valid index: either its maximum reaches TABLE.length
		// (hence >=, not >), or it is signed and may be negative (a negative
		// value wraps to a huge size_t, which the check below rejects).
		static if (cast(size_t)C.max >= TABLE.length || C.min < 0)
		{
			if (cast(size_t)c >= TABLE.length)
				return DEFAULT;
		}
		// Unchecked access through .ptr is safe: the index is in range,
		// either by the type's value range or by the check above.
		return TABLE.ptr[cast(size_t)c];
	}
}
259 
/// Character class predicates backed by lookup tables.
/// xmlIsWhite looks the character up in xmlWhiteChars and yields false for
/// characters beyond the table; xmlIsWord looks it up in xmlWordChars and
/// yields true for characters beyond the table.
alias fastLookup!(xmlWhiteChars, false) xmlIsWhite;
alias fastLookup!(xmlWordChars , true ) xmlIsWord ; /// ditto
262 
/// Build a 256-entry boolean table by evaluating COND once per index.
/// Params:
///   COND = D expression (as a string) that is mixed in with the current
///          index available as `c`; names from std.ascii are in scope.
/// Returns: the table, where entry i holds the value of COND for c == i.
bool[256] genTable(string COND)()
{
	import std.ascii;

	bool[256] flags;
	foreach (uint c, ref flag; flags)
	{
		// `c` must keep this name: the mixed-in expression refers to it.
		flag = mixin(COND);
	}
	return flags;
}
270 
/// Lookup tables indexed by character code (0..255), built by genTable.
/// xmlWhiteChars marks the codes for which isWhite holds; xmlWordChars marks
/// the codes for which isAlphaNum holds, plus '-', '_' and ':'.
immutable bool[256] xmlWhiteChars = genTable!q{isWhite   (c)                              }();
immutable bool[256] xmlWordChars  = genTable!q{isAlphaNum(c) || c=='-' || c=='_' || c==':'}(); /// ditto
273 
unittest
{
	// A space is whitespace and not a word character.
	assert( xmlIsWhite(' '));
	assert(!xmlIsWord (' '));

	// An ASCII letter is a word character and not whitespace.
	assert( xmlIsWord ('a'));
	assert(!xmlIsWhite('a'));

	// A character beyond the 256-entry tables gets each predicate's default.
	assert( xmlIsWord ('я'));
	assert(!xmlIsWhite('я'));
}