1 /**
2  * SAX-like XML parser
3  * WORK IN PROGRESS.
4  *
5  * License:
6  *   This Source Code Form is subject to the terms of
7  *   the Mozilla Public License, v. 2.0. If a copy of
8  *   the MPL was not distributed with this file, You
9  *   can obtain one at http://mozilla.org/MPL/2.0/.
10  *
11  * Authors:
12  *   Vladimir Panteleev <ae@cy.md>
13  */
14 
15 deprecated module ae.utils.xmlparser;
16 deprecated:
17 
18 import std.exception;
19 import std.functional;
20 import std.range;
21 import std.string;
22 import std.traits;
23 
24 import ae.utils.range;
25 
26 /// Does not allocate (except for exceptions).
27 /// No XML nesting state.
28 /// Does not check for premature stream end, paired tags, etc.
29 ///
30 /// INPUT is an input range which needs to support the following
31 /// additional properties:
32 ///   .ptr - returns a type usable with ptrSlice (used to save
33 ///          the position in INPUT, then later take a slice
34 ///          from that position until an end position).
35 /// WARNING: Using a narrow D string type for INPUT will result
36 /// in wasteful UTF decoding (due to std.array.front returning a
37 /// dchar).
38 ///
39 /// OUTPUT accepts strings with the XML entities still encoded,
40 /// to allow for lazy decoding.
41 
42 // TODO: namespaces, CDATA
43 
struct XmlParser(INPUT, OUTPUT)
{
	/// Character source. Must be an input range which additionally
	/// exposes `.ptr`, a position usable with `ptrSlice`.
	INPUT input;

	/// Event receiver. `run` calls the following methods on it:
	/// startDocument, endDocument, text, directive,
	/// startProcessingInstruction, endProcessingInstruction,
	/// startTag, attribute, endAttributes, endAttributesAndTag, endTag.
	OUTPUT output;

	alias typeof(input.front) C; /// Element type of `input`.
	alias std.traits.Unqual!C U; /// Unqualified `C`, for mutable locals.

	/// Consume `input` until it is empty, reporting every construct
	/// encountered to `output`.
	/// Throws: an exception (via `enforce`) when a specific character
	/// was expected but another one was found.
	void run()
	{
		output.startDocument();
		skipWhitespace();

		while (!input.empty)
		{
			if (input.front != '<')  // text node
				output.text(xmlString(readWhile!q{c != '<'}()));
			else
			{
				input.popFront(); // '<'
				if (input.front=='!')
				{
					input.popFront();
					if (input.front == '-') // comment
					{
						input.popFront();
						expect('-');
						// Slide a three-character window over the
						// input until it holds the "-->" terminator.
						U c0, c1, c2;
						do
						{
							c0=c1; c1=c2; c2=input.front;
							input.popFront();
						} while (c0 != '-' || c1 != '-' || c2 != '>');
					}
					else // doctype, etc.
					{
						output.directive(xmlString(readWhile!q{c != '>'}()));
						// Consume the terminating '>' so that it is not
						// reported as the start of a text node. The guard
						// keeps a directive cut off by the end of input
						// from reading past the end.
						if (!input.empty)
							input.popFront();
					}
				}
				else
				if (input.front=='?')
				{
					input.popFront();
					output.startProcessingInstruction(readWord());
					while (!input.empty)
					{
						skipWhitespace();
						if (input.front=='?')
							break;
						readAttribute();
					}
					input.popFront(); // '?'
					expect('>');
					output.endProcessingInstruction();
				}
				else
				if (input.front=='/')
				{
					input.popFront();
					output.endTag(readWord());
					expect('>');
				}
				else
				{
					output.startTag(readWord());
					while (!input.empty)
					{
						skipWhitespace();
						if (input.front=='>' || input.front=='/')
							break;
						readAttribute();
					}
					// NOTE(review): for a self-closing tag both
					// endAttributes and endAttributesAndTag are fired;
					// confirm that receivers expect both events.
					output.endAttributes();
					if (input.front == '/')
					{
						input.popFront();
						output.endAttributesAndTag();
						expect('>');
					}
					else
						input.popFront(); // '>'
				}
			}
			skipWhitespace();
		}

		output.endDocument();
	}

private:
	/// Parse one `name = "value"` (or `name = 'value'`) pair and report
	/// it through `output.attribute`. The value is passed on with its
	/// entities still encoded.
	void readAttribute()
	{
		auto name = readWord();
		skipWhitespace();
		expect('=');
		skipWhitespace();
		auto delim = input.front;
		enforce(delim == '\'' || delim == '"', format("Bad attribute delimiter. Expected ' or \", got %s", delim));
		// Step over the opening quote; otherwise the scan below stops
		// immediately and every value comes out empty.
		input.popFront();
		auto value = delim == '"' ? readWhile!q{c != '"'}() : readWhile!q{c != '\''}();
		// Step over the matching closing quote so the caller resumes
		// at the next attribute or at the end of the tag.
		expect(delim);
		output.attribute(name, xmlString(value));
	}

	/// Verify that the next character is `c` and consume it.
	/// The message is only formatted on failure (`enforce` takes it lazily).
	void expect(C c)
	{
		enforce(input.front == c, format("Expected %s, got %s", c, input.front));
		input.popFront();
	}

	/// Advance while `COND` holds for the current character and return
	/// the slice of input which was skipped over.
	auto readWhile(alias COND)()
	{
		auto start = input.ptr;
		skipWhile!COND();
		return ptrSlice(start, input.ptr);
	}

	/// Advance while input remains and `COND` holds for the current
	/// character. `COND` is either a string expression over `c` or a
	/// callable predicate.
	void skipWhile(alias COND)()
	{
		alias unaryFun!(COND, "c") cond;
		while (!input.empty && cond(input.front))
			input.popFront();
	}

	alias skipWhile!xmlIsWhite skipWhitespace;
	alias readWhile!xmlIsWord  readWord;
}
167 
168 /// The type of a slice (using ptrSlice) of an input range used in XmlParser
template SliceType(INPUT)
{
	// The type is derived from the template parameter INPUT: it is
	// whatever ptrSlice yields for two positions taken from such a range.
	alias typeof(ptrSlice(INPUT.init.ptr, INPUT.init.ptr)) SliceType;
}
173 
unittest
{
	// Compile-only smoke test: a receiver which accepts any event
	// with any arguments and ignores it.
	static struct NullSink
	{
		void opDispatch(string event, Args...)(Args ignored) { }
	}

	// Works, but not recommended: a plain string is decoded from
	// UTF-8 on every front/popFront.
	alias SlowParser = XmlParser!(string, NullSink);
	SlowParser stringParser;

	// The sensible choice performance-wise: iterate raw code units.
	alias QuickParser = XmlParser!(FastArrayRange!(immutable(char)), NullSink);
	QuickParser fastParser;
}
188 
189 // ***************************************************************************
190 
/// Wrapper marking a string (a slice of the XmlParser input stream)
/// whose XML entities are still in their encoded form.
struct XmlString(S)
{
	/// The text, exactly as it appeared in the input.
	S encoded;
}

/// Wrap `s` in an `XmlString` of the matching type, leaving the
/// contents untouched.
XmlString!S xmlString(S)(S s)
{
	return typeof(return)(s);
}
199 
200 /+
201 import std.traits;
202 
203 static import ae.utils.xmllite;
204 
205 XmlString!S getXmlEncodedString(S)(S s)
206 	if (isSomeString!S)
207 {
208 	XmlString!S xmls;
209 	xmls.encoded = ae.utils.xmllite.encodeEntities(s);
210 	return xmls;
211 }
212 
213 X getXmlEncodedString(X)(X x)
214 	if (is(X S : XmlString!S))
215 {
216 	return x;
217 }
218 
219 auto getXmlDecodedString(X)(X x)
220 	if (is(X S : XmlString!S))
221 {
222 	return ae.utils.xmllite.decodeEntities(x.encoded);
223 }
224 
225 S getXmlDecodedString(S)(S s)
226 	if (isSomeString!S)
227 {
228 	return s;
229 }
230 
231 unittest
232 {
233 	auto s0 = "<";
234 	auto s1 = s0.getXmlDecodedString();
235 	assert(s0 is s1);
236 	auto x0 = s0.getXmlEncodedString();
237 	assert(x0.encoded == "&lt;");
238 	auto x1 = x0.getXmlEncodedString();
239 	assert(x0.encoded is x1.encoded);
240 	auto s2 = x0.getXmlDecodedString();
241 	assert(s0 == s2);
242 }
243 +/
244 
245 // ***************************************************************************
246 
247 /// Generate a fast table lookup function, which compiles to a single lookup
248 /// for small index types and an additional check + default value for larger
249 /// index types.
private template fastLookup(alias TABLE, bool DEFAULT)
{
	/// Look `c` up in `TABLE`.
	/// Returns: `TABLE[c]` when `c` is a valid index, `DEFAULT` otherwise.
	bool fastLookup(C)(C c) @trusted pure nothrow
	{
		// The range check is only compiled in when some value of C can
		// fall outside the table: either C can be negative (which turns
		// into a huge index once cast to size_t), or its largest value
		// is not a valid index. Note the >=: an index equal to
		// TABLE.length is already out of bounds.
		static if (C.min < 0 || cast(size_t)C.max >= TABLE.length)
		{
			if (cast(size_t)c >= TABLE.length)
				return DEFAULT;
		}
		// Indexing through .ptr skips the bounds check; this is safe
		// because the index was validated above, or cannot be invalid.
		return TABLE.ptr[cast(size_t)c];
	}
}
260 
/// Table-driven test against `xmlWhiteChars`; characters beyond the
/// table are reported as not being whitespace.
alias xmlIsWhite = fastLookup!(xmlWhiteChars, false);
/// Table-driven test against `xmlWordChars`; characters beyond the
/// table are reported as being word characters.
alias xmlIsWord  = fastLookup!(xmlWordChars , true );
263 
/// Build a 256-entry lookup table by evaluating the expression `COND`
/// once for every index. Inside `COND` the index is available as the
/// `uint` variable `c`, and the functions of `std.ascii` are in scope.
bool[256] genTable(string COND)()
{
	import std.ascii;
	bool[256] table;
	for (uint c = 0; c < table.length; ++c)
	{
		table[c] = mixin(COND);
	}
	return table;
}
271 
/// `true` at every index which `std.ascii.isWhite` accepts.
immutable bool[256] xmlWhiteChars = genTable!q{isWhite(c)}();
/// `true` at every index which is an ASCII letter or digit,
/// or one of `-`, `_`, `:`.
immutable bool[256] xmlWordChars = genTable!q{isAlphaNum(c) || c == '-' || c == '_' || c == ':'}();
274 
unittest
{
	// ASCII space: whitespace, and not a word character.
	assert( xmlIsWhite(' '));
	assert(!xmlIsWord (' '));

	// ASCII letter: a word character, and not whitespace.
	assert(!xmlIsWhite('a'));
	assert( xmlIsWord ('a'));

	// A character beyond the 256-entry tables takes the default of
	// each lookup: never whitespace, always a word character.
	assert(!xmlIsWhite('я'));
	assert( xmlIsWord ('я'));
}