/**
 * SAX-like XML parser
 * WORK IN PROGRESS.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 */

module ae.utils.xmlparser;

import std.exception;
import std.functional;
import std.range;
import std.string;
import std.traits;

import ae.utils.range;

/// Does not allocate (except for exceptions).
/// No XML nesting state.
/// Does not check for premature stream end, paired tags, etc.
///
/// INPUT is an input range which needs to support the following
/// additional properties:
///   .ptr - returns a type usable with ptrSlice (used to save
///          the position in INPUT, then later take a slice
///          from that position until an end position).
/// WARNING: Using a narrow D string type for INPUT will result
/// in wasteful UTF decoding (due to std.array.front returning a
/// dchar).
///
/// OUTPUT accepts strings with the XML entities still encoded,
/// to allow for lazy decoding.

// TODO: namespaces, CDATA

struct XmlParser(INPUT, OUTPUT)
{
	INPUT input;
	OUTPUT output;

	alias typeof(input.front) C;     /// Element type of the input range.
	alias std.traits.Unqual!C U;     /// Mutable version of C, for local buffers.

	/// Consumes `input` until it is empty, reporting what is found to
	/// `output`: startDocument, text, directive,
	/// startProcessingInstruction / endProcessingInstruction, startTag,
	/// attribute, endAttributes, endAttributesAndTag, endTag and endDocument.
	/// Comments are skipped without being reported.
	/// Whitespace between nodes is skipped, so text nodes never start
	/// with whitespace.
	/// Throws an exception (via `enforce`) when an expected character is
	/// not found.
	void run()
	{
		output.startDocument();
		skipWhitespace();

		while (!input.empty)
		{
			if (input.front != '<') // text node
				output.text(xmlString(readWhile!q{c != '<'}()));
			else
			{
				input.popFront(); // '<'
				if (input.front == '!')
				{
					input.popFront();
					if (input.front == '-') // comment
					{
						input.popFront();
						expect('-');
						// Slide a three-character window over the input
						// until it contains the terminator "-->".
						U c0, c1, c2;
						do
						{
							c0 = c1; c1 = c2; c2 = input.front;
							input.popFront();
						} while (c0 != '-' || c1 != '-' || c2 != '>');
					}
					else // doctype, etc.
					{
						output.directive(xmlString(readWhile!q{c != '>'}()));
						// readWhile stops either at '>' or at the end of the
						// input. Consume the '>' so that it is not reported
						// as the beginning of a text node.
						if (!input.empty)
							input.popFront(); // '>'
					}
				}
				else
				if (input.front == '?')
				{
					input.popFront();
					output.startProcessingInstruction(readWord());
					while (!input.empty)
					{
						skipWhitespace();
						if (input.front == '?')
							break;
						readAttribute();
					}
					input.popFront(); // '?'
					expect('>');
					output.endProcessingInstruction();
				}
				else
				if (input.front == '/')
				{
					input.popFront();
					output.endTag(readWord());
					expect('>');
				}
				else
				{
					output.startTag(readWord());
					while (!input.empty)
					{
						skipWhitespace();
						if (input.front == '>' || input.front == '/')
							break;
						readAttribute();
					}
					// NOTE(review): a self-closing tag triggers both
					// endAttributes and endAttributesAndTag - confirm that
					// this is what OUTPUT implementations expect.
					output.endAttributes();
					if (input.front == '/')
					{
						input.popFront();
						output.endAttributesAndTag();
						expect('>');
					}
					else
						input.popFront(); // '>'
				}
			}
			skipWhitespace();
		}

		output.endDocument();
	}

private:
	/// Reads one `name = "value"` (or `name = 'value'`) pair and reports it
	/// through `output.attribute`. The value is passed on without the
	/// surrounding quotes and with its XML entities still encoded.
	void readAttribute()
	{
		auto name = readWord();
		skipWhitespace();
		expect('=');
		skipWhitespace();
		auto delim = input.front;
		enforce(delim == '\'' || delim == '"', format("Bad attribute delimiter. Expected ' or \", got %s", delim));
		// Step over the opening quote; otherwise the scan below would stop
		// immediately and every attribute value would come out empty.
		input.popFront();
		auto value = delim == '"' ? readWhile!q{c != '"'}() : readWhile!q{c != '\''}();
		// Consume the closing quote so that parsing resumes after the value.
		expect(delim);
		output.attribute(name, xmlString(value));
	}

	/// Checks that the next input character is `c` and consumes it;
	/// throws otherwise.
	void expect(C c)
	{
		enforce(input.front == c, format("Expected %s, got %s", c, input.front));
		input.popFront();
	}

	/// Consumes input for as long as COND holds for the front character,
	/// and returns the consumed span as a slice (see ptrSlice).
	auto readWhile(alias COND)()
	{
		auto start = input.ptr;
		skipWhile!COND();
		return ptrSlice(start, input.ptr);
	}

	/// Consumes input for as long as COND holds for the front character.
	/// COND is either a predicate or a string expression in terms of `c`.
	void skipWhile(alias COND)()
	{
		alias unaryFun!(COND, "c") cond;
		while (!input.empty && cond(input.front))
			input.popFront();
	}

	alias skipWhile!xmlIsWhite skipWhitespace;
	alias readWhile!xmlIsWord  readWord;
}

/// The type of a slice (using ptrSlice) of an input range used in XmlParser
template SliceType(INPUT)
{
	alias typeof(ptrSlice(INPUT.init.ptr, INPUT.init.ptr)) SliceType;
}

unittest
{
	// Just test compilation with a dummy receiver
	static struct DummyOutput
	{
		void opDispatch(string S, T...)(T args) { }
	}

	// Note: don't use string! This will do UTF-8 decoding.
	XmlParser!(string, DummyOutput) stringParser;

	// An example with more sensible performance
	XmlParser!(FastArrayRange!(immutable(char)), DummyOutput) fastParser;
}

// ***************************************************************************

/// Represents a string (slice of XmlParser input stream) which still contains
/// encoded XML entities.
struct XmlString(S)
{
	/// The wrapped text, exactly as given (entities are not decoded).
	S encoded;
}

/// Convenience constructor: wraps `s` in an XmlString of the matching type.
XmlString!S xmlString(S)(S s)
{
	return typeof(return)(s);
}

/+
import std.traits;

static import ae.utils.xmllite;

XmlString!S getXmlEncodedString(S)(S s)
	if (isSomeString!S)
{
	XmlString!S xmls;
	xmls.encoded = ae.utils.xmllite.encodeEntities(s);
	return xmls;
}

X getXmlEncodedString(X)(X x)
	if (is(X S : XmlString!S))
{
	return x;
}

auto getXmlDecodedString(X)(X x)
	if (is(X S : XmlString!S))
{
	return ae.utils.xmllite.decodeEntities(x.encoded);
}

S getXmlDecodedString(S)(S s)
	if (isSomeString!S)
{
	return s;
}

unittest
{
	auto s0 = "<";
	auto s1 = s0.getXmlDecodedString();
	assert(s0 is s1);
	auto x0 = s0.getXmlEncodedString();
	assert(x0.encoded == "&lt;");
	auto x1 = x0.getXmlEncodedString();
	assert(x0.encoded is x1.encoded);
	auto s2 = x0.getXmlDecodedString();
	assert(s0 == s2);
}
+/

// ***************************************************************************

/// Generate a fast table lookup function, which compiles to a single lookup
/// for small index types and an additional check + default value for larger
/// index types.
private template fastLookup(alias TABLE, bool DEFAULT)
{
	/// Returns TABLE[c] when c is a valid index into TABLE, and DEFAULT
	/// otherwise.
	bool fastLookup(C)(C c) @trusted pure nothrow
	{
		import std.traits : isSigned;

		// The range check is compiled in only when a value of type C can
		// fall outside the table: either C.max reaches past the last index,
		// or C is signed (a negative value becomes a huge index once cast
		// to size_t, which the comparison below then rejects).
		static if (isSigned!C || cast(size_t)C.max >= TABLE.length)
		{
			if (cast(size_t)c >= TABLE.length)
				return DEFAULT;
		}
		// Unchecked access: the index is known to be in range here.
		return TABLE.ptr[cast(size_t)c];
	}
}

/// Table-driven test for whitespace; characters beyond the table are not whitespace.
alias fastLookup!(xmlWhiteChars, false) xmlIsWhite;
/// Table-driven test for "word" characters; characters beyond the table count as word characters.
alias fastLookup!(xmlWordChars , true ) xmlIsWord ;

/// Builds a 256-entry lookup table by evaluating the expression COND
/// (written in terms of `c`, with std.ascii in scope) for every index.
bool[256] genTable(string COND)()
{
	import std.ascii;
	bool[256] table;
	foreach (uint c, ref b; table) b = mixin(COND);
	return table;
}

immutable bool[256] xmlWhiteChars = genTable!q{isWhite   (c)                                  }();
immutable bool[256] xmlWordChars  = genTable!q{isAlphaNum(c) || c=='-' || c=='_' || c==':'}();

unittest
{
	assert( xmlIsWhite(' '));
	assert(!xmlIsWhite('a'));
	assert(!xmlIsWhite('я'));
	assert(!xmlIsWord (' '));
	assert( xmlIsWord ('a'));
	assert( xmlIsWord ('я'));

	// Negative values of signed types must yield the default
	// instead of indexing outside the table.
	assert(!xmlIsWhite(cast(byte)-1));
	assert( xmlIsWord (cast(byte)-1));
	assert(!xmlIsWhite(-5));
	assert( xmlIsWord (-5));
}