1 /**
2  * XML/HTML entity encoding/decoding.
3  *
4  * License:
5  *   This Source Code Form is subject to the terms of
6  *   the Mozilla Public License, v. 2.0. If a copy of
7  *   the MPL was not distributed with this file, You
8  *   can obtain one at http://mozilla.org/MPL/2.0/.
9  *
10  * Authors:
11  *   Vladimir Panteleev <vladimir@thecybershadow.net>
12  *   Simon Arlott
13  */
14 
15 module ae.utils.xml.entities;
16 
17 import ae.utils.xml.common;
18 
/// Maps an entity name (without the surrounding `&` and `;`) to the
/// character it denotes. Filled once by the `shared static this()` below.
const dchar[string] entities;

/// Reverse of `entities`: maps a character to its entity name.
///
/// This variable is mutable and therefore thread-local, so every thread
/// needs its own populated copy. The main thread's copy is filled by
/// `shared static this()`; every other thread's copy is filled by
/// `static this()`.
/*const*/ string[dchar] entityNames;

/// Runs once at program start-up: builds the name -> character table and
/// the main thread's character -> name table.
shared static this()
{
	entities =
	[
		"quot" : '\&quot;',
		"amp" : '\&amp;',
		"lt" : '\&lt;',
		"gt" : '\&gt;',

		"OElig" : '\&OElig;',
		"oelig" : '\&oelig;',
		"Scaron" : '\&Scaron;',
		"scaron" : '\&scaron;',
		"Yuml" : '\&Yuml;',
		"circ" : '\&circ;',
		"tilde" : '\&tilde;',
		"ensp" : '\&ensp;',
		"emsp" : '\&emsp;',
		"thinsp" : '\&thinsp;',
		"zwnj" : '\&zwnj;',
		"zwj" : '\&zwj;',
		"lrm" : '\&lrm;',
		"rlm" : '\&rlm;',
		"ndash" : '\&ndash;',
		"mdash" : '\&mdash;',
		"lsquo" : '\&lsquo;',
		"rsquo" : '\&rsquo;',
		"sbquo" : '\&sbquo;',
		"ldquo" : '\&ldquo;',
		"rdquo" : '\&rdquo;',
		"bdquo" : '\&bdquo;',
		"dagger" : '\&dagger;',
		"Dagger" : '\&Dagger;',
		"permil" : '\&permil;',
		"lsaquo" : '\&lsaquo;',
		"rsaquo" : '\&rsaquo;',
		"euro" : '\&euro;',

		"nbsp" : '\&nbsp;',
		"iexcl" : '\&iexcl;',
		"cent" : '\&cent;',
		"pound" : '\&pound;',
		"curren" : '\&curren;',
		"yen" : '\&yen;',
		"brvbar" : '\&brvbar;',
		"sect" : '\&sect;',
		"uml" : '\&uml;',
		"copy" : '\&copy;',
		"ordf" : '\&ordf;',
		"laquo" : '\&laquo;',
		"not" : '\&not;',
		"shy" : '\&shy;',
		"reg" : '\&reg;',
		"macr" : '\&macr;',
		"deg" : '\&deg;',
		"plusmn" : '\&plusmn;',
		"sup2" : '\&sup2;',
		"sup3" : '\&sup3;',
		"acute" : '\&acute;',
		"micro" : '\&micro;',
		"para" : '\&para;',
		"middot" : '\&middot;',
		"cedil" : '\&cedil;',
		"sup1" : '\&sup1;',
		"ordm" : '\&ordm;',
		"raquo" : '\&raquo;',
		"frac14" : '\&frac14;',
		"frac12" : '\&frac12;',
		"frac34" : '\&frac34;',
		"iquest" : '\&iquest;',
		"Agrave" : '\&Agrave;',
		"Aacute" : '\&Aacute;',
		"Acirc" : '\&Acirc;',
		"Atilde" : '\&Atilde;',
		"Auml" : '\&Auml;',
		"Aring" : '\&Aring;',
		"AElig" : '\&AElig;',
		"Ccedil" : '\&Ccedil;',
		"Egrave" : '\&Egrave;',
		"Eacute" : '\&Eacute;',
		"Ecirc" : '\&Ecirc;',
		"Euml" : '\&Euml;',
		"Igrave" : '\&Igrave;',
		"Iacute" : '\&Iacute;',
		"Icirc" : '\&Icirc;',
		"Iuml" : '\&Iuml;',
		"ETH" : '\&ETH;',
		"Ntilde" : '\&Ntilde;',
		"Ograve" : '\&Ograve;',
		"Oacute" : '\&Oacute;',
		"Ocirc" : '\&Ocirc;',
		"Otilde" : '\&Otilde;',
		"Ouml" : '\&Ouml;',
		"times" : '\&times;',
		"Oslash" : '\&Oslash;',
		"Ugrave" : '\&Ugrave;',
		"Uacute" : '\&Uacute;',
		"Ucirc" : '\&Ucirc;',
		"Uuml" : '\&Uuml;',
		"Yacute" : '\&Yacute;',
		"THORN" : '\&THORN;',
		"szlig" : '\&szlig;',
		"agrave" : '\&agrave;',
		"aacute" : '\&aacute;',
		"acirc" : '\&acirc;',
		"atilde" : '\&atilde;',
		"auml" : '\&auml;',
		"aring" : '\&aring;',
		"aelig" : '\&aelig;',
		"ccedil" : '\&ccedil;',
		"egrave" : '\&egrave;',
		"eacute" : '\&eacute;',
		"ecirc" : '\&ecirc;',
		"euml" : '\&euml;',
		"igrave" : '\&igrave;',
		"iacute" : '\&iacute;',
		"icirc" : '\&icirc;',
		"iuml" : '\&iuml;',
		"eth" : '\&eth;',
		"ntilde" : '\&ntilde;',
		"ograve" : '\&ograve;',
		"oacute" : '\&oacute;',
		"ocirc" : '\&ocirc;',
		"otilde" : '\&otilde;',
		"ouml" : '\&ouml;',
		"divide" : '\&divide;',
		"oslash" : '\&oslash;',
		"ugrave" : '\&ugrave;',
		"uacute" : '\&uacute;',
		"ucirc" : '\&ucirc;',
		"uuml" : '\&uuml;',
		"yacute" : '\&yacute;',
		"thorn" : '\&thorn;',
		"yuml" : '\&yuml;',

		"fnof" : '\&fnof;',
		"Alpha" : '\&Alpha;',
		"Beta" : '\&Beta;',
		"Gamma" : '\&Gamma;',
		"Delta" : '\&Delta;',
		"Epsilon" : '\&Epsilon;',
		"Zeta" : '\&Zeta;',
		"Eta" : '\&Eta;',
		"Theta" : '\&Theta;',
		"Iota" : '\&Iota;',
		"Kappa" : '\&Kappa;',
		"Lambda" : '\&Lambda;',
		"Mu" : '\&Mu;',
		"Nu" : '\&Nu;',
		"Xi" : '\&Xi;',
		"Omicron" : '\&Omicron;',
		"Pi" : '\&Pi;',
		"Rho" : '\&Rho;',
		"Sigma" : '\&Sigma;',
		"Tau" : '\&Tau;',
		"Upsilon" : '\&Upsilon;',
		"Phi" : '\&Phi;',
		"Chi" : '\&Chi;',
		"Psi" : '\&Psi;',
		"Omega" : '\&Omega;',
		"alpha" : '\&alpha;',
		"beta" : '\&beta;',
		"gamma" : '\&gamma;',
		"delta" : '\&delta;',
		"epsilon" : '\&epsilon;',
		"zeta" : '\&zeta;',
		"eta" : '\&eta;',
		"theta" : '\&theta;',
		"iota" : '\&iota;',
		"kappa" : '\&kappa;',
		"lambda" : '\&lambda;',
		"mu" : '\&mu;',
		"nu" : '\&nu;',
		"xi" : '\&xi;',
		"omicron" : '\&omicron;',
		"pi" : '\&pi;',
		"rho" : '\&rho;',
		"sigmaf" : '\&sigmaf;',
		"sigma" : '\&sigma;',
		"tau" : '\&tau;',
		"upsilon" : '\&upsilon;',
		"phi" : '\&phi;',
		"chi" : '\&chi;',
		"psi" : '\&psi;',
		"omega" : '\&omega;',
		"thetasym" : '\&thetasym;',
		"upsih" : '\&upsih;',
		"piv" : '\&piv;',
		"bull" : '\&bull;',
		"hellip" : '\&hellip;',
		"prime" : '\&prime;',
		"Prime" : '\&Prime;',
		"oline" : '\&oline;',
		"frasl" : '\&frasl;',
		"weierp" : '\&weierp;',
		"image" : '\&image;',
		"real" : '\&real;',
		"trade" : '\&trade;',
		"alefsym" : '\&alefsym;',
		"larr" : '\&larr;',
		"uarr" : '\&uarr;',
		"rarr" : '\&rarr;',
		"darr" : '\&darr;',
		"harr" : '\&harr;',
		"crarr" : '\&crarr;',
		"lArr" : '\&lArr;',
		"uArr" : '\&uArr;',
		"rArr" : '\&rArr;',
		"dArr" : '\&dArr;',
		"hArr" : '\&hArr;',
		"forall" : '\&forall;',
		"part" : '\&part;',
		"exist" : '\&exist;',
		"empty" : '\&empty;',
		"nabla" : '\&nabla;',
		"isin" : '\&isin;',
		"notin" : '\&notin;',
		"ni" : '\&ni;',
		"prod" : '\&prod;',
		"sum" : '\&sum;',
		"minus" : '\&minus;',
		"lowast" : '\&lowast;',
		"radic" : '\&radic;',
		"prop" : '\&prop;',
		"infin" : '\&infin;',
		"ang" : '\&ang;',
		"and" : '\&and;',
		"or" : '\&or;',
		"cap" : '\&cap;',
		"cup" : '\&cup;',
		"int" : '\&int;',
		"there4" : '\&there4;',
		"sim" : '\&sim;',
		"cong" : '\&cong;',
		"asymp" : '\&asymp;',
		"ne" : '\&ne;',
		"equiv" : '\&equiv;',
		"le" : '\&le;',
		"ge" : '\&ge;',
		"sub" : '\&sub;',
		"sup" : '\&sup;',
		"nsub" : '\&nsub;',
		"sube" : '\&sube;',
		"supe" : '\&supe;',
		"oplus" : '\&oplus;',
		"otimes" : '\&otimes;',
		"perp" : '\&perp;',
		"sdot" : '\&sdot;',
		"lceil" : '\&lceil;',
		"rceil" : '\&rceil;',
		"lfloor" : '\&lfloor;',
		"rfloor" : '\&rfloor;',
		"loz" : '\&loz;',
		"spades" : '\&spades;',
		"clubs" : '\&clubs;',
		"hearts" : '\&hearts;',
		"diams" : '\&diams;',
		"lang" : '\&lang;',
		"rang" : '\&rang;',

		"apos"  : '\''
	];

	// Fill the main thread's reverse table right away, so code that runs
	// during start-up (before thread-local constructors) can already use it.
	foreach (name, c; entities)
		entityNames[c] = name;
}

/// Runs at the start of every thread: gives threads other than the main one
/// their own populated `entityNames`. Without this, the thread-local table
/// would be empty there and any lookup by character would fail.
static this()
{
	// The main thread's copy was already filled by `shared static this()`.
	if (entityNames.length == 0)
	{
		foreach (name, c; entities)
			entityNames[c] = name;
	}
}
286 
287 import core.stdc.stdio;
288 import std.array;
289 import std.exception;
290 import std.utf;
291 import ae.utils.textout;
292 
/// Returns `str` with every character accepted by `pred` replaced by its
/// named entity. When no character matches, the input is returned as-is
/// and nothing is allocated.
///
/// Params:
///   unicode = when true, the string is decoded code point by code point
///             and `pred` receives a `dchar`; otherwise `pred` receives
///             each `char` (UTF-8 code unit).
///   pred    = predicate selecting the characters to encode.
///   str     = the text to encode.
/*private*/ public string encodeEntitiesImpl(bool unicode, alias pred)(string str)
{
	for (size_t pos = 0; pos < str.length; )
	{
		// Remember where this character begins; `pos` moves past it below.
		immutable charStart = pos;
		static if (unicode)
			dchar ch = decode(str, pos);
		else
			char ch = str[pos++];

		if (!pred(ch))
			continue;

		// First character that needs encoding: copy the untouched prefix
		// and let putEncodedEntitiesImpl handle the remainder.
		StringBuilder result;
		result.preallocate(str.length * 11 / 10);
		result.put(str[0 .. charStart]);
		result.putEncodedEntitiesImpl!(unicode, pred)(str[charStart .. $]);
		return result.get();
	}

	return str;
}
315 
/// Writes `str` to `sink`, replacing every character accepted by `pred`
/// with `&name;`, where the name is looked up in `entityNames`.
///
/// Params:
///   unicode = when true, `str` is decoded code point by code point and
///             `pred` receives a `dchar`; otherwise `pred` receives each
///             `char`.
///   pred    = predicate selecting the characters to encode. Every
///             character it accepts is indexed in `entityNames`.
/*private*/ public template putEncodedEntitiesImpl(bool unicode, alias pred)
{
	void putEncodedEntitiesImpl(Sink, S)(ref Sink sink, S str)
	{
		// Start of the run of text that has not been written to the sink yet.
		size_t pending = 0;

		for (size_t pos = 0; pos < str.length; )
		{
			immutable charStart = pos;
			static if (unicode)
				dchar ch = decode(str, pos);
			else
				char ch = str[pos++];

			if (!pred(ch))
				continue;

			// Flush the literal run before this character, then the entity.
			sink.put(str[pending .. charStart], '&', entityNames[ch], ';');
			pending = pos;
		}

		// Whatever follows the last encoded character.
		sink.put(str[pending .. $]);
	}
}
338 
/// Returns the string with `<`, `>`, `"`, `'` and `&` replaced by their
/// named entities. Works on individual `char`s (no UTF decoding).
public alias encodeEntities = encodeEntitiesImpl!(false, (char c) {
	switch (c)
	{
		case '<', '>', '"', '\'', '&':
			return true;
		default:
			return false;
	}
});

/// Like `encodeEntities`, but writes the encoded text to a sink instead of
/// returning a new string.
public alias putEncodedEntities = putEncodedEntitiesImpl!(false, (char c) {
	switch (c)
	{
		case '<', '>', '"', '\'', '&':
			return true;
		default:
			return false;
	}
});
341 
/// Replaces every character of `str` that has an entry in `entityNames`
/// with its `&name;` form.
///
/// The string is built in a single forward pass; when no character has a
/// named entity, `str` itself is returned and nothing is allocated.
///
/// Params:
///   str = the text to encode (decoded as UTF-8, code point by code point).
/// Returns: the encoded text.
public string encodeAllEntities(string str)
{
	auto result = appender!string();
	// Everything in str[0 .. copied] has already been written to `result`.
	// It stays 0 until the first replacement, which always advances it.
	size_t copied = 0;

	foreach (i, dchar c; str)
	{
		if (auto name = c in entityNames)
		{
			result.put(str[copied .. i]);
			result.put('&');
			result.put(*name);
			result.put(';');
			// Skip all UTF-8 code units of the replaced character.
			copied = i + stride(str, i);
		}
	}

	if (copied == 0)
		return str;

	result.put(str[copied .. $]);
	return result.data;
}
353 
354 import ae.utils.text;
355 import std.conv;
356 
/// Replaces entity references in `str` with the characters they denote.
///
/// Handles named references looked up in `entities`, decimal references
/// (`&#NNN;`) and hexadecimal references (`&#xHHH;`). A string without any
/// `&` is returned unchanged.
///
/// Throws: `XmlParseException` when an `&` is not followed by a non-empty
/// name terminated by `;`, or when a named entity is unknown.
public string decodeEntities(string str)
{
	auto parts = str.fastSplit('&');
	if (parts.length <= 1)
		return str;

	// One UTF-8 scratch buffer per reference; the decoded slices stored in
	// `pieces` point into these buffers.
	immutable referenceCount = parts.length - 1;
	auto scratch = new char[4][referenceCount];

	// Layout: text, decoded char, text, decoded char, ..., text.
	auto pieces = new string[parts.length*2 - 1];
	pieces[0] = parts[0];

	foreach (idx, part; parts[1..$])
	{
		auto semi = part.indexOf(';');
		enforce!XmlParseException(semi>0, "Invalid entity (unescaped ampersand?)");

		dchar decoded;
		if (part[0] != '#')
		{
			// Named reference.
			auto known = part[0..semi] in entities;
			enforce!XmlParseException(known, "Unknown entity: " ~ part[0..semi]);
			decoded = *known;
		}
		else if (part[1]=='x')
			decoded = fromHex!uint(part[2..semi]); // hexadecimal reference
		else
			decoded = to!uint(part[1..semi]); // decimal reference

		immutable encodedLength = std.utf.encode(scratch[idx], decoded);
		pieces[idx*2 + 1] = cast(string) scratch[idx][0..encodedLength];
		pieces[idx*2 + 2] = part[semi+1..$];
	}

	return pieces.join();
}
393 
/// Deprecated alias of `decodeEntities`.
deprecated alias decodeEntities convertEntities;
395 
unittest
{
	// The five XML special characters are escaped by encodeEntities.
	string plain = `The <Smith & Wesson> "lock'n'load"`;
	string escaped = `The &lt;Smith &amp; Wesson&gt; &quot;lock&apos;n&apos;load&quot;`;
	assert(encodeEntities(plain) == escaped);

	// Non-ASCII characters with named entities round-trip through
	// encodeAllEntities and decodeEntities.
	string symbols = "©,€";
	string symbolEntities = "&copy;,&euro;";
	assert(encodeAllEntities(symbols) == symbolEntities);
	assert(decodeEntities(symbolEntities) == symbols);
}