1 /**
2  * XML/HTML entity encoding/decoding.
3  *
4  * License:
5  *   This Source Code Form is subject to the terms of
6  *   the Mozilla Public License, v. 2.0. If a copy of
7  *   the MPL was not distributed with this file, You
8  *   can obtain one at http://mozilla.org/MPL/2.0/.
9  *
10  * Authors:
11  *   Vladimir Panteleev <ae@cy.md>
12  *   Simon Arlott
13  */
14 
15 module ae.utils.xml.entities;
16 
17 import ae.utils.xml.common;
18 
/// A mapping from HTML entity names to `dchar`.
/// Assigned exactly once, by the `shared static this()` below.
const dchar[string] entities;

/// A mapping from `dchar` to the corresponding HTML entity name.
/// This variable carries no `shared`, `__gshared` or `immutable` storage
/// class, so each thread has its own copy. It is therefore populated by the
/// thread-local `static this()` below (once per thread) rather than by the
/// process-wide `shared static this()`, which runs only in the main thread.
/*const*/ string[dchar] entityNames;

/// Process-wide initialization: builds the entity name -> character table.
shared static this()
{
	entities =
	[
		"quot" : '\&quot;',
		"amp" : '\&amp;',
		"lt" : '\&lt;',
		"gt" : '\&gt;',

		"OElig" : '\&OElig;',
		"oelig" : '\&oelig;',
		"Scaron" : '\&Scaron;',
		"scaron" : '\&scaron;',
		"Yuml" : '\&Yuml;',
		"circ" : '\&circ;',
		"tilde" : '\&tilde;',
		"ensp" : '\&ensp;',
		"emsp" : '\&emsp;',
		"thinsp" : '\&thinsp;',
		"zwnj" : '\&zwnj;',
		"zwj" : '\&zwj;',
		"lrm" : '\&lrm;',
		"rlm" : '\&rlm;',
		"ndash" : '\&ndash;',
		"mdash" : '\&mdash;',
		"lsquo" : '\&lsquo;',
		"rsquo" : '\&rsquo;',
		"sbquo" : '\&sbquo;',
		"ldquo" : '\&ldquo;',
		"rdquo" : '\&rdquo;',
		"bdquo" : '\&bdquo;',
		"dagger" : '\&dagger;',
		"Dagger" : '\&Dagger;',
		"permil" : '\&permil;',
		"lsaquo" : '\&lsaquo;',
		"rsaquo" : '\&rsaquo;',
		"euro" : '\&euro;',

		"nbsp" : '\&nbsp;',
		"iexcl" : '\&iexcl;',
		"cent" : '\&cent;',
		"pound" : '\&pound;',
		"curren" : '\&curren;',
		"yen" : '\&yen;',
		"brvbar" : '\&brvbar;',
		"sect" : '\&sect;',
		"uml" : '\&uml;',
		"copy" : '\&copy;',
		"ordf" : '\&ordf;',
		"laquo" : '\&laquo;',
		"not" : '\&not;',
		"shy" : '\&shy;',
		"reg" : '\&reg;',
		"macr" : '\&macr;',
		"deg" : '\&deg;',
		"plusmn" : '\&plusmn;',
		"sup2" : '\&sup2;',
		"sup3" : '\&sup3;',
		"acute" : '\&acute;',
		"micro" : '\&micro;',
		"para" : '\&para;',
		"middot" : '\&middot;',
		"cedil" : '\&cedil;',
		"sup1" : '\&sup1;',
		"ordm" : '\&ordm;',
		"raquo" : '\&raquo;',
		"frac14" : '\&frac14;',
		"frac12" : '\&frac12;',
		"frac34" : '\&frac34;',
		"iquest" : '\&iquest;',
		"Agrave" : '\&Agrave;',
		"Aacute" : '\&Aacute;',
		"Acirc" : '\&Acirc;',
		"Atilde" : '\&Atilde;',
		"Auml" : '\&Auml;',
		"Aring" : '\&Aring;',
		"AElig" : '\&AElig;',
		"Ccedil" : '\&Ccedil;',
		"Egrave" : '\&Egrave;',
		"Eacute" : '\&Eacute;',
		"Ecirc" : '\&Ecirc;',
		"Euml" : '\&Euml;',
		"Igrave" : '\&Igrave;',
		"Iacute" : '\&Iacute;',
		"Icirc" : '\&Icirc;',
		"Iuml" : '\&Iuml;',
		"ETH" : '\&ETH;',
		"Ntilde" : '\&Ntilde;',
		"Ograve" : '\&Ograve;',
		"Oacute" : '\&Oacute;',
		"Ocirc" : '\&Ocirc;',
		"Otilde" : '\&Otilde;',
		"Ouml" : '\&Ouml;',
		"times" : '\&times;',
		"Oslash" : '\&Oslash;',
		"Ugrave" : '\&Ugrave;',
		"Uacute" : '\&Uacute;',
		"Ucirc" : '\&Ucirc;',
		"Uuml" : '\&Uuml;',
		"Yacute" : '\&Yacute;',
		"THORN" : '\&THORN;',
		"szlig" : '\&szlig;',
		"agrave" : '\&agrave;',
		"aacute" : '\&aacute;',
		"acirc" : '\&acirc;',
		"atilde" : '\&atilde;',
		"auml" : '\&auml;',
		"aring" : '\&aring;',
		"aelig" : '\&aelig;',
		"ccedil" : '\&ccedil;',
		"egrave" : '\&egrave;',
		"eacute" : '\&eacute;',
		"ecirc" : '\&ecirc;',
		"euml" : '\&euml;',
		"igrave" : '\&igrave;',
		"iacute" : '\&iacute;',
		"icirc" : '\&icirc;',
		"iuml" : '\&iuml;',
		"eth" : '\&eth;',
		"ntilde" : '\&ntilde;',
		"ograve" : '\&ograve;',
		"oacute" : '\&oacute;',
		"ocirc" : '\&ocirc;',
		"otilde" : '\&otilde;',
		"ouml" : '\&ouml;',
		"divide" : '\&divide;',
		"oslash" : '\&oslash;',
		"ugrave" : '\&ugrave;',
		"uacute" : '\&uacute;',
		"ucirc" : '\&ucirc;',
		"uuml" : '\&uuml;',
		"yacute" : '\&yacute;',
		"thorn" : '\&thorn;',
		"yuml" : '\&yuml;',

		"fnof" : '\&fnof;',
		"Alpha" : '\&Alpha;',
		"Beta" : '\&Beta;',
		"Gamma" : '\&Gamma;',
		"Delta" : '\&Delta;',
		"Epsilon" : '\&Epsilon;',
		"Zeta" : '\&Zeta;',
		"Eta" : '\&Eta;',
		"Theta" : '\&Theta;',
		"Iota" : '\&Iota;',
		"Kappa" : '\&Kappa;',
		"Lambda" : '\&Lambda;',
		"Mu" : '\&Mu;',
		"Nu" : '\&Nu;',
		"Xi" : '\&Xi;',
		"Omicron" : '\&Omicron;',
		"Pi" : '\&Pi;',
		"Rho" : '\&Rho;',
		"Sigma" : '\&Sigma;',
		"Tau" : '\&Tau;',
		"Upsilon" : '\&Upsilon;',
		"Phi" : '\&Phi;',
		"Chi" : '\&Chi;',
		"Psi" : '\&Psi;',
		"Omega" : '\&Omega;',
		"alpha" : '\&alpha;',
		"beta" : '\&beta;',
		"gamma" : '\&gamma;',
		"delta" : '\&delta;',
		"epsilon" : '\&epsilon;',
		"zeta" : '\&zeta;',
		"eta" : '\&eta;',
		"theta" : '\&theta;',
		"iota" : '\&iota;',
		"kappa" : '\&kappa;',
		"lambda" : '\&lambda;',
		"mu" : '\&mu;',
		"nu" : '\&nu;',
		"xi" : '\&xi;',
		"omicron" : '\&omicron;',
		"pi" : '\&pi;',
		"rho" : '\&rho;',
		"sigmaf" : '\&sigmaf;',
		"sigma" : '\&sigma;',
		"tau" : '\&tau;',
		"upsilon" : '\&upsilon;',
		"phi" : '\&phi;',
		"chi" : '\&chi;',
		"psi" : '\&psi;',
		"omega" : '\&omega;',
		"thetasym" : '\&thetasym;',
		"upsih" : '\&upsih;',
		"piv" : '\&piv;',
		"bull" : '\&bull;',
		"hellip" : '\&hellip;',
		"prime" : '\&prime;',
		"Prime" : '\&Prime;',
		"oline" : '\&oline;',
		"frasl" : '\&frasl;',
		"weierp" : '\&weierp;',
		"image" : '\&image;',
		"real" : '\&real;',
		"trade" : '\&trade;',
		"alefsym" : '\&alefsym;',
		"larr" : '\&larr;',
		"uarr" : '\&uarr;',
		"rarr" : '\&rarr;',
		"darr" : '\&darr;',
		"harr" : '\&harr;',
		"crarr" : '\&crarr;',
		"lArr" : '\&lArr;',
		"uArr" : '\&uArr;',
		"rArr" : '\&rArr;',
		"dArr" : '\&dArr;',
		"hArr" : '\&hArr;',
		"forall" : '\&forall;',
		"part" : '\&part;',
		"exist" : '\&exist;',
		"empty" : '\&empty;',
		"nabla" : '\&nabla;',
		"isin" : '\&isin;',
		"notin" : '\&notin;',
		"ni" : '\&ni;',
		"prod" : '\&prod;',
		"sum" : '\&sum;',
		"minus" : '\&minus;',
		"lowast" : '\&lowast;',
		"radic" : '\&radic;',
		"prop" : '\&prop;',
		"infin" : '\&infin;',
		"ang" : '\&ang;',
		"and" : '\&and;',
		"or" : '\&or;',
		"cap" : '\&cap;',
		"cup" : '\&cup;',
		"int" : '\&int;',
		"there4" : '\&there4;',
		"sim" : '\&sim;',
		"cong" : '\&cong;',
		"asymp" : '\&asymp;',
		"ne" : '\&ne;',
		"equiv" : '\&equiv;',
		"le" : '\&le;',
		"ge" : '\&ge;',
		"sub" : '\&sub;',
		"sup" : '\&sup;',
		"nsub" : '\&nsub;',
		"sube" : '\&sube;',
		"supe" : '\&supe;',
		"oplus" : '\&oplus;',
		"otimes" : '\&otimes;',
		"perp" : '\&perp;',
		"sdot" : '\&sdot;',
		"lceil" : '\&lceil;',
		"rceil" : '\&rceil;',
		"lfloor" : '\&lfloor;',
		"rfloor" : '\&rfloor;',
		"loz" : '\&loz;',
		"spades" : '\&spades;',
		"clubs" : '\&clubs;',
		"hearts" : '\&hearts;',
		"diams" : '\&diams;',
		"lang" : '\&lang;',
		"rang" : '\&rang;',

		"apos"  : '\''
	];
}

/// Per-thread initialization: builds the character -> entity name table by
/// inverting `entities`. Shared static constructors run before thread-local
/// ones, so `entities` is already assigned when this runs in the main thread.
/// NOTE(review): for threads started later this relies on the `const`
/// `entities` table being visible from every thread - confirm.
static this()
{
	foreach (name, c; entities)
		entityNames[c] = name;
}
289 
290 import core.stdc.stdio;
291 import std.array;
292 import std.exception;
293 import std.string : indexOf;
294 import std.utf;
295 import ae.utils.textout;
296 
/// Implementation helper behind `encodeEntities`.
/// Scans `str` for the first character accepted by `pred`. If none is found,
/// `str` itself is returned without allocating; otherwise the untouched
/// prefix is copied into a `StringBuilder` and the remainder is encoded by
/// `_putEncodedEntitiesImpl`.
/// Params:
///   unicode = when `true`, `str` is decoded code point by code point with
///             `decode`; otherwise it is examined one `char` at a time
///   pred    = predicate called with each character; a true result means
///             the character must be replaced
///   str     = the text to encode
/// Returns: `str` when nothing had to be replaced, else the encoded copy.
/*private*/ public string _encodeEntitiesImpl(bool unicode, alias pred)(string str)
{
	for (size_t pos = 0; pos < str.length; )
	{
		// Remember where this character starts; `pos` is advanced past it below.
		auto charStart = pos;
		static if (unicode)
			dchar ch = decode(str, pos);
		else
			char ch = str[pos++];

		if (!pred(ch))
			continue;

		// First character needing replacement: switch to the building path.
		StringBuilder builder;
		builder.preallocate(str.length * 11 / 10);
		builder.put(str[0 .. charStart]);
		builder._putEncodedEntitiesImpl!(unicode, pred)(str[charStart .. $]);
		return builder.get();
	}
	return str;
}
319 
/// Implementation helper behind `putEncodedEntities`.
/// Writes `str` to `sink`, replacing every character accepted by `pred`
/// with an entity reference.
/// Params:
///   unicode = when `true`, `str` is decoded code point by code point with
///             `decode`; otherwise it is examined one `char` at a time
///   pred    = predicate called with each character; a true result means
///             the character must be replaced
/*private*/ public template _putEncodedEntitiesImpl(bool unicode, alias pred)
{
	/// Params:
	///   sink = output sink; its `put` is called with string slices and
	///          single characters
	///   str  = the text to encode
	void _putEncodedEntitiesImpl(Sink, S)(ref Sink sink, S str)
	{
		import std.conv : to;

		// `start` marks the beginning of the pending run of unmodified text.
		size_t start = 0, i = 0;
		while (i < str.length)
		{
			size_t o = i;
			static if (unicode)
				dchar c = decode(str, i);
			else
				char c = str[i++];

			if (pred(c))
			{
				if (auto name = dchar(c) in entityNames)
					sink.put(str[start..o], '&', *name, ';');
				else
				{
					// No named entity is known for this character: emit a
					// decimal numeric character reference instead of failing
					// with a RangeError on the table lookup.
					// In non-unicode mode `c` is a single UTF-8 code unit, so
					// `pred` is expected to accept ASCII characters only there.
					sink.put(str[start..o], "&#", to!string(cast(uint) c), ';');
				}
				start = i;
			}
		}
		sink.put(str[start..$]);
	}
}
342 
/// Predicate shared by `encodeEntities` and `putEncodedEntities`:
/// true for `<`, `>`, `"`, `'` and `&`.
private bool _isMarkupSpecialChar(char c) pure nothrow @nogc @safe
{
	switch (c)
	{
		case '<', '>', '"', '\'', '&':
			return true;
		default:
			return false;
	}
}

/// Encode HTML entities and return the resulting string.
public alias encodeEntities = _encodeEntitiesImpl!(false, _isMarkupSpecialChar);

/// Write a string to a sink, encoding HTML entities.
public alias putEncodedEntities = _putEncodedEntitiesImpl!(false, _isMarkupSpecialChar);
348 
/// Encode all known characters as HTML entities.
/// Every code point of `str` that has an entry in `entityNames` is replaced
/// by `&name;`; everything else is copied unchanged.
/// Params:
///   str = the text to encode
/// Returns: `str` itself when it contains no character with a known entity
///          name, otherwise a newly built string.
public string encodeAllEntities(string str)
{
	import std.array : appender;
	import std.utf : stride;

	auto result = appender!string();
	// Index just past the last replaced character, i.e. the start of the
	// text that has not been copied to `result` yet. It stays 0 until the
	// first replacement, because every character is at least one code unit.
	size_t copied = 0;

	foreach (i, dchar c; str)
	{
		auto name = c in entityNames;
		if (name is null)
			continue;

		result.put(str[copied .. i]);
		result.put('&');
		result.put(*name);
		result.put(';');
		copied = i + stride(str, i);
	}

	if (copied == 0)
		return str; // nothing was replaced; avoid allocating

	result.put(str[copied .. $]);
	return result.data;
}
361 
362 import ae.utils.text;
363 import std.conv;
364 
/// Decode HTML entities and return the resulting string.
/// Handles named entities found in `entities`, decimal references
/// (`&#NNN;`) and hexadecimal references (`&#xHHH;`).
/// Params:
///   str = the text to decode
/// Returns: `str` itself when splitting on `&` yields at most one fragment,
///          otherwise the decoded string.
/// Throws: `XmlParseException` when an ampersand is not followed by a
///         non-empty name terminated by `;`, or when a named entity is
///         unknown.
public string decodeEntities(string str)
{
	auto parts = str.fastSplit('&');
	if (parts.length <= 1)
		return str;

	// Every fragment after the first begins with an entity reference.
	auto entityCount = parts.length - 1;
	// Per-entity scratch space for the UTF-8 encoding of the decoded character.
	auto utf8 = new char[4][entityCount];
	// Output pieces: leading text, then (decoded entity, following text) pairs.
	auto pieces = new string[entityCount * 2 + 1];
	pieces[0] = parts[0];

	foreach (idx; 0 .. entityCount)
	{
		auto part = parts[idx + 1];
		auto semicolon = part.indexOf(';');
		enforce!XmlParseException(semicolon>0, "Invalid entity (unescaped ampersand?)");

		dchar codePoint;
		if (part[0] != '#')
		{
			// Named entity.
			auto known = part[0 .. semicolon] in entities;
			enforce!XmlParseException(known, "Unknown entity: " ~ part[0 .. semicolon]);
			codePoint = *known;
		}
		else if (part[1] == 'x')
			codePoint = fromHex!uint(part[2 .. semicolon]); // hexadecimal reference
		else
			codePoint = to!uint(part[1 .. semicolon]); // decimal reference

		auto encodedLength = std.utf.encode(utf8[idx], codePoint);
		pieces[idx * 2 + 1] = cast(string) utf8[idx][0 .. encodedLength];
		pieces[idx * 2 + 2] = part[semicolon + 1 .. $];
	}

	return pieces.join();
}
402 
/// Deprecated name for `decodeEntities`.
deprecated alias convertEntities = decodeEntities;
404 
unittest
{
	// The five markup-significant characters are replaced by named entities.
	string raw = `The <Smith & Wesson> "lock'n'load"`;
	string escaped = `The &lt;Smith &amp; Wesson&gt; &quot;lock&apos;n&apos;load&quot;`;
	assert(encodeEntities(raw) == escaped);

	// Non-ASCII characters with known names are encoded and decoded again.
	string symbols = "©,€";
	string named = "&copy;,&euro;";
	assert(encodeAllEntities(symbols) == named);
	assert(decodeEntities(named) == symbols);
}