/**
 * XML/HTML entity encoding/decoding.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <ae@cy.md>
 *   Simon Arlott
 */

module ae.utils.xml.entities;

import ae.utils.xml.common;

/// A mapping from HTML entity names to `dchar`.
const dchar[string] entities;

/// A mapping from `dchar` to the corresponding HTML entity name.
/*const*/ string[dchar] entityNames;

/// Populates `entities` from a literal table, then derives `entityNames`
/// from it by inverting every (name, character) pair.
shared static this()
{
	// Values are written as D named character entities (`\&name;`), so each
	// key and its value visibly correspond, and invisible characters
	// (zwnj, lrm, shy, nbsp, ...) stay readable in the source.
	entities =
	[
		"quot" : '\&quot;',
		"amp" : '\&amp;',
		"lt" : '\&lt;',
		"gt" : '\&gt;',

		"OElig" : '\&OElig;',
		"oelig" : '\&oelig;',
		"Scaron" : '\&Scaron;',
		"scaron" : '\&scaron;',
		"Yuml" : '\&Yuml;',
		"circ" : '\&circ;',
		"tilde" : '\&tilde;',
		"ensp" : '\&ensp;',
		"emsp" : '\&emsp;',
		"thinsp" : '\&thinsp;',
		"zwnj" : '\&zwnj;',
		"zwj" : '\&zwj;',
		"lrm" : '\&lrm;',
		"rlm" : '\&rlm;',
		"ndash" : '\&ndash;',
		"mdash" : '\&mdash;',
		"lsquo" : '\&lsquo;',
		"rsquo" : '\&rsquo;',
		"sbquo" : '\&sbquo;',
		"ldquo" : '\&ldquo;',
		"rdquo" : '\&rdquo;',
		"bdquo" : '\&bdquo;',
		"dagger" : '\&dagger;',
		"Dagger" : '\&Dagger;',
		"permil" : '\&permil;',
		"lsaquo" : '\&lsaquo;',
		"rsaquo" : '\&rsaquo;',
		"euro" : '\&euro;',

		"nbsp" : '\&nbsp;',
		"iexcl" : '\&iexcl;',
		"cent" : '\&cent;',
		"pound" : '\&pound;',
		"curren" : '\&curren;',
		"yen" : '\&yen;',
		"brvbar" : '\&brvbar;',
		"sect" : '\&sect;',
		"uml" : '\&uml;',
		"copy" : '\&copy;',
		"ordf" : '\&ordf;',
		"laquo" : '\&laquo;',
		"not" : '\&not;',
		"shy" : '\&shy;',
		"reg" : '\&reg;',
		"macr" : '\&macr;',
		"deg" : '\&deg;',
		"plusmn" : '\&plusmn;',
		"sup2" : '\&sup2;',
		"sup3" : '\&sup3;',
		"acute" : '\&acute;',
		"micro" : '\&micro;',
		"para" : '\&para;',
		"middot" : '\&middot;',
		"cedil" : '\&cedil;',
		"sup1" : '\&sup1;',
		"ordm" : '\&ordm;',
		"raquo" : '\&raquo;',
		"frac14" : '\&frac14;',
		"frac12" : '\&frac12;',
		"frac34" : '\&frac34;',
		"iquest" : '\&iquest;',
		"Agrave" : '\&Agrave;',
		"Aacute" : '\&Aacute;',
		"Acirc" : '\&Acirc;',
		"Atilde" : '\&Atilde;',
		"Auml" : '\&Auml;',
		"Aring" : '\&Aring;',
		"AElig" : '\&AElig;',
		"Ccedil" : '\&Ccedil;',
		"Egrave" : '\&Egrave;',
		"Eacute" : '\&Eacute;',
		"Ecirc" : '\&Ecirc;',
		"Euml" : '\&Euml;',
		"Igrave" : '\&Igrave;',
		"Iacute" : '\&Iacute;',
		"Icirc" : '\&Icirc;',
		"Iuml" : '\&Iuml;',
		"ETH" : '\&ETH;',
		"Ntilde" : '\&Ntilde;',
		"Ograve" : '\&Ograve;',
		"Oacute" : '\&Oacute;',
		"Ocirc" : '\&Ocirc;',
		"Otilde" : '\&Otilde;',
		"Ouml" : '\&Ouml;',
		"times" : '\&times;',
		"Oslash" : '\&Oslash;',
		"Ugrave" : '\&Ugrave;',
		"Uacute" : '\&Uacute;',
		"Ucirc" : '\&Ucirc;',
		"Uuml" : '\&Uuml;',
		"Yacute" : '\&Yacute;',
		"THORN" : '\&THORN;',
		"szlig" : '\&szlig;',
		"agrave" : '\&agrave;',
		"aacute" : '\&aacute;',
		"acirc" : '\&acirc;',
		"atilde" : '\&atilde;',
		"auml" : '\&auml;',
		"aring" : '\&aring;',
		"aelig" : '\&aelig;',
		"ccedil" : '\&ccedil;',
		"egrave" : '\&egrave;',
		"eacute" : '\&eacute;',
		"ecirc" : '\&ecirc;',
		"euml" : '\&euml;',
		"igrave" : '\&igrave;',
		"iacute" : '\&iacute;',
		"icirc" : '\&icirc;',
		"iuml" : '\&iuml;',
		"eth" : '\&eth;',
		"ntilde" : '\&ntilde;',
		"ograve" : '\&ograve;',
		"oacute" : '\&oacute;',
		"ocirc" : '\&ocirc;',
		"otilde" : '\&otilde;',
		"ouml" : '\&ouml;',
		"divide" : '\&divide;',
		"oslash" : '\&oslash;',
		"ugrave" : '\&ugrave;',
		"uacute" : '\&uacute;',
		"ucirc" : '\&ucirc;',
		"uuml" : '\&uuml;',
		"yacute" : '\&yacute;',
		"thorn" : '\&thorn;',
		"yuml" : '\&yuml;',

		"fnof" : '\&fnof;',
		"Alpha" : '\&Alpha;',
		"Beta" : '\&Beta;',
		"Gamma" : '\&Gamma;',
		"Delta" : '\&Delta;',
		"Epsilon" : '\&Epsilon;',
		"Zeta" : '\&Zeta;',
		"Eta" : '\&Eta;',
		"Theta" : '\&Theta;',
		"Iota" : '\&Iota;',
		"Kappa" : '\&Kappa;',
		"Lambda" : '\&Lambda;',
		"Mu" : '\&Mu;',
		"Nu" : '\&Nu;',
		"Xi" : '\&Xi;',
		"Omicron" : '\&Omicron;',
		"Pi" : '\&Pi;',
		"Rho" : '\&Rho;',
		"Sigma" : '\&Sigma;',
		"Tau" : '\&Tau;',
		"Upsilon" : '\&Upsilon;',
		"Phi" : '\&Phi;',
		"Chi" : '\&Chi;',
		"Psi" : '\&Psi;',
		"Omega" : '\&Omega;',
		"alpha" : '\&alpha;',
		"beta" : '\&beta;',
		"gamma" : '\&gamma;',
		"delta" : '\&delta;',
		"epsilon" : '\&epsilon;',
		"zeta" : '\&zeta;',
		"eta" : '\&eta;',
		"theta" : '\&theta;',
		"iota" : '\&iota;',
		"kappa" : '\&kappa;',
		"lambda" : '\&lambda;',
		"mu" : '\&mu;',
		"nu" : '\&nu;',
		"xi" : '\&xi;',
		"omicron" : '\&omicron;',
		"pi" : '\&pi;',
		"rho" : '\&rho;',
		"sigmaf" : '\&sigmaf;',
		"sigma" : '\&sigma;',
		"tau" : '\&tau;',
		"upsilon" : '\&upsilon;',
		"phi" : '\&phi;',
		"chi" : '\&chi;',
		"psi" : '\&psi;',
		"omega" : '\&omega;',
		"thetasym" : '\&thetasym;',
		"upsih" : '\&upsih;',
		"piv" : '\&piv;',
		"bull" : '\&bull;',
		"hellip" : '\&hellip;',
		"prime" : '\&prime;',
		"Prime" : '\&Prime;',
		"oline" : '\&oline;',
		"frasl" : '\&frasl;',
		"weierp" : '\&weierp;',
		"image" : '\&image;',
		"real" : '\&real;',
		"trade" : '\&trade;',
		"alefsym" : '\&alefsym;',
		"larr" : '\&larr;',
		"uarr" : '\&uarr;',
		"rarr" : '\&rarr;',
		"darr" : '\&darr;',
		"harr" : '\&harr;',
		"crarr" : '\&crarr;',
		"lArr" : '\&lArr;',
		"uArr" : '\&uArr;',
		"rArr" : '\&rArr;',
		"dArr" : '\&dArr;',
		"hArr" : '\&hArr;',
		"forall" : '\&forall;',
		"part" : '\&part;',
		"exist" : '\&exist;',
		"empty" : '\&empty;',
		"nabla" : '\&nabla;',
		"isin" : '\&isin;',
		"notin" : '\&notin;',
		"ni" : '\&ni;',
		"prod" : '\&prod;',
		"sum" : '\&sum;',
		"minus" : '\&minus;',
		"lowast" : '\&lowast;',
		"radic" : '\&radic;',
		"prop" : '\&prop;',
		"infin" : '\&infin;',
		"ang" : '\&ang;',
		"and" : '\&and;',
		"or" : '\&or;',
		"cap" : '\&cap;',
		"cup" : '\&cup;',
		"int" : '\&int;',
		"there4" : '\&there4;',
		"sim" : '\&sim;',
		"cong" : '\&cong;',
		"asymp" : '\&asymp;',
		"ne" : '\&ne;',
		"equiv" : '\&equiv;',
		"le" : '\&le;',
		"ge" : '\&ge;',
		"sub" : '\&sub;',
		"sup" : '\&sup;',
		"nsub" : '\&nsub;',
		"sube" : '\&sube;',
		"supe" : '\&supe;',
		"oplus" : '\&oplus;',
		"otimes" : '\&otimes;',
		"perp" : '\&perp;',
		"sdot" : '\&sdot;',
		"lceil" : '\&lceil;',
		"rceil" : '\&rceil;',
		"lfloor" : '\&lfloor;',
		"rfloor" : '\&rfloor;',
		"loz" : '\&loz;',
		"spades" : '\&spades;',
		"clubs" : '\&clubs;',
		"hearts" : '\&hearts;',
		"diams" : '\&diams;',
		"lang" : '\&lang;',
		"rang" : '\&rang;',

		"apos" : '\''
	];

	// Build the reverse (character -> name) lookup used by the encoders.
	foreach (name, c; entities)
		entityNames[c] = name;
}

import core.stdc.stdio;
import std.array;
import std.exception;
import std.string : indexOf;
import std.utf;
import ae.utils.textout;

/// Implementation helper behind `encodeEntities`.
///
/// Scans `str` for the first character accepted by `pred`. If there is
/// none, `str` is returned as-is, without allocating. Otherwise the
/// untouched prefix is copied into a `StringBuilder` and the remainder is
/// encoded through `_putEncodedEntitiesImpl`.
///
/// Params:
///   unicode = when `true`, `str` is decoded with `std.utf.decode` and
///             `pred` receives `dchar` code points; when `false`, `pred`
///             receives individual `char` code units.
///   pred    = predicate selecting the characters to be replaced.
///   str     = the text to encode.
/// Returns: `str` itself if nothing matched, otherwise the encoded copy.
/*private*/ public string _encodeEntitiesImpl(bool unicode, alias pred)(string str)
{
	size_t i = 0;
	while (i < str.length)
	{
		size_t o = i; // start offset of the character being examined
		static if (unicode)
			dchar c = decode(str, i);
		else
			char c = str[i++];

		if (pred(c))
		{
			StringBuilder sb;
			// Reserve about 10% more than the input length up front.
			sb.preallocate(str.length * 11 / 10);
			sb.put(str[0..o]);
			sb._putEncodedEntitiesImpl!(unicode, pred)(str[o..$]);
			return sb.get();
		}
	}
	return str;
}

/// Implementation helper behind `putEncodedEntities`.
///
/// Writes `str` to `sink`, replacing every character accepted by `pred`
/// with `&name;`, where `name` is looked up in `entityNames`. Runs of
/// unmatched text are passed to `sink.put` as slices of `str`.
///
/// `pred` must accept only characters that have an entry in
/// `entityNames`: the name is fetched with an index expression, which
/// fails for a missing key.
///
/// Params:
///   unicode = when `true`, `str` is decoded with `std.utf.decode` and
///             `pred` receives `dchar` code points; when `false`, `pred`
///             receives individual `char` code units.
///   pred    = predicate selecting the characters to be replaced.
/*private*/ public template _putEncodedEntitiesImpl(bool unicode, alias pred)
{
	void _putEncodedEntitiesImpl(Sink, S)(ref Sink sink, S str)
	{
		// `start` marks the beginning of the pending run of unmatched text.
		size_t start = 0, i = 0;
		while (i < str.length)
		{
			size_t o = i; // start offset of the character being examined
			static if (unicode)
				dchar c = decode(str, i);
			else
				char c = str[i++];

			if (pred(c))
			{
				sink.put(str[start..o], '&', entityNames[c], ';');
				start = i;
			}
		}
		// Flush whatever follows the last replaced character.
		sink.put(str[start..$]);
	}
}

/// Encode HTML entities and return the resulting string.
/// Only `<`, `>`, `"`, `'` and `&` are replaced.
public alias encodeEntities = _encodeEntitiesImpl!(false, (char c) => c=='<' || c=='>' || c=='"' || c=='\'' || c=='&');

/// Write a string to a sink, encoding HTML entities.
/// Only `<`, `>`, `"`, `'` and `&` are replaced.
public alias putEncodedEntities = _putEncodedEntitiesImpl!(false, (char c) => c=='<' || c=='>' || c=='"' || c=='\'' || c=='&');

/// Encode all known characters as HTML entities.
///
/// Every code point of `str` that has an entry in `entityNames` is replaced
/// with `&name;`.
public string encodeAllEntities(string str)
{
	// TODO: optimize
	// Walk backwards so that the offsets of characters not yet visited stay
	// valid while `str` is rebuilt.
	foreach_reverse (i, dchar c; str)
	{
		auto name = c in entityNames;
		if (name)
			str = str[0..i] ~ '&' ~ *name ~ ';' ~ str[i+stride(str,i)..$];
	}
	return str;
}

import ae.utils.text;
import std.conv;

/// Decode HTML entities and return the resulting string.
///
/// Each `&name;`, `&#NNN;` (decimal) or `&#xHHH;` (hexadecimal) reference
/// in `str` is replaced by the UTF-8 encoding of the character it denotes.
///
/// Returns: `str` itself when splitting it on `&` yields at most one
///   fragment; otherwise a newly joined string.
/// Throws: `XmlParseException` if the text after an `&` has no `;`, or has
///   it as the very first character, or if a named reference is not a key
///   of `entities`.
public string decodeEntities(string str)
{
	auto fragments = str.fastSplit('&');
	if (fragments.length <= 1)
		return str;

	// The result alternates literal text and decoded characters:
	// text, char, text, char, ..., text.
	auto interleaved = new string[fragments.length*2 - 1];
	// One 4-byte scratch buffer per reference, holding its UTF-8 encoding.
	auto buffers = new char[4][fragments.length-1];
	interleaved[0] = fragments[0];

	foreach (n, fragment; fragments[1..$])
	{
		// `fragment` is the text that followed an `&`; the reference body
		// is everything up to the first `;`.
		auto p = fragment.indexOf(';');
		enforce!XmlParseException(p>0, "Invalid entity (unescaped ampersand?)");

		dchar c;
		if (fragment[0]=='#')
		{
			// Numeric character reference.
			// NOTE(review): malformed digits are reported by the conversion
			// routines themselves, not as XmlParseException - confirm this
			// is what callers expect.
			if (fragment[1]=='x')
				c = fromHex!uint(fragment[2..p]);
			else
				c = to!uint(fragment[1..p]);
		}
		else
		{
			// Named reference.
			auto pentity = fragment[0..p] in entities;
			enforce!XmlParseException(pentity, "Unknown entity: " ~ fragment[0..p]);
			c = *pentity;
		}

		interleaved[1+n*2] = cast(string) buffers[n][0..std.utf.encode(buffers[n], c)];
		interleaved[2+n*2] = fragment[p+1..$];
	}

	return interleaved.join();
}

/// Deprecated name for `decodeEntities`.
deprecated alias convertEntities = decodeEntities;

unittest
{
	// The five markup-significant characters are replaced by named entities.
	assert(encodeEntities(`The <Smith & Wesson> "lock'n'load"`) == `The &lt;Smith &amp; Wesson&gt; &quot;lock&apos;n&apos;load&quot;`);
	// Text without any of those characters is returned unchanged.
	assert(encodeEntities("plain text") == "plain text");
	// Every character with a known name is encoded...
	assert(encodeAllEntities("©,€") == "&copy;,&euro;");
	// ...and decoding reverses it.
	assert(decodeEntities("&copy;,&euro;") == "©,€");
	// Decimal numeric character references are decoded as well.
	assert(decodeEntities("&#169;") == "©");
}