/**
 * XML/HTML entity encoding/decoding.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 *   Simon Arlott
 */

module ae.utils.xml.entities;

import ae.utils.xml.common;

/// Maps an entity name (without the surrounding `&` and `;`) to its character.
/// Populated once, by the shared module constructor below.
const dchar[string] entities;

/// Reverse of `entities`: maps a character to its entity name.
///
/// This is `__gshared` because it is filled in by `shared static this()`,
/// which runs only once per process. A plain (thread-local) module variable
/// would stay empty in every thread except the one that ran the constructor,
/// making the encoders fail or silently do nothing there.
/// It must be treated as read-only after module construction.
/*const*/ __gshared string[dchar] entityNames;

shared static this()
{
	// Values use D named character entities (\&name;), so each key is simply
	// paired with the character of the same name. "apos" is spelled out as an
	// ordinary escape.
	entities =
	[
		"quot" : '\&quot;',
		"amp" : '\&amp;',
		"lt" : '\&lt;',
		"gt" : '\&gt;',

		"OElig" : '\&OElig;',
		"oelig" : '\&oelig;',
		"Scaron" : '\&Scaron;',
		"scaron" : '\&scaron;',
		"Yuml" : '\&Yuml;',
		"circ" : '\&circ;',
		"tilde" : '\&tilde;',
		"ensp" : '\&ensp;',
		"emsp" : '\&emsp;',
		"thinsp" : '\&thinsp;',
		"zwnj" : '\&zwnj;',
		"zwj" : '\&zwj;',
		"lrm" : '\&lrm;',
		"rlm" : '\&rlm;',
		"ndash" : '\&ndash;',
		"mdash" : '\&mdash;',
		"lsquo" : '\&lsquo;',
		"rsquo" : '\&rsquo;',
		"sbquo" : '\&sbquo;',
		"ldquo" : '\&ldquo;',
		"rdquo" : '\&rdquo;',
		"bdquo" : '\&bdquo;',
		"dagger" : '\&dagger;',
		"Dagger" : '\&Dagger;',
		"permil" : '\&permil;',
		"lsaquo" : '\&lsaquo;',
		"rsaquo" : '\&rsaquo;',
		"euro" : '\&euro;',

		"nbsp" : '\&nbsp;',
		"iexcl" : '\&iexcl;',
		"cent" : '\&cent;',
		"pound" : '\&pound;',
		"curren" : '\&curren;',
		"yen" : '\&yen;',
		"brvbar" : '\&brvbar;',
		"sect" : '\&sect;',
		"uml" : '\&uml;',
		"copy" : '\&copy;',
		"ordf" : '\&ordf;',
		"laquo" : '\&laquo;',
		"not" : '\&not;',
		"shy" : '\&shy;',
		"reg" : '\&reg;',
		"macr" : '\&macr;',
		"deg" : '\&deg;',
		"plusmn" : '\&plusmn;',
		"sup2" : '\&sup2;',
		"sup3" : '\&sup3;',
		"acute" : '\&acute;',
		"micro" : '\&micro;',
		"para" : '\&para;',
		"middot" : '\&middot;',
		"cedil" : '\&cedil;',
		"sup1" : '\&sup1;',
		"ordm" : '\&ordm;',
		"raquo" : '\&raquo;',
		"frac14" : '\&frac14;',
		"frac12" : '\&frac12;',
		"frac34" : '\&frac34;',
		"iquest" : '\&iquest;',
		"Agrave" : '\&Agrave;',
		"Aacute" : '\&Aacute;',
		"Acirc" : '\&Acirc;',
		"Atilde" : '\&Atilde;',
		"Auml" : '\&Auml;',
		"Aring" : '\&Aring;',
		"AElig" : '\&AElig;',
		"Ccedil" : '\&Ccedil;',
		"Egrave" : '\&Egrave;',
		"Eacute" : '\&Eacute;',
		"Ecirc" : '\&Ecirc;',
		"Euml" : '\&Euml;',
		"Igrave" : '\&Igrave;',
		"Iacute" : '\&Iacute;',
		"Icirc" : '\&Icirc;',
		"Iuml" : '\&Iuml;',
		"ETH" : '\&ETH;',
		"Ntilde" : '\&Ntilde;',
		"Ograve" : '\&Ograve;',
		"Oacute" : '\&Oacute;',
		"Ocirc" : '\&Ocirc;',
		"Otilde" : '\&Otilde;',
		"Ouml" : '\&Ouml;',
		"times" : '\&times;',
		"Oslash" : '\&Oslash;',
		"Ugrave" : '\&Ugrave;',
		"Uacute" : '\&Uacute;',
		"Ucirc" : '\&Ucirc;',
		"Uuml" : '\&Uuml;',
		"Yacute" : '\&Yacute;',
		"THORN" : '\&THORN;',
		"szlig" : '\&szlig;',
		"agrave" : '\&agrave;',
		"aacute" : '\&aacute;',
		"acirc" : '\&acirc;',
		"atilde" : '\&atilde;',
		"auml" : '\&auml;',
		"aring" : '\&aring;',
		"aelig" : '\&aelig;',
		"ccedil" : '\&ccedil;',
		"egrave" : '\&egrave;',
		"eacute" : '\&eacute;',
		"ecirc" : '\&ecirc;',
		"euml" : '\&euml;',
		"igrave" : '\&igrave;',
		"iacute" : '\&iacute;',
		"icirc" : '\&icirc;',
		"iuml" : '\&iuml;',
		"eth" : '\&eth;',
		"ntilde" : '\&ntilde;',
		"ograve" : '\&ograve;',
		"oacute" : '\&oacute;',
		"ocirc" : '\&ocirc;',
		"otilde" : '\&otilde;',
		"ouml" : '\&ouml;',
		"divide" : '\&divide;',
		"oslash" : '\&oslash;',
		"ugrave" : '\&ugrave;',
		"uacute" : '\&uacute;',
		"ucirc" : '\&ucirc;',
		"uuml" : '\&uuml;',
		"yacute" : '\&yacute;',
		"thorn" : '\&thorn;',
		"yuml" : '\&yuml;',

		"fnof" : '\&fnof;',
		"Alpha" : '\&Alpha;',
		"Beta" : '\&Beta;',
		"Gamma" : '\&Gamma;',
		"Delta" : '\&Delta;',
		"Epsilon" : '\&Epsilon;',
		"Zeta" : '\&Zeta;',
		"Eta" : '\&Eta;',
		"Theta" : '\&Theta;',
		"Iota" : '\&Iota;',
		"Kappa" : '\&Kappa;',
		"Lambda" : '\&Lambda;',
		"Mu" : '\&Mu;',
		"Nu" : '\&Nu;',
		"Xi" : '\&Xi;',
		"Omicron" : '\&Omicron;',
		"Pi" : '\&Pi;',
		"Rho" : '\&Rho;',
		"Sigma" : '\&Sigma;',
		"Tau" : '\&Tau;',
		"Upsilon" : '\&Upsilon;',
		"Phi" : '\&Phi;',
		"Chi" : '\&Chi;',
		"Psi" : '\&Psi;',
		"Omega" : '\&Omega;',
		"alpha" : '\&alpha;',
		"beta" : '\&beta;',
		"gamma" : '\&gamma;',
		"delta" : '\&delta;',
		"epsilon" : '\&epsilon;',
		"zeta" : '\&zeta;',
		"eta" : '\&eta;',
		"theta" : '\&theta;',
		"iota" : '\&iota;',
		"kappa" : '\&kappa;',
		"lambda" : '\&lambda;',
		"mu" : '\&mu;',
		"nu" : '\&nu;',
		"xi" : '\&xi;',
		"omicron" : '\&omicron;',
		"pi" : '\&pi;',
		"rho" : '\&rho;',
		"sigmaf" : '\&sigmaf;',
		"sigma" : '\&sigma;',
		"tau" : '\&tau;',
		"upsilon" : '\&upsilon;',
		"phi" : '\&phi;',
		"chi" : '\&chi;',
		"psi" : '\&psi;',
		"omega" : '\&omega;',
		"thetasym" : '\&thetasym;',
		"upsih" : '\&upsih;',
		"piv" : '\&piv;',
		"bull" : '\&bull;',
		"hellip" : '\&hellip;',
		"prime" : '\&prime;',
		"Prime" : '\&Prime;',
		"oline" : '\&oline;',
		"frasl" : '\&frasl;',
		"weierp" : '\&weierp;',
		"image" : '\&image;',
		"real" : '\&real;',
		"trade" : '\&trade;',
		"alefsym" : '\&alefsym;',
		"larr" : '\&larr;',
		"uarr" : '\&uarr;',
		"rarr" : '\&rarr;',
		"darr" : '\&darr;',
		"harr" : '\&harr;',
		"crarr" : '\&crarr;',
		"lArr" : '\&lArr;',
		"uArr" : '\&uArr;',
		"rArr" : '\&rArr;',
		"dArr" : '\&dArr;',
		"hArr" : '\&hArr;',
		"forall" : '\&forall;',
		"part" : '\&part;',
		"exist" : '\&exist;',
		"empty" : '\&empty;',
		"nabla" : '\&nabla;',
		"isin" : '\&isin;',
		"notin" : '\&notin;',
		"ni" : '\&ni;',
		"prod" : '\&prod;',
		"sum" : '\&sum;',
		"minus" : '\&minus;',
		"lowast" : '\&lowast;',
		"radic" : '\&radic;',
		"prop" : '\&prop;',
		"infin" : '\&infin;',
		"ang" : '\&ang;',
		"and" : '\&and;',
		"or" : '\&or;',
		"cap" : '\&cap;',
		"cup" : '\&cup;',
		"int" : '\&int;',
		"there4" : '\&there4;',
		"sim" : '\&sim;',
		"cong" : '\&cong;',
		"asymp" : '\&asymp;',
		"ne" : '\&ne;',
		"equiv" : '\&equiv;',
		"le" : '\&le;',
		"ge" : '\&ge;',
		"sub" : '\&sub;',
		"sup" : '\&sup;',
		"nsub" : '\&nsub;',
		"sube" : '\&sube;',
		"supe" : '\&supe;',
		"oplus" : '\&oplus;',
		"otimes" : '\&otimes;',
		"perp" : '\&perp;',
		"sdot" : '\&sdot;',
		"lceil" : '\&lceil;',
		"rceil" : '\&rceil;',
		"lfloor" : '\&lfloor;',
		"rfloor" : '\&rfloor;',
		"loz" : '\&loz;',
		"spades" : '\&spades;',
		"clubs" : '\&clubs;',
		"hearts" : '\&hearts;',
		"diams" : '\&diams;',
		"lang" : '\&lang;',
		"rang" : '\&rang;',

		"apos" : '\''
	];

	// Build the reverse lookup used by the encoders.
	foreach (name, c; entities)
		entityNames[c] = name;
}

import core.stdc.stdio;
import std.array;
import std.exception;
import std.utf;
import ae.utils.textout;

/**
 * Returns `str` with every character accepted by `pred` replaced by its
 * `&name;` entity.
 *
 * Scans for the first character that needs encoding; if there is none, the
 * original string is returned as-is, without allocating. Otherwise the
 * untouched prefix is copied into a `StringBuilder` and the remainder is
 * handed to `putEncodedEntitiesImpl`.
 *
 * Params:
 *   unicode = when true, `str` is decoded as UTF-8 and `pred` receives
 *             `dchar`s; when false, `pred` receives individual `char`s
 *   pred    = predicate selecting the characters to encode; every character
 *             it accepts must be a key of `entityNames`
 *   str     = text to encode
 */
/*private*/ public string encodeEntitiesImpl(bool unicode, alias pred)(string str)
{
	size_t i = 0;
	while (i < str.length)
	{
		size_t o = i; // offset of the character about to be examined
		static if (unicode)
			dchar c = decode(str, i);
		else
			char c = str[i++];

		if (pred(c))
		{
			StringBuilder sb;
			// Reserve room for roughly 10% growth over the input size.
			sb.preallocate(str.length * 11 / 10);
			sb.put(str[0..o]);
			sb.putEncodedEntitiesImpl!(unicode, pred)(str[o..$]);
			return sb.get();
		}
	}
	return str;
}

/**
 * Writes `str` to `sink`, replacing every character accepted by `pred` with
 * its `&name;` entity.
 *
 * Runs of characters that need no encoding are written as slices of `str`.
 * The entity name is looked up with `entityNames[c]`, so `pred` must accept
 * only characters that are keys of `entityNames`.
 *
 * `sink` must provide a `put` accepting the mix of slices, characters and
 * strings passed below.
 */
/*private*/ public template putEncodedEntitiesImpl(bool unicode, alias pred)
{
	void putEncodedEntitiesImpl(Sink, S)(ref Sink sink, S str)
	{
		size_t start = 0, i = 0; // start: beginning of the pending unencoded run
		while (i < str.length)
		{
			size_t o = i;
			static if (unicode)
				dchar c = decode(str, i);
			else
				char c = str[i++];

			if (pred(c))
			{
				sink.put(str[start..o], '&', entityNames[c], ';');
				start = i;
			}
		}
		sink.put(str[start..$]);
	}
}

/// Encodes the five characters `<`, `>`, `"`, `'` and `&` as entities,
/// returning the result as a string.
public alias encodeEntities = encodeEntitiesImpl!(false, (char c) => c=='<' || c=='>' || c=='"' || c=='\'' || c=='&');
/// Like `encodeEntities`, but writes the result to a sink.
public alias putEncodedEntities = putEncodedEntitiesImpl!(false, (char c) => c=='<' || c=='>' || c=='"' || c=='\'' || c=='&');

/**
 * Returns `str` with every character that has a named entity (i.e. every key
 * of `entityNames`) replaced by `&name;`.
 *
 * `str` is decoded as UTF-8. If no character needs encoding, `str` itself is
 * returned. The output is built in a single forward pass.
 */
public string encodeAllEntities(string str)
{
	auto result = appender!string();
	size_t start = 0; // beginning of the pending run of unencoded text
	size_t i = 0;
	while (i < str.length)
	{
		immutable o = i;
		immutable dchar c = decode(str, i);
		if (auto name = c in entityNames)
		{
			result.put(str[start .. o]);
			result.put('&');
			result.put(*name);
			result.put(';');
			start = i;
		}
	}

	// `start` only moves past 0 when something was encoded.
	if (start == 0)
		return str;

	result.put(str[start .. $]);
	return result.data;
}

import ae.utils.text;
import std.conv;

/**
 * Returns `str` with `&name;`, `&#NNN;` and `&#xHHH;` entities replaced by
 * the characters they denote.
 *
 * If `str` splits into a single fragment on '&', it is returned unchanged.
 *
 * Throws:
 *   `XmlParseException` when an ampersand is not followed by a non-empty
 *   name terminated by ';', or when a named entity is not in `entities`.
 *   Errors raised by the numeric conversions and by UTF-8 encoding of the
 *   resulting code point are propagated as-is.
 */
public string decodeEntities(string str)
{
	auto fragments = str.fastSplit('&');
	if (fragments.length <= 1)
		return str;

	// Output layout: text, char, text, char, ..., text.
	auto interleaved = new string[fragments.length*2 - 1];
	// One 4-byte UTF-8 scratch buffer per entity; the slices stored in
	// `interleaved` point into these buffers.
	auto buffers = new char[4][fragments.length-1];
	interleaved[0] = fragments[0];

	foreach (n, fragment; fragments[1..$])
	{
		// Every fragment after the first starts right after an '&'.
		auto p = fragment.indexOf(';');
		enforce!XmlParseException(p>0, "Invalid entity (unescaped ampersand?)");

		dchar c;
		if (fragment[0]=='#')
		{
			// Numeric character reference. p>0 guarantees fragment[1] exists.
			if (fragment[1]=='x')
				c = fromHex!uint(fragment[2..p]);
			else
				c = to!uint(fragment[1..p]);
		}
		else
		{
			auto pentity = fragment[0..p] in entities;
			enforce!XmlParseException(pentity, "Unknown entity: " ~ fragment[0..p]);
			c = *pentity;
		}

		interleaved[1+n*2] = cast(string) buffers[n][0..std.utf.encode(buffers[n], c)];
		interleaved[2+n*2] = fragment[p+1..$];
	}

	return interleaved.join();
}

/// Deprecated name of `decodeEntities`.
deprecated alias convertEntities = decodeEntities;

unittest
{
	assert(encodeEntities(`The <Smith & Wesson> "lock'n'load"`) == `The &lt;Smith &amp; Wesson&gt; &quot;lock&apos;n&apos;load&quot;`);
	assert(encodeAllEntities("©,€") == "&copy;,&euro;");
	assert(decodeEntities("&copy;,&euro;") == "©,€");
}

// The reverse lookup table must be usable from threads other than the one
// that ran the shared module constructor.
unittest
{
	import core.thread;

	auto t = new Thread({
		assert(encodeEntities("<&>") == "&lt;&amp;&gt;");
		assert(encodeAllEntities("©") == "&copy;");
	});
	t.start();
	t.join(); // rethrows any assertion failure from the thread
}