/**
 * Code to convert strings from a few common codepages to UTF-8.
 * Loosely based on https://github.com/adamdruppe/misc-stuff-including-D-programming-language-web-stuff/blob/master/characterencodings.d
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <ae@cy.md>
 */

module ae.utils.iconv;

import std.string;
import std.array;
import std.conv;
import std.exception;

import ae.utils.text : ascii;

/// Convert text in an arbitrary (known) encoding to UTF-8.
///
/// The encoding name is matched case-insensitively with all `-`
/// characters removed. `iso-8859-1` is decoded with the Windows-1252
/// table. `utf16` / `utf32` input is reinterpreted in place: no
/// byte-order conversion or BOM handling is performed.
///
/// Params:
///   data  = text to convert
///   cp    = the name of the source character encoding
///   force = do not throw on errors; instead, do a best-effort translation
///           (malformed sequences and bytes which cannot be decoded become
///           U+FFFD, a trailing partial UTF-16/UTF-32 code unit is ignored)
/// Returns:
///   The UTF-8 text. When no conversion is needed (valid UTF-8, or input
///   consisting only of bytes below 0x80), `data` itself is returned.
/// Throws:
///   Only when `force` is false: `std.utf.UTFException` for malformed
///   UTF input, or `Exception` for a bad byte count, non-ASCII bytes in
///   `us-ascii` text, or an unknown encoding applied to non-ASCII input.
string toUtf8(in return ascii data, string cp, bool force = false)
{
	cp = toLower(cp).replace("-", "");

	// Windows-1252 is a superset of ISO-8859-1
	if (cp == "iso88591")
		cp = "windows1252";

	switch (cp)
	{
		case "utf8":
		{
			import std.utf : validate, UTFException;

			try
			{
				validate(data);
				return data;
			}
			catch (UTFException e)
			{
				if (!force)
					throw e;
				// Best effort: keep everything that decodes, replace the rest.
				return lossyToUtf8(data);
			}
		}
		case "usascii":
		{
			if (hasHighAsciiChars(data))
			{
				if (force)
					return stripNonAscii(data);
				else
					throw new Exception("Non-ASCII characters in US-ASCII text");
			}

			return data;
		}
		case "utf16":
		{
			if (!force)
			{
				enforce(data.length % 2 == 0, "Bad number of bytes for utf16");
				return to!string(cast(wstring)data);
			}

			// Best effort: ignore a trailing odd byte, replace lone surrogates.
			const usable = data.length - data.length % 2;
			return lossyToUtf8(cast(wstring)(data[0 .. usable]));
		}
		case "utf32":
		{
			import std.utf : isValidDchar;

			if (!force)
			{
				enforce(data.length % 4 == 0, "Bad number of bytes for utf32");
				return to!string(cast(dstring)data);
			}

			// Best effort: ignore trailing partial unit, replace invalid code points.
			const usable = data.length - data.length % 4;
			auto units = cast(dstring)(data[0 .. usable]);
			auto fixed = new dchar[units.length];
			foreach (size_t i, dchar c; units)
				fixed[i] = isValidDchar(c) ? c : '\uFFFD';
			return to!string(fixed);
		}
		default:
		{
			// Pure 7-bit input is passed through untouched, whatever `cp` is.
			if (!hasHighAsciiChars(data))
				return data;

			if (auto table = cp in codepages)
			{
				// One UTF-16 code unit per input byte; bytes >= 0x80 are
				// looked up in the 128-entry table for this code page.
				wstring cpData = *table;
				wchar[] result = new wchar[data.length];
				foreach (size_t i, ubyte b; data)
					result[i] = b < 0x80 ? b : cpData[b - 0x80];
				return to!string(result);
			}
			else
			if (force)
				return stripNonAscii(data);
			else
				throw new Exception("Don't know how to decode " ~ cp);
		}
	}
}

/// Decodes `text` (UTF-8 or UTF-16 code units) to code points, substituting
/// U+FFFD for every malformed sequence, and re-encodes the result as UTF-8.
private string lossyToUtf8(S)(S text)
{
	import std.utf : byDchar;

	// byDchar decodes with replacement-character substitution instead of throwing.
	return to!string(array(byDchar(text)));
}

/// True if `data` contains non-ASCII characters
/// (i.e. at least one code unit with the high bit set).
bool hasHighAsciiChars(in ascii data)
{
	foreach (char c; data)
	{
		if (c >= 0x80)
			return true;
	}
	return false;
}

/// Replaces non-ASCII characters (with the high bit set) with the Unicode replacement character.
/// Each input byte yields exactly one output character; the result is UTF-8.
string stripNonAscii(in ascii data)
{
	wchar[] result = new wchar[data.length];
	foreach (size_t i, ubyte b; data)
		result[i] = b < 0x80 ? b : '\uFFFD';
	return to!string(result);
}


/// High part of known 8-bit code pages.
/// Keys are lower-case encoding names without dashes; each value maps byte
/// values 0x80 .. 0xFF (index `b - 0x80`) to UTF-16 code units.
immutable shared wstring[string] codepages;

// Each literal below must contain exactly 128 UTF-16 code units (this is
// verified by the unittest that follows).
// NOTE(review): some slots appear to hold non-printing characters which
// editors may hide or strip - re-run the unittest after touching these lines.
shared static this()
{
	codepages["windows1250"] = "€‚„…†‡‰Š‹ŚŤŽŹ‘’“”•–—™š›śťžź ˇ˘Ł¤Ą¦§¨©Ş«¬®Ż°±˛ł´µ¶·¸ąş»Ľ˝ľżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙"w;
	codepages["windows1251"] = "ЂЃ‚ѓ„…†‡€‰Љ‹ЊЌЋЏђ‘’“”•–—™љ›њќћџ ЎўЈ¤Ґ¦§Ё©Є«¬®Ї°±Ііґµ¶·ё№є»јЅѕїАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя"w;
	codepages["windows1252"] = "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"w;
	codepages["windows1253"] = "€‚ƒ„…†‡‰‹‘’“”•–—™› ΅Ά£¤¥¦§¨©«¬®―°±²³΄µ¶·ΈΉΊ»Ό½ΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"w;
	codepages["windows1254"] = "€‚ƒ„…†‡ˆ‰Š‹Œ‘’“”•–—˜™š›œŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖ×ØÙÚÛÜİŞßàáâãäåæçèéêëìíîïğñòóôõö÷øùúûüışÿ"w;
	codepages["windows1255"] = "€‚ƒ„…†‡ˆ‰‹‘’“”•–—˜™› ¡¢£₪¥¦§¨©×«¬®¯°±²³´µ¶·¸¹÷»¼½¾¿ְֱֲֳִֵֶַָֹֺֻּֽ־ֿ׀ׁׂ׃װױײ׳״אבגדהוזחטיךכלםמןנסעףפץצקרשת\u200E\u200F"w;
	codepages["windows1256"] = "€پ‚ƒ„…†‡ˆ‰ٹ‹Œچژڈگ‘’“”•–—ک™ڑ›œں ،¢£¤¥¦§¨©ھ«¬®¯°±²³´µ¶·¸¹؛»¼½¾؟ہءآأؤإئابةتثجحخدذرزسشصض×طظعغـفقكàلâمنهوçèéêëىيîïًٌٍَôُِ÷ّùْûü\u200E\u200Fے"w;
	codepages["windows1257"] = "€‚„…†‡‰‹¨ˇ¸‘’“”•–—™›¯˛ ¢£¤¦§Ø©Ŗ«¬®Æ°±²³´µ¶·ø¹ŗ»¼½¾æĄĮĀĆÄÅĘĒČÉŹĖĢĶĪĻŠŃŅÓŌÕÖ×ŲŁŚŪÜŻŽßąįāćäåęēčéźėģķīļšńņóōõö÷ųłśūüżž˙"w;
	codepages["windows1258"] = "€‚ƒ„…†‡ˆ‰‹Œ‘’“”•–—˜™›œŸ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂĂÄÅÆÇÈÉÊË̀ÍÎÏĐÑ̉ÓÔƠÖ×ØÙÚÛÜỮßàáâăäåæçèéêë́íîïđṇ̃óôơö÷øùúûüư₫ÿ"w;
	codepages["koi8r"] = "─│┌┐└┘├┤┬┴┼▀▄█▌▐░▒▓⌠■∙√≈≤≥ ⌡°²·÷═║╒ё╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡Ё╢╣╤╥╦╧╨╩╪╫╬©юабцдефгхийклмнопярстужвьызшэщчъЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ"w;
	codepages["koi8u"] = "─│┌┐└┘├┤┬┴┼▀▄█▌▐░▒▓⌠■∙√≈≤≥ ⌡°²·÷═║╒ёє╔ії╗╘╙╚╛ґў╞╟╠╡ЁЄ╣ІЇ╦╧╨╩╪ҐЎ©юабцдефгхийклмнопярстужвьызшэщчъЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ"w;
	codepages["iso88591"] = " ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"w;
	codepages["iso88592"] = " Ą˘Ł¤ĽŚ§¨ŠŞŤŹŽŻ°ą˛ł´ľśˇ¸šşťź˝žżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙"w;
	codepages["iso88593"] = " Ħ˘£¤Ĥ§¨İŞĞĴŻ°ħ²³´µĥ·¸ışğĵ½żÀÁÂÄĊĈÇÈÉÊËÌÍÎÏÑÒÓÔĠÖ×ĜÙÚÛÜŬŜßàáâäċĉçèéêëìíîïñòóôġö÷ĝùúûüŭŝ˙"w;
	codepages["iso88594"] = " ĄĸŖ¤ĨĻ§¨ŠĒĢŦŽ¯°ą˛ŗ´ĩļˇ¸šēģŧŊžŋĀÁÂÃÄÅÆĮČÉĘËĖÍÎĪĐŅŌĶÔÕÖ×ØŲÚÛÜŨŪßāáâãäåæįčéęëėíîīđņōķôõö÷øųúûüũū˙"w;
	codepages["iso88595"] = " ЁЂЃЄЅІЇЈЉЊЋЌЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя№ёђѓєѕіїјљњћќ§ўџ"w;
	codepages["iso88596"] = " ¤،؛؟ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْ"w;
	codepages["iso88597"] = " ʽʼ£¦§¨©«¬―°±²³΄΅Ά·ΈΉΊ»Ό½ΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"w;
	codepages["iso88598"] = " ¢£¤¥¦§¨©×«¬®‾°±²³´µ¶·¸¹÷»¼½¾‗אבגדהוזחטיךכלםמןנסעףפץצקרשת"w;
	codepages["iso88599"] = " ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖ×ØÙÚÛÜİŞßàáâãäåæçèéêëìíîïğñòóôõö÷øùúûüışÿ"w;
	codepages["iso885913"] = " ”¢£¤„¦§Ø©Ŗ«¬®Æ°±²³“µ¶·ø¹ŗ»¼½¾æĄĮĀĆÄÅĘĒČÉŹĖĢĶĪĻŠŃŅÓŌÕÖ×ŲŁŚŪÜŻŽßąįāćäåęēčéźėģķīļšńņóōõö÷ųłśūüżž’"w;
	codepages["iso885915"] = " ¡¢£€¥Š§š©ª«¬®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"w;
	codepages["ascii8"] = " ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"w;
}

// Every table must cover exactly the byte range 0x80 .. 0xFF.
unittest
{
	foreach (name, chars; codepages)
		assert(chars.length == 128);
}

// Legacy overloads accepting raw bytes; they forward to the `ascii` versions.
deprecated string toUtf8(in return ubyte[] data, string cp, bool force) { return toUtf8(cast(ascii)data, cp, force); }
deprecated bool hasHighAsciiChars(in ubyte[] data) { return hasHighAsciiChars(cast(ascii)data); }