1 /**
2  * Code to convert strings from a few common codepages to UTF-8.
3  * Loosely based on https://github.com/adamdruppe/misc-stuff-including-D-programming-language-web-stuff/blob/master/characterencodings.d
4  *
5  * License:
6  *   This Source Code Form is subject to the terms of
7  *   the Mozilla Public License, v. 2.0. If a copy of
8  *   the MPL was not distributed with this file, You
9  *   can obtain one at http://mozilla.org/MPL/2.0/.
10  *
11  * Authors:
12  *   Vladimir Panteleev <ae@cy.md>
13  */
14 
15 module ae.utils.iconv;
16 
17 import std.string;
18 import std.array;
19 import std.conv;
20 import std.exception;
21 
22 import ae.utils.text : ascii;
23 
24 /// Convert text in an arbitrary (known) encoding to UTF-8.
25 /// Params:
26 ///   data  = text to convert
27 ///   cp    = the name of the source character encoding
28 ///   force = do not throw on errors; instead, do a best-effort translation
29 string toUtf8(in return ascii data, string cp, bool force = false)
30 {
31 	cp = toLower(cp).replace("-", "");
32 
33 	// Windows-1252 is a superset of ISO-8859-1
34 	if (cp == "iso88591")
35 		cp = "windows1252";
36 
37 	switch (cp)
38 	{
39 		case "utf8":
40 		{
41 			import std.utf;
42 			validate(data);
43 			return data;
44 		}
45 		case "usascii":
46 		{
47 			if (hasHighAsciiChars(data))
48 			{
49 				if (force)
50 					return stripNonAscii(data);
51 				else
52 					throw new Exception("Non-ASCII characters in US-ASCII text");
53 			}
54 
55 			return data;
56 		}
57 		case "utf16":
58 			enforce(data.length % 2 == 0, "Bad number of bytes for utf16");
59 			return to!string(cast(wstring)data);
60 		case "utf32":
61 			enforce(data.length % 4 == 0, "Bad number of bytes for utf32");
62 			return to!string(cast(dstring)data);
63 		default:
64 		{
65 			if (!hasHighAsciiChars(data))
66 				return data;
67 
68 			if (cp in codepages)
69 			{
70 				wstring cpData = codepages[cp];
71 				wchar[] result = new wchar[data.length];
72 				foreach (size_t i, ubyte b; data)
73 					result[i] = b < 0x80 ? b : cpData[b - 0x80];
74 				return to!string(result);
75 			}
76 			else
77 			if (force)
78 				return stripNonAscii(data);
79 			else
80 				throw new Exception("Don't know how to decode " ~ cp);
81 		}
82 	}
83 }
84 
85 /// True if `data` contains non-ASCII characters.
86 bool hasHighAsciiChars(in ascii data)
87 {
88 	foreach (char b; data)
89 		if (b >= 0x80)
90 			return true;
91 	return false;
92 }
93 
94 /// Replaces non-ASCII characters (with the high bit set) with the Unicode replacement character.
95 string stripNonAscii(in ascii data)
96 {
97 	wchar[] result = new wchar[data.length];
98 	foreach (size_t i, ubyte b; data)
99 		result[i] = b < 0x80 ? b : '\uFFFD';
100 	return to!string(result);
101 }
102 
103 
104 immutable shared wstring[string] codepages; /// High part of known 8-bit code pages.
105 
106 shared static this()
107 {
108 	codepages["windows1250"] = "€‚ƒ„…†‡ˆ‰Š‹ŚŤŽŹ‘’“”•–—˜™š›śťžź ˇ˘Ł¤Ą¦§¨©Ş«¬­®Ż°±˛ł´µ¶·¸ąş»Ľ˝ľżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙"w;
109 	codepages["windows1251"] = "ЂЃ‚ѓ„…†‡€‰Љ‹ЊЌЋЏђ‘’“”•–—˜™љ›њќћџ ЎўЈ¤Ґ¦§Ё©Є«¬­®Ї°±Ііґµ¶·ё№є»јЅѕїАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя"w;
110 	codepages["windows1252"] = "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"w;
111 	codepages["windows1253"] = "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ΅Ά£¤¥¦§¨©«¬­®―°±²³΄µ¶·ΈΉΊ»Ό½ΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"w;
112 	codepages["windows1254"] = "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖ×ØÙÚÛÜİŞßàáâãäåæçèéêëìíîïğñòóôõö÷øùúûüışÿ"w;
113 	codepages["windows1255"] = "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£₪¥¦§¨©×«¬­®¯°±²³´µ¶·¸¹÷»¼½¾¿ְֱֲֳִֵֶַָֹֺֻּֽ־ֿ׀ׁׂ׃װױײ׳״אבגדהוזחטיךכלםמןנסעףפץצקרשת\u200E\u200F"w;
114 	codepages["windows1256"] = "€پ‚ƒ„…†‡ˆ‰ٹ‹Œچژڈگ‘’“”•–—ک™ڑ›œ‌‍ں ،¢£¤¥¦§¨©ھ«¬­®¯°±²³´µ¶·¸¹؛»¼½¾؟ہءآأؤإئابةتثجحخدذرزسشصض×طظعغـفقكàلâمنهوçèéêëىيîïًٌٍَôُِ÷ّùْûü\u200E\u200Fے"w;
115 	codepages["windows1257"] = "€‚ƒ„…†‡ˆ‰Š‹Œ¨ˇ¸‘’“”•–—˜™š›œ¯˛Ÿ ¢£¤¦§Ø©Ŗ«¬­®Æ°±²³´µ¶·ø¹ŗ»¼½¾æĄĮĀĆÄÅĘĒČÉŹĖĢĶĪĻŠŃŅÓŌÕÖ×ŲŁŚŪÜŻŽßąįāćäåęēčéźėģķīļšńņóōõö÷ųłśūüżž˙"w;
116 	codepages["windows1258"] = "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂĂÄÅÆÇÈÉÊË̀ÍÎÏĐÑ̉ÓÔƠÖ×ØÙÚÛÜỮßàáâăäåæçèéêë́íîïđṇ̃óôơö÷øùúûüư₫ÿ"w;
117 	codepages["koi8r"] =       "─│┌┐└┘├┤┬┴┼▀▄█▌▐░▒▓⌠■∙√≈≤≥ ⌡°²·÷═║╒ё╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡Ё╢╣╤╥╦╧╨╩╪╫╬©юабцдефгхийклмнопярстужвьызшэщчъЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ"w;
118 	codepages["koi8u"] =       "─│┌┐└┘├┤┬┴┼▀▄█▌▐░▒▓⌠■∙√≈≤≥ ⌡°²·÷═║╒ёє╔ії╗╘╙╚╛ґў╞╟╠╡ЁЄ╣ІЇ╦╧╨╩╪ҐЎ©юабцдефгхийклмнопярстужвьызшэщчъЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ"w;
119 	codepages["iso88591"] =    "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"w;
120 	codepages["iso88592"] =    "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ Ą˘Ł¤ĽŚ§¨ŠŞŤŹ­ŽŻ°ą˛ł´ľśˇ¸šşťź˝žżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙"w;
121 	codepages["iso88593"] =    "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ Ħ˘£¤Ĥ§¨İŞĞĴ­Ż°ħ²³´µĥ·¸ışğĵ½żÀÁÂÄĊĈÇÈÉÊËÌÍÎÏÑÒÓÔĠÖ×ĜÙÚÛÜŬŜßàáâäċĉçèéêëìíîïñòóôġö÷ĝùúûüŭŝ˙"w;
122 	codepages["iso88594"] =    "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ĄĸŖ¤ĨĻ§¨ŠĒĢŦ­Ž¯°ą˛ŗ´ĩļˇ¸šēģŧŊžŋĀÁÂÃÄÅÆĮČÉĘËĖÍÎĪĐŅŌĶÔÕÖ×ØŲÚÛÜŨŪßāáâãäåæįčéęëėíîīđņōķôõö÷øųúûüũū˙"w;
123 	codepages["iso88595"] =    "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ЁЂЃЄЅІЇЈЉЊЋЌ­ЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя№ёђѓєѕіїјљњћќ§ўџ"w;
124 	codepages["iso88596"] =    "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¤،­؛؟ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْ"w;
125 	codepages["iso88597"] =    "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ʽʼ£¦§¨©«¬­―°±²³΄΅Ά·ΈΉΊ»Ό½ΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ"w;
126 	codepages["iso88598"] =    "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¢£¤¥¦§¨©×«¬­®‾°±²³´µ¶·¸¹÷»¼½¾‗אבגדהוזחטיךכלםמןנסעףפץצקרשת"w;
127 	codepages["iso88599"] =    "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖ×ØÙÚÛÜİŞßàáâãäåæçèéêëìíîïğñòóôõö÷øùúûüışÿ"w;
128 	codepages["iso885913"] =   "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ”¢£¤„¦§Ø©Ŗ«¬­®Æ°±²³“µ¶·ø¹ŗ»¼½¾æĄĮĀĆÄÅĘĒČÉŹĖĢĶĪĻŠŃŅÓŌÕÖ×ŲŁŚŪÜŻŽßąįāćäåęēčéźėģķīļšńņóōõö÷ųłśūüżž’"w;
129 	codepages["iso885915"] =   "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£€¥Š§š©ª«¬­®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"w;
130 	codepages["ascii8"] =      "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"w;
131 }
132 
133 unittest
134 {
135 	foreach (name, chars; codepages)
136 		assert(chars.length == 128);
137 }
138 
139 deprecated string toUtf8(in return ubyte[] data, string cp, bool force) { return toUtf8(cast(ascii)data, cp, force); }
140 deprecated bool hasHighAsciiChars(in ubyte[] data) { return hasHighAsciiChars(cast(ascii)data); }