/**
 * RFC 2646. May be upgraded to RFC 3676 for international text.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 */

module ae.net.ietf.wrap;

import std.algorithm;
import std.range;
import std.string;
import std.uni;

import ae.utils.text;

/// One logical paragraph of a message body.
struct Paragraph
{
	/// `quotePrefix` is the leading run of ">" / "> " quote markers
	/// that was stripped from the line(s); `text` is the remainder.
	string quotePrefix, text;
}

/// Describes how line breaks in the input text should be interpreted.
enum WrapFormat
{
	fixed,       /// One paragraph per line
	flowed,      /// format=flowed
	flowedDelSp, /// format=flowed; delsp=yes
	heuristics,  /// Guess
	input,       /// As emitted by Rfc850Message.replyTemplate
}

/**
 * Splits `text` into paragraphs, undoing line wrapping
 * according to `wrapFormat`.
 *
 * Params:
 *   text       = message body; split into lines with `splitAsciiLines`
 *   wrapFormat = how soft/hard line breaks in `text` are to be recognized
 *
 * Returns:
 *   The paragraphs in order of appearance, each with its quote prefix
 *   separated from its text.
 */
Paragraph[] unwrapText(string text, WrapFormat wrapFormat)
{
	auto lines = text.splitAsciiLines();

	Paragraph[] paragraphs;

	/// Removes the leading quote markers from `line` (in place)
	/// and returns the removed prefix as a slice of the original line.
	string stripQuotePrefix(ref string line)
	{
		auto oline = line;

		while (line.startsWith(">"))
		{
			int l = 1;
			// This is against standard, but many clients
			// (incl. Web-News and M$ Outlook) don't give a damn:
			if (line.startsWith("> "))
				l = 2;

			line = line[l..$];
		}

		// `line` is still a slice of `oline`, so the pointer
		// difference is the length of the stripped prefix.
		return oline[0..line.ptr - oline.ptr];
	}

	final switch (wrapFormat)
	{
		case WrapFormat.fixed:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);
				paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.flowed:
		case WrapFormat.flowedDelSp:
		case WrapFormat.input:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				// Remove space-stuffing
				if (wrapFormat != WrapFormat.input && !quotePrefix.length && line.startsWith(" "))
					line = line[1..$];

				// A line continues the previous paragraph when that paragraph
				// ended with a soft break (trailing space), the quote depth is
				// unchanged, and neither side is a signature separator.
				// For WrapFormat.input, only quoted text is ever rejoined.
				if (paragraphs.length>0
				 && paragraphs[$-1].quotePrefix==quotePrefix
				 && paragraphs[$-1].text.endsWith(" ")
				 && !line.startsWith(" ")
				 && line.length
				 && line != "-- "
				 && paragraphs[$-1].text != "-- "
				 && (wrapFormat != WrapFormat.input || quotePrefix.length))
				{
					// With delsp=yes, the space marking the soft break
					// is not part of the text.
					if (wrapFormat == WrapFormat.flowedDelSp)
						paragraphs[$-1].text = paragraphs[$-1].text[0..$-1];
					paragraphs[$-1].text ~= line;
				}
				else
					paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.heuristics:
		{
			// Use heuristics for non-format=flowed text.

			/// Guesses whether a run of non-empty lines is one paragraph
			/// that was hard-wrapped (true) or pre-formatted text (false).
			static bool isWrapped(in string[] lines)
			{
				assert(lines.all!(line => line.length));

				// Heuristic checks (from most to least confidence):

				// Zero or one line - as-is
				if (lines.length < 2)
					return false;

				// If any line starts with whitespace or contains a tab,
				// consider pre-wrapped (code, likely).
				if (lines.any!(line => isWhite(line[0]) || line.canFind('\t')))
					return false;

				// Detect implicit format=flowed (trailing space)
				if (lines[0..$-1].all!(line => line[$-1] == ' '))
					return true;

				// Check if the set of lines can feasibly be the output
				// of a typical naive line-wrapping algorithm
				// (and calculate the possible range of line widths).
				size_t wrapMin = 1, wrapMax = 1000;
				foreach (i, line; lines[0..$-1])
				{
					auto lineMin = line.stripRight.length;
					auto nextWord = lines[i+1].findSplit(" ")[0];
					auto lineMax = lineMin + 1 + nextWord.length;
					// Are we outside of our current range?
					if (lineMin > wrapMax || lineMax < wrapMin)
						return false; // pre-wrapped
					// Now, narrow down the range accordingly
					wrapMin = max(wrapMin, lineMin);
					wrapMax = min(wrapMax, lineMax);
				}
				// Finally, test last line
				if (lines[$-1].length > wrapMax)
					return false;
				// Sanity checks.
				if (wrapMax < 60 || wrapMin > 120)
					return false;

				// Character frequency check.

				size_t[256] count;
				size_t total;
				foreach (line; lines)
					foreach (c; line)
						count[c]++, total++;

				// D code tends to contain a lot of parens.
				auto parenFreq = (count['('] + count[')']) * 100 / total;

				return parenFreq < 2;
			}

			/// Appends a run of lines either as one joined paragraph
			/// or as one paragraph per line, as decided by isWrapped.
			void handleParagraph(string quotePrefix, in string[] lines)
			{
				if (isWrapped(lines))
					paragraphs ~= Paragraph(quotePrefix, lines.map!stripRight.join(" "));
				else
					paragraphs ~= lines.map!(line => Paragraph(quotePrefix, line.stripRight())).array;
			}

			// Index of the first line of the run being collected, or -1.
			sizediff_t start = -1;
			string lastQuotePrefix;

			// `ref` is required: stripQuotePrefix must update the array
			// element, because the slices passed to handleParagraph
			// are expected to hold the lines without their quote prefix.
			foreach (i, ref line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				bool isDelim = !line.length
					|| line.strip() == "--" // signature
					|| line.startsWith("---") // Bugzilla
				;

				// A delimiter or a change in quote depth ends the current run.
				if (isDelim || quotePrefix != lastQuotePrefix)
				{
					if (start >= 0)
					{
						handleParagraph(lastQuotePrefix, lines[start..i]);
						start = -1;
					}
				}

				if (isDelim)
					paragraphs ~= Paragraph(quotePrefix, line);
				else
				if (start < 0)
					start = i;

				lastQuotePrefix = quotePrefix;
			}

			if (start >= 0)
				handleParagraph(lastQuotePrefix, lines[start..$]);
		}
	}

	return paragraphs;
}

/// Default margin used by wrapText.
enum DEFAULT_WRAP_LENGTH = 66;

/**
 * Returns wrapped text in the WrapFormat.flowed format.
 *
 * Params:
 *   paragraphs = paragraphs to emit; trailing spaces in their text are dropped
 *   margin     = maximum line width, counted in graphemes and including
 *                the quote prefix (the space added by space-stuffing
 *                is not counted)
 *
 * Returns:
 *   The output lines joined with "\n". Every line but the last of a
 *   wrapped paragraph ends with a space (soft line break). A single word
 *   that does not fit within `margin` is emitted on a line of its own.
 */
string wrapText(Paragraph[] paragraphs, int margin = DEFAULT_WRAP_LENGTH)
{
	string[] lines;

	void addLine(string quotePrefix, string line)
	{
		line = quotePrefix ~ line;
		// Add space-stuffing
		if (line.startsWith(" ") ||
			line.startsWith("From ") ||
			(line.startsWith(">") && quotePrefix.length==0))
		{
			line = " " ~ line;
		}
		lines ~= line;
	}

	foreach (paragraph; paragraphs)
	{
		string line = paragraph.text;

		while (line.length && line[$-1] == ' ')
			line = line[0..$-1];

		if (!line.length)
		{
			addLine(paragraph.quotePrefix, null);
			continue;
		}

		while (line.length)
		{
			// End (exclusive) of the longest prefix of `line`
			// known to fit, and that prefix's width.
			size_t lastIndex = 0;
			size_t lastLength = paragraph.quotePrefix.length;
			foreach (i, c; line)
				if (c == ' ' || i == line.length-1)
				{
					auto length = lastLength + line[lastIndex..i+1].byGrapheme.walkLength;
					if (length > margin)
						break;
					lastIndex = i+1;
					lastLength = length;
				}

			if (lastIndex == 0)
			{
				// Not even the first word fits. Emit that word alone
				// (keeping its trailing space as the soft break) so that
				// the rest of the paragraph is still wrapped, instead of
				// putting everything that remains on a single line.
				auto space = line.indexOf(' ');
				lastIndex = space < 0 ? line.length : space + 1;
			}

			addLine(paragraph.quotePrefix, line[0..lastIndex]);
			line = line[lastIndex..$];
		}
	}

	return lines.join("\n");
}

unittest
{
	// Space-stuffing
	assert(wrapText(unwrapText(" Hello", WrapFormat.fixed)) == "  Hello");

	// Don't rewrap user input
	assert(wrapText(unwrapText("Line 1 \nLine 2 ", WrapFormat.input)) == "Line 1\nLine 2");
	// ...but rewrap quoted text
	assert(wrapText(unwrapText("> Line 1 \n> Line 2 ", WrapFormat.input)) == "> Line 1 Line 2");
	// Wrap long lines
	import std.array : replicate;
	assert(wrapText(unwrapText(replicate("abcde ", 20), WrapFormat.fixed)).split("\n").length > 1);

	// A word longer than the margin gets its own line; the rest still wraps
	assert(wrapText([Paragraph(null, replicate("x", 20) ~ " a b")], 10) == replicate("x", 20) ~ " \na b");
	// ...and a lone unbreakable word is emitted as-is
	assert(wrapText([Paragraph(null, replicate("x", 20))], 10) == replicate("x", 20));

	// Wrap by grapheme count, not UTF-8 code-unit count.
	enum str = "Это очень очень очень очень очень очень очень длинная строка";
	import std.utf;
	static assert(str.toUTF32().length < DEFAULT_WRAP_LENGTH);
	static assert(str.length > DEFAULT_WRAP_LENGTH);
	assert(wrapText(unwrapText(str, WrapFormat.fixed)).split("\n").length == 1);
}