/**
 * RFC 2646. May be upgraded to RFC 3676 for international text.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <ae@cy.md>
 */

module ae.net.ietf.wrap;

import std.algorithm;
import std.range;
import std.string;
import std.uni;

import ae.utils.text;

/// A plain-text paragraph.
struct Paragraph
{
	/// The leading part of the paragraph (identical for all lines in
	/// the encoded form). Generally some mix of `'>'` and space
	/// characters.
	string quotePrefix;

	/// The contents of the paragraph.
	string text;
}

/// Specifies the format for how line breaks and paragraphs are
/// encoded in a message.
enum WrapFormat
{
	fixed,       /// One paragraph per line
	flowed,      /// format=flowed
	flowedDelSp, /// format=flowed; delsp=yes
	heuristics,  /// Guess
	input,       /// As emitted by Rfc850Message.replyTemplate
	markdown,    /// Hard linebreak is 2 or more spaces
}

/**
 * Parses a message body holding text in the
 * specified format, and returns parsed paragraphs.
 *
 * Params:
 *   text       = the message body
 *   wrapFormat = how line breaks and paragraphs are encoded in `text`
 *
 * Returns: the paragraphs found in `text`, in order, each with its
 *   quote prefix split off into `Paragraph.quotePrefix`.
 */
Paragraph[] unwrapText(string text, WrapFormat wrapFormat)
{
	auto lines = text.splitAsciiLines();

	Paragraph[] paragraphs;

	// Removes the leading run of '>' quote markers from `line` (in place)
	// and returns the removed prefix as a slice of the original line.
	string stripQuotePrefix(ref string line)
	{
		auto oline = line;

		while (line.startsWith(">"))
		{
			int l = 1;
			// This is against standard, but many clients
			// (incl. Web-News and M$ Outlook) don't give a damn:
			if (line.startsWith("> "))
				l = 2;

			line = line[l..$];
		}

		// `line` is still a slice of `oline`, so the pointer distance
		// is exactly the length of the prefix that was consumed.
		return oline[0..line.ptr - oline.ptr];
	}

	final switch (wrapFormat)
	{
		case WrapFormat.fixed:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);
				paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.flowed:
		case WrapFormat.flowedDelSp:
		case WrapFormat.input:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				// Remove space-stuffing
				if (wrapFormat != WrapFormat.input && !quotePrefix.length && line.startsWith(" "))
					line = line[1..$];

				// A line continues the previous paragraph when that paragraph
				// ended in a soft break (trailing space) at the same quote
				// level. Signature separators ("-- ") are never joined, and
				// unquoted user input is never rewrapped.
				if (paragraphs.length > 0
				 && paragraphs[$-1].quotePrefix == quotePrefix
				 && paragraphs[$-1].text.endsWith(" ")
				 && line.length
				 && line != "-- "
				 && paragraphs[$-1].text != "-- "
				 && (wrapFormat != WrapFormat.input || quotePrefix.length))
				{
					// With delsp=yes, the soft-break space is not part of the text
					if (wrapFormat == WrapFormat.flowedDelSp)
						paragraphs[$-1].text = paragraphs[$-1].text[0..$-1];
					paragraphs[$-1].text ~= line;
				}
				else
					paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.markdown:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				// Join with the previous line unless it ended in a hard line
				// break, i.e. two or more trailing spaces.
				if (paragraphs.length > 0
				 && paragraphs[$-1].quotePrefix == quotePrefix
				 && !paragraphs[$-1].text.endsWith("  ")
				 && line.length
				 && line != "-- "
				 && paragraphs[$-1].text != "-- ")
				{
					// Separate the joined lines by exactly one space
					if (!paragraphs[$-1].text.endsWith(" "))
						paragraphs[$-1].text ~= " ";
					paragraphs[$-1].text ~= line;
				}
				else
					paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.heuristics:
		{
			// Use heuristics for non-format=flowed text.

			// Guesses whether a run of non-empty lines is one paragraph
			// that was hard-wrapped (true) or pre-formatted text (false).
			static bool isWrapped(in string[] lines)
			{
				assert(lines.all!(line => line.length));

				// Heuristic checks (from most to least confidence):

				// Zero or one line - as-is
				if (lines.length < 2)
					return false;

				// If any line starts with whitespace or contains a tab,
				// consider pre-wrapped (code, likely).
				if (lines.any!(line => isWhite(line[0]) || line.canFind('\t')))
					return false;

				// Detect implicit format=flowed (trailing space)
				if (lines[0..$-1].all!(line => line[$-1] == ' '))
					return true;

				// Check if the set of lines can feasibly be the output
				// of a typical naive line-wrapping algorithm
				// (and calculate the possible range of line widths).
				size_t wrapMin = 1, wrapMax = 1000;
				foreach (i, line; lines[0..$-1])
				{
					auto lineMin = line.stripRight.length;
					auto nextWord = lines[i+1].findSplit(" ")[0];
					auto lineMax = lineMin + 1 + nextWord.length;
					// Are we outside of our current range?
					if (lineMin > wrapMax || lineMax < wrapMin)
						return false; // pre-wrapped
					// Now, narrow down the range accordingly
					wrapMin = max(wrapMin, lineMin);
					wrapMax = min(wrapMax, lineMax);
				}
				// Finally, test last line
				if (lines[$-1].length > wrapMax)
					return false;
				// Sanity checks.
				if (wrapMax < 60 || wrapMin > 120)
					return false;

				// Character frequency check.

				size_t[256] count;
				size_t total;
				foreach (line; lines)
					foreach (c; line)
					{
						count[c]++;
						total++;
					}

				// D code tends to contain a lot of parens.
				auto parenFreq = (count['('] + count[')']) * 100 / total;

				return parenFreq < 2;
			}

			// Appends a run of lines sharing one quote prefix, either joined
			// into a single paragraph or as one paragraph per line.
			void handleParagraph(string quotePrefix, in string[] lines)
			{
				if (isWrapped(lines))
					paragraphs ~= Paragraph(quotePrefix, lines.map!stripRight.join(" "));
				else
					paragraphs ~= lines.map!(line => Paragraph(quotePrefix, line.stripRight())).array;
			}

			// Index of the first line of the run being collected, or -1
			sizediff_t start = -1;
			string lastQuotePrefix;

			// `ref` is deliberate: the quote prefix is stripped from the
			// array elements themselves, so the slices later passed to
			// handleParagraph hold prefix-less lines.
			foreach (i, ref line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				bool isDelim = !line.length
					|| line.strip() == "--" // signature
					|| line.startsWith("---") // Bugzilla
				;

				if (isDelim || quotePrefix != lastQuotePrefix)
				{
					if (start >= 0)
					{
						handleParagraph(lastQuotePrefix, lines[start..i]);
						start = -1;
					}
				}

				if (isDelim)
					paragraphs ~= Paragraph(quotePrefix, line);
				else
				if (start < 0)
					start = i;

				lastQuotePrefix = quotePrefix;
			}

			if (start >= 0)
				handleParagraph(lastQuotePrefix, lines[start..$]);
		}
	}

	return paragraphs;
}

unittest
{
	// Markdown: a plain newline is a soft break
	assert(unwrapText("Line 1\nLine 2", WrapFormat.markdown)
		== [Paragraph("", "Line 1 Line 2")]);
	// A single trailing space is still a soft break (and is not doubled)
	assert(unwrapText("Line 1 \nLine 2", WrapFormat.markdown)
		== [Paragraph("", "Line 1 Line 2")]);
	// Two or more trailing spaces are a hard break
	assert(unwrapText("Line 1  \nLine 2", WrapFormat.markdown)
		== [Paragraph("", "Line 1  "), Paragraph("", "Line 2")]);
}

/// The default value of `wrapText`'s `margin` parameter.
enum DEFAULT_WRAP_LENGTH = 66;

/// Returns wrapped text in the `WrapFormat.flowed` format.
///
/// Params:
///   paragraphs = the paragraphs to encode; trailing spaces in each
///                paragraph's text are dropped
///   margin     = the maximum line width, counted in graphemes and
///                including the quote prefix (the space added by
///                space-stuffing is not counted)
///
/// Returns: the encoded lines, joined with `"\n"`.
string wrapText(Paragraph[] paragraphs, int margin = DEFAULT_WRAP_LENGTH)
{
	string[] lines;

	// Appends one output line, prefixed with the quote prefix.
	void addLine(string quotePrefix, string line)
	{
		line = quotePrefix ~ line;
		// Add space-stuffing
		if (line.startsWith(" ") ||
			line.startsWith("From ") ||
			(line.startsWith(">") && quotePrefix.length == 0))
		{
			line = " " ~ line;
		}
		lines ~= line;
	}

	foreach (paragraph; paragraphs)
	{
		string line = paragraph.text;

		// A trailing space would mark the paragraph's last line as flowed
		while (line.length && line[$-1] == ' ')
			line = line[0..$-1];

		if (!line.length)
		{
			addLine(paragraph.quotePrefix, null);
			continue;
		}

		while (line.length)
		{
			// Find the last break opportunity (after a space, or the end of
			// the text) that keeps the line within the margin.
			size_t lastIndex = 0;
			size_t lastLength = paragraph.quotePrefix.length;
			foreach (i, c; line)
				if (c == ' ' || i == line.length-1)
				{
					// Width is measured in graphemes, not UTF-8 code units
					auto length = lastLength + line[lastIndex..i+1].byGrapheme.walkLength;
					if (length > margin)
						break;
					lastIndex = i+1;
					lastLength = length;
				}

			if (lastIndex == 0)
			{
				// Not even the first word fits within the margin.
				// Emit just that word (keeping its trailing space, so the
				// line remains a soft break) instead of the whole remainder.
				auto firstSpace = line.indexOf(' ');
				lastIndex = firstSpace < 0 ? line.length : cast(size_t) firstSpace + 1;
			}

			addLine(paragraph.quotePrefix, line[0..lastIndex]);
			line = line[lastIndex..$];
		}
	}

	return lines.join("\n");
}

unittest
{
	// Space-stuffing: a line starting with a space gets one more prepended
	assert(wrapText(unwrapText(" Hello", WrapFormat.fixed)) == "  Hello");

	// Don't rewrap user input
	assert(wrapText(unwrapText("Line 1 \nLine 2 ", WrapFormat.input)) == "Line 1\nLine 2");
	// ...but rewrap quoted text
	assert(wrapText(unwrapText("> Line 1 \n> Line 2 ", WrapFormat.input)) == "> Line 1 Line 2");
	// Wrap long lines
	import std.array : replicate;
	assert(wrapText(unwrapText(replicate("abcde ", 20), WrapFormat.fixed)).split("\n").length > 1);

	// A word longer than the margin gets a line of its own;
	// the rest of the paragraph is still wrapped.
	assert(wrapText([Paragraph("", "aaaaaaaaaaaa b c")], 5) == "aaaaaaaaaaaa \nb c");

	// Wrap by displayed character (grapheme) count, not UTF-8 code-unit count.
	enum str = "Это очень очень очень очень очень очень очень длинная строка";
	import std.utf;
	static assert(str.toUTF32().length < DEFAULT_WRAP_LENGTH);
	static assert(str.length > DEFAULT_WRAP_LENGTH);
	assert(wrapText(unwrapText(str, WrapFormat.fixed)).split("\n").length == 1);

	// Allow wrapping and correctly unwrapping long sequences of spaces
	assert(unwrapText("| \n |", WrapFormat.flowed) == [Paragraph("", "| |")]);
}