/**
 * RFC 2646. May be upgraded to RFC 3676 for international text.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <ae@cy.md>
 */

module ae.net.ietf.wrap;

import std.algorithm;
import std.range;
import std.string;
import std.uni;

import ae.utils.text;

/// A plain-text paragraph.
struct Paragraph
{
	/// The leading part of the paragraph (identical for all lines in
	/// the encoded form). Generally some mix of `'>'` and space
	/// characters.
	string quotePrefix;

	/// The contents of the paragraph.
	string text;
}

/// Specifies the format for how line breaks and paragraphs are
/// encoded in a message.
enum WrapFormat
{
	fixed,       /// One paragraph per line
	flowed,      /// format=flowed
	flowedDelSp, /// format=flowed; delsp=yes
	heuristics,  /// Guess
	input,       /// As emitted by Rfc850Message.replyTemplate
}

/// Parses a message body holding text in the
/// specified format, and returns parsed paragraphs.
///
/// Params:
///   text       = the message body; it is split into lines with
///                `splitAsciiLines` before any further processing
///   wrapFormat = how line breaks in `text` are to be interpreted
///
/// Returns:
///   The paragraphs found in `text`, in order. Each paragraph carries
///   the quote prefix that was stripped from its source line(s).
Paragraph[] unwrapText(string text, WrapFormat wrapFormat)
{
	auto lines = text.splitAsciiLines();

	Paragraph[] paragraphs;

	// Removes the leading run of quote markers from `line` (in place)
	// and returns the removed prefix.
	string stripQuotePrefix(ref string line)
	{
		auto original = line;

		while (line.startsWith(">"))
		{
			int l = 1;
			// This is against standard, but many clients
			// (incl. Web-News and M$ Outlook) don't give a damn:
			if (line.startsWith("> "))
				l = 2;

			line = line[l..$];
		}

		// `line` is always a suffix of `original`, so the prefix is
		// whatever was cut off the front.
		return original[0 .. $ - line.length];
	}

	final switch (wrapFormat)
	{
		case WrapFormat.fixed:
			// Every line is a paragraph of its own.
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);
				paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.flowed:
		case WrapFormat.flowedDelSp:
		case WrapFormat.input:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				// Remove space-stuffing
				if (wrapFormat != WrapFormat.input && !quotePrefix.length && line.startsWith(" "))
					line = line[1..$];

				// A line continues the previous paragraph when the
				// previous line ended in a space (soft line break),
				// the quote level is unchanged, and neither line is
				// empty or the signature separator. For
				// WrapFormat.input, only quoted text is joined.
				if (paragraphs.length>0
				 && paragraphs[$-1].quotePrefix==quotePrefix
				 && paragraphs[$-1].text.endsWith(" ")
				 && line.length
				 && line != "-- "
				 && paragraphs[$-1].text != "-- "
				 && (wrapFormat != WrapFormat.input || quotePrefix.length))
				{
					// With delsp=yes, the space marking the soft line
					// break is not part of the text.
					if (wrapFormat == WrapFormat.flowedDelSp)
						paragraphs[$-1].text = paragraphs[$-1].text[0..$-1];
					paragraphs[$-1].text ~= line;
				}
				else
					paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.heuristics:
		{
			// Use heuristics for non-format=flowed text.

			// Guesses whether a run of non-empty lines is a single
			// paragraph that was hard-wrapped (true), or text whose
			// line breaks are significant (false).
			static bool isWrapped(in string[] lines)
			{
				assert(lines.all!(line => line.length));

				// Heuristic checks (from most to least confidence):

				// Zero or one line - as-is
				if (lines.length < 2)
					return false;

				// If any line starts with whitespace or contains a tab,
				// consider pre-wrapped (code, likely).
				if (lines.any!(line => isWhite(line[0]) || line.canFind('\t')))
					return false;

				// Detect implicit format=flowed (trailing space)
				if (lines[0..$-1].all!(line => line[$-1] == ' '))
					return true;

				// Check if the set of lines can feasibly be the output
				// of a typical naive line-wrapping algorithm
				// (and calculate the possible range of line widths).
				size_t wrapMin = 1, wrapMax = 1000;
				foreach (i, line; lines[0..$-1])
				{
					// The wrap width is at least this line's length,
					// and less than what the line would have been had
					// the first word of the next line still fit.
					auto lineMin = line.stripRight.length;
					auto nextWord = lines[i+1].findSplit(" ")[0];
					auto lineMax = lineMin + 1 + nextWord.length;
					// Are we outside of our current range?
					if (lineMin > wrapMax || lineMax < wrapMin)
						return false; // pre-wrapped
					// Now, narrow down the range accordingly
					wrapMin = max(wrapMin, lineMin);
					wrapMax = min(wrapMax, lineMax);
				}
				// Finally, test last line
				if (lines[$-1].length > wrapMax)
					return false;
				// Sanity checks.
				if (wrapMax < 60 || wrapMin > 120)
					return false;

				// Character frequency check.

				size_t[256] count;
				size_t total;
				foreach (line; lines)
					foreach (c; line)
						count[c]++, total++;

				// D code tends to contain a lot of parens.
				auto parenFreq = (count['('] + count[')']) * 100 / total;

				return parenFreq < 2;
			}

			// Emits a run of lines sharing one quote prefix, either as
			// a single joined paragraph or as one paragraph per line.
			void handleParagraph(string quotePrefix, in string[] lines)
			{
				if (isWrapped(lines))
					paragraphs ~= Paragraph(quotePrefix, lines.map!stripRight.join(" "));
				else
					paragraphs ~= lines.map!(line => Paragraph(quotePrefix, line.stripRight())).array;
			}

			// Index of the first line of the run currently being
			// collected, or -1 if there is none.
			sizediff_t start = -1;
			string lastQuotePrefix;

			// `line` is taken by reference so that the quote prefix is
			// also stripped from the element stored in `lines`, which
			// is what `handleParagraph` later receives.
			foreach (i, ref line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				bool isDelim = !line.length
					|| line.strip() == "--" // signature
					|| line.startsWith("---") // Bugzilla
				;

				// A delimiter or a change in quote level ends the
				// current run.
				if (isDelim || quotePrefix != lastQuotePrefix)
				{
					if (start >= 0)
					{
						handleParagraph(lastQuotePrefix, lines[start..i]);
						start = -1;
					}
				}

				if (isDelim)
					paragraphs ~= Paragraph(quotePrefix, line);
				else
				if (start < 0)
					start = i;

				lastQuotePrefix = quotePrefix;
			}

			// Flush the trailing run, if any.
			if (start >= 0)
				handleParagraph(lastQuotePrefix, lines[start..$]);
		}
	}

	return paragraphs;
}

/// The default value of `wrapText`'s `margin` parameter.
enum DEFAULT_WRAP_LENGTH = 66;

/// Returns wrapped text in the `WrapFormat.flowed` format.
///
/// Params:
///   paragraphs = the paragraphs to encode
///   margin     = the maximum line width; a line's width is the length
///                of its quote prefix plus the number of graphemes in
///                its text (including the trailing space of a soft
///                line break, excluding any space-stuffing)
///
/// Returns:
///   The encoded lines, joined with `"\n"`.
string wrapText(Paragraph[] paragraphs, int margin = DEFAULT_WRAP_LENGTH)
{
	string[] lines;

	// Appends one output line, prepending the quote prefix and
	// space-stuffing the result where needed.
	void addLine(string quotePrefix, string line)
	{
		line = quotePrefix ~ line;
		// Add space-stuffing
		if (line.startsWith(" ") ||
			line.startsWith("From ") ||
			(line.startsWith(">") && quotePrefix.length==0))
		{
			line = " " ~ line;
		}
		lines ~= line;
	}

	foreach (paragraph; paragraphs)
	{
		string line = paragraph.text;

		// A trailing space would mark the paragraph's last line as
		// a soft line break, so drop any.
		while (line.length && line[$-1] == ' ')
			line = line[0..$-1];

		if (!line.length)
		{
			addLine(paragraph.quotePrefix, null);
			continue;
		}

		while (line.length)
		{
			// `lastIndex` is the end of the longest prefix of `line`
			// found so far that ends at a break opportunity and fits
			// within the margin; `lastLength` is its width.
			size_t lastIndex = 0;
			size_t lastLength = paragraph.quotePrefix.length;
			foreach (i, c; line)
				if (c == ' ' || i == line.length-1)
				{
					auto length = lastLength + line[lastIndex..i+1].byGrapheme.walkLength;
					if (length > margin)
						break;
					lastIndex = i+1;
					lastLength = length;
				}

			if (lastIndex == 0)
			{
				// Couldn't wrap: no break opportunity fits within the
				// margin. Emit the entire remainder as a single line.
				lastIndex = line.length;
			}

			addLine(paragraph.quotePrefix, line[0..lastIndex]);
			line = line[lastIndex..$];
		}
	}

	return lines.join("\n");
}

unittest
{
	// Space-stuffing: a line starting with a space gets one more
	// space prepended.
	assert(wrapText(unwrapText(" Hello", WrapFormat.fixed)) == "  Hello");

	// Don't rewrap user input
	assert(wrapText(unwrapText("Line 1 \nLine 2 ", WrapFormat.input)) == "Line 1\nLine 2");
	// ...but rewrap quoted text
	assert(wrapText(unwrapText("> Line 1 \n> Line 2 ", WrapFormat.input)) == "> Line 1 Line 2");
	// Wrap long lines
	import std.array : replicate;
	assert(wrapText(unwrapText(replicate("abcde ", 20), WrapFormat.fixed)).split("\n").length > 1);

	// Wrap by character (grapheme) count, not UTF-8 code-unit count.
	enum str = "Это очень очень очень очень очень очень очень длинная строка";
	import std.utf;
	static assert(str.toUTF32().length < DEFAULT_WRAP_LENGTH);
	static assert(str.length > DEFAULT_WRAP_LENGTH);
	assert(wrapText(unwrapText(str, WrapFormat.fixed)).split("\n").length == 1);

	// Allow wrapping and correctly unwrapping long sequences of spaces
	assert(unwrapText("| \n |", WrapFormat.flowed) == [Paragraph("", "| |")]);
}