/**
 * RFC 2646. May be upgraded to RFC 3676 for international text.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <ae@cy.md>
 */

module ae.net.ietf.wrap;

import std.algorithm;
import std.range;
import std.string;
import std.uni;

import ae.utils.text;

/// A plain-text paragraph.
struct Paragraph
{
	/// The leading part of the paragraph (identical for all lines in
	/// the encoded form). Generally some mix of `'>'` and space
	/// characters.
	string quotePrefix;

	/// The contents of the paragraph.
	string text;
}

/// Specifies the format for how line breaks and paragraphs are
/// encoded in a message.
enum WrapFormat
{
	fixed,       /// One paragraph per line
	flowed,      /// format=flowed
	flowedDelSp, /// format=flowed; delsp=yes
	heuristics,  /// Guess
	input,       /// As emitted by Rfc850Message.replyTemplate
}

/// Parses a message body holding text in the
/// specified format, and returns parsed paragraphs.
///
/// Params:
///   text       = the message body; it is split into lines with
///                `splitAsciiLines` before any further processing
///   wrapFormat = how line breaks in `text` are to be interpreted
///
/// Returns:
///   The paragraphs in order of appearance. Each paragraph carries the
///   quote prefix that was stripped from its source line(s).
Paragraph[] unwrapText(string text, WrapFormat wrapFormat)
{
	auto lines = text.splitAsciiLines();

	Paragraph[] paragraphs;

	// Removes every leading '>' (each optionally followed by a single
	// space) from `line` in place, and returns the removed part as a
	// slice of the original line.
	string stripQuotePrefix(ref string line)
	{
		auto oline = line;

		while (line.startsWith(">"))
		{
			int l = 1;
			// This is against standard, but many clients
			// (incl. Web-News and M$ Outlook) don't give a damn:
			if (line.startsWith("> "))
				l = 2;

			line = line[l..$];
		}

		return oline[0..line.ptr - oline.ptr];
	}

	final switch (wrapFormat)
	{
		case WrapFormat.fixed:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);
				paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.flowed:
		case WrapFormat.flowedDelSp:
		case WrapFormat.input:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				// Remove space-stuffing
				if (wrapFormat != WrapFormat.input && !quotePrefix.length && line.startsWith(" "))
					line = line[1..$];

				// A line continues the previous paragraph only if that
				// paragraph ended in a soft break (trailing space), both
				// share the same quote prefix, and neither of them is the
				// signature separator. Unquoted `input` text is never joined.
				if (paragraphs.length>0
				 && paragraphs[$-1].quotePrefix==quotePrefix
				 && paragraphs[$-1].text.endsWith(" ")
				 && !line.startsWith(" ")
				 && line.length
				 && line != "-- "
				 && paragraphs[$-1].text != "-- "
				 && (wrapFormat != WrapFormat.input || quotePrefix.length))
				{
					// With delsp=yes, the space marking the soft break
					// is not part of the text.
					if (wrapFormat == WrapFormat.flowedDelSp)
						paragraphs[$-1].text = paragraphs[$-1].text[0..$-1];
					paragraphs[$-1].text ~= line;
				}
				else
					paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.heuristics:
		{
			// Use heuristics for non-format=flowed text.

			// Guesses whether a run of non-empty lines is one paragraph
			// that was hard-wrapped by the sender (true), or a set of
			// lines which should be preserved as they are (false).
			static bool isWrapped(in string[] lines)
			{
				assert(lines.all!(line => line.length));

				// Line widths are measured in graphemes rather than in
				// UTF-8 code units, so that the column thresholds below
				// also hold for non-ASCII text (and agree with how
				// wrapText measures lines).
				static size_t textWidth(string s)
				{
					return s.byGrapheme.walkLength;
				}

				// Heuristic checks (from most to least confidence):

				// Zero or one line - as-is
				if (lines.length < 2)
					return false;

				// If any line starts with whitespace or contains a tab,
				// consider pre-wrapped (code, likely).
				if (lines.any!(line => isWhite(line[0]) || line.canFind('\t')))
					return false;

				// Detect implicit format=flowed (trailing space)
				if (lines[0..$-1].all!(line => line[$-1] == ' '))
					return true;

				// Check if the set of lines can feasibly be the output
				// of a typical naive line-wrapping algorithm
				// (and calculate the possible range of line widths).
				size_t wrapMin = 1, wrapMax = 1000;
				foreach (i, line; lines[0..$-1])
				{
					auto lineMin = textWidth(line.stripRight);
					auto nextWord = lines[i+1].findSplit(" ")[0];
					auto lineMax = lineMin + 1 + textWidth(nextWord);
					// Are we outside of our current range?
					if (lineMin > wrapMax || lineMax < wrapMin)
						return false; // pre-wrapped
					// Now, narrow down the range accordingly
					wrapMin = max(wrapMin, lineMin);
					wrapMax = min(wrapMax, lineMax);
				}
				// Finally, test last line
				if (textWidth(lines[$-1]) > wrapMax)
					return false;
				// Sanity checks.
				if (wrapMax < 60 || wrapMin > 120)
					return false;

				// Character frequency check.

				size_t[256] count;
				size_t total;
				foreach (line; lines)
					foreach (c; line)
						count[c]++, total++;

				// D code tends to contain a lot of parens.
				auto parenFreq = (count['('] + count[')']) * 100 / total;

				return parenFreq < 2;
			}

			// Emits a run of lines either as a single joined paragraph
			// (when it looks wrapped) or as one paragraph per line.
			void handleParagraph(string quotePrefix, in string[] lines)
			{
				if (isWrapped(lines))
					paragraphs ~= Paragraph(quotePrefix, lines.map!stripRight.join(" "));
				else
					paragraphs ~= lines.map!(line => Paragraph(quotePrefix, line.stripRight())).array;
			}

			// Index of the first line of the run being collected,
			// or -1 when no run is open.
			sizediff_t start = -1;
			string lastQuotePrefix;

			foreach (i, ref line; lines)
			{
				// `line` is taken by reference, so the quote prefix is
				// also stripped from the copy stored in `lines`.
				string quotePrefix = stripQuotePrefix(line);

				bool isDelim = !line.length
					|| line.strip() == "--" // signature
					|| line.startsWith("---") // Bugzilla
				;

				if (isDelim || quotePrefix != lastQuotePrefix)
				{
					if (start >= 0)
					{
						handleParagraph(lastQuotePrefix, lines[start..i]);
						start = -1;
					}
				}

				if (isDelim)
					paragraphs ~= Paragraph(quotePrefix, line);
				else
				if (start < 0)
					start = i;

				lastQuotePrefix = quotePrefix;
			}

			if (start >= 0)
				handleParagraph(lastQuotePrefix, lines[start..$]);
		}
	}

	return paragraphs;
}

/// The default
/// value of `wrapText`'s `margin` parameter.
enum DEFAULT_WRAP_LENGTH = 66;

/// Returns wrapped text in the `WrapFormat.flowed` format.
///
/// Params:
///   paragraphs = the paragraphs to encode
///   margin     = the maximum width of an output line, including the
///                quote prefix and the trailing soft-break space; the
///                text is measured in graphemes. The space-stuffing
///                character, when one is added, is not counted.
///
/// Returns:
///   The encoded lines, joined with `"\n"`. A line may still exceed
///   `margin` when it holds a single word which is too long to fit.
string wrapText(Paragraph[] paragraphs, int margin = DEFAULT_WRAP_LENGTH)
{
	import std.string : indexOf;

	string[] lines;

	// Appends one output line, made of the quote prefix and the given
	// text, adding space-stuffing where the result could otherwise be
	// misread (leading space, "From ", or an unquoted leading '>').
	void addLine(string quotePrefix, string line)
	{
		line = quotePrefix ~ line;
		// Add space-stuffing
		if (line.startsWith(" ") ||
			line.startsWith("From ") ||
			(line.startsWith(">") && quotePrefix.length==0))
		{
			line = " " ~ line;
		}
		lines ~= line;
	}

	foreach (paragraph; paragraphs)
	{
		// The signature separator is sent as-is: removing its trailing
		// space would turn it into an ordinary "--" line.
		if (paragraph.text == "-- ")
		{
			addLine(paragraph.quotePrefix, paragraph.text);
			continue;
		}

		string line = paragraph.text;

		// A trailing space would mark the last line as a soft break.
		while (line.length && line[$-1] == ' ')
			line = line[0..$-1];

		if (!line.length)
		{
			addLine(paragraph.quotePrefix, null);
			continue;
		}

		while (line.length)
		{
			// Find the last break opportunity (just after a space, or
			// the end of the text) which still fits within the margin.
			size_t lastIndex = 0;
			size_t lastLength = paragraph.quotePrefix.length;
			foreach (i, c; line)
				if (c == ' ' || i == line.length-1)
				{
					auto length = lastLength + line[lastIndex..i+1].byGrapheme.walkLength;
					if (length > margin)
						break;
					lastIndex = i+1;
					lastLength = length;
				}

			if (lastIndex == 0)
			{
				// Not even the first word fits. Emit just that word
				// (over-long), and keep wrapping what follows it.
				auto space = indexOf(line, ' ');
				lastIndex = space < 0 ? line.length : cast(size_t) space + 1;
				// Take the whole run of spaces, so that the next
				// line does not begin with a space.
				while (lastIndex < line.length && line[lastIndex] == ' ')
					lastIndex++;
			}

			addLine(paragraph.quotePrefix, line[0..lastIndex]);
			line = line[lastIndex..$];
		}
	}

	return lines.join("\n");
}

unittest
{
	// Space-stuffing
	assert(wrapText(unwrapText(" Hello", WrapFormat.fixed)) == " Hello");

	// Don't rewrap user input
	assert(wrapText(unwrapText("Line 1 \nLine 2 ", WrapFormat.input)) == "Line 1\nLine 2");
	// ...but rewrap quoted text
	assert(wrapText(unwrapText("> Line 1 \n> Line 2 ", WrapFormat.input)) == "> Line 1 Line 2");
	// Wrap long lines
	import std.array : replicate;
	assert(wrapText(unwrapText(replicate("abcde ", 20), WrapFormat.fixed)).split("\n").length > 1);

	// The signature separator keeps its trailing space
	assert(wrapText([Paragraph("", "-- ")]) == "-- ");
	// A word which does not fit is emitted alone; the rest is still wrapped
	assert(wrapText([Paragraph("", replicate("x", 80) ~ " tail")]) == replicate("x", 80) ~ " \ntail");

	// Wrap by character count, not UTF-8 code-unit count. TODO: take into account surrogates and composite characters.
	enum str = "Это очень очень очень очень очень очень очень длинная строка";
	import std.utf;
	static assert(str.toUTF32().length < DEFAULT_WRAP_LENGTH);
	static assert(str.length > DEFAULT_WRAP_LENGTH);
	assert(wrapText(unwrapText(str, WrapFormat.fixed)).split("\n").length == 1);
}