1 /**
2  * RFC 2646. May be upgraded to RFC 3676 for international text.
3  *
4  * License:
5  *   This Source Code Form is subject to the terms of
6  *   the Mozilla Public License, v. 2.0. If a copy of
7  *   the MPL was not distributed with this file, You
8  *   can obtain one at http://mozilla.org/MPL/2.0/.
9  *
10  * Authors:
11  *   Vladimir Panteleev <ae@cy.md>
12  */
13 
14 module ae.net.ietf.wrap;
15 
16 import std.algorithm;
17 import std.range;
18 import std.string;
19 import std.uni;
20 
21 import ae.utils.text;
22 
/// A plain-text paragraph: one logical unit of message text, together
/// with the quote prefix shared by all of its lines.
struct Paragraph
{
	/// The leading part of the paragraph (identical for all lines in
	/// the encoded form). Generally some mix of `'>'` and space
	/// characters. Empty for text that is not quoted.
	string quotePrefix;

	/// The contents of the paragraph, without the quote prefix.
	string text;
}
34 
/// Specifies the format for how line breaks and paragraphs are
/// encoded in a message.
enum WrapFormat
{
	/// One paragraph per line; lines are never joined.
	fixed,
	/// format=flowed: a line ending in a space continues on the
	/// next line. A leading stuffing space on unquoted lines is removed.
	flowed,
	/// format=flowed; delsp=yes: like `flowed`, but the trailing
	/// space is deleted when two lines are joined.
	flowedDelSp,
	/// Guess whether each run of lines was wrapped, and join it if so.
	heuristics,
	/// As emitted by Rfc850Message.replyTemplate: like `flowed`, but
	/// only quoted lines are joined and no stuffing space is removed.
	input,
	/// Consecutive lines are joined unless the previous line ends in
	/// 2 or more spaces (a hard linebreak).
	markdown,
}
46 
/// Parses a message body holding text in the
/// specified format, and returns parsed paragraphs.
///
/// Params:
///   text       = The message body. It is split into lines with
///                `splitAsciiLines`.
///   wrapFormat = How line breaks and paragraphs are encoded in `text`.
///
/// Returns:
///   The paragraphs in input order. Each one carries the quote prefix
///   stripped from its line(s) and the remaining text. Empty lines
///   and signature separators are returned as paragraphs of their own.
Paragraph[] unwrapText(string text, WrapFormat wrapFormat)
{
	auto lines = text.splitAsciiLines();

	Paragraph[] paragraphs;

	// Removes the leading quote markers from `line` (in place) and
	// returns the removed part as a slice of the original line.
	string stripQuotePrefix(ref string line)
	{
		auto oline = line;

		while (line.startsWith(">"))
		{
			int l = 1;
			// This is against standard, but many clients
			// (incl. Web-News and M$ Outlook) don't give a damn:
			if (line.startsWith("> "))
				l = 2;

			line = line[l..$];
		}

		// Both strings are slices of the same line, so the pointer
		// difference is the length of the consumed prefix.
		return oline[0..line.ptr - oline.ptr];
	}

	final switch (wrapFormat)
	{
		case WrapFormat.fixed:
			// Every line is a paragraph of its own.
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);
				paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.flowed:
		case WrapFormat.flowedDelSp:
		case WrapFormat.input:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				// Remove space-stuffing
				if (wrapFormat != WrapFormat.input && !quotePrefix.length && line.startsWith(" "))
					line = line[1..$];

				// A line continues the previous paragraph when that
				// paragraph ends in a soft line break (trailing space)
				// at the same quote level. Empty lines and the
				// signature separator never take part in joining, and
				// in `input` mode only quoted text is joined.
				if (paragraphs.length>0
				 && paragraphs[$-1].quotePrefix==quotePrefix
				 && paragraphs[$-1].text.endsWith(" ")
				 && line.length
				 && line != "-- "
				 && paragraphs[$-1].text != "-- "
				 && (wrapFormat != WrapFormat.input || quotePrefix.length))
				{
					// With delsp=yes, the space marking the soft
					// line break is not part of the text.
					if (wrapFormat == WrapFormat.flowedDelSp)
						paragraphs[$-1].text = paragraphs[$-1].text[0..$-1];
					paragraphs[$-1].text ~= line;
				}
				else
					paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.markdown:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				// Consecutive lines at the same quote level belong to
				// one paragraph unless the previous line ends in a hard
				// line break (two or more spaces). An empty previous
				// paragraph is a blank separator line and must not
				// absorb the line that follows it.
				if (paragraphs.length>0
				 && paragraphs[$-1].quotePrefix==quotePrefix
				 && paragraphs[$-1].text.length
				 && !paragraphs[$-1].text.endsWith("  ")
				 && line.length
				 && line != "-- "
				 && paragraphs[$-1].text != "-- ")
				{
					// Make sure the joined lines are separated by a space.
					if (!paragraphs[$-1].text.endsWith(" "))
						paragraphs[$-1].text ~= " ";
					paragraphs[$-1].text ~= line;
				}
				else
					paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.heuristics:
		{
			// Use heuristics for non-format=flowed text.

			// Guesses whether `lines` (non-empty lines with quote
			// prefixes already removed) are one wrapped paragraph.
			static bool isWrapped(in string[] lines)
			{
				assert(lines.all!(line => line.length));

				// Heuristic checks (from most to least confidence):

				// Zero or one line - as-is
				if (lines.length < 2)
					return false;

				// If any line starts with whitespace or contains a tab,
				// consider pre-wrapped (code, likely).
				if (lines.any!(line => isWhite(line[0]) || line.canFind('\t')))
					return false;

				// Detect implicit format=flowed (trailing space)
				if (lines[0..$-1].all!(line => line[$-1] == ' '))
					return true;

				// Check if the set of lines can feasibly be the output
				// of a typical naive line-wrapping algorithm
				// (and calculate the possible range of line widths).
				size_t wrapMin = 1, wrapMax = 1000;
				foreach (i, line; lines[0..$-1])
				{
					// The margin must be at least as wide as this line,
					// but too narrow to also fit the next line's first word.
					auto lineMin = line.stripRight.length;
					auto nextWord = lines[i+1].findSplit(" ")[0];
					auto lineMax = lineMin + 1 + nextWord.length;
					// Are we outside of our current range?
					if (lineMin > wrapMax || lineMax < wrapMin)
						return false; // pre-wrapped
					// Now, narrow down the range accordingly
					wrapMin = max(wrapMin, lineMin);
					wrapMax = min(wrapMax, lineMax);
				}
				// Finally, test last line
				if (lines[$-1].length > wrapMax)
					return false;
				// Sanity checks.
				if (wrapMax < 60 || wrapMin > 120)
					return false;

				// Character frequency check.

				// Counts are per UTF-8 code unit.
				size_t[256] count;
				size_t total;
				foreach (line; lines)
					foreach (c; line)
						count[c]++, total++;

				// D code tends to contain a lot of parens.
				auto parenFreq = (count['('] + count[')']) * 100 / total;

				return parenFreq < 2;
			}

			// Emits a run of lines either as one joined paragraph or
			// as one paragraph per line, depending on `isWrapped`.
			void handleParagraph(string quotePrefix, in string[] lines)
			{
				if (isWrapped(lines))
					paragraphs ~= Paragraph(quotePrefix, lines.map!stripRight.join(" "));
				else
					paragraphs ~= lines.map!(line => Paragraph(quotePrefix, line.stripRight())).array;
			}

			// Index of the first line of the run being collected,
			// or -1 when no run is open.
			sizediff_t start = -1;
			string lastQuotePrefix;

			// `line` is taken by reference so that the prefix-stripped
			// text is what `handleParagraph` later sees in `lines`.
			foreach (i, ref line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				bool isDelim = !line.length
					|| line.strip() == "--" // signature
					|| line.startsWith("---") // Bugzilla
				;

				// A delimiter or a change of quote level ends the run.
				if (isDelim || quotePrefix != lastQuotePrefix)
				{
					if (start >= 0)
					{
						handleParagraph(lastQuotePrefix, lines[start..i]);
						start = -1;
					}
				}

				if (isDelim)
					paragraphs ~= Paragraph(quotePrefix, line);
				else
				if (start < 0)
					start = i;

				lastQuotePrefix = quotePrefix;
			}

			// Flush the run still open at the end of the text.
			if (start >= 0)
				handleParagraph(lastQuotePrefix, lines[start..$]);
		}
	}

	return paragraphs;
}
235 
/// The default value of `wrapText`'s `margin` parameter
/// (the width past which an output line is wrapped).
enum DEFAULT_WRAP_LENGTH = 66;
238 
/// Returns wrapped text in the `WrapFormat.flowed` format.
///
/// Params:
///   paragraphs = The paragraphs to encode.
///   margin     = The width past which a line is wrapped. A line's
///                width is the byte length of the quote prefix plus
///                the number of graphemes in its text, including the
///                trailing soft-break space.
///
/// Returns:
///   The encoded lines joined with `"\n"`. Lines that continue on the
///   next line end in a space. Trailing spaces of a paragraph are
///   dropped, except for the signature separator `"-- "`, which is
///   emitted unchanged. If not even the first word fits within
///   `margin`, the rest of the paragraph is emitted as a single line.
string wrapText(Paragraph[] paragraphs, int margin = DEFAULT_WRAP_LENGTH)
{
	string[] lines;

	// Appends one output line, prepending the quote prefix and
	// space-stuffing it where needed.
	void addLine(string quotePrefix, string line)
	{
		line = quotePrefix ~ line;
		// Add space-stuffing
		if (line.startsWith(" ") ||
			line.startsWith("From ") ||
			(line.startsWith(">") && quotePrefix.length==0))
		{
			line = " " ~ line;
		}
		lines ~= line;
	}

	foreach (paragraph; paragraphs)
	{
		// The signature separator is sent as-is: stripping its
		// trailing space would turn it into ordinary text.
		if (paragraph.text == "-- ")
		{
			addLine(paragraph.quotePrefix, paragraph.text);
			continue;
		}

		string line = paragraph.text;

		// A trailing space would mark the paragraph's last line
		// as continuing on the next one.
		while (line.length && line[$-1] == ' ')
			line = line[0..$-1];

		if (!line.length)
		{
			addLine(paragraph.quotePrefix, null);
			continue;
		}

		while (line.length)
		{
			// `lastIndex` is the end of the longest word-aligned
			// head of `line` that fits; `lastLength` is its width.
			size_t lastIndex = 0;
			size_t lastLength = paragraph.quotePrefix.length;
			foreach (i, c; line)
				if (c == ' ' || i == line.length-1)
				{
					// Only the part added since the last break
					// candidate needs to be measured.
					auto length = lastLength + line[lastIndex..i+1].byGrapheme.walkLength;
					if (length > margin)
						break;
					lastIndex = i+1;
					lastLength = length;
				}

			if (lastIndex == 0)
			{
				// Couldn't wrap. Wrap whole line
				lastIndex = line.length;
			}

			addLine(paragraph.quotePrefix, line[0..lastIndex]);
			line = line[lastIndex..$];
		}
	}

	return lines.join("\n");
}
297 
unittest
{
	// Decodes `text` as `format`, then re-encodes it with the default margin.
	static string reflow(string text, WrapFormat format)
	{
		auto decoded = unwrapText(text, format);
		return wrapText(decoded);
	}

	// A line starting with a space comes back space-stuffed.
	assert(reflow(" Hello", WrapFormat.fixed) == "  Hello");

	// Unquoted user input keeps its own line breaks...
	assert(reflow("Line 1 \nLine 2 ", WrapFormat.input) == "Line 1\nLine 2");
	// ...while quoted text is joined and wrapped again.
	assert(reflow("> Line 1 \n> Line 2 ", WrapFormat.input) == "> Line 1 Line 2");

	// A line longer than the margin is split into several lines.
	import std.array : replicate;
	auto longLine = replicate("abcde ", 20);
	auto wrappedLong = reflow(longLine, WrapFormat.fixed);
	assert(wrappedLong.split("\n").length > 1);

	// Width is counted in characters, not in UTF-8 code units: this
	// string is longer than the margin in bytes only.
	enum str = "Это очень очень очень очень очень очень очень длинная строка";
	import std.utf : toUTF32;
	static assert(str.toUTF32().length < DEFAULT_WRAP_LENGTH);
	static assert(str.length > DEFAULT_WRAP_LENGTH);
	auto wrappedCyrillic = reflow(str, WrapFormat.fixed);
	assert(wrappedCyrillic.split("\n").length == 1);

	// Runs of spaces split across a soft line break are restored.
	auto expected = [Paragraph("", "|   |")];
	assert(unwrapText("|  \n  |", WrapFormat.flowed) == expected);
}