1 /**
2  * RFC 2646. May be upgraded to RFC 3676 for international text.
3  *
4  * License:
5  *   This Source Code Form is subject to the terms of
6  *   the Mozilla Public License, v. 2.0. If a copy of
7  *   the MPL was not distributed with this file, You
8  *   can obtain one at http://mozilla.org/MPL/2.0/.
9  *
10  * Authors:
11  *   Vladimir Panteleev <ae@cy.md>
12  */
13 
14 module ae.net.ietf.wrap;
15 
16 import std.algorithm;
17 import std.range;
18 import std.string;
19 import std.uni;
20 
21 import ae.utils.text;
22 
/// A plain-text paragraph: one logical unit of message text together
/// with the quote prefix shared by all of its lines.
///
/// Produced by `unwrapText` and consumed by `wrapText`.
struct Paragraph
{
	/// The leading part of the paragraph (identical for all lines in
	/// the encoded form). Generally some mix of `'>'` and space
	/// characters. Empty for unquoted text.
	string quotePrefix;

	/// The contents of the paragraph, without the quote prefix.
	string text;
}
34 
/// Specifies the format for how line breaks and paragraphs are
/// encoded in a message.
enum WrapFormat
{
	/// One paragraph per line; lines are never joined.
	fixed,

	/// format=flowed: a line ending in a space continues on the next
	/// line (given an equal quote prefix).
	flowed,

	/// format=flowed; delsp=yes: like `flowed`, but the trailing space
	/// marking a soft line break is deleted when lines are joined.
	flowedDelSp,

	/// Guess: paragraph boundaries are detected heuristically from the
	/// shape of the text.
	heuristics,

	/// As emitted by Rfc850Message.replyTemplate. Parsed like `flowed`,
	/// except that unquoted lines are never joined and no
	/// space-stuffing is removed.
	input,
}
45 
/// Parses a message body holding text in the
/// specified format, and returns parsed paragraphs.
///
/// Leading `'>'` quote markers (each optionally followed by one space)
/// are split off every line into `Paragraph.quotePrefix`; lines are
/// only ever joined into one paragraph when their quote prefixes are
/// equal.
///
/// Params:
///   text       = The message body.
///   wrapFormat = How line breaks in `text` should be interpreted.
///
/// Returns: The paragraphs of `text`, in order of appearance.
Paragraph[] unwrapText(string text, WrapFormat wrapFormat)
{
	auto lines = text.splitAsciiLines();

	Paragraph[] paragraphs;

	// Removes the quote markers from the start of `line` (in place)
	// and returns the removed prefix as a slice of the original line.
	string stripQuotePrefix(ref string line)
	{
		auto oline = line;

		while (line.startsWith(">"))
		{
			int l = 1;
			// This is against standard, but many clients
			// (incl. Web-News and M$ Outlook) don't give a damn:
			if (line.startsWith("> "))
				l = 2;

			line = line[l..$];
		}

		// `line` is now a tail slice of `oline`, so the pointer
		// difference is the length of the consumed prefix.
		return oline[0..line.ptr - oline.ptr];
	}

	final switch (wrapFormat)
	{
		case WrapFormat.fixed:
			// Every line is its own paragraph.
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);
				paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.flowed:
		case WrapFormat.flowedDelSp:
		case WrapFormat.input:
			foreach (line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				// Remove space-stuffing
				if (wrapFormat != WrapFormat.input && !quotePrefix.length && line.startsWith(" "))
					line = line[1..$];

				// A line continues the previous paragraph when that
				// paragraph ended with a soft break (trailing space),
				// the quote depth is unchanged, and neither side is
				// the "-- " signature separator. In the `input`
				// format, only quoted text is ever joined.
				if (paragraphs.length>0
				 && paragraphs[$-1].quotePrefix==quotePrefix
				 && paragraphs[$-1].text.endsWith(" ")
				 && line.length
				 && line != "-- "
				 && paragraphs[$-1].text != "-- "
				 && (wrapFormat != WrapFormat.input || quotePrefix.length))
				{
					// With delsp=yes, the soft-break space is not
					// part of the text.
					if (wrapFormat == WrapFormat.flowedDelSp)
						paragraphs[$-1].text = paragraphs[$-1].text[0..$-1];
					paragraphs[$-1].text ~= line;
				}
				else
					paragraphs ~= Paragraph(quotePrefix, line);
			}
			break;
		case WrapFormat.heuristics:
		{
			// Use heuristics for non-format=flowed text.

			// Decides whether a run of non-empty lines (quote prefix
			// already removed) looks like one paragraph that was
			// hard-wrapped, as opposed to pre-formatted lines.
			static bool isWrapped(in string[] lines)
			{
				// Number of code points in `s`. Widths must be counted
				// in characters, not UTF-8 code units, or non-ASCII
				// text wrapped at a normal width would look far too
				// wide. Counting non-continuation bytes never throws,
				// even on malformed input.
				static size_t charCount(const(char)[] s) pure nothrow @nogc @safe
				{
					size_t n;
					foreach (char c; s)
						if ((c & 0xC0) != 0x80)
							n++;
					return n;
				}

				assert(lines.all!(line => line.length));

				// Heuristic checks (from most to least confidence):

				// Zero or one line - as-is
				if (lines.length < 2)
					return false;

				// If any line starts with whitespace or contains a tab,
				// consider pre-wrapped (code, likely).
				if (lines.any!(line => isWhite(line[0]) || line.canFind('\t')))
					return false;

				// Detect implicit format=flowed (trailing space)
				if (lines[0..$-1].all!(line => line[$-1] == ' '))
					return true;

				// Check if the set of lines can feasibly be the output
				// of a typical naive line-wrapping algorithm
				// (and calculate the possible range of line widths).
				size_t wrapMin = 1, wrapMax = 1000;
				foreach (i, line; lines[0..$-1])
				{
					// The wrap width is at least this line's width,
					// and less than this line plus the next word
					// (otherwise that word would have fit here).
					auto lineMin = charCount(line.stripRight);
					auto nextWord = lines[i+1].findSplit(" ")[0];
					auto lineMax = lineMin + 1 + charCount(nextWord);
					// Are we outside of our current range?
					if (lineMin > wrapMax || lineMax < wrapMin)
						return false; // pre-wrapped
					// Now, narrow down the range accordingly
					wrapMin = max(wrapMin, lineMin);
					wrapMax = min(wrapMax, lineMax);
				}
				// Finally, test last line (ignoring trailing
				// whitespace, same as for the other lines)
				if (charCount(lines[$-1].stripRight) > wrapMax)
					return false;
				// Sanity checks.
				if (wrapMax < 60 || wrapMin > 120)
					return false;

				// Character frequency check.

				size_t[256] count;
				size_t total;
				foreach (line; lines)
					foreach (c; line)
						count[c]++, total++;

				// D code tends to contain a lot of parens.
				auto parenFreq = (count['('] + count[')']) * 100 / total;

				return parenFreq < 2;
			}

			// Emits a run of lines either as one joined paragraph or
			// as one paragraph per line, depending on `isWrapped`.
			void handleParagraph(string quotePrefix, in string[] lines)
			{
				if (isWrapped(lines))
					paragraphs ~= Paragraph(quotePrefix, lines.map!stripRight.join(" "));
				else
					paragraphs ~= lines.map!(line => Paragraph(quotePrefix, line.stripRight())).array;
			}

			// Index of the first line of the run being collected,
			// or -1 when no run is open.
			sizediff_t start = -1;
			string lastQuotePrefix;

			// `ref`, so that the quote prefix stays removed from the
			// lines later handed to `handleParagraph`.
			foreach (i, ref line; lines)
			{
				string quotePrefix = stripQuotePrefix(line);

				bool isDelim = !line.length
					|| line.strip() == "--" // signature
					|| line.startsWith("---") // Bugzilla
				;

				// A delimiter or a change in quote depth closes the
				// current run.
				if (isDelim || quotePrefix != lastQuotePrefix)
				{
					if (start >= 0)
					{
						handleParagraph(lastQuotePrefix, lines[start..i]);
						start = -1;
					}
				}

				if (isDelim)
					paragraphs ~= Paragraph(quotePrefix, line);
				else
				if (start < 0)
					start = i;

				lastQuotePrefix = quotePrefix;
			}

			// Flush the run still open at the end of the text.
			if (start >= 0)
				handleParagraph(lastQuotePrefix, lines[start..$]);
		}
	}

	return paragraphs;
}
214 
/// The default value of `wrapText`'s `margin` parameter
/// (the maximum width of an emitted line).
enum DEFAULT_WRAP_LENGTH = 66;
217 
/// Returns wrapped text in the `WrapFormat.flowed` format.
///
/// Each paragraph is split at spaces into lines no wider than
/// `margin`; every line except a paragraph's last keeps its trailing
/// space, which marks a soft line break. A word that does not fit
/// within `margin` is emitted on a line of its own. The `"-- "`
/// signature separator is emitted unchanged.
///
/// Params:
///   paragraphs = The paragraphs to encode.
///   margin     = Maximum line width: the length of the quote prefix
///                plus the number of graphemes of text, including the
///                trailing soft-break space.
///
/// Returns: The encoded lines, joined with `'\n'`.
string wrapText(Paragraph[] paragraphs, int margin = DEFAULT_WRAP_LENGTH)
{
	string[] lines;

	// Appends one output line, prepending the quote prefix and
	// space-stuffing it where needed.
	void addLine(string quotePrefix, string line)
	{
		line = quotePrefix ~ line;
		// Add space-stuffing
		if (line.startsWith(" ") ||
			line.startsWith("From ") ||
			(line.startsWith(">") && quotePrefix.length==0))
		{
			line = " " ~ line;
		}
		lines ~= line;
	}

	foreach (paragraph; paragraphs)
	{
		string line = paragraph.text;

		// The signature separator is sent as-is: without its trailing
		// space, it would no longer be recognized as a separator.
		if (line == "-- ")
		{
			addLine(paragraph.quotePrefix, line);
			continue;
		}

		// A trailing space on the last line would wrongly mark the
		// paragraph as continuing.
		while (line.length && line[$-1] == ' ')
			line = line[0..$-1];

		if (!line.length)
		{
			addLine(paragraph.quotePrefix, null);
			continue;
		}

		while (line.length)
		{
			// Find the last break opportunity (just after a space, or
			// the end of the text) which still fits in the margin.
			size_t lastIndex = 0;
			size_t lastLength = paragraph.quotePrefix.length;
			foreach (i, c; line)
				if (c == ' ' || i == line.length-1)
				{
					auto length = lastLength + line[lastIndex..i+1].byGrapheme.walkLength;
					if (length > margin)
						break;
					lastIndex = i+1;
					lastLength = length;
				}

			if (lastIndex == 0)
			{
				// Not even the first word fits. Emit it on a line of
				// its own (together with its soft-break space), and
				// continue wrapping the rest normally.
				auto firstSpace = line.indexOf(' ');
				lastIndex = firstSpace < 0 ? line.length : firstSpace + 1;
			}

			addLine(paragraph.quotePrefix, line[0..lastIndex]);
			line = line[lastIndex..$];
		}
	}

	return lines.join("\n");
}
276 
unittest
{
	import std.array : replicate;
	import std.utf : toUTF32;

	// Decodes `source` as `fmt`, then re-encodes the result as flowed text.
	string roundTrip(string source, WrapFormat fmt)
	{
		auto parsed = unwrapText(source, fmt);
		return wrapText(parsed);
	}

	// A leading space must be protected by space-stuffing.
	assert(roundTrip(" Hello", WrapFormat.fixed) == "  Hello");

	// Unquoted user input keeps its line breaks...
	assert(roundTrip("Line 1 \nLine 2 ", WrapFormat.input) == "Line 1\nLine 2");
	// ...whereas quoted text is reflowed.
	assert(roundTrip("> Line 1 \n> Line 2 ", WrapFormat.input) == "> Line 1 Line 2");

	// Text longer than the margin is split over several lines.
	auto longLine = replicate("abcde ", 20);
	assert(roundTrip(longLine, WrapFormat.fixed).split("\n").length > 1);

	// Width is measured in characters, not in UTF-8 code units: this
	// string is longer than the margin in bytes, but shorter in code
	// points, and so must remain on a single line.
	enum cyrillic = "Это очень очень очень очень очень очень очень длинная строка";
	static assert(cyrillic.toUTF32().length < DEFAULT_WRAP_LENGTH);
	static assert(cyrillic.length > DEFAULT_WRAP_LENGTH);
	assert(roundTrip(cyrillic, WrapFormat.fixed).split("\n").length == 1);

	// Runs of spaces spanning a soft line break are unwrapped intact
	// (the second line's first space is space-stuffing).
	auto spaced = unwrapText("|  \n  |", WrapFormat.flowed);
	assert(spaced == [Paragraph("", "|   |")]);
}