1 /**
2  * RFC 2646. May be upgraded to RFC 3676 for international text.
3  *
4  * License:
5  *   This Source Code Form is subject to the terms of
6  *   the Mozilla Public License, v. 2.0. If a copy of
7  *   the MPL was not distributed with this file, You
8  *   can obtain one at http://mozilla.org/MPL/2.0/.
9  *
10  * Authors:
11  *   Vladimir Panteleev <ae@cy.md>
12  */
13 
14 module ae.net.ietf.wrap;
15 
16 import std.algorithm;
17 import std.range;
18 import std.string;
19 import std.uni;
20 
21 import ae.utils.text;
22 
/// A plain-text paragraph: the unit produced by `unwrapText` and
/// consumed by `wrapText`.
struct Paragraph
{
	/// The leading part of the paragraph (identical for all lines in
	/// the encoded form). Generally some mix of `'>'` and space
	/// characters. Empty for unquoted text.
	string quotePrefix;

	/// The contents of the paragraph, without the quote prefix.
	string text;
}
34 
/// Specifies the format for how line breaks and paragraphs are
/// encoded in a message. Selects the decoding strategy used by
/// `unwrapText`.
enum WrapFormat
{
	fixed,       /// One paragraph per line; lines are never joined
	flowed,      /// format=flowed: a trailing space marks a soft line break
	flowedDelSp, /// format=flowed; delsp=yes: as `flowed`, but the trailing space is deleted when joining
	heuristics,  /// Guess whether consecutive lines were hard-wrapped
	input,       /// As emitted by Rfc850Message.replyTemplate: only quoted lines are joined, no space-stuffing is removed
}
45 
/// Decodes a message body into a list of paragraphs.
///
/// Params:
///   text       = the message body
///   wrapFormat = how line breaks and paragraphs are encoded in `text`
///
/// Returns: the decoded paragraphs in order of appearance, each with
/// its quote prefix split off from its text.
Paragraph[] unwrapText(string text, WrapFormat wrapFormat)
{
	auto lines = text.splitAsciiLines();

	Paragraph[] result;

	// Consumes the run of quote markers at the start of `line`
	// (advancing `line` past it) and returns the consumed part.
	string takeQuotePrefix(ref string line)
	{
		string whole = line;

		while (line.startsWith(">"))
		{
			// Strictly speaking only the '>' belongs to the marker,
			// but many clients (incl. Web-News and Outlook) emit "> ",
			// so one following space is consumed along with it.
			line = line[(line.startsWith("> ") ? 2 : 1) .. $];
		}

		// `line` is now a suffix of `whole`; what precedes it is the prefix.
		return whole[0 .. $ - line.length];
	}

	// WrapFormat.fixed: every line is a paragraph of its own.
	void decodeFixed()
	{
		foreach (line; lines)
		{
			auto prefix = takeQuotePrefix(line);
			result ~= Paragraph(prefix, line);
		}
	}

	// WrapFormat.flowed / flowedDelSp / input: a line continues the
	// previous paragraph when that paragraph ends with a space.
	void decodeFlowed()
	{
		const isInput = wrapFormat == WrapFormat.input;
		const delSp   = wrapFormat == WrapFormat.flowedDelSp;

		foreach (line; lines)
		{
			auto prefix = takeQuotePrefix(line);

			// Undo space-stuffing (unquoted lines only; never for user input).
			if (!isInput && prefix.length == 0 && line.startsWith(" "))
				line = line[1 .. $];

			const continuesPrevious =
				result.length > 0
				&& result[$-1].quotePrefix == prefix
				&& result[$-1].text.endsWith(" ")
				&& !line.startsWith(" ")
				&& line.length != 0
				&& line != "-- "                     // signature separator starts a new paragraph
				&& result[$-1].text != "-- "         // ...and is never continued
				&& (!isInput || prefix.length != 0); // user input: only quoted text is re-flowed

			if (!continuesPrevious)
			{
				result ~= Paragraph(prefix, line);
				continue;
			}

			// With DelSp, the trailing space is only a soft-break marker.
			if (delSp)
				result[$-1].text = result[$-1].text[0 .. $-1];
			result[$-1].text ~= line;
		}
	}

	// WrapFormat.heuristics: guess, for text that is not format=flowed,
	// whether runs of consecutive lines were hard-wrapped.
	void decodeHeuristic()
	{
		// Decides whether a run of non-empty lines looks like one
		// paragraph that was hard-wrapped by a naive wrapping algorithm.
		// Checks are ordered from most to least confident.
		static bool looksWrapped(in string[] group)
		{
			assert(group.all!(l => l.length));

			// Zero or one line: nothing to join.
			if (group.length < 2)
				return false;

			// Leading whitespace or a tab anywhere suggests
			// pre-formatted content (most likely code).
			foreach (l; group)
				if (isWhite(l[0]) || l.canFind('\t'))
					return false;

			// Implicit format=flowed: every line but the last ends with a space.
			if (group[0 .. $-1].all!(l => l[$-1] == ' '))
				return true;

			// See whether there is a single wrap width that explains
			// every line break, narrowing the feasible [lo, hi] range
			// of widths line by line.
			size_t lo = 1, hi = 1000;
			foreach (i; 0 .. group.length - 1)
			{
				const shortest  = group[i].stripRight.length;
				const following = group[i + 1].findSplit(" ")[0];
				// Had the next word still fit, it would not have been wrapped.
				const longest   = shortest + 1 + following.length;

				if (shortest > hi || longest < lo)
					return false; // no width fits: pre-wrapped

				lo = max(lo, shortest);
				hi = min(hi, longest);
			}

			// The last line must fit within the deduced width as well.
			if (group[$-1].length > hi)
				return false;

			// Sanity check: reject implausible wrap widths.
			if (hi < 60 || lo > 120)
				return false;

			// Character frequency check: D code tends to contain a lot
			// of parentheses.
			size_t[256] histogram;
			size_t total;
			foreach (l; group)
				foreach (char c; l)
				{
					histogram[c]++;
					total++;
				}

			const parenPercent = (histogram['('] + histogram[')']) * 100 / total;
			return parenPercent < 2;
		}

		// Emits a run of lines, either joined into one paragraph
		// or as one paragraph per line.
		void flush(string prefix, in string[] group)
		{
			if (looksWrapped(group))
			{
				result ~= Paragraph(prefix, group.map!stripRight.join(" "));
				return;
			}

			foreach (l; group)
				result ~= Paragraph(prefix, l.stripRight());
		}

		sizediff_t groupStart = -1; // index of the first line of the pending run, or -1
		string previousPrefix;

		foreach (i, ref line; lines)
		{
			auto prefix = takeQuotePrefix(line);

			const isDelimiter = line.length == 0
				|| line.strip() == "--"      // signature
				|| line.startsWith("---");   // Bugzilla

			// A delimiter or a change in quoting ends the pending run.
			if (groupStart >= 0 && (isDelimiter || prefix != previousPrefix))
			{
				flush(previousPrefix, lines[groupStart .. i]);
				groupStart = -1;
			}

			if (isDelimiter)
				result ~= Paragraph(prefix, line);
			else if (groupStart < 0)
				groupStart = i;

			previousPrefix = prefix;
		}

		if (groupStart >= 0)
			flush(previousPrefix, lines[groupStart .. $]);
	}

	final switch (wrapFormat)
	{
		case WrapFormat.fixed:
			decodeFixed();
			break;
		case WrapFormat.flowed:
		case WrapFormat.flowedDelSp:
		case WrapFormat.input:
			decodeFlowed();
			break;
		case WrapFormat.heuristics:
			decodeHeuristic();
			break;
	}

	return result;
}
215 
/// The default value of `wrapText`'s `margin` parameter, i.e. the
/// maximum length of an output line. `wrapText` measures lines in
/// graphemes and counts the quote prefix towards the length.
enum DEFAULT_WRAP_LENGTH = 66;
218 
/// Returns wrapped text in the `WrapFormat.flowed` format.
///
/// Each paragraph is broken at spaces into lines of at most `margin`
/// graphemes (the quote prefix and the trailing soft-break space are
/// counted). Every output line starts with the paragraph's quote
/// prefix, and lines are space-stuffed where necessary. A word which
/// does not fit within the margin by itself is placed on a line of
/// its own. A signature separator paragraph (`"-- "`) is emitted
/// unchanged, keeping its trailing space.
///
/// Params:
///   paragraphs = the paragraphs to encode
///   margin     = the maximum line length, in graphemes
///
/// Returns: the encoded lines, joined with `"\n"`.
string wrapText(Paragraph[] paragraphs, int margin = DEFAULT_WRAP_LENGTH)
{
	string[] lines;

	// Appends one output line, adding the quote prefix and space-stuffing.
	void addLine(string quotePrefix, string line)
	{
		line = quotePrefix ~ line;
		// Add space-stuffing: protect lines which would otherwise be
		// misread as stuffed, as an mbox "From " line, or as quoted.
		if (line.startsWith(" ") ||
			line.startsWith("From ") ||
			(line.startsWith(">") && quotePrefix.length==0))
		{
			line = " " ~ line;
		}
		lines ~= line;
	}

	foreach (paragraph; paragraphs)
	{
		// The signature separator is sent as-is: its trailing space
		// is part of the separator, not a soft line break.
		if (paragraph.text == "-- ")
		{
			addLine(paragraph.quotePrefix, paragraph.text);
			continue;
		}

		string line = paragraph.text;

		// A trailing space on the last line of a paragraph would be
		// read back as a soft break, so drop any.
		while (line.length && line[$-1] == ' ')
			line = line[0..$-1];

		if (!line.length)
		{
			addLine(paragraph.quotePrefix, null);
			continue;
		}

		while (line.length)
		{
			// Find the longest run of whole words which fits the margin.
			// lastIndex is the end of that run (just past its final space),
			// lastLength its length in graphemes including the quote prefix.
			size_t lastIndex = 0;
			size_t lastLength = paragraph.quotePrefix.length;
			foreach (i, c; line)
				if (c == ' ' || i == line.length-1)
				{
					auto length = lastLength + line[lastIndex..i+1].byGrapheme.walkLength;
					if (length > margin)
						break;
					lastIndex = i+1;
					lastLength = length;
				}

			if (lastIndex == 0)
			{
				// Not even the first word fits. Emit that word alone
				// (with its soft-break space), so that the rest of the
				// paragraph is still wrapped normally.
				auto firstSpace = line.indexOf(' ');
				lastIndex = firstSpace < 0
					? line.length
					: cast(size_t) firstSpace + 1;
			}

			addLine(paragraph.quotePrefix, line[0..lastIndex]);
			line = line[lastIndex..$];
		}
	}

	return lines.join("\n");
}
277 
// Round-trip tests: decode with unwrapText, re-encode with wrapText.
unittest
{
	// Space-stuffing: a line starting with a space gains one more on output
	assert(wrapText(unwrapText(" Hello", WrapFormat.fixed)) == "  Hello");

	// Don't rewrap user input: unquoted lines stay separate even with trailing spaces
	assert(wrapText(unwrapText("Line 1 \nLine 2 ", WrapFormat.input)) == "Line 1\nLine 2");
	// ...but rewrap quoted text
	assert(wrapText(unwrapText("> Line 1 \n> Line 2 ", WrapFormat.input)) == "> Line 1 Line 2");
	// Wrap long lines: 120 characters cannot fit on a single line at the default margin
	import std.array : replicate;
	assert(wrapText(unwrapText(replicate("abcde ", 20), WrapFormat.fixed)).split("\n").length > 1);

	// Wrap by character count, not UTF-8 code-unit count: the string below is
	// shorter than the margin in code points but longer in UTF-8 code units,
	// so it must stay on one line.
	// TODO: take into account surrogates and composite characters.
	// NOTE(review): wrapText measures segments with byGrapheme, so this TODO
	// may be outdated - confirm.
	enum str = "Это очень очень очень очень очень очень очень длинная строка";
	import std.utf;
	static assert(str.toUTF32().length < DEFAULT_WRAP_LENGTH);
	static assert(str.length > DEFAULT_WRAP_LENGTH);
	assert(wrapText(unwrapText(str, WrapFormat.fixed)).split("\n").length == 1);
}