1 /**
2  * RFC 2646. May be upgraded to RFC 3676 for international text.
3  *
4  * License:
5  *   This Source Code Form is subject to the terms of
6  *   the Mozilla Public License, v. 2.0. If a copy of
7  *   the MPL was not distributed with this file, You
8  *   can obtain one at http://mozilla.org/MPL/2.0/.
9  *
10  * Authors:
11  *   Vladimir Panteleev <vladimir@thecybershadow.net>
12  */
13 
14 module ae.net.ietf.wrap;
15 
16 import std.algorithm;
17 import std.range;
18 import std.string;
19 import std.uni;
20 
21 import ae.utils.text;
22 
/// One paragraph of message text, kept separately from its quoting.
struct Paragraph
{
	/// Quote markers that preceded the text (as split off by unwrapText).
	string quotePrefix;

	/// The paragraph's text, without the quote prefix.
	string text;
}
27 
/// Selects how unwrapText interprets line breaks in its input.
enum WrapFormat
{
	/// One paragraph per line
	fixed,

	/// format=flowed
	flowed,

	/// format=flowed; delsp=yes
	flowedDelSp,

	/// Guess
	heuristics,

	/// As emitted by Rfc850Message.replyTemplate
	input,
}
36 
/**
 * Splits message text into paragraphs, undoing line wrapping as
 * dictated by wrapFormat.
 *
 * Leading quote markers (">" optionally followed by one space, repeated)
 * are split off every line and stored in Paragraph.quotePrefix.
 *
 * Params:
 *   text       = the message text; split into lines with splitAsciiLines
 *   wrapFormat = how line breaks are to be interpreted:
 *                fixed - every line becomes its own paragraph;
 *                flowed / flowedDelSp - a line ending in a space is joined
 *                with the following line when the quote prefix matches
 *                (flowedDelSp also drops that trailing space);
 *                input - like flowed, but only quoted lines are joined and
 *                space-stuffing is not removed;
 *                heuristics - guesses whether each run of lines was wrapped.
 *
 * Returns: the paragraphs, in input order.
 */
Paragraph[] unwrapText(string text, WrapFormat wrapFormat)
{
	auto lines = text.splitAsciiLines();

	Paragraph[] result;

	// Removes the leading quote markers from `line` (in place)
	// and returns the removed part.
	string takeQuotePrefix(ref string line)
	{
		auto whole = line;

		while (line.startsWith(">"))
		{
			// Swallowing a space after the marker is against the standard,
			// but many clients (incl. Web-News and MS Outlook) emit one.
			auto markerLength = line.startsWith("> ") ? 2 : 1;
			line = line[markerLength .. $];
		}

		return whole[0 .. line.ptr - whole.ptr];
	}

	final switch (wrapFormat)
	{
		case WrapFormat.fixed:
			foreach (line; lines)
			{
				string prefix = takeQuotePrefix(line);
				result ~= Paragraph(prefix, line);
			}
			break;

		case WrapFormat.flowed:
		case WrapFormat.flowedDelSp:
		case WrapFormat.input:
			foreach (line; lines)
			{
				string prefix = takeQuotePrefix(line);

				// Undo space-stuffing (never applied to user input)
				if (wrapFormat != WrapFormat.input && !prefix.length && line.startsWith(" "))
					line = line[1..$];

				// A line continues the previous paragraph when that paragraph
				// ended in a soft break (trailing space) at the same quote
				// level, and neither side is blank, indented or a signature
				// separator. Unquoted user input is never re-flowed.
				immutable continuesPrevious =
					result.length > 0
					&& result[$-1].quotePrefix == prefix
					&& result[$-1].text.endsWith(" ")
					&& !line.startsWith(" ")
					&& line.length
					&& line != "-- "
					&& result[$-1].text != "-- "
					&& (wrapFormat != WrapFormat.input || prefix.length);

				if (!continuesPrevious)
				{
					result ~= Paragraph(prefix, line);
					continue;
				}

				// With delsp=yes the soft-break space is not part of the text
				if (wrapFormat == WrapFormat.flowedDelSp)
					result[$-1].text = result[$-1].text[0..$-1];
				result[$-1].text ~= line;
			}
			break;

		case WrapFormat.heuristics:
		{
			// Text which is not format=flowed: guess, for every run of
			// consecutive non-blank lines, whether it was hard-wrapped.

			static bool looksWrapped(in string[] block)
			{
				assert(block.all!(line => line.length));

				// Checks are ordered from most to least confident.

				// Nothing to join with fewer than two lines
				if (block.length < 2)
					return false;

				// Leading whitespace or tabs suggest pre-formatted text
				// (code, likely) - leave as is.
				if (block.any!(line => isWhite(line[0]) || line.canFind('\t')))
					return false;

				// Implicit format=flowed: every line but the last
				// ends with a space.
				if (block[0..$-1].all!(line => line[$-1] == ' '))
					return true;

				// Could these lines be the output of a typical naive
				// line-wrapping algorithm? Narrow down the range of
				// margins consistent with every line break.
				size_t wrapMin = 1, wrapMax = 1000;
				foreach (i; 0 .. block.length - 1)
				{
					auto lineMin = block[i].stripRight.length;
					auto nextWord = block[i+1].findSplit(" ")[0];
					// Width this line would have had if the next word
					// had still fit on it
					auto lineMax = lineMin + 1 + nextWord.length;

					// Inconsistent with the range so far - pre-wrapped
					if (lineMin > wrapMax || lineMax < wrapMin)
						return false;

					wrapMin = max(wrapMin, lineMin);
					wrapMax = min(wrapMax, lineMax);
				}

				// The last line must fit the margin as well
				if (block[$-1].length > wrapMax)
					return false;

				// Sanity check on the deduced margin
				if (wrapMax < 60 || wrapMin > 120)
					return false;

				// Character frequency check:
				// D code tends to contain a lot of parens.
				size_t parens, total;
				foreach (line; block)
					foreach (char c; line)
					{
						if (c == '(' || c == ')')
							parens++;
						total++;
					}

				auto parenFreq = parens * 100 / total;
				return parenFreq < 2;
			}

			// Emits a run of lines sharing a quote prefix, either joined
			// into a single paragraph or as one paragraph per line.
			void flushBlock(string quotePrefix, in string[] block)
			{
				if (looksWrapped(block))
				{
					result ~= Paragraph(quotePrefix, block.map!stripRight.join(" "));
					return;
				}

				foreach (line; block)
					result ~= Paragraph(quotePrefix, line.stripRight());
			}

			sizediff_t blockStart = -1; // index of the first line of the pending run, or -1
			string previousPrefix;

			foreach (i, ref line; lines)
			{
				string prefix = takeQuotePrefix(line);

				bool isDelim = !line.length
					|| line.strip() == "--" // signature
					|| line.startsWith("---"); // Bugzilla

				// A delimiter or a change of quote level ends the pending run
				if (blockStart >= 0 && (isDelim || prefix != previousPrefix))
				{
					flushBlock(previousPrefix, lines[blockStart..i]);
					blockStart = -1;
				}

				if (isDelim)
					result ~= Paragraph(prefix, line);
				else if (blockStart < 0)
					blockStart = i;

				previousPrefix = prefix;
			}

			if (blockStart >= 0)
				flushBlock(previousPrefix, lines[blockStart..$]);
			break;
		}
	}

	return result;
}
204 
/// Margin used by wrapText when the caller does not specify one.
enum int DEFAULT_WRAP_LENGTH = 66;
206 
/**
 * Returns wrapped text in the WrapFormat.flowed format.
 *
 * Every paragraph is broken after spaces so that the quote prefix plus the
 * line's text (counted in graphemes, including the trailing soft-break
 * space) does not exceed margin. Trailing spaces of a paragraph are
 * removed, so its last line is never a soft break.
 *
 * Special cases:
 * $(UL
 *   $(LI A word which does not fit within the margin is emitted on a line
 *        of its own; the remainder of the paragraph is still wrapped.)
 *   $(LI A paragraph whose text is exactly "-- " (the signature separator)
 *        is emitted as-is, keeping its trailing space.)
 *   $(LI Lines starting with " " or "From ", and unquoted lines starting
 *        with ">", are space-stuffed.)
 * )
 *
 * Params:
 *   paragraphs = the paragraphs to wrap
 *   margin     = maximum line width; the extra space added by
 *                space-stuffing is not counted
 *
 * Returns: the resulting lines joined with "\n" (no trailing newline).
 */
string wrapText(Paragraph[] paragraphs, int margin = DEFAULT_WRAP_LENGTH)
{
	string[] lines;

	void addLine(string quotePrefix, string line)
	{
		line = quotePrefix ~ line;
		// Add space-stuffing
		if (line.startsWith(" ") ||
			line.startsWith("From ") ||
			(line.startsWith(">") && quotePrefix.length==0))
		{
			line = " " ~ line;
		}
		lines ~= line;
	}

	foreach (paragraph; paragraphs)
	{
		// The signature separator is neither fixed nor flowed and must be
		// sent verbatim; stripping its trailing space would turn it into
		// an ordinary "--" line.
		if (paragraph.text == "-- ")
		{
			addLine(paragraph.quotePrefix, paragraph.text);
			continue;
		}

		string line = paragraph.text;

		// A trailing space would mark the paragraph's last line as flowed
		while (line.length && line[$-1] == ' ')
			line = line[0..$-1];

		if (!line.length)
		{
			addLine(paragraph.quotePrefix, null);
			continue;
		}

		while (line.length)
		{
			// Find the longest prefix of `line`, ending after a space or at
			// the end of the text, which still fits within the margin.
			size_t lastIndex = 0;
			size_t lastLength = paragraph.quotePrefix.length;
			foreach (i, c; line)
				if (c == ' ' || i == line.length-1)
				{
					auto length = lastLength + line[lastIndex..i+1].byGrapheme.walkLength;
					if (length > margin)
						break;
					lastIndex = i+1;
					lastLength = length;
				}

			if (lastIndex == 0)
			{
				// Not even the first word fits. Emit just that word (with
				// its trailing space, if any) so that the rest of the
				// paragraph still gets wrapped. When there is no space
				// left, this takes the whole remainder.
				auto parts = line.findSplit(" ");
				lastIndex = parts[0].length + parts[1].length;
			}

			addLine(paragraph.quotePrefix, line[0..lastIndex]);
			line = line[lastIndex..$];
		}
	}

	return lines.join("\n");
}

unittest
{
	// The signature separator keeps its trailing space
	assert(wrapText([Paragraph("", "-- ")]) == "-- ");

	// An overlong word gets a line of its own; the rest is still wrapped
	assert(wrapText([Paragraph("", "aaaaaaaaaa bb cc dd")], 5) == "aaaaaaaaaa \nbb \ncc dd");
}
265 
unittest
{
	import std.array : replicate;
	import std.utf : toUTF32;

	// Unwraps `s` as `format`, then wraps the result again.
	static string rewrap(string s, WrapFormat format)
	{
		return wrapText(unwrapText(s, format));
	}

	// Space-stuffing
	assert(rewrap(" Hello", WrapFormat.fixed) == "  Hello");

	// Don't rewrap user input
	assert(rewrap("Line 1 \nLine 2 ", WrapFormat.input) == "Line 1\nLine 2");
	// ...but rewrap quoted text
	assert(rewrap("> Line 1 \n> Line 2 ", WrapFormat.input) == "> Line 1 Line 2");

	// Wrap long lines
	auto longLine = replicate("abcde ", 20);
	assert(rewrap(longLine, WrapFormat.fixed).split("\n").length > 1);

	// Wrap by character count, not UTF-8 code-unit count:
	// this string fits the default margin in characters, but not in bytes.
	enum str = "Это очень очень очень очень очень очень очень длинная строка";
	static assert(str.toUTF32().length < DEFAULT_WRAP_LENGTH);
	static assert(str.length > DEFAULT_WRAP_LENGTH);
	assert(rewrap(str, WrapFormat.fixed).split("\n").length == 1);
}