1 /**
2  * std.regex helpers
3  *
4  * License:
5  *   This Source Code Form is subject to the terms of
6  *   the Mozilla Public License, v. 2.0. If a copy of
7  *   the MPL was not distributed with this file, You
8  *   can obtain one at http://mozilla.org/MPL/2.0/.
9  *
10  * Authors:
11  *   Vladimir Panteleev <ae@cy.md>
12  */
13 
14 module ae.utils.regex;
15 
16 import std.algorithm;
17 import std.conv;
18 import std.exception;
19 import std.regex;
20 import std.string;
21 import std.traits;
22 import std.typecons;
23 
24 import ae.utils.text;
25 
26 // ************************************************************************
27 
/// Compile-time-keyed cache for compiled regular expressions, so that a
/// pattern written inline in an expression is not compiled on every
/// evaluation.
/// Params:
///   pattern = the regular expression source
///   flags   = flags forwarded to `regex` (none by default)
/// Returns: the `Regex!char` built from `pattern` and `flags`; each distinct
///   instantiation keeps its own `static` copy, filled in on first use.
/// Example:
///   if (text.match(`^\d+$`)) {}    // plain string - compiled on every call
///   if (text.match(re!`^\d+$`)) {} // re! - compiled on first use only
Regex!char re(string pattern, alias flags = [])()
{
	static Regex!char compiled;
	// A non-empty value means an earlier call already built the regex.
	if (!compiled.empty)
		return compiled;
	compiled = regex(pattern, flags);
	return compiled;
}
41 
unittest
{
	// A digits-only pattern must accept "123" and reject "abc"; the second
	// use goes through the value stored by the first.
	assert(!"123".match(re!`^\d+$`).empty);
	assert( "abc".match(re!`^\d+$`).empty);
}
47 
// Converts capture groups 1..N of `captures` into the `values` variables,
// each to the static type of its destination (via std.conv.to).
// Group 0 (the whole match) is skipped. The number of destinations must
// equal the number of capture groups; this is checked with an assert.
private void convertCaptures(C, T...)(C captures, out T values)
{
	assert(values.length == captures.length-1, "Capture group count mismatch: %s arguments / %s capture groups".format(values.length, captures.length-1));
	foreach (idx, Target; T)
		values[idx] = captures[idx + 1].to!Target;
}
54 
/// Lua-like pattern matching: matches `s` against `r` and, on success,
/// stores each capture group, converted to the destination's type, into
/// the corresponding variable in `args`.
/// Returns: `true` if `s` matched (and `args` were filled in),
///   `false` otherwise.
bool matchInto(S, R, Args...)(S s, R r, ref Args args)
{
	auto found = s.match(r);
	if (!found)
		return false;

	convertCaptures(found.captures, args);
	return true;
}
66 
///
unittest
{
	string who, what;
	int howMany;

	// Three capture groups fill three variables of differing types.
	immutable matched = "Mary has 5 apples"
		.matchInto(`^(\w+) has (\d+) (\w+)$`, who, howMany, what);
	assert(matched);

	assert(who == "Mary");
	assert(howMany == 5);
	assert(what == "apples");
}
76 
/// Match into a delegate: matches `s` against `r` and, on success, calls
/// `fun` with the capture groups converted to its parameter types.
/// The delegate's return value is discarded.
/// Returns: `true` if `s` matched (and `fun` was called), `false` otherwise.
bool matchCaptures(S, R, Ret, Args...)(S s, R r, Ret delegate(Args args) fun)
{
	auto found = s.match(r);
	if (!found)
		return false;

	Args converted;
	convertCaptures(found.captures, converted);
	fun(converted);
	return true;
}
90 
///
unittest
{
	// The delegate's parameter list determines the conversion targets.
	immutable matched = "Mary has 5 apples"
		.matchCaptures(`^(\w+) has (\d+) (\w+)$`,
			(string name, int count, string fruit)
			{
				assert(name == "Mary" && count == 5 && fruit == "apples");
			}
		);
	assert(matched);
}
103 
/// Call a delegate over all matches: for every match of `r` in `s`, the
/// capture groups are converted to the parameter types of `fun`, and `fun`
/// is called with them. The delegate's return value is discarded.
/// Returns: the number of matches processed.
size_t matchAllCaptures(S, R, Ret, Args...)(S s, R r, Ret delegate(Args args) fun)
{
	size_t count = 0;
	foreach (hit; s.matchAll(r))
	{
		Args converted;
		convertCaptures(hit.captures, converted);
		fun(converted);
		++count;
	}
	return count;
}
117 
/// Returns a range which extracts captures from text: one element per
/// match of `r` in `s`.
/// With a single type in `T`, each element is capture group 1 converted
/// to that type. Otherwise each element is a `Tuple!T` whose n-th field
/// is capture group n+1 converted to the n-th type.
template extractCaptures(T...)
{
	auto extractCaptures(S, R)(S s, R r)
	{
		auto hits = s.matchAll(r);
		return hits.map!(
			(hit)
			{
				static if (T.length != 1)
				{
					Tuple!T fields;
					foreach (idx, Field; T)
						fields[idx] = to!Field(hit.captures[idx + 1]);
					return fields;
				}
				else
					return to!T(hit.captures[1]);
			});
	}
}
138 
alias extractCapture = extractCaptures; /// ditto

// Overload used when no result type is given: captures are converted to
// the string type of the input itself.
auto extractCapture(S, R)(S s, R r)
if (isSomeString!S)
{
	// The leading dot selects the module-level template.
	alias sameTypeAsInput = .extractCaptures!S;
	return sameTypeAsInput(s, r);
} /// ditto
147 
///
unittest
{
	auto text = "One 2 three 42";
	auto digits = `(\d+)`;

	// Without a type argument, captures come back as strings.
	auto asStrings = text.extractCapture(digits);
	assert(asStrings.equal(["2", "42"]));

	// With a type argument, each capture is converted.
	auto asInts = text.extractCapture!int(digits);
	assert(asInts.equal([2, 42]));
}
156 
///
unittest
{
	auto text = "2.3 4.56 78.9";
	auto pair = `(\d+)\.(\d+)`;

	// Several type arguments yield one tuple per match.
	auto expected = [tuple(2, 3), tuple(4, 56), tuple(78, 9)];
	assert(text.extractCapture!(int, int)(pair).equal(expected));
}
164 
165 // ************************************************************************
166 
/// Take a string, and return a regular expression that matches that string
/// exactly, by putting a backslash in front of every character from a
/// fixed set of punctuation.
///
/// The escaped characters are: $ ' ( ) * + . / ? [ \ ] ^ { | }
/// All other characters, including the remaining ASCII punctuation
/// (! " # % & , - : ; < = > @ _ ` ~), are copied through unchanged.
string escapeRE(string s)
{
	// TODO: test

	static immutable string needsEscaping = `$'()*+./?[\]^{|}`;

	string escaped;
	foreach (ch; s)
	{
		if (needsEscaping.indexOf(ch) != -1)
			escaped ~= '\\';
		escaped ~= ch;
	}
	return escaped;
}
216 
// Prepares the search part of a transformation string. An unescaped
// forward slash would end the search part early, so each one gets a
// backslash in front of it. Slashes that are already escaped, and all
// other escape sequences, are copied through untouched. Escaping a slash
// does not change the regex, as forward slashes have no special meaning
// in it, escaped or unescaped.
private string escapeUnescapedSlashes(string s)
{
	string output;
	bool afterBackslash = false;
	foreach (ch; s)
	{
		if (afterBackslash)
		{
			// ch is the target of an escape; it never starts a new one.
			afterBackslash = false;
		}
		else
		{
			switch (ch)
			{
				case '\\':
					afterBackslash = true;
					break;
				case '/':
					output ~= '\\';
					break;
				default:
					break;
			}
		}

		output ~= ch;
	}
	assert(!afterBackslash, "Regex ends with an escape");
	return output;
}
242 
// Prepares the replacement part of a transformation string: every
// backslash is doubled first, then every forward slash gets a backslash
// in front of it.
private string escapeSlashes(string s)
{
	auto backslashesDoubled = s.fastReplace(`\`, `\\`);
	return backslashesDoubled.fastReplace(`/`, `\/`);
}
248 
// Reverse of escapeSlashes: escaped forward slashes are unescaped first,
// then doubled backslashes are collapsed into single ones.
private string unescapeSlashes(string s)
{
	auto slashesRestored = s.fastReplace(`\/`, `/`);
	return slashesRestored.fastReplace(`\\`, `\`);
}
254 
/// Build a RE search-and-replace transform (as used by applyRE), in the
/// form "s/SEARCH/REPLACEMENT/FLAGS".
/// Params:
///   search      = regular expression; unescaped forward slashes in it
///                 are escaped
///   replacement = replacement text; all forward slashes and backslashes
///                 in it are escaped
///   flags       = appended as-is after the last delimiter
string buildReplaceTransformation(string search, string replacement, string flags)
{
	auto searchPart = escapeUnescapedSlashes(search);
	auto replacementPart = escapeSlashes(replacement);
	return "s/" ~ searchPart ~ "/" ~ replacementPart ~ "/" ~ flags;
}
260 
// One parsed transformation command: a kind tag plus the operands for
// that kind.
private struct Transformation
{
	// Operands of a search-and-replace command.
	struct Replace
	{
		string search;
		string replacement;
		string flags;
	}

	// Available command kinds; `replace` is the only one.
	enum Type
	{
		replace,
	}

	// Selects which member of the union below is meaningful.
	Type type;

	union
	{
		Replace replace;
	}
}
279 
// Parses a transformation string into a list of commands.
//
// The only supported command is "s": the character following the 's' is
// the delimiter, followed by the search part, the delimiter, the
// replacement part, the delimiter, and then the flags. Flags run up to
// the next ';' (which separates commands) or to the end of the string.
// A backslash makes the character after it ineligible as a delimiter;
// the returned parts are slices of the input with backslashes retained.
//
// Throws: Exception if the input is shorter than two characters, a
// command is cut short, a command has too many delimited parts, or a
// command starts with a character other than 's'.
private Transformation[] splitRETransformation(string s)
{
	enforce(s.length >= 2, "Bad transformation");
	Transformation[] result;
	while (s.length)
	{
		Transformation t;
		switch (s[0])
		{
			case 's':
			{
				t.type = Transformation.Type.replace;
				s = s[1..$];

				// The length check at the top covers only the first command.
				// A command following a ';' may be a lone 's', so make sure
				// there is a delimiter character to read before indexing.
				enforce(s.length > 0, "Unexpected end of regex replace transformation");
				auto boundary = s[0];
				s = s[1..$];

				// Consumes and returns the text up to the next unescaped
				// delimiter, which is consumed as well.
				string readString()
				{
					bool escaped = false;
					foreach (i, c; s)
						if (escaped)
							escaped = false;
						else
						if (c=='\\')
							escaped = true;
						else
						if (c == boundary)
						{
							auto result = s[0..i];
							s = s[i+1..$];
							return result;
						}
					throw new Exception("Unexpected end of regex replace transformation");
				}

				t.replace.search = readString();
				t.replace.replacement = readString();
				// What remains, up to ';' or end of input, is the flags.
				foreach (i, c; s)
				{
					if (c == ';')
					{
						t.replace.flags = s[0..i];
						s = s[i+1..$];
						goto endOfReplace;
					}
					else
					if (c == boundary)
						throw new Exception("Too many regex replace transformation parameters");
				}
				// No ';' found: this was the last command.
				t.replace.flags = s;
				s = null;
			endOfReplace:
				result ~= t;
				break;
			}
			default:
				throw new Exception("Unsupported regex transformation: " ~ s[0]);
		}
	}
	return result;
}
342 
unittest
{
	// A single replace command with empty flags parses into one entry.
	Transformation wanted = { type : Transformation.Type.replace, replace : { search : "from", replacement : "to", flags : "" } };
	auto parsed = splitRETransformation("s/from/to/");
	assert(parsed.length == 1);
	assert(parsed[0].tupleof == wanted.tupleof);
}
349 
/// Apply sed-like regex transformation (in the form of "s/FROM/TO/FLAGS") to a string.
/// Multiple commands can be separated by ';'; they are applied in order,
/// each one to the output of the previous one.
/// Params:
///   str            = the text to transform
///   transformation = the command string
/// Returns: the transformed text.
string applyRE()(string str, string transformation)
{
	import std.regex;

	foreach (step; splitRETransformation(transformation))
	{
		final switch (step.type)
		{
			case Transformation.Type.replace:
				// The flags are handed to the regex compiler as-is.
				auto pattern = regex(step.replace.search, step.replace.flags);
				auto substitute = unescapeSlashes(step.replace.replacement);
				str = replace(str, pattern, substitute);
				break;
		}
	}
	return str;
}
367 
unittest
{
	// Thousands separators inserted through lookbehind/lookahead.
	auto grouping = buildReplaceTransformation(`(?<=\d)(?=(\d\d\d)+\b)`, `,`, "g");
	assert("12000 + 42100 = 54100".applyRE(grouping) == "12,000 + 42,100 = 54,100");

	// Forward slashes and backslashes must survive the round trip through
	// buildReplaceTransformation and applyRE, in either direction.
	foreach (sample; [`a/b\c`, `a//b\\c`, `a/\b\/c`, `a/\\b\//c`, `a//\b\\/c`])
	{
		assert(sample.applyRE(buildReplaceTransformation(`\/`, `\`, "g")) == sample.fastReplace(`/`, `\`));
		assert(sample.applyRE(buildReplaceTransformation(`\\`, `/`, "g")) == sample.fastReplace(`\`, `/`));
	}

	// Commands separated by ';' run in sequence.
	assert("babba".applyRE(`s/a/c/g;s/b/a/g;s/c/b/g`) == "abaab");
}
386 
387 // ************************************************************************