1 /**
2  * std.regex helpers
3  *
4  * License:
5  *   This Source Code Form is subject to the terms of
6  *   the Mozilla Public License, v. 2.0. If a copy of
7  *   the MPL was not distributed with this file, You
8  *   can obtain one at http://mozilla.org/MPL/2.0/.
9  *
10  * Authors:
11  *   Vladimir Panteleev <vladimir@thecybershadow.net>
12  */
13 
14 module ae.utils.regex;
15 
16 import std.algorithm;
17 import std.conv;
18 import std.exception;
19 import std.regex;
20 import std.string;
21 
22 import ae.utils.text;
23 
24 // ************************************************************************
25 
/// Compiles a regular expression on first use and hands back the cached
/// copy on every later call, so patterns can be written inline in
/// expressions without recompiling them each time.
/// The cache is a function-local `static`, so there is one per distinct
/// `pattern` / `flags` template instantiation.
/// Example:
///   if (text.match(`^\d+$`)) {}    // old code - recompiles every time
///   if (text.match(re!`^\d+$`)) {} // new code - recompiles once

Regex!char re(string pattern, alias flags = [])()
{
	static Regex!char cached;
	// A non-empty Regex means an earlier call already compiled the pattern.
	if (!cached.empty)
		return cached;
	cached = regex(pattern, flags);
	return cached;
}
39 
unittest
{
	// Both assertions use the same template argument, hence the same instantiation of re.
	enum digitsOnly = `^\d+$`;
	assert( "123".match(re!digitsOnly));
	assert(!"abc".match(re!digitsOnly));
}
45 
/// Converts the capture groups of a match into the given output variables.
/// Group 0 (the whole match) is skipped: `values[n]` receives group `n+1`,
/// converted with `std.conv.to` to the declared type of that variable.
/// Params:
///   captures = the captures of a successful match
///   values   = one output variable per capture group
void convertCaptures(C, T...)(C captures, out T values)
{
	// Index 0 of the captures is the whole match, so it is not counted as a group.
	immutable groupCount = captures.length - 1;
	assert(values.length == groupCount, "Capture group count mismatch: %s arguments / %s capture groups".format(values.length, groupCount));
	foreach (i, ref target; values)
		target = captures[i + 1].to!(T[i]);
}
52 
/// Lua-like pattern matching: matches `s` against `r` and, on success,
/// stores the converted capture groups into `args`.
/// Returns: `true` if the pattern matched; `false` otherwise, in which
/// case `args` are not touched.
bool matchInto(S, R, Args...)(S s, R r, ref Args args)
{
	auto result = s.match(r);
	if (!result)
		return false;

	convertCaptures(result.captures, args);
	return true;
}
64 
///
unittest
{
	string name, fruit;
	int count;
	auto matched = "Mary has 5 apples"
		.matchInto(`^(\w+) has (\d+) (\w+)$`, name, count, fruit);
	assert(matched);
	assert(name == "Mary");
	assert(count == 5);
	assert(fruit == "apples");
}
74 
/// Match into a delegate: matches `s` against `r` and, on success, calls
/// `fun` with the capture groups converted to its parameter types.
/// The delegate's return value is discarded.
/// Returns: `true` if the pattern matched (and `fun` was called),
/// `false` otherwise.
bool matchCaptures(S, R, Ret, Args...)(S s, R r, Ret delegate(Args args) fun)
{
	auto result = s.match(r);
	if (!result)
		return false;

	Args converted;
	convertCaptures(result.captures, converted);
	fun(converted);
	return true;
}
88 
///
unittest
{
	void delegate(string, int, string) check =
		(string name, int count, string fruit)
		{
			assert(name == "Mary" && count == 5 && fruit == "apples");
		};

	auto matched = "Mary has 5 apples"
		.matchCaptures(`^(\w+) has (\d+) (\w+)$`, check);
	assert(matched);
}
101 
/// Call a delegate over all matches of `r` in `s`.
/// For every match, the capture groups are converted to the delegate's
/// parameter types and passed to `fun`; its return value is discarded.
/// Returns: the number of matches found.
size_t matchAllCaptures(S, R, Ret, Args...)(S s, R r, Ret delegate(Args args) fun)
{
	size_t count = 0;
	foreach (hit; s.matchAll(r))
	{
		Args converted;
		convertCaptures(hit.captures, converted);
		fun(converted);
		++count;
	}
	return count;
}
115 
/// Returns a range which extracts a capture from text: for every match
/// of `r` in `s`, it yields the first capture group converted to `T`.
template extractCapture(T)
{
	auto extractCapture(S, R)(S s, R r)
	{
		auto allMatches = s.matchAll(r);
		return allMatches.map!(hit => hit.captures[1].to!T);
	}
}

// Overload for when no element type is given: the captures are converted
// to the type of the input text itself.
// NOTE(review): the one-parameter template is instantiated through an alias
// before being called - presumably so the call cannot resolve back to this
// two-parameter overload; confirm before simplifying.
auto extractCapture(S, R)(S s, R r)
{
	alias sameTypeExtractor = extractCapture!S;
	return sameTypeExtractor(s, r);
}
126 
///
unittest
{
	auto input = "One 2 three 42";
	auto digits = `(\d+)`;

	// Without an explicit type the captures keep the type of the input.
	auto asStrings = input.extractCapture(digits);
	assert(equal(asStrings, ["2", "42"]));

	// With an explicit type each capture is converted.
	auto asInts = input.extractCapture!int(digits);
	assert(equal(asInts, [2, 42]));
}
135 
136 // ************************************************************************
137 
/// Take a string, and return a regular expression that matches that string
/// exactly (escape RE metacharacters).
/// Each of the characters $ ' ( ) * + . / ? [ \ ] ^ { | } is prefixed with
/// a backslash; every other character, including the remaining ASCII
/// punctuation (! " # % & , - : ; < = > @ _ ` ~), is copied unchanged.
string escapeRE(string s)
{
	// TODO: test

	string result;
	foreach (c; s)
	{
		switch (c)
		{
			case '$', '\'', '(', ')', '*', '+', '.', '/',
			     '?', '[', '\\', ']', '^', '{', '|', '}':
				result ~= '\\';
				break;
			default:
				break;
		}
		// The character itself is always emitted, escaped or not.
		result ~= c;
	}
	return result;
}
187 
// The search part of a regex transform ends at the first unescaped forward
// slash, so every forward slash that is not already preceded by an escaping
// backslash gets one added here. Escaped forward slashes will be unescaped
// again during parsing of the regex transform (which won't affect the
// regex, as forward slashes have no special meaning, escaped or unescaped).
private string escapeUnescapedSlashes(string s)
{
	string output;
	bool pendingEscape = false; // true when the previous character was an unpaired backslash
	foreach (ch; s)
	{
		immutable wasEscaped = pendingEscape;
		// A backslash starts an escape only if it is not itself being escaped.
		pendingEscape = !wasEscaped && ch == '\\';
		if (!wasEscaped && ch == '/')
			output ~= '\\';
		output ~= ch;
	}
	assert(!pendingEscape, "Regex ends with an escape");
	return output;
}
213 
// For the replacement part, every backslash and every forward slash is
// escaped. Backslashes are doubled first, so that the backslashes added
// in front of forward slashes are not doubled in turn.
private string escapeSlashes(string s)
{
	auto backslashesDoubled = s.fastReplace(`\`, `\\`);
	return backslashesDoubled.fastReplace(`/`, `\/`);
}
219 
// Reverse of escapeSlashes: escaped forward slashes are restored first,
// then doubled backslashes are collapsed back into single ones.
private string unescapeSlashes(string s)
{
	auto slashesRestored = s.fastReplace(`\/`, `/`);
	return slashesRestored.fastReplace(`\\`, `\`);
}
225 
/// Build a RE search-and-replace transform (as used by applyRE), in the
/// form "s/SEARCH/REPLACEMENT/FLAGS".
/// Params:
///   search      = regular expression to search for; forward slashes in it
///                 that are not already escaped get escaped
///   replacement = replacement text; all its backslashes and forward
///                 slashes get escaped
///   flags       = appended as-is after the final slash
string buildReplaceTransformation(string search, string replacement, string flags)
{
	return format("s/%s/%s/%s",
		escapeUnescapedSlashes(search),
		escapeSlashes(replacement),
		flags);
}
231 
// Splits a transformation such as "s/FROM/TO/FLAGS" into its parts.
// The first character is the command and the second one is the delimiter;
// the rest is split on every delimiter that is not preceded by an escaping
// backslash. Backslashes are left in the returned parts.
// Throws (via enforce) if the input is shorter than two characters.
private string[] splitRETransformation(string t)
{
	enforce(t.length >= 2, "Bad transformation");

	immutable delimiter = t[1];
	auto parts = [t[0..1]];
	auto rest = t[2..$];

	size_t partStart = 0;
	bool skipNext = false; // set right after an escaping backslash
	for (size_t pos = 0; pos < rest.length; pos++)
	{
		if (skipNext)
		{
			// Escaped character: never a delimiter, never starts an escape.
			skipNext = false;
			continue;
		}

		immutable ch = rest[pos];
		if (ch == '\\')
			skipNext = true;
		else if (ch == delimiter)
		{
			parts ~= rest[partStart..pos];
			partStart = pos + 1;
		}
	}

	// Whatever follows the last delimiter (possibly nothing) is the final part.
	parts ~= rest[partStart..$];
	return parts;
}
255 
unittest
{
	// The trailing delimiter yields an empty final (flags) part.
	auto parts = splitRETransformation("s/from/to/");
	assert(parts == ["s", "from", "to", ""]);
}
260 
/// Apply regex transformation (in the form of "s/FROM/TO/FLAGS") to a string.
/// FROM and FLAGS are passed to `std.regex.regex`; TO has its escaped
/// slashes and backslashes unescaped before being used as the replacement.
/// Throws (via enforce) if the command is not "s", or if the transformation
/// does not consist of exactly four parts.
string applyRE()(string str, string transformation)
{
	import std.regex;

	auto parts = splitRETransformation(transformation);
	enforce(parts[0] == "s", "Unsupported regex transformation");
	enforce(parts.length == 4, "Wrong number of regex transformation parameters");

	auto searchRegex = regex(parts[1], parts[3]);
	auto replacementText = unescapeSlashes(parts[2]);
	return replace(str, searchRegex, replacementText);
}
271 
unittest
{
	// Insert thousands separators using a lookbehind / lookahead pattern.
	auto addCommas = buildReplaceTransformation(`(?<=\d)(?=(\d\d\d)+\b)`, `,`, "g");
	assert("12000 + 42100 = 54100".applyRE(addCommas) == "12,000 + 42,100 = 54,100");

	// Swapping one kind of slash for the other through applyRE must give
	// the same result as a plain textual replacement.
	void testSlashes(string s)
	{
		auto toBackslashes = buildReplaceTransformation(`\/`, `\`, "g");
		assert(s.applyRE(toBackslashes) == s.fastReplace(`/`, `\`));

		auto toForwardSlashes = buildReplaceTransformation(`\\`, `/`, "g");
		assert(s.applyRE(toForwardSlashes) == s.fastReplace(`\`, `/`));
	}

	foreach (sample; [`a/b\c`, `a//b\\c`, `a/\b\/c`, `a/\\b\//c`, `a//\b\\/c`])
		testSlashes(sample);
}
288 
289 // ************************************************************************