1 /** 2 * Utility code related to string and text processing. 3 * 4 * License: 5 * This Source Code Form is subject to the terms of 6 * the Mozilla Public License, v. 2.0. If a copy of 7 * the MPL was not distributed with this file, You 8 * can obtain one at http://mozilla.org/MPL/2.0/. 9 * 10 * Authors: 11 * Vladimir Panteleev <vladimir@thecybershadow.net> 12 */ 13 14 module ae.utils.text; 15 16 import std.algorithm; 17 import std.ascii; 18 import std.exception; 19 import std.conv; 20 import std.format; 21 import std.string; 22 import std.traits; 23 import std.typetuple; 24 25 import core.stdc.string; 26 27 import ae.utils.meta; 28 import ae.utils.textout; 29 30 public import ae.utils.regex; 31 32 alias indexOf = std..string.indexOf; 33 34 // ************************************************************************ 35 36 /// Semantic alias for an array of immutable bytes containing some 37 /// ASCII-based 8-bit character encoding. Might be UTF-8, but not 38 /// necessarily - thus, is a semantic superset of the D "string" alias. 39 alias string ascii; 40 41 /// Convenience helper 42 bool contains(T, U)(T[] str, U[] what) 43 if (is(Unqual!T == Unqual!U)) 44 { 45 return str.indexOf(what)>=0; 46 } 47 48 /// CTFE helper 49 string formatAs(T)(auto ref T obj, string fmt) 50 { 51 return format(fmt, obj); 52 } 53 54 // Uses memchr (not Boyer-Moore), best for short strings. 55 T[] fastReplace(T)(T[] what, T[] from, T[] to) 56 if (T.sizeof == 1) // TODO (uses memchr) 57 { 58 alias Unqual!T U; 59 60 // debug scope(failure) std.stdio.writeln("fastReplace crashed: ", [what, from, to]); 61 enum RAM = cast(U*)null; 62 63 if (what.length < from.length || from.length==0) 64 return what; 65 66 if (from.length==1) 67 { 68 auto fromc = from[0]; 69 if (to.length==1) 70 { 71 auto p = cast(T*)memchr(what.ptr, fromc, what.length); 72 if (!p) 73 return what; 74 75 auto result = what.dup; 76 auto delta = result.ptr - what.ptr; 77 auto toChar = to[0]; 78 auto end = what.ptr + what.length; 79 do 80 { 81 (cast(U*)p)[delta] = toChar; // zomg hax lol 82 p++; 83 p = cast(T*)memchr(p, fromc, end - p); 84 } while (p); 85 return assumeUnique(result); 86 } 87 else 88 { 89 auto p = cast(immutable(T)*)memchr(what.ptr, fromc, what.length); 90 if (!p) 91 return what; 92 93 auto sb = StringBuilder(what.length); 94 do 95 { 96 sb.put(what[0..p-what.ptr], to); 97 what = what[p-what.ptr+1..$]; 98 p = cast(immutable(T)*)memchr(what.ptr, fromc, what.length); 99 } 100 while (p); 101 102 sb.put(what); 103 return sb.get(); 104 } 105 } 106 107 auto head = from[0]; 108 auto tail = from[1..$]; 109 110 auto p = cast(T*)what.ptr; 111 auto end = p + what.length - tail.length; 112 p = cast(T*)memchr(p, head, end-p); 113 while (p) 114 { 115 p++; 116 if (p[0..tail.length] == tail) 117 { 118 if (from.length == to.length) 119 { 120 auto result = what.dup; 121 auto deltaMinusOne = (result.ptr - what.ptr) - 1; 122 123 goto replaceA; 124 dummyA: // compiler complains 125 126 do 127 { 128 p++; 129 if (p[0..tail.length] == tail) 130 { 131 replaceA: 132 (cast(U*)p+deltaMinusOne)[0..to.length] = to[]; 133 } 134 p = cast(T*)memchr(p, head, end-p); 135 } 136 while (p); 137 138 return assumeUnique(result); 139 } 140 else 141 { 142 auto start = cast(T*)what.ptr; 143 auto sb = StringBuilder(what.length); 144 goto replaceB; 145 dummyB: // compiler complains 146 147 do 148 { 149 p++; 150 if (p[0..tail.length] == tail) 151 { 152 replaceB: 153 sb.put(RAM[cast(size_t)start .. cast(size_t)p-1], to); 154 start = p + tail.length; 155 what = what[start-what.ptr..$]; 156 } 157 else 158 { 159 what = what[p-what.ptr..$]; 160 } 161 p = cast(T*)memchr(what.ptr, head, what.length); 162 } 163 while (p); 164 165 //sb.put(what); 166 sb.put(RAM[cast(size_t)start..cast(size_t)(what.ptr+what.length)]); 167 return sb.get(); 168 } 169 170 assert(0); 171 } 172 p = cast(T*)memchr(p, head, end-p); 173 } 174 175 return what; 176 } 177 178 unittest 179 { 180 import std.array; 181 void test(string haystack, string from, string to) 182 { 183 auto description = `("` ~ haystack ~ `", "` ~ from ~ `", "` ~ to ~ `")`; 184 185 auto r1 = fastReplace(haystack, from, to); 186 auto r2 = replace(haystack, from, to); 187 assert(r1 == r2, `Bad replace: ` ~ description ~ ` == "` ~ r1 ~ `"`); 188 189 if (r1 == haystack) 190 assert(r1 is haystack, `Pointless reallocation: ` ~ description); 191 } 192 193 test("Mary had a little lamb", "a", "b"); 194 test("Mary had a little lamb", "a", "aaa"); 195 test("Mary had a little lamb", "Mary", "Lucy"); 196 test("Mary had a little lamb", "Mary", "Jimmy"); 197 test("Mary had a little lamb", "lamb", "goat"); 198 test("Mary had a little lamb", "lamb", "sheep"); 199 test("Mary had a little lamb", " l", " x"); 200 test("Mary had a little lamb", " l", " xx"); 201 202 test("Mary had a little lamb", "X" , "Y" ); 203 test("Mary had a little lamb", "XX", "Y" ); 204 test("Mary had a little lamb", "X" , "YY"); 205 test("Mary had a little lamb", "XX", "YY"); 206 test("Mary had a little lamb", "aX", "Y" ); 207 test("Mary had a little lamb", "aX", "YY"); 208 209 test("foo", "foobar", "bar"); 210 } 211 212 T[][] fastSplit(T, U)(T[] s, U d) 213 if (is(Unqual!T == Unqual!U)) 214 { 215 if (!s.length) 216 return null; 217 218 auto p = cast(T*)memchr(s.ptr, d, s.length); 219 if (!p) 220 return [s]; 221 222 size_t n; 223 auto end = s.ptr + s.length; 224 do 225 { 226 n++; 227 p++; 228 p = cast(T*) memchr(p, d, end-p); 229 } 230 while (p); 231 232 auto result = new T[][n+1]; 233 n = 0; 234 auto start = s.ptr; 235 p = cast(T*) memchr(start, d, s.length); 236 do 237 { 238 result[n++] = start[0..p-start]; 239 start = ++p; 240 p = cast(T*) memchr(p, d, end-p); 241 } 242 while (p); 243 result[n] = start[0..end-start]; 244 245 return result; 246 } 247 248 T[][] splitAsciiLines(T)(T[] text) 249 if (is(Unqual!T == char)) 250 { 251 auto lines = text.fastSplit('\n'); 252 foreach (ref line; lines) 253 if (line.length && line[$-1]=='\r') 254 line = line[0..$-1]; 255 return lines; 256 } 257 258 unittest 259 { 260 assert(splitAsciiLines("a\nb\r\nc\r\rd\n\re\r\n\nf") == ["a", "b", "c\r\rd", "\re", "", "f"]); 261 assert(splitAsciiLines(string.init) == splitLines(string.init)); 262 } 263 264 T[] asciiStrip(T)(T[] s) 265 if (is(Unqual!T == char)) 266 { 267 while (s.length && isWhite(s[0])) 268 s = s[1..$]; 269 while (s.length && isWhite(s[$-1])) 270 s = s[0..$-1]; 271 return s; 272 } 273 274 unittest 275 { 276 string s = "Hello, world!"; 277 assert(asciiStrip(s) is s); 278 assert(asciiStrip("\r\n\tHello ".dup) == "Hello"); 279 } 280 281 /// Covering slice-list of s with interleaved whitespace. 282 T[][] segmentByWhitespace(T)(T[] s) 283 if (is(Unqual!T == char)) 284 { 285 if (!s.length) 286 return null; 287 288 T[][] segments; 289 bool wasWhite = isWhite(s[0]); 290 size_t start = 0; 291 foreach (p, char c; s) 292 { 293 bool isWhite = isWhite(c); 294 if (isWhite != wasWhite) 295 segments ~= s[start..p], 296 start = p; 297 wasWhite = isWhite; 298 } 299 segments ~= s[start..$]; 300 301 return segments; 302 } 303 304 T[] newlinesToSpaces(T)(T[] s) 305 if (is(Unqual!T == char)) 306 { 307 auto slices = segmentByWhitespace(s); 308 foreach (ref slice; slices) 309 if (slice.contains("\n")) 310 slice = " "; 311 return slices.join(); 312 } 313 314 ascii normalizeWhitespace(ascii s) 315 { 316 auto slices = segmentByWhitespace(strip(s)); 317 foreach (i, ref slice; slices) 318 if (i & 1) // odd 319 slice = " "; 320 return slices.join(); 321 } 322 323 unittest 324 { 325 assert(normalizeWhitespace(" Mary had\ta\nlittle\r\n\tlamb") == "Mary had a little lamb"); 326 } 327 328 string[] splitByCamelCase(string s) 329 { 330 string[] result; 331 size_t start = 0; 332 foreach (i; 1..s.length+1) 333 if (i == s.length 334 || (isLower(s[i-1]) && isUpper(s[i])) 335 || (i+1 < s.length && isUpper(s[i-1]) && isUpper(s[i]) && isLower(s[i+1])) 336 ) 337 { 338 result ~= s[start..i]; 339 start = i; 340 } 341 return result; 342 } 343 344 unittest 345 { 346 assert(splitByCamelCase("parseIPString") == ["parse", "IP", "String"]); 347 assert(splitByCamelCase("IPString") == ["IP", "String"]); 348 } 349 350 string camelCaseJoin(string[] arr) 351 { 352 if (!arr.length) 353 return null; 354 string result = arr[0]; 355 foreach (s; arr[1..$]) 356 result ~= std.ascii.toUpper(s[0]) ~ s[1..$]; 357 return result; 358 } 359 360 unittest 361 { 362 assert("parse-IP-string".split('-').camelCaseJoin() == "parseIPString"); 363 } 364 365 // ************************************************************************ 366 367 private __gshared char[256] asciiLower, asciiUpper; 368 369 shared static this() 370 { 371 foreach (c; 0..256) 372 { 373 asciiLower[c] = cast(char)std.ascii.toLower(c); 374 asciiUpper[c] = cast(char)std.ascii.toUpper(c); 375 } 376 } 377 378 void xlat(alias TABLE, T)(T[] buf) 379 { 380 foreach (ref c; buf) 381 c = TABLE[c]; 382 } 383 384 alias xlat!(asciiLower, char) asciiToLower; 385 alias xlat!(asciiUpper, char) asciiToUpper; 386 387 // ************************************************************************ 388 389 import std.utf; 390 391 /// Convert any data to a valid UTF-8 bytestream, so D's string functions can 392 /// properly work on it. 393 string rawToUTF8(in char[] s) 394 { 395 auto d = new dchar[s.length]; 396 foreach (i, char c; s) 397 d[i] = c; 398 return toUTF8(d); 399 } 400 401 /// Undo rawToUTF8. 402 ascii UTF8ToRaw(in char[] r) 403 { 404 auto s = new char[r.length]; 405 size_t i = 0; 406 foreach (dchar c; r) 407 { 408 assert(c < '\u0100'); 409 s[i++] = cast(char)c; 410 } 411 return assumeUnique(s[0..i]); 412 } 413 414 unittest 415 { 416 char[1] c; 417 for (int i=0; i<256; i++) 418 { 419 c[0] = cast(char)i; 420 assert(UTF8ToRaw(rawToUTF8(c[])) == c[], format("%s -> %s -> %s", cast(ubyte[])c[], cast(ubyte[])rawToUTF8(c[]), cast(ubyte[])UTF8ToRaw(rawToUTF8(c[])))); 421 } 422 } 423 424 /// Where a delegate with this signature is required. 425 string nullStringTransform(in char[] s) { return to!string(s); } 426 427 string forceValidUTF8(string s) 428 { 429 try 430 { 431 validate(s); 432 return s; 433 } 434 catch (UTFException) 435 return rawToUTF8(s); 436 } 437 438 // ************************************************************************ 439 440 /// Return the slice up to the first NUL character, 441 /// or of the whole array if none is found. 442 C[] fromZArray(C, n)(ref C[n] arr) 443 { 444 auto p = arr.representation.countUntil(0); 445 return arr[0 .. p<0 ? $ : p]; 446 } 447 448 /// ditto 449 C[] fromZArray(C)(C[] arr) 450 { 451 auto p = arr.representation.countUntil(0); 452 return arr[0 .. p<0 ? $ : p]; 453 } 454 455 unittest 456 { 457 char[4] arr = "ab\0d"; 458 assert(arr.fromZArray == "ab"); 459 arr[] = "abcd"; 460 assert(arr.fromZArray == "abcd"); 461 } 462 463 unittest 464 { 465 string arr = "ab\0d"; 466 assert(arr.fromZArray == "ab"); 467 arr = "abcd"; 468 assert(arr.fromZArray == "abcd"); 469 } 470 471 // ************************************************************************ 472 473 /// Formats binary data as a hex dump (three-column layout consisting of hex 474 /// offset, byte values in hex, and printable low-ASCII characters). 475 string hexDump(const(void)[] b) 476 { 477 auto data = cast(const(ubyte)[]) b; 478 assert(data.length); 479 size_t i=0; 480 string s; 481 while (i<data.length) 482 { 483 s ~= format("%08X: ", i); 484 foreach (x; 0..16) 485 { 486 if (i+x<data.length) 487 s ~= format("%02X ", data[i+x]); 488 else 489 s ~= " "; 490 if (x==7) 491 s ~= "| "; 492 } 493 s ~= " "; 494 foreach (x; 0..16) 495 { 496 if (i+x<data.length) 497 if (data[i+x]==0) 498 s ~= ' '; 499 else 500 if (data[i+x]<32 || data[i+x]>=128) 501 s ~= '.'; 502 else 503 s ~= cast(char)data[i+x]; 504 else 505 s ~= ' '; 506 } 507 s ~= "\n"; 508 i += 16; 509 } 510 return s; 511 } 512 513 import std.conv; 514 515 T fromHex(T : ulong = uint, C)(const(C)[] s) 516 { 517 T result = parse!T(s, 16); 518 enforce(s.length==0, new ConvException("Could not parse entire string")); 519 return result; 520 } 521 522 ubyte[] arrayFromHex(in char[] hex, ubyte[] buf = null) 523 { 524 if (buf is null) 525 buf = new ubyte[hex.length/2]; 526 else 527 assert(buf.length == hex.length/2); 528 for (int i=0; i<hex.length; i+=2) 529 buf[i/2] = cast(ubyte)( 530 hexDigits.indexOf(hex[i ], CaseSensitive.no)*16 + 531 hexDigits.indexOf(hex[i+1], CaseSensitive.no) 532 ); 533 return buf; 534 } 535 536 string toHex()(in ubyte[] data, char[] buf = null) 537 { 538 if (buf is null) 539 buf = new char[data.length*2]; 540 else 541 assert(buf.length == data.length*2); 542 foreach (i, b; data) 543 { 544 buf[i*2 ] = hexDigits[b>>4]; 545 buf[i*2+1] = hexDigits[b&15]; 546 } 547 return assumeUnique(buf); 548 } 549 550 void toHex(T : ulong, size_t U = T.sizeof*2)(T n, ref char[U] buf) 551 { 552 foreach (i; Reverse!(RangeTuple!(T.sizeof*2))) 553 { 554 buf[i] = hexDigits[n & 0xF]; 555 n >>= 4; 556 } 557 } 558 559 unittest 560 { 561 char[8] buf; 562 toHex(0x01234567, buf); 563 assert(buf == "01234567"); 564 } 565 566 /// How many significant decimal digits does a FP type have 567 /// (determined empirically) 568 enum significantDigits(T : real) = 2 + 2 * T.sizeof; 569 570 /// Format string for a FP type which includes all necessary 571 /// significant digits 572 enum fpFormatString(T) = "%." ~ text(significantDigits!T) ~ "g"; 573 574 /// Get shortest string representation of a FP type that still converts to exactly the same number. 575 template fpToString(F) 576 { 577 string fpToString(F v) 578 { 579 /// Bypass FPU register, which may contain a different precision 580 static F forceType(F d) { static F n; n = d; return n; } 581 582 StaticBuf!(char, 64) buf; 583 formattedWrite(&buf, fpFormatString!F, forceType(v)); 584 char[] s = buf.data(); 585 586 if (s != "nan" && s != "-nan" && s != "inf" && s != "-inf") 587 { 588 if (forceType(to!F(s)) != v) 589 { 590 static if (is(F == real)) 591 { 592 // Something funny with DM libc real parsing... e.g. 0.6885036635121051783 593 return s.idup; 594 } 595 else 596 assert(false, "Initial conversion fails: " ~ format(fpFormatString!F, to!F(s))); 597 } 598 599 foreach_reverse (i; 1..s.length) 600 if (s[i]>='0' && s[i]<='8') 601 { 602 s[i]++; 603 if (forceType(to!F(s[0..i+1]))==v) 604 s = s[0..i+1]; 605 else 606 s[i]--; 607 } 608 while (s.length>2 && s[$-1]!='.' && forceType(to!F(s[0..$-1]))==v) 609 s = s[0..$-1]; 610 } 611 return s.idup; 612 } 613 614 static if (!is(F == real)) 615 unittest 616 { 617 union U 618 { 619 ubyte[F.sizeof] bytes; 620 F d; 621 string toString() { return (fpFormatString!F ~ " %a [%(%02X %)]").format(d, d, bytes[]); } 622 } 623 import std.random : Xorshift, uniform; 624 import std.stdio : stderr; 625 Xorshift rng; 626 foreach (n; 0..10000) 627 { 628 U u; 629 foreach (ref b; u.bytes[]) 630 b = uniform!ubyte(rng); 631 static if (is(F == real)) 632 u.bytes[7] |= 0x80; // require normalized value 633 scope(failure) stderr.writeln("Input:\t", u); 634 auto s = fpToString(u.d); 635 scope(failure) stderr.writeln("Result:\t", s); 636 if (s == "nan" || s == "-nan") 637 continue; // there are many NaNs... 638 U r; 639 r.d = to!F(s); 640 assert(r.bytes == u.bytes, 641 "fpToString mismatch:\nOutput:\t%s".format(r)); 642 } 643 } 644 } 645 646 alias doubleToString = fpToString!double; 647 648 unittest 649 { 650 alias floatToString = fpToString!float; 651 alias realToString = fpToString!real; 652 } 653 654 import std.algorithm : max; 655 656 template DecimalSize(T : ulong) 657 { 658 enum DecimalSize = max(text(T.min).length, text(T.max).length); 659 } 660 661 static assert(DecimalSize!ubyte == 3); 662 static assert(DecimalSize!byte == 4); 663 static assert(DecimalSize!ushort == 5); 664 static assert(DecimalSize!short == 6); 665 static assert(DecimalSize!uint == 10); 666 static assert(DecimalSize!int == 11); 667 static assert(DecimalSize!ulong == 20); 668 static assert(DecimalSize!long == 20); 669 670 import std.typecons; 671 672 /// Writes n as decimal number to buf (right-aligned), returns slice of buf containing result. 673 char[] toDec(N : ulong, size_t U)(N o, ref char[U] buf) 674 { 675 static assert(U >= DecimalSize!N, "Buffer too small to fit any " ~ N.stringof ~ " value"); 676 677 Unqual!N n = o; 678 char* p = buf.ptr+buf.length; 679 680 if (isSigned!N && n<0) 681 { 682 do 683 { 684 *--p = '0' - n%10; 685 n = n/10; 686 } while (n); 687 *--p = '-'; 688 } 689 else 690 do 691 { 692 *--p = '0' + n%10; 693 n = n/10; 694 } while (n); 695 696 return p[0 .. buf.ptr + buf.length - p]; 697 } 698 699 string toDec(T : ulong)(T n) 700 { 701 static struct Buf { char[DecimalSize!T] buf; } // Can't put static array on heap, use struct 702 return assumeUnique(toDec(n, (new Buf).buf)); 703 } 704 705 unittest 706 { 707 assert(toDec(42) == "42"); 708 assert(toDec(int.min) == int.min.to!string()); 709 } 710 711 /// Print an unsigned integer as a zero-padded, right-aligned decimal number into a buffer 712 void toDecFixed(N : ulong, size_t U)(N n, ref char[U] buf) 713 if (!isSigned!N) 714 { 715 assert(n < 10^^U, "Number too large"); 716 717 foreach (i; Reverse!(RangeTuple!U)) 718 { 719 buf[i] = cast(char)('0' + (n % 10)); 720 n /= 10; 721 } 722 } 723 724 /// ditto 725 char[U] toDecFixed(size_t U, N : ulong)(N n) 726 if (!isSigned!N) 727 { 728 char[U] buf; 729 toDecFixed(n, buf); 730 return buf; 731 } 732 733 unittest 734 { 735 assert(toDecFixed!6(12345u) == "012345"); 736 } 737 738 string numberToString(T)(T v) 739 if (isNumeric!T) 740 { 741 static if (is(T : real)) 742 return fpToString(v); 743 else 744 return toDec(v); 745 } 746 747 // ************************************************************************ 748 749 /// Simpler implementation of Levenshtein string distance 750 int stringDistance(string s, string t) 751 { 752 int n = cast(int)s.length; 753 int m = cast(int)t.length; 754 if (n == 0) return m; 755 if (m == 0) return n; 756 int[][] distance = new int[][](n+1, m+1); // matrix 757 int cost=0; 758 //init1 759 foreach (i; 0..n+1) distance[i][0]=i; 760 foreach (j; 0..m+1) distance[0][j]=j; 761 //find min distance 762 foreach (i; 1..n+1) 763 foreach (j; 1..m+1) 764 { 765 cost = t[j-1] == s[i-1] ? 0 : 1; 766 distance[i][j] = min( 767 distance[i-1][j ] + 1, 768 distance[i ][j-1] + 1, 769 distance[i-1][j-1] + cost 770 ); 771 } 772 return distance[n][m]; 773 } 774 775 /// Return a number between 0.0 and 1.0 indicating how similar two strings are 776 /// (1.0 if identical) 777 float stringSimilarity(string string1, string string2) 778 { 779 float dis = stringDistance(string1, string2); 780 float maxLen = string1.length; 781 if (maxLen < string2.length) 782 maxLen = string2.length; 783 if (maxLen == 0) 784 return 1; 785 else 786 return 1f - dis/maxLen; 787 } 788 789 /// Select best match from a list of items. 790 /// Returns null if none are above the threshold. 791 string selectBestFrom(in string[] items, string target, float threshold = 0.7) 792 { 793 string found = null; 794 float best = 0; 795 796 foreach (item; items) 797 { 798 float match = stringSimilarity(toLower(item),toLower(target)); 799 if (match>threshold && match>=best) 800 { 801 best = match; 802 found = item; 803 } 804 } 805 806 return found; 807 } 808 809 // ************************************************************************ 810 811 import std.random; 812 813 string randomString(int length=20, string chars="abcdefghijklmnopqrstuvwxyz") 814 { 815 char[] result = new char[length]; 816 foreach (ref c; result) 817 c = chars[uniform(0, $)]; 818 return assumeUnique(result); 819 }