1 /** 2 * Utility code related to string and text processing. 3 * 4 * License: 5 * This Source Code Form is subject to the terms of 6 * the Mozilla Public License, v. 2.0. If a copy of 7 * the MPL was not distributed with this file, You 8 * can obtain one at http://mozilla.org/MPL/2.0/. 9 * 10 * Authors: 11 * Vladimir Panteleev <ae@cy.md> 12 */ 13 14 module ae.utils.text; 15 16 import std.algorithm; 17 import std.ascii; 18 import std.exception; 19 import std.conv; 20 import std.format; 21 import std.range.primitives; 22 import std.string; 23 import std.traits; 24 import std.typetuple; 25 26 import core.stdc.stdio : snprintf, sscanf; 27 import core.stdc.string; 28 29 import ae.utils.array; 30 import ae.utils.meta; 31 import ae.utils.text.parsefp; 32 import ae.utils.textout; 33 34 private alias indexOf = std..string.indexOf; 35 36 public import ae.utils.text.ascii : ascii, DecimalSize, toDec, toDecFixed, asciiToLower, asciiToUpper; 37 public import ae.utils.array : contains; 38 39 // ************************************************************************ 40 41 /// UFCS helper 42 string formatAs(T)(auto ref T obj, string fmt) 43 { 44 return format(fmt, obj); 45 } 46 47 /// Lazily formatted object 48 auto formatted(string fmt, T...)(auto ref T values) 49 { 50 static struct Formatted 51 { 52 T values; 53 54 void toString(void delegate(const(char)[]) sink) const 55 { 56 sink.formattedWrite!fmt(values); 57 } 58 59 void toString(W)(ref W writer) const 60 if (isOutputRange!(W, char)) 61 { 62 writer.formattedWrite!fmt(values); 63 } 64 } 65 return Formatted(values); 66 } 67 68 unittest 69 { 70 assert(format!"%s%s%s"("<", formatted!"%x"(64), ">") == "<40>"); 71 } 72 73 // ************************************************************************ 74 75 /// Consume a LF or CRLF terminated line from s. 76 /// Sets s to null and returns the remainder 77 /// if there is no line terminator in s. 78 T[] eatLine(T)(ref T[] s, bool eatIncompleteLines = true) 79 { 80 return s.skipUntil([T('\n')], eatIncompleteLines).chomp(); 81 } 82 83 deprecated template eatLine(OnEof onEof) 84 { 85 T[] eatLine(T)(ref T[] s) 86 { 87 return s.eatUntil!onEof([T('\n')]).chomp(); 88 } 89 } 90 91 unittest 92 { 93 string s = "Hello\nworld"; 94 assert(s.eatLine() == "Hello"); 95 assert(s.eatLine() == "world"); 96 assert(s is null); 97 assert(s.eatLine() is null); 98 } 99 100 // Uses memchr (not Boyer-Moore), best for short strings. 101 /// An implementation of `replace` optimized for common cases (short strings). 102 T[] fastReplace(T)(T[] what, T[] from, T[] to) 103 if (T.sizeof == 1) // TODO (uses memchr) 104 { 105 alias Unqual!T U; 106 107 // debug scope(failure) std.stdio.writeln("fastReplace crashed: ", [what, from, to]); 108 enum RAM = cast(U*)null; 109 110 if (what.length < from.length || from.length==0) 111 return what; 112 113 if (from.length==1) 114 { 115 auto fromc = from[0]; 116 if (to.length==1) 117 { 118 auto p = cast(T*)memchr(what.ptr, fromc, what.length); 119 if (!p) 120 return what; 121 122 T[] result = what.dup; 123 auto delta = result.ptr - what.ptr; 124 auto toChar = to[0]; 125 auto end = what.ptr + what.length; 126 do 127 { 128 (cast(U*)p)[delta] = toChar; // zomg hax lol 129 p++; 130 p = cast(T*)memchr(p, fromc, end - p); 131 } while (p); 132 return result; 133 } 134 else 135 { 136 auto p = cast(immutable(T)*)memchr(what.ptr, fromc, what.length); 137 if (!p) 138 return what; 139 140 auto sb = StringBuilder(what.length); 141 do 142 { 143 sb.put(what[0..p-what.ptr], to); 144 what = what[p-what.ptr+1..$]; 145 p = cast(immutable(T)*)memchr(what.ptr, fromc, what.length); 146 } 147 while (p); 148 149 sb.put(what); 150 return sb.get(); 151 } 152 } 153 154 auto head = from[0]; 155 auto tail = from[1..$]; 156 157 auto p = cast(T*)what.ptr; 158 auto end = p + what.length - tail.length; 159 p = cast(T*)memchr(p, head, end-p); 160 while (p) 161 { 162 p++; 163 if (p[0..tail.length] == tail) 164 { 165 if (from.length == to.length) 166 { 167 T[] result = what.dup; 168 auto deltaMinusOne = (result.ptr - what.ptr) - 1; 169 170 goto replaceA; 171 dummyA: // compiler complains 172 173 do 174 { 175 p++; 176 if (p[0..tail.length] == tail) 177 { 178 replaceA: 179 (cast(U*)p+deltaMinusOne)[0..to.length] = to[]; 180 } 181 p = cast(T*)memchr(p, head, end-p); 182 } 183 while (p); 184 185 return result; 186 } 187 else 188 { 189 auto start = cast(T*)what.ptr; 190 auto sb = StringBuilder(what.length); 191 goto replaceB; 192 dummyB: // compiler complains 193 194 do 195 { 196 p++; 197 if (p[0..tail.length] == tail) 198 { 199 replaceB: 200 sb.put(RAM[cast(size_t)start .. cast(size_t)p-1], to); 201 start = p + tail.length; 202 what = what[start-what.ptr..$]; 203 } 204 else 205 { 206 what = what[p-what.ptr..$]; 207 } 208 p = cast(T*)memchr(what.ptr, head, what.length); 209 } 210 while (p); 211 212 //sb.put(what); 213 sb.put(RAM[cast(size_t)start..cast(size_t)(what.ptr+what.length)]); 214 return sb.get(); 215 } 216 217 assert(0); 218 } 219 p = cast(T*)memchr(p, head, end-p); 220 } 221 222 return what; 223 } 224 225 unittest 226 { 227 import std.array; 228 void test(string haystack, string from, string to) 229 { 230 auto description = `("` ~ haystack ~ `", "` ~ from ~ `", "` ~ to ~ `")`; 231 232 auto r1 = fastReplace(haystack, from, to); 233 auto r2 = replace(haystack, from, to); 234 assert(r1 == r2, `Bad replace: ` ~ description ~ ` == "` ~ r1 ~ `"`); 235 236 if (r1 == haystack) 237 assert(r1 is haystack, `Pointless reallocation: ` ~ description); 238 } 239 240 test("Mary had a little lamb", "a", "b"); 241 test("Mary had a little lamb", "a", "aaa"); 242 test("Mary had a little lamb", "Mary", "Lucy"); 243 test("Mary had a little lamb", "Mary", "Jimmy"); 244 test("Mary had a little lamb", "lamb", "goat"); 245 test("Mary had a little lamb", "lamb", "sheep"); 246 test("Mary had a little lamb", " l", " x"); 247 test("Mary had a little lamb", " l", " xx"); 248 249 test("Mary had a little lamb", "X" , "Y" ); 250 test("Mary had a little lamb", "XX", "Y" ); 251 test("Mary had a little lamb", "X" , "YY"); 252 test("Mary had a little lamb", "XX", "YY"); 253 test("Mary had a little lamb", "aX", "Y" ); 254 test("Mary had a little lamb", "aX", "YY"); 255 256 test("foo", "foobar", "bar"); 257 } 258 259 /// An implementation of `split` optimized for common cases. Allocates only once. 260 T[][] fastSplit(T, U)(T[] s, U d) 261 if (is(Unqual!T == Unqual!U)) 262 { 263 if (!s.length) 264 return null; 265 266 auto p = cast(T*)memchr(s.ptr, d, s.length); 267 if (!p) 268 return [s]; 269 270 size_t n; 271 auto end = s.ptr + s.length; 272 do 273 { 274 n++; 275 p++; 276 p = cast(T*) memchr(p, d, end-p); 277 } 278 while (p); 279 280 auto result = new T[][n+1]; 281 n = 0; 282 auto start = s.ptr; 283 p = cast(T*) memchr(start, d, s.length); 284 do 285 { 286 result[n++] = start[0..p-start]; 287 start = ++p; 288 p = cast(T*) memchr(p, d, end-p); 289 } 290 while (p); 291 result[n] = start[0..end-start]; 292 293 return result; 294 } 295 296 /// Like `splitLines`, but does not attempt to split on Unicode line endings. 297 /// Only splits on `"\r"`, `"\n"`, and `"\r\n"`. 298 T[][] splitAsciiLines(T)(T[] text) 299 if (is(Unqual!T == char)) 300 { 301 auto lines = text.fastSplit('\n'); 302 foreach (ref line; lines) 303 if (line.length && line[$-1]=='\r') 304 line = line[0..$-1]; 305 return lines; 306 } 307 308 unittest 309 { 310 assert(splitAsciiLines("a\nb\r\nc\r\rd\n\re\r\n\nf") == ["a", "b", "c\r\rd", "\re", "", "f"]); 311 assert(splitAsciiLines(string.init) == splitLines(string.init)); 312 } 313 314 /// Like std.string.split (one argument version, which splits by 315 /// whitespace), but only splits by ASCII and does not autodecode. 316 T[][] asciiSplit(T)(T[] text) 317 if (is(Unqual!T == char)) 318 { 319 bool inWhitespace = true; 320 size_t wordStart; 321 T[][] result; 322 323 void endWord(size_t p) 324 { 325 if (!inWhitespace) 326 { 327 result ~= text[wordStart..p]; 328 inWhitespace = true; 329 } 330 } 331 332 foreach (p, c; text) 333 if (std.ascii.isWhite(c)) 334 endWord(p); 335 else 336 if (inWhitespace) 337 { 338 inWhitespace = false; 339 wordStart = p; 340 } 341 endWord(text.length); 342 return result; 343 } 344 345 unittest 346 { 347 foreach (s; ["", " ", "a", " a", "a ", "a b", " a b", "a b ", " a b ", 348 " ", " a", "a ", "a b", "a b ", "a b c"]) 349 assert(s.split == s.asciiSplit, format("Got %s, expected %s", s.asciiSplit, s.split)); 350 } 351 352 /// Like `strip`, but only removes ASCII whitespace. 353 T[] asciiStrip(T)(T[] s) 354 if (is(Unqual!T == char)) 355 { 356 while (s.length && isWhite(s[0])) 357 s = s[1..$]; 358 while (s.length && isWhite(s[$-1])) 359 s = s[0..$-1]; 360 return s; 361 } 362 363 /// 364 unittest 365 { 366 string s = "Hello, world!"; 367 assert(asciiStrip(s) is s); 368 assert(asciiStrip("\r\n\tHello ".dup) == "Hello"); 369 } 370 371 /// Covering slice-list of s with interleaved whitespace. 372 T[][] segmentByWhitespace(T)(T[] s) 373 if (is(Unqual!T == char)) 374 { 375 if (!s.length) 376 return null; 377 378 T[][] segments; 379 bool wasWhite = isWhite(s[0]); 380 size_t start = 0; 381 foreach (p, char c; s) 382 { 383 bool isWhite = isWhite(c); 384 if (isWhite != wasWhite) 385 segments ~= s[start..p], 386 start = p; 387 wasWhite = isWhite; 388 } 389 segments ~= s[start..$]; 390 391 return segments; 392 } 393 394 /// Replaces runs of ASCII whitespace which contain a newline (`'\n'`) into a single space. 395 T[] newlinesToSpaces(T)(T[] s) 396 if (is(Unqual!T == char)) 397 { 398 auto slices = segmentByWhitespace(s); 399 foreach (ref slice; slices) 400 if (slice.contains("\n")) 401 slice = " "; 402 return slices.join(); 403 } 404 405 /// Replaces all runs of ASCII whitespace with a single space. 406 ascii normalizeWhitespace(ascii s) 407 { 408 auto slices = segmentByWhitespace(strip(s)); 409 foreach (i, ref slice; slices) 410 if (i & 1) // odd 411 slice = " "; 412 return slices.join(); 413 } 414 415 /// 416 unittest 417 { 418 assert(normalizeWhitespace(" Mary had\ta\nlittle\r\n\tlamb") == "Mary had a little lamb"); 419 } 420 421 /// Splits out words from a camel-cased string. 422 /// All-uppercase words are returned as a single word. 423 string[] splitByCamelCase(string s) 424 { 425 string[] result; 426 size_t start = 0; 427 foreach (i; 1..s.length+1) 428 if (i == s.length 429 || (isLower(s[i-1]) && isUpper(s[i])) 430 || (i+1 < s.length && isUpper(s[i-1]) && isUpper(s[i]) && isLower(s[i+1])) 431 ) 432 { 433 result ~= s[start..i]; 434 start = i; 435 } 436 return result; 437 } 438 439 /// 440 unittest 441 { 442 assert(splitByCamelCase("parseIPString") == ["parse", "IP", "String"]); 443 assert(splitByCamelCase("IPString") == ["IP", "String"]); 444 } 445 446 /// Join an array of words into a camel-cased string. 447 string camelCaseJoin(string[] arr) 448 { 449 if (!arr.length) 450 return null; 451 string result = arr[0]; 452 foreach (s; arr[1..$]) 453 result ~= std.ascii.toUpper(s[0]) ~ s[1..$]; 454 return result; 455 } 456 457 unittest 458 { 459 assert("parse-IP-string".split('-').camelCaseJoin() == "parseIPString"); 460 } 461 462 // ************************************************************************ 463 464 /// Like std.string.wrap, but preserves whitespace at line start and 465 /// between (non-wrapped) words. 466 string verbatimWrap( 467 string s, 468 size_t columns = 80, 469 string firstIndent = null, 470 string indent = null, 471 size_t tabWidth = 8, 472 ) 473 { 474 if (!s.length) 475 return s; 476 477 import std.uni : isWhite; 478 import std.range; 479 480 // Result buffer. Append-only (contains only text which has been wrapped). 481 string result; 482 // Index in `s` corresponding to the end of `result` 483 size_t start; 484 // Index in `s` corresponding to after the last newline in `result` 485 size_t lineStart; 486 // Current column 487 size_t col; 488 // Was the previous character we looked at whitespace? 489 bool wasWhite; 490 // We need to add an indent at the next (non-newline) character. 491 bool needIndent; 492 493 result = firstIndent; 494 col = firstIndent.walkLength; 495 auto indentWidth = indent.walkLength; 496 497 void flush(size_t pos) 498 { 499 if (col > columns && start > lineStart) 500 { 501 result ~= "\n" ~ indent; 502 col = indentWidth; 503 504 // Consume whitespace at line break 505 size_t numWhite; 506 foreach (i, c; s[start .. $]) 507 if (isWhite(c)) 508 numWhite = i; 509 else 510 break; 511 start += numWhite; 512 lineStart = start; 513 } 514 result ~= s[start .. pos]; 515 start = pos; 516 } 517 518 foreach (pos, dchar c; s) 519 { 520 auto atWhite = isWhite(c); 521 if (atWhite && !wasWhite) 522 flush(pos); 523 if (c == '\n') 524 { 525 flush(pos); 526 result ~= "\n"; 527 start++; // past newline 528 lineStart = start; 529 needIndent = true; 530 col = 0; 531 } 532 else 533 { 534 if (needIndent) 535 { 536 assert(col == 0); 537 result ~= indent; 538 col += indentWidth; 539 needIndent = false; 540 } 541 if (c == '\t') 542 col += tabWidth; 543 else 544 col++; 545 } 546 wasWhite = atWhite; 547 } 548 flush(s.length); 549 if (col) 550 result ~= "\n"; // trailing newline 551 552 return result; 553 } 554 555 // ************************************************************************ 556 557 /// Case-insensitive ASCII string. 558 alias CIAsciiString = NormalizedArray!(immutable(char), s => s.byCodeUnit.map!(std.ascii.toLower)); 559 560 /// 561 unittest 562 { 563 CIAsciiString s = "test"; 564 assert(s == "TEST"); 565 assert(s >= "Test" && s <= "Test"); 566 assert(CIAsciiString("a") == CIAsciiString("A")); 567 assert(CIAsciiString("a") != CIAsciiString("B")); 568 assert(CIAsciiString("a") < CIAsciiString("B")); 569 assert(CIAsciiString("A") < CIAsciiString("b")); 570 assert(CIAsciiString("я") != CIAsciiString("Я")); 571 } 572 573 import std.uni : toLower; 574 575 /// Case-insensitive Unicode string. 576 alias CIUniString = NormalizedArray!(immutable(char), s => s.map!(toLower)); 577 578 /// 579 unittest 580 { 581 CIUniString s = "привет"; 582 assert(s == "ПРИВЕТ"); 583 assert(s >= "Привет" && s <= "Привет"); 584 assert(CIUniString("я") == CIUniString("Я")); 585 assert(CIUniString("а") != CIUniString("Б")); 586 assert(CIUniString("а") < CIUniString("Б")); 587 assert(CIUniString("А") < CIUniString("б")); 588 } 589 590 // ************************************************************************ 591 592 import std.utf; 593 594 /// Convert any data to a valid UTF-8 bytestream, so D's string functions can 595 /// properly work on it. 596 string rawToUTF8(in char[] s) 597 { 598 auto d = new dchar[s.length]; 599 foreach (i, char c; s) 600 d[i] = c; 601 return toUTF8(d); 602 } 603 604 /// Undo rawToUTF8. 605 ascii UTF8ToRaw(in char[] r) pure 606 { 607 auto s = new char[r.length]; 608 size_t i = 0; 609 foreach (dchar c; r) 610 { 611 assert(c < '\u0100'); 612 s[i++] = cast(char)c; 613 } 614 return s[0..i]; 615 } 616 617 unittest 618 { 619 char[1] c; 620 for (int i=0; i<256; i++) 621 { 622 c[0] = cast(char)i; 623 assert(UTF8ToRaw(rawToUTF8(c[])) == c[], format("%s -> %s -> %s", cast(ubyte[])c[], cast(ubyte[])rawToUTF8(c[]), cast(ubyte[])UTF8ToRaw(rawToUTF8(c[])))); 624 } 625 } 626 627 /// Where a delegate with this signature is required. 628 string nullStringTransform(in char[] s) { return to!string(s); } 629 630 /// Like readText, but with in-memory data. 631 /// Reverse of ae.utils.array.bytes (for strings). 632 inout(char)[] asText(inout(ubyte)[] bytes) 633 { 634 auto s = cast(inout(char)[]) bytes; 635 validate(s); 636 return s; 637 } 638 639 /// Lossily convert arbitrary data into a valid UTF-8 string. 640 string forceValidUTF8(ascii s) 641 { 642 try 643 { 644 validate(s); 645 return s; 646 } 647 catch (UTFException) 648 return rawToUTF8(s); 649 } 650 651 // ************************************************************************ 652 653 /// Return the slice up to the first NUL character, 654 /// or of the whole array if none is found. 655 C[] fromZArray(C, n)(ref C[n] arr) 656 { 657 auto p = arr.representation.countUntil(0); 658 return arr[0 .. p<0 ? $ : p]; 659 } 660 661 /// ditto 662 C[] fromZArray(C)(C[] arr) 663 { 664 auto p = arr.representation.countUntil(0); 665 return arr[0 .. p<0 ? $ : p]; 666 } 667 668 unittest 669 { 670 char[4] arr = "ab\0d"; 671 assert(arr.fromZArray == "ab"); 672 arr[] = "abcd"; 673 assert(arr.fromZArray == "abcd"); 674 } 675 676 unittest 677 { 678 string arr = "ab\0d"; 679 assert(arr.fromZArray == "ab"); 680 arr = "abcd"; 681 assert(arr.fromZArray == "abcd"); 682 } 683 684 // ************************************************************************ 685 686 /// Formats binary data as a hex dump (three-column layout consisting of hex 687 /// offset, byte values in hex, and printable low-ASCII characters). 688 string hexDump(const(void)[] b) 689 { 690 auto data = cast(const(ubyte)[]) b; 691 assert(data.length); 692 size_t i=0; 693 string s; 694 while (i<data.length) 695 { 696 s ~= format("%08X: ", i); 697 foreach (x; 0..16) 698 { 699 if (i+x<data.length) 700 s ~= format("%02X ", data[i+x]); 701 else 702 s ~= " "; 703 if (x==7) 704 s ~= "| "; 705 } 706 s ~= " "; 707 foreach (x; 0..16) 708 { 709 if (i+x<data.length) 710 if (data[i+x]==0) 711 s ~= ' '; 712 else 713 if (data[i+x]<32 || data[i+x]>=128) 714 s ~= '.'; 715 else 716 s ~= cast(char)data[i+x]; 717 else 718 s ~= ' '; 719 } 720 s ~= "\n"; 721 i += 16; 722 } 723 return s; 724 } 725 726 import std.conv; 727 728 /// Parses `s` as a hexadecimal number into an integer of type `T`. 729 T fromHex(T : ulong = uint, C)(const(C)[] s) 730 { 731 T result = parse!T(s, 16); 732 enforce(s.length==0, new ConvException("Could not parse entire string")); 733 return result; 734 } 735 736 /// Parses `hex` into an array of bytes. 737 /// `hex.length` should be even. 738 ubyte[] arrayFromHex(in char[] hex) 739 { 740 auto buf = new ubyte[hex.length/2]; 741 arrayFromHex(hex, buf); 742 return buf; 743 } 744 745 /// Policy for `parseHexDigit`. 746 struct HexParseConfig 747 { 748 bool checked = true; /// Throw on invalid digits. 749 bool lower = true; /// Accept lower-case digits. 750 bool upper = true; /// Accept upper-case digits. 751 } 752 753 /// Parse a single hexadecimal digit according to the policy in `config`. 754 ubyte parseHexDigit(HexParseConfig config = HexParseConfig.init)(char c) 755 { 756 static assert(config.lower || config.upper, 757 "Must parse at least either lower or upper case digits"); 758 static if (config.checked) 759 { 760 switch (c) 761 { 762 case '0': .. case '9': return cast(ubyte)(c - '0'); 763 case 'a': .. case 'f': return cast(ubyte)(c - 'a' + 10); 764 case 'A': .. case 'F': return cast(ubyte)(c - 'A' + 10); 765 default: throw new Exception("Bad hex digit: " ~ c); 766 } 767 } 768 else 769 { 770 if (c <= '9') 771 return cast(ubyte)(c - '0'); 772 static if (config.lower && config.upper) 773 { 774 if (c < 'a') 775 return cast(ubyte)(c - 'A' + 10); 776 else 777 return cast(ubyte)(c - 'a' + 10); 778 } 779 else 780 static if (config.lower) 781 return cast(ubyte)(c - 'a' + 10); 782 else 783 return cast(ubyte)(c - 'A' + 10); 784 } 785 } 786 787 /// Parses `hex` into the given array `buf`. 788 void arrayFromHex(HexParseConfig config = HexParseConfig.init)(in char[] hex, ubyte[] buf) 789 { 790 assert(buf.length == hex.length/2, "Wrong buffer size for arrayFromHex"); 791 for (int i=0; i<hex.length; i+=2) 792 buf[i/2] = cast(ubyte)( 793 parseHexDigit!config(hex[i ])*16 + 794 parseHexDigit!config(hex[i+1]) 795 ); 796 } 797 798 /// Parses `hex` into the given array `buf`. 799 /// Fast version for static arrays of known length. 800 void sarrayFromHex(HexParseConfig config = HexParseConfig.init, size_t N, Hex)(ref const Hex hex, ref ubyte[N] buf) 801 if (is(Hex == char[N*2])) 802 { 803 foreach (i; 0..N/4) 804 { 805 ulong chars = (cast(ulong*)hex.ptr)[i]; 806 uint res = 807 (parseHexDigit!config((chars >> (8*0)) & 0xFF) << (4*1)) | 808 (parseHexDigit!config((chars >> (8*1)) & 0xFF) << (4*0)) | 809 (parseHexDigit!config((chars >> (8*2)) & 0xFF) << (4*3)) | 810 (parseHexDigit!config((chars >> (8*3)) & 0xFF) << (4*2)) | 811 (parseHexDigit!config((chars >> (8*4)) & 0xFF) << (4*5)) | 812 (parseHexDigit!config((chars >> (8*5)) & 0xFF) << (4*4)) | 813 (parseHexDigit!config((chars >> (8*6)) & 0xFF) << (4*7)) | 814 (parseHexDigit!config((chars >> (8*7)) & 0xFF) << (4*6)); 815 (cast(uint*)buf.ptr)[i] = res; 816 } 817 foreach (i; N/4*4..N) 818 buf[i] = cast(ubyte)( 819 parseHexDigit!config(hex[i*2 ])*16 + 820 parseHexDigit!config(hex[i*2+1]) 821 ); 822 } 823 824 unittest 825 { 826 foreach (checked; TypeTuple!(false, true)) 827 foreach (lower; TypeTuple!(false, true)) 828 foreach (upper; TypeTuple!(false, true)) 829 static if (lower || upper) 830 { 831 enum config = HexParseConfig(checked, lower, upper); 832 char[18] buf; 833 foreach (n; 0..18) 834 if (lower && upper ? n & 1 : upper) 835 buf[n] = hexDigits[n % 16]; 836 else 837 buf[n] = lowerHexDigits[n % 16]; 838 ubyte[9] res; 839 sarrayFromHex!config(buf, res); 840 assert(res == [0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF, 0x01], text(res)); 841 } 842 } 843 844 /// Conversion from bytes to hexadecimal strings. 845 template toHex(alias digits = hexDigits) 846 { 847 /// Dynamic array version. 848 char[] toHex(in ubyte[] data, char[] buf) pure 849 { 850 assert(buf.length == data.length*2); 851 foreach (i, b; data) 852 { 853 buf[i*2 ] = digits[b>>4]; 854 buf[i*2+1] = digits[b&15]; 855 } 856 return buf; 857 } 858 859 /// Static array version. 860 char[n*2] toHex(size_t n)(in ubyte[n] data) pure 861 { 862 char[n*2] buf; 863 foreach (i, b; data) 864 { 865 buf[i*2 ] = digits[b>>4]; 866 buf[i*2+1] = digits[b&15]; 867 } 868 return buf; 869 } 870 871 /// Allocating version. 872 string toHex(in ubyte[] data) pure 873 { 874 auto buf = new char[data.length*2]; 875 foreach (i, b; data) 876 { 877 buf[i*2 ] = digits[b>>4]; 878 buf[i*2+1] = digits[b&15]; 879 } 880 return buf; 881 } 882 } 883 884 alias toLowerHex = toHex!lowerHexDigits; /// ditto 885 886 /// Conversion an integer type to a fixed-length hexadecimal string. 887 void toHex(T : ulong, size_t U = T.sizeof*2)(T n, ref char[U] buf) 888 { 889 Unqual!T x = n; 890 foreach (i; Reverse!(RangeTuple!(T.sizeof*2))) 891 { 892 buf[i] = hexDigits[x & 0xF]; 893 x >>= 4; 894 } 895 } 896 897 unittest 898 { 899 ubyte[] bytes = [0x12, 0x34]; 900 assert(toHex(bytes) == "1234"); 901 } 902 903 unittest 904 { 905 ubyte[] bytes = [0x12, 0x34]; 906 char[] buf = new char[4]; 907 toHex(bytes, buf); 908 assert(buf == "1234"); 909 } 910 911 unittest 912 { 913 char[8] buf; 914 toHex(0x01234567, buf); 915 assert(buf == "01234567"); 916 } 917 918 /// ditto 919 char[T.sizeof*2] toHex(T : ulong)(T n) 920 { 921 char[T.sizeof*2] buf; 922 toHex(n, buf); 923 return buf; 924 } 925 926 unittest 927 { 928 assert(toHex(0x01234567) == "01234567"); 929 } 930 931 unittest 932 { 933 ubyte[2] bytes = [0x12, 0x34]; 934 auto buf = bytes.toLowerHex(); 935 static assert(buf.length == 4); 936 assert(buf == "1234"); 937 } 938 939 /// How many significant decimal digits does a FP type have 940 /// (determined empirically - valid for all D FP types on x86/64) 941 enum significantDigits(T : real) = 2 + 2 * T.sizeof; 942 943 /// Format string for a FP type which includes all necessary 944 /// significant digits 945 enum fpFormatString(T) = "%." ~ text(significantDigits!T) ~ "g"; 946 private template cWidthString(T) 947 { 948 static if (is(Unqual!T == float)) 949 enum cWidthString = ""; 950 else 951 static if (is(Unqual!T == double)) 952 enum cWidthString = "l"; 953 else 954 static if (is(Unqual!T == real)) 955 enum cWidthString = "L"; 956 } 957 /// C format string to exactly format a floating-point type `T`. 958 enum fpCFormatString(T) = "%." ~ text(significantDigits!T) ~ cWidthString!T ~ "g"; 959 960 private auto safeSprintf(size_t N, Args...)(ref char[N] buf, auto ref Args args) @trusted @nogc 961 { 962 return snprintf(buf.ptr, N, args); 963 } 964 965 private auto fpToBuf(Q)(Q val) @safe nothrow @nogc 966 { 967 alias F = Unqual!Q; 968 969 /// Bypass FPU register, which may contain a different precision 970 static F forceType(F d) { static F n; n = d; return n; } 971 972 enum isReal = is(F == real); 973 974 StaticBuf!(char, 64) buf = void; 975 976 // MSVC workaround from std.format: 977 version (CRuntime_Microsoft) 978 { 979 import std.math : isNaN, isInfinity; 980 immutable double v = val; // convert early to get "inf" in case of overflow 981 { 982 string s; 983 if (isNaN(v)) 984 s = "nan"; // snprintf writes 1.#QNAN 985 else if (isInfinity(v)) 986 s = val >= 0 ? "inf" : "-inf"; // snprintf writes 1.#INF 987 else 988 goto L1; 989 buf.buf[0..s.length] = s; 990 buf.pos = s.length; 991 return buf; 992 L1: 993 } 994 } 995 else 996 alias v = val; 997 998 buf.pos = safeSprintf(buf.buf, &fpCFormatString!F[0], forceType(v)); 999 char[] s = buf.data(); 1000 1001 F parse(char[] s) 1002 { 1003 F f; 1004 auto res = tryParse(s, f); 1005 assert(res, "Failed to parse number we created"); 1006 assert(!s.length, "Failed to completely parse number we created"); 1007 return f; 1008 } 1009 1010 if (s != "nan" && s != "-nan" && s != "inf" && s != "-inf") 1011 { 1012 if (forceType(parse(s)) != v) 1013 { 1014 static if (isReal) 1015 { 1016 // Something funny with DM libc real parsing... e.g. 0.6885036635121051783 1017 return buf; 1018 } 1019 else 1020 // assert(false, "Initial conversion fails: " ~ format(fpFormatString!F, parse(s)) ~ " / " ~ s); 1021 assert(false, "Initial conversion fails"); 1022 } 1023 1024 foreach_reverse (i; 1..s.length) 1025 if (s[i]>='0' && s[i]<='8') 1026 { 1027 s[i]++; 1028 if (forceType(parse(s[0..i+1]))==v) 1029 s = s[0..i+1]; 1030 else 1031 s[i]--; 1032 } 1033 while (s.length>2 && s[$-1]!='.' && forceType(parse(s[0..$-1]))==v) 1034 s = s[0..$-1]; 1035 } 1036 buf.pos = s.length; 1037 return buf; 1038 } 1039 1040 /// Get shortest string representation of a FP type that still converts to exactly the same number. 1041 template fpToString(F) 1042 { 1043 string fpToString(F v) @safe nothrow 1044 { 1045 return fpToBuf(v).data.idup; 1046 } 1047 1048 static if (!is(Unqual!F == real)) 1049 unittest 1050 { 1051 union U 1052 { 1053 ubyte[F.sizeof] bytes; 1054 Unqual!F d; 1055 string toString() const { return (fpFormatString!F ~ " %a [%(%02X %)]").format(d, d, bytes[]); } 1056 } 1057 import std.random : Xorshift, uniform; 1058 import std.stdio : stderr; 1059 Xorshift rng; 1060 foreach (n; 0..10000) 1061 { 1062 U u; 1063 foreach (ref b; u.bytes[]) 1064 b = uniform!ubyte(rng); 1065 static if (is(Unqual!F == real)) 1066 u.bytes[7] |= 0x80; // require normalized value 1067 scope(failure) stderr.writeln("Input:\t", u); 1068 auto s = fpToString(u.d); 1069 scope(failure) stderr.writeln("Result:\t", s); 1070 if (s == "nan" || s == "-nan") 1071 continue; // there are many NaNs... 1072 U r; 1073 r.d = to!F(s); 1074 assert(r.bytes == u.bytes, 1075 "fpToString mismatch:\nOutput:\t%s".format(r)); 1076 } 1077 } 1078 } 1079 1080 alias doubleToString = fpToString!double; /// 1081 1082 unittest 1083 { 1084 alias floatToString = fpToString!float; 1085 alias realToString = fpToString!real; 1086 alias crealToString = fpToString!(const(real)); 1087 } 1088 1089 /// Like `fpToString`, but writes the result to a sink. 1090 void putFP(Writer, F)(auto ref Writer writer, F v) 1091 { 1092 writer.put(fpToBuf(v).data); 1093 } 1094 1095 1096 /// Wraps the result of `fpToString` in a non-allocating stringifiable struct. 1097 struct FPAsString(T) 1098 { 1099 private typeof(fpToBuf(T.init)) buf; 1100 1101 this(T f) 1102 { 1103 buf = fpToBuf(f); 1104 } /// 1105 1106 string toString() const pure nothrow 1107 { 1108 return buf.data.idup; 1109 } /// 1110 1111 void toString(W)(ref W w) const 1112 { 1113 static if (is(typeof(w.put(buf.data)))) 1114 w.put(buf.data); 1115 else 1116 foreach (c; buf.data) 1117 w.put(c); 1118 } /// 1119 } 1120 FPAsString!T fpAsString(T)(T f) { return FPAsString!T(f); } /// ditto 1121 1122 @safe //nothrow @nogc 1123 unittest 1124 { 1125 StaticBuf!(char, 1024) buf; 1126 buf.formattedWrite!"%s"(fpAsString(0.1)); 1127 assert(buf.data == "0.1"); 1128 } 1129 1130 /// Get shortest string representation of a numeric 1131 /// type that still converts to exactly the same number. 1132 string numberToString(T)(T v) 1133 if (isNumeric!T) 1134 { 1135 static if (is(T : ulong)) 1136 return toDec(v); 1137 else 1138 return fpToString(v); 1139 } 1140 1141 // ************************************************************************ 1142 1143 /// Simpler implementation of Levenshtein string distance 1144 int stringDistance(string s, string t) 1145 { 1146 int n = cast(int)s.length; 1147 int m = cast(int)t.length; 1148 if (n == 0) return m; 1149 if (m == 0) return n; 1150 int[][] distance = new int[][](n+1, m+1); // matrix 1151 int cost=0; 1152 //init1 1153 foreach (i; 0..n+1) distance[i][0]=i; 1154 foreach (j; 0..m+1) distance[0][j]=j; 1155 //find min distance 1156 foreach (i; 1..n+1) 1157 foreach (j; 1..m+1) 1158 { 1159 cost = t[j-1] == s[i-1] ? 0 : 1; 1160 distance[i][j] = min( 1161 distance[i-1][j ] + 1, 1162 distance[i ][j-1] + 1, 1163 distance[i-1][j-1] + cost 1164 ); 1165 } 1166 return distance[n][m]; 1167 } 1168 1169 /// Return a number between 0.0 and 1.0 indicating how similar two strings are 1170 /// (1.0 if identical) 1171 float stringSimilarity(string string1, string string2) 1172 { 1173 float dis = stringDistance(string1, string2); 1174 float maxLen = string1.length; 1175 if (maxLen < string2.length) 1176 maxLen = string2.length; 1177 if (maxLen == 0) 1178 return 1; 1179 else 1180 return 1f - dis/maxLen; 1181 } 1182 1183 /// Select best match from a list of items. 1184 /// Returns -1 if none are above the threshold. 1185 sizediff_t findBestMatch(in string[] items, string target, float threshold = 0.7) 1186 { 1187 sizediff_t found = -1; 1188 float best = 0; 1189 1190 foreach (i, item; items) 1191 { 1192 float match = stringSimilarity(toLower(item),toLower(target)); 1193 if (match>threshold && match>=best) 1194 { 1195 best = match; 1196 found = i; 1197 } 1198 } 1199 1200 return found; 1201 } 1202 1203 /// Select best match from a list of items. 1204 /// Returns null if none are above the threshold. 1205 string selectBestFrom(in string[] items, string target, float threshold = 0.7) 1206 { 1207 auto index = findBestMatch(items, target, threshold); 1208 return index < 0 ? null : items[index]; 1209 } 1210 1211 // ************************************************************************ 1212 1213 /// Generate a random string with the given parameters. 1214 /// `std.random` is used as the source of randomness. 1215 /// Not cryptographically secure. 1216 string randomString()(int length=20, string chars="abcdefghijklmnopqrstuvwxyz") 1217 { 1218 import std.random; 1219 import std.range; 1220 1221 return length.iota.map!(n => chars[uniform(0, $)]).array; 1222 }