1 /** 2 * Utility code related to string and text processing. 3 * 4 * License: 5 * This Source Code Form is subject to the terms of 6 * the Mozilla Public License, v. 2.0. If a copy of 7 * the MPL was not distributed with this file, You 8 * can obtain one at http://mozilla.org/MPL/2.0/. 9 * 10 * Authors: 11 * Vladimir Panteleev <ae@cy.md> 12 */ 13 14 module ae.utils.text; 15 16 import core.stdc.stdio : snprintf, sscanf; 17 import core.stdc.string; 18 19 import std.algorithm; 20 import std.ascii; 21 import std.exception; 22 import std.conv; 23 import std.format; 24 import std.range.primitives; 25 import std.string; 26 import std.traits; 27 import std.typetuple; 28 29 import ae.utils.array; 30 import ae.utils.meta : rangeTuple; 31 import ae.utils.text.parsefp; 32 import ae.utils.textout; 33 34 private alias indexOf = std..string.indexOf; 35 36 public import ae.utils.array : contains; 37 public import ae.utils.text.ascii : ascii, decimalSize, toDec, toDecFixed, asciiToLower, asciiToUpper; 38 public import ae.utils.text.functor : formatted; 39 40 deprecated("legacy transitive import - please `import ae.utils.text.ascii;`.") 41 public import ae.utils.text.ascii : DecimalSize; 42 43 // ************************************************************************ 44 45 /// UFCS helper 46 string formatAs(T)(auto ref T obj, string fmt) 47 { 48 return format(fmt, obj); 49 } 50 51 /// 52 unittest 53 { 54 assert(5.formatAs("%03d") == "005"); 55 } 56 57 // ************************************************************************ 58 59 /// Consume a LF or CRLF terminated line from s. 60 /// Sets s to null and returns the remainder 61 /// if there is no line terminator in s. 62 T[] eatLine(T)(ref T[] s, bool eatIncompleteLines = true) 63 { 64 return s.skipUntil([T('\n')], eatIncompleteLines).chomp(); 65 } 66 67 deprecated template eatLine(OnEof onEof) 68 { 69 T[] eatLine(T)(ref T[] s) 70 { 71 return s.eatUntil!onEof([T('\n')]).chomp(); 72 } 73 } 74 75 unittest 76 { 77 string s = "Hello\nworld"; 78 assert(s.eatLine() == "Hello"); 79 assert(s.eatLine() == "world"); 80 assert(s is null); 81 assert(s.eatLine() is null); 82 } 83 84 // Uses memchr (not Boyer-Moore), best for short strings. 85 /// An implementation of `replace` optimized for common cases (short strings). 86 T[] fastReplace(T)(T[] what, T[] from, T[] to) 87 if (T.sizeof == 1) // TODO (uses memchr) 88 { 89 alias Unqual!T U; 90 91 // debug scope(failure) std.stdio.writeln("fastReplace crashed: ", [what, from, to]); 92 enum RAM = cast(U*)null; 93 94 if (what.length < from.length || from.length==0) 95 return what; 96 97 if (from.length==1) 98 { 99 auto fromc = from[0]; 100 if (to.length==1) 101 { 102 auto p = cast(T*)memchr(what.ptr, fromc, what.length); 103 if (!p) 104 return what; 105 106 T[] result = what.dup; 107 auto delta = result.ptr - what.ptr; 108 auto toChar = to[0]; 109 auto end = what.ptr + what.length; 110 do 111 { 112 (cast(U*)p)[delta] = toChar; // zomg hax lol 113 p++; 114 p = cast(T*)memchr(p, fromc, end - p); 115 } while (p); 116 return result; 117 } 118 else 119 { 120 auto p = cast(immutable(T)*)memchr(what.ptr, fromc, what.length); 121 if (!p) 122 return what; 123 124 auto sb = StringBuilder(what.length); 125 do 126 { 127 sb.put(what[0..p-what.ptr], to); 128 what = what[p-what.ptr+1..$]; 129 p = cast(immutable(T)*)memchr(what.ptr, fromc, what.length); 130 } 131 while (p); 132 133 sb.put(what); 134 return sb.get(); 135 } 136 } 137 138 auto head = from[0]; 139 auto tail = from[1..$]; 140 141 auto p = cast(T*)what.ptr; 142 auto end = p + what.length - tail.length; 143 p = cast(T*)memchr(p, head, end-p); 144 while (p) 145 { 146 p++; 147 if (p[0..tail.length] == tail) 148 { 149 if (from.length == to.length) 150 { 151 T[] result = what.dup; 152 auto deltaMinusOne = (result.ptr - what.ptr) - 1; 153 154 goto replaceA; 155 dummyA: // compiler complains 156 157 do 158 { 159 p++; 160 if (p[0..tail.length] == tail) 161 { 162 replaceA: 163 (cast(U*)p+deltaMinusOne)[0..to.length] = to[]; 164 } 165 p = cast(T*)memchr(p, head, end-p); 166 } 167 while (p); 168 169 return result; 170 } 171 else 172 { 173 auto start = cast(T*)what.ptr; 174 auto sb = StringBuilder(what.length); 175 goto replaceB; 176 dummyB: // compiler complains 177 178 do 179 { 180 p++; 181 if (p[0..tail.length] == tail) 182 { 183 replaceB: 184 sb.put(RAM[cast(size_t)start .. cast(size_t)p-1], to); 185 start = p + tail.length; 186 what = what[start-what.ptr..$]; 187 } 188 else 189 { 190 what = what[p-what.ptr..$]; 191 } 192 p = cast(T*)memchr(what.ptr, head, what.length); 193 } 194 while (p); 195 196 //sb.put(what); 197 sb.put(RAM[cast(size_t)start..cast(size_t)(what.ptr+what.length)]); 198 return sb.get(); 199 } 200 201 assert(0); 202 } 203 p = cast(T*)memchr(p, head, end-p); 204 } 205 206 return what; 207 } 208 209 unittest 210 { 211 import std.array; 212 void test(string haystack, string from, string to) 213 { 214 auto description = `("` ~ haystack ~ `", "` ~ from ~ `", "` ~ to ~ `")`; 215 216 auto r1 = fastReplace(haystack, from, to); 217 auto r2 = replace(haystack, from, to); 218 assert(r1 == r2, `Bad replace: ` ~ description ~ ` == "` ~ r1 ~ `"`); 219 220 if (r1 == haystack) 221 assert(r1 is haystack, `Pointless reallocation: ` ~ description); 222 } 223 224 test("Mary had a little lamb", "a", "b"); 225 test("Mary had a little lamb", "a", "aaa"); 226 test("Mary had a little lamb", "Mary", "Lucy"); 227 test("Mary had a little lamb", "Mary", "Jimmy"); 228 test("Mary had a little lamb", "lamb", "goat"); 229 test("Mary had a little lamb", "lamb", "sheep"); 230 test("Mary had a little lamb", " l", " x"); 231 test("Mary had a little lamb", " l", " xx"); 232 233 test("Mary had a little lamb", "X" , "Y" ); 234 test("Mary had a little lamb", "XX", "Y" ); 235 test("Mary had a little lamb", "X" , "YY"); 236 test("Mary had a little lamb", "XX", "YY"); 237 test("Mary had a little lamb", "aX", "Y" ); 238 test("Mary had a little lamb", "aX", "YY"); 239 240 test("foo", "foobar", "bar"); 241 } 242 243 /// An implementation of `split` optimized for common cases. Allocates only once. 244 T[][] fastSplit(T, U)(T[] s, U d) 245 if (is(Unqual!T == Unqual!U)) 246 { 247 if (!s.length) 248 return null; 249 250 auto p = cast(T*)memchr(s.ptr, d, s.length); 251 if (!p) 252 return [s]; 253 254 size_t n; 255 auto end = s.ptr + s.length; 256 do 257 { 258 n++; 259 p++; 260 p = cast(T*) memchr(p, d, end-p); 261 } 262 while (p); 263 264 auto result = new T[][n+1]; 265 n = 0; 266 auto start = s.ptr; 267 p = cast(T*) memchr(start, d, s.length); 268 do 269 { 270 result[n++] = start[0..p-start]; 271 start = ++p; 272 p = cast(T*) memchr(p, d, end-p); 273 } 274 while (p); 275 result[n] = start[0..end-start]; 276 277 return result; 278 } 279 280 /// Like `splitLines`, but does not attempt to split on Unicode line endings. 281 /// Only splits on `"\r"`, `"\n"`, and `"\r\n"`. 282 T[][] splitAsciiLines(T)(T[] text) 283 if (is(Unqual!T == char)) 284 { 285 auto lines = text.fastSplit('\n'); 286 foreach (ref line; lines) 287 if (line.length && line[$-1]=='\r') 288 line = line[0..$-1]; 289 return lines; 290 } 291 292 unittest 293 { 294 assert(splitAsciiLines("a\nb\r\nc\r\rd\n\re\r\n\nf") == ["a", "b", "c\r\rd", "\re", "", "f"]); 295 assert(splitAsciiLines(string.init) == splitLines(string.init)); 296 } 297 298 /// Like std.string.split (one argument version, which splits by 299 /// whitespace), but only splits by ASCII and does not autodecode. 300 T[][] asciiSplit(T)(T[] text) 301 if (is(Unqual!T == char)) 302 { 303 bool inWhitespace = true; 304 size_t wordStart; 305 T[][] result; 306 307 void endWord(size_t p) 308 { 309 if (!inWhitespace) 310 { 311 result ~= text[wordStart..p]; 312 inWhitespace = true; 313 } 314 } 315 316 foreach (p, c; text) 317 if (std.ascii.isWhite(c)) 318 endWord(p); 319 else 320 if (inWhitespace) 321 { 322 inWhitespace = false; 323 wordStart = p; 324 } 325 endWord(text.length); 326 return result; 327 } 328 329 unittest 330 { 331 foreach (s; ["", " ", "a", " a", "a ", "a b", " a b", "a b ", " a b ", 332 " ", " a", "a ", "a b", "a b ", "a b c"]) 333 assert(s.split == s.asciiSplit, format("Got %s, expected %s", s.asciiSplit, s.split)); 334 } 335 336 /// Like `strip`, but only removes ASCII whitespace. 337 T[] asciiStrip(T)(T[] s) 338 if (is(Unqual!T == char)) 339 { 340 while (s.length && isWhite(s[0])) 341 s = s[1..$]; 342 while (s.length && isWhite(s[$-1])) 343 s = s[0..$-1]; 344 return s; 345 } 346 347 /// 348 unittest 349 { 350 string s = "Hello, world!"; 351 assert(asciiStrip(s) is s); 352 assert(asciiStrip("\r\n\tHello ".dup) == "Hello"); 353 } 354 355 /// Covering slice-list of s with interleaved whitespace. 356 T[][] segmentByWhitespace(T)(T[] s) 357 if (is(Unqual!T == char)) 358 { 359 if (!s.length) 360 return null; 361 362 T[][] segments; 363 bool wasWhite = isWhite(s[0]); 364 size_t start = 0; 365 foreach (p, char c; s) 366 { 367 bool isWhite = isWhite(c); 368 if (isWhite != wasWhite) 369 segments ~= s[start..p], 370 start = p; 371 wasWhite = isWhite; 372 } 373 segments ~= s[start..$]; 374 375 return segments; 376 } 377 378 /// Replaces runs of ASCII whitespace which contain a newline (`'\n'`) into a single space. 379 T[] newlinesToSpaces(T)(T[] s) 380 if (is(Unqual!T == char)) 381 { 382 auto slices = segmentByWhitespace(s); 383 foreach (ref slice; slices) 384 if (slice.contains("\n")) 385 slice = " "; 386 return slices.join(); 387 } 388 389 /// Replaces all runs of ASCII whitespace with a single space. 390 ascii normalizeWhitespace(ascii s) 391 { 392 auto slices = segmentByWhitespace(strip(s)); 393 foreach (i, ref slice; slices) 394 if (i & 1) // odd 395 slice = " "; 396 return slices.join(); 397 } 398 399 /// 400 unittest 401 { 402 assert(normalizeWhitespace(" Mary had\ta\nlittle\r\n\tlamb") == "Mary had a little lamb"); 403 } 404 405 /// Splits out words from a camel-cased string. 406 /// All-uppercase words are returned as a single word. 407 string[] splitByCamelCase(string s) 408 { 409 string[] result; 410 size_t start = 0; 411 foreach (i; 1..s.length+1) 412 if (i == s.length 413 || (isLower(s[i-1]) && isUpper(s[i])) 414 || (i+1 < s.length && isUpper(s[i-1]) && isUpper(s[i]) && isLower(s[i+1])) 415 ) 416 { 417 result ~= s[start..i]; 418 start = i; 419 } 420 return result; 421 } 422 423 /// 424 unittest 425 { 426 assert(splitByCamelCase("parseIPString") == ["parse", "IP", "String"]); 427 assert(splitByCamelCase("IPString") == ["IP", "String"]); 428 } 429 430 /// Join an array of words into a camel-cased string. 431 string camelCaseJoin(string[] arr) 432 { 433 if (!arr.length) 434 return null; 435 string result = arr[0]; 436 foreach (s; arr[1..$]) 437 result ~= std.ascii.toUpper(s[0]) ~ s[1..$]; 438 return result; 439 } 440 441 unittest 442 { 443 assert("parse-IP-string".split('-').camelCaseJoin() == "parseIPString"); 444 } 445 446 // ************************************************************************ 447 448 /// Like std.string.wrap, but preserves whitespace at line start and 449 /// between (non-wrapped) words. 450 string verbatimWrap( 451 string s, 452 size_t columns = 80, 453 string firstIndent = null, 454 string indent = null, 455 size_t tabWidth = 8, 456 ) 457 { 458 if (!s.length) 459 return s; 460 461 import std.uni : isWhite; 462 import std.range; 463 464 // Result buffer. Append-only (contains only text which has been wrapped). 465 string result; 466 // Index in `s` corresponding to the end of `result` 467 size_t start; 468 // Index in `s` corresponding to after the last newline in `result` 469 size_t lineStart; 470 // Current column 471 size_t col; 472 // Was the previous character we looked at whitespace? 473 bool wasWhite; 474 // We need to add an indent at the next (non-newline) character. 475 bool needIndent; 476 477 result = firstIndent; 478 col = firstIndent.walkLength; 479 auto indentWidth = indent.walkLength; 480 481 void flush(size_t pos) 482 { 483 if (col > columns && start > lineStart) 484 { 485 result ~= "\n" ~ indent; 486 col = indentWidth + (pos - start); 487 488 // Consume whitespace at line break 489 size_t numWhite; 490 foreach (i, c; s[start .. $]) 491 if (isWhite(c)) 492 numWhite = i; 493 else 494 break; 495 start += numWhite; 496 lineStart = start; 497 } 498 result ~= s[start .. pos]; 499 start = pos; 500 } 501 502 foreach (pos, dchar c; s) 503 { 504 auto atWhite = isWhite(c); 505 if (atWhite && !wasWhite) 506 flush(pos); 507 if (c == '\n') 508 { 509 flush(pos); 510 result ~= "\n"; 511 start++; // past newline 512 lineStart = start; 513 needIndent = true; 514 col = 0; 515 } 516 else 517 { 518 if (needIndent) 519 { 520 assert(col == 0); 521 result ~= indent; 522 col += indentWidth; 523 needIndent = false; 524 } 525 if (c == '\t') 526 col += tabWidth; 527 else 528 col++; 529 } 530 wasWhite = atWhite; 531 } 532 flush(s.length); 533 if (col) 534 result ~= "\n"; // trailing newline 535 536 return result; 537 } 538 539 // ************************************************************************ 540 541 /// Case-insensitive ASCII string. 542 alias CIAsciiString = NormalizedArray!(immutable(char), s => s.byCodeUnit.map!(std.ascii.toLower)); 543 544 /// 545 unittest 546 { 547 CIAsciiString s = "test"; 548 assert(s == "TEST"); 549 assert(s >= "Test" && s <= "Test"); 550 assert(CIAsciiString("a") == CIAsciiString("A")); 551 assert(CIAsciiString("a") != CIAsciiString("B")); 552 assert(CIAsciiString("a") < CIAsciiString("B")); 553 assert(CIAsciiString("A") < CIAsciiString("b")); 554 assert(CIAsciiString("я") != CIAsciiString("Я")); 555 } 556 557 import std.uni : toLower; 558 559 /// Case-insensitive Unicode string. 560 alias CIUniString = NormalizedArray!(immutable(char), s => s.map!(toLower)); 561 562 /// 563 unittest 564 { 565 CIUniString s = "привет"; 566 assert(s == "ПРИВЕТ"); 567 assert(s >= "Привет" && s <= "Привет"); 568 assert(CIUniString("я") == CIUniString("Я")); 569 assert(CIUniString("а") != CIUniString("Б")); 570 assert(CIUniString("а") < CIUniString("Б")); 571 assert(CIUniString("А") < CIUniString("б")); 572 } 573 574 // ************************************************************************ 575 576 import std.utf; 577 578 /// Convert any data to a valid UTF-8 bytestream, so D's string functions can 579 /// properly work on it. 580 string rawToUTF8(in char[] s) 581 { 582 auto d = new dchar[s.length]; 583 foreach (i, char c; s) 584 d[i] = c; 585 return toUTF8(d); 586 } 587 588 /// Undo rawToUTF8. 589 ascii UTF8ToRaw(in char[] r) pure 590 { 591 auto s = new char[r.length]; 592 size_t i = 0; 593 foreach (dchar c; r) 594 { 595 assert(c < '\u0100'); 596 s[i++] = cast(char)c; 597 } 598 return s[0..i]; 599 } 600 601 unittest 602 { 603 char[1] c; 604 for (int i=0; i<256; i++) 605 { 606 c[0] = cast(char)i; 607 assert(UTF8ToRaw(rawToUTF8(c[])) == c[], format("%s -> %s -> %s", cast(ubyte[])c[], cast(ubyte[])rawToUTF8(c[]), cast(ubyte[])UTF8ToRaw(rawToUTF8(c[])))); 608 } 609 } 610 611 /// Where a delegate with this signature is required. 612 string nullStringTransform(in char[] s) { return to!string(s); } 613 614 /// Like readText, but with in-memory data. 615 /// Reverse of ae.utils.array.bytes (for strings). 616 inout(char)[] asText(inout(ubyte)[] bytes) 617 { 618 auto s = cast(inout(char)[]) bytes; 619 validate(s); 620 return s; 621 } 622 623 /// Lossily convert arbitrary data into a valid UTF-8 string. 624 string forceValidUTF8(ascii s) 625 { 626 try 627 { 628 validate(s); 629 return s; 630 } 631 catch (UTFException) 632 return rawToUTF8(s); 633 } 634 635 // ************************************************************************ 636 637 /// Return the slice up to the first NUL character, 638 /// or of the whole array if none is found. 639 C[] fromZArray(C, n)(ref C[n] arr) 640 { 641 auto p = arr.representation.countUntil(0); 642 return arr[0 .. p<0 ? $ : p]; 643 } 644 645 /// ditto 646 C[] fromZArray(C)(C[] arr) 647 { 648 auto p = arr.representation.countUntil(0); 649 return arr[0 .. p<0 ? $ : p]; 650 } 651 652 unittest 653 { 654 char[4] arr = "ab\0d"; 655 assert(arr.fromZArray == "ab"); 656 arr[] = "abcd"; 657 assert(arr.fromZArray == "abcd"); 658 } 659 660 unittest 661 { 662 string arr = "ab\0d"; 663 assert(arr.fromZArray == "ab"); 664 arr = "abcd"; 665 assert(arr.fromZArray == "abcd"); 666 } 667 668 // ************************************************************************ 669 670 /// Formats binary data as a hex dump (three-column layout consisting of hex 671 /// offset, byte values in hex, and printable low-ASCII characters). 672 string hexDump(const(void)[] b) 673 { 674 auto data = cast(const(ubyte)[]) b; 675 assert(data.length); 676 size_t i=0; 677 string s; 678 while (i<data.length) 679 { 680 s ~= format("%08X: ", i); 681 foreach (x; 0..16) 682 { 683 if (i+x<data.length) 684 s ~= format("%02X ", data[i+x]); 685 else 686 s ~= " "; 687 if (x==7) 688 s ~= "| "; 689 } 690 s ~= " "; 691 foreach (x; 0..16) 692 { 693 if (i+x<data.length) 694 if (data[i+x]==0) 695 s ~= ' '; 696 else 697 if (data[i+x]<32 || data[i+x]>=128) 698 s ~= '.'; 699 else 700 s ~= cast(char)data[i+x]; 701 else 702 s ~= ' '; 703 } 704 s ~= "\n"; 705 i += 16; 706 } 707 return s; 708 } 709 710 import std.conv; 711 712 /// Parses `s` as a hexadecimal number into an integer of type `T`. 713 T fromHex(T : ulong = uint, C)(const(C)[] s) 714 { 715 T result = parse!T(s, 16); 716 enforce(s.length==0, new ConvException("Could not parse entire string")); 717 return result; 718 } 719 720 /// Parses `hex` into an array of bytes. 721 /// `hex.length` should be even. 722 ubyte[] arrayFromHex(in char[] hex) 723 { 724 auto buf = new ubyte[hex.length/2]; 725 arrayFromHex(hex, buf); 726 return buf; 727 } 728 729 /// Policy for `parseHexDigit`. 730 struct HexParseConfig 731 { 732 bool checked = true; /// Throw on invalid digits. 733 bool lower = true; /// Accept lower-case digits. 734 bool upper = true; /// Accept upper-case digits. 735 } 736 737 /// Parse a single hexadecimal digit according to the policy in `config`. 738 ubyte parseHexDigit(HexParseConfig config = HexParseConfig.init)(char c) 739 { 740 static assert(config.lower || config.upper, 741 "Must parse at least either lower or upper case digits"); 742 static if (config.checked) 743 { 744 switch (c) 745 { 746 case '0': .. case '9': return cast(ubyte)(c - '0'); 747 case 'a': .. case 'f': return cast(ubyte)(c - 'a' + 10); 748 case 'A': .. case 'F': return cast(ubyte)(c - 'A' + 10); 749 default: throw new Exception("Bad hex digit: " ~ c); 750 } 751 } 752 else 753 { 754 if (c <= '9') 755 return cast(ubyte)(c - '0'); 756 static if (config.lower && config.upper) 757 { 758 if (c < 'a') 759 return cast(ubyte)(c - 'A' + 10); 760 else 761 return cast(ubyte)(c - 'a' + 10); 762 } 763 else 764 static if (config.lower) 765 return cast(ubyte)(c - 'a' + 10); 766 else 767 return cast(ubyte)(c - 'A' + 10); 768 } 769 } 770 771 /// Parses `hex` into the given array `buf`. 772 void arrayFromHex(HexParseConfig config = HexParseConfig.init)(in char[] hex, ubyte[] buf) 773 { 774 assert(buf.length == hex.length/2, "Wrong buffer size for arrayFromHex"); 775 for (int i=0; i<hex.length; i+=2) 776 buf[i/2] = cast(ubyte)( 777 parseHexDigit!config(hex[i ])*16 + 778 parseHexDigit!config(hex[i+1]) 779 ); 780 } 781 782 /// Parses `hex` into the given array `buf`. 783 /// Fast version for static arrays of known length. 784 void sarrayFromHex(HexParseConfig config = HexParseConfig.init, size_t N, Hex)(ref const Hex hex, ref ubyte[N] buf) 785 if (is(Hex == char[N*2])) 786 { 787 foreach (i; 0..N/4) 788 { 789 ulong chars = (cast(ulong*)hex.ptr)[i]; 790 uint res = 791 (parseHexDigit!config((chars >> (8*0)) & 0xFF) << (4*1)) | 792 (parseHexDigit!config((chars >> (8*1)) & 0xFF) << (4*0)) | 793 (parseHexDigit!config((chars >> (8*2)) & 0xFF) << (4*3)) | 794 (parseHexDigit!config((chars >> (8*3)) & 0xFF) << (4*2)) | 795 (parseHexDigit!config((chars >> (8*4)) & 0xFF) << (4*5)) | 796 (parseHexDigit!config((chars >> (8*5)) & 0xFF) << (4*4)) | 797 (parseHexDigit!config((chars >> (8*6)) & 0xFF) << (4*7)) | 798 (parseHexDigit!config((chars >> (8*7)) & 0xFF) << (4*6)); 799 (cast(uint*)buf.ptr)[i] = res; 800 } 801 foreach (i; N/4*4..N) 802 buf[i] = cast(ubyte)( 803 parseHexDigit!config(hex[i*2 ])*16 + 804 parseHexDigit!config(hex[i*2+1]) 805 ); 806 } 807 808 unittest 809 { 810 foreach (checked; TypeTuple!(false, true)) 811 foreach (lower; TypeTuple!(false, true)) 812 foreach (upper; TypeTuple!(false, true)) 813 static if (lower || upper) 814 { 815 enum config = HexParseConfig(checked, lower, upper); 816 char[18] buf; 817 foreach (n; 0..18) 818 if (lower && upper ? n & 1 : upper) 819 buf[n] = hexDigits[n % 16]; 820 else 821 buf[n] = lowerHexDigits[n % 16]; 822 ubyte[9] res; 823 sarrayFromHex!config(buf, res); 824 assert(res == [0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF, 0x01], text(res)); 825 } 826 } 827 828 /// Conversion from bytes to hexadecimal strings. 829 template toHex(alias digits = hexDigits) 830 { 831 // Note: using template constraints instead of overloads due to 832 // https://issues.dlang.org/show_bug.cgi?id=21504 833 834 enum isHexifiable(T) = 835 is(T : ulong) || // number 836 is(T : const(ubyte)[]) || // dynamic array of bytes 837 is(T : const(ubyte)[n], size_t n); // static array of bytes 838 839 enum isBuffer(T) = 840 (is(T : C[], C) && isSomeChar!C) || // dynamic array of chars 841 (is(T : C[n], n, C) && isSomeChar!C); // static array of chars 842 843 auto toHex(T, B)(auto ref T value, auto ref B buf) 844 if (isHexifiable!T && isBuffer!B) 845 { 846 // Get result length 847 static if (is(T : ulong)) 848 enum resultLength = T.sizeof * 2; 849 else 850 static if (is(T : const(ubyte)[n], size_t n)) 851 enum resultLength = T.length * 2; 852 else 853 auto resultLength = value.length * 2; 854 855 enum fixedResultLength = !is(typeof(&resultLength)); 856 857 // Ensure buffer size 858 { 859 enum fixedBufferLength = !is(T : C[], C); 860 static if (!fixedBufferLength && __traits(isRef, buf)) 861 { 862 if (buf.length < resultLength) 863 buf.length = resultLength; 864 } 865 else 866 { 867 static if (fixedResultLength && fixedBufferLength) 868 static assert(resultLength <= buf.length, "Buffer size is insufficient"); 869 else 870 assert(resultLength <= buf.length, "Buffer size is insufficient"); 871 } 872 } 873 874 static if (is(T : ulong)) 875 { 876 Unqual!T x = value; 877 foreach (i; Reverse!(rangeTuple!(T.sizeof*2))) 878 { 879 buf[i] = hexDigits[x & 0xF]; 880 x >>= 4; 881 } 882 } 883 else 884 { 885 foreach (i, b; value) 886 { 887 buf[i*2 ] = digits[b>>4]; 888 buf[i*2+1] = digits[b&15]; 889 } 890 } 891 892 alias C = typeof(buf[0]); 893 894 static if (fixedResultLength) 895 C[resultLength] result; 896 else 897 C[] result; 898 result = buf[0 .. resultLength]; 899 return result; 900 } 901 902 auto toHex(C = char, T)(auto ref T value) 903 if (isSomeChar!C && isHexifiable!T) 904 { 905 static if (is(T : const(ubyte)[])) 906 C[] buf; 907 else 908 C[T.sizeof * 2] buf; 909 return toHex(value, buf); 910 } 911 } 912 913 alias toLowerHex = toHex!lowerHexDigits; /// ditto 914 915 unittest 916 { 917 ubyte[] bytes = [0x12, 0x34]; 918 assert(toHex(bytes) == "1234"); 919 } 920 921 unittest 922 { 923 ubyte[] bytes = [0x12, 0x34]; 924 char[] buf = new char[4]; 925 toHex(bytes, buf); 926 assert(buf == "1234"); 927 } 928 929 unittest 930 { 931 char[8] buf; 932 toHex(0x01234567, buf); 933 assert(buf == "01234567"); 934 } 935 936 unittest 937 { 938 assert(toHex(0x01234567) == "01234567"); 939 } 940 941 unittest 942 { 943 ubyte[2] bytes = [0x12, 0x34]; 944 auto buf = bytes.toLowerHex(); 945 static assert(buf.length == 4); 946 assert(buf == "1234"); 947 } 948 949 unittest 950 { 951 import core.exception : AssertError; 952 953 ubyte[] a = new ubyte[10]; 954 char[] b = new char[10]; 955 toHex(a[0..1], b[0..2]); 956 assertThrown!AssertError(toHex(a[0..1], b[0..1])); 957 } 958 959 /// How many significant decimal digits does a FP type have 960 /// (determined empirically - valid for all D FP types on x86/64) 961 enum significantDigits(T : real) = 2 + 2 * T.sizeof; 962 963 /// Format string for a FP type which includes all necessary 964 /// significant digits 965 enum fpFormatString(T) = "%." ~ text(significantDigits!T) ~ "g"; 966 private template cWidthString(T) 967 { 968 static if (is(Unqual!T == float)) 969 enum cWidthString = ""; 970 else 971 static if (is(Unqual!T == double)) 972 enum cWidthString = "l"; 973 else 974 static if (is(Unqual!T == real)) 975 enum cWidthString = "L"; 976 } 977 /// C format string to exactly format a floating-point type `T`. 978 enum fpCFormatString(T) = "%." ~ text(significantDigits!T) ~ cWidthString!T ~ "g"; 979 /// C format string to scan a floating-point type `T`. 980 enum fpCScanString(T) = "%" ~ cWidthString!T ~ "f"; 981 982 private auto safeSprintf(size_t N, Args...)(ref char[N] buf, auto ref Args args) @trusted @nogc 983 { 984 return snprintf(buf.ptr, N, args); 985 } 986 987 /// Parse a floating-point number using the C standard library. 988 /// Note: might produce slightly different results than e.g. `to!double`. 989 bool fpTryParse(F, C)(ref const(C)[] s, ref F f) @trusted @nogc nothrow 990 if (isFloatingPoint!F && isSomeChar!C) 991 { 992 // Parity with glibc: 993 version (Windows) 994 { 995 // https://issues.dlang.org/show_bug.cgi?id=22302 996 if (s.representation.startsWith("nan".representation)) { s = s[3..$]; f = F.nan; return true; } 997 if (s.representation.startsWith("inf".representation)) { s = s[3..$]; f = F.infinity; return true; } 998 if (s.representation.startsWith("-inf".representation)) { s = s[4..$]; f = -F.infinity; return true; } 999 } 1000 1001 import core.stdc.stdlib : malloc, free; 1002 char[64] sbuf = void; 1003 auto buf = s.length >= sbuf.length ? cast(char*)malloc(s.length + 1) : sbuf.ptr; 1004 if (!buf) 1005 assert(false, "Memory allocation failed"); 1006 scope(exit) if (buf != sbuf.ptr) free(buf); 1007 1008 foreach (i, c; s) 1009 { 1010 if (c >= 0x100) 1011 return false; // Non-ASCII 1012 buf[i] = cast(char)c; 1013 } 1014 buf[s.length] = 0; 1015 1016 static immutable fmt = fpCScanString!F ~ "%n\0"; 1017 int read; 1018 if (!sscanf(buf, fmt.ptr, &f, &read)) 1019 return false; 1020 s = s[read .. $]; 1021 return true; 1022 } 1023 1024 /// ditto 1025 F fpParse(F, C)(const(C)[] s) 1026 if (isFloatingPoint!F && isSomeChar!C) 1027 { 1028 F f; 1029 fpTryParse(s, f).enforce("Failed to parse " ~ F.stringof); 1030 enforce(s.length == 0, "Failed to completely parse " ~ F.stringof); 1031 return f; 1032 } 1033 1034 private F fpParseAssumeValid(F)(const(char)[] s) @nogc 1035 { 1036 F f; 1037 auto res = fpTryParse(s, f); 1038 assert(res, "Failed to parse number we created"); 1039 assert(!s.length, "Failed to completely parse number we created"); 1040 return f; 1041 } 1042 1043 private auto fpToBuf(Q)(Q val) @safe nothrow @nogc 1044 { 1045 alias F = Unqual!Q; 1046 1047 /// Bypass FPU register, which may contain a different precision 1048 static F forceType(F d) { static F n; n = d; return n; } 1049 1050 enum isReal = is(F == real); 1051 1052 StaticBuf!(char, 64) buf = void; 1053 1054 // MSVC workaround from std.format: 1055 version (CRuntime_Microsoft) 1056 { 1057 import std.math : isNaN, isInfinity; 1058 immutable double v = val; // convert early to get "inf" in case of overflow 1059 { 1060 string s; 1061 if (isNaN(v)) 1062 s = "nan"; // snprintf writes 1.#QNAN 1063 else if (isInfinity(v)) 1064 s = val >= 0 ? "inf" : "-inf"; // snprintf writes 1.#INF 1065 else 1066 goto L1; 1067 buf.buf[0..s.length] = s; 1068 buf.pos = s.length; 1069 return buf; 1070 L1: 1071 } 1072 } 1073 else 1074 alias v = val; 1075 1076 buf.pos = safeSprintf(buf.buf, &fpCFormatString!F[0], forceType(v)); 1077 char[] s = buf.data(); 1078 1079 F parse(char[] s) 1080 { 1081 auto f = fpParseAssumeValid!F(s); 1082 return f; 1083 } 1084 1085 if (s != "nan" && s != "-nan" && s != "inf" && s != "-inf") 1086 { 1087 if (forceType(parse(s)) != v) 1088 { 1089 static if (isReal) 1090 { 1091 // Something funny with DM libc real parsing... e.g. 0.6885036635121051783 1092 return buf; 1093 } 1094 else 1095 // assert(false, "Initial conversion fails: " ~ format(fpFormatString!F, parse(s)) ~ " / " ~ s); 1096 assert(false, "Initial conversion fails"); 1097 } 1098 1099 auto suffixPos = s.length; 1100 foreach (i, c; s) 1101 if (c == 'e' || c == 'E') 1102 { 1103 suffixPos = i; 1104 break; 1105 } 1106 auto suffix = s[suffixPos .. $]; 1107 s = s[0 .. suffixPos]; 1108 1109 StaticBuf!(char, 64) testBuf; 1110 void render(char[] prefix) 1111 { 1112 testBuf.pos = 0; 1113 testBuf.put(prefix); 1114 testBuf.put(suffix); 1115 } 1116 F tryPrefix(char[] prefix) 1117 { 1118 render(prefix); 1119 return forceType(parse(testBuf.data)); 1120 } 1121 1122 foreach_reverse (i; 1..s.length) 1123 if (s[i]>='0' && s[i]<='8') 1124 { 1125 s[i]++; 1126 if (tryPrefix(s[0..i+1])==v) 1127 s = s[0..i+1]; 1128 else 1129 s[i]--; 1130 } 1131 while (s.length>2 && s[$-1]!='.' && tryPrefix(s[0..$-1])==v) 1132 s = s[0..$-1]; 1133 1134 render(s); 1135 return testBuf; 1136 } 1137 buf.pos = s.length; 1138 return buf; 1139 } 1140 1141 /// Get shortest string representation of a FP type that still converts to exactly the same number. 1142 template fpToString(F) 1143 { 1144 string fpToString(F v) @safe nothrow 1145 { 1146 return fpToBuf(v).data.idup; 1147 } 1148 1149 static if (!is(Unqual!F == real)) 1150 unittest 1151 { 1152 union U 1153 { 1154 ubyte[F.sizeof] bytes; 1155 Unqual!F d; 1156 string toString() const { return (fpFormatString!F ~ " %a [%(%02X %)]").format(d, d, bytes[]); } 1157 } 1158 import std.random : Xorshift, uniform; 1159 import std.stdio : stderr; 1160 Xorshift rng; 1161 foreach (n; 0..10000) 1162 { 1163 U u; 1164 foreach (ref b; u.bytes[]) 1165 b = uniform!ubyte(rng); 1166 static if (is(Unqual!F == real)) 1167 u.bytes[7] |= 0x80; // require normalized value 1168 scope(failure) stderr.writeln("Input:\t", u); 1169 auto s = fpToString(u.d); 1170 scope(failure) stderr.writeln("Result:\t", s); 1171 if (s == "nan" || s == "-nan") 1172 continue; // there are many NaNs... 1173 U r; 1174 //r.d = to!F(s); 1175 r.d = fpParse!F(s); 1176 assert(r.bytes == u.bytes, 1177 "fpToString mismatch:\nOutput:\t%s".format(r)); 1178 } 1179 } 1180 } 1181 1182 alias doubleToString = fpToString!double; /// 1183 1184 unittest 1185 { 1186 alias floatToString = fpToString!float; 1187 alias realToString = fpToString!real; 1188 alias crealToString = fpToString!(const(real)); 1189 } 1190 1191 unittest 1192 { 1193 assert(2.3841857910156251e-07.doubleToString == "2.384185791015625e-07"); 1194 assert(1.3e-07.doubleToString == "1.3e-07"); 1195 } 1196 1197 /// Like `fpToString`, but writes the result to a sink. 1198 void putFP(Writer, F)(auto ref Writer writer, F v) 1199 { 1200 writer.put(fpToBuf(v).data); 1201 } 1202 1203 1204 /// Wraps the result of `fpToString` in a non-allocating stringifiable struct. 1205 struct FPAsString(T) 1206 { 1207 private typeof(fpToBuf(T.init)) buf; 1208 1209 this(T f) 1210 { 1211 buf = fpToBuf(f); 1212 } /// 1213 1214 string toString() const pure nothrow 1215 { 1216 return buf.data.idup; 1217 } /// 1218 1219 void toString(W)(ref W w) const 1220 { 1221 static if (is(typeof(w.put(buf.data)))) 1222 w.put(buf.data); 1223 else 1224 foreach (c; buf.data) 1225 w.put(c); 1226 } /// 1227 } 1228 FPAsString!T fpAsString(T)(T f) { return FPAsString!T(f); } /// ditto 1229 1230 @safe //nothrow @nogc 1231 unittest 1232 { 1233 StaticBuf!(char, 1024) buf; 1234 buf.formattedWrite!"%s"(fpAsString(0.1)); 1235 assert(buf.data == "0.1", buf.data.idup); 1236 } 1237 1238 /// Get shortest string representation of a numeric 1239 /// type that still converts to exactly the same number. 1240 string numberToString(T)(T v) 1241 if (isNumeric!T) 1242 { 1243 static if (is(T : ulong)) 1244 return toDec(v); 1245 else 1246 return fpToString(v); 1247 } 1248 1249 // ************************************************************************ 1250 1251 /// Simpler implementation of Levenshtein string distance 1252 int stringDistance(string s, string t) 1253 { 1254 int n = cast(int)s.length; 1255 int m = cast(int)t.length; 1256 if (n == 0) return m; 1257 if (m == 0) return n; 1258 int[][] distance = new int[][](n+1, m+1); // matrix 1259 int cost=0; 1260 //init1 1261 foreach (i; 0..n+1) distance[i][0]=i; 1262 foreach (j; 0..m+1) distance[0][j]=j; 1263 //find min distance 1264 foreach (i; 1..n+1) 1265 foreach (j; 1..m+1) 1266 { 1267 cost = t[j-1] == s[i-1] ? 0 : 1; 1268 distance[i][j] = min( 1269 distance[i-1][j ] + 1, 1270 distance[i ][j-1] + 1, 1271 distance[i-1][j-1] + cost 1272 ); 1273 } 1274 return distance[n][m]; 1275 } 1276 1277 /// Return a number between 0.0 and 1.0 indicating how similar two strings are 1278 /// (1.0 if identical) 1279 float stringSimilarity(string string1, string string2) 1280 { 1281 float dis = stringDistance(string1, string2); 1282 float maxLen = string1.length; 1283 if (maxLen < string2.length) 1284 maxLen = string2.length; 1285 if (maxLen == 0) 1286 return 1; 1287 else 1288 return 1f - dis/maxLen; 1289 } 1290 1291 /// Select best match from a list of items. 1292 /// Returns -1 if none are above the threshold. 1293 sizediff_t findBestMatch(in string[] items, string target, float threshold = 0.7) 1294 { 1295 sizediff_t found = -1; 1296 float best = 0; 1297 1298 foreach (i, item; items) 1299 { 1300 float match = stringSimilarity(toLower(item),toLower(target)); 1301 if (match>threshold && match>=best) 1302 { 1303 best = match; 1304 found = i; 1305 } 1306 } 1307 1308 return found; 1309 } 1310 1311 /// Select best match from a list of items. 1312 /// Returns null if none are above the threshold. 1313 string selectBestFrom(in string[] items, string target, float threshold = 0.7) 1314 { 1315 auto index = findBestMatch(items, target, threshold); 1316 return index < 0 ? null : items[index]; 1317 } 1318 1319 // ************************************************************************ 1320 1321 /// Generate a random string with the given parameters. 1322 /// `std.random` is used as the source of randomness. 1323 /// Not cryptographically secure. 1324 string randomString()(int length=20, string chars="abcdefghijklmnopqrstuvwxyz") 1325 { 1326 import std.random; 1327 import std.range; 1328 1329 return length.iota.map!(n => chars[uniform(0, $)]).array; 1330 }