/**
 * Parses and handles Internet mail/news messages.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 */

module ae.net.ietf.message;

import std.algorithm;
import std.array;
import std.base64;
import std.conv;
import std.datetime;
import std.exception;
import std.regex;
import std.string;
import std.uri;
import std.utf;

// TODO: Replace with logging?
debug(RFC850) import std.stdio : stderr;

import ae.net.ietf.headers;
import ae.utils.array;
import ae.utils.iconv;
import ae.utils.mime;
import ae.utils.text;
import ae.utils.time;

import ae.net.ietf.wrap;

alias ae.utils.text.ascii ascii; // https://d.puremagic.com/issues/show_bug.cgi?id=12156
alias std.string.indexOf indexOf;

/// A group name paired with an article number in that group.
struct Xref
{
	string group;
	int num;
}

/// A parsed (or to-be-composed) mail/news message.
class Rfc850Message
{
	/// The raw message (as passed in a constructor).
	ascii message;

	/// The message ID, as specified at creation or in the Message-ID field.
	/// Includes the usual angular brackets.
	string id;

	/// Cross-references - for newsgroups posts, list of groups where it was
	/// posted, and article number in said group.
	Xref[] xref;

	/// The thread subject, with the leading "Re: " and list ID stripped.
	string subject;

	/// The original message subject, as it appears in the message.
	string rawSubject;

	/// The author's name, in UTF-8, stripped of quotes (no email address).
	string author;

	/// The author's email address, stripped of angular brackets.
	string authorEmail;

	/// Message date/time.
	SysTime time;

	/// A list of Message-IDs that this post is in reply to.
	/// The most recent message (and direct parent) comes last.
	string[] references;

	/// Whether this post is a reply.
	bool reply;

	/// This message's headers.
	Headers headers;

	/// The text contents of this message (UTF-8).
	/// "null" in case of an error.
	string content;

	/// The contents of this message (depends on mimeType).
	ubyte[] data;

	/// Explanation for null content.
	string error;

	/// Reflow options (RFC 2646).
	bool flowed, delsp;

	/// For a multipart message, contains the child parts.
	/// May nest more than one level.
	Rfc850Message[] parts;

	/// Properties of a multipart message's part.
	string name, fileName, description, mimeType;

	/// Parses a message string and creates a Rfc850Message.
	///
	/// Malformed input is handled on a best-effort basis: structural
	/// problems (missing multipart parts, unterminated list prefixes,
	/// malformed Xref tokens, unparseable dates) never cause
	/// out-of-bounds slicing; undecodable bodies leave `content` null
	/// and set `error`.
	this(ascii message)
	{
		this.message = message;
		debug(RFC850) scope(failure) stderr.writeln("Failure while parsing message: ", id);

		// Split headers from message, parse headers

		// TODO: this breaks binary encodings, FIXME
		auto text = message.fastReplace("\r\n", "\n");
		auto headerEnd = text.indexOf("\n\n");
		if (headerEnd < 0) headerEnd = text.length;
		auto header = text[0..headerEnd];
		// Unfold continuation lines
		header = header.fastReplace("\n\t", " ").fastReplace("\n ", " ");

		// TODO: Use a proper spec-conforming header parser
		foreach (s; header.fastSplit('\n'))
		{
			if (s == "") break;

			auto p = s.indexOf(": ");
			if (p<0) continue;
			//assert(p>0, "Bad header line: " ~ s);
			headers[s[0..p]] = s[p+2..$];
		}

		// Decode international characters in headers

		string defaultEncoding = guessDefaultEncoding(headers.get("User-Agent", null));

		foreach (string key, ref string value; headers)
			if (hasHighAsciiChars(value))
				value = decodeEncodedText(value, defaultEncoding);

		// Decode transfer encoding

		ascii rawContent = text[min(headerEnd+2, $)..$];

		if ("Content-Transfer-Encoding" in headers)
			try
				rawContent = decodeTransferEncoding(rawContent, headers["Content-Transfer-Encoding"]);
			catch (Exception e)
			{
				rawContent = null;
				error = "Error decoding " ~ headers["Content-Transfer-Encoding"] ~ " message: " ~ e.msg;
			}

		// Decode message

		data = cast(ubyte[])rawContent;

		TokenHeader contentType, contentDisposition;
		if ("Content-Type" in headers)
			contentType = decodeTokenHeader(headers["Content-Type"]);
		if ("Content-Disposition" in headers)
			contentDisposition = decodeTokenHeader(headers["Content-Disposition"]);
		mimeType = toLower(contentType.value);
		flowed = contentType.properties.get("format", "fixed") == "flowed";
		delsp = contentType.properties.get("delsp", "no") == "yes";

		if (rawContent)
		{
			if (!mimeType || mimeType == "text/plain")
			{
				if ("charset" in contentType.properties)
					content = decodeEncodedText(rawContent, contentType.properties["charset"]);
				else
					content = decodeEncodedText(rawContent, defaultEncoding);
			}
			else
			if (mimeType.startsWith("multipart/") && "boundary" in contentType.properties)
			{
				string boundary = contentType.properties["boundary"];
				auto end = rawContent.indexOf("--" ~ boundary ~ "--");
				if (end < 0)
					end = rawContent.length;
				rawContent = rawContent[0..end];

				auto rawParts = rawContent.split("--" ~ boundary ~ "\n");
				// rawParts[0] is the text before the first boundary. For an
				// empty body split() yields an empty array, so the slice
				// below must be guarded.
				if (rawParts.length > 1)
					foreach (rawPart; rawParts[1..$])
					{
						auto part = new Rfc850Message(rawPart);
						// The first part with text content becomes our content
						if (part.content && !content)
							content = part.content;
						parts ~= part;
					}

				if (!content)
				{
					if (rawParts.length && rawParts[0].asciiStrip().length)
						content = rawParts[0]; // default content to multipart stub
					else
						error = "Couldn't find text part in this " ~ mimeType ~ " message";
				}
			}
			else
				error = "Don't know how parse " ~ mimeType ~ " message";
		}

		// Strip PGP signature away to a separate "attachment"

		enum PGP_START = "-----BEGIN PGP SIGNED MESSAGE-----\n";
		enum PGP_DELIM = "\n-----BEGIN PGP SIGNATURE-----\n";
		enum PGP_END   = "\n-----END PGP SIGNATURE-----";
		if (content.startsWith(PGP_START) &&
			content.contains(PGP_DELIM) &&
			content.asciiStrip().endsWith(PGP_END))
		{
			// Don't attempt to create meaningful signature files... just get the clutter out of the way
			content = content.asciiStrip();
			auto p = content.indexOf(PGP_DELIM);

			// PGP_DELIM ends with the newline PGP_END starts with, so for an
			// empty signature the two markers overlap by one character.
			size_t sigEnd = content.length - PGP_END.length;
			size_t sigStart = p + PGP_DELIM.length;
			if (sigStart > sigEnd)
				sigStart = sigEnd;
			auto part = new Rfc850Message(content[sigStart..sigEnd]);

			// Likewise PGP_START ends with the newline PGP_DELIM starts
			// with, so the delimiter may begin inside the start marker.
			size_t delimPos = p;
			size_t bodyStart = PGP_START.length;
			if (bodyStart > delimPos)
				bodyStart = delimPos;
			content = content[bodyStart..delimPos];

			// Drop the armor headers (everything up to the first blank line)
			p = content.indexOf("\n\n");
			if (p >= 0)
				content = content[p+2..$];
			part.fileName = "pgp.sig";
			parts ~= part;
		}

		// Decode UU-encoded attachments

		if (content.contains("\nbegin "))
		{
			auto r = regex(`^begin [0-7]+ \S+$`);
			auto lines = content.split("\n");
			size_t start;
			bool started;
			string fn;

			for (size_t i=0; i<lines.length; i++)
				if (!started && !match(lines[i], r).empty)
				{
					start = i;
					fn = lines[i].split(" ")[2];
					started = true;
				}
				else
				if (started && lines[i] == "end" && lines[i-1]=="`")
				{
					started = false;
					try
					{
						auto decoded = uudecode(lines[start+1..i]);

						auto part = new Rfc850Message();
						part.fileName = fn;
						part.mimeType = guessMime(fn);
						part.data = decoded;
						parts ~= part;

						// Cut the attachment out of the text and resume
						// scanning at the line which now follows it
						// (i is incremented by the loop; wraps if start==0).
						lines = lines[0..start] ~ lines[i+1..$];
						i = start-1;
					}
					catch (Exception e)
						debug(RFC850) stderr.writeln(e);
				}

			content = lines.join("\n");
		}

		// Parse message-part properties

		name = contentType.properties.get("name", string.init);
		fileName = contentDisposition.properties.get("filename", string.init);
		description = headers.get("Content-Description", string.init);
		if (name == fileName)
			name = null;

		// Decode references

		if ("References" in headers)
		{
			reply = true;
			auto refs = asciiStrip(headers["References"]);
			while (refs.startsWith("<"))
			{
				auto p = refs.indexOf(">");
				if (p < 0)
					break;
				references ~= refs[0..p+1];
				refs = asciiStrip(refs[p+1..$]);
			}
		}
		else
		if ("In-Reply-To" in headers)
			references = [headers["In-Reply-To"]];

		// Decode subject

		subject = rawSubject = "Subject" in headers ? decodeRfc1522(headers["Subject"]) : null;
		if (subject.startsWith("Re: "))
		{
			subject = subject[4..$];
			reply = true;
		}

		// Decode author

		author = authorEmail = "From" in headers ? decodeRfc1522(headers["From"]) : null;
		if ((author.indexOf('@') < 0 && author.indexOf(" at ") >= 0)
			|| (author.indexOf("<") < 0 && author.indexOf(">") < 0 && author.indexOf(" (") > 0 && author.endsWith(")")))
		{
			// Mailing list archive format
			assert(author == authorEmail);
			if (author.indexOf(" (") > 0 && author.endsWith(")"))
			{
				// "user at host (Real Name)"
				authorEmail = author[0 .. author.lastIndexOf(" (")].replace(" at ", "@");
				author      = author[author.lastIndexOf(" (")+2 .. $-1].decodeRfc1522();
			}
			else
			{
				// "user at host"
				authorEmail = author.replace(" at ", "@");
				author = author[0 .. author.lastIndexOf(" at ")];
			}
		}
		if (author.indexOf('<')>=0 && author.endsWith('>'))
		{
			// "Real Name <user@host>"
			auto p = author.indexOf('<');
			authorEmail = author[p+1..$-1];
			author = decodeRfc1522(asciiStrip(author[0..p]));
		}
		if (author.length>2 && author[0]=='"' && author[$-1]=='"')
			author = decodeRfc1522(asciiStrip(author[1..$-1]));
		// No usable name - fall back to the local part of the address
		if ((author == authorEmail || author == "") && authorEmail.indexOf("@") > 0)
			author = authorEmail[0..authorEmail.indexOf("@")];

		// Decode cross-references

		if ("Xref" in headers)
		{
			// The first space-separated token is skipped; the remaining
			// ones are expected to look like "group:number".
			auto xrefStrings = split(headers["Xref"], " ");
			if (xrefStrings.length)
				xrefStrings = xrefStrings[1..$];
			foreach (str; xrefStrings)
			{
				auto segs = str.split(":");
				if (segs.length < 2)
					continue; // empty token (repeated spaces) or no article number
				xref ~= Xref(segs[0], to!int(segs[1]));
			}
		}

		if ("List-ID" in headers && subject.startsWith("[") && !xref.length)
		{
			// Treat a "[list-name] " subject prefix as the group name
			auto p = subject.indexOf("] ");
			if (p > 0) // leave the subject alone if the prefix is not terminated
			{
				xref = [Xref(subject[1..p])];
				subject = subject[p+2..$];
			}
		}

		// Decode message ID

		if ("Message-ID" in headers && !id)
			id = headers["Message-ID"];

		// Decode post time

		time = Clock.currTime; // default value

		bool haveTime = false;
		if ("NNTP-Posting-Date" in headers)
		{
			try
			{
				time = parseTime!`D, j M Y H:i:s O \(\U\T\C\)`(headers["NNTP-Posting-Date"].strip());
				haveTime = true;
			}
			catch (Exception e)
			{
				// Unparseable - fall back to the Date header below,
				// consistently with how a bad Date header is tolerated.
				debug(RFC850) stderr.writeln("Bad NNTP-Posting-Date: ", e.msg);
			}
		}

		if (!haveTime && "Date" in headers)
		{
			auto str = headers["Date"].strip();
			try
				time = parseTime!(TimeFormats.RFC850)(str);
			catch (Exception e)
			try
				time = parseTime!(`D, j M Y H:i:s O`)(str);
			catch (Exception e)
			try
				time = parseTime!(`D, j M Y H:i:s e`)(str);
			catch (Exception e)
			try
				time = parseTime!(`D, j M Y H:i O`)(str);
			catch (Exception e)
			try
				time = parseTime!(`D, j M Y H:i e`)(str);
			catch (Exception e)
			{
				// fall-back to default (class creation time)
				// TODO: better behavior?
			}
		}
	}

	private this() {} // for attachments and templates

	/// Create a template Rfc850Message for a new posting to the specified groups.
	/// `groups` is a comma-separated list of group names.
	static Rfc850Message newPostTemplate(string groups)
	{
		auto post = new Rfc850Message();
		foreach (group; groups.split(","))
			post.xref ~= Xref(group);
		return post;
	}

	/// Create a template Rfc850Message for a reply to this message.
	/// The content is pre-filled with an attribution line followed by
	/// this message's text, quoted, with any signature removed.
	Rfc850Message replyTemplate()
	{
		auto post = new Rfc850Message();
		post.reply = true;
		post.xref = this.xref;
		post.references = this.references ~ this.id;
		post.subject = this.rawSubject;
		if (!post.subject.startsWith("Re:"))
			post.subject = "Re: " ~ post.subject;

		auto paragraphs = unwrapText(this.content, this.flowed, this.delsp);
		foreach (i, ref paragraph; paragraphs)
			if (paragraph.quotePrefix.length)
				paragraph.quotePrefix = ">" ~ paragraph.quotePrefix;
			else
			{
				// Signature separator - drop it and everything after it
				if (paragraph.text == "-- ")
				{
					paragraphs = paragraphs[0..i];
					break;
				}
				paragraph.quotePrefix = paragraph.text.length ? "> " : ">";
			}
		// Trim trailing empty paragraphs
		while (paragraphs.length && paragraphs[$-1].text.length==0)
			paragraphs = paragraphs[0..$-1];

		auto replyTime = time;
		replyTime.timezone = UTC();
		post.content =
			"On " ~ replyTime.formatTime!`l, j F Y \a\t H:i:s e`() ~ ", " ~ this.author ~ " wrote:\n" ~
			wrapText(paragraphs) ~
			"\n\n";
		post.flowed = true;
		post.delsp = false;

		return post;
	}

	/// Set the message text.
	/// Rewraps as necessary.
	void setText(string text)
	{
		this.content = wrapText(unwrapText(text, false, false));
		this.flowed = true;
		this.delsp = false;
	}

	/// Construct the headers and message fields.
	/// Requires `id` to be set. Throws if a header contains control characters.
	void compile()
	{
		assert(id);

		headers["Message-ID"] = id;
		headers["From"] = format(`"%s" <%s>`, author, authorEmail);
		headers["Subject"] = subject;
		headers["Newsgroups"] = xref.map!(x => x.group)().join(",");
		headers["Content-Type"] = format("text/plain; charset=utf-8; format=%s; delsp=%s", flowed ? "flowed" : "fixed", delsp ? "yes" : "no");
		headers["Content-Transfer-Encoding"] = "8bit";
		if (references.length)
		{
			headers["References"] = references.join(" ");
			headers["In-Reply-To"] = references[$-1];
		}
		if (time == SysTime.init)
			time = Clock.currTime();
		headers["Date"] = time.formatTime!(TimeFormats.RFC2822);
		headers["User-Agent"] = "ae.net.ietf.message";

		string[] lines;
		foreach (name, value; headers)
		{
			if (value.hasHighAsciiChars())
				value = value.encodeRfc1522();
			auto line = name ~ ": " ~ value;
			// Index of the first character at which the line may be folded
			// (never fold between the header name and its value).
			auto lineStart = name.length + 2;

			foreach (c; line)
				enforce(c >= 32, "Control characters in headers");

			// Fold long lines at spaces; the space stays at the start of
			// the continuation line.
			while (line.length >= 80)
			{
				auto p = line[0..80].lastIndexOf(' ');
				if (p < lineStart)
				{
					// No usable space in the first 80 characters -
					// fold at the next one, if any.
					p = 80 + line[80..$].indexOf(' ');
					if (p < 80)
						break;
				}
				lines ~= line[0..p];
				line = line[p..$];
				lineStart = 1;
			}
			lines ~= line;
		}

		message =
			lines.join("\r\n") ~
			"\r\n\r\n" ~
			splitAsciiLines(content).join("\r\n");
	}

	/// Get the Message-ID that this message is in reply to.
	@property string parentID()
	{
		return references.length ? references[$-1] : null;
	}

	/// Return the oldest known ancestor of this post, possibly
	/// this post's ID if it is the first one in the thread.
	/// May not be the thread ID - some UAs/services
	/// cut off or strip the "References" header.
	@property string firstAncestorID()
	{
		return references.length ? references[0] : id;
	}
}

unittest
{
	auto post = new Rfc850Message("From: msonke at example.org (=?ISO-8859-1?Q?S=F6nke_Martin?=)\n\nText");
	assert(post.author == "Sönke Martin");
	assert(post.authorEmail == "msonke@example.org");

	post = new Rfc850Message("Date: Tue, 06 Sep 2011 14:52 -0700\n\nText");
	assert(post.time.year == 2011);

	// A bracketed subject without a closing "] " must be left alone
	post = new Rfc850Message("List-ID: <list.example.org>\nSubject: [unterminated\n\nText");
	assert(post.subject == "[unterminated");
	assert(post.xref.length == 0);

	// A multipart message with an empty body has no parts
	post = new Rfc850Message("Content-Type: multipart/mixed; boundary=b\n\n");
	assert(post.parts.length == 0);
}

private:

/// Decode headers with international characters in them.
/// Each space-separated word of the form "=?charset?Q-or-B?payload?=" is
/// decoded and converted to UTF-8. Spaces between two adjacent decoded
/// words are dropped. A word whose payload cannot be decoded (e.g. invalid
/// Base64) is left as it appears in the input, so that one bad header
/// does not make the whole message unparseable.
string decodeRfc1522(string str)
{
	auto words = str.split(" ");
	bool[] encoded = new bool[words.length];

	foreach (wordIndex, ref word; words)
		if (word.length > 6 && word.startsWith("=?") && word.endsWith("?="))
		{
			auto parts = split(word[2..$-2], "?");
			if (parts.length != 3)
				continue;
			auto charset  = parts[0];
			auto encoding = parts[1];
			auto text     = parts[2];

			string decoded;
			try
			{
				switch (toUpper(encoding))
				{
					case "Q":
						decoded = decodeQuotedPrintable(text, true);
						break;
					case "B":
						decoded = cast(ascii)Base64.decode(text);
						break;
					default:
						continue /*foreach*/;
				}
			}
			catch (Exception e)
			{
				// Malformed payload - keep the word verbatim
				debug(RFC850) stderr.writefln("Bad encoded-word %s (%s)", word, e.msg);
				continue /*foreach*/;
			}

			word = decodeEncodedText(decoded, charset);
			encoded[wordIndex] = true;
		}

	string result;
	foreach (wordIndex, word; words)
	{
		// Whitespace between two encoded words is not part of the text
		if (wordIndex > 0 && !(encoded[wordIndex-1] && encoded[wordIndex]))
			result ~= ' ';
		result ~= word;
	}

	// If the result is not valid UTF-8 (e.g. raw 8-bit text in the
	// header), reinterpret it as ISO-8859-1.
	try
	{
		import std.utf;
		validate(result);
	}
	catch (Exception e)
		result = toUtf8(cast(ascii)result, "ISO-8859-1", true);

	return result;
}

/// Encodes an UTF-8 string to be used in headers.
string encodeRfc1522(string str)
{
	if (!str.hasHighAsciiChars())
		return str;

	// Group consecutive words containing non-ASCII characters together,
	// so that the spaces between them are encoded as well.
	string[] words;
	bool wasIntl = false;
	foreach (word; str.split(" "))
	{
		bool isIntl = word.hasHighAsciiChars();
		if (wasIntl && isIntl)
			words[$-1] ~= " " ~ word;
		else
			words ~= word;
		wasIntl = isIntl;
	}

	enum CHUNK_LENGTH_THRESHOLD = 20;

	foreach (ref word; words)
	{
		if (!word.hasHighAsciiChars())
			continue;
		string[] output;
		string s = word;
		while (s.length)
		{
			// Take whole code points until the chunk reaches the threshold
			size_t ptr = 0;
			while (ptr < s.length && ptr < CHUNK_LENGTH_THRESHOLD)
				ptr += stride(s, ptr);
			output ~= encodeRfc1522Chunk(s[0..ptr]);
			s = s[ptr..$];
		}
		word = output.join(" ");
	}
	return words.join(" ");
}

/// Encodes a single chunk of UTF-8 text as one Base64 encoded-word.
string encodeRfc1522Chunk(string str)
{
	auto result = "=?UTF-8?B?" ~ Base64.encode(cast(ubyte[])str) ~ "?=";
	return assumeUnique(result);
}

unittest
{
	auto text = "В лесу родилась ёлочка";
	assert(decodeRfc1522(encodeRfc1522(text)) == text);

	// Make sure email address isn't mangled
	assert(encodeRfc1522("Sönke Martin <msonke@example.org>").endsWith(" <msonke@example.org>"));
}

/// Decodes quoted-printable text.
/// Params:
///   s         = the encoded text (with "\n" line endings)
///   inHeaders = if true, '_' is decoded as a space
/// A '=' at the end of a line or of the input is a soft line break and is
/// dropped together with the newline. A '=' which is not followed by two
/// hexadecimal digits is copied to the output unchanged.
string decodeQuotedPrintable(string s, bool inHeaders)
{
	import std.ascii : isHexDigit;

	auto r = appender!string();
	for (size_t i=0; i<s.length; )
		if (s[i]=='=')
		{
			if (i+1 >= s.length || s[i+1] == '\n')
				i+=2; // escape newline
			else
			if (i+2 < s.length && isHexDigit(s[i+1]) && isHexDigit(s[i+2]))
				r.put(cast(char)to!ubyte(s[i+1..i+3], 16)), i+=3;
			else
				r.put(s[i++]); // truncated or malformed escape - keep literally
		}
		else
		if (s[i]=='_' && inHeaders)
			r.put(' '), i++;
		else
			r.put(s[i++]);
	return r.data;
}

unittest
{
	assert(decodeQuotedPrintable("a=3Db", false) == "a=b");
	assert(decodeQuotedPrintable("a_b", true) == "a b");
	assert(decodeQuotedPrintable("a_b", false) == "a_b");
	assert(decodeQuotedPrintable("a=\nb", false) == "ab");
	assert(decodeQuotedPrintable("a=", false) == "a");
	assert(decodeQuotedPrintable("a=4", false) == "a=4");
	assert(decodeQuotedPrintable("a=zz", false) == "a=zz");
}

/// Returns the encoding to assume for messages which do not declare one,
/// given the value of their User-Agent header (may be null).
string guessDefaultEncoding(string userAgent)
{
	switch (userAgent)
	{
		case "DFeed":
			// Early DFeed versions did not specify the encoding
			return "utf8";
		default:
			return "windows1252";
	}
}

// http://d.puremagic.com/issues/show_bug.cgi?id=7016
static import ae.sys.cmd;

/// Converts text in the given encoding to UTF-8.
/// Tries the built-in converter first, then iconv, and finally falls
/// back to interpreting the text as ISO-8859-1.
string decodeEncodedText(ascii s, string textEncoding)
{
	try
		return toUtf8(s, textEncoding, false);
	catch (Exception e)
	{
		debug(RFC850) stderr.writefln("iconv fallback for %s (%s)", textEncoding, e.msg);
		try
		{
			import ae.sys.cmd;
			return iconv(s, textEncoding);
		}
		catch (Exception e2)
		{
			debug(RFC850) stderr.writefln("ISO-8859-1 fallback (%s)", e2.msg);
			return toUtf8(s, "ISO-8859-1", false);
		}
	}
}

/// A header of the form "value; name1=value1; name2="value2"".
struct TokenHeader
{
	/// The part before the first ';'.
	string value;
	/// The parameters. Names are stored in lower case.
	string[string] properties;
}

/// Parses a header such as Content-Type or Content-Disposition.
/// Parameter names are matched case-insensitively by callers, so they
/// are stripped of whitespace and lower-cased here; values are kept as is
/// (without the surrounding quotes, if any).
TokenHeader decodeTokenHeader(string s)
{
	// Returns the text up to the next `until` character and advances s
	// past it (and any following whitespace). If there is no such
	// character, returns the rest of s.
	string take(char until)
	{
		string result;
		auto p = s.indexOf(until);
		if (p < 0)
			result = s,
			s = null;
		else
			result = s[0..p],
			s = asciiStrip(s[p+1..$]);
		return result;
	}

	TokenHeader result;
	result.value = asciiStrip(take(';'));

	while (s.length)
	{
		string name = toLower(asciiStrip(take('=')));
		string value;
		if (s.length && s[0] == '"')
		{
			s = s[1..$];
			value = take('"');
			take(';'); // skip to the next parameter
		}
		else
			value = take(';');
		result.properties[name] = value;
	}

	return result;
}

unittest
{
	auto h = decodeTokenHeader(`Text/Plain; Charset="utf-8"; Format=flowed`);
	assert(h.value == "Text/Plain");
	assert(h.properties["charset"] == "utf-8");
	assert(h.properties["format"] == "flowed");

	h = decodeTokenHeader(`multipart/mixed ; boundary = "x y"`);
	assert(h.value == "multipart/mixed");
	assert(h.properties["boundary"] == "x y");
}

/// Decodes a message body according to its Content-Transfer-Encoding.
/// Unknown encodings (as well as "7bit") return the data unchanged.
/// Throws an exception if the data is not valid for the encoding.
string decodeTransferEncoding(string data, string encoding)
{
	switch (toLower(strip(encoding)))
	{
	case "7bit":
		return data;
	case "quoted-printable":
		return decodeQuotedPrintable(data, false);
	case "base64":
		//return cast(string)Base64.decode(data.replace("\n", ""));
	{
		auto s = data.fastReplace("\n", "");
		scope(failure) debug(RFC850) stderr.writeln(s);
		return cast(string)Base64.decode(s);
	}
	default:
		return data;
	}
}

/// Decodes the body lines of a uuencoded file (the lines between,
/// and not including, the "begin" and "end" lines).
/// Throws an exception on characters outside the uuencode alphabet.
ubyte[] uudecode(string[] lines)
{
	// TODO: optimize
	//auto data = appender!(ubyte[]);  // OPTLINK says no
	ubyte[] data;
	foreach (line; lines)
	{
		if (!line.length || line.startsWith("`"))
			continue;
		// The first character encodes the number of bytes on this line
		ubyte len = to!ubyte(line[0] - 32);
		line = line[1..$];
		// Trailing spaces may have been stripped - restore them
		while (line.length % 4)
			line ~= 32;
		ubyte[] lineData;
		while (line.length)
		{
			// Four characters carry 6 bits each, i.e. three bytes
			uint v = 0;
			foreach (c; line[0..4])
				if (c == '`') // same as space
					v <<= 6;
				else
				{
					enforce(c >= 32 && c < 96, [c]);
					v = (v<<6) | (c - 32);
				}

			// Extract the bytes with shifts, so that the result
			// does not depend on the host's byte order.
			lineData ~= cast(ubyte)(v >> 16);
			lineData ~= cast(ubyte)(v >> 8);
			lineData ~= cast(ubyte)v;

			line = line[4..$];
		}
		while (len > lineData.length)
			lineData ~= 0;
		data ~= lineData[0..len];
	}
	return data;
}

unittest
{
	assert(uudecode(["#0V%T"]) == "Cat".representation);
	assert(uudecode(["#0V%T", "`"]) == "Cat".representation);
}