/**
 * Parses and handles Internet mail/news messages.
 *
 * License:
 *   This Source Code Form is subject to the terms of
 *   the Mozilla Public License, v. 2.0. If a copy of
 *   the MPL was not distributed with this file, You
 *   can obtain one at http://mozilla.org/MPL/2.0/.
 *
 * Authors:
 *   Vladimir Panteleev <vladimir@thecybershadow.net>
 */

module ae.net.ietf.message;

import std.algorithm;
import std.array;
import std.base64;
import std.conv;
import std.datetime;
import std.exception;
import std.regex;
import std.string;
import std.uri;
import std.utf;

// TODO: Replace with logging?
debug(RFC850) import std.stdio : stderr;

import ae.net.ietf.headers;
import ae.utils.array;
import ae.utils.iconv;
import ae.utils.mime;
import ae.utils.regex;
import ae.utils.text;
import ae.utils.time;

import ae.net.ietf.wrap;

alias ae.utils.text.ascii.ascii ascii; // https://d.puremagic.com/issues/show_bug.cgi?id=12156
alias std.string.indexOf indexOf;

/// A single cross-reference: a group name and the
/// article number of the message in that group.
struct Xref
{
    string group;
    int num;
}

/// A parsed (or to-be-composed) Internet mail/news message.
class Rfc850Message
{
    /// The raw message (as passed in a constructor).
    ascii message;

    /// The message ID, as specified at creation or in the Message-ID field.
    /// Includes the usual angular brackets.
    string id;

    /// Cross-references - for newsgroups posts, list of groups where it was
    /// posted, and article number in said group.
    Xref[] xref;

    /// The thread subject, with the leading "Re: " and list ID stripped.
    string subject;

    /// The original message subject, as it appears in the message.
    string rawSubject;

    /// The author's name, in UTF-8, stripped of quotes (no email address).
    string author;

    /// The author's email address, stripped of angular brackets.
    string authorEmail;

    /// Message date/time.
    SysTime time;

    /// A list of Message-IDs that this post is in reply to.
    /// The most recent message (and direct parent) comes last.
    string[] references;

    /// Whether this post is a reply.
    bool reply;

    /// This message's headers.
    Headers headers;

    /// The text contents of this message (UTF-8).
    /// "null" in case of an error.
    string content;

    /// The contents of this message (depends on mimeType).
    ubyte[] data;

    /// Explanation for null content.
    string error;

    /// Reflow options (RFC 2646).
    bool flowed, delsp;

    /// For a multipart message, contains the child parts.
    /// May nest more than one level.
    Rfc850Message[] parts;

    /// Properties of a multipart message's part.
    string name, fileName, description, mimeType;

    /// Parses a message string and creates a Rfc850Message.
    ///
    /// Decoding problems in the body are reported through `error`
    /// (with `content` left null). Exceptions thrown while decoding
    /// individual headers (e.g. a malformed Xref article number or
    /// NNTP-Posting-Date) propagate to the caller.
    this(ascii message)
    {
        this.message = message;
        debug(RFC850) scope(failure) stderr.writeln("Failure while parsing message: ", id);

        // Split headers from message, parse headers

        // TODO: this breaks binary encodings, FIXME
        auto text = message.fastReplace("\r\n", "\n");
        auto headerEnd = text.indexOf("\n\n");
        if (headerEnd < 0) headerEnd = text.length;
        auto header = text[0..headerEnd];
        // Unfold continuation lines (a line break followed by a tab or a space)
        header = header.fastReplace("\n\t", " ").fastReplace("\n ", " ");

        // TODO: Use a proper spec-conforming header parser
        foreach (s; header.fastSplit('\n'))
        {
            if (s == "") break;

            auto p = s.indexOf(": ");
            if (p<0) continue;
            //assert(p>0, "Bad header line: " ~ s);
            headers[s[0..p]] = s[p+2..$];
        }

        // Decode international characters in headers

        string defaultEncoding = guessDefaultEncoding(headers.get("User-Agent", null));

        foreach (string key, ref string value; headers)
            if (hasHighAsciiChars(value))
                value = decodeEncodedText(value, defaultEncoding);

        // Decode transfer encoding

        ascii rawContent = text[min(headerEnd+2, $)..$];

        if ("Content-Transfer-Encoding" in headers)
            try
                rawContent = decodeTransferEncoding(rawContent, headers["Content-Transfer-Encoding"]);
            catch (Exception e)
            {
                rawContent = null;
                error = "Error decoding " ~ headers["Content-Transfer-Encoding"] ~ " message: " ~ e.msg;
            }

        // Decode message

        data = cast(ubyte[])rawContent;

        TokenHeader contentType, contentDisposition;
        if ("Content-Type" in headers)
            contentType = decodeTokenHeader(headers["Content-Type"]);
        if ("Content-Disposition" in headers)
            contentDisposition = decodeTokenHeader(headers["Content-Disposition"]);
        mimeType = toLower(contentType.value);
        flowed = contentType.properties.get("format", "fixed").icmp("flowed")==0;
        delsp = contentType.properties.get("delsp", "no").icmp("yes") == 0;

        if (rawContent)
        {
            if (!mimeType || mimeType == "text/plain")
            {
                if ("charset" in contentType.properties)
                    content = decodeEncodedText(rawContent, contentType.properties["charset"]);
                else
                    content = decodeEncodedText(rawContent, defaultEncoding);
            }
            else
            if (mimeType.startsWith("multipart/") && "boundary" in contentType.properties)
            {
                string boundary = contentType.properties["boundary"];
                auto end = rawContent.indexOf("--" ~ boundary ~ "--");
                if (end < 0)
                    end = rawContent.length;
                rawContent = rawContent[0..end];

                // rawParts[0] is whatever precedes the first boundary (the stub);
                // the actual parts follow. Splitting an empty body yields an
                // empty array, so guard the slice to avoid a range violation.
                auto rawParts = rawContent.split("--" ~ boundary ~ "\n");
                foreach (rawPart; rawParts.length ? rawParts[1..$] : null)
                {
                    auto part = new Rfc850Message(rawPart);
                    // The first part with text content provides this message's content
                    if (part.content && !content)
                        content = part.content;
                    parts ~= part;
                }

                if (!content)
                {
                    if (rawParts.length && rawParts[0].asciiStrip().length)
                        content = rawParts[0]; // default content to multipart stub
                    else
                        error = "Couldn't find text part in this " ~ mimeType ~ " message";
                }
            }
            else
                error = "Don't know how parse " ~ mimeType ~ " message";
        }

        // Strip PGP signature away to a separate "attachment"

        enum PGP_START = "-----BEGIN PGP SIGNED MESSAGE-----\n";
        enum PGP_DELIM = "\n-----BEGIN PGP SIGNATURE-----\n";
        enum PGP_END   = "\n-----END PGP SIGNATURE-----";
        if (content.startsWith(PGP_START))
        {
            auto signedText = content.asciiStrip();
            auto p = signedText.indexOf(PGP_DELIM);
            // The markers begin/end with a newline and may therefore share it
            // with a neighbouring marker. Require the delimiter to lie fully
            // between the start and end markers so the slices below are valid.
            if (p >= cast(ptrdiff_t)PGP_START.length &&
                signedText.endsWith(PGP_END) &&
                p + PGP_DELIM.length + PGP_END.length <= signedText.length)
            {
                // Don't attempt to create meaningful signature files... just get the clutter out of the way
                auto part = new Rfc850Message(signedText[p+PGP_DELIM.length..$-PGP_END.length]);
                content = signedText[PGP_START.length..p];
                // Skip the block preceding the first blank line (e.g. "Hash: ..." lines)
                p = content.indexOf("\n\n");
                if (p >= 0)
                    content = content[p+2..$];
                part.fileName = "pgp.sig";
                parts ~= part;
            }
        }

        // Decode UU-encoded attachments

        if (content.contains("\nbegin "))
        {
            auto r = regex(`^begin [0-7]+ \S+$`);
            auto lines = content.split("\n");
            size_t start;
            bool started;
            string fn;

            for (size_t i=0; i<lines.length; i++)
                if (!started && !match(lines[i], r).empty)
                {
                    start = i;
                    fn = lines[i].split(" ")[2];
                    started = true;
                }
                else
                if (started && lines[i] == "end" && lines[i-1]=="`")
                {
                    started = false;
                    try
                    {
                        auto data = uudecode(lines[start+1..i]);

                        auto part = new Rfc850Message();
                        part.fileName = fn;
                        part.mimeType = guessMime(fn);
                        part.data = data;
                        parts ~= part;

                        // Cut the attachment out of the text and resume
                        // scanning at the line which now follows it
                        // (i is incremented by the loop, so start-1 is intended
                        // even when it wraps around for start == 0).
                        lines = lines[0..start] ~ lines[i+1..$];
                        i = start-1;
                    }
                    catch (Exception e)
                        debug(RFC850) stderr.writeln(e);
                }

            content = lines.join("\n");
        }

        // Parse message-part properties

        name = contentType.properties.get("name", string.init);
        fileName = contentDisposition.properties.get("filename", string.init);
        description = headers.get("Content-Description", string.init);
        if (name == fileName)
            name = null;

        // Decode references

        if ("References" in headers)
        {
            reply = true;
            auto refs = asciiStrip(headers["References"]);
            while (refs.startsWith("<"))
            {
                auto p = refs.indexOf(">");
                if (p < 0)
                    break;
                references ~= refs[0..p+1];
                refs = asciiStrip(refs[p+1..$]);
            }
        }
        else
        if ("In-Reply-To" in headers)
            references = [headers["In-Reply-To"]];

        // Decode subject

        subject = rawSubject = "Subject" in headers ? decodeRfc1522(headers["Subject"]) : null;
        if (subject.startsWith("Re: "))
        {
            subject = subject[4..$];
            reply = true;
        }

        // Decode author

        /// Splits an already RFC 1522-decoded From/Reply-To value
        /// into [display name, email address].
        static string[2] decodeAuthor(string header)
        {
            string author, authorEmail;
            author = authorEmail = header;
            if ((author.indexOf('@') < 0 && author.indexOf(" at ") >= 0)
             || (author.indexOf("<") < 0 && author.indexOf(">") < 0 && author.indexOf(" (") > 0 && author.endsWith(")")))
            {
                // Mailing list archive format
                assert(author == authorEmail);
                if (author.indexOf(" (") > 0 && author.endsWith(")"))
                {
                    // "user at host (Name)"
                    authorEmail = author[0 .. author.lastIndexOf(" (")].replace(" at ", "@");
                    author = author[author.lastIndexOf(" (")+2 .. $-1].decodeRfc1522();
                }
                else
                {
                    // "user at host"
                    authorEmail = author.replace(" at ", "@");
                    author = author[0 .. author.lastIndexOf(" at ")];
                }
            }
            if (author.indexOf('<')>=0 && author.endsWith('>'))
            {
                // "Name <user@host>"
                auto p = author.indexOf('<');
                authorEmail = author[p+1..$-1];
                author = decodeRfc1522(asciiStrip(author[0..p]));
            }

            if (author.length>2 && author[0]=='"' && author[$-1]=='"')
                author = decodeRfc1522(asciiStrip(author[1..$-1]));
            // No usable name - fall back to the local part of the address
            if ((author == authorEmail || author == "") && authorEmail.indexOf("@") > 0)
                author = authorEmail[0..authorEmail.indexOf("@")];
            return [author, authorEmail];
        }

        list(author, authorEmail) = decodeAuthor("From" in headers ? decodeRfc1522(headers["From"]) : null);

        // If the From address is the list's posting address,
        // take the email address from Reply-To instead
        if (headers.get("List-Post", null) == "<mailto:" ~ authorEmail ~ ">" && "Reply-To" in headers)
            list(null, authorEmail) = decodeAuthor(decodeRfc1522(headers["Reply-To"].findSplit(", ")[0]));

        // Decode cross-references

        if ("Xref" in headers)
        {
            // The first space-separated token is skipped
            // (presumably the server name - see RFC 5536);
            // the remaining ones are expected to look like "group:number".
            auto xrefTokens = split(headers["Xref"], " ");
            foreach (str; xrefTokens.length ? xrefTokens[1..$] : null)
            {
                auto segs = str.split(":");
                if (segs.length < 2)
                    continue; // empty or malformed token
                xref ~= Xref(segs[0], to!int(segs[1]));
            }
        }

        /*
        if ("List-ID" in headers && !xref.length)
        {
            auto listID = headers["List-ID"];
            listID = listID.findSplit(" <")[0];
            listID = listID.replace(`"`, ``);
            xref = [Xref(listID)];
        }
        */

        if (headers.get("Sender", null).canFind("-bounces@"))
            xref ~= Xref(headers["Sender"].findSplit(" <")[0].replace(`"`, ``));

        if ("List-Unsubscribe" in headers && !xref.length)
            xref = headers["List-Unsubscribe"].split(", ").filter!(s => s.canFind("/options/")).map!(s => Xref(s.split("/")[$-1].stripRight('>'))).array();

        // Decode message ID

        if ("Message-ID" in headers && !id)
            id = headers["Message-ID"];

        // Decode post time

        time = Clock.currTime; // default value

        if ("NNTP-Posting-Date" in headers)
            time = parseTime!`D, j M Y H:i:s O \(\U\T\C\)`(headers["NNTP-Posting-Date"].strip());
        else
        if ("Date" in headers)
        {
            auto str = headers["Date"].strip();
            // Drop a trailing parenthesized comment following the numeric zone
            str = str.replace(re!`([+\-]\d\d\d\d) \(.*\)$`, "$1");
            // Try progressively more lenient formats
            try
                time = parseTime!(TimeFormats.RFC850)(str);
            catch (Exception e)
            try
                time = parseTime!(`D, j M Y H:i:s O`)(str);
            catch (Exception e)
            try
                time = parseTime!(`D, j M Y H:i:s e`)(str);
            catch (Exception e)
            try
                time = parseTime!(`D, j M Y H:i O`)(str);
            catch (Exception e)
            try
                time = parseTime!(`D, j M Y H:i e`)(str);
            catch (Exception e)
            {
                // fall-back to default (class creation time)
                // TODO: better behavior?
            }
        }
    }

    private this() {} // for attachments and templates

    /// Create a template Rfc850Message for a new posting to the specified groups.
    /// `groups` is a comma-separated list of group names.
    static Rfc850Message newPostTemplate(string groups)
    {
        auto post = new Rfc850Message();
        foreach (group; groups.split(","))
            post.xref ~= Xref(group);
        return post;
    }

    /// The wrap format corresponding to this message's `flowed` / `delsp`
    /// flags (`WrapFormat.heuristics` when the message is not flowed).
    @property WrapFormat wrapFormat()
    {
        return flowed ? delsp ? WrapFormat.flowedDelSp : WrapFormat.flowed : WrapFormat.heuristics;
    }

    /// Create a template Rfc850Message for a reply to this message.
    /// The template's content is this message's text, quoted,
    /// preceded by an attribution line.
    Rfc850Message replyTemplate()
    {
        auto post = new Rfc850Message();
        post.reply = true;
        post.xref = this.xref;
        post.references = this.references ~ this.id;
        post.subject = this.rawSubject;
        if (!post.subject.startsWith("Re:"))
            post.subject = "Re: " ~ post.subject;

        auto paragraphs = unwrapText(this.content, this.wrapFormat);
        foreach (i, ref paragraph; paragraphs)
            if (paragraph.quotePrefix.length)
                paragraph.quotePrefix = ">" ~ paragraph.quotePrefix;
            else
            {
                // Stop quoting at a signature separator or a line of underscores
                if (paragraph.text == "-- " || paragraph.text == "_______________________________________________")
                {
                    paragraphs = paragraphs[0..i];
                    break;
                }
                paragraph.quotePrefix = paragraph.text.length ? "> " : ">";
            }
        // Drop trailing empty paragraphs
        while (paragraphs.length && paragraphs[$-1].text.length==0)
            paragraphs = paragraphs[0..$-1];

        auto replyTime = time;
        replyTime.timezone = UTC();
        post.content =
            "On " ~ replyTime.formatTime!`l, j F Y \a\t H:i:s e`() ~ ", " ~ this.author ~ " wrote:\n" ~
            wrapText(paragraphs) ~
            "\n\n";
        post.flowed = true;
        post.delsp = false;

        return post;
    }

    /// Set the message text.
    /// Rewraps as necessary.
    void setText(string text)
    {
        this.content = wrapText(unwrapText(text, WrapFormat.input));
        this.flowed = true;
        this.delsp = false;
    }

    /// Write this Message instance's fields to their appropriate headers.
    /// `id` must be set. If `time` is unset, it is set to the current time.
    void compileHeaders()
    {
        assert(id);

        headers["Message-ID"] = id;
        headers["From"] = format(`%s <%s>`, author, authorEmail);
        headers["Subject"] = subject;
        headers["Newsgroups"] = xref.map!(x => x.group)().join(",");
        headers["Content-Type"] = format("text/plain; charset=utf-8; format=%s; delsp=%s", flowed ? "flowed" : "fixed", delsp ? "yes" : "no");
        headers["Content-Transfer-Encoding"] = "8bit";
        if (references.length)
        {
            headers["References"] = references.join(" ");
            headers["In-Reply-To"] = references[$-1];
        }
        if (time == SysTime.init)
            time = Clock.currTime();
        headers["Date"] = time.formatTime!(TimeFormats.RFC2822);
        headers["User-Agent"] = "ae.net.ietf.message";
    }

    /// Construct the headers and message fields.
    /// Throws: Exception if a header contains control characters.
    void compile()
    {
        compileHeaders();

        string[] lines;
        foreach (name, value; headers)
        {
            if (value.hasHighAsciiChars())
                value = value.encodeRfc1522();
            auto line = name ~ ": " ~ value;
            // Index before which the line must not be broken
            // (initially, the start of the header value)
            auto lineStart = name.length + 2;

            foreach (c; line)
                enforce(c >= 32, "Control characters in header: %(%s%)".format([line]));

            // Fold long lines at spaces
            while (line.length >= 80)
            {
                auto p = line[0..80].lastIndexOf(' ');
                if (p < lineStart)
                {
                    // No usable space in the first 80 characters -
                    // break at the first space after them, if any
                    p = 80 + line[80..$].indexOf(' ');
                    if (p < 80)
                        break;
                }
                lines ~= line[0..p];
                line = line[p..$];
                lineStart = 1; // continuation lines start with the space itself
            }
            lines ~= line;
        }

        message =
            lines.join("\r\n") ~
            "\r\n\r\n" ~
            splitAsciiLines(content).join("\r\n");
    }

    /// Get the Message-ID that this message is in reply to.
    @property string parentID()
    {
        return references.length ? references[$-1] : null;
    }

    /// Return the oldest known ancestor of this post, possibly
    /// this post's ID if it is the first one in the thread.
    /// May not be the thread ID - some UAs/services
    /// cut off or strip the "References" header.
    @property string firstAncestorID()
    {
        return references.length ? references[0] : id;
    }
}

unittest
{
    auto post = new Rfc850Message("From: msonke at example.org (=?ISO-8859-1?Q?S=F6nke_Martin?=)\n\nText");
    assert(post.author == "Sönke Martin");
    assert(post.authorEmail == "msonke@example.org");

    post = new Rfc850Message("Date: Tue, 06 Sep 2011 14:52 -0700\n\nText");
    assert(post.time.year == 2011);
}

private:

/// Decode headers with international characters in them.
/// Space-separated words of the form "=?charset?encoding?text?=" (with
/// encoding Q or B) are decoded; other words are passed through as is.
string decodeRfc1522(string str)
{
    auto words = str.split(" ");
    bool[] encoded = new bool[words.length];

    foreach (wordIndex, ref word; words)
        if (word.length > 6 && word.startsWith("=?") && word.endsWith("?="))
        {
            auto parts = split(word[2..$-2], "?");
            if (parts.length != 3)
                continue;
            auto charset = parts[0];
            auto encoding = parts[1];
            auto text = parts[2];

            switch (toUpper(encoding))
            {
                case "Q":
                    text = decodeQuotedPrintable(text, true);
                    break;
                case "B":
                    text = cast(ascii)Base64.decode(text);
                    break;
                default:
                    continue /*foreach*/;
            }

            word = decodeEncodedText(text, charset);
            encoded[wordIndex] = true;
        }

    // Rejoin the words, omitting the space between two adjacent encoded words
    string result;
    foreach (wordIndex, word; words)
    {
        if (wordIndex > 0 && !(encoded[wordIndex-1] && encoded[wordIndex]))
            result ~= ' ';
        result ~= word;
    }

    // If the result is not valid UTF-8, reinterpret it as ISO-8859-1
    try
    {
        import std.utf;
        validate(result);
    }
    catch (Exception e)
        result = toUtf8(cast(ascii)result, "ISO-8859-1", true);

    return result;
}

/// Encodes an UTF-8 string to be used in headers.
/// Words without high-ASCII characters are left as is; runs of adjacent
/// words with such characters are Base64-encoded together, in chunks.
string encodeRfc1522(string str)
{
    if (!str.hasHighAsciiChars())
        return str;

    // Merge adjacent international words, so that
    // the spaces between them are encoded as well
    string[] words;
    bool wasIntl = false;
    foreach (word; str.split(" "))
    {
        bool isIntl = word.hasHighAsciiChars();
        if (wasIntl && isIntl)
            words[$-1] ~= " " ~ word;
        else
            words ~= word;
        wasIntl = isIntl;
    }

    enum CHUNK_LENGTH_THRESHOLD = 20;

    foreach (ref word; words)
    {
        if (!word.hasHighAsciiChars())
            continue;
        string[] output;
        string s = word;
        while (s.length)
        {
            // Advance by whole code points, so that a chunk
            // never ends in the middle of a UTF-8 sequence
            size_t ptr = 0;
            while (ptr < s.length && ptr < CHUNK_LENGTH_THRESHOLD)
                ptr += stride(s, ptr);
            output ~= encodeRfc1522Chunk(s[0..ptr]);
            s = s[ptr..$];
        }
        word = output.join(" ");
    }
    return words.join(" ");
}

/// Encodes str as a single Base64 "=?UTF-8?B?...?=" encoded word.
string encodeRfc1522Chunk(string str) pure
{
    auto result = "=?UTF-8?B?" ~ Base64.encode(cast(ubyte[])str) ~ "?=";
    return result;
}

unittest
{
    auto text = "В лесу родилась ёлочка";
    assert(decodeRfc1522(encodeRfc1522(text)) == text);

    // Make sure email address isn't mangled
    assert(encodeRfc1522("Sönke Martin <msonke@example.org>").endsWith(" <msonke@example.org>"));
}

/// Decodes quoted-printable text.
/// Params:
///   s         = the encoded text
///   inHeaders = if true, also decode '_' as a space
/// Throws: Exception on an invalid or truncated "=XX" escape sequence.
string decodeQuotedPrintable(string s, bool inHeaders)
{
    auto r = appender!string();
    for (size_t i=0; i<s.length; )
        if (s[i]=='=')
        {
            if (i+1 >= s.length || s[i+1] == '\n')
                i+=2; // escape newline
            else
            {
                // Two hex digits must follow; without this check, "=X" at
                // the very end of the input would cause a range violation
                enforce(i+3 <= s.length, "Truncated quoted-printable escape sequence");
                r.put(cast(char)to!ubyte(s[i+1..i+3], 16)), i+=3;
            }
        }
        else
        if (s[i]=='_' && inHeaders)
            r.put(' '), i++;
        else
            r.put(s[i++]);
    return r.data;
}

/// Returns the name of the encoding to assume for text with
/// no declared encoding, given the message's User-Agent.
string guessDefaultEncoding(string userAgent)
{
    switch (userAgent)
    {
        case "DFeed":
            // Early DFeed versions did not specify the encoding
            return "utf8";
        default:
            return "windows1252";
    }
}

// http://d.puremagic.com/issues/show_bug.cgi?id=7016
static import ae.sys.cmd;

/// Converts s from the given encoding to UTF-8.
/// Tries toUtf8 first, then iconv, and finally
/// falls back to treating the text as ISO-8859-1.
string decodeEncodedText(ascii s, string textEncoding)
{
    try
        return toUtf8(s, textEncoding, false);
    catch (Exception e)
    {
        debug(RFC850) stderr.writefln("iconv fallback for %s (%s)", textEncoding, e.msg);
        try
        {
            import ae.sys.cmd;
            return iconv(s, textEncoding);
        }
        catch (Exception e)
        {
            debug(RFC850) stderr.writefln("ISO-8859-1 fallback (%s)", e.msg);
            return toUtf8(s, "ISO-8859-1", false);
        }
    }
}

/// Decodes a message body according to its Content-Transfer-Encoding.
/// "quoted-printable" and "base64" are decoded (case-insensitively);
/// data in any other encoding is returned unchanged.
string decodeTransferEncoding(string data, string encoding)
{
    switch (toLower(encoding))
    {
    case "7bit":
        return data;
    case "quoted-printable":
        return decodeQuotedPrintable(data, false);
    case "base64":
        //return cast(string)Base64.decode(data.replace("\n", ""));
    {
        auto s = data.fastReplace("\n", "");
        scope(failure) debug(RFC850) stderr.writeln(s);
        return cast(string)Base64.decode(s);
    }
    default:
        return data;
    }
}

/// Decodes the body lines of a uuencoded file
/// (the lines between the "begin" and "end" lines).
/// Throws: Exception if a line contains invalid characters.
ubyte[] uudecode(string[] lines)
{
    // TODO: optimize
    //auto data = appender!(ubyte[]);  // OPTLINK says no
    ubyte[] data;
    foreach (line; lines)
    {
        if (!line.length || line.startsWith("`"))
            continue;
        // The first character holds the number of decoded bytes on this line
        ubyte len = to!ubyte(line[0] - 32);
        line = line[1..$];
        // Pad to a whole number of 4-character groups
        while (line.length % 4)
            line ~= 32;
        ubyte[] lineData;
        while (line.length)
        {
            // Pack four 6-bit values into 24 bits
            uint v = 0;
            foreach (c; line[0..4])
                if (c == '`') // same as space
                    v <<= 6;
                else
                {
                    enforce(c >= 32 && c < 96, [c]);
                    v = (v<<6) | (c - 32);
                }

            // Emit the three bytes, most significant first. Shifts (rather
            // than reinterpreting v's memory) keep this endian-independent.
            lineData ~= cast(ubyte)(v >> 16);
            lineData ~= cast(ubyte)(v >> 8);
            lineData ~= cast(ubyte)v;

            line = line[4..$];
        }
        while (len > lineData.length)
            lineData ~= 0;
        data ~= lineData[0..len];
    }
    return data;
}