MacAddict 108

home *** CD-ROM | disk | FTP | other *** search

/ MacAddict 108 / MacAddict108.iso / Software / Internet & Communication / JunkMatcher 1.5.5.dmg / JunkMatcher.app / Contents / Resources / Engine / Message.py < prev next >

Wrap

Python Source | 2005-06-01 | 23.9 KB | 588 lines

# # Message.py # JunkMatcher # # Created by Benjamin Han on 2/1/05. # Copyright (c) 2005 Benjamin Han. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #!/usr/bin/env python import email, datetime, time, string from email.Header import decode_header from consts import * from utilities import * from emailAddress import * from parseURL import * from HTMLBody import * # the following headers are excluded from Message.headers since they are included # elsewhere in Message _ignoreHeaders = sets.Set(['from', 'sender', 'to', 'cc', 'subject']) _receivedDatePat = re.compile(r'(\d+)-(\d\d)-(\d\d) (\d\d):(\d\d):(\d\d) \+0000\@') _rawMIDPat = re.compile(r'<(.+\@.+)>') _ipPat = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}') _asciiPat = re.compile(r'[\020-\0177]*') _anchorPat = re.compile(r'(?i)<a\s+[^>]*href\s*=\s*[\'"]?\s*(https?:/?/?[^"\'<> \t\n\r\f\v]+)[^>]*?>\s*(https?:/?/?[^"\'<> \t\n\r\f\v]+)\s*</a>') class Message (object): """An email message ---------------- WARNING: if a message is malformed, m is None and you should NOT call any method or try to get any attribute except for msgSrc, receivedDate and m! The beginning of the input msgSrc may have an received date string in UTC. If not, self.receivedDate is set to None. I. the following are set by __init__() (the following two can generate all the other data) msgSrc: the raw source of the message receivedDate: date/time (UTC) when the message was received; a datetime obj; could be None if it is not indicated in the beginning of the input msgSrc. containingNull: True iff msgSrc contains '\0' (which will be removed) m: Python email object (if the email is malformed, m is None and none of the following attributes exists) mID: message ID (None if message ID is malformed/missing) subject: the message subject (Unicode) sender: the sender of the message, in the full form (Unicode) senderEmail: the email address of the sender (None if no sender, False if malformed) senderDomain: the email domain of the sender (None if no sender, False if malformed) date: the date this email was sent - a string (None if missing) headers: complete headers except for those specified in _ignoreHeaders (Unicode) II. the following are set by _decode() charsets: a newline-separated string of all character sets used in this email charset: a representative charset (a random one from charsets); None if no charset is available body: the decoded message body; for plain text emails it's the text, for HTML mails it's the cleaned HTML code (Unicode) htmlBody: for HTML emails only - an HTMLBody object (Unicode) isHTML: True iff body is in HTML filenames: a newline-separated string of all attachment filenames urlDict: a dictionary of URLs used in the email (Unicode and in lowercase; URLs can contain weird encodings!) III. the following are set by other methods dateObj: set by _setDateObj(); False if date is malformed, None if date is missing, otherwise it's a datetime obj converted from self.date (UTC). timeDelta: set by _setTimeDelta(); None if self.dateObj is malformed/missing, otherwise it's a timedelta object = self.dateObj - self.receivedDate. recipients: set by _setRecipients(); a list of tuples (text, email address, domain) when none of the recipients is malformed (note text is *not* decoded - see emailAddress.py); otherwise it's the part of addresses that are near the error point. numRecipients: set by _setRecipients(): length of self.recipients; -1 if error occurs. decodedRecipients: set by _setDecodedRecipients(): a list of decoded recipients, in full form and Unicode. badSite: set by _setSites(); the first bad site found against globalObjects.siteDB; None if no bad site is found. phishingURL: set by _setPhishingURL(); it's a tuple ((s1, e1), (s2, e2), (real site, claimed site)) - the first sub-tuple is the span of the *real* URL, the 2nd sub-tuple is the span of the *claimed* URL, and the 3rd sub-tuple is self-explanatory (they are lowercase strings of the domain names); if no phishing URL is found, it's None. rendering: set by HTMLBody.setRendering(). headerIPs: set by _setHeaderIPs(). IMPORTANT ASSUMPTION: One Message instance per thread! """ # improving performance by not having __dict__ __slots__ = ('msgSrc', 'receivedDate', 'containingNull', 'm', 'mID', 'subject', 'sender', 'senderEmail', 'senderDomain', 'date', 'headers', '_charsetSet', '_charsetList', '_fnList', 'charsets', 'charset', 'body', 'htmlBody', 'isHTML', 'filenames', 'urlDict', '_sites', 'dateObj', 'timeDelta', 'recipients', 'numRecipients', 'decodedRecipients', 'badSite', 'phishingURL', 'rendering', 'headerIPs') identityTable = __import__('string').maketrans('', '') def __init__ (self, msgSrc): # The beginning of msgSrc might have a received date string # e.g., "2005-02-06 15:57:37 +0000@" (NOTE the '@' delimiter) # the date is always in UTC so the time zone is always +0000 mo = _receivedDatePat.match(msgSrc) if mo: msgSrc = msgSrc[mo.end(0):] self.receivedDate = datetime.datetime(int(mo.group(1)), int(mo.group(2)), int(mo.group(3)), int(mo.group(4)), int(mo.group(5)), int(mo.group(6))) else: self.receivedDate = None self.msgSrc = msgSrc # check and clean out possible null chars oldLength = len(msgSrc) msgSrc = msgSrc.translate(Message.identityTable, '\0') self.containingNull = (oldLength != len(msgSrc)) try: m = email.message_from_string(msgSrc) self.m = m except: # malformed emails self.m = None return mID = m['Message-Id'] if mID is not None: mo = _rawMIDPat.search(mID) if mo is None: self.mID = None else: self.mID = mo.group(1) else: self.mID = None self._charsetSet = sets.Set() # set self.subject - self._charsetSet will be updated subj = m['Subject'] if subj: try: self.subject = decodeTextList(decode_header(subj), self._charsetSet).strip() except: # error occurred in decode_header() self.subject = u'' else: self.subject = u'' # set self.sender, self.senderEmail and self.senderDomain - # self._charsetSet will be updated sender = m['From'] if sender: try: self.sender = decodeTextList(decode_header(sender), self._charsetSet).strip() self.senderEmail, self.senderDomain = validateEmailAddress(sender) except: # error occurred in decode_header() self.sender = u'' self.senderEmail = self.senderDomain = None else: self.sender = u'' self.senderEmail = self.senderDomain = None self.date = m['Date'] # set up defaultEncoding if len(self._charsetSet): defaultEncoding = self._charsetSet.pop() self._charsetSet.add(defaultEncoding) else: defaultEncoding = None # set up _charsetList and correct common mispellings self._charsetList = [] for c in m.get_charsets(): if c: mispelling = charsetMispellings.get(c) if mispelling: c = mispelling self._charsetSet.add(c) else: # encoding is None; fall back to the defaultEncoding c = defaultEncoding if c: self._charsetSet.add(c) self._charsetList.append(c) # set up headers self.headers = decodeText('\n'.join([': '.join((hk, hv)) for hk, hv in filter(lambda i:i[0].lower() not in _ignoreHeaders, self.m.items())])) def __getattr__ (self, name): if name == 'charsets': self._setDecodedRecipients() return self.charsets elif name == 'charset': self._setDecodedRecipients() return self.charset elif name == 'body': self._decode() return self.body elif name == 'htmlBody': self._decode() return self.htmlBody elif name == 'isHTML': self._decode() return self.isHTML elif name == 'filenames': self._decode() return self.filenames elif name == 'urlDict': self._decode() return self.urlDict elif name == 'recipients': self._setRecipients() return self.recipients elif name == 'numRecipients': self._setRecipients() return self.numRecipients elif name == 'decodedRecipients': self._setDecodedRecipients() return self.decodedRecipients elif name == '_sites': self._setSites() return self._sites elif name == 'badSite': self._setSites() return self.badSite elif name == 'phishingURL': self._setPhishingURL() return self.phishingURL elif name == 'headerIPs': self._setHeaderIPs() return self.headerIPs elif name == 'dateObj': self._setDateObj() return self.dateObj elif name == 'timeDelta': self._setTimeDelta() return self.timeDelta elif name == 'rendering': # for HTML messages the rendering is different from the message body # (using elinks to provide the HTML rendering); otherwise it's the body if self.isHTML: self.htmlBody.setRendering() self.rendering = self.htmlBody.rendering else: self.rendering = self.body return self.rendering else: raise AttributeError('No attribute %s in this %s instance.' % (name, self.__class__.__name__)) def isMultipart (self): return self.m.is_multipart() def __decodePartOfEmail (self, part, charset): """Returns unicode object or HTMLBody object (if part is in HTML) if payload exists, otherwise returns None; updates self.isHTML and self._charsetSet accordingly.""" ret = None if part.get_content_maintype() == 'text': payload = part.get_payload(decode = True) if payload: # Invariance: payload is not decoded; but ret is if part.get_content_subtype() == 'html': self.isHTML = True ret = HTMLBody(payload, charset) if ret.encoding: self._charsetSet.add(ret.encoding) # spelling should have been checked in HTMLBody else: ret = decodeText(payload, charset).strip() fn = part.get_filename() if fn is None: fn = part.get_param('name') if fn: self._fnList.append(fn) return ret def _decode (self): """Decode for content from self.m (see the attributes set by this method in the docstring of self); returns nothing.""" self.isHTML = False self._fnList = [] charsetIndex = 0 htmlBodyNotFound = True if self.m.is_multipart(): # multi-part email contentList = [] for part in self.m.walk(): if htmlBodyNotFound: payload = self.__decodePartOfEmail(part, self._charsetList[charsetIndex]) if self.isHTML: # when self.isHTML is True, payload is guaranteed to not be None self.htmlBody = payload self.body = payload.content htmlBodyNotFound = False else: if payload: contentList.append(payload) else: # we're only interested in collecting filenames if HTML body is already found fn = part.get_filename() if fn is None: fn = part.get_param('name') if fn: self._fnList.append(fn) charsetIndex += 1 if htmlBodyNotFound: # it's not an HTML body self.body = ' '.join(contentList).strip() else: # single-part email payload = self.__decodePartOfEmail(self.m, self._charsetList[charsetIndex]) if payload: if self.isHTML: self.htmlBody = payload self.body = payload.content else: self.body = payload else: self.body = u'' # set urlDict if self.isHTML: self.urlDict = self.htmlBody.urlDict else: self.urlDict = {} d = self.urlDict for url in map(string.lower, filter(lambda s:s, [m.group(0) for m in httpPat.finditer(self.body)])): d[url] = d.get(url, 0) + 1 self.filenames = '\n'.join(self._fnList) def _setDateObj (self): """Sets self.dateObj; NO return value.""" if self.date is not None: try: # TO-DO: according to Python's doc, mktime_tz() might introduce minor inaccuracies during DST changes. self.dateObj = datetime.datetime.utcfromtimestamp(email.Utils.mktime_tz(email.Utils.parsedate_tz(self.date))) except: self.dateObj = False else: # missing date counts as *not* malformed! self.dateObj = None def _setTimeDelta (self): """Sets self.timeDelta; NO return value.""" if self.dateObj and self.receivedDate: self.timeDelta = self.dateObj - self.receivedDate # normally timeDelta should be < 0 else: self.timeDelta = None def _setRecipients (self): """Parses the To and CC header fields for recipient email addresses: 1. It sets self.recipients - if successful it's a list of tuples (text, email address, domain); if not it's a string (see emailAddress.py); 2. It also sets self.numRecipients: -1 if error occurs; 2. NO return value """ recp = self.m.get_all('to') if recp is None: recp = self.m.get_all('cc') else: recp2 = self.m.get_all('cc') if recp2: recp.extend(recp2) if recp is None: self.recipients = [] self.numRecipients = 0 else: # if any email address is malformed, self.recipients becomes a string # (the text that's skipped in parsing the email addresses) self.recipients = extractEmailAddresses(', '.join(recp)) if type(self.recipients) is list: self.numRecipients = len(self.recipients) else: self.numRecipients = -1 def _setDecodedRecipients (self): """Decode the recipient address(es) based on the embedded encoding(s); sets self.decodedRecipients; NO return value""" if type(self.recipients) is list: self.decodedRecipients = [] try: # decode as much as possible self.decodedRecipients.extend(map(lambda r: decodeTextList(decode_header(r[0]), self._charsetSet), self.recipients)) except: pass else: self.decodedRecipients = [] self.charsets = '\n'.join(self._charsetSet) if len(self._charsetSet): self.charset = self._charsetSet.pop() else: self.charset = None def _setSites (self): """ Returns nothing.""" # stores only sites from parseURL that is not a safe site, and contains only ASCII # CAUTION: parseURL() might return None safeSitesPattern = globalObjects.safeSitesPattern sites = sets.Set(filter(lambda url: _asciiPat.match(url), filter(lambda url: safeSitesPattern.search(url) is None, map(lambda l:'.'.join(l), filter(lambda l: l, map(parseURL, self.urlDict.keys())))))) # _sites is a list of (nameComponents, node, b) where nameComponents is # a sequence of name components, node is a SiteDB node, and b is a boolean # (see getOne() in SiteDB.py); this is for calling addSites() later (see getOne(), # addOne() and addOneToNode() in SiteDB.py) self._sites = [] self.badSite = None if sites: siteDB = globalObjects.siteDB for site in sites: # this is to weed out the ill-encoded site from being added result = siteDB.getOne(site.split('.')) l = result[0] if l is not None: # otherwise the site is bogus self._sites.append(result) if self.badSite is None and len(l) == 0: self.badSite = site def _setPhishingURL (self): if self.isHTML: for moIter in _anchorPat.finditer(self.body): # don't bother using urllib.unquote() cuz we're after the inequality! realSite = parseURL(moIter.group(1).lower()) claimedSite = parseURL(moIter.group(2).lower()) if realSite != claimedSite: self.phishingURL = (moIter.span(1), moIter.span(2), ('.'.join(realSite), '.'.join(claimedSite))) return self.phishingURL = None def addSites (self): """Add the collected sites into globalObjects.siteDB; returns True if at least one site is added.""" if self._sites: siteDB = globalObjects.siteDB for l, n, b in self._sites: if n is None: siteDB.addOne(l, time.time()) else: siteDB.addOneToNode(l, n, b, time.time()) return True return False def removeSites (self, countMatters = True): """Remove the collected sites from globalObjects.siteDB; returns True iff at least one site is removed.""" safeSitesPattern = globalObjects.safeSitesPattern sites = sets.Set(filter(lambda url: safeSitesPattern.search(url) is None, map(lambda l:'.'.join(l), filter(lambda l: l, map(parseURL, self.urlDict.keys()))))) if sites: siteDB = globalObjects.siteDB ret = False for site in sites: if ret: siteDB.removeOne(site.split('.'), countMatters) else: ret = siteDB.removeOne(site.split('.'), countMatters) return ret return False def _setHeaderIPs (self): """Collects all the IPs mentioned in the headers in headerIPs, in reversed chronological order.""" rList = self.m.get_all('Received') ipSet = sets.Set() self.headerIPs = [] if rList: for line in rList: for moIter in _ipPat.finditer(line): ip = moIter.group(0) if not ip in ipSet: ipSet.add(ip) self.headerIPs.insert(0, moIter.group(0)) def show (self): """Just a way to dump all relevant info on screen.""" print 'This message is multi-part:', self.isMultipart() if (self.receivedDate): print 'Received date (UTC):', self.receivedDate print 'Subject:', encodeText(self.subject) print 'Sender:', encodeText(self.sender) print 'Sender email:', self.senderEmail print 'Sender domain:', self.senderDomain print 'Is HTML:', self.isHTML if len(self.charsets): print 'Charsets:', ', '.join(self.charsets.split('\n')) if len(self.filenames): print 'Filenames:', ', '.join(self.filenames.split('\n')) if self.numRecipients > 0: print 'Decoded Recipients:', encodeText(u', '.join(self.decodedRecipients)) print 'Recipient Emails:', ', '.join([r[1] for r in self.recipients]) print 'Bad site:', self.badSite if len(self.headerIPs): print 'Header IPs:', ', '.join(self.headerIPs) print 'Date Obj:', self.dateObj print 'Time delta:', self.timeDelta print print '-------------------- Headers --------------------' print self.headers print print '-------------------- Body --------------------' print encodeText(self.body) print '-------------------- Rendering --------------------' print encodeText(self.rendering) if self.isHTML: numHiddenURLs = len(self.htmlBody.hiddenURLList) if numHiddenURLs: print '-------------------- %d Hidden URL(s) --------------------'%numHiddenURLs for idx, (start, end) in enumerate(self.htmlBody.hiddenURLList): print idx, encodeText(self.htmlBody.contentWithoutEntities[start:end]) numBadTags = len(self.htmlBody.badTagList) if numBadTags: print '-------------------- %d Bad tag(s) --------------------'%numBadTags for idx, (start, end) in enumerate(self.htmlBody.badTagList): print idx, encodeText(self.htmlBody.contentWithoutEntities[start:end]) numVacuousTags = len(self.htmlBody.vacuousTagList) if numVacuousTags: print '-------------------- %d Vacuous tag(s) --------------------'%numVacuousTags for idx, (start, end) in enumerate(self.htmlBody.vacuousTagList): print idx, encodeText(self.htmlBody.contentWithoutBadTags[start:end]) numURLs = len(self.urlDict) if numURLs: print '-------------------- %d URL(s) --------------------'%numURLs for idx, (url, count) in enumerate(self.urlDict.items()): print '%d %s (%d)' % (idx, encodeText(url), count) if __name__ == '__main__': import sys if len(sys.argv) == 1: print 'Usage: ./Message.py <filename>' print ' * filename is the name of the file containing email raw source.' sys.exit(1) oldSiteDBSize = globalObjects.siteDB.size() msg = Message(open(sys.argv[1]).read()) msg.show() msg.addSites() print print '* SiteDB count change:', globalObjects.siteDB.size() - oldSiteDBSize