home *** CD-ROM | disk | FTP | other *** search
- #
- # Message.py
- # JunkMatcher
- #
- # Created by Benjamin Han on 2/1/05.
- # Copyright (c) 2005 Benjamin Han. All rights reserved.
- #
-
- # This program is free software; you can redistribute it and/or
- # modify it under the terms of the GNU General Public License
- # as published by the Free Software Foundation; either version 2
- # of the License, or (at your option) any later version.
-
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
-
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
- #!/usr/bin/env python
-
- import email, datetime, time, string
- from email.Header import decode_header
-
- from consts import *
- from utilities import *
- from emailAddress import *
- from parseURL import *
- from HTMLBody import *
-
-
# Headers that Message stores in dedicated attributes (sender, subject, ...)
# and therefore leaves out of Message.headers.
_ignoreHeaders = sets.Set(('from', 'sender', 'to', 'cc', 'subject'))
-
- _receivedDatePat = re.compile(r'(\d+)-(\d\d)-(\d\d) (\d\d):(\d\d):(\d\d) \+0000\@')
- _rawMIDPat = re.compile(r'<(.+\@.+)>')
- _ipPat = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
- _asciiPat = re.compile(r'[\020-\0177]*')
- _anchorPat = re.compile(r'(?i)<a\s+[^>]*href\s*=\s*[\'"]?\s*(https?:/?/?[^"\'<> \t\n\r\f\v]+)[^>]*?>\s*(https?:/?/?[^"\'<> \t\n\r\f\v]+)\s*</a>')
-
-
- class Message (object):
- """An email message
- ----------------
- WARNING: if a message is malformed, m is None and you should NOT call any method or
- try to get any attribute except for msgSrc, receivedDate and m!
-
- The beginning of the input msgSrc may have an received date string in UTC. If not,
- self.receivedDate is set to None.
-
- I. the following are set by __init__()
-
- (the following two can generate all the other data)
- msgSrc: the raw source of the message
- receivedDate: date/time (UTC) when the message was received; a datetime obj; could
- be None if it is not indicated in the beginning of the input msgSrc.
- containingNull: True iff msgSrc contains '\0' (which will be removed)
-
- m: Python email object (if the email is malformed, m is None and none of the
- following attributes exists)
- mID: message ID (None if message ID is malformed/missing)
- subject: the message subject (Unicode)
- sender: the sender of the message, in the full form (Unicode)
- senderEmail: the email address of the sender (None if no sender, False if malformed)
- senderDomain: the email domain of the sender (None if no sender, False if malformed)
- date: the date this email was sent - a string (None if missing)
- headers: complete headers except for those specified in _ignoreHeaders (Unicode)
-
- II. the following are set by _decode()
-
- charsets: a newline-separated string of all character sets used in this email
- charset: a representative charset (a random one from charsets); None if no charset is
- available
- body: the decoded message body; for plain text emails it's the text, for HTML mails it's
- the cleaned HTML code (Unicode)
- htmlBody: for HTML emails only - an HTMLBody object (Unicode)
- isHTML: True iff body is in HTML
- filenames: a newline-separated string of all attachment filenames
- urlDict: a dictionary of URLs used in the email (Unicode and in lowercase; URLs can
- contain weird encodings!)
-
- III. the following are set by other methods
-
- dateObj: set by _setDateObj(); False if date is malformed, None if date is missing,
- otherwise it's a datetime obj converted from self.date (UTC).
- timeDelta: set by _setTimeDelta(); None if self.dateObj is malformed/missing, otherwise
- it's a timedelta object = self.dateObj - self.receivedDate.
-
- recipients: set by _setRecipients(); a list of tuples (text, email address, domain) when
- none of the recipients is malformed (note text is *not* decoded - see emailAddress.py);
- otherwise it's the part of addresses that are near the error point.
- numRecipients: set by _setRecipients(): length of self.recipients; -1 if error occurs.
- decodedRecipients: set by _setDecodedRecipients(): a list of decoded recipients, in
- full form and Unicode.
-
- badSite: set by _setSites(); the first bad site found against globalObjects.siteDB;
- None if no bad site is found.
-
- phishingURL: set by _setPhishingURL(); it's a tuple ((s1, e1), (s2, e2),
- (real site, claimed site)) - the first sub-tuple is the span of the *real* URL,
- the 2nd sub-tuple is the span of the *claimed* URL, and the 3rd sub-tuple is
- self-explanatory (they are lowercase strings of the domain names); if no phishing
- URL is found, it's None.
-
- rendering: set by HTMLBody.setRendering().
-
- headerIPs: set by _setHeaderIPs().
-
-
- IMPORTANT ASSUMPTION: One Message instance per thread!
- """
- # improving performance by not having __dict__
- __slots__ = ('msgSrc', 'receivedDate', 'containingNull', 'm', 'mID', 'subject', 'sender', 'senderEmail', 'senderDomain', 'date', 'headers',
- '_charsetSet', '_charsetList', '_fnList', 'charsets', 'charset', 'body', 'htmlBody', 'isHTML', 'filenames', 'urlDict',
- '_sites', 'dateObj', 'timeDelta', 'recipients', 'numRecipients', 'decodedRecipients', 'badSite', 'phishingURL', 'rendering',
- 'headerIPs')
-
- identityTable = __import__('string').maketrans('', '')
-
- def __init__ (self, msgSrc):
- # The beginning of msgSrc might have a received date string
- # e.g., "2005-02-06 15:57:37 +0000@" (NOTE the '@' delimiter)
- # the date is always in UTC so the time zone is always +0000
- mo = _receivedDatePat.match(msgSrc)
- if mo:
- msgSrc = msgSrc[mo.end(0):]
- self.receivedDate = datetime.datetime(int(mo.group(1)), int(mo.group(2)), int(mo.group(3)),
- int(mo.group(4)), int(mo.group(5)), int(mo.group(6)))
- else:
- self.receivedDate = None
-
- self.msgSrc = msgSrc
-
- # check and clean out possible null chars
- oldLength = len(msgSrc)
- msgSrc = msgSrc.translate(Message.identityTable, '\0')
- self.containingNull = (oldLength != len(msgSrc))
-
- try:
- m = email.message_from_string(msgSrc)
- self.m = m
- except:
- # malformed emails
- self.m = None
- return
-
- mID = m['Message-Id']
- if mID is not None:
- mo = _rawMIDPat.search(mID)
- if mo is None: self.mID = None
- else: self.mID = mo.group(1)
- else:
- self.mID = None
-
- self._charsetSet = sets.Set()
-
- # set self.subject - self._charsetSet will be updated
- subj = m['Subject']
- if subj:
- try:
- self.subject = decodeTextList(decode_header(subj), self._charsetSet).strip()
- except:
- # error occurred in decode_header()
- self.subject = u''
- else:
- self.subject = u''
-
- # set self.sender, self.senderEmail and self.senderDomain -
- # self._charsetSet will be updated
- sender = m['From']
- if sender:
- try:
- self.sender = decodeTextList(decode_header(sender), self._charsetSet).strip()
- self.senderEmail, self.senderDomain = validateEmailAddress(sender)
- except:
- # error occurred in decode_header()
- self.sender = u''
- self.senderEmail = self.senderDomain = None
- else:
- self.sender = u''
- self.senderEmail = self.senderDomain = None
-
- self.date = m['Date']
-
- # set up defaultEncoding
- if len(self._charsetSet):
- defaultEncoding = self._charsetSet.pop()
- self._charsetSet.add(defaultEncoding)
- else:
- defaultEncoding = None
-
- # set up _charsetList and correct common mispellings
- self._charsetList = []
- for c in m.get_charsets():
- if c:
- mispelling = charsetMispellings.get(c)
- if mispelling: c = mispelling
- self._charsetSet.add(c)
- else:
- # encoding is None; fall back to the defaultEncoding
- c = defaultEncoding
- if c: self._charsetSet.add(c)
-
- self._charsetList.append(c)
-
- # set up headers
- self.headers = decodeText('\n'.join([': '.join((hk, hv))
- for hk, hv in filter(lambda i:i[0].lower() not in _ignoreHeaders,
- self.m.items())]))
-
- def __getattr__ (self, name):
- if name == 'charsets':
- self._setDecodedRecipients()
- return self.charsets
- elif name == 'charset':
- self._setDecodedRecipients()
- return self.charset
- elif name == 'body':
- self._decode()
- return self.body
- elif name == 'htmlBody':
- self._decode()
- return self.htmlBody
- elif name == 'isHTML':
- self._decode()
- return self.isHTML
- elif name == 'filenames':
- self._decode()
- return self.filenames
- elif name == 'urlDict':
- self._decode()
- return self.urlDict
- elif name == 'recipients':
- self._setRecipients()
- return self.recipients
- elif name == 'numRecipients':
- self._setRecipients()
- return self.numRecipients
- elif name == 'decodedRecipients':
- self._setDecodedRecipients()
- return self.decodedRecipients
- elif name == '_sites':
- self._setSites()
- return self._sites
- elif name == 'badSite':
- self._setSites()
- return self.badSite
- elif name == 'phishingURL':
- self._setPhishingURL()
- return self.phishingURL
- elif name == 'headerIPs':
- self._setHeaderIPs()
- return self.headerIPs
- elif name == 'dateObj':
- self._setDateObj()
- return self.dateObj
- elif name == 'timeDelta':
- self._setTimeDelta()
- return self.timeDelta
- elif name == 'rendering':
- # for HTML messages the rendering is different from the message body
- # (using elinks to provide the HTML rendering); otherwise it's the body
- if self.isHTML:
- self.htmlBody.setRendering()
- self.rendering = self.htmlBody.rendering
- else:
- self.rendering = self.body
- return self.rendering
- else:
- raise AttributeError('No attribute %s in this %s instance.' % (name, self.__class__.__name__))
-
- def isMultipart (self):
- return self.m.is_multipart()
-
- def __decodePartOfEmail (self, part, charset):
- """Returns unicode object or HTMLBody object (if part is in HTML) if payload exists,
- otherwise returns None; updates self.isHTML and self._charsetSet accordingly."""
- ret = None
-
- if part.get_content_maintype() == 'text':
- payload = part.get_payload(decode = True)
- if payload:
- # Invariance: payload is not decoded; but ret is
- if part.get_content_subtype() == 'html':
- self.isHTML = True
- ret = HTMLBody(payload, charset)
- if ret.encoding:
- self._charsetSet.add(ret.encoding) # spelling should have been checked in HTMLBody
- else:
- ret = decodeText(payload, charset).strip()
-
- fn = part.get_filename()
- if fn is None: fn = part.get_param('name')
- if fn: self._fnList.append(fn)
-
- return ret
-
- def _decode (self):
- """Decode for content from self.m (see the attributes set by this method in
- the docstring of self); returns nothing."""
- self.isHTML = False
- self._fnList = []
- charsetIndex = 0
- htmlBodyNotFound = True
-
- if self.m.is_multipart():
- # multi-part email
- contentList = []
- for part in self.m.walk():
- if htmlBodyNotFound:
- payload = self.__decodePartOfEmail(part, self._charsetList[charsetIndex])
-
- if self.isHTML:
- # when self.isHTML is True, payload is guaranteed to not be None
- self.htmlBody = payload
- self.body = payload.content
- htmlBodyNotFound = False
- else:
- if payload:
- contentList.append(payload)
-
- else:
- # we're only interested in collecting filenames if HTML body is already found
- fn = part.get_filename()
- if fn is None: fn = part.get_param('name')
- if fn: self._fnList.append(fn)
-
- charsetIndex += 1
-
- if htmlBodyNotFound:
- # it's not an HTML body
- self.body = ' '.join(contentList).strip()
-
- else:
- # single-part email
- payload = self.__decodePartOfEmail(self.m, self._charsetList[charsetIndex])
- if payload:
- if self.isHTML:
- self.htmlBody = payload
- self.body = payload.content
- else:
- self.body = payload
- else: self.body = u''
-
- # set urlDict
- if self.isHTML:
- self.urlDict = self.htmlBody.urlDict
- else:
- self.urlDict = {}
- d = self.urlDict
- for url in map(string.lower, filter(lambda s:s,
- [m.group(0) for m in httpPat.finditer(self.body)])):
- d[url] = d.get(url, 0) + 1
-
- self.filenames = '\n'.join(self._fnList)
-
- def _setDateObj (self):
- """Sets self.dateObj; NO return value."""
- if self.date is not None:
- try:
- # TO-DO: according to Python's doc, mktime_tz() might introduce minor inaccuracies during DST changes.
- self.dateObj = datetime.datetime.utcfromtimestamp(email.Utils.mktime_tz(email.Utils.parsedate_tz(self.date)))
- except:
- self.dateObj = False
- else:
- # missing date counts as *not* malformed!
- self.dateObj = None
-
- def _setTimeDelta (self):
- """Sets self.timeDelta; NO return value."""
- if self.dateObj and self.receivedDate:
- self.timeDelta = self.dateObj - self.receivedDate # normally timeDelta should be < 0
- else:
- self.timeDelta = None
-
- def _setRecipients (self):
- """Parses the To and CC header fields for recipient email addresses:
-
- 1. It sets self.recipients - if successful it's a list of tuples
- (text, email address, domain); if not it's a string (see emailAddress.py);
- 2. It also sets self.numRecipients: -1 if error occurs;
- 2. NO return value
- """
- recp = self.m.get_all('to')
- if recp is None:
- recp = self.m.get_all('cc')
- else:
- recp2 = self.m.get_all('cc')
- if recp2: recp.extend(recp2)
-
- if recp is None:
- self.recipients = []
- self.numRecipients = 0
- else:
- # if any email address is malformed, self.recipients becomes a string
- # (the text that's skipped in parsing the email addresses)
- self.recipients = extractEmailAddresses(', '.join(recp))
- if type(self.recipients) is list:
- self.numRecipients = len(self.recipients)
- else:
- self.numRecipients = -1
-
- def _setDecodedRecipients (self):
- """Decode the recipient address(es) based on the embedded encoding(s); sets
- self.decodedRecipients; NO return value"""
- if type(self.recipients) is list:
- self.decodedRecipients = []
- try: # decode as much as possible
- self.decodedRecipients.extend(map(lambda r: decodeTextList(decode_header(r[0]),
- self._charsetSet),
- self.recipients))
- except:
- pass
-
- else:
- self.decodedRecipients = []
-
- self.charsets = '\n'.join(self._charsetSet)
- if len(self._charsetSet):
- self.charset = self._charsetSet.pop()
- else:
- self.charset = None
-
- def _setSites (self):
- """ Returns nothing."""
- # stores only sites from parseURL that is not a safe site, and contains only ASCII
- # CAUTION: parseURL() might return None
- safeSitesPattern = globalObjects.safeSitesPattern
- sites = sets.Set(filter(lambda url: _asciiPat.match(url),
- filter(lambda url: safeSitesPattern.search(url) is None,
- map(lambda l:'.'.join(l),
- filter(lambda l: l, map(parseURL, self.urlDict.keys()))))))
-
- # _sites is a list of (nameComponents, node, b) where nameComponents is
- # a sequence of name components, node is a SiteDB node, and b is a boolean
- # (see getOne() in SiteDB.py); this is for calling addSites() later (see getOne(),
- # addOne() and addOneToNode() in SiteDB.py)
- self._sites = []
- self.badSite = None
-
- if sites:
- siteDB = globalObjects.siteDB
- for site in sites:
- # this is to weed out the ill-encoded site from being added
- result = siteDB.getOne(site.split('.'))
- l = result[0]
- if l is not None: # otherwise the site is bogus
- self._sites.append(result)
- if self.badSite is None and len(l) == 0:
- self.badSite = site
-
- def _setPhishingURL (self):
- if self.isHTML:
- for moIter in _anchorPat.finditer(self.body):
- # don't bother using urllib.unquote() cuz we're after the inequality!
- realSite = parseURL(moIter.group(1).lower())
- claimedSite = parseURL(moIter.group(2).lower())
- if realSite != claimedSite:
- self.phishingURL = (moIter.span(1), moIter.span(2),
- ('.'.join(realSite), '.'.join(claimedSite)))
- return
-
- self.phishingURL = None
-
- def addSites (self):
- """Add the collected sites into globalObjects.siteDB; returns True if at least one site is added."""
- if self._sites:
- siteDB = globalObjects.siteDB
- for l, n, b in self._sites:
- if n is None:
- siteDB.addOne(l, time.time())
- else:
- siteDB.addOneToNode(l, n, b, time.time())
- return True
- return False
-
- def removeSites (self, countMatters = True):
- """Remove the collected sites from globalObjects.siteDB; returns True iff at least one site is removed."""
- safeSitesPattern = globalObjects.safeSitesPattern
- sites = sets.Set(filter(lambda url: safeSitesPattern.search(url) is None,
- map(lambda l:'.'.join(l),
- filter(lambda l: l, map(parseURL, self.urlDict.keys())))))
-
- if sites:
- siteDB = globalObjects.siteDB
- ret = False
- for site in sites:
- if ret:
- siteDB.removeOne(site.split('.'), countMatters)
- else:
- ret = siteDB.removeOne(site.split('.'), countMatters)
-
- return ret
- return False
-
- def _setHeaderIPs (self):
- """Collects all the IPs mentioned in the headers in headerIPs, in
- reversed chronological order."""
- rList = self.m.get_all('Received')
- ipSet = sets.Set()
- self.headerIPs = []
-
- if rList:
- for line in rList:
- for moIter in _ipPat.finditer(line):
- ip = moIter.group(0)
- if not ip in ipSet:
- ipSet.add(ip)
- self.headerIPs.insert(0, moIter.group(0))
-
- def show (self):
- """Just a way to dump all relevant info on screen."""
- print 'This message is multi-part:', self.isMultipart()
- if (self.receivedDate):
- print 'Received date (UTC):', self.receivedDate
- print 'Subject:', encodeText(self.subject)
- print 'Sender:', encodeText(self.sender)
- print 'Sender email:', self.senderEmail
- print 'Sender domain:', self.senderDomain
- print 'Is HTML:', self.isHTML
- if len(self.charsets):
- print 'Charsets:', ', '.join(self.charsets.split('\n'))
- if len(self.filenames):
- print 'Filenames:', ', '.join(self.filenames.split('\n'))
- if self.numRecipients > 0:
- print 'Decoded Recipients:', encodeText(u', '.join(self.decodedRecipients))
- print 'Recipient Emails:', ', '.join([r[1] for r in self.recipients])
- print 'Bad site:', self.badSite
- if len(self.headerIPs):
- print 'Header IPs:', ', '.join(self.headerIPs)
- print 'Date Obj:', self.dateObj
- print 'Time delta:', self.timeDelta
- print
- print '-------------------- Headers --------------------'
- print self.headers
- print
- print '-------------------- Body --------------------'
- print encodeText(self.body)
- print '-------------------- Rendering --------------------'
- print encodeText(self.rendering)
-
- if self.isHTML:
- numHiddenURLs = len(self.htmlBody.hiddenURLList)
- if numHiddenURLs:
- print '-------------------- %d Hidden URL(s) --------------------'%numHiddenURLs
- for idx, (start, end) in enumerate(self.htmlBody.hiddenURLList):
- print idx, encodeText(self.htmlBody.contentWithoutEntities[start:end])
-
- numBadTags = len(self.htmlBody.badTagList)
- if numBadTags:
- print '-------------------- %d Bad tag(s) --------------------'%numBadTags
- for idx, (start, end) in enumerate(self.htmlBody.badTagList):
- print idx, encodeText(self.htmlBody.contentWithoutEntities[start:end])
-
- numVacuousTags = len(self.htmlBody.vacuousTagList)
- if numVacuousTags:
- print '-------------------- %d Vacuous tag(s) --------------------'%numVacuousTags
- for idx, (start, end) in enumerate(self.htmlBody.vacuousTagList):
- print idx, encodeText(self.htmlBody.contentWithoutBadTags[start:end])
-
- numURLs = len(self.urlDict)
- if numURLs:
- print '-------------------- %d URL(s) --------------------'%numURLs
- for idx, (url, count) in enumerate(self.urlDict.items()):
- print '%d %s (%d)' % (idx, encodeText(url), count)
-
-
- if __name__ == '__main__':
- import sys
-
- if len(sys.argv) == 1:
- print 'Usage: ./Message.py <filename>'
- print ' * filename is the name of the file containing email raw source.'
- sys.exit(1)
-
- oldSiteDBSize = globalObjects.siteDB.size()
-
- msg = Message(open(sys.argv[1]).read())
- msg.show()
- msg.addSites()
-
- print
- print '* SiteDB count change:', globalObjects.siteDB.size() - oldSiteDBSize
-