home *** CD-ROM | disk | FTP | other *** search
- #
- # Matcher.py
- # JunkMatcher
- #
- # Created by Benjamin Han on 2/1/05.
- # Copyright (c) 2005 Benjamin Han. All rights reserved.
- #
-
- # This program is free software; you can redistribute it and/or
- # modify it under the terms of the GNU General Public License
- # as published by the Free Software Foundation; either version 2
- # of the License, or (at your option) any later version.
-
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU General Public License for more details.
-
- # You should have received a copy of the GNU General Public License
- # along with this program; if not, write to the Free Software
- # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-
- #!/usr/bin/env python
-
- from consts import *
- from Tests import *
- from MatchResult import *
-
- MATCHER_MODE_ALL = 0  # run every enabled test (properties and patterns); no hard/soft distinction, no per-test statistics
- MATCHER_MODE_PROPERTIES = 1  # run only the enabled property tests; no hard/soft distinction, no per-test statistics
- MATCHER_MODE_PATTERNS = 2  # run only the enabled pattern tests; no hard/soft distinction, no per-test statistics
- MATCHER_MODE_LINEAR = 3  # stop at the first positive hard test or after numTests positive soft tests; updates per-test statistics
-
-
- class Matcher:
- def __init__ (self, updateStats = True):
- # sets up self.tests
- # all mentioned files are REQUIRED, EXCEPT for recipientPatterns and safeIPs
- self.tests = Tests(Properties('%sproperties' % CONF_PATH),
- Patterns('%spatterns' % CONF_PATH),
- '%stests' % CONF_PATH)
-
- self.mode = globalObjects.prefs.mode
- if self.mode == MATCHER_MODE_LINEAR:
- self.numTests = int(globalObjects.prefs.modeArgs[0])
-
- self.updateStats = updateStats
-
- def setMode (self, mode, *args):
- """mode must be one of the MATCHER_MODE* defined above; each element in
- args must be a string."""
- self.mode = mode
-
- # TO-DO: for now args is needed only for MATCHER_MODE_LINEAR.
- if mode == MATCHER_MODE_LINEAR:
- if len(args): self.numTests = int(args[0])
- else: self.numTests = 1
-
- def run (self, msg):
- matchResult = MatchResult()
- verdict = None # meaning we haven't reached a verdict
-
- # whitelisting is on for ALL modes
- if msg.m:
- verdict = globalObjects.whitelist.search(msg.sender)
-
- if verdict is not None:
- # message whitelisted, but we may want to do some testing
- # for now it's only for PropertyPhishingURL
-
- # do testing if the message is in HTML and users specified that they want to check
- # whitelisted emails against PropertyPhishingURL
- if msg.isHTML and self.tests.properties[u'PropertyPhishingURL'].checkWhitelistedEmail:
-
- # find the test for PropertyPhishingURL
- idx, t = filter(lambda i: not i[1].isPattern and i[1].propertyOrPattern.__class__.__name__ == u'PropertyPhishingURL',
- enumerate(self.tests))[0]
-
- if t.isOn:
- # only run the properties with matching recipientPattern
- r = t.propertyOrPattern
- if not r.recipientPattern or \
- r.recipientPattern.search('\n'.join(msg.decodedRecipients)):
-
- result, cpuTime = r.run(msg)
-
- if self.mode == MATCHER_MODE_LINEAR:
- # this is the only mode where we care hard tests and update statistics
- if result is not False:
- matchResult.addProperty(r.__class__.__name__, True, result, idx)
- if t.isHard or counter == 1:
- verdict = True
- else:
- matchResult.addProperty(r.__class__.__name__, False, testIdx = idx)
-
- if self.updateStats:
- m = matchResult[0]
- r.testRecord.addOne(verdict == m.isPositive, cpuTime, m.isPositive)
-
- elif self.mode != MATCHER_MODE_PATTERNS:
- if result is not False:
- matchResult.addProperty(r.__class__.__name__, True, result, idx)
- verdict = True
- else:
- matchResult.addProperty(r.__class__.__name__, False, testIdx = idx)
-
- if verdict is None:
-
- if msg.m:
- decodedRecipients = '\n'.join(msg.decodedRecipients)
-
- # NOTE we allow malformed messages to proceed, til they hit the very first property test
-
- # for ALL modes:
- # 1. apply tests only when they match the specified recipientPattern;
- # 2. apply body/rendering HTML patterns only when the message is in HTML.
-
- if self.mode == MATCHER_MODE_LINEAR:
-
- # MATCHER_MODE_LINEAR:
- # 1. Apply tests only when they match the specified encodingPattern;
- # 2. Stop running the tests immediately when a hard test gives a positive result.
- # 3. Actually update the statistics of each test
-
- knownPositivePatterns = sets.Set()
- patternConditions = {} # to cache the results of testing pattern conditions
-
- counter = self.numTests
- cpuTimes = [] # for recording the CPU time spent on each test
- testList = [] # remember what tests we actually performed
- for idx, test in enumerate(self.tests):
- if not test.isOn: continue
-
- i = test.propertyOrPattern
- if test.isPattern:
- # only run the patterns if recipientPattern AND encodingPattern both matches
- previousResult = patternConditions.get(i.origPattern)
- if previousResult is None:
- if (i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None)\
- or (i.encodingPattern and i.encodingPattern.search(msg.charsets) is None):
- patternConditions[i.origPattern] = False
- continue
- patternConditions[i.origPattern] = True
- elif previousResult is False:
- continue
-
- view = test.view
- bodyOrRendering = (view == VIEW_BODY or view == VIEW_RENDERING)
-
- # don't run the body/rendering pattern if the pattern is HTML pattern AND this is NOT an HTML message
- if bodyOrRendering and test.isHTML and not msg.isHTML: continue
-
- # we don't want to run same patterns in both body and rendering
- # because we don't want to doubly penalize a message (body and rendering are similar in nature)
- if not bodyOrRendering or not i.origPattern in knownPositivePatterns:
- mo, cpuTime = i.run(msg, view)
- cpuTimes.append(cpuTime)
- testList.append(i)
-
- if mo:
- matchResult.addPattern(i.origPattern, view, True, mo.span(0), idx)
- if bodyOrRendering:
- knownPositivePatterns.add(i.origPattern)
- if test.isHard:
- verdict = True
- break
- else:
- counter -= 1
- else:
- matchResult.addPattern(i.origPattern, view, False, testIdx = idx)
- else:
- # only run the properties with matching recipientPattern
- if i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None:
- continue
-
- result, cpuTime = i.run(msg)
- cpuTimes.append(cpuTime)
- testList.append(i)
-
- if result is not False:
- if result is True: result = None
- matchResult.addProperty(i.__class__.__name__, True, result, idx)
- if test.isHard:
- verdict = True
- break
- else:
- counter -= 1
- else:
- matchResult.addProperty(i.__class__.__name__, False, testIdx = idx)
-
- if counter == 0:
- verdict = True
- break
-
- if verdict is None: verdict = False
-
- # update the statistics of each test
- for m, cpuTime, test in zip(matchResult, cpuTimes, testList):
- if m.isProperty:
- test.testRecord.addOne(verdict == m.isPositive, cpuTime, m.isPositive)
- else:
- test.testRecords[m.view].addOne(verdict == m.isPositive,
- cpuTime, m.isPositive)
-
- elif self.mode == MATCHER_MODE_ALL:
-
- # MATCHER_MODE_ALL:
- # 1. We don't distinguis hard/soft tests in this mode.
- # 2. We don't update the statistics of each test.
-
- patternConditions = {} # to cache the results of testing pattern conditions
-
- for idx, test in enumerate(self.tests):
- if not test.isOn: continue
-
- i = test.propertyOrPattern
- if test.isPattern:
- # only run the patterns if recipientPattern AND encodingPattern both matches
- previousResult = patternConditions.get(i.origPattern)
- if previousResult is None:
- if (i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None)\
- or (i.encodingPattern and i.encodingPattern.search(msg.charsets) is None):
- patternConditions[i.origPattern] = False
- continue
- patternConditions[i.origPattern] = True
- elif previousResult is False:
- continue
-
- # don't run the body/rendering pattern if the pattern is HTML pattern AND this is NOT an HTML message
- if (test.view == VIEW_BODY or test.view == VIEW_RENDERING) and test.isHTML and not msg.isHTML:
- continue
-
- view = test.view
- mo, cpuTime = i.run(msg, view)
- if mo:
- verdict = True
- matchResult.addPattern(i.origPattern, view, True, mo.span(0), idx)
- else:
- matchResult.addPattern(i.origPattern, view, False, testIdx = idx)
- else:
-
- # only run the properties with matching recipientPattern
- if i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None:
- continue
-
- result, cpuTime = i.run(msg)
- if result is not False:
- verdict = True
- if result is True: result = None
- matchResult.addProperty(i.__class__.__name__, True, result, idx)
- else:
- matchResult.addProperty(i.__class__.__name__, False, testIdx = idx)
-
- if verdict is None: verdict = False
-
- elif self.mode == MATCHER_MODE_PROPERTIES:
-
- # MATCHER_MODE_PROPERTIES:
- # 1. We don't distinguis hard/soft tests in this mode.
- # 2. We don't update the statistics of each test.
-
- for idx, test in enumerate(self.tests):
- if not test.isOn or test.isPattern: continue
-
- i = test.propertyOrPattern
-
- # only run the properties with matching recipientPattern
- if i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None:
- continue
-
- result, cpuTime = i.run(msg)
- if result is not False:
- verdict = True
- if result is True: result = None
- matchResult.addProperty(i.__class__.__name__, True, result, idx)
- else:
- matchResult.addProperty(i.__class__.__name__, False, testIdx = idx)
-
- if verdict is None: verdict = False
-
- elif self.mode == MATCHER_MODE_PATTERNS:
-
- # MATCHER_MODE_PATTERNS:
- # 1. We don't distinguis hard/soft tests in this mode.
- # 2. We don't update the statistics of each test.
-
- if msg.m is None:
- verdict = None
-
- else:
- patternConditions = {} # to cache the results of testing pattern conditions
-
- # we don't distinguis hard/soft tests in this mode
- for idx, test in enumerate(self.tests):
- if not test.isOn or not test.isPattern: continue
-
- i = test.propertyOrPattern
-
- # only run the patterns if recipientPattern AND encodingPattern both matches
- previousResult = patternConditions.get(i.origPattern)
- if previousResult is None:
- if (i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None)\
- or (i.encodingPattern and i.encodingPattern.search(msg.charsets) is None):
- patternConditions[i.origPattern] = False
- continue
- patternConditions[i.origPattern] = True
- elif previousResult is False:
- continue
-
- # don't run the body/rendering pattern if the pattern is HTML pattern AND this is NOT an HTML message
- if (test.view == VIEW_BODY or test.view == VIEW_RENDERING) and test.isHTML and not msg.isHTML:
- continue
-
- view = test.view
- mo, cpuTime = i.run(msg, view)
- if mo:
- verdict = True
- matchResult.addPattern(i.origPattern, view, True, mo.span(0), idx)
- else:
- matchResult.addPattern(i.origPattern, view, False, testIdx = idx)
-
- if verdict is None: verdict = False
-
- matchResult.setVerdict(verdict)
-
- if self.updateStats:
- # update log and emailDB
- k = globalObjects.emailDB.addEntry(msg.msgSrc)
- globalObjects.logger.info(k, msg, matchResult)
-
- if verdict is True and msg.m:
- msg.addSites()
-
- return matchResult
-
- def getMatchResultStrings (self, msg, matchResult):
- sList = []
- properties = self.tests.properties
- for m in filter(lambda m: m.isPositive, matchResult):
- if hasattr(m, 'info'):
- if m.isProperty:
- try:
- sList.append('- %s: %s' % (properties[m.idStr].name, m.info))
- except:
- sList.append('- %s' % properties[m.idStr].name)
- else:
- sList.append('- Pattern "%s" matched in view "%s": "%s"' %
- (m.idStr, m.view, getattr(msg, m.view)[m.info[0]:m.info[1]]))
- else:
- sList.append('- %s' % properties[m.idStr].name)
-
- return sList
-
- def recycleLog (self):
- globalObjects.emailDB.recycle()
- globalObjects.logger.recycle()
-
- try:
- os.remove('%scorrections' % CONF_PATH)
- except:
- pass
-
- def recycleLogWhenItsDue (self):
- """Recycles log/emailDB if it's due; returns True iff the recycling did happen."""
- try:
- f = open('%sjm.log' % CONF_PATH)
- except:
- # file might not have been created
- return False
-
- l = f.readline().rstrip()
- if len(l) == 0: return False
-
- try:
- firstDate = __import__('cPickle').loads(f.read(int(l)))[0]
- except Exception, e:
- printException('Exception when trying to load the 1st entry of jm.log', e)
- return False
-
- try:
- delta = datetime.datetime.utcnow() - firstDate
- except Exception, e:
- printException('Exception when determining the date of the 1st log entry', e)
- return False
-
- if delta.days > globalObjects.prefs.recycleDays:
- self.recycleLog()
- NSLog(u'JunkMatcher Log is %d day(s) old; recycled.' % delta.days)
- return True
-
- return False
-
- def finalize (self):
- """Call this before the Matcher object is out of commission."""
- self.tests.properties.writeToFile()
- self.tests.patterns.writeToFile()
- globalObjects.siteDB.writeToFile()
-
-
- if __name__ == '__main__':
- import sys
-
- def showHelp ():
- print '* Matcher Shell commands:'
- print ' - "?": to show this list again.'
- print ' - "d": to display Matcher settings.'
- print ' - "m <mode #> [arg]": to change mode; arg is optional.'
- print ' - "f <msgFN>": to match a message in file msgFN.'
- print ' - "s <msgFN>": to show the relevant content of an email.'
- print ' - "q": to quit.'
- print ' (<msgFN> can be optionally surrounded by double quotes)'
- print
-
- if len(sys.argv) == 1:
- print '* Logging is off - add a second argument True to turn logging on.'
- matcher = Matcher(False)
- elif len(sys.argv) == 2:
- if sys.argv[1] == 'True':
- print '* Logging is on.'
- matcher = Matcher(True)
- else:
- print '* Logging is off - add a second argument True to turn logging on.'
- matcher = Matcher(False)
- else:
- print '* Usage: ./Matcher.py <logFlag>'
- print ' Set logFlag to True to enable loggin; omitting it will turn logging off.'
- sys.exit(1)
-
- showHelp()
-
- while True:
- try:
- cmd = raw_input('> ').strip()
- except EOFError:
- print
- break
- except Exception, e:
- print e
- break
-
- if cmd == '':
- continue
-
- elif cmd == 'exit' or cmd == 'q' or cmd == 'quit':
- break
-
- elif cmd[0] == '?':
- showHelp()
-
- elif cmd[0] == 'd':
- if matcher.mode == MATCHER_MODE_LINEAR:
- print '* Matcher mode: %d (numTests = %d)' % (matcher.mode, matcher.numTests)
- else:
- print '* Matcher mode: %d' % matcher.mode
- print '* Update stats:', matcher.updateStats
-
- elif cmd[0] == 'm':
- if len(cmd) > 1:
- cmd = cmd[2:].strip().split(' ')
- try:
- mode = int(cmd[0])
- if mode < 0 or mode > 3: raise Exception()
-
- if len(cmd) > 1:
- try:
- matcher.setMode(mode, cmd[-1])
- except:
- print '* [arg] is an integer...'
- else:
- matcher.setMode(mode)
- except:
- print '* <mode #> is an integer 0 - 3...'
- else:
- print '* Missing <mode #> [arg] ...'
-
- elif cmd[0] == 'f':
- # match a file
- if len(cmd) > 1:
- msgFN = cmd[2:].strip()
- if msgFN[0] == '"': msgFN = msgFN[1:-1]
- try:
- msgSrc = open(os.path.expanduser(msgFN)).read()
- except:
- print '* Cannot find file "%s"...' % msgFN
- continue
-
- print '* old SiteDB size:', globalObjects.siteDB.size()
- msg = Message(msgSrc)
- matchResult = matcher.run(msg)
- print '* Verdict on %s: %s' % (msgFN, matchResult.verdict)
- print '* new SiteDB size:', globalObjects.siteDB.size()
-
- # print all positive tests
- print encodeText('\n'.join(matcher.getMatchResultStrings(msg, matchResult)))
-
- else:
- print '* Missing <msgFN>...'
-
- elif cmd[0] == 's':
- if len(cmd) > 1:
- msgFN = cmd[2:].strip()
- if msgFN[0] == '"': msgFN = msgFN[1:-1]
- try:
- Message(open(os.path.expanduser(msgFN)).read()).show()
- except:
- print '* Cannot find file "%s"...' % msgFN
- else:
- print '* Missing <msgFN>...'
-
- else:
- print '* Unknown command "%s"' % cmd
-