MacAddict 108

home *** CD-ROM | disk | FTP | other *** search

/ MacAddict 108 / MacAddict108.iso / Software / Internet & Communication / JunkMatcher 1.5.5.dmg / JunkMatcher.app / Contents / Resources / Engine / Matcher.py < prev next >

Wrap

Python Source | 2005-06-01 | 22.0 KB | 512 lines

# # Matcher.py # JunkMatcher # # Created by Benjamin Han on 2/1/05. # Copyright (c) 2005 Benjamin Han. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #!/usr/bin/env python from consts import * from Tests import * from MatchResult import * MATCHER_MODE_ALL = 0 MATCHER_MODE_PROPERTIES = 1 MATCHER_MODE_PATTERNS = 2 MATCHER_MODE_LINEAR = 3 class Matcher: def __init__ (self, updateStats = True): # sets up self.tests # all mentioned files are REQUIRED, EXCEPT for recipientPatterns and safeIPs self.tests = Tests(Properties('%sproperties' % CONF_PATH), Patterns('%spatterns' % CONF_PATH), '%stests' % CONF_PATH) self.mode = globalObjects.prefs.mode if self.mode == MATCHER_MODE_LINEAR: self.numTests = int(globalObjects.prefs.modeArgs[0]) self.updateStats = updateStats def setMode (self, mode, *args): """mode must be one of the MATCHER_MODE* defined above; each element in args must be a string.""" self.mode = mode # TO-DO: for now args is needed only for MATCHER_MODE_LINEAR. if mode == MATCHER_MODE_LINEAR: if len(args): self.numTests = int(args[0]) else: self.numTests = 1 def run (self, msg): matchResult = MatchResult() verdict = None # meaning we haven't reached a verdict # whitelisting is on for ALL modes if msg.m: verdict = globalObjects.whitelist.search(msg.sender) if verdict is not None: # message whitelisted, but we may want to do some testing # for now it's only for PropertyPhishingURL # do testing if the message is in HTML and users specified that they want to check # whitelisted emails against PropertyPhishingURL if msg.isHTML and self.tests.properties[u'PropertyPhishingURL'].checkWhitelistedEmail: # find the test for PropertyPhishingURL idx, t = filter(lambda i: not i[1].isPattern and i[1].propertyOrPattern.__class__.__name__ == u'PropertyPhishingURL', enumerate(self.tests))[0] if t.isOn: # only run the properties with matching recipientPattern r = t.propertyOrPattern if not r.recipientPattern or \ r.recipientPattern.search('\n'.join(msg.decodedRecipients)): result, cpuTime = r.run(msg) if self.mode == MATCHER_MODE_LINEAR: # this is the only mode where we care hard tests and update statistics if result is not False: matchResult.addProperty(r.__class__.__name__, True, result, idx) if t.isHard or counter == 1: verdict = True else: matchResult.addProperty(r.__class__.__name__, False, testIdx = idx) if self.updateStats: m = matchResult[0] r.testRecord.addOne(verdict == m.isPositive, cpuTime, m.isPositive) elif self.mode != MATCHER_MODE_PATTERNS: if result is not False: matchResult.addProperty(r.__class__.__name__, True, result, idx) verdict = True else: matchResult.addProperty(r.__class__.__name__, False, testIdx = idx) if verdict is None: if msg.m: decodedRecipients = '\n'.join(msg.decodedRecipients) # NOTE we allow malformed messages to proceed, til they hit the very first property test # for ALL modes: # 1. apply tests only when they match the specified recipientPattern; # 2. apply body/rendering HTML patterns only when the message is in HTML. if self.mode == MATCHER_MODE_LINEAR: # MATCHER_MODE_LINEAR: # 1. Apply tests only when they match the specified encodingPattern; # 2. Stop running the tests immediately when a hard test gives a positive result. # 3. Actually update the statistics of each test knownPositivePatterns = sets.Set() patternConditions = {} # to cache the results of testing pattern conditions counter = self.numTests cpuTimes = [] # for recording the CPU time spent on each test testList = [] # remember what tests we actually performed for idx, test in enumerate(self.tests): if not test.isOn: continue i = test.propertyOrPattern if test.isPattern: # only run the patterns if recipientPattern AND encodingPattern both matches previousResult = patternConditions.get(i.origPattern) if previousResult is None: if (i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None)\ or (i.encodingPattern and i.encodingPattern.search(msg.charsets) is None): patternConditions[i.origPattern] = False continue patternConditions[i.origPattern] = True elif previousResult is False: continue view = test.view bodyOrRendering = (view == VIEW_BODY or view == VIEW_RENDERING) # don't run the body/rendering pattern if the pattern is HTML pattern AND this is NOT an HTML message if bodyOrRendering and test.isHTML and not msg.isHTML: continue # we don't want to run same patterns in both body and rendering # because we don't want to doubly penalize a message (body and rendering are similar in nature) if not bodyOrRendering or not i.origPattern in knownPositivePatterns: mo, cpuTime = i.run(msg, view) cpuTimes.append(cpuTime) testList.append(i) if mo: matchResult.addPattern(i.origPattern, view, True, mo.span(0), idx) if bodyOrRendering: knownPositivePatterns.add(i.origPattern) if test.isHard: verdict = True break else: counter -= 1 else: matchResult.addPattern(i.origPattern, view, False, testIdx = idx) else: # only run the properties with matching recipientPattern if i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None: continue result, cpuTime = i.run(msg) cpuTimes.append(cpuTime) testList.append(i) if result is not False: if result is True: result = None matchResult.addProperty(i.__class__.__name__, True, result, idx) if test.isHard: verdict = True break else: counter -= 1 else: matchResult.addProperty(i.__class__.__name__, False, testIdx = idx) if counter == 0: verdict = True break if verdict is None: verdict = False # update the statistics of each test for m, cpuTime, test in zip(matchResult, cpuTimes, testList): if m.isProperty: test.testRecord.addOne(verdict == m.isPositive, cpuTime, m.isPositive) else: test.testRecords[m.view].addOne(verdict == m.isPositive, cpuTime, m.isPositive) elif self.mode == MATCHER_MODE_ALL: # MATCHER_MODE_ALL: # 1. We don't distinguis hard/soft tests in this mode. # 2. We don't update the statistics of each test. patternConditions = {} # to cache the results of testing pattern conditions for idx, test in enumerate(self.tests): if not test.isOn: continue i = test.propertyOrPattern if test.isPattern: # only run the patterns if recipientPattern AND encodingPattern both matches previousResult = patternConditions.get(i.origPattern) if previousResult is None: if (i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None)\ or (i.encodingPattern and i.encodingPattern.search(msg.charsets) is None): patternConditions[i.origPattern] = False continue patternConditions[i.origPattern] = True elif previousResult is False: continue # don't run the body/rendering pattern if the pattern is HTML pattern AND this is NOT an HTML message if (test.view == VIEW_BODY or test.view == VIEW_RENDERING) and test.isHTML and not msg.isHTML: continue view = test.view mo, cpuTime = i.run(msg, view) if mo: verdict = True matchResult.addPattern(i.origPattern, view, True, mo.span(0), idx) else: matchResult.addPattern(i.origPattern, view, False, testIdx = idx) else: # only run the properties with matching recipientPattern if i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None: continue result, cpuTime = i.run(msg) if result is not False: verdict = True if result is True: result = None matchResult.addProperty(i.__class__.__name__, True, result, idx) else: matchResult.addProperty(i.__class__.__name__, False, testIdx = idx) if verdict is None: verdict = False elif self.mode == MATCHER_MODE_PROPERTIES: # MATCHER_MODE_PROPERTIES: # 1. We don't distinguis hard/soft tests in this mode. # 2. We don't update the statistics of each test. for idx, test in enumerate(self.tests): if not test.isOn or test.isPattern: continue i = test.propertyOrPattern # only run the properties with matching recipientPattern if i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None: continue result, cpuTime = i.run(msg) if result is not False: verdict = True if result is True: result = None matchResult.addProperty(i.__class__.__name__, True, result, idx) else: matchResult.addProperty(i.__class__.__name__, False, testIdx = idx) if verdict is None: verdict = False elif self.mode == MATCHER_MODE_PATTERNS: # MATCHER_MODE_PATTERNS: # 1. We don't distinguis hard/soft tests in this mode. # 2. We don't update the statistics of each test. if msg.m is None: verdict = None else: patternConditions = {} # to cache the results of testing pattern conditions # we don't distinguis hard/soft tests in this mode for idx, test in enumerate(self.tests): if not test.isOn or not test.isPattern: continue i = test.propertyOrPattern # only run the patterns if recipientPattern AND encodingPattern both matches previousResult = patternConditions.get(i.origPattern) if previousResult is None: if (i.recipientPattern and i.recipientPattern.search(decodedRecipients) is None)\ or (i.encodingPattern and i.encodingPattern.search(msg.charsets) is None): patternConditions[i.origPattern] = False continue patternConditions[i.origPattern] = True elif previousResult is False: continue # don't run the body/rendering pattern if the pattern is HTML pattern AND this is NOT an HTML message if (test.view == VIEW_BODY or test.view == VIEW_RENDERING) and test.isHTML and not msg.isHTML: continue view = test.view mo, cpuTime = i.run(msg, view) if mo: verdict = True matchResult.addPattern(i.origPattern, view, True, mo.span(0), idx) else: matchResult.addPattern(i.origPattern, view, False, testIdx = idx) if verdict is None: verdict = False matchResult.setVerdict(verdict) if self.updateStats: # update log and emailDB k = globalObjects.emailDB.addEntry(msg.msgSrc) globalObjects.logger.info(k, msg, matchResult) if verdict is True and msg.m: msg.addSites() return matchResult def getMatchResultStrings (self, msg, matchResult): sList = [] properties = self.tests.properties for m in filter(lambda m: m.isPositive, matchResult): if hasattr(m, 'info'): if m.isProperty: try: sList.append('- %s: %s' % (properties[m.idStr].name, m.info)) except: sList.append('- %s' % properties[m.idStr].name) else: sList.append('- Pattern "%s" matched in view "%s": "%s"' % (m.idStr, m.view, getattr(msg, m.view)[m.info[0]:m.info[1]])) else: sList.append('- %s' % properties[m.idStr].name) return sList def recycleLog (self): globalObjects.emailDB.recycle() globalObjects.logger.recycle() try: os.remove('%scorrections' % CONF_PATH) except: pass def recycleLogWhenItsDue (self): """Recycles log/emailDB if it's due; returns True iff the recycling did happen.""" try: f = open('%sjm.log' % CONF_PATH) except: # file might not have been created return False l = f.readline().rstrip() if len(l) == 0: return False try: firstDate = __import__('cPickle').loads(f.read(int(l)))[0] except Exception, e: printException('Exception when trying to load the 1st entry of jm.log', e) return False try: delta = datetime.datetime.utcnow() - firstDate except Exception, e: printException('Exception when determining the date of the 1st log entry', e) return False if delta.days > globalObjects.prefs.recycleDays: self.recycleLog() NSLog(u'JunkMatcher Log is %d day(s) old; recycled.' % delta.days) return True return False def finalize (self): """Call this before the Matcher object is out of commission.""" self.tests.properties.writeToFile() self.tests.patterns.writeToFile() globalObjects.siteDB.writeToFile() if __name__ == '__main__': import sys def showHelp (): print '* Matcher Shell commands:' print ' - "?": to show this list again.' print ' - "d": to display Matcher settings.' print ' - "m <mode #> [arg]": to change mode; arg is optional.' print ' - "f <msgFN>": to match a message in file msgFN.' print ' - "s <msgFN>": to show the relevant content of an email.' print ' - "q": to quit.' print ' (<msgFN> can be optionally surrounded by double quotes)' print if len(sys.argv) == 1: print '* Logging is off - add a second argument True to turn logging on.' matcher = Matcher(False) elif len(sys.argv) == 2: if sys.argv[1] == 'True': print '* Logging is on.' matcher = Matcher(True) else: print '* Logging is off - add a second argument True to turn logging on.' matcher = Matcher(False) else: print '* Usage: ./Matcher.py <logFlag>' print ' Set logFlag to True to enable loggin; omitting it will turn logging off.' sys.exit(1) showHelp() while True: try: cmd = raw_input('> ').strip() except EOFError: print break except Exception, e: print e break if cmd == '': continue elif cmd == 'exit' or cmd == 'q' or cmd == 'quit': break elif cmd[0] == '?': showHelp() elif cmd[0] == 'd': if matcher.mode == MATCHER_MODE_LINEAR: print '* Matcher mode: %d (numTests = %d)' % (matcher.mode, matcher.numTests) else: print '* Matcher mode: %d' % matcher.mode print '* Update stats:', matcher.updateStats elif cmd[0] == 'm': if len(cmd) > 1: cmd = cmd[2:].strip().split(' ') try: mode = int(cmd[0]) if mode < 0 or mode > 3: raise Exception() if len(cmd) > 1: try: matcher.setMode(mode, cmd[-1]) except: print '* [arg] is an integer...' else: matcher.setMode(mode) except: print '* <mode #> is an integer 0 - 3...' else: print '* Missing <mode #> [arg] ...' elif cmd[0] == 'f': # match a file if len(cmd) > 1: msgFN = cmd[2:].strip() if msgFN[0] == '"': msgFN = msgFN[1:-1] try: msgSrc = open(os.path.expanduser(msgFN)).read() except: print '* Cannot find file "%s"...' % msgFN continue print '* old SiteDB size:', globalObjects.siteDB.size() msg = Message(msgSrc) matchResult = matcher.run(msg) print '* Verdict on %s: %s' % (msgFN, matchResult.verdict) print '* new SiteDB size:', globalObjects.siteDB.size() # print all positive tests print encodeText('\n'.join(matcher.getMatchResultStrings(msg, matchResult))) else: print '* Missing <msgFN>...' elif cmd[0] == 's': if len(cmd) > 1: msgFN = cmd[2:].strip() if msgFN[0] == '"': msgFN = msgFN[1:-1] try: Message(open(os.path.expanduser(msgFN)).read()).show() except: print '* Cannot find file "%s"...' % msgFN else: print '* Missing <msgFN>...' else: print '* Unknown command "%s"' % cmd