#!/usr/bin/env python
#
# parseURL.py
# JunkMatcher
#
# Created by Benjamin Han on 2/1/05.
# Copyright (c) 2005 Benjamin Han. All rights reserved.
#

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

import re
import urlparse

from consts import *
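# Note: httpPat (used below to pull an embedded 'http(s):...' URL out of a
# path or query string) is expected to come from consts. Its exact definition
# is not shown in this file; a hypothetical stand-in, only for reading this
# module on its own, might look something like re.compile(r'https?:\S+').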


# Prefixes of the Yahoo and MSN redirector hosts handled by tricks #2 and #3
# below (e.g. 'rd.', 'rds.', 'drs.', 'eur.rd.', 'srd.' for Yahoo; 'g.', 'ads.' for MSN).
_yahooPrefixPat = re.compile(r'(?i)(?:\w+\.)?s?rds?\.|drs?\.')
_msnPrefixPat = re.compile(r'(?i)g|ads')


def parseURL (urlStr):
    """Given an unquoted URL string in lowercase, returns the real site targeted
    (a tuple of name components, e.g., ('www','yahoo','com')); returns None if
    parsing fails or no site is found.

    ASSUMPTION: urlStr starts with 'http:' or 'https:'
    """

    # make sure we have '//' following 'http:' or 'https:'
    i = urlStr.index(':')
    j = i + 2
    if j < len(urlStr):
        if urlStr[j] != '/':
            if urlStr[j - 1] == '/':
                urlStr = '%s://%s' % (urlStr[:i], urlStr[j:])
            else:
                urlStr = '%s://%s' % (urlStr[:i], urlStr[i + 1:])
    else:
        return None
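    # (The branches above rewrite malformed prefixes such as 'http:/site' and
    # 'http:site' into 'http://site' before urlsplit() is called.)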

    t = urlparse.urlsplit(urlStr)

    site = t[1].split('@')[-1]

    # trick #1 - google:
    # urlparse.urlsplit('http://www.google.com/url?q=http://www.spammer.biz')
    # -> ('http', 'www.google.com', '/url', 'q=http://www.spammer.biz', '')
    if site == 'www.google.com' and t[2] == '/url':
        mo = httpPat.search(t[3])
        if mo:
            return parseURL(mo.group(0))

    # trick #2 - yahoo:
    # urlparse.urlsplit('http://rd.yahoo.com/some/junk/*http://my.real.site/hahaha.html')
    # -> ('http', 'rd.yahoo.com', '/some/junk/*http://my.real.site/hahaha.html', '', '')
    # Note: 'rd.yahoo.com' can be replaced with 'drs.yahoo.com', 'eur.rd.yahoo.com' or 'srd.yahoo.com'
    elif site.endswith('yahoo.com') and _yahooPrefixPat.match(site[:-9]) is not None:
        mo = httpPat.search(t[2])
        if mo:
            return parseURL(mo.group(0))

    # trick #3 - msn:
    # urlparse.urlsplit('http://g.msn.com/0US!s5.31472_315529/HP.1001?http://auto-warranty-quotes.com/st.html')
    # -> ('http', 'g.msn.com', '/0US!s5.31472_315529/HP.1001', 'http://auto-warranty-quotes.com/st.html', '')
    # Note: 'g.msn.com' can be replaced with 'ads.msn.com'
    elif site.endswith('msn.com') and _msnPrefixPat.match(site[:-7]) is not None:
        mo = httpPat.search(t[3])
        if mo:
            return parseURL(mo.group(0))

    # For some reason urlsplit() doesn't separate the query from the site in this:
    # 'http://www.myaffordablepills.com?rid=1000'
    site = site.split(':')[0].split('?')[0].strip()

    # some rudimentary check of valid domain names
    if len(site):
        l = site.split('.')
        if len(l[0]) == 0: del l[0]
        if len(l[-1]) == 0: del l[-1]
        if len(filter(lambda comp: len(comp) == 0, l)):
            # can't have empty components
            return None
        if len(l) > 1: return tuple(l)

    return None


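# A minimal sketch, not part of the original module, for the IDN TO-DO noted
# in the tests below: punycode ('xn--') labels could be decoded with Python's
# built-in 'idna' codec before the name is split into components. The helper
# name is hypothetical and parseURL() does not call it.
def _decodeIDN (site):
    """Decodes punycode labels in a hostname; returns the input unchanged on failure."""
    try:
        return site.decode('idna')
    except UnicodeError:
        return site

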
if __name__ == '__main__':
    print parseURL('http://user@www.cwi.nl:80/~guido/Python.html')
    print parseURL('http://www.google.com/url?q=http://www.spammer.biz')
    print parseURL('http://rd.yahoo.com/some/junk/*http://my.real.site1/hahaha.html')
    print parseURL('http://drs.yahoo.com/some/junk/*http://my.real.site2/hahaha.html')
    print parseURL('http://g.msn.com/0US!s5.31472_315529/HP.1001?http://auto-warranty-quotes.com/st.html')
    print parseURL('http://9c6b:MA5U@removerequest.biz/biskit/')
    print parseURL('http://www.fast-lender-search.biz/go')
    print parseURL('http://www.myaffordablepills.com?rid=1000')
    print parseURL('http://61.145.118.245/r13244/stats/clicks.asp?url=http://incrediblemeds.com/sv/index.php?pid=xyz123')
    print parseURL('http://rd.yahoo.com/M=032344.7057012.8084439.0341594/D=yahoo_top/S=3633887:LCC/A=2287395/R=0/*http://www.getitwhileitlast.com/x/index.php?AFF_ID=x0323')
    print parseURL('http://rd.yahoo.com/yearbook/calisthenic/abe/*http:/www.valuen.com/?partid=sf')
    print parseURL('http://rd.yahoo.com/yearbook/calisthenic/abe/*http:www.valuen.com/?partid=sf')
    print parseURL('http://ads.msn.com/ads/adredir.asp?image=/ads/IMGSFS/lwf35ysgdwsf5eo44.gif&url=http://garbage.com')
    print parseURL('https://www.neteller.com/neteller/signup.cfm')
    print parseURL('http://xn--fiq228c.xn--sxqt15hq1b.com') # TO-DO: IDN is not supported right now
    print parseURL('http://rds.yahoo.com/s=4611479/k=computer/v=6/sid=z/l=ws1/r=1/ss=39434998/ipc=us/she=0/h=0/sig=052yfsjo805/exp=355897284/*-http://google.com.grandi0se.net/home.asp')