#!/usr/bin/env python
#
# parseURL.py
# JunkMatcher
#
# Created by Benjamin Han on 2/1/05.
# Copyright (c) 2005 Benjamin Han. All rights reserved.
#

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

import re
import urlparse

from consts import *
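# Note: httpPat (used below to pull an embedded 'http(s):...' URL out of a
# path or query string) is expected to come from consts. Its exact definition
# is not shown in this file; a hypothetical stand-in, only for reading this
# module on its own, might look something like re.compile(r'https?:\S+').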


# Prefixes of the Yahoo and MSN redirector hosts handled by tricks #2 and #3
# below (e.g. 'rd.', 'rds.', 'drs.', 'eur.rd.', 'srd.' for Yahoo; 'g.', 'ads.' for MSN).
_yahooPrefixPat = re.compile(r'(?i)(?:\w+\.)?s?rds?\.|drs?\.')
_msnPrefixPat = re.compile(r'(?i)g|ads')


def parseURL (urlStr):
    """Given an unquoted URL string in lowercase, returns the real site targeted
    (a tuple of name components, e.g., ('www','yahoo','com')); returns None if
    parsing fails or no site is found.

    ASSUMPTION: urlStr starts with 'http:' or 'https:'
    """

    # make sure we have '//' following 'http:' or 'https:'
    i = urlStr.index(':')
    j = i + 2
    if j < len(urlStr):
        if urlStr[j] != '/':
            if urlStr[j - 1] == '/':
                urlStr = '%s://%s' % (urlStr[:i], urlStr[j:])
            else:
                urlStr = '%s://%s' % (urlStr[:i], urlStr[i + 1:])
    else:
        return None
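    # (The branches above rewrite malformed prefixes such as 'http:/site' and
    # 'http:site' into 'http://site' before urlsplit() is called.)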

    t = urlparse.urlsplit(urlStr)

    site = t[1].split('@')[-1]

    # trick #1 - google:
    # urlparse.urlsplit('http://www.google.com/url?q=http://www.spammer.biz')
    # -> ('http', 'www.google.com', '/url', 'q=http://www.spammer.biz', '')
    if site == 'www.google.com' and t[2] == '/url':
        mo = httpPat.search(t[3])
        if mo:
            return parseURL(mo.group(0))

    # trick #2 - yahoo:
    # urlparse.urlsplit('http://rd.yahoo.com/some/junk/*http://my.real.site/hahaha.html')
    # -> ('http', 'rd.yahoo.com', '/some/junk/*http://my.real.site/hahaha.html', '', '')
    # Note: 'rd.yahoo.com' can be replaced with 'drs.yahoo.com', 'eur.rd.yahoo.com' or 'srd.yahoo.com'
    elif site.endswith('yahoo.com') and _yahooPrefixPat.match(site[:-9]) is not None:
        mo = httpPat.search(t[2])
        if mo:
            return parseURL(mo.group(0))

    # trick #3 - msn:
    # urlparse.urlsplit('http://g.msn.com/0US!s5.31472_315529/HP.1001?http://auto-warranty-quotes.com/st.html')
    # -> ('http', 'g.msn.com', '/0US!s5.31472_315529/HP.1001', 'http://auto-warranty-quotes.com/st.html', '')
    # Note: 'g.msn.com' can be replaced with 'ads.msn.com'
    elif site.endswith('msn.com') and _msnPrefixPat.match(site[:-7]) is not None:
        mo = httpPat.search(t[3])
        if mo:
            return parseURL(mo.group(0))

    # For some reason urlsplit() doesn't separate the query from the site in this:
    # 'http://www.myaffordablepills.com?rid=1000'
    site = site.split(':')[0].split('?')[0].strip()

    # some rudimentary check of valid domain names
    if len(site):
        l = site.split('.')
        if len(l[0]) == 0: del l[0]
        if len(l[-1]) == 0: del l[-1]
        if len(filter(lambda comp: len(comp) == 0, l)):
            # can't have empty components
            return None
        if len(l) > 1: return tuple(l)

    return None


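# A minimal sketch, not part of the original module, for the IDN TO-DO noted
# in the tests below: punycode ('xn--') labels could be decoded with Python's
# built-in 'idna' codec before the name is split into components. The helper
# name is hypothetical and parseURL() does not call it.
def _decodeIDN (site):
    """Decodes punycode labels in a hostname; returns the input unchanged on failure."""
    try:
        return site.decode('idna')
    except UnicodeError:
        return site

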
if __name__ == '__main__':
    print parseURL('http://user@www.cwi.nl:80/~guido/Python.html')
    print parseURL('http://www.google.com/url?q=http://www.spammer.biz')
    print parseURL('http://rd.yahoo.com/some/junk/*http://my.real.site1/hahaha.html')
    print parseURL('http://drs.yahoo.com/some/junk/*http://my.real.site2/hahaha.html')
    print parseURL('http://g.msn.com/0US!s5.31472_315529/HP.1001?http://auto-warranty-quotes.com/st.html')
    print parseURL('http://9c6b:MA5U@removerequest.biz/biskit/')
    print parseURL('http://www.fast-lender-search.biz/go')
    print parseURL('http://www.myaffordablepills.com?rid=1000')
    print parseURL('http://61.145.118.245/r13244/stats/clicks.asp?url=http://incrediblemeds.com/sv/index.php?pid=xyz123')
    print parseURL('http://rd.yahoo.com/M=032344.7057012.8084439.0341594/D=yahoo_top/S=3633887:LCC/A=2287395/R=0/*http://www.getitwhileitlast.com/x/index.php?AFF_ID=x0323')
    print parseURL('http://rd.yahoo.com/yearbook/calisthenic/abe/*http:/www.valuen.com/?partid=sf')
    print parseURL('http://rd.yahoo.com/yearbook/calisthenic/abe/*http:www.valuen.com/?partid=sf')
    print parseURL('http://ads.msn.com/ads/adredir.asp?image=/ads/IMGSFS/lwf35ysgdwsf5eo44.gif&url=http://garbage.com')
    print parseURL('https://www.neteller.com/neteller/signup.cfm')
    print parseURL('http://xn--fiq228c.xn--sxqt15hq1b.com') # TO-DO: IDN is not supported right now
    print parseURL('http://rds.yahoo.com/s=4611479/k=computer/v=6/sid=z/l=ws1/r=1/ss=39434998/ipc=us/she=0/h=0/sig=052yfsjo805/exp=355897284/*-http://google.com.grandi0se.net/home.asp')