Personal Computer World 2008 February

home *** CD-ROM | disk | FTP | other *** search

/ Personal Computer World 2008 February / PCWFEB08.iso / Software / Freeware / Miro 1.0 / Miro_Installer.exe / xulrunner / python / flashscraper.py < prev next >

Wrap

Python Source | 2007-11-12 | 8.2 KB | 223 lines

# Miro - an RSS based video player application
# Copyright (C) 2005-2007 Participatory Culture Foundation
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

"""Turn Flash video player URLs into direct video URLs.

The single public entry point is tryScrapingURL().  It matches a URL
against the patterns in scraperInfoMap and, when one matches, hands the
URL to the associated site-specific scraper.  Every scraper reports its
result by calling the callback it was given exactly once, with the
scraped URL or with None when scraping failed.
"""

import re
import httpclient
import urlparse
import cgi
from xml.dom import minidom
from urllib import unquote_plus

from util import checkU, returnsUnicode

# =============================================================================

def tryScrapingURL(url, callback):
    """Scrape url if a scraper is registered for it.

    When a pattern in scraperInfoMap matches, the scraper's result is
    delivered as callback(newURL, contentType=u"video/x-flv"), where
    newURL is None if scraping failed.  When no pattern matches, the
    URL is passed through unchanged as callback(url).
    """
    checkU(url)
    scrape = _getScrapeFunctionFor(url)
    if scrape is None:
        callback(url)
        return
    scrape(url, lambda newURL: _actualURLCallback(url, callback, newURL))

# =============================================================================

# The callback is wrapped in this for flv videos
def _actualURLCallback(url, callback, newURL):
    """Forward a scraper result to callback, tagged as an FLV video.

    newURL is validated with checkU() only when it is truthy; a failed
    scrape (None) is forwarded as is.
    """
    if newURL:
        checkU(newURL)
    callback(newURL, contentType=u"video/x-flv")

def _getScrapeFunctionFor(url):
    """Return the scraper of the first scraperInfoMap entry whose pattern
    matches the start of url, or None when no entry matches.
    """
    checkU(url)
    for scrapeInfo in scraperInfoMap:
        # re.match() uses the re module's compiled-pattern cache, so the
        # patterns are not recompiled on every lookup.
        if re.match(scrapeInfo['pattern'], url) is not None:
            return scrapeInfo['func']
    return None

def _queryParams(url):
    """Return the query-string parameters of url as parsed by
    cgi.parse_qs(): a dict mapping each name to a list of values.
    """
    return cgi.parse_qs(urlparse.urlsplit(url)[3])

def _scrapeYouTubeURL(url, callback):
    """Request the headers of url; the redirect target is then scraped
    by _youTubeCallback.
    """
    checkU(url)
    httpclient.grabHeaders(url,
                           lambda info: _youTubeCallback(info, callback),
                           lambda err: _youTubeErrback(err, callback))

def _youTubeCallback(info, callback):
    """Build the get_video.php URL from the 'video_id' and 't' query
    parameters of the redirected URL and pass it to callback (None on
    failure).
    """
    url = info['redirected-url']
    # Only the parsing is guarded: an error raised by callback itself must
    # not be reported as a scrape failure nor trigger a second callback.
    try:
        params = _queryParams(url)
        videoID = params['video_id'][0]
        t = params['t'][0]
        newURL = u"http://youtube.com/get_video.php?video_id=%s&t=%s" % (videoID, t)
    except Exception:
        print("DTV: WARNING, unable to scrape You Tube Video URL: %s" % url)
        newURL = None
    callback(newURL)

def _youTubeErrback(err, callback):
    """Report a network failure while scraping YouTube; calls callback(None)."""
    print("DTV: WARNING, network error scraping You Tube Video URL")
    callback(None)

def _scrapeGoogleVideoURL(url, callback):
    """Build the Google Video .flv URL from the 'docId' query parameter
    of url and pass it to callback (None on failure).
    """
    try:
        docId = _queryParams(url)['docId'][0]
        newURL = u"http://video.google.com/videofile/%s.flv?docid=%s&itag=5" % (docId, docId)
    except Exception:
        print("DTV: WARNING, unable to scrape Google Video URL: %s" % url)
        newURL = None
    callback(newURL)

def _scrapeLuLuVideoURL(url, callback):
    """Pass the unquoted 'file' query parameter of url to callback
    (None on failure).
    """
    try:
        params = _queryParams(url)
        newURL = unquote_plus(params['file'][0]).decode('ascii', 'replace')
    except Exception:
        print("DTV: WARNING, unable to scrape LuLu.tv Video URL: %s" % url)
        newURL = None
    callback(newURL)

def _scrapeVMixVideoURL(url, callback):
    """Build the VMix videos.php URL from the 'type', 'id' and 'l' query
    parameters of url and fetch it; the response is handled by
    _scrapeVMixCallback.  Calls callback(None) if a parameter is missing.
    """
    try:
        params = _queryParams(url)
        t = params['type'][0]
        ID = params['id'][0]
        l = params['l'][0]
        infoURL = u"http://sdstage01.vmix.com/videos.php?type=%s&id=%s&l=%s" % (t, ID, l)
    except Exception:
        print("DTV: WARNING, unable to scrape VMix Video URL: %s" % url)
        callback(None)
        return
    # Started outside the try block, like the grabHeaders() calls of the
    # other scrapers, so the handler above only covers URL parsing.
    httpclient.grabURL(infoURL,
                       lambda info: _scrapeVMixCallback(info, callback),
                       lambda err: _scrapeVMixErrback(err, callback))

def _scrapeVMixCallback(info, callback):
    """Parse the response body as XML and pass the text of the first
    <file> element to callback (None on failure).
    """
    try:
        doc = minidom.parseString(info['body'])
        fileNode = doc.getElementsByTagName('file').item(0)
        newURL = fileNode.firstChild.data.decode('ascii', 'replace')
    except Exception:
        print("DTV: WARNING, unsable to scrape XML for VMix Video URL %s" % info['redirected-url'])
        newURL = None
    callback(newURL)

def _scrapeVMixErrback(err, callback):
    """Report a network failure while scraping VMix; calls callback(None)."""
    print("DTV: WARNING, network error scraping VMix Video URL")
    callback(None)

def _scrapeDailyMotionVideoURL(url, callback):
    """Request the headers of url; the redirect target is then scraped
    by _scrapeDailyMotionCallback.
    """
    httpclient.grabHeaders(url,
                           lambda info: _scrapeDailyMotionCallback(info, callback),
                           lambda err: _scrapeDailyMotionErrback(err, callback))

def _scrapeDailyMotionCallback(info, callback):
    """Pass the unquoted 'url' query parameter of the redirected URL to
    callback (None on failure).
    """
    url = info['redirected-url']
    try:
        params = _queryParams(url)
        newURL = unquote_plus(params['url'][0]).decode('ascii', 'replace')
    except Exception:
        print("DTV: WARNING, unable to scrape Daily Motion URL: %s" % url)
        newURL = None
    callback(newURL)

def _scrapeDailyMotionErrback(info, callback):
    """Report a network failure while scraping Daily Motion; calls
    callback(None).
    """
    print("DTV: WARNING, network error scraping Daily Motion Video URL")
    callback(None)

def _scrapeVSocialVideoURL(url, callback):
    """Build the VSocial .flv URL from the 'v' query parameter of url
    and pass it to callback (None on failure).
    """
    try:
        v = _queryParams(url)['v'][0]
        newURL = u'http://static.vsocial.com/varmedia/vsocial/flv/%s_out.flv' % v
    except Exception:
        print("DTV: WARNING, unable to scrape VSocial URL: %s" % url)
        newURL = None
    callback(newURL)

def _scrapeVeohTVVideoURL(url, callback):
    """Build the Veoh movieList.html URL from the 'type' and
    'permalinkId' query parameters of url and fetch it; the response is
    handled by _scrapeVeohTVCallback.  Calls callback(None) if a
    parameter is missing.
    """
    try:
        params = _queryParams(url)
        t = params['type'][0]
        permalinkId = params['permalinkId'][0]
        listURL = u'http://www.veoh.com/movieList.html?type=%s&permalinkId=%s&numResults=45' % (t, permalinkId)
    except Exception:
        print("DTV: WARNING, unable to scrape Veoh URL: %s" % url)
        callback(None)
        return
    # Started outside the try block, like the grabHeaders() calls of the
    # other scrapers, so the handler above only covers URL parsing.
    httpclient.grabURL(listURL,
                       lambda info: _scrapeVeohTVCallback(info, callback),
                       lambda err: _scrapeVeohTVErrback(err, callback))

def _scrapeVeohTVCallback(info, callback):
    """Parse the response body as a query string, take 'previewHashLow'
    (minus one trailing comma, if present) and pass the resulting
    preview URL to callback (None on failure).
    """
    url = info['redirected-url']
    try:
        params = cgi.parse_qs(info['body'])
        fileHash = params['previewHashLow'][0]
        if fileHash.endswith(","):
            fileHash = fileHash[:-1]
        newURL = u'http://ll-previews.veoh.com/previews/get.jsp?fileHash=%s' % fileHash
    except Exception:
        print("DTV: WARNING, unable to scrape Veoh URL data: %s" % url)
        newURL = None
    callback(newURL)

def _scrapeVeohTVErrback(err, callback):
    """Report a network failure while scraping Veoh; calls callback(None)."""
    print("DTV: WARNING, network error scraping Veoh TV Video URL")
    callback(None)

def _scrapeBreakVideoURL(url, callback):
    """Request the headers of url; the redirect target is then scraped
    by _scrapeBreakCallback.
    """
    httpclient.grabHeaders(url,
                           lambda info: _scrapeBreakCallback(info, callback),
                           lambda err: _scrapeBreakErrback(err, callback))

def _scrapeBreakCallback(info, callback):
    """Pass the unquoted 'sVidLoc' query parameter of the redirected URL
    to callback (None on failure).
    """
    url = info['redirected-url']
    try:
        params = _queryParams(url)
        newURL = unquote_plus(params['sVidLoc'][0]).decode('ascii', 'replace')
    except Exception:
        print("DTV: WARNING, unable to scrape Break URL: %s" % url)
        newURL = None
    callback(newURL)

def _scrapeBreakErrback(info, callback):
    """Report a network failure while scraping Break; calls callback(None)."""
    print("DTV: WARNING, network error scraping Break Video URL")
    callback(None)

def _scrapeGreenPeaceVideoURL(url, callback):
    """Placeholder scraper: always warns and calls callback(None)."""
    print("DTV: Warning, unable to scrape Green peace Video URL %s" % url)
    callback(None)

# =============================================================================

# Tried in order by _getScrapeFunctionFor(); the first pattern that matches
# the start of the URL wins.  Patterns are raw strings with literal dots
# escaped so that "." only matches an actual dot.
scraperInfoMap = [
    {'pattern': r'http://([^/]+\.)?youtube\.com/(?!get_video\.php)',
     'func': _scrapeYouTubeURL},
    {'pattern': r'http://video\.google\.com/googleplayer\.swf',
     'func': _scrapeGoogleVideoURL},
    {'pattern': r'http://([^/]+\.)?lulu\.tv/wp-content/flash_play/flvplayer',
     'func': _scrapeLuLuVideoURL},
    {'pattern': r'http://([^/]+\.)?vmix\.com/flash/super_player\.swf',
     'func': _scrapeVMixVideoURL},
    {'pattern': r'http://([^/]+\.)?dailymotion\.com/swf',
     'func': _scrapeDailyMotionVideoURL},
    {'pattern': r'http://([^/]+\.)?vsocial\.com/flash/vp\.swf',
     'func': _scrapeVSocialVideoURL},
    {'pattern': r'http://([^/]+\.)?veoh\.com/multiplayer\.swf',
     'func': _scrapeVeohTVVideoURL},
    {'pattern': r'http://([^/]+\.)?greenpeaceweb\.org/GreenpeaceTV1Col\.swf',
     'func': _scrapeGreenPeaceVideoURL},
    {'pattern': r'http://([^/]+\.)?break\.com/',
     'func': _scrapeBreakVideoURL},
]