home *** CD-ROM | disk | FTP | other *** search
/ Chip 2011 November / CHIP_2011_11.iso / Programy / Narzedzia / Calibre / calibre-0.8.18.msi / file_280 / arcamax.recipe < prev    next >
Text File  |  2011-09-09  |  7KB  |  130 lines

  1. #!/usr/bin/env  python
  2.  
  3. __license__   = 'GPL v3'
  4. __copyright__ = 'Copyright 2010 Starson17'
  5. '''
  6. www.arcamax.com
  7. '''
  8. from calibre.web.feeds.news import BasicNewsRecipe
  9. from calibre.ebooks.BeautifulSoup import Tag
  10.  
  11. class Arcamax(BasicNewsRecipe):
  12.     title               = 'Arcamax'
  13.     __author__          = 'Starson17'
  14.     __version__         = '1.04'
  15.     __date__            = '18 April 2011'
  16.     description         = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
  17.     category            = 'news, comics'
  18.     language            = 'en'
  19.     use_embedded_content= False
  20.     no_stylesheets      = True
  21.     remove_javascript   = True
  22.     cover_url           = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'
  23.  
  24.     ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
  25.     num_comics_to_get = 7
  26.     # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS
  27.  
  28.     conversion_options = {'linearize_tables'  : True
  29.                         , 'comment'           : description
  30.                         , 'tags'              : category
  31.                         , 'language'          : language
  32.                         }
  33.  
  34.     keep_only_tags     = [dict(name='div', attrs={'class':['comics-header']}),
  35.                                         dict(name='b', attrs={'class':['current']}),
  36.                                         dict(name='article', attrs={'class':['comic']}),
  37.                                         ]
  38.  
  39.     remove_tags = [dict(name='div', attrs={'id':['comicfull' ]}),
  40.                                dict(name='div', attrs={'class':['calendar' ]}),
  41.                                dict(name='nav', attrs={'class':['calendar-nav' ]}),
  42.                                ]
  43.  
  44.     def parse_index(self):
  45.         feeds = []
  46.         for title, url in [
  47.                             ######## COMICS - GENERAL ########
  48.                             #(u"9 Chickweed Lane", u"http://www.arcamax.com/ninechickweedlane"),
  49.                             #(u"Agnes", u"http://www.arcamax.com/agnes"),
  50.                             #(u"Andy Capp", u"http://www.arcamax.com/andycapp"),
  51.                             (u"BC", u"http://www.arcamax.com/bc"),
  52.                             #(u"Baby Blues", u"http://www.arcamax.com/babyblues"),
  53.                             #(u"Beetle Bailey", u"http://www.arcamax.com/beetlebailey"),
  54.                             (u"Blondie", u"http://www.arcamax.com/blondie"),
  55.                             #u"Boondocks", u"http://www.arcamax.com/boondocks"),
  56.                             #(u"Cathy", u"http://www.arcamax.com/cathy"),
  57.                             #(u"Daddys Home", u"http://www.arcamax.com/daddyshome"),
  58.                             (u"Dilbert", u"http://www.arcamax.com/dilbert"),
  59.                             #(u"Dinette Set", u"http://www.arcamax.com/thedinetteset"),
  60.                             (u"Dog Eat Doug", u"http://www.arcamax.com/dogeatdoug"),
  61.                             (u"Doonesbury", u"http://www.arcamax.com/doonesbury"),
  62.                             #(u"Dustin", u"http://www.arcamax.com/dustin"),
  63.                             (u"Family Circus", u"http://www.arcamax.com/familycircus"),
  64.                             (u"Garfield", u"http://www.arcamax.com/garfield"),
  65.                             #(u"Get Fuzzy", u"http://www.arcamax.com/getfuzzy"),
  66.                             #(u"Girls and Sports", u"http://www.arcamax.com/girlsandsports"),
  67.                             #(u"Hagar the Horrible", u"http://www.arcamax.com/hagarthehorrible"),
  68.                             #(u"Heathcliff", u"http://www.arcamax.com/heathcliff"),
  69.                             #(u"Jerry King Cartoons", u"http://www.arcamax.com/humorcartoon"),
  70.                             #(u"Luann", u"http://www.arcamax.com/luann"),
  71.                             #(u"Momma", u"http://www.arcamax.com/momma"),
  72.                             #(u"Mother Goose and Grimm", u"http://www.arcamax.com/mothergooseandgrimm"),
  73.                             (u"Mutts", u"http://www.arcamax.com/mutts"),
  74.                             #(u"Non Sequitur", u"http://www.arcamax.com/nonsequitur"),
  75.                             #(u"Pearls Before Swine", u"http://www.arcamax.com/pearlsbeforeswine"),
  76.                             #(u"Pickles", u"http://www.arcamax.com/pickles"),
  77.                             #(u"Red and Rover", u"http://www.arcamax.com/redandrover"),
  78.                             #(u"Rubes", u"http://www.arcamax.com/rubes"),
  79.                             #(u"Rugrats", u"http://www.arcamax.com/rugrats"),
  80.                             (u"Speed Bump", u"http://www.arcamax.com/speedbump"),
  81.                             (u"Wizard of Id", u"http://www.arcamax.com/wizardofid"),
  82.                             (u"Zits", u"http://www.arcamax.com/zits"),
  83.                              ]:
  84.             articles = self.make_links(url)
  85.             if articles:
  86.                 feeds.append((title, articles))
  87.         return feeds
  88.  
  89.     def make_links(self, url):
  90.         title = 'Temp'
  91.         current_articles = []
  92.         pages = range(1, self.num_comics_to_get+1)
  93.         for page in pages:
  94.             page_soup = self.index_to_soup(url)
  95.             if page_soup:
  96.                 title = self.tag_to_string(page_soup.find(name='div', attrs={'class':'comics-header'}).h1.contents[0])
  97.                 page_url = url
  98.                 # orig prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'prev'}, text='Previous').parent['href']
  99.                 prev_page_url = 'http://www.arcamax.com' + page_soup.find('span', text='Previous').parent.parent['href']
  100.                 date = self.tag_to_string(page_soup.find(name='b', attrs={'class':['current']}))
  101.             current_articles.append({'title': title, 'url': page_url, 'description':'', 'date': date})
  102.             url = prev_page_url
  103.         current_articles.reverse()
  104.         return current_articles
  105.  
  106.     def preprocess_html(self, soup):
  107.         for img_tag in soup.findAll('img'):
  108.             parent_tag = img_tag.parent
  109.             if parent_tag.name == 'a':
  110.                 new_tag = Tag(soup,'p')
  111.                 new_tag.insert(0,img_tag)
  112.                 parent_tag.replaceWith(new_tag)
  113.             elif parent_tag.name == 'p':
  114.                 if not self.tag_to_string(parent_tag) == '':
  115.                     new_div = Tag(soup,'div')
  116.                     new_tag = Tag(soup,'p')
  117.                     new_tag.insert(0,img_tag)
  118.                     parent_tag.replaceWith(new_div)
  119.                     new_div.insert(0,new_tag)
  120.                     new_div.insert(1,parent_tag)
  121.         return soup
  122.  
  123.     extra_css = '''
  124.                     h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
  125.                     h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
  126.                     img {max-width:100%; min-width:100%;}
  127.                     p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
  128.                     body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
  129.         '''
  130.