04-18-2011, 10:30 AM   #5
Starson17
 
Kovid:

The site has changed significantly. Here's a completely rewritten Arcamax recipe:


Code:
#!/usr/bin/env python

__license__   = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.arcamax.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class Arcamax(BasicNewsRecipe):
    title               = 'Arcamax'
    __author__          = 'Starson17'
    __version__         = '1.04'
    __date__            = '18 April 2011'
    description         = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 11 comics.'
    category            = 'news, comics'
    language            = 'en'
    use_embedded_content= False
    no_stylesheets      = True
    remove_javascript   = True
    cover_url           = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'

    ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
    num_comics_to_get = 7
    # CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS

    conversion_options = {'linearize_tables'  : True
                        , 'comment'           : description
                        , 'tags'              : category
                        , 'language'          : language
                        }

    keep_only_tags     = [dict(name='div', attrs={'class':['comics-header']}),
                          dict(name='b', attrs={'class':['current']}),
                          dict(name='article', attrs={'class':['comic']}),
                          ]

    remove_tags = [dict(name='div', attrs={'id':['comicfull']}),
                   dict(name='div', attrs={'class':['calendar']}),
                   dict(name='nav', attrs={'class':['calendar-nav']}),
                   ]
   
    def parse_index(self):
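        # One feed per strip enabled in the list below; make_links() collects
        # num_comics_to_get consecutive daily pages for each strip.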
        feeds = []
        for title, url in [
                            ######## COMICS - GENERAL ########
                            #(u"9 Chickweed Lane", u"http://www.arcamax.com/ninechickweedlane"),
                            #(u"Agnes", u"http://www.arcamax.com/agnes"),
                            #(u"Andy Capp", u"http://www.arcamax.com/andycapp"),
                            (u"BC", u"http://www.arcamax.com/bc"),
                            #(u"Baby Blues", u"http://www.arcamax.com/babyblues"),
                            #(u"Beetle Bailey", u"http://www.arcamax.com/beetlebailey"),
                            (u"Blondie", u"http://www.arcamax.com/blondie"),
                            #u"Boondocks", u"http://www.arcamax.com/boondocks"),
                            #(u"Cathy", u"http://www.arcamax.com/cathy"),
                            #(u"Daddys Home", u"http://www.arcamax.com/daddyshome"),
                            (u"Dilbert", u"http://www.arcamax.com/dilbert"),
                            #(u"Dinette Set", u"http://www.arcamax.com/thedinetteset"),
                            (u"Dog Eat Doug", u"http://www.arcamax.com/dogeatdoug"),
                            (u"Doonesbury", u"http://www.arcamax.com/doonesbury"),
                            #(u"Dustin", u"http://www.arcamax.com/dustin"),
                            (u"Family Circus", u"http://www.arcamax.com/familycircus"),
                            (u"Garfield", u"http://www.arcamax.com/garfield"),
                            #(u"Get Fuzzy", u"http://www.arcamax.com/getfuzzy"),
                            #(u"Girls and Sports", u"http://www.arcamax.com/girlsandsports"),
                            #(u"Hagar the Horrible", u"http://www.arcamax.com/hagarthehorrible"),
                            #(u"Heathcliff", u"http://www.arcamax.com/heathcliff"),
                            #(u"Jerry King Cartoons", u"http://www.arcamax.com/humorcartoon"),
                            #(u"Luann", u"http://www.arcamax.com/luann"),
                            #(u"Momma", u"http://www.arcamax.com/momma"),
                            #(u"Mother Goose and Grimm", u"http://www.arcamax.com/mothergooseandgrimm"),
                            (u"Mutts", u"http://www.arcamax.com/mutts"),
                            #(u"Non Sequitur", u"http://www.arcamax.com/nonsequitur"),
                            #(u"Pearls Before Swine", u"http://www.arcamax.com/pearlsbeforeswine"),
                            #(u"Pickles", u"http://www.arcamax.com/pickles"),
                            #(u"Red and Rover", u"http://www.arcamax.com/redandrover"),
                            #(u"Rubes", u"http://www.arcamax.com/rubes"),
                            #(u"Rugrats", u"http://www.arcamax.com/rugrats"),
                            (u"Speed Bump", u"http://www.arcamax.com/speedbump"),
                            (u"Wizard of Id", u"http://www.arcamax.com/wizardofid"),
                            (u"Zits", u"http://www.arcamax.com/zits"),
                             ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        current_articles = []
        for page in range(1, self.num_comics_to_get + 1):
            page_soup = self.index_to_soup(url)
            if not page_soup:
                break
            title = page_soup.find(name='div', attrs={'class':'comics-header'}).h1.contents[0]
            self.log('title is:', title)
            page_url = url
            self.log('url is:', url)
            # site change: the old <a class="prev"> anchor is gone, so walk
            # back one day at a time via the parent of the 'Previous' span
            prev_page_url = 'http://www.arcamax.com' + page_soup.find('span', text='Previous').parent.parent['href']
            self.log('prev_page_url is:', prev_page_url)
            date = self.tag_to_string(page_soup.find(name='b', attrs={'class':['current']}))
            self.log('date is:', date)
            current_articles.append({'title': title, 'url': page_url, 'description': '', 'date': date})
            url = prev_page_url
        # pages were collected newest-first; reverse into chronological order
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
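        # Unwrap each comic image from its enclosing <a>, and move an image
        # out of any paragraph that also carries text, so each strip renders
        # on its own line.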
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent
            if parent_tag.name == 'a':
                new_tag = Tag(soup,'p')
                new_tag.insert(0,img_tag)
                parent_tag.replaceWith(new_tag)
            elif parent_tag.name == 'p':
                if not self.tag_to_string(parent_tag) == '':
                    new_div = Tag(soup,'div')
                    new_tag = Tag(soup,'p')
                    new_tag.insert(0,img_tag)
                    parent_tag.replaceWith(new_div)
                    new_div.insert(0,new_tag)
                    new_div.insert(1,parent_tag)
        return soup

    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    img {max-width:100%; min-width:100%;}
                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                    '''
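
If you want to try it from the command line, the recipe can be test-built with ebook-convert (assuming it's saved as arcamax.recipe; any filename works):

Code:
ebook-convert arcamax.recipe arcamax.epub --test -vv
The --test switch downloads only a couple of feeds/articles so debug runs stay short, and -vv prints the recipe's log output to the console.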