MobileRead Forums - View Single Post

AustinTim · 07-26-2011, 05:46 PM

Starson,
I tried combining the code you referenced with the recipe for the Arcamax comics. Not only did it not work, it somehow stopped the script from downloading the images at all...

Any ideas what might be wrong here?

Thanks,
-tim

Spoiler:

Code:

#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.arcamax.com
'''
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.magick import Image, PixelWand
from calibre.web.feeds.news import BasicNewsRecipe

class Arcamax(BasicNewsRecipe):
    """calibre news recipe that collects comic strips from www.arcamax.com.

    Each enabled strip becomes one feed.  For every strip the most recent
    ``num_comics_to_get`` pages are fetched by following the page's
    "Previous" link, and landscape images are rotated so that they fill a
    portrait screen.
    """

    title               = 'ComicsArcamax'
    __author__          = 'TDS'
    __version__         = '1.05'
    __date__            = '12 May 2011'
    # NOTE(review): the text below mentions 7 days / 25 comics, but this copy
    # is configured for 1 day (num_comics_to_get) and six enabled strips.
    description         = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
    category            = 'news, comics'
    language            = 'en'
    use_embedded_content = False
    no_stylesheets      = True
    remove_javascript   = True
    cover_url           = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'

    ####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
    # Number of pages (one per day, newest first) to fetch for every strip.
    num_comics_to_get = 1
    # CHOOSE COMIC STRIPS IN parse_index() BELOW - REMOVE COMMENT '#' FROM IN FRONT OF DESIRED STRIPS

    conversion_options = {'linearize_tables'  : True
                        , 'comment'           : description
                        , 'tags'              : category
                        , 'language'          : language
                        }

    # Only the strip header, the current date marker and the comic itself
    # are kept from each downloaded page.
    keep_only_tags     = [dict(name='div', attrs={'class':['comics-header']}),
                          dict(name='b', attrs={'class':['current']}),
                          dict(name='article', attrs={'class':['comic']}),
                          ]

    # Elements removed from what is left after keep_only_tags.
    remove_tags = [dict(name='div', attrs={'id':['comicfull' ]}),
                   dict(name='div', attrs={'class':['calendar' ]}),
                   dict(name='a', attrs={'class':['author bio']}),
                   dict(name='a', attrs={'href':['/']}),
                   dict(name='a', attrs={'href':['/comics']}),
                   dict(name='nav', attrs={'class':['calendar-nav' ]}),
                   ]

    def parse_index(self):
        """Build the feed list: one feed per enabled comic strip.

        :return: list of ``(strip title, articles)`` tuples, where
                 ``articles`` is the list produced by :meth:`make_links`.
                 Strips for which no article could be collected are omitted.
        """
        feeds = []
        for title, url in [
                            ######## COMICS - GENERAL ########
                            #(u"9 Chickweed Lane", u"http://www.arcamax.com/thefunnies/ninechickweedlane"),
                            #(u"Agnes", u"http://www.arcamax.com/thefunnies/agnes"),
                            #(u"Andy Capp", u"http://www.arcamax.com/thefunnies/andycapp"),
                            #(u"BC", u"http://www.arcamax.com/thefunnies/bc"),
                            (u"Baby Blues", u"http://www.arcamax.com/thefunnies/babyblues"),
                            #(u"Beetle Bailey", u"http://www.arcamax.com/thefunnies/beetlebailey"),
                            #(u"Blondie", u"http://www.arcamax.com/thefunnies/blondie"),
                            #(u"Boondocks", u"http://www.arcamax.com/thefunnies/boondocks"),
                            #(u"Cathy", u"http://www.arcamax.com/thefunnies/cathy"),
                            #(u"Daddys Home", u"http://www.arcamax.com/thefunnies/daddyshome"),
                            (u"Dilbert", u"http://www.arcamax.com/thefunnies/dilbert"),
                            #(u"Dinette Set", u"http://www.arcamax.com/thefunnies/thedinetteset"),
                            #(u"Dog Eat Doug", u"http://www.arcamax.com/thefunnies/dogeatdoug"),
                            (u"Doonesbury", u"http://www.arcamax.com/thefunnies/doonesbury"),
                            #(u"Dustin", u"http://www.arcamax.com/thefunnies/dustin"),
                            #(u"Family Circus", u"http://www.arcamax.com/thefunnies/familycircus"),
                            #(u"Garfield", u"http://www.arcamax.com/thefunnies/garfield"),
                            #(u"Get Fuzzy", u"http://www.arcamax.com/thefunnies/getfuzzy"),
                            #(u"Girls and Sports", u"http://www.arcamax.com/thefunnies/girlsandsports"),
                            #(u"Hagar the Horrible", u"http://www.arcamax.com/thefunnies/hagarthehorrible"),
                            #(u"Heathcliff", u"http://www.arcamax.com/thefunnies/heathcliff"),
                            #(u"Jerry King Cartoons", u"http://www.arcamax.com/thefunnies/humorcartoon"),
                            #(u"Luann", u"http://www.arcamax.com/thefunnies/luann"),
                            #(u"Momma", u"http://www.arcamax.com/thefunnies/momma"),
                            #(u"Mother Goose and Grimm", u"http://www.arcamax.com/thefunnies/mothergooseandgrimm"),
                            #(u"Mutts", u"http://www.arcamax.com/thefunnies/mutts"),
                            #(u"Non Sequitur", u"http://www.arcamax.com/thefunnies/nonsequitur"),
                            (u"Pearls Before Swine", u"http://www.arcamax.com/thefunnies/pearlsbeforeswine"),
                            #(u"Pickles", u"http://www.arcamax.com/thefunnies/pickles"),
                            #(u"Red and Rover", u"http://www.arcamax.com/thefunnies/redandrover"),
                            #(u"Rubes", u"http://www.arcamax.com/thefunnies/rubes"),
                            #(u"Rugrats", u"http://www.arcamax.com/thefunnies/rugrats"),
                            (u"Speed Bump", u"http://www.arcamax.com/thefunnies/speedbump"),
                            #(u"Wizard of Id", u"http://www.arcamax.com/thefunnies/wizardofid"),
                            (u"Zits", u"http://www.arcamax.com/thefunnies/zits"),
                             ]:
            articles = self.make_links(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def make_links(self, url):
        """Collect up to ``num_comics_to_get`` pages of a single strip.

        Starting at ``url``, each page is fetched, recorded as an article,
        and its "Previous" link is followed to reach the next older page.
        Collection stops early when a page cannot be fetched or has no
        usable "Previous" link, so whatever was gathered so far is kept.

        :param url: URL of the strip's most recent page.
        :return: list of article dicts (keys ``title``, ``url``,
                 ``description``, ``date``), oldest page first.
        """
        title = 'Temp'  # fallback used until a page supplies a real heading
        current_articles = []
        for _ in range(self.num_comics_to_get):
            page_soup = self.index_to_soup(url)
            if not page_soup:
                # Without a page there is nothing to record and no
                # "Previous" link to follow.
                break

            header = page_soup.find(name='div', attrs={'class': 'comics-header'})
            if header is not None and header.h1 is not None and header.h1.contents:
                title = self.tag_to_string(header.h1.contents[0])
            date = self.tag_to_string(page_soup.find(name='b', attrs={'class': ['current']}))
            current_articles.append({'title': title, 'url': url, 'description': '', 'date': date})

            # orig prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'prev'}, text='Previous').parent['href']
            # Walk up from the "Previous" label to the anchor that encloses it.
            prev_label = page_soup.find('span', text='Previous')
            prev_link = prev_label.findParent('a') if prev_label is not None else None
            if prev_link is None or not prev_link.get('href'):
                break
            url = 'http://www.arcamax.com' + prev_link['href']

        # Pages were visited newest first; present them oldest first.
        current_articles.reverse()
        return current_articles

    def preprocess_html(self, soup):
        """Give every comic image a paragraph of its own.

        * An image wrapped in a link: the link is replaced by a ``<p>``
          holding only the image.
        * An image inside a ``<p>`` whose text (as returned by
          ``tag_to_string``) is not empty: the image is moved into a new
          ``<p>``, and that paragraph plus the original one are placed, in
          this order, in a new ``<div>`` that takes the original's place.

        :param soup: parsed page, modified in place.
        :return: the same ``soup`` object.
        """
        for img_tag in soup.findAll('img'):
            parent_tag = img_tag.parent
            if parent_tag.name == 'a':
                new_tag = Tag(soup, 'p')
                new_tag.insert(0, img_tag)
                parent_tag.replaceWith(new_tag)
            elif parent_tag.name == 'p':
                if self.tag_to_string(parent_tag) != '':
                    new_div = Tag(soup, 'div')
                    new_tag = Tag(soup, 'p')
                    new_tag.insert(0, img_tag)
                    parent_tag.replaceWith(new_div)
                    new_div.insert(0, new_tag)
                    new_div.insert(1, parent_tag)
        return soup

    def postprocess_html(self, soup, first):
        """Rotate images that are wider than tall by -90 degrees.

        Every ``<img>`` that has a ``src`` is opened from that path
        (assumes the new html has the correct path), and rotated images are
        saved back to the same path.  Rotation is best-effort: an image that
        cannot be processed is logged and left unchanged instead of failing
        the whole article.

        :param soup: parsed page.
        :param first: not used by this method.
        :return: the same ``soup`` object.
        """
        for tag in soup.findAll('img', src=True):
            iurl = tag['src']
            try:
                img = Image()
                img.open(iurl)
                width, height = img.size
                self.log('img is: ', iurl, 'width is: ', width, 'height is: ', height)
                if width > height:
                    self.log('Rotate image')
                    # The PixelWand supplies the background colour rotate() requires.
                    img.rotate(PixelWand(), -90)
                    img.save(iurl)
            except Exception:
                self.log.exception('Could not rotate image:', iurl)
        return soup

    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    img {max-width:100%; min-width:100%;}
                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
		'''