Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.arcamax.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Arcamax(BasicNewsRecipe):
title = 'ComicsArcamax'
__author__ = 'TDS'
__version__ = '1.05'
__date__ = '12 May 2011'
description = u'Family Friendly Comics - Customize for more days/comics: Defaults to 7 days, 25 comics - 20 general, 5 editorial.'
category = 'news, comics'
language = 'en'
use_embedded_content= False
no_stylesheets = True
remove_javascript = True
cover_url = 'http://www.arcamax.com/images/pub/amuse/leftcol/zits.jpg'
####### USER PREFERENCES - SET COMICS AND NUMBER OF COMICS TO RETRIEVE ########
num_comics_to_get = 1
# CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS
conversion_options = {'linearize_tables' : True
, 'comment' : description
, 'tags' : category
, 'language' : language
}
keep_only_tags = [dict(name='div', attrs={'class':['comics-header']}),
dict(name='b', attrs={'class':['current']}),
dict(name='article', attrs={'class':['comic']}),
]
remove_tags = [dict(name='div', attrs={'id':['comicfull' ]}),
dict(name='div', attrs={'class':['calendar' ]}),
dict(name='a', attrs={'class':['author bio']}),
dict(name='a', attrs={'href':['/']}),
dict(name='a', attrs={'href':['/comics']}),
dict(name='nav', attrs={'class':['calendar-nav' ]}),
]
def parse_index(self):
feeds = []
for title, url in [
######## COMICS - GENERAL ########
#(u"9 Chickweed Lane", u"http://www.arcamax.com/thefunnies/ninechickweedlane"),
#(u"Agnes", u"http://www.arcamax.com/thefunnies/agnes"),
#(u"Andy Capp", u"http://www.arcamax.com/thefunnies/andycapp"),
#(u"BC", u"http://www.arcamax.com/thefunnies/bc"),
(u"Baby Blues", u"http://www.arcamax.com/thefunnies/babyblues"),
#(u"Beetle Bailey", u"http://www.arcamax.com/thefunnies/beetlebailey"),
#(u"Blondie", u"http://www.arcamax.com/thefunnies/blondie"),
#u"Boondocks", u"http://www.arcamax.com/thefunnies/boondocks"),
#(u"Cathy", u"http://www.arcamax.com/thefunnies/cathy"),
#(u"Daddys Home", u"http://www.arcamax.com/thefunnies/daddyshome"),
(u"Dilbert", u"http://www.arcamax.com/thefunnies/dilbert"),
#(u"Dinette Set", u"http://www.arcamax.com/thefunnies/thedinetteset"),
#(u"Dog Eat Doug", u"http://www.arcamax.com/thefunnies/dogeatdoug"),
(u"Doonesbury", u"http://www.arcamax.com/thefunnies/doonesbury"),
#(u"Dustin", u"http://www.arcamax.com/thefunnies/dustin"),
#(u"Family Circus", u"http://www.arcamax.com/thefunnies/familycircus"),
#(u"Garfield", u"http://www.arcamax.com/thefunnies/garfield"),
#(u"Get Fuzzy", u"http://www.arcamax.com/thefunnies/getfuzzy"),
#(u"Girls and Sports", u"http://www.arcamax.com/thefunnies/girlsandsports"),
#(u"Hagar the Horrible", u"http://www.arcamax.com/thefunnies/hagarthehorrible"),
#(u"Heathcliff", u"http://www.arcamax.com/thefunnies/heathcliff"),
#(u"Jerry King Cartoons", u"http://www.arcamax.com/thefunnies/humorcartoon"),
#(u"Luann", u"http://www.arcamax.com/thefunnies/luann"),
#(u"Momma", u"http://www.arcamax.com/thefunnies/momma"),
#(u"Mother Goose and Grimm", u"http://www.arcamax.com/thefunnies/mothergooseandgrimm"),
#(u"Mutts", u"http://www.arcamax.com/thefunnies/mutts"),
#(u"Non Sequitur", u"http://www.arcamax.com/thefunnies/nonsequitur"),
(u"Pearls Before Swine", u"http://www.arcamax.com/thefunnies/pearlsbeforeswine"),
#(u"Pickles", u"http://www.arcamax.com/thefunnies/pickles"),
#(u"Red and Rover", u"http://www.arcamax.com/thefunnies/redandrover"),
#(u"Rubes", u"http://www.arcamax.com/thefunnies/rubes"),
#(u"Rugrats", u"http://www.arcamax.com/thefunnies/rugrats"),
(u"Speed Bump", u"http://www.arcamax.com/thefunnies/speedbump"),
#(u"Wizard of Id", u"http://www.arcamax.com/thefunnies/wizardofid"),
(u"Zits", u"http://www.arcamax.com/thefunnies/zits"),
]:
articles = self.make_links(url)
if articles:
feeds.append((title, articles))
return feeds
def make_links(self, url):
title = 'Temp'
current_articles = []
pages = range(1, self.num_comics_to_get+1)
for page in pages:
page_soup = self.index_to_soup(url)
if page_soup:
title = self.tag_to_string(page_soup.find(name='div', attrs={'class':'comics-header'}).h1.contents[0])
page_url = url
# orig prev_page_url = 'http://www.arcamax.com' + page_soup.find('a', attrs={'class':'prev'}, text='Previous').parent['href']
prev_page_url = 'http://www.arcamax.com' + page_soup.find('span', text='Previous').parent.parent['href']
date = self.tag_to_string(page_soup.find(name='b', attrs={'class':['current']}))
current_articles.append({'title': title, 'url': page_url, 'description':'', 'date': date})
url = prev_page_url
current_articles.reverse()
return current_articles
def preprocess_html(self, soup):
for img_tag in soup.findAll('img'):
parent_tag = img_tag.parent
if parent_tag.name == 'a':
new_tag = Tag(soup,'p')
new_tag.insert(0,img_tag)
parent_tag.replaceWith(new_tag)
elif parent_tag.name == 'p':
if not self.tag_to_string(parent_tag) == '':
new_div = Tag(soup,'div')
new_tag = Tag(soup,'p')
new_tag.insert(0,img_tag)
parent_tag.replaceWith(new_div)
new_div.insert(0,new_tag)
new_div.insert(1,parent_tag)
return soup
def postprocess_html(self, soup, first):
# process all the images. assumes that the new html has the correct path
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
width, height = img.size
print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
if img < 0:
raise RuntimeError('Out of memory')
pw = PixelWand()
if( width > height ) :
print 'Rotate image'
img.rotate(pw, -90)
img.save(iurl)
return soup
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
img {max-width:100%; min-width:100%;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''