Wizard
Posts: 4,004
Karma: 177841
Join Date: Dec 2009
Device: WinMo: IPAQ; Android: HTC HD2, Archos 7o; Java:Gravity T
|
New recipe: GoComics.com
Here is the gocomics.com recipe I promised. It has 200+ comics, about 3/4 general and 1/4 political/editorial.
There are 3 options that need to be set, so this recipe is best used as a custom recipe. They are:
the comic strips to get,
the number of past issues/days of the strip to get, and
the comic image size.
The default has 7 days of 15 comics (10 general and 5 editorial) at a moderately large size.
I pulled the last 99 images of Calvin and Hobbes as a test. It retrieved them all. The main C&H page has a link to the first strip back in the 1980's, but I have no idea if all strips are available.
Please be conservative when setting the options so you don't overload their servers and only get comics you will actually read. Anyone setting a daily retrieve for all 200+ comics with 1000 back issues at maximum image size will be severely frowned upon by Those-Who-Frown (and their computer will lock up for the next few years.)
Enjoy!
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = 'Copyright 2010 Starson17'
'''
www.gocomics.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
import urllib, re, mechanize
class GoComics(BasicNewsRecipe):
title = 'GoComics'
__author__ = 'Starson17'
__version__ = '1.01'
__date__ = '13 March 2010'
description = '200+ Comics - Customize for more days/comics: Defaults to 7 days, 15 comics - 10 general, 5 editorial.'
language = 'en'
use_embedded_content= False
no_stylesheets = True
remove_javascript = True
cover_url = 'http://paulbuckley14059.files.wordpress.com/2008/06/calvin-and-hobbes.jpg'
####### USER PREFERENCES - COMICS, IMAGE SIZE AND NUMBER OF COMICS TO RETRIEVE ########
# num_comics_to_get - I've tried up to 99 on Calvin&Hobbes
num_comics_to_get = 7
# comic_size 300 is small, 600 is medium, 900 is large, 1500 is extra-large
comic_size = 1200
# CHOOSE COMIC STRIPS BELOW - REMOVE COMMENT '# ' FROM IN FRONT OF DESIRED STRIPS
# Please do not overload their servers by selecting all comics and 1000 strips from each!
keep_only_tags = [dict(name='div', attrs={'class':['feature','banner']}),
]
remove_tags = [dict(name='a', attrs={'class':['beginning','prev','cal','next','newest']}),
dict(name='div', attrs={'class':['tag-wrapper']}),
dict(name='ul', attrs={'class':['share-nav','feature-nav']}),
]
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
orig_open_novisit = br.open_novisit
def my_open_no_visit(url, **kwargs):
req = mechanize.Request(
url,
headers = {
'Referer':'http://www.gocomics.com/',
})
return orig_open_novisit(req)
br.open_novisit = my_open_no_visit
return br
def parse_index(self):
feeds = []
for title, url in [
######## COMICS - GENERAL ########
# (u"2 Cows and a Chicken", u"http://www.gocomics.com/2cowsandachicken"),
# (u"9 to 5", u"http://www.gocomics.com/9to5"),
# (u"The Academia Waltz", u"http://www.gocomics.com/academiawaltz"),
# (u"Adam@Home", u"http://www.gocomics.com/adamathome"),
# (u"Agnes", u"http://www.gocomics.com/agnes"),
# (u"Andy Capp", u"http://www.gocomics.com/andycapp"),
# (u"Animal Crackers", u"http://www.gocomics.com/animalcrackers"),
# (u"Annie", u"http://www.gocomics.com/annie"),
# (u"The Argyle Sweater", u"http://www.gocomics.com/theargylesweater"),
# (u"Ask Shagg", u"http://www.gocomics.com/askshagg"),
(u"B.C.", u"http://www.gocomics.com/bc"),
# (u"Back in the Day", u"http://www.gocomics.com/backintheday"),
# (u"Bad Reporter", u"http://www.gocomics.com/badreporter"),
# (u"Baldo", u"http://www.gocomics.com/baldo"),
# (u"Ballard Street", u"http://www.gocomics.com/ballardstreet"),
# (u"Barkeater Lake", u"http://www.gocomics.com/barkeaterlake"),
# (u"The Barn", u"http://www.gocomics.com/thebarn"),
# (u"Basic Instructions", u"http://www.gocomics.com/basicinstructions"),
# (u"Bewley", u"http://www.gocomics.com/bewley"),
# (u"Big Top", u"http://www.gocomics.com/bigtop"),
# (u"Biographic", u"http://www.gocomics.com/biographic"),
# (u"Birdbrains", u"http://www.gocomics.com/birdbrains"),
# (u"Bleeker: The Rechargeable Dog", u"http://www.gocomics.com/bleeker"),
# (u"Bliss", u"http://www.gocomics.com/bliss"),
(u"Bloom County", u"http://www.gocomics.com/bloomcounty"),
# (u"Bo Nanas", u"http://www.gocomics.com/bonanas"),
# (u"Bob the Squirrel", u"http://www.gocomics.com/bobthesquirrel"),
# (u"The Boiling Point", u"http://www.gocomics.com/theboilingpoint"),
# (u"Boomerangs", u"http://www.gocomics.com/boomerangs"),
# (u"The Boondocks", u"http://www.gocomics.com/boondocks"),
# (u"Bottomliners", u"http://www.gocomics.com/bottomliners"),
# (u"Bound and Gagged", u"http://www.gocomics.com/boundandgagged"),
# (u"Brainwaves", u"http://www.gocomics.com/brainwaves"),
# (u"Brenda Starr", u"http://www.gocomics.com/brendastarr"),
# (u"Brewster Rockit", u"http://www.gocomics.com/brewsterrockit"),
# (u"Broom Hilda", u"http://www.gocomics.com/broomhilda"),
(u"Calvin and Hobbes", u"http://www.gocomics.com/calvinandhobbes"),
# (u"Candorville", u"http://www.gocomics.com/candorville"),
# (u"Cathy", u"http://www.gocomics.com/cathy"),
# (u"C'est la Vie", u"http://www.gocomics.com/cestlavie"),
# (u"Chuckle Bros", u"http://www.gocomics.com/chucklebros"),
# (u"Citizen Dog", u"http://www.gocomics.com/citizendog"),
# (u"The City", u"http://www.gocomics.com/thecity"),
# (u"Cleats", u"http://www.gocomics.com/cleats"),
# (u"Close to Home", u"http://www.gocomics.com/closetohome"),
# (u"Compu-toon", u"http://www.gocomics.com/compu-toon"),
# (u"Cornered", u"http://www.gocomics.com/cornered"),
# (u"Cul de Sac", u"http://www.gocomics.com/culdesac"),
# (u"Daddy's Home", u"http://www.gocomics.com/daddyshome"),
# (u"Deep Cover", u"http://www.gocomics.com/deepcover"),
# (u"Dick Tracy", u"http://www.gocomics.com/dicktracy"),
# (u"The Dinette Set", u"http://www.gocomics.com/dinetteset"),
# (u"Dog Eat Doug", u"http://www.gocomics.com/dogeatdoug"),
# (u"Domestic Abuse", u"http://www.gocomics.com/domesticabuse"),
# (u"Doodles", u"http://www.gocomics.com/doodles"),
# (u"Doonesbury", u"http://www.gocomics.com/doonesbury"),
# (u"The Doozies", u"http://www.gocomics.com/thedoozies"),
# (u"The Duplex", u"http://www.gocomics.com/duplex"),
# (u"Eek!", u"http://www.gocomics.com/eek"),
# (u"The Elderberries", u"http://www.gocomics.com/theelderberries"),
# (u"Flight Deck", u"http://www.gocomics.com/flightdeck"),
# (u"Flo and Friends", u"http://www.gocomics.com/floandfriends"),
# (u"The Flying McCoys", u"http://www.gocomics.com/theflyingmccoys"),
(u"For Better or For Worse", u"http://www.gocomics.com/forbetterorforworse"),
# (u"For Heaven's Sake", u"http://www.gocomics.com/forheavenssake"),
# (u"Fort Knox", u"http://www.gocomics.com/fortknox"),
# (u"FoxTrot", u"http://www.gocomics.com/foxtrot"),
(u"FoxTrot Classics", u"http://www.gocomics.com/foxtrotclassics"),
# (u"Frank & Ernest", u"http://www.gocomics.com/frankandernest"),
# (u"Fred Basset", u"http://www.gocomics.com/fredbasset"),
# (u"Free Range", u"http://www.gocomics.com/freerange"),
# (u"Frog Applause", u"http://www.gocomics.com/frogapplause"),
# (u"The Fusco Brothers", u"http://www.gocomics.com/thefuscobrothers"),
(u"Garfield", u"http://www.gocomics.com/garfield"),
# (u"Garfield Minus Garfield", u"http://www.gocomics.com/garfieldminusgarfield"),
# (u"Gasoline Alley", u"http://www.gocomics.com/gasolinealley"),
# (u"Gil Thorp", u"http://www.gocomics.com/gilthorp"),
# (u"Ginger Meggs", u"http://www.gocomics.com/gingermeggs"),
# (u"Girls & Sports", u"http://www.gocomics.com/girlsandsports"),
# (u"Haiku Ewe", u"http://www.gocomics.com/haikuewe"),
# (u"Heart of the City", u"http://www.gocomics.com/heartofthecity"),
# (u"Heathcliff", u"http://www.gocomics.com/heathcliff"),
# (u"Herb and Jamaal", u"http://www.gocomics.com/herbandjamaal"),
# (u"Home and Away", u"http://www.gocomics.com/homeandaway"),
# (u"Housebroken", u"http://www.gocomics.com/housebroken"),
# (u"Hubert and Abby", u"http://www.gocomics.com/hubertandabby"),
# (u"Imagine This", u"http://www.gocomics.com/imaginethis"),
# (u"In the Bleachers", u"http://www.gocomics.com/inthebleachers"),
# (u"In the Sticks", u"http://www.gocomics.com/inthesticks"),
# (u"Ink Pen", u"http://www.gocomics.com/inkpen"),
# (u"It's All About You", u"http://www.gocomics.com/itsallaboutyou"),
# (u"Joe Vanilla", u"http://www.gocomics.com/joevanilla"),
# (u"La Cucaracha", u"http://www.gocomics.com/lacucaracha"),
# (u"Last Kiss", u"http://www.gocomics.com/lastkiss"),
# (u"Legend of Bill", u"http://www.gocomics.com/legendofbill"),
# (u"Liberty Meadows", u"http://www.gocomics.com/libertymeadows"),
# (u"Lio", u"http://www.gocomics.com/lio"),
# (u"Little Dog Lost", u"http://www.gocomics.com/littledoglost"),
# (u"Little Otto", u"http://www.gocomics.com/littleotto"),
# (u"Loose Parts", u"http://www.gocomics.com/looseparts"),
# (u"Love Is...", u"http://www.gocomics.com/loveis"),
# (u"Maintaining", u"http://www.gocomics.com/maintaining"),
# (u"The Meaning of Lila", u"http://www.gocomics.com/meaningoflila"),
# (u"Middle-Aged White Guy", u"http://www.gocomics.com/middleagedwhiteguy"),
# (u"The Middletons", u"http://www.gocomics.com/themiddletons"),
# (u"Momma", u"http://www.gocomics.com/momma"),
# (u"Mutt & Jeff", u"http://www.gocomics.com/muttandjeff"),
# (u"Mythtickle", u"http://www.gocomics.com/mythtickle"),
# (u"Nest Heads", u"http://www.gocomics.com/nestheads"),
# (u"NEUROTICA", u"http://www.gocomics.com/neurotica"),
# (u"New Adventures of Queen Victoria", u"http://www.gocomics.com/thenewadventuresofqueenvictoria"),
(u"Non Sequitur", u"http://www.gocomics.com/nonsequitur"),
# (u"The Norm", u"http://www.gocomics.com/thenorm"),
# (u"On A Claire Day", u"http://www.gocomics.com/onaclaireday"),
# (u"One Big Happy", u"http://www.gocomics.com/onebighappy"),
# (u"The Other Coast", u"http://www.gocomics.com/theothercoast"),
# (u"Out of the Gene Pool Re-Runs", u"http://www.gocomics.com/outofthegenepool"),
# (u"Overboard", u"http://www.gocomics.com/overboard"),
# (u"Pibgorn", u"http://www.gocomics.com/pibgorn"),
# (u"Pibgorn Sketches", u"http://www.gocomics.com/pibgornsketches"),
(u"Pickles", u"http://www.gocomics.com/pickles"),
# (u"Pinkerton", u"http://www.gocomics.com/pinkerton"),
# (u"Pluggers", u"http://www.gocomics.com/pluggers"),
# (u"Pooch Cafe", u"http://www.gocomics.com/poochcafe"),
# (u"PreTeena", u"http://www.gocomics.com/preteena"),
# (u"The Quigmans", u"http://www.gocomics.com/thequigmans"),
# (u"Rabbits Against Magic", u"http://www.gocomics.com/rabbitsagainstmagic"),
# (u"Real Life Adventures", u"http://www.gocomics.com/reallifeadventures"),
# (u"Red and Rover", u"http://www.gocomics.com/redandrover"),
# (u"Red Meat", u"http://www.gocomics.com/redmeat"),
# (u"Reynolds Unwrapped", u"http://www.gocomics.com/reynoldsunwrapped"),
# (u"Ronaldinho Gaucho", u"http://www.gocomics.com/ronaldinhogaucho"),
# (u"Rubes", u"http://www.gocomics.com/rubes"),
# (u"Scary Gary", u"http://www.gocomics.com/scarygary"),
(u"Shoe", u"http://www.gocomics.com/shoe"),
# (u"Shoecabbage", u"http://www.gocomics.com/shoecabbage"),
# (u"Skin Horse", u"http://www.gocomics.com/skinhorse"),
# (u"Slowpoke", u"http://www.gocomics.com/slowpoke"),
# (u"Speed Bump", u"http://www.gocomics.com/speedbump"),
# (u"State of the Union", u"http://www.gocomics.com/stateoftheunion"),
# (u"Stone Soup", u"http://www.gocomics.com/stonesoup"),
# (u"Strange Brew", u"http://www.gocomics.com/strangebrew"),
# (u"Sylvia", u"http://www.gocomics.com/sylvia"),
# (u"Tank McNamara", u"http://www.gocomics.com/tankmcnamara"),
# (u"Tiny Sepuku", u"http://www.gocomics.com/tinysepuku"),
# (u"TOBY", u"http://www.gocomics.com/toby"),
# (u"Tom the Dancing Bug", u"http://www.gocomics.com/tomthedancingbug"),
# (u"Too Much Coffee Man", u"http://www.gocomics.com/toomuchcoffeeman"),
# (u"W.T. Duck", u"http://www.gocomics.com/wtduck"),
# (u"Watch Your Head", u"http://www.gocomics.com/watchyourhead"),
# (u"Wee Pals", u"http://www.gocomics.com/weepals"),
# (u"Winnie the Pooh", u"http://www.gocomics.com/winniethepooh"),
(u"Wizard of Id", u"http://www.gocomics.com/wizardofid"),
# (u"Working It Out", u"http://www.gocomics.com/workingitout"),
# (u"Yenny", u"http://www.gocomics.com/yenny"),
# (u"Zack Hill", u"http://www.gocomics.com/zackhill"),
# (u"Ziggy", u"http://www.gocomics.com/ziggy"),
######## COMICS - EDITORIAL ########
# ("Lalo Alcaraz","http://www.gocomics.com/laloalcaraz"),
# ("Nick Anderson","http://www.gocomics.com/nickanderson"),
# ("Chuck Asay","http://www.gocomics.com/chuckasay"),
# ("Tony Auth","http://www.gocomics.com/tonyauth"),
# ("Donna Barstow","http://www.gocomics.com/donnabarstow"),
# ("Bruce Beattie","http://www.gocomics.com/brucebeattie"),
# ("Clay Bennett","http://www.gocomics.com/claybennett"),
# ("Lisa Benson","http://www.gocomics.com/lisabenson"),
# ("Steve Benson","http://www.gocomics.com/stevebenson"),
# ("Chip Bok","http://www.gocomics.com/chipbok"),
# ("Steve Breen","http://www.gocomics.com/stevebreen"),
# ("Chris Britt","http://www.gocomics.com/chrisbritt"),
# ("Stuart Carlson","http://www.gocomics.com/stuartcarlson"),
# ("Ken Catalino","http://www.gocomics.com/kencatalino"),
# ("Paul Conrad","http://www.gocomics.com/paulconrad"),
# ("Jeff Danziger","http://www.gocomics.com/jeffdanziger"),
# ("Matt Davies","http://www.gocomics.com/mattdavies"),
# ("John Deering","http://www.gocomics.com/johndeering"),
# ("Bob Gorrell","http://www.gocomics.com/bobgorrell"),
# ("Walt Handelsman","http://www.gocomics.com/walthandelsman"),
# ("Clay Jones","http://www.gocomics.com/clayjones"),
# ("Kevin Kallaugher","http://www.gocomics.com/kevinkallaugher"),
# ("Steve Kelley","http://www.gocomics.com/stevekelley"),
# ("Dick Locher","http://www.gocomics.com/dicklocher"),
# ("Chan Lowe","http://www.gocomics.com/chanlowe"),
("Mike Luckovich","http://www.gocomics.com/mikeluckovich"),
# ("Gary Markstein","http://www.gocomics.com/garymarkstein"),
# ("Glenn McCoy","http://www.gocomics.com/glennmccoy"),
# ("Jim Morin","http://www.gocomics.com/jimmorin"),
# ("Jack Ohman","http://www.gocomics.com/jackohman"),
("Pat Oliphant","http://www.gocomics.com/patoliphant"),
# ("Joel Pett","http://www.gocomics.com/joelpett"),
("Ted Rall","http://www.gocomics.com/tedrall"),
# ("Michael Ramirez","http://www.gocomics.com/michaelramirez"),
# ("Marshall Ramsey","http://www.gocomics.com/marshallramsey"),
# ("Steve Sack","http://www.gocomics.com/stevesack"),
# ("Ben Sargent","http://www.gocomics.com/bensargent"),
# ("Drew Sheneman","http://www.gocomics.com/drewsheneman"),
# ("John Sherffius","http://www.gocomics.com/johnsherffius"),
("Small World","http://www.gocomics.com/smallworld"),
# ("Scott Stantis","http://www.gocomics.com/scottstantis"),
# ("Wayne Stayskal","http://www.gocomics.com/waynestayskal"),
# ("Dana Summers","http://www.gocomics.com/danasummers"),
# ("Paul Szep","http://www.gocomics.com/paulszep"),
# ("Mike Thompson","http://www.gocomics.com/mikethompson"),
("Tom Toles","http://www.gocomics.com/tomtoles"),
# ("Gary Varvel","http://www.gocomics.com/garyvarvel"),
# ("ViewsAfrica","http://www.gocomics.com/viewsafrica"),
# ("ViewsAmerica","http://www.gocomics.com/viewsamerica"),
# ("ViewsAsia","http://www.gocomics.com/viewsasia"),
# ("ViewsBusiness","http://www.gocomics.com/viewsbusiness"),
("ViewsEurope","http://www.gocomics.com/viewseurope"),
# ("ViewsLatinAmerica","http://www.gocomics.com/viewslatinamerica"),
# ("ViewsMidEast","http://www.gocomics.com/viewsmideast"),
# ("Views of the World","http://www.gocomics.com/viewsoftheworld"),
# ("Kerry Waghorn","http://www.gocomics.com/facesinthenews"),
# ("Dan Wasserman","http://www.gocomics.com/danwasserman"),
# ("Signe Wilkinson","http://www.gocomics.com/signewilkinson"),
# ("Wit of the World","http://www.gocomics.com/witoftheworld"),
# ("Don Wright","http://www.gocomics.com/donwright"),
]:
articles = self.make_links(url)
if articles:
feeds.append((title, articles))
return feeds
def make_links(self, url):
title = 'Temp'
description = ''
date = ''
current_articles = []
pages = range(1, self.num_comics_to_get+1)
for page in pages:
page_soup = self.index_to_soup(url)
if page_soup:
strip_title = page_soup.h1.a.string
date_title = page_soup.find('ul', attrs={'class': 'feature-nav'}).li.string
title = strip_title + ' - ' + date_title
strip_url_date = page_soup.h1.a['href']
prev_strip_url_date = page_soup.find('a', attrs={'class': 'prev'})['href']
page_url = 'http://www.gocomics.com' + strip_url_date
prev_page_url = 'http://www.gocomics.com' + prev_strip_url_date
current_articles.append({'title': title, 'url': page_url, 'description':'', 'date':''})
url = prev_page_url
return current_articles
def preprocess_html(self, soup):
if soup.title:
title_string = soup.title.string.strip()
_cd = title_string.split(',',1)[1]
comic_date = ' '.join(_cd.split(' ', 4)[0:-1])
if soup.h1.span:
artist = soup.h1.span.string
soup.h1.span.string.replaceWith(comic_date + artist)
feature_item = soup.find('p',attrs={'class':'feature_item'})
if feature_item.a:
a_tag = feature_item.a
a_href = a_tag["href"]
img_tag = a_tag.img
img_tag["src"] = a_href
img_tag["width"] = self.comic_size
img_tag["height"] = None
return soup
extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''
|