Junior Member
Posts: 7
Karma: 1170
Join Date: Feb 2010
Location: Madison, WI
Device: Kobo Arc7 HD, Kobo Glo
|
Customizing built-in recipes
Hello,
I am trying to customize a couple of built-in recipes, namely The Los Angeles Times and Associated Press. I am the first to admit that I do not know what I am doing (which is what brought me here). Any help would be greatly appreciated!
For the LA Times, I just wanted to delete the "Local", "MostEmailed", and "OrangeCounty" feeds.
Here is what I have:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
latimes.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
class LATimes(BasicNewsRecipe):
    """calibre news recipe for The Los Angeles Times, built from its RSS feeds."""

    # --- Metadata and download limits -----------------------------------
    title = u'The Los Angeles Times'
    __author__ = u'Darko Miletic and Sujata Raman'
    description = u'News from Los Angeles'
    oldest_article = 7
    max_articles_per_feed = 200
    language = 'en'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    lang = 'en-US'

    conversion_options = {
        'comment': description,
        'language': lang,
    }

    # The CSS lines are deliberately flush-left so the content of the string
    # literal stays exactly as it was; they are inside the triple-quoted
    # string, so Python's indentation rules do not apply to them.
    extra_css = '''
h1{font-family :Georgia,"Times New Roman",Times,serif; font-size:large; }
h2{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
.story{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
.entry-body{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
.entry-more{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
.credit{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
.small{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
.byline{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
.date{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
.time{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
.copyright{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; }
.subhead{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
'''

    # Multi-page following is disabled; get_article_url() looks for a
    # "single page" link instead.
    # recursions = 1
    # match_regexps = [r'http://www.latimes.com/.*page=[2-9]']

    # --- Page clean-up ---------------------------------------------------
    keep_only_tags = [
        dict(name='div', attrs={'class': ["story", "entry"]}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['articlerail', "sphereTools", "tools",
                                          "toppaginate", "entry-footer-left",
                                          "entry-footer-right"]}),
        dict(name='div', attrs={'id': ["moduleArticleToolsContainer", ]}),
        dict(name='p', attrs={'class': ["entry-footer", ]}),
        dict(name='ul', attrs={'class': "article-nav clearfix"}),
        dict(name=['iframe']),
    ]

    # --- Feeds: (title, URL) pairs ----------------------------------------
    feeds = [
        (u'News', u'http://feeds.latimes.com/latimes/news'),
        # NOTE(review): this title duplicates the national 'Politics' entry
        # below; the URL path (news/local/politics/cal/) suggests this is a
        # California politics feed. Consider giving it a distinct title.
        (u'Politics', 'http://feeds.latimes.com/latimes/news/local/politics/cal/'),
        ('National', 'http://feeds.latimes.com/latimes/news/nationworld/nation'),
        ('Politics', 'http://feeds.latimes.com/latimes/news/politics/'),
        ('Business', 'http://feeds.latimes.com/latimes/business'),
        ('Sports', 'http://feeds.latimes.com/latimes/sports/'),
        ('Entertainment', 'http://feeds.latimes.com/latimes/entertainment/'),
    ]

    def get_article_url(self, article):
        """Return the URL to download for ``article``.

        The feed entry's ``feedburner_origlink`` is used with its query
        string removed. The page is then fetched and, if it contains a link
        whose text is "single page", that link (made absolute against
        www.latimes.com) is returned instead so the whole story is
        downloaded in one piece.

        :param article: parsed feed entry; only its ``get`` method is used.
        :return: the URL to fetch, or ``None`` when the entry carries no
            usable link.
        """
        # Fall back to the plain 'link' field when the FeedBurner field is
        # missing instead of failing with AttributeError on None.
        # NOTE(review): assumes entries expose a standard 'link' key -- confirm.
        url = article.get('feedburner_origlink') or article.get('link')
        if not url:
            return None

        # Strip the query string. rpartition() returns '' as its first item
        # when there is no '?', which would throw the whole URL away, so
        # only strip when a '?' is actually present.
        if '?' in url:
            url = url.rpartition('?')[0]

        self.log('Looking for full story link in', url)
        try:
            soup = self.index_to_soup(url)
            marker = soup.find(text="single page")
            link = marker.parent if marker is not None else None
            href = link.get('href') if link is not None else None
            if href is not None:
                url = 'http://www.latimes.com' + href
                self.log('Found full story link', url)
        except Exception as err:
            # Best effort only: any failure while probing for the single-page
            # link keeps the feed URL, but is reported rather than hidden.
            self.log('Could not look up full story link for', url, err)
        return url
For the Associated Press, I wanted to change the Washington State feed to Wisconsin and delete the "Strange News" feed. Again, here is what I have:
import re
from calibre.web.feeds.news import BasicNewsRecipe
class AssociatedPress(BasicNewsRecipe):
    """calibre news recipe that collects Associated Press headline feeds."""

    # --- Metadata and download limits -----------------------------------
    title = u'Associated Press'
    description = 'Global news'
    __author__ = 'Kovid Goyal'
    use_embedded_content = False
    language = 'en'
    max_articles_per_feed = 15
    html2lrf_options = ['--force-page-break-before-tag="chapter"']

    # --- HTML clean-up ---------------------------------------------------
    # Each (pattern, replacement) pair is compiled case-insensitively with
    # DOTALL so that '.*?' can span line breaks. The replacements are
    # callables that ignore the match and return fixed markup.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in [
            # Empty the document head.
            (r'<HEAD>.*?</HEAD>', lambda match: '<HEAD></HEAD>'),
            # Drop everything between the opening body tag and the entries marker.
            (r'<body class="apple-rss-no-unread-mode" onLoad="setup(null)">.*?<!-- start Entries -->', lambda match: '<body>'),
            # Drop everything after the content-area end marker.
            (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match: '</body>'),
            # Remove all script elements.
            (r'<script.*?>.*?</script>', lambda match: ''),
            # Start the body at the headline and insert the <chapter> tag that
            # html2lrf_options uses for page breaks.
            (r'<body.*?>.*?<span class="headline">', lambda match: '<body><span class="headline"><chapter>'),
            # Skip the table wrapper that precedes the first story paragraph.
            (r'<tr><td><div class="body">.*?<p class="ap-story-p">', lambda match: '<p class="ap-story-p">'),
            # Strip the story-paragraph class.
            (r'<p class="ap-story-p">', lambda match: '<p>'),
            # Cut the privacy-policy footer.
            (r'Learn more about our <a href="http://apdigitalnews.com/privacy.html">Privacy Policy</a>.*?</body>', lambda match: '</body>'),
        ]
    ]

    # --- Feeds: (title, URL) pairs ----------------------------------------
    feeds = [
        ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'),
        ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'),
        ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'),
        ('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'),
        # NOTE(review): the lineup name and SITE code in this URL are
        # unverified -- open the URL in a browser and confirm it returns an
        # RSS document before relying on it.
        ('AP Wisconsin State News', 'http://hosted.ap.org/lineups/WISCONSINHEADS-rss_2.0.xml?SITE=WIMIL&SECTION=HOME'),
        ('AP Technology News', 'http://hosted.ap.org/lineups/TECHHEADS-rss_2.0.xml?SITE=CTNHR&SECTION=HOME'),
        ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'),
        ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'),
    ]
Thanks,
John
|