View Single Post
Old 02-15-2010, 03:02 PM   #1
jrasmussen
Junior Member
jrasmussen can extract oil from cheesejrasmussen can extract oil from cheesejrasmussen can extract oil from cheesejrasmussen can extract oil from cheesejrasmussen can extract oil from cheesejrasmussen can extract oil from cheesejrasmussen can extract oil from cheesejrasmussen can extract oil from cheesejrasmussen can extract oil from cheese
 
jrasmussen's Avatar
 
Posts: 7
Karma: 1170
Join Date: Feb 2010
Location: Madison, WI
Device: Kobo Arc7 HD, Kobo Glo
Customizing built-in recipes

Hello,

I am trying to customize a couple of built-in recipes, namely The Los Angeles Times and Associated Press. I am the first to admit that I do not know what I am doing (which is what brought me here). Any help would be greatly appreciated!


For the LA Times, I just wanted to delete the "Local", "MostEmailed", and "OrangeCounty" feeds.
Here is what I have:

#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
latimes.com
'''
from calibre.web.feeds.news import BasicNewsRecipe

class LATimes(BasicNewsRecipe):
    """calibre news recipe for The Los Angeles Times.

    Articles come from the RSS feeds listed in ``feeds``.  For each entry,
    ``get_article_url`` tries to swap the article URL for the site's
    "single page" version of the story.
    """

    title = u'The Los Angeles Times'
    __author__ = u'Darko Miletic and Sujata Raman'
    description = u'News from Los Angeles'
    oldest_article = 7
    max_articles_per_feed = 200
    language = 'en'
    no_stylesheets = True
    use_embedded_content = False
    encoding = 'utf-8'
    lang = 'en-US'

    # Metadata handed to the conversion step; reuses the values above.
    conversion_options = {
        'comment': description,
        'language': lang,
    }

    # Stylesheet text for the downloaded articles (kept verbatim).
    extra_css = '''
h1{font-family :Georgia,"Times New Roman",Times,serif; font-size:large; }
h2{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
.story{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
.entry-body{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
.entry-more{font-family :Georgia,"Times New Roman",Times,serif; font-size: x-small;}
.credit{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
.small{color:#666666; font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
.byline{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
.date{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
.time{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; font-style:italic;}
.copyright{font-family :Georgia,"Times New Roman",Times,serif; font-size: xx-small;color:#930000; }
.subhead{font-family :Georgia,"Times New Roman",Times,serif; font-size:x-small;}
'''

    # Disabled in the original recipe; left here for reference.
    # recursions = 1
    # match_regexps = [r'http://www.latimes.com/.*page=[2-9]']

    # Tag selectors for the parts of the page to keep: <div> elements
    # whose class is "story" or "entry".
    keep_only_tags = [
        dict(name='div', attrs={'class': ["story", "entry"]}),
    ]

    # Tag selectors for page furniture to strip from what was kept.
    remove_tags = [
        dict(name='div', attrs={'class': [
            'articlerail', "sphereTools", "tools", "toppaginate",
            "entry-footer-left", "entry-footer-right",
        ]}),
        dict(name='div', attrs={'id': ["moduleArticleToolsContainer", ]}),
        dict(name='p', attrs={'class': ["entry-footer", ]}),
        dict(name='ul', attrs={'class': "article-nav clearfix"}),
        dict(name=['iframe']),
    ]

    # (title, URL) pairs, one per feed.
    # NOTE(review): two entries share the title 'Politics'; the first URL
    # sits under news/local/politics/cal -- confirm whether it is wanted.
    feeds = [
        (u'News', u'http://feeds.latimes.com/latimes/news'),
        (u'Politics', 'http://feeds.latimes.com/latimes/news/local/politics/cal/'),
        ('National', 'http://feeds.latimes.com/latimes/news/nationworld/nation'),
        ('Politics', 'http://feeds.latimes.com/latimes/news/politics/'),
        ('Business', 'http://feeds.latimes.com/latimes/business'),
        ('Sports', 'http://feeds.latimes.com/latimes/sports/'),
        ('Entertainment', 'http://feeds.latimes.com/latimes/entertainment/'),
    ]

    def get_article_url(self, article):
        """Return the URL that should be downloaded for ``article``.

        The entry's ``feedburner_origlink`` (falling back to ``link``) is
        taken with its query string removed.  That page is then fetched
        and, when it contains a "single page" link, the link's target on
        www.latimes.com is returned instead.  Any failure while looking
        for that link is logged and the stripped URL is returned.

        :param article: feed entry; only its ``get`` method is used here.
        :return: the article URL, or ``None`` if the entry has no link.
        """
        link = article.get('feedburner_origlink') or article.get('link')
        if not link:
            return None

        # rpartition() gives an empty head when there is no '?', so only
        # use the head when a separator was actually found; otherwise the
        # URL would be reduced to an empty string.
        head, sep, _tail = link.rpartition('?')
        ans = head if sep else link

        try:
            self.log('Looking for full story link in', ans)
            soup = self.index_to_soup(ans)
            marker = soup.find(text="single page")
            anchor = marker.parent if marker is not None else None
            # Tag.get() is available in both BeautifulSoup 3 and bs4,
            # unlike has_key().
            href = anchor.get('href') if anchor is not None else None
            if href:
                ans = 'http://www.latimes.com' + href
                self.log('Found full story link', ans)
        except Exception as err:
            # Best effort: keep the URL found so far, but say why the
            # lookup failed instead of swallowing the error silently.
            self.log('Could not look for full story link in', ans, err)
        return ans

For the Associated Press, I wanted to change the Washington State feed to Wisconsin and delete the "Strange News" feed. Again, here is what I have:

import re
from calibre.web.feeds.news import BasicNewsRecipe


class AssociatedPress(BasicNewsRecipe):
    """calibre news recipe for the Associated Press hosted RSS feeds.

    Downloaded pages are cleaned up with the regular-expression
    substitutions in ``preprocess_regexps``; the sections to fetch are
    listed in ``feeds``.
    """

    title = u'Associated Press'
    description = 'Global news'
    __author__ = 'Kovid Goyal'
    use_embedded_content = False
    language = 'en'

    max_articles_per_feed = 15
    html2lrf_options = ['--force-page-break-before-tag="chapter"']

    # (compiled pattern, replacement callable) pairs.  Every pattern is
    # compiled case-insensitively and with DOTALL so that ``.*?`` can span
    # line breaks.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in [
            # Empty out the document head.
            (r'<HEAD>.*?</HEAD>', lambda match: '<HEAD></HEAD>'),
            # Drop everything between the opening body tag and the entries.
            (r'<body class="apple-rss-no-unread-mode" onLoad="setup(null)">.*?<!-- start Entries -->',
             lambda match: '<body>'),
            # Drop everything after the content area.
            (r'<!-- end apple-rss-content-area -->.*?</body>',
             lambda match: '</body>'),
            # Remove scripts.
            (r'<script.*?>.*?</script>', lambda match: ''),
            # Start the body at the headline and insert the <chapter> tag
            # that html2lrf_options above uses for page breaks.
            (r'<body.*?>.*?<span class="headline">',
             lambda match: '<body><span class="headline"><chapter>'),
            # Skip the table wrapper in front of the first story paragraph.
            (r'<tr><td><div class="body">.*?<p class="ap-story-p">',
             lambda match: '<p class="ap-story-p">'),
            # Turn story paragraphs into plain <p> elements.
            (r'<p class="ap-story-p">', lambda match: '<p>'),
            # Cut the privacy-policy footer.
            (r'Learn more about our <a href="http://apdigitalnews.com/privacy.html">Privacy Policy</a>.*?</body>',
             lambda match: '</body>'),
        ]
    ]

    # (title, URL) pairs, one per feed.
    # NOTE(review): the Wisconsin URL was adapted by hand from another
    # state's feed -- confirm that WISCONSINHEADS / SITE=WIMIL is served.
    feeds = [
        ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'),
        ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'),
        ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'),
        ('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'),
        # Title spelling fixed (was "Wisconsinn").
        ('AP Wisconsin State News', 'http://hosted.ap.org/lineups/WISCONSINHEADS-rss_2.0.xml?SITE=WIMIL&SECTION=HOME'),
        ('AP Technology News', 'http://hosted.ap.org/lineups/TECHHEADS-rss_2.0.xml?SITE=CTNHR&SECTION=HOME'),
        ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'),
        ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'),
    ]

Thanks,
John
jrasmussen is offline   Reply With Quote