Richmond Times-Dispatch update

_reader · 10-17-2012, 02:39 PM

Updated with ignore_duplicate_articles = { 'title', 'url' }, which has long been needed to remove duplicate articles that appear in more than one feed.

Spoiler:

Code:

import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class RichmondTimesDispatch(BasicNewsRecipe):
    """Calibre news recipe for the Richmond Times-Dispatch (timesdispatch.com).

    Articles are listed from the RSS feeds in ``feeds``. When an article URL
    ends in a numeric article id, ``print_version`` maps it to the site's
    print page; otherwise the original URL is used.
    """

    title = u'Richmond Times-Dispatch'
    # Built from adjacent string literals (not backslash continuations inside
    # one literal) so that source indentation does not end up in the text.
    description = (
        'The Richmond Times-Dispatch is the primary daily newspaper in '
        'Richmond, the capital of Virginia, United States, as well as the '
        'Virginia cities of Petersburg, Chester, Hopewell, Colonial Heights, '
        'Charlottesville, Lynchburg, Waynesboro, and is also a default paper '
        'for rural regions of the state. The RTD has published in some form '
        'for more than 150 years.'
    )
    __author__ = '_reader'
    __date__ = '17 October 2012'
    __version__ = '1.6'

    # The same logo image serves as both cover and masthead.
    cover_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png'
    masthead_url = cover_url

    language = 'en'
    oldest_article = 1.5  # days
    max_articles_per_feed = 100
    # The same story can show up in several of the feeds listed below.
    ignore_duplicate_articles = {'title', 'url'}
    needs_subscription = False
    publisher = 'timesdispatch.com'
    category = 'news, commentary'
    tags = 'news'
    publication_type = 'newspaper'
    no_stylesheets = True
    use_embedded_content = False
    encoding = None
    simultaneous_downloads = 20
    recursions = 0
    remove_javascript = True
    remove_empty_feeds = True
    auto_cleanup = False

    conversion_options = {
        'comments': description,
        'tags': tags,
        'language': language,
        'publisher': publisher,
        'authors': publisher,
        'smarten_punctuation': True,
    }

    # NOTE(review): 'hnews hentry item' appears as part of a *class* attribute
    # in the markup matched by preprocess_regexps below, not as an id --
    # confirm whether this selector ever matches.
    remove_tags_before = dict(id='hnews hentry item')

    # NOTE(review): preprocess_regexps below also strips every <hr> tag; if
    # those regexps are applied first, this selector can never match -- confirm.
    remove_tags_after = dict(name='hr')

    remove_tags = [
        dict(name='div', attrs={'id': ['mg_hd', 'mg_ft', 'sr_b', 'comments_left', 'comments_right']}),
        dict(name='div', attrs={'class': ['bottom_social', 'article_bottom']}),
        dict(name='table', attrs={'class': ['ap-mediabox-table', 'ap-htmltable-table', 'ap-photogallery-table', 'ap-htmlfragment-table']}),
    ]

    # Each entry is (pattern, replacement function); every match is replaced
    # with the empty string.
    preprocess_regexps = [
        # Opening of the story table, up to and including the story cell tag.
        (re.compile(r'<table class="ap-story-table hnews hentry item".*?<td class="ap-story-td">', re.DOTALL|re.IGNORECASE), lambda match: ''),
        # Paragraphs that begin with a bare www2.timesdispatch URL.
        (re.compile(r'<p>\s*http://www2.timesdispatch.*?</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        # Paragraphs that begin with an image served from static2.dukecms.
        (re.compile(r'<p>\s*<img src="http://static2.dukecms.*?</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        # Paragraphs that begin with a link to www2.timesdispatch.
        (re.compile(r'<p>\s*<a href="http://www2.timesdispatch.*?</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        # <hr /> rules.
        (re.compile(r'<hr.*?>', re.DOTALL|re.IGNORECASE), lambda match: ''),
        # The rel="item-license..." link ending in "Use</a>", plus the single
        # character that follows it (the trailing "." is an unescaped wildcard).
        (re.compile(r'<a\s*rel="item-license.*?Use</a>.', re.DOTALL|re.IGNORECASE), lambda match: ''),
        # <small> blocks that begin with "Richmond Times-Dispatch".
        (re.compile(r'<small>\s*Richmond Times-Dispatch.*?</small>', re.DOTALL|re.IGNORECASE), lambda match: ''),
    ]

    feeds = [
        ('News', 'http://www2.timesdispatch.com/list/feed/rss/news-archive'),
        ('Breaking News', 'http://www2.timesdispatch.com/list/feed/rss/breaking-news'),
        ('National News', 'http://www2.timesdispatch.com/list/feed/rss/national-news'),
        ('Local News', 'http://www2.timesdispatch.com/list/feed/rss/local-news'),
        ('Business', 'http://www2.timesdispatch.com/list/feed/rss/business'),
        ('Local Business', 'http://www2.timesdispatch.com/list/feed/rss/local-business'),
        ('Politics', 'http://www2.timesdispatch.com/list/feed/rss/politics'),
        ('Virginia Politics', 'http://www2.timesdispatch.com/list/feed/rss/virginia-politics'),
        ('History', 'http://www2.timesdispatch.com/feed/rss/special_section/news/history'),
        ('Sports', 'http://www2.timesdispatch.com/list/feed/rss/sports2'),
        ('Health', 'http://www2.timesdispatch.com/feed/rss/lifestyles/health_med_fit/'),
        ('Entertainment/Life', 'http://www2.timesdispatch.com/list/feed/rss/entertainment'),
        ('Arts/Theatre', 'http://www2.timesdispatch.com/feed/rss/entertainment/arts_theatre/'),
        ('Movies', 'http://www2.timesdispatch.com/list/feed/rss/movies'),
        ('Music', 'http://www2.timesdispatch.com/list/feed/rss/music'),
        ('Dining & Food', 'http://www2.timesdispatch.com/list/feed/rss/dining'),
        ('Home & Garden', 'http://www2.timesdispatch.com/list/feed/rss/home-and-garden/'),
        ('Travel', 'http://www2.timesdispatch.com/feed/rss/travel/'),
        ('Opinion', 'http://www2.timesdispatch.com/feed/rss/news/opinion/'),
        ('Editorials', 'http://www2.timesdispatch.com/list/feed/rss/editorial-desk'),
        ('Columnists and Blogs', 'http://www2.timesdispatch.com/list/feed/rss/news-columnists-blogs'),
        ('Opinion Columnists', 'http://www2.timesdispatch.com/list/feed/rss/opinion-editorial-columnists'),
        ('Letters to the Editor', 'http://www2.timesdispatch.com/list/feed/rss/opinion-letters'),
        ('Traffic', 'http://www2.timesdispatch.com/list/feed/rss/traffic'),
        ('Drives', 'http://www2.timesdispatch.com/feed/rss/classifieds/transportation/'),
    ]

    def print_version(self, url):
        """Return the print-page URL for an article, or ``url`` itself.

        An article URL is expected to end in ``-<id>/`` where ``<id>`` is 4 to
        10 digits. When it does, the id is appended to the site's print
        endpoint. Any other URL (for example AP articles, which have no print
        URL) is returned unchanged.

        :param url: article URL taken from a feed entry
        :return: URL to download the article from
        """
        found = re.search(r'-([0-9]{4,10})/$', url)
        if found is None:
            # No trailing article id (e.g. an AP article): no print URL exists.
            return url
        return ('http://www2.timesdispatch.com/member-center/share-this/print/'
                '?content=ar' + found.group(1))

Similar Threads
Thread	Thread Starter	Forum	Replies	Last Post
Revised Recipe - Richmond Times-Dispatch	_reader	Recipes	0	07-05-2012 03:02 PM
Revised Recipe for Richmond Times-Dispatch newspaper (Virginia, USA)	_reader	Recipes	0	04-27-2012 11:40 AM
Richmond (VA) Times-Dispatch	_reader	Recipes	0	04-26-2012 03:35 PM
Los Angeles Times (eng) needs an update.	JayKindle	Recipes	8	09-17-2011 03:10 AM
Update Financial Times recipe	sir-archimedes	Recipes	0	04-24-2011 10:39 AM