View Single Post
Old 08-23-2011, 11:49 AM   #1
rogerx
Enthusiast
rogerx doesn't litter
 
Posts: 29
Karma: 244
Join Date: Aug 2011
Location: North Pole, Alaska
Device: Kindle DXG
Thumbs down New Fairbanks Daily News-miner News Recipe -- Need Date inclusion only

Any suggestions pertaining to line #73? How do I include only the date (the span with class "story_item_date updated") and omit the rest of its enclosing div (class "signature_line")?


I also need to figure out how to convert the titles of the news articles to a bold font style.

Once I polish this off, I figure I can then submit.

Code:
from calibre.web.feeds.news import BasicNewsRecipe

import re

class FairbanksDailyNewsminer(BasicNewsRecipe):
    """Calibre news recipe for the Fairbanks Daily News-miner (newsminer.com).

    Downloads articles from the site's RSS feeds, keeps only the headline,
    publication date and story body, and strips sharing widgets, related
    content and comments.
    """

    title = u'Fairbanks Daily News-miner'
    __author__ = 'Roger'
    oldest_article = 7
    max_articles_per_feed = 100

    description = 'The voice of interior Alaska since 1903'
    publisher = 'http://www.newsminer.com/'
    category = 'news, Alaska, Fairbanks'
    language = 'en'
    encoding = 'utf8'

    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://d2uh5w9wm14i0w.cloudfront.net/sites/635/assets/top_masthead_-_menu_pic.jpg'

    # Render article titles in bold. The selector targets the headline div
    # that keep_only_tags retains below.
    # NOTE(review): confirm the rendered output on a device/viewer.
    extra_css = '''
                   .story_item_headline {font-weight: bold}
                '''

    # The article date lives inside the "signature_line" div together with
    # view counts and sharing links, e.g.:
    #   <div class="signature_line"><span title="2011-08-22T10:35:58Z"
    #   class="story_item_date updated">Aug 22, 2011</span>&nbsp;|&nbsp;1463
    #   &nbsp;views ... <span class="signature_email_message"></span></div>
    # Replace the whole div with just its leading date span so the date
    # survives while the rest of the line is discarded. Pages where the div
    # does not start with the date span are left untouched and the div is
    # then dropped by remove_tags as before.
    # NOTE(review): assumes signature_line contains no nested <div>; verify
    # against a live article page.
    preprocess_regexps = [
        (
            re.compile(
                r'<div[^>]*class="signature_line"[^>]*>\s*'
                r'(<span[^>]*class="story_item_date updated"[^>]*>.*?</span>)'
                r'.*?</div>',
                re.IGNORECASE | re.DOTALL,
            ),
            lambda match: match.group(1),
        ),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'class': 'hnews hentry item'}),
        dict(name='div', attrs={'class': 'story_item_headline entry-title'}),
        dict(name='span', attrs={'class': 'story_item_date updated'}),
        dict(name='div', attrs={'class': 'full_story'}),
    ]

    remove_tags = [
        # Fallback: drops any signature line the regex above did not rewrite.
        dict(name='div', attrs={'class': 'signature_line'}),
        dict(name='div', attrs={'class': 'addthis_toolbox addthis_default_style'}),
        dict(name='div', attrs={'class': ['addthis_toolbox', 'addthis_default_style']}),
        dict(name='span', attrs={'class': 'addthis_separator'}),
        dict(name='div', attrs={'class': 'related_content'}),
        dict(name='div', attrs={'class': 'comments_container'}),
        dict(name='div', attrs={'id': 'comments_container'}),
    ]

    # TODO: uncomment the remaining feeds once the date inclusion and bold
    # titles have been verified on the single feed below.
    feeds = [
        (u'Alaska News', u'http://newsminer.com/rss/rss_feeds/alaska_news?content_type=article&tags=alaska_news&page_name=rss_feeds&instance=alaska_news'),
     #   (u'Local News', u'http://newsminer.com/rss/rss_feeds/local_news?content_type=article&tags=local_news&page_name=rss_feeds&offset=0&instance=local_news'),
     #   (u'Business', u'http://newsminer.com/rss/rss_feeds/business_news?content_type=article&tags=business_news&page_name=rss_feeds&instance=business_news'),
     #   (u'Politics', u'http://newsminer.com/rss/rss_feeds/politics_news?content_type=article&tags=politics_news&page_name=rss_feeds&instance=politics_news'),
     #   (u'Sports', u'http://newsminer.com/rss/rss_feeds/sports_news?content_type=article&tags=sports_news&page_name=rss_feeds&instance=sports_news'),
     #   (u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'),
     #   (u'Sundays', u'http://newsminer.com/rss/rss_feeds/Sundays?content_type=article&tags=alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Sundays'),
     #   (u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'),
     #   (u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'),
     #   (u'Newsminer', u'http://newsminer.com/rss/rss_feeds/Newsminer?content_type=article&tags=ted_stevens_bullets+ted_stevens+sports_news+business_news+fairbanks_grizzlies+dermot_cole_column+outdoors+alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Newsminer'),
     #   (u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'),
     #   (u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'),
     #   (u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'),
     #   (u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'),
     #   (u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin')
           ]

Last edited by rogerx; 08-23-2011 at 12:01 PM. Reason: blah comment
rogerx is offline   Reply With Quote