View Single Post
Old 12-27-2011, 03:14 AM   #3
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
Possibly due to recent changes in calibre, whole articles were being missed.

The Mirror recipe has been adjusted and is working correctly again.

Spoiler:

Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """calibre news recipe that builds an e-book from The Daily Mirror RSS feeds.

    Articles are cleaned with calibre's ``auto_cleanup``; a couple of leftover
    text fragments are stripped with ``preprocess_regexps`` and every image is
    converted to greyscale in ``postprocess_html``.
    """

    title = u'The Daily Mirror'
    description = 'News as provide by The Daily Mirror -UK'

    __author__ = 'Dave Asbury'
    # last updated 26/12/11
    language = 'en_GB'

    cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'

    # Only fetch articles from the last day, at most 20 per feed.
    oldest_article = 1
    max_articles_per_feed = 20
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = True

    remove_tags = [
        dict(name='title'),
        dict(name='div', attrs={'class': ['inline-ad span-16 last', 'caption']}),
    ]

    # Both substitutions must live in ONE list: assigning preprocess_regexps
    # twice (as this recipe previously did) silently discards the first list,
    # so the "- mirror.co.uk" clean-up never ran.  The dots are escaped so the
    # pattern only matches the literal site suffix.
    preprocess_regexps = [
        (re.compile(r'- mirror\.co\.uk', re.IGNORECASE), lambda match: ''),
        (re.compile(r'Advertisement >>', re.IGNORECASE), lambda match: ''),
    ]

    # (section title, RSS url) pairs; comment out a line to drop that section.
    feeds = [
        (u'News', u'http://www.mirror.co.uk/news/rss.xml'),
        (u'Tech News', u'http://www.mirror.co.uk/news/technology/rss.xml'),
        (u'Weird World', 'http://www.mirror.co.uk/news/weird-world/rss.xml'),
        (u'Film Gossip', 'http://www.mirror.co.uk/celebs/film/rss.xml'),
        (u'Music News', 'http://www.mirror.co.uk/celebs/music/rss.xml'),
        (u'Celebs and Tv Gossip', 'http://www.mirror.co.uk/celebs/tv/rss.xml'),
        (u'Sport', 'http://www.mirror.co.uk/sport/rss.xml'),
        (u'Life Style', 'http://www.mirror.co.uk/life-style/rss.xml'),
        (u'Advice', 'http://www.mirror.co.uk/advice/rss.xml'),
        (u'Travel', 'http://www.mirror.co.uk/advice/travel/rss.xml'),
    ]

    extra_css  = '''
	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                    h1{ font-size:18px;}
                    img { display:block}
                	 '''

    def postprocess_html(self, soup, first):
        """Convert every image referenced by the article to greyscale.

        Each ``<img>`` that carries a ``src`` attribute is opened, switched to
        the greyscale image type and saved back to the same path.
        NOTE(review): this presumes ``src`` already points at the locally
        downloaded copy of the image at this stage - confirm against the
        calibre recipe API documentation.

        :param soup: parsed article HTML.
        :param first: passed in by calibre; not used here.
        :return: the same ``soup`` object (only the image files are modified).
        """
        # src=True selects only <img> tags that actually have a src attribute,
        # replacing the deprecated Tag.has_key() lookup.
        for tag in soup.findAll('img', src=True):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            # The former "if img < 0" guard compared an Image object with an
            # int: it could never fire and raises TypeError on Python 3.
            img.type = "GrayscaleType"
            img.save(iurl)
        return soup
scissors is offline   Reply With Quote