update 8/12/11
Found that when creating mobi file large junks of article were being thrown out.
This does mean there is a small amount of unwanted text at the end of articles.
I've tried various ways of removing it - but always the mobi gets messed up.
epubs aren't affected.
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image, PixelWand
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
title = u'The Daily Mirror'
description = 'News as provide by The Daily Mirror -UK'
__author__ = 'Dave Asbury'
# last updated 8/12/11
language = 'en_GB'
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
oldest_article = 1
max_articles_per_feed = 20
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
#auto_cleanup = True
conversion_options = { 'linearize_tables' : True }
keep_only_tags = [
dict(name='div',attrs={'id' : 'body-content'})
]
remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
remove_tags = [
dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
]
preprocess_regexps = [
(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
preprocess_regexps = [
(re.compile(r'Sponsored Links', re.IGNORECASE | re.DOTALL), lambda match: '')]
feeds = [
(u'News', u'http://www.mirror.co.uk/news/rss.xml')
,(u'Tech News', u'http://www.mirror.co.uk/news/technology/rss.xml')
,(u'Weird World','http://www.mirror.co.uk/news/weird-world/rss.xml')
,(u'Film Gossip','http://www.mirror.co.uk/celebs/film/rss.xml')
,(u'Music News','http://www.mirror.co.uk/celebs/music/rss.xml')
,(u'Celebs and Tv Gossip','http://www.mirror.co.uk/celebs/tv/rss.xml')
,(u'Sport','http://www.mirror.co.uk/sport/rss.xml')
,(u'Life Style','http://www.mirror.co.uk/life-style/rss.xml')
,(u'Advice','http://www.mirror.co.uk/advice/rss.xml')
,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
# example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
]
extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
h1{ font-size:16px;}
img { display:block}
'''
def postprocess_html(self, soup, first):
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
if img < 0:
raise RuntimeError('Out of memory')
img.type = "GrayscaleType"
img.save(iurl)
return soup
updated 30/10/11
Old version was missing parts of articles.
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
title = u'The Daily Mirror'
description = 'News as provide by The Daily Mirror -UK'
__author__ = 'Dave Asbury'
# last updated 30/10/11
language = 'en_GB'
cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
oldest_article = 2
max_articles_per_feed = 30
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''
keep_only_tags = [
dict(name='div',attrs={'id' : 'body-content'})
]
remove_tags_after = [dict (name='div',attrs={'class' : 'related'})]
remove_tags = [
dict(name='div',attrs={'id' : ['sidebar','menu','search-box','roffers-top']}),
dict(name='div',attrs={'class' :['inline-ad span-16 last','article-resize','related','list teasers']}),
dict(attrs={'class' : ['channellink','article-tags','replace','append-html']}),
dict(name='div',attrs={'class' : 'span-12 last sl-others addthis_toolbox addthis_default_style'})
]
preprocess_regexps = [
(re.compile(r'<dl class="q-search">.*?</dl>', re.IGNORECASE | re.DOTALL), lambda match: '')]
feeds = [
(u'News', u'http://www.mirror.co.uk/news/rss.xml')
,(u'Tech News', u'http://www.mirror.co.uk/news/technology/rss.xml')
,(u'Weird World','http://www.mirror.co.uk/news/weird-world/rss.xml')
,(u'Film Gossip','http://www.mirror.co.uk/celebs/film/rss.xml')
,(u'Music News','http://www.mirror.co.uk/celebs/music/rss.xml')
,(u'Celebs and Tv Gossip','http://www.mirror.co.uk/celebs/tv/rss.xml')
,(u'Sport','http://www.mirror.co.uk/sport/rss.xml')
,(u'Life Style','http://www.mirror.co.uk/life-style/rss.xml')
,(u'Advice','http://www.mirror.co.uk/advice/rss.xml')
,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
# example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
]
Last edited by scissors; 12-08-2011 at 02:54 PM.
Reason: Found that when creating mobi file large junks of article were being thrown out.
Added alternative recipe using auto clean up.
I found the first paragraph of text following and image was being displayed off to the right of the image.