Addict
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
|
Recipe Updates
Several recipes have been updated with the compress_news_images flag switched on
(the file sizes of the generated ebooks are greatly reduced).
Birmingham Post
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
import re
import mechanize
from calibre.utils.magick import Image
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Calibre recipe for the Birmingham Post (UK regional newspaper)."""

    title = u'Birmingham post'
    description = 'Author D.Asbury. News for Birmingham UK'
    # timefmt = ''
    __author__ = 'Dave Asbury'
    # Static cover; get_cover_url() also returns it whenever the image
    # scraped from the home page cannot be found or reached.
    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
    masthead_url = 'http://www.trinitymirror.com/images/birminghampost-logo.gif'
    oldest_article = 2
    max_articles_per_feed = 20
    linearize_tables = True
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = True
    language = 'en_GB'
    compress_news_images = True

    feeds = [
        # (u'News',u'http://www.birminghampost.net/news/rss.xml'),
        (u'West Mids. News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
        (u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
        (u'Sports', u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
        (u'Bloggs & Comments', u'http://www.birminghampost.net/comment/rss.xml'),
    ]

    def get_cover_url(self):
        """Return the URL of the cover image to use for this issue.

        The home page is searched for an element whose ``height`` attribute
        contains '3' and whose ``alt`` attribute contains 'Post'; its ``src``
        is then probed with redirects disabled.  Any failure along the way
        (page not reachable, element or ``src`` missing, image not reachable)
        yields the static ``cover_url`` instead of aborting the download.
        """
        fallback = self.cover_url
        try:
            soup = self.index_to_soup('http://www.birminghampost.net')
            cov = soup.find(attrs={'height': re.compile('3'), 'alt': re.compile('Post')})
            # cov is None when the page layout changes; the subscript then
            # raises and is handled below rather than killing the recipe.
            cov2 = str(cov['src'])
        except Exception as err:
            self.log.warn('Birmingham Post: cover not found on home page:', err)
            return fallback
        self.log.debug('Birmingham Post: cover candidate', cov2)

        br = mechanize.Browser()
        # A redirect usually means the image has gone; treat it as a failure.
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
        except Exception as err:
            self.log.warn('Birmingham Post: cover not reachable:', err)
            return fallback
        return cov2
Country File
Spoiler:
Code:
from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe
import mechanize
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
#from hashlib import md5
#import urlparse
#declare global temp file
#Feeds_File = config_dir+'\\feeds.txt'
# needed for getting rid of repeat feeds
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Calibre recipe for Countryfile.com (Countryfile Magazine website)."""

    title = u'Countryfile.com'
    # cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
    __author__ = 'Dave Asbury'
    description = 'The official website of Countryfile Magazine'
    # last updated 19/10/12
    language = 'en_GB'
    oldest_article = 30
    max_articles_per_feed = 25
    remove_empty_feeds = True
    no_stylesheets = True
    auto_cleanup = True
    compress_news_images = True
    # The same story can appear in several of the feeds below.
    # NOTE(review): a hand-written parse_feeds() that recorded article titles
    # in a feeds.txt file under config_dir and blanked the URL of repeats
    # used to be kept here commented out; presumably this flag superseded it.
    ignore_duplicate_articles = {'title', 'url'}
    # articles_are_obfuscated = True

    remove_tags = [
        # dict(attrs={'class' : ['player']}),
    ]

    feeds = [
        (u'Homepage', u'http://www.countryfile.com/rss/home'),
        (u'Country News', u'http://www.countryfile.com/rss/news'),
        (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
    ]

    def get_cover_url(self):
        """Return the URL of the current magazine cover, or a known cover.

        The magazine page is searched for the element whose class matches
        'imagecache imagecache-250px_wide'; the first site-relative path
        found in that element's markup is turned into an absolute URL and
        probed with redirects disabled.  Any failure falls back to a fixed,
        known cover image.
        """
        # ``re`` is not among this recipe's module-level imports, so import
        # it here; without it the method failed with NameError on every run.
        import re

        fallback = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
        try:
            soup = self.index_to_soup('http://www.countryfile.com/magazine')
            cov = soup.find(attrs={'class': re.compile('imagecache imagecache-250px_wide')})
            paths = re.findall(
                r'/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                str(cov))
        except Exception as err:
            self.log.warn('Countryfile: cover lookup failed:', err)
            return fallback
        if not paths:
            # No element found (str(None) has no '/') or no path in it.
            return fallback
        # Take the first match directly instead of slicing fixed offsets out
        # of the list's string form, which only worked for one exact trailer.
        cov2 = 'http://www.countryfile.com' + paths[0]
        self.log.debug('Countryfile: cover candidate', cov2)

        # try to get cover - if can't, use the known cover
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
        except Exception as err:
            self.log.warn('Countryfile: cover not reachable:', err)
            return fallback
        return cov2
Metro Uk
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
import re
import datetime
import time
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    """Calibre recipe that scrapes Metro UK section pages for recent posts."""

    title = u'Metro UK'
    description = 'News from The Metro, UK'
    # timefmt = ''
    # __author__ = 'Dave Asbury'
    # last update 4/4/13
    # cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
    cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1638332595/METRO_LETTERS-01.jpg'
    masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
    remove_empty_feeds = True
    remove_javascript = True
    auto_cleanup = True
    max_articles_per_feed = 12
    ignore_duplicate_articles = {'title', 'url'}
    encoding = 'UTF-8'
    language = 'en_GB'
    compress_news_images = True

    def parse_index(self):
        """Build the article index from the section pages.

        Returns a list of ``(section_title, articles)`` tuples in the order
        the sections are declared below.  Each article is a dict with the
        keys ``title``, ``url``, ``date``, ``description`` and ``content``.

        Only ``<a class="post">`` links are considered.  The publication
        date is read from the ``/YYYY/MM/DD/`` part of the link URL; links
        without a parseable date, or dated before yesterday 00:00 local
        time, are left out.
        """
        sections = [
            ('UK', 'http://metro.co.uk/news/uk/'),
            ('World', 'http://metro.co.uk/news/world/'),
            ('Weird', 'http://metro.co.uk/news/weird/'),
            ('Money', 'http://metro.co.uk/news/money/'),
            ('Sport', 'http://metro.co.uk/sport/'),
            ('Guilty Pleasures', 'http://metro.co.uk/guilty-pleasures/'),
        ]

        # Oldest accepted timestamp: today's midnight minus 24 hours.
        # Computed once so every section uses the same cutoff.
        midnight = datetime.date.today()
        cutoff = time.mktime(midnight.timetuple()) - 60 * 60 * 24

        ans = []
        for section, page_url in sections:
            soup = self.index_to_soup(page_url)
            found = []
            ans.append((section, found))

            for a in soup.findAll('a'):
                if a.get('class') != 'post':
                    continue
                url = a.get('href')
                title = a.get('title')
                if url is None or title is None:
                    # Without both attributes there is nothing to download.
                    continue

                m = re.search('^.*uk/([^/]*)/([^/]*)/([^/]*)/', url)
                if m is None:
                    # URL carries no three-segment path after 'uk/'.
                    continue
                try:
                    dt = datetime.datetime.strptime('-'.join(m.groups()), '%Y-%m-%d')
                except ValueError:
                    # The three segments are not a date (e.g. a section path).
                    continue
                if time.mktime(dt.timetuple()) < cutoff:
                    continue

                pubdate = time.strftime('%a, %d %b', dt.timetuple())
                self.log.debug('Metro UK:', title, pubdate)

                summary = a.find(True, attrs={'class': 'excerpt'})
                if summary:
                    description = self.tag_to_string(summary, use_alt=False)
                else:
                    description = ''

                found.append(
                    dict(title=title, url=url, date=pubdate,
                         description=description,
                         content=''))

        # ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
        return ans
The NME
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
import mechanize
import re
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """Calibre recipe for the New Musical Express (NME) website."""

    title = u'New Musical Express Magazine'
    description = 'Author D.Asbury. UK Rock & Pop Mag. '
    __author__ = 'Dave Asbury'
    # last updated 7/10/12
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 20
    # auto_cleanup = True
    language = 'en_GB'
    compress_news_images = True
    # The NME logo; get_cover_url() also returns it when the current
    # magazine cover cannot be found or reached.
    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'

    remove_tags = [
        dict(attrs={'class': 'clear_icons'}),
        dict(attrs={'class': 'share_links'}),
        dict(attrs={'id': 'right_panel'}),
        dict(attrs={'class': 'today box'}),
    ]

    keep_only_tags = [
        dict(name='h1'),
        # dict(name='h3'),
        dict(attrs={'class': 'BText'}),
        dict(attrs={'class': 'Bmore'}),
        dict(attrs={'class': 'bPosts'}),
        dict(attrs={'class': 'text'}),
        dict(attrs={'id': 'article_gallery'}),
        # dict(attrs={'class' : 'image'}),
        dict(attrs={'class': 'article_text'}),
    ]

    feeds = [
        (u'NME News', u'http://feeds.feedburner.com/nmecom/rss/newsxml?format=xml'),
        # (u'Reviews', u'http://feeds2.feedburner.com/nme/SdML'),
        (u'Reviews', u'http://feed43.com/1817687144061333.xml'),
        (u'Bloggs', u'http://feed43.com/3326754333186048.xml'),
    ]

    extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''

    def get_cover_url(self):
        """Return the URL of the current magazine cover, or the NME logo.

        The subscription page is searched for the element with id
        'magazine_cover'; its ``src`` is probed with redirects disabled.
        If the page cannot be fetched, the element or its ``src`` is
        missing, or the image cannot be opened, the logo URL is returned.
        """
        fallback = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
        try:
            soup = self.index_to_soup('http://www.nme.com/component/subscribe')
            cov = soup.find(attrs={'id': 'magazine_cover'})
            # cov is None when the page layout changes; the subscript then
            # raises and is handled below rather than killing the recipe.
            cov2 = str(cov['src'])
        except Exception as err:
            self.log.warn('NME: cover not found on subscribe page:', err)
            return fallback
        self.log.debug('NME: cover candidate', cov2)

        br = mechanize.Browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
        except Exception as err:
            self.log.warn('NME: cover not reachable:', err)
            return fallback
        return cov2
Daily Mirror
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
import re
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    """Calibre recipe for The Daily Mirror (UK)."""

    title = u'The Daily Mirror'
    description = 'News as provided by The Daily Mirror -UK'
    __author__ = 'Dave Asbury'
    # last updated 19/10/12
    language = 'en_GB'
    # cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
    compress_news_images = True
    oldest_article = 1
    max_articles_per_feed = 12
    remove_empty_feeds = True
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {'title'}
    # auto_cleanup = True
    # conversion_options = { 'linearize_tables' : True }

    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class': 'lead-text'}),
        dict(name='div', attrs={'class': 'styleGroup clearfix'}),
        dict(name='div', attrs={'class': 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}),
        # dict(name='figure',attrs={'class' : 'clearfix'}),
        dict(name='div', attrs={'class': 'body '}),
        # dict(attrs={'class' : ['article-attr','byline append-1','published']}),
        # dict(name='p'),
    ]

    remove_tags = [
        dict(attrs={'class': ['article sa-teaser type-opinion', 'image-gallery', 'gallery-caption']}),
        dict(attrs={'class': 'comment'}),
        dict(name='title'),
        dict(name='ul', attrs={'class': 'clearfix breadcrumbs '}),
        dict(name='ul', attrs={'id': 'login-201109171215'}),
        # also considered: 'widget navigation breadcrumb widget-editable viziwyg-section-198 inpage-widget-80721 span-17','image-credit'
        dict(name='div', attrs={'class': ['inline-ad span-16 last', 'caption']}),
    ]

    # Strip the site suffix wherever it appears in the downloaded markup.
    preprocess_regexps = [
        (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]

    feeds = [
        (u'News', u'http://www.mirror.co.uk/news/rss.xml'),
        (u'Sports', u'http://www.mirror.co.uk/sport/rss.xml'),
        (u'3AM', u'http://www.mirror.co.uk/3am/rss.xml'),
        (u'Lifestyle', u'http://www.mirror.co.uk/lifestyle/rss.xml'),
        # example of commented out feed not needed ,(u'Travel','http://www.mirror.co.uk/advice/travel/rss.xml')
    ]

    extra_css = '''
h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
'''

    def _find_front_page_image(self):
        """Return today's front-page image URL from politicshome.com, or None.

        Raises whatever ``index_to_soup`` raises when a page cannot be
        fetched; the caller treats that as "no cover".
        """
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # look for the block containing the mirror button and url
        button = soup.find(attrs={'style': 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
        if button is None:
            return None
        # NOTE(review): this used to be str(tag)[9:-142]; 9 is the length of
        # '<a href="', so the href attribute is read directly instead of
        # relying on the exact length of the rest of the tag.  Confirm the
        # href is site-relative, as the prefix below assumes.
        href = button.get('href')
        if not href:
            return None

        # The linked page holds the full-size picture in the element #large.
        page = self.index_to_soup('http://www.politicshome.com' + href)
        large = page.find(attrs={'id': 'large'})
        if large is None:
            return None
        urls = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            str(large))
        # First absolute URL in the element's markup; previously obtained by
        # trimming the list's string form, which broke on more than one match.
        return urls[0] if urls else None

    def get_cover_url(self):
        """Return today's front page as the cover, or a static fallback.

        The fallback is used when the front-page image cannot be located,
        when either politicshome.com page cannot be fetched, or when the
        image itself cannot be opened with redirects disabled.
        """
        fallback = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373019_6149699161_1710984811_n.jpg'
        try:
            cov2 = self._find_front_page_image()
        except Exception as err:
            self.log.warn('Daily Mirror: cover lookup failed:', err)
            return fallback
        if not cov2:
            return fallback
        self.log.debug('Daily Mirror: cover candidate', cov2)

        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
        except Exception as err:
            self.log.warn('Daily Mirror: cover not reachable:', err)
            return fallback
        return cov2
The Sun
Spoiler:
Code:
import re, random
from calibre import browser
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    """Calibre recipe for The Sun (UK tabloid)."""

    title = u'The Sun UK'
    description = 'Recipe Author D.Asbury. Articles from The Sun tabloid UK'
    __author__ = 'Dave Asbury'
    # last updated 19/10/12 better cover fetch
    language = 'en_GB'
    oldest_article = 1
    max_articles_per_feed = 15
    remove_empty_feeds = True
    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
    encoding = 'UTF-8'
    remove_javascript = True
    no_stylesheets = True
    ignore_duplicate_articles = {'title', 'url'}
    compress_news_images = True

    extra_css = '''
body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
'''

    keep_only_tags = [
        dict(name='div', attrs={'class': 'intro'}),
        dict(name='h3'),
        dict(name='div', attrs={'id': 'articlebody'}),
        # dict(attrs={'class' : ['right_col_branding','related-stories','mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
        # dict(name='div',attrs={'class' : 'cf'}),
        # dict(attrs={'title' : 'download flash'}),
        # dict(attrs={'style' : 'padding: 5px'})
    ]

    remove_tags_after = [dict(id='bodyText')]

    remove_tags = [
        dict(name='li'),
        dict(attrs={'class': 'grid-4 right-hand-column'}),
    ]

    feeds = [
        (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
        (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
        (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
        (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
    ]

    # (upper-cased title fragment, URL fragment) pairs identifying recurring
    # items that parse_feeds() drops.
    _UNWANTED = (
        ('TRY OUT THE SUN', 'Try-out-The-Suns'),
        ('WEB PORN HARMS KIDS', 'Sun-says-Web-porn'),
    )

    # starsons code
    def parse_feeds(self):
        """Parse the feeds, dropping the recurring items listed in _UNWANTED.

        An article is removed when its upper-cased title contains a listed
        title fragment or its URL contains the matching URL fragment.
        Returns the list of feeds produced by ``BasicNewsRecipe.parse_feeds``
        with those articles removed in place.
        """
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            # Iterate over a copy: articles are removed from the live list.
            for article in feed.articles[:]:
                # The title is upper-cased, so the fragments must be too;
                # mixed-case fragments could never match.
                title = (article.title or '').upper()
                url = article.url or ''
                self.log.debug('article.title is: ', article.title)
                for title_part, url_part in self._UNWANTED:
                    if title_part in title or url_part in url:
                        feed.articles.remove(article)
                        # Remove once only; a second remove() would raise.
                        break
        return feeds

    def _find_front_page_image(self):
        """Return today's front-page image URL from politicshome.com, or None.

        Raises whatever ``index_to_soup`` raises when a page cannot be
        fetched; the caller treats that as "no cover".
        """
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        # look for the block containing the sun button and url
        button = soup.find(attrs={'style': 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
        if button is None:
            return None
        # NOTE(review): this used to be str(tag)[9:-133]; 9 is the length of
        # '<a href="', so the href attribute is read directly instead of
        # relying on the exact length of the rest of the tag.  Confirm the
        # href is site-relative, as the prefix below assumes.
        href = button.get('href')
        if not href:
            return None

        # The linked page holds the full-size picture in the element #large.
        page = self.index_to_soup('http://www.politicshome.com' + href)
        large = page.find(attrs={'id': 'large'})
        if large is None:
            return None
        urls = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            str(large))
        # First absolute URL in the element's markup; previously obtained by
        # trimming the list's string form, which broke on more than one match.
        return urls[0] if urls else None

    def get_cover_url(self):
        """Return today's front page as the cover, or a random stock image.

        A stock image is chosen when the front-page image cannot be located,
        when either politicshome.com page cannot be fetched, or when the
        image itself cannot be opened with redirects disabled.
        """
        stock_covers = [
            'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg',
            'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg',
            'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg',
            'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg',
            'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg',
        ]
        try:
            cov2 = self._find_front_page_image()
        except Exception as err:
            self.log.warn('The Sun: cover lookup failed:', err)
            return random.choice(stock_covers)
        if not cov2:
            return random.choice(stock_covers)
        self.log.debug('The Sun: cover candidate', cov2)

        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
        except Exception as err:
            self.log.warn('The Sun: cover not reachable:', err)
            return random.choice(stock_covers)
        return cov2
Last edited by scissors; 04-04-2013 at 09:36 AM.
Reason: birm post max articles now 20
|