Thread: Recipe Updates
Old 04-04-2013, 09:28 AM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
Recipe Updates

Several recipes updated with the compress_news_images flag switched on, which greatly reduces the file sizes of the resulting ebooks.
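If you want the same saving in your own recipes it is a one-line change. A minimal sketch, not a full recipe (the title and feed url are just placeholders):

Code:
from calibre.web.feeds.news import BasicNewsRecipe

class ExampleRecipe(BasicNewsRecipe):
    title          = u'Example Feed'
    feeds          = [(u'News', u'http://example.com/rss.xml')]
    # recompress downloaded article images to shrink the output ebook
    compress_news_images = True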

Birmingham Post
Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
import mechanize

class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    title          = u'Birmingham post'
    description = 'Author D.Asbury. News for Birmingham UK'
    #timefmt = ''
    __author__ = 'Dave Asbury'
    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
    oldest_article = 2
    max_articles_per_feed = 20
    linearize_tables = True
    remove_empty_feeds = True
    remove_javascript     = True
    no_stylesheets = True
    auto_cleanup = True
    language = 'en_GB'
    compress_news_images = True

    masthead_url        = 'http://www.trinitymirror.com/images/birminghampost-logo.gif'
    def get_cover_url(self):
        soup = self.index_to_soup('http://www.birminghampost.net')
        # look for the block containing the Post masthead image and its url
        cov = soup.find(attrs={'height' : re.compile('3'), 'alt' : re.compile('Post')})
        cov2 = str(cov['src'])
        # probe the scraped url; if it fails, fall back to the known cover
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
            cover_url = cov2
        except:
            cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/161987_9010212100_2035706408_n.jpg'
        return cover_url
    
    

    feeds          = [
        #(u'News', u'http://www.birminghampost.net/news/rss.xml'),
        (u'West Mids. News', u'http://www.birminghampost.net/news/west-midlands-news/rss.xml'),
        (u'UK News', u'http://www.birminghampost.net/news/uk-news/rss.xml'),
        (u'Sports', u'http://www.birminghampost.net/midlands-birmingham-sport/rss.xml'),
        (u'Blogs & Comments', u'http://www.birminghampost.net/comment/rss.xml')
        ]
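
Each recipe's get_cover_url below uses the same trick: probe the scraped image url with a redirect-disabled browser, and drop back to a known-good cover if anything goes wrong. As a standalone sketch (the helper name is mine, not part of calibre):

Code:
import mechanize

def probe_cover(candidate_url, fallback_url):
    # a redirect or an error usually means the image is not really there
    br = mechanize.Browser()
    br.set_handle_redirect(False)
    try:
        br.open_novisit(candidate_url)
        return candidate_url
    except:
        return fallback_url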


Country File

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import mechanize
import re

class AdvancedUserRecipe1325006965(BasicNewsRecipe):
    title          = u'Countryfile.com'
    #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
    __author__ = 'Dave Asbury'
    description           = 'The official website of Countryfile Magazine'
    # last updated 19/10/12
    language = 'en_GB'
    oldest_article = 30
    max_articles_per_feed = 25
    remove_empty_feeds = True
    no_stylesheets = True
    auto_cleanup = True
    compress_news_images = True
    ignore_duplicate_articles = {'title', 'url'}
    #articles_are_obfuscated = True

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.countryfile.com/magazine')
        cov = soup.find(attrs={'class' : re.compile('imagecache imagecache-250px_wide')})
        # pull the image path out of the tag and rebuild the full url
        cov2 = re.findall('/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(cov))
        cov2 = str(cov2)
        cov2 = "http://www.countryfile.com" + cov2[2:len(cov2)-8]
        # probe the scraped url; if it fails, fall back to a known cover
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
            cover_url = cov2
        except:
            cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
        return cover_url
    remove_tags = [
        # dict(attrs={'class' : ['player']}),
        ]

    feeds          = [
        (u'Homepage', u'http://www.countryfile.com/rss/home'),
        (u'Country News', u'http://www.countryfile.com/rss/news'),
        (u'Countryside', u'http://www.countryfile.com/rss/countryside'),
        ]

    
    # An earlier version of this recipe kept a feeds.txt file in the calibre
    # config folder and overrode parse_feeds to skip any article whose title
    # was already listed there. ignore_duplicate_articles (set above) now does
    # that job, so the old code has been dropped.
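
The old feeds.txt approach boils down to an in-memory duplicate filter. For reference, this is roughly what it looks like as a parse_feeds override - a sketch only, since ignore_duplicate_articles already does the job:

Code:
    def parse_feeds(self):
        # drop any article whose title has already been seen in this run
        feeds = BasicNewsRecipe.parse_feeds(self)
        seen = set()
        for feed in feeds:
            for article in feed.articles[:]:
                if article.title in seen:
                    feed.articles.remove(article)
                else:
                    seen.add(article.title)
        return feeds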


Metro Uk

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
import re
import datetime
import time

class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    title          = u'Metro UK'
    description = 'News from The Metro, UK'
    #timefmt = ''
    #__author__ = 'Dave Asbury'
    #last update 4/4/13
    #cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
    
    cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1638332595/METRO_LETTERS-01.jpg'
    remove_empty_feeds = True
    remove_javascript     = True
    auto_cleanup = True
    max_articles_per_feed = 12
    ignore_duplicate_articles = {'title', 'url'}
    encoding = 'UTF-8'

    language = 'en_GB'
    masthead_url        = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
    compress_news_images = True
    def parse_index(self):
        articles = {}
        ans = []
        feeds = [('UK', 'http://metro.co.uk/news/uk/'),
                 ('World', 'http://metro.co.uk/news/world/'),
                 ('Weird', 'http://metro.co.uk/news/weird/'),
                 ('Money', 'http://metro.co.uk/news/money/'),
                 ('Sport', 'http://metro.co.uk/sport/'),
                 ('Guilty Pleasures', 'http://metro.co.uk/guilty-pleasures/')
                 ]
        for key, feed in feeds:
            soup = self.index_to_soup(feed)
            articles[key] = []
            ans.append(key)

            # keep only articles dated within the last 24 hours
            today = datetime.date.today()
            today = time.mktime(today.timetuple()) - 60*60*24

            for a in soup.findAll('a'):
                for name, value in a.attrs:
                    if name == "class" and value == "post":
                        url = a['href']
                        title = a['title']
                        description = ''
                        # the article date is embedded in the url as .../yyyy/mm/dd/...
                        m = re.search('^.*uk/([^/]*)/([^/]*)/([^/]*)/', url)
                        skip = 1
                        if m is not None and len(m.groups()) == 3:
                            g = m.groups()
                            dt = datetime.datetime.strptime(g[0]+'-'+g[1]+'-'+g[2], '%Y-%m-%d')
                            pubdate = time.strftime('%a, %d %b', dt.timetuple())
                            dt = time.mktime(dt.timetuple())
                            if dt >= today:
                                skip = 0
                        else:
                            pubdate = strftime('%a, %d %b')

                        summary = a.find(True, attrs={'class':'excerpt'})
                        if summary:
                            description = self.tag_to_string(summary, use_alt=False)

                        if skip == 0:
                            articles[key].append(
                                dict(title=title, url=url, date=pubdate,
                                     description=description, content=''))
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
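
parse_index has to hand back a list of (section title, article list) pairs, with each article a plain dict - that is the shape the last two lines above build. Roughly (the values are made up for illustration):

Code:
[('UK', [{'title': u'Some headline',
          'url': u'http://metro.co.uk/news/uk/2013/04/04/some-headline/',
          'date': u'Thu, 04 Apr',
          'description': u'',
          'content': u''}])]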


The NME

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import mechanize
import re
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    title          = u'New Musical Express Magazine'
    description = 'Author D.Asbury. UK Rock & Pop Mag. '
    __author__ = 'Dave Asbury'
    # last updated 7/10/12
    remove_empty_feeds = True
    remove_javascript     = True
    no_stylesheets = True
    oldest_article = 7
    max_articles_per_feed = 20
    #auto_cleanup = True
    language = 'en_GB'
    compress_news_images = True
    masthead_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.nme.com/component/subscribe')
        cov = soup.find(attrs={'id' : 'magazine_cover'})
        cov2 = str(cov['src'])
        # probe the scraped url; if it fails, fall back to the NME logo
        br = mechanize.Browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
            cover_url = str(cov2)
        except:
            cover_url = 'http://tawanda3000.files.wordpress.com/2011/02/nme-logo.jpg'
        return cover_url

    remove_tags = [
        dict(attrs={'class' : 'clear_icons'}),
        dict(attrs={'class' : 'share_links'}),
        dict(attrs={'id' : 'right_panel'}),
        dict(attrs={'class' : 'today box'}),
        ]

    keep_only_tags = [
        dict(name='h1'),
        #dict(name='h3'),
        dict(attrs={'class' : 'BText'}),
        dict(attrs={'class' : 'Bmore'}),
        dict(attrs={'class' : 'bPosts'}),
        dict(attrs={'class' : 'text'}),
        dict(attrs={'id' : 'article_gallery'}),
        #dict(attrs={'class' : 'image'}),
        dict(attrs={'class' : 'article_text'})
        ]


	         

    feeds          = [
        (u'NME News', u'http://feeds.feedburner.com/nmecom/rss/newsxml?format=xml'),
        #(u'Reviews', u'http://feeds2.feedburner.com/nme/SdML'),
        (u'Reviews', u'http://feed43.com/1817687144061333.xml'),
        (u'Blogs', u'http://feed43.com/3326754333186048.xml'),
        ]

    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                '''


Daily Mirror

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
import re

class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    title          = u'The Daily Mirror'
    description = 'News as provided by The Daily Mirror -UK'

    __author__ = 'Dave Asbury'
    # last updated 19/10/12
    language = 'en_GB'
    #cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'

    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'

    compress_news_images = True
    oldest_article = 1
    max_articles_per_feed = 12
    remove_empty_feeds = True
    remove_javascript     = True
    no_stylesheets = True
    ignore_duplicate_articles = {'title'}
    
   # auto_cleanup = True
    #conversion_options = { 'linearize_tables' : True }



    keep_only_tags = [
        dict(name='h1'),
        dict(name='div', attrs={'class' : 'lead-text'}),
        dict(name='div', attrs={'class' : 'styleGroup clearfix'}),
        dict(name='div', attrs={'class' : 'widget relatedContents pictures widget-editable viziwyg-section-245 inpage-widget-158123'}),
        #dict(name='figure', attrs={'class' : 'clearfix'}),
        dict(name='div', attrs={'class' : 'body '}),
        #dict(attrs={'class' : ['article-attr','byline append-1','published']}),
        #dict(name='p'),
        ]


    remove_tags = [
           dict(attrs={'class' : ['article sa-teaser type-opinion','image-gallery','gallery-caption']}),
           dict(attrs={'class' : 'comment'}),
           dict(name='title'),
           dict(name='ul',attrs={'class' :  'clearfix breadcrumbs '}),
           dict(name='ul',attrs={'id' : 'login-201109171215'}),
           dict(name='div',attrs={'class' : ['inline-ad span-16 last','caption']}),#'widget navigation breadcrumb widget-editable viziwyg-section-198 inpage-widget-80721 span-17','image-credit'
                    ]

    preprocess_regexps = [
        (re.compile(r'- mirror.co.uk', re.IGNORECASE | re.DOTALL), lambda match: '')]


    feeds          = [
        (u'News', u'http://www.mirror.co.uk/news/rss.xml'),
        (u'Sports', u'http://www.mirror.co.uk/sport/rss.xml'),
        (u'3AM', u'http://www.mirror.co.uk/3am/rss.xml'),
        (u'Lifestyle', u'http://www.mirror.co.uk/lifestyle/rss.xml')
        # example of a commented-out feed: ,(u'Travel', u'http://www.mirror.co.uk/advice/travel/rss.xml')
        ]

    extra_css = '''
                    h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
                    h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
                    p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
                    body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
                '''

    def get_cover_url(self):
        # politicshome.com carries thumbnails of the day's front pages;
        # find the Mirror button, then follow it to the full-size image page
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
        cov2 = str(cov)
        cov2 = 'http://www.politicshome.com' + cov2[9:-142]
        # cov2 now holds the url of the page containing the picture
        soup = self.index_to_soup(cov2)
        cov = soup.find(attrs={'id' : 'large'})
        # pull the image url out of the tag
        cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(cov))
        cov2 = str(cov2)
        cov2 = cov2[2:len(cov2)-2]
        # probe the url; if it fails, fall back to a known cover
        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
            cover_url = cov2
        except:
            cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373019_6149699161_1710984811_n.jpg'
        return cover_url



The Sun

Spoiler:
Code:
import re, random

from calibre import browser
from calibre.web.feeds.recipes import BasicNewsRecipe

class AdvancedUserRecipe1325006965(BasicNewsRecipe):

    title          = u'The Sun UK'
    description = 'Recipe Author D.Asbury. Articles from The Sun tabloid UK'
    __author__ = 'Dave Asbury'
    # last updated 19/10/12 better cover fetch
    language = 'en_GB'
    oldest_article = 1
    max_articles_per_feed = 15
    remove_empty_feeds = True
    
    masthead_url = 'http://www.thesun.co.uk/sol/img/global/Sun-logo.gif'
    encoding = 'UTF-8'
    remove_javascript     = True
    no_stylesheets = True
    
    ignore_duplicate_articles = {'title','url'}
    compress_news_images = True

    extra_css  = '''
    body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
           '''
    keep_only_tags = [
        dict(name='div', attrs={'class' : 'intro'}),
        dict(name='h3'),
        dict(name='div', attrs={'id' : 'articlebody'}),
        #dict(attrs={'class' : ['right_col_branding','related-stories','mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
        #dict(name='div', attrs={'class' : 'cf'}),
        #dict(attrs={'title' : 'download flash'}),
        #dict(attrs={'style' : 'padding: 5px'})
        ]
    remove_tags_after = [dict(id='bodyText')]
    remove_tags = [
        dict(name='li'),
        dict(attrs={'class' : 'grid-4 right-hand-column'}),
        ]

    feeds          = [
    (u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
    (u'Sport', u'http://www.thesun.co.uk/sol/homepage/sport/rss'),
    (u'Showbiz', u'http://www.thesun.co.uk/sol/homepage/showbiz/rss'),
    (u'Woman', u'http://www.thesun.co.uk/sol/homepage/woman/rss'),
    ]
    # starsons code
    def parse_feeds(self):
        # drop known promo/junk articles by title or url; note the comparison
        # strings must be upper-case to match against article.title.upper()
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            for article in feed.articles[:]:
                if 'TRY OUT THE SUN' in article.title.upper() or 'Try-out-The-Suns' in article.url:
                    feed.articles.remove(article)
                elif 'WEB PORN HARMS KIDS' in article.title.upper() or 'Sun-says-Web-porn' in article.url:
                    feed.articles.remove(article)
        return feeds

    def get_cover_url(self):
        # politicshome.com carries thumbnails of the day's front pages;
        # find the Sun button, then follow it to the full-size image page
        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
        cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
        cov2 = str(cov)
        cov2 = 'http://www.politicshome.com' + cov2[9:-133]
        # cov2 now holds the url of the page containing the picture
        soup = self.index_to_soup(cov2)
        cov = soup.find(attrs={'id' : 'large'})
        # pull the image url out of the tag
        cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', str(cov))
        cov2 = str(cov2)
        cov2 = cov2[2:len(cov2)-2]
        # probe the url; if it fails, fall back to one of the known error-page covers
        br = browser()
        br.set_handle_redirect(False)
        try:
            br.open_novisit(cov2)
            cover_url = cov2
        except:
            cover_url = random.choice([
                'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage7_677962a_905505a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage5_677960a_905512a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage2_677957a_905502a.jpg'
                ,'http://img.thesun.co.uk/multimedia/archive/00905/errorpage3_677958a_905503a.jpg'
                ])
        return cover_url

Last edited by scissors; 04-04-2013 at 09:36 AM. Reason: birm post max articles now 20