Register Guidelines E-Books Today's Posts Search

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes

Notices

Reply
 
Thread Tools Search this Thread
Old 08-18-2013, 12:56 PM   #1
scissors
Addict
scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.scissors ought to be getting tired of karma fortunes by now.
 
Posts: 241
Karma: 1001369
Join Date: Sep 2010
Device: prs300, kindle keyboard 3g
metro uk minor change

Annoying |Metro News appearing on every headline removed

Spoiler:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import strftime
import re
import datetime
import time

class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    title          = u'Metro'
    description = 'News from The Metro, UK'
    
    cover_url = 'https://twimg0-a.akamaihd.net/profile_images/1638332595/METRO_LETTERS-01.jpg'
    remove_empty_feeds = True
    remove_javascript     = True
    no_stylesheets        = True
    auto_cleanup = True
    max_articles_per_feed = 12
    ignore_duplicate_articles = {'title', 'url'}
    #encoding = 'UTF-8'
    
    language = 'en_GB'
    masthead_url        = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
    compress_news_images = True
    compress_news_images_max_size = 30
    remove_attributes = ['style', 'font']
    preprocess_regexps = [

		(re.compile(r'\| Metro News', re.IGNORECASE | re.DOTALL), lambda match: ''),
         		]
                     
    def parse_index(self):
		articles = {}
		key = None
		ans = []
		feeds = [ ('UK', 'http://metro.co.uk/news/uk/'),
			('World', 'http://metro.co.uk/news/world/'),
			('Weird', 'http://metro.co.uk/news/weird/'),
			('Money', 'http://metro.co.uk/news/money/'),
			('Sport', 'http://metro.co.uk/sport/'),
			('Guilty Pleasures', 'http://metro.co.uk/guilty-pleasures/')
			]
		for key, feed in feeds:
			soup = self.index_to_soup(feed)
			articles[key] = []
			ans.append(key)

			today = datetime.date.today()
			today = time.mktime(today.timetuple())-60*60*24

			for a in soup.findAll('a'):
				for name, value in a.attrs:
					if name == "class" and value=="post":
						url = a['href']
						title = a['title']
						print title
						description = ''
						m = re.search('^.*uk/([^/]*)/([^/]*)/([^/]*)/', url)
						skip = 1
						if len(m.groups()) == 3:
							g = m.groups()
							dt = datetime.datetime.strptime(''+g[0]+'-'+g[1]+'-'+g[2], '%Y-%m-%d')
							pubdate = time.strftime('%a, %d %b', dt.timetuple())

							dt = time.mktime(dt.timetuple())
							if dt >= today:
								print pubdate
								skip = 0
						else:
							pubdate = strftime('%a, %d %b')

						summary = a.find(True, attrs={'class':'excerpt'})
						if summary:
							description = self.tag_to_string(summary, use_alt=False)

						if skip == 0:
							articles[key].append(
										dict(title=title, url=url, date=pubdate,
												description=description,
												content=''))
		#ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
		ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
		return ans
scissors is offline   Reply With Quote
Reply


Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
metro uk update 9/9/12 scissors Recipes 0 09-09-2012 03:32 AM
metro uk update 4/8/12 scissors Recipes 0 08-04-2012 02:46 PM
metro uk update 9/6/12 scissors Recipes 0 06-09-2012 03:32 AM
Recipe for Metro UK Bogg Recipes 10 10-07-2011 01:06 PM
Metro News NL drMerry Recipes 1 07-07-2011 07:23 PM


All times are GMT -4. The time now is 06:59 AM.


MobileRead.com is a privately owned, operated and funded community.