Since my last post a couple of days ago, I've tried to update the Times Online recipe as the paper now requires a subscription for access to newsfeeds.
My first attempt is below; I get errors in lines 41 and 43. Can anyone help, please?
BTW, version 0.7.8 is great; calibre gets better all the time.
************************************************** ******
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
timesonline.co.uk
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Timesonline(BasicNewsRecipe):
    """Calibre news recipe for The Times Online (timesonline.co.uk).

    Downloads the RSS feeds listed in ``feeds``, keeps only the article
    containers named in ``keep_only_tags``, strips the elements named in
    ``remove_tags`` and, when a username and password are available, logs
    in through the ``LOGIN`` page before fetching anything.
    """

    # --- Recipe metadata -------------------------------------------------
    title = 'The Times Online'
    __author__ = 'Darko Miletic and Sujata Raman'
    description = 'UK news'
    publisher = 'timesonline.co.uk'
    category = 'news, politics, UK'
    language = 'en_GB'

    # --- Download behaviour ----------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    simultaneous_downloads = 1
    encoding = 'ISO-8859-1'
    remove_javascript = True
    recursions = 9

    # NOTE(review): calibre only asks the user for credentials when the
    # recipe sets ``needs_subscription``; it is not set here, so the login
    # branch in get_browser() runs only if credentials are supplied some
    # other way. Confirm whether ``needs_subscription = True`` is wanted.

    # Login page opened by get_browser(). The original assignment had an
    # empty right-hand side with a bare URL on the following line, which is
    # a syntax error; the URL has to be a string literal.
    # NOTE(review): the URL below was shortened ("...") when the recipe was
    # pasted and will not work as written - replace it with the full
    # address of the login page.
    LOGIN = 'http://www.timesplus.co.uk/tto/news/...lightbox=false'

    # Only these elements of an article page are kept.
    keep_only_tags = [
        dict(name='div', attrs={'id': ['region-column1and2-layout2']}),
        {'class': ['subheading']},
        dict(name='div', attrs={'id': ['dynamic-image-holder']}),
        dict(name='div', attrs={'class': ['article-author']}),
        dict(name='div', attrs={'id': ['related-article-links']}),
    ]

    # These elements are removed from whatever was kept above.
    remove_tags = [
        dict(name=['embed', 'object', 'form', 'iframe']),
        dict(name='span', attrs={'class': 'float-left padding-left-8 padding-top-2'}),
        dict(name='div', attrs={'id': [
            'region-footer',
            'region-column2-layout2',
            'grid-column4',
            'login-status',
            'comment-sort-order',
        ]}),
        dict(name='div', attrs={'class': [
            'debate-quote-container',
            'clear',
            'your-comment',
            'float-left related-attachements-container',
            'float-left padding-bottom-5 padding-top-8',
            'puff-top',
        ]}),
        dict(name='span', attrs={'id': ['comment-count']}),
        dict(name='ul', attrs={'id': 'read-all-comments'}),
        dict(name='a', attrs={'class': 'reg-bold'}),
    ]

    # Stylesheet applied to the generated e-book. The literal is kept
    # exactly as pasted (unindented) so its content is unchanged.
    extra_css = '''
.small{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
.byline{font-family :Arial,Helvetica,sans-serif; font-size:x-small; background:#F8F1D8;}
.color-666{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666; }
h1{font-family:Georgia,Times New Roman,Times,serif;font-size:large; }
.color-999 {color:#999999;}
.x-small {font-size:x-small;}
#related-article-links{font-family :Arial,Helvetica,sans-serif; font-size:small;}
h2{color:#333333;font-family :Georgia,Times New Roman,Times,serif; font-size:small;}
p{font-family :Arial,Helvetica,sans-serif; font-size:small;}
'''

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Top stories from Times Online', u'http://www.timesonline.co.uk/tol/feeds/rss/topstories.xml'),
        ('Latest Business News', 'http://www.timesonline.co.uk/tol/feeds/rss/business.xml'),
        ('Economics', 'http://www.timesonline.co.uk/tol/feeds/rss/economics.xml'),
        ('World News', 'http://www.timesonline.co.uk/tol/feeds/rss/worldnews.xml'),
        ('UK News', 'http://www.timesonline.co.uk/tol/feeds/rss/uknews.xml'),
        ('Travel News', 'http://www.timesonline.co.uk/tol/feeds/rss/travel.xml'),
        ('Sports News', 'http://www.timesonline.co.uk/tol/feeds/rss/sport.xml'),
        ('Film News', 'http://www.timesonline.co.uk/tol/feeds/rss/film.xml'),
        ('Tech news', 'http://www.timesonline.co.uk/tol/feeds/rss/tech.xml'),
        ('Literary Supplement', 'http://www.timesonline.co.uk/tol/feeds/rss/thetls.xml'),
    ]

    def get_cover_url(self):
        """Return the cover image URL from the newspapers index page.

        Looks for the first ``div`` with class ``float-left margin-right-15``
        and returns the ``src`` of the image inside it. Returns ``None``
        when the div, the image or its ``src`` attribute is missing.
        """
        index = 'http://www.timesonline.co.uk/tol/newspapers/'
        soup = self.index_to_soup(index)
        link_item = soup.find(name='div', attrs={'class': "float-left margin-right-15"})
        # Guard every step: a layout change on the index page should cost
        # us the cover, not abort the whole download.
        if link_item is None or link_item.img is None:
            return None
        return link_item.img.get('src')

    def get_article_url(self, article):
        """Return the article's ``guid`` as its URL, or ``None`` if absent."""
        return article.get('guid', None)

    def get_browser(self):
        """Return a browser, logged in when credentials are available.

        When both ``self.username`` and ``self.password`` are set, opens
        ``self.LOGIN``, fills in the form named ``loginForm`` and submits
        it. Otherwise the plain browser is returned.
        """
        # The base implementation must be given the recipe instance.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            # NOTE(review): form and field names are taken as-is from the
            # original recipe - verify them against the live login page.
            br.select_form(name='loginForm')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def preprocess_html(self, soup):
        """Tag the page with language/charset metadata and fix up images.

        Sets ``lang`` and ``xml:lang`` on ``<html>``, inserts
        Content-Language and Content-Type ``<meta>`` tags at the start of
        ``<head>`` and returns the result of ``self.adeify_images(soup)``.
        """
        soup.html['xml:lang'] = self.language
        soup.html['lang'] = self.language
        mlang = Tag(soup, 'meta', [("http-equiv", "Content-Language"), ("content", self.language)])
        mcharset = Tag(soup, 'meta', [("http-equiv", "Content-Type"), ("content", "text/html; charset=ISO-8859-1")])
        soup.head.insert(0, mlang)
        soup.head.insert(1, mcharset)
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first):
        """Remove the 'Previous Page' / 'Next Page' text nodes and return soup."""
        for tag in soup.findAll(text=['Previous Page', 'Next Page']):
            tag.extract()
        return soup