View Single Post
Old 07-10-2010, 11:18 AM   #2293
mikegps1
Junior Member
mikegps1 began at the beginning.
 
Posts: 4
Karma: 10
Join Date: Jul 2010
Device: sony prs600
Times Online - subscription version

Since my last post a couple of days ago, I've tried to update the Times Online recipe as the paper now requires a subscription for access to newsfeeds.

My first attempt is below get errors in lines 41 and 43 can anyone help please?

BTW version 07.8 is great, calibre gets better all the time.


************************************************** ******
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
timesonline.co.uk
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class Timesonline(BasicNewsRecipe):
title = 'The Times Online'
__author__ = 'Darko Miletic and Sujata Raman'
description = 'UK news'
publisher = 'timesonline.co.uk'
category = 'news, politics, UK'
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
simultaneous_downloads = 1
encoding = 'ISO-8859-1'
remove_javascript = True
language = 'en_GB'
recursions = 9
LOGIN = http://www.timesplus.co.uk/tto/news/...lightbox=false
keep_only_tags = [
dict(name='div', attrs= {'id':['region-column1and2-layout2']}),
{'class' : ['subheading']},
dict(name='div', attrs= {'id':['dynamic-image-holder']}),
dict(name='div', attrs= {'class':['article-author']}),
dict(name='div', attrs= {'id':['related-article-links']}),
]

remove_tags = [
dict(name=['embed','object','form','iframe']),
dict(name='span', attrs = {'class':'float-left padding-left-8 padding-top-2'}),
dict(name='div', attrs= {'id':['region-footer','region-column2-layout2','grid-column4','login-status','comment-sort-order']}),
dict(name='div', attrs= {'class': ['debate-quote-container','clear','your-comment','float-left related-attachements-container','float-left padding-bottom-5 padding-top-8','puff-top']}),
dict(name='span', attrs = {'id': ['comment-count']}),
dict(name='ul',attrs = {'id': 'read-all-comments'}),
dict(name='a', attrs = {'class':'reg-bold'}),
]

extra_css = '''
.small{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
.byline{font-family :Arial,Helvetica,sans-serif; font-size:x-small; background:#F8F1D8;}
.color-666{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666; }
h1{font-family:Georgia,Times New Roman,Times,serif;font-size:large; }
.color-999 {color:#999999;}
.x-small {font-size:x-small;}
#related-article-links{font-family :Arial,Helvetica,sans-serif; font-size:small;}
h2{color:#333333;font-family :Georgia,Times New Roman,Times,serif; font-size:small;}
p{font-family :Arial,Helvetica,sans-serif; font-size:small;}
'''
feeds = [
(u'Top stories from Times Online', u'http://www.timesonline.co.uk/tol/feeds/rss/topstories.xml' ),
('Latest Business News', 'http://www.timesonline.co.uk/tol/feeds/rss/business.xml'),
('Economics', 'http://www.timesonline.co.uk/tol/feeds/rss/economics.xml'),
('World News', 'http://www.timesonline.co.uk/tol/feeds/rss/worldnews.xml'),
('UK News', 'http://www.timesonline.co.uk/tol/feeds/rss/uknews.xml'),
('Travel News', 'http://www.timesonline.co.uk/tol/feeds/rss/travel.xml'),
('Sports News', 'http://www.timesonline.co.uk/tol/feeds/rss/sport.xml'),
('Film News', 'http://www.timesonline.co.uk/tol/feeds/rss/film.xml'),
('Tech news', 'http://www.timesonline.co.uk/tol/feeds/rss/tech.xml'),
('Literary Supplement', 'http://www.timesonline.co.uk/tol/feeds/rss/thetls.xml'),
]

def get_cover_url(self):
cover_url = None
index = 'http://www.timesonline.co.uk/tol/newspapers/'
soup = self.index_to_soup(index)
link_item = soup.find(name = 'div',attrs ={'class': "float-left margin-right-15"})
if link_item:
cover_url = link_item.img['src']
return cover_url

def get_article_url(self, article):
return article.get('guid', None)

def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open(self.LOGIN)
br.select_form(name='loginForm')
br['username'] = self.username
br['password'] = self.password
br.submit()
return br

def preprocess_html(self, soup):
soup.html['xml:lang'] = self.language
soup.html['lang'] = self.language
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.language)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=ISO-8859-1")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)

def postprocess_html(self,soup,first):
for tag in soup.findAll(text = ['Previous Page','Next Page']):
tag.extract()
return soup
mikegps1 is offline