MobileRead Forums - View Single Post - Custom recipes (archive, read-only)

mikegps1 · 07-10-2010, 12:18 PM

Since my last post a couple of days ago, I've tried to update the Times Online recipe as the paper now requires a subscription for access to newsfeeds.

My first attempt is below, but I get errors in lines 41 and 43. Can anyone help, please?

BTW, version 0.7.8 is great; calibre gets better all the time.

************************************************** ******
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
'''
timesonline.co.uk
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag

class Timesonline(BasicNewsRecipe):
    '''
    calibre news recipe for The Times Online (timesonline.co.uk).

    Downloads the RSS feeds listed in ``feeds``. When a username and a
    password are available, ``get_browser`` logs in through the form at
    ``LOGIN`` before the browser is handed back to calibre.
    '''

    title = 'The Times Online'
    __author__ = 'Darko Miletic and Sujata Raman'
    description = 'UK news'
    publisher = 'timesonline.co.uk'
    category = 'news, politics, UK'
    oldest_article = 2
    max_articles_per_feed = 100
    no_stylesheets = True
    use_embedded_content = False
    simultaneous_downloads = 1
    encoding = 'ISO-8859-1'
    remove_javascript = True
    language = 'en_GB'
    recursions = 9

    # Makes calibre ask for the username/password that get_browser() uses.
    needs_subscription = True

    # Login page opened by get_browser(). It has to be a quoted string; a
    # bare URL is a syntax error.
    # TODO: the forum software shortened this URL (the '...' in the middle);
    # replace it with the complete login address before using the recipe.
    LOGIN = 'http://www.timesplus.co.uk/tto/news/...lightbox=false'

    # Every entry is a keyword spec for BeautifulSoup's findAll(). The
    # 'subheading' entry is written with an explicit attrs= like its
    # siblings instead of as a bare {'class': ...} dictionary.
    keep_only_tags = [
        dict(name='div', attrs={'id': ['region-column1and2-layout2']}),
        dict(attrs={'class': ['subheading']}),
        dict(name='div', attrs={'id': ['dynamic-image-holder']}),
        dict(name='div', attrs={'class': ['article-author']}),
        dict(name='div', attrs={'id': ['related-article-links']}),
    ]

    remove_tags = [
        dict(name=['embed', 'object', 'form', 'iframe']),
        dict(name='span', attrs={'class': 'float-left padding-left-8 padding-top-2'}),
        dict(name='div', attrs={'id': ['region-footer', 'region-column2-layout2', 'grid-column4', 'login-status', 'comment-sort-order']}),
        dict(name='div', attrs={'class': ['debate-quote-container', 'clear', 'your-comment', 'float-left related-attachements-container', 'float-left padding-bottom-5 padding-top-8', 'puff-top']}),
        dict(name='span', attrs={'id': ['comment-count']}),
        dict(name='ul', attrs={'id': 'read-all-comments'}),
        dict(name='a', attrs={'class': 'reg-bold'}),
    ]

    extra_css = '''
        .small{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
        .byline{font-family :Arial,Helvetica,sans-serif; font-size:x-small; background:#F8F1D8;}
        .color-666{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666; }
        h1{font-family:Georgia,Times New Roman,Times,serif;font-size:large; }
        .color-999 {color:#999999;}
        .x-small {font-size:x-small;}
        #related-article-links{font-family :Arial,Helvetica,sans-serif; font-size:small;}
        h2{color:#333333;font-family :Georgia,Times New Roman,Times,serif; font-size:small;}
        p{font-family :Arial,Helvetica,sans-serif; font-size:small;}
        '''

    # (feed title, feed URL) pairs.
    feeds = [
        (u'Top stories from Times Online', u'http://www.timesonline.co.uk/tol/feeds/rss/topstories.xml'),
        ('Latest Business News', 'http://www.timesonline.co.uk/tol/feeds/rss/business.xml'),
        ('Economics', 'http://www.timesonline.co.uk/tol/feeds/rss/economics.xml'),
        ('World News', 'http://www.timesonline.co.uk/tol/feeds/rss/worldnews.xml'),
        ('UK News', 'http://www.timesonline.co.uk/tol/feeds/rss/uknews.xml'),
        ('Travel News', 'http://www.timesonline.co.uk/tol/feeds/rss/travel.xml'),
        ('Sports News', 'http://www.timesonline.co.uk/tol/feeds/rss/sport.xml'),
        ('Film News', 'http://www.timesonline.co.uk/tol/feeds/rss/film.xml'),
        ('Tech news', 'http://www.timesonline.co.uk/tol/feeds/rss/tech.xml'),
        ('Literary Supplement', 'http://www.timesonline.co.uk/tol/feeds/rss/thetls.xml'),
    ]

    def get_cover_url(self):
        '''
        Return the URL of the cover image, or None when it cannot be found.

        The cover is the ``src`` of the image inside the first
        ``div.float-left.margin-right-15`` on the newspapers index page.
        '''
        cover_url = None
        index = 'http://www.timesonline.co.uk/tol/newspapers/'
        soup = self.index_to_soup(index)
        link_item = soup.find(name='div', attrs={'class': "float-left margin-right-15"})
        # Guard the <img> lookup as well: a matching div without an image
        # would otherwise raise instead of falling back to "no cover".
        if link_item is not None and link_item.img is not None:
            cover_url = link_item.img['src']
        return cover_url

    def get_article_url(self, article):
        '''Return the article's ``guid`` entry, or None when it has none.'''
        return article.get('guid', None)

    def get_browser(self):
        '''
        Return the download browser, logged in when credentials are set.

        With both ``self.username`` and ``self.password`` present, the
        ``LOGIN`` page is opened and its ``loginForm`` form is filled in
        and submitted; otherwise the browser is returned as created.
        '''
        # get_browser is called through the class here, so the instance
        # has to be passed explicitly.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open(self.LOGIN)
            br.select_form(name='loginForm')
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def preprocess_html(self, soup):
        '''
        Tag the document with the recipe language and charset.

        Sets the ``lang``/``xml:lang`` attributes on ``<html>``, inserts
        Content-Language and Content-Type ``<meta>`` elements at the start
        of ``<head>``, and returns the result of ``self.adeify_images``.
        '''
        soup.html['xml:lang'] = self.language
        soup.html['lang'] = self.language
        mlang = Tag(soup, 'meta', [("http-equiv", "Content-Language"), ("content", self.language)])
        mcharset = Tag(soup, 'meta', [("http-equiv", "Content-Type"), ("content", "text/html; charset=ISO-8859-1")])
        soup.head.insert(0, mlang)
        soup.head.insert(1, mcharset)
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first):
        '''Remove the 'Previous Page' / 'Next Page' text nodes and return soup.'''
        for tag in soup.findAll(text=['Previous Page', 'Next Page']):
            tag.extract()
        return soup

07-10-2010, 12:18 PM	#2293
mikegps1 Junior Member Posts: 4 Karma: 10 Join Date: Jul 2010 Device: sony prs600	Times Online - subscription version Since my last post a couple of days ago, I've tried to update the Times Online recipe as the paper now requires a subscription for access to newsfeeds. My first attempt is below get errors in lines 41 and 43 can anyone help please? BTW version 07.8 is great, calibre gets better all the time. ************************************************ **** #!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>' ''' timesonline.co.uk ''' import re from calibre.web.feeds.news import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import Tag class Timesonline(BasicNewsRecipe): title = 'The Times Online' __author__ = 'Darko Miletic and Sujata Raman' description = 'UK news' publisher = 'timesonline.co.uk' category = 'news, politics, UK' oldest_article = 2 max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False simultaneous_downloads = 1 encoding = 'ISO-8859-1' remove_javascript = True language = 'en_GB' recursions = 9 LOGIN = http://www.timesplus.co.uk/tto/news/...lightbox=false keep_only_tags = [ dict(name='div', attrs= {'id':['region-column1and2-layout2']}), {'class' : ['subheading']}, dict(name='div', attrs= {'id':['dynamic-image-holder']}), dict(name='div', attrs= {'class':['article-author']}), dict(name='div', attrs= {'id':['related-article-links']}), ] remove_tags = [ dict(name=['embed','object','form','iframe']), dict(name='span', attrs = {'class':'float-left padding-left-8 padding-top-2'}), dict(name='div', attrs= {'id':['region-footer','region-column2-layout2','grid-column4','login-status','comment-sort-order']}), dict(name='div', attrs= {'class': ['debate-quote-container','clear','your-comment','float-left related-attachements-container','float-left padding-bottom-5 padding-top-8','puff-top']}), dict(name='span', attrs = {'id': ['comment-count']}), dict(name='ul',attrs = 
{'id': 'read-all-comments'}), dict(name='a', attrs = {'class':'reg-bold'}), ] extra_css = ''' .small{font-family :Arial,Helvetica,sans-serif; font-size:x-small;} .byline{font-family :Arial,Helvetica,sans-serif; font-size:x-small; background:#F8F1D8;} .color-666{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#666666; } h1{font-family:Georgia,Times New Roman,Times,serif;font-size:large; } .color-999 {color:#999999;} .x-small {font-size:x-small;} #related-article-links{font-family :Arial,Helvetica,sans-serif; font-size:small;} h2{color:#333333;font-family :Georgia,Times New Roman,Times,serif; font-size:small;} p{font-family :Arial,Helvetica,sans-serif; font-size:small;} ''' feeds = [ (u'Top stories from Times Online', u'http://www.timesonline.co.uk/tol/feeds/rss/topstories.xml' ), ('Latest Business News', 'http://www.timesonline.co.uk/tol/feeds/rss/business.xml'), ('Economics', 'http://www.timesonline.co.uk/tol/feeds/rss/economics.xml'), ('World News', 'http://www.timesonline.co.uk/tol/feeds/rss/worldnews.xml'), ('UK News', 'http://www.timesonline.co.uk/tol/feeds/rss/uknews.xml'), ('Travel News', 'http://www.timesonline.co.uk/tol/feeds/rss/travel.xml'), ('Sports News', 'http://www.timesonline.co.uk/tol/feeds/rss/sport.xml'), ('Film News', 'http://www.timesonline.co.uk/tol/feeds/rss/film.xml'), ('Tech news', 'http://www.timesonline.co.uk/tol/feeds/rss/tech.xml'), ('Literary Supplement', 'http://www.timesonline.co.uk/tol/feeds/rss/thetls.xml'), ] def get_cover_url(self): cover_url = None index = 'http://www.timesonline.co.uk/tol/newspapers/' soup = self.index_to_soup(index) link_item = soup.find(name = 'div',attrs ={'class': "float-left margin-right-15"}) if link_item: cover_url = link_item.img['src'] return cover_url def get_article_url(self, article): return article.get('guid', None) def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: br.open(self.LOGIN) 
br.select_form(name='loginForm') br['username'] = self.username br['password'] = self.password br.submit() return br def preprocess_html(self, soup): soup.html['xml:lang'] = self.language soup.html['lang'] = self.language mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.language)]) mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=ISO-8859-1")]) soup.head.insert(0,mlang) soup.head.insert(1,mcharset) return self.adeify_images(soup) def postprocess_html(self,soup,first): for tag in soup.findAll(text = ['Previous Page','Next Page']): tag.extract() return soup