Junior Member
Posts: 1
Karma: 10
Join Date: Dec 2010
Device: Kindle DX
|
Request: Multipage recipe for Reuters
Hello,
Because the Reuters recipe only displays the first page of any article that spans more than one page, I've been trying to add the Adventure Gamers multipage script to the Reuters recipe, but I have not gotten it to work. When I run this recipe, all the articles end up being deleted from the ebook.
Quote:
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Reuters(BasicNewsRecipe):
    """Calibre news recipe for Reuters feeds with multi-page article stitching.

    The class attributes configure which feeds are fetched and how each
    downloaded article page is cleaned up.  ``preprocess_html`` additionally
    follows "next page" links (via ``append_page``) and splices the text of
    the follow-up pages into the first page.

    NOTE(review): the pager/body selectors used by ``append_page`` and
    ``preprocess_html`` ('next', 'moduleBody', 'toolbar_fat') were carried
    over from a multipage script written for a different site; confirm that
    they match the markup of Reuters article pages.
    """

    title = 'Reuters (Business)'
    description = 'Global news'
    __author__ = 'Kovid Goyal and Sujata Raman'
    use_embedded_content = False
    language = 'en'
    max_articles_per_feed = 10
    no_stylesheets = True
    remove_javascript = True
    filterDuplicates = True

    # Prefix prepended to relative "next page" links in append_page().
    # NOTE(review): assumed to be the root of the article site -- confirm.
    INDEX = 'http://www.reuters.com'

    # The stylesheet lines are kept flush-left on purpose so that the string
    # content is exactly what the recipe shipped with.
    extra_css = '''
body{font-family:arial,helvetica,sans;}
h1{ font-size:larger ; font-weight:bold; }
.byline{color:#006E97;font-size:x-small; font-weight:bold;}
.location{font-size:x-small; font-weight:bold;}
.timestamp{font-size:x-small; }
'''

    keep_only_tags = [dict(name='div', attrs={'class':'column2 gridPanel grid8'})]

    remove_tags = [
        dict(name='div', attrs={'id':['recommendedArticles','relatedNews','relatedVideo' ,'relatedFactboxes']}),
        dict(name='p', attrs={'class':['relatedTopics']}),
        dict(name='a', attrs={'id':['fullSizeLink']}),
        dict(name='div', attrs={'class':['photoNav','relatedTopicButtons','articleComments' ,'gridPanel grid8','footerHalf gridPanel grid1','gridPanel grid2','gridPanel grid3','commentDisclaimer','relatedRail gridPanel grid2','socialHeader']}),
    ]

    # (pattern, replacement) pairs applied to the raw page source; every
    # pattern is compiled case-insensitively and with '.' matching newlines.
    preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            ##(r'<HEAD>.*?</HEAD>' , lambda match : '<HEAD></HEAD>'),
            (r'<div id="apple-rss-sidebar-background">.*?<!-- start Entries -->', lambda match : ''),
            (r'<!-- end apple-rss-content-area -->.*?</body>', lambda match : '</body>'),
            (r'<script.*?>.*?</script>', lambda match : ''),
            (r'<body>.*?<div class="contentBand">', lambda match : '<body>'),
            (r'<h3>Share:</h3>.*?</body>', lambda match : '<!-- END:: Shared Module id=36615 --></body>'),
            (r'<div id="atools" class="articleTools">.*?<div class="linebreak">', lambda match : '<div class="linebreak">'),
        ]
    ]

    feeds = [
        ('Top Stories', 'http://feeds.reuters.com/reuters/topNews?format=xml'),
        ('US News', 'http://feeds.reuters.com/reuters/domesticNews?format=xml'),
        ('World News', 'http://feeds.reuters.com/reuters/worldNews?format=xml'),
        ('Deal News', 'http://feeds.reuters.com/reuters/dealsNews?format=xml'),
        ('Business News', 'http://feeds.reuters.com/reuters/businessNews?format=xml'),
        ('Technology News', 'http://feeds.reuters.com/reuters/technologyNews?format=xml'),
        ('Global Markets News', 'http://feeds.reuters.com/reuters/globalmarketsNews?format=xml'),
        ('Hedge Funds News', 'http://feeds.reuters.com/reuters/hedgefundsNews?format=xml'),
        ('Private Equity News', 'http://feeds.reuters.com/reuters/privateequityNews?format=xml'),
        ('Small Business News', 'http://feeds.reuters.com/reuters/smallBusinessNews?format=xml'),
    ]

    def append_page(self, soup, appendtag, position):
        """Fetch the page linked from ``soup``'s pager and splice it in.

        Looks for a ``<div class="next">`` in ``soup``.  If it holds a link,
        the linked page is downloaded, its ``<div id="moduleBody">`` is
        stripped of inline ``style`` attributes, any further pages are
        appended to that div recursively, and the div is finally inserted
        into ``appendtag`` at index ``position``.

        The method returns ``None`` and leaves ``appendtag`` untouched when
        there is no pager, no usable link, or no body container on the next
        page, so a page that lacks the expected markup is kept as-is
        instead of aborting the article.

        :param soup: parsed page to search for the "next" pager.
        :param appendtag: tag that receives the next page's body.
        :param position: child index within ``appendtag`` for the insert.
        """
        pager = soup.find('div', attrs={'class': 'next'})
        if pager is None or pager.a is None:
            return
        href = pager.a.get('href')
        if not href:
            return

        # Only relative links need the site prefix; prefixing an absolute
        # link would produce an unusable URL.
        if href.startswith(('http://', 'https://')):
            nexturl = href
        else:
            nexturl = self.INDEX + href

        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'id': 'moduleBody'})
        if texttag is None:
            return

        for it in texttag.findAll(style=True):
            del it['style']

        # Pull in the page after this one first, at the end of this page's
        # body, so the pages end up in reading order once inserted.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Clean one article page and append its follow-up pages.

        Removes every inline ``style`` attribute, appends the continuation
        pages to ``soup.body`` through ``append_page``, and removes the
        ``<div class="toolbar_fat">`` element when present.

        The script this was adapted from also inserted a ``mtag`` value into
        ``soup.head``; that name was never defined in this recipe, so the
        call raised ``NameError`` for every article.  The insert is dropped.

        :param soup: parsed article page.
        :returns: the same ``soup`` object, modified in place.
        """
        for item in soup.findAll(style=True):
            del item['style']

        if soup.body is not None:
            self.append_page(soup, soup.body, 3)

        pager = soup.find('div', attrs={'class': 'toolbar_fat'})
        if pager:
            pager.extract()
        return soup
|
|