|
|
#1 |
|
Member
![]() ![]() ![]() ![]() ![]() ![]() Posts: 14
Karma: 560
Join Date: Jan 2011
Device: Kindle
|
International Herald Tribune homepage
This one is quite easy - it downloads news from IHT homepage (= global version of NYTimes) and breaks it down to sections, based on URL, because that's the way I usually read it.
The sorting of sections is based on my reading preferences , so it's: Europe - World - USA - Middle East - Asia - Americas - Africa - Technology - Science - Arts - Movies - Books - Business - Opinion - SportsIt doesn't download from RSS, and it is nowhere near complete IHT print edition - but if you want complete IHT print edition, use some of the NYTimes recipes, since it's almost the same thing anyway, the difference is only which articles are taken as main headlines and which are not. So, here it is. It is mostly derived from the NYTimes recipe, so I am not 100% sure how everything works but it does!Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>, tweaked by Karel Bilek'
'''
nytimes.com
'''
import re, string, time
from calibre import entity_to_unicode, strftime
from datetime import timedelta, date
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
class NYTimes(BasicNewsRecipe):
webEdition = False
oldest_article = 7
includeSections = []
excludeSections = []
one_picture_per_article = False
max_articles_per_feed = 100
filterDuplicates = True
title='Internation Herald Tribune (web)'
description = 'IHT'
needs_subscription = True
month_list = ['january','february','march','april','may','june','july','august','september','october','november','december']
def decode_us_date(self,datestr):
udate = datestr.strip().lower().split()
try:
m = self.month_list.index(udate[0])+1
except:
return date.today()
d = int(udate[1])
y = int(udate[2])
try:
d = date(y,m,d)
except:
d = date.today
return d
earliest_date = date.today() - timedelta(days=oldest_article)
__author__ = 'GRiker/Kovid Goyal/Nick Redding'
language = 'en'
requires_version = (0, 7, 5)
timefmt = ''
masthead_url = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'
cover_margins = (18,18,'grey99')
#cover_url = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'
remove_tags_before = dict(id='article')
remove_tags_after = dict(id='article')
remove_tags = [dict(attrs={'class':[
'articleFooter',
'articleTools',
'columnGroup doubleRule',
'columnGroup singleRule',
'columnGroup last',
'columnGroup last',
'doubleRule',
'dottedLine',
'entry-meta',
'entry-response module',
'icon enlargeThis',
'leftNavTabs',
'metaFootnote',
'module box nav',
'nextArticleLink',
'nextArticleLink clearfix',
'post-tools',
'relatedSearchesModule',
'side_tool',
'singleAd',
re.compile('^subNavigation'),
re.compile('^leaderboard'),
re.compile('^module'),
]}),
dict(id=[
'adxLeaderboard',
'adxSponLink',
'archive',
'articleExtras',
'articleInline',
'blog_sidebar',
'businessSearchBar',
'cCol',
'entertainmentSearchBar',
'footer',
'header',
'header_search',
'inlineBox',
'login',
'masthead',
'masthead-nav',
'memberTools',
'navigation',
'portfolioInline',
'readerReviews',
'readerReviewsCount',
'relatedArticles',
'relatedTopics',
'respond',
'side_search',
'side_index',
'side_tool',
'toolsRight',
]),
dict(name=['script', 'noscript', 'style','form','hr'])]
no_stylesheets = True
extra_css = '''
.articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
.credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
.dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.timestamp { text-align: left; font-size: small; }
.caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
a:link {text-decoration: none; }
.articleBody { }
.authorId {text-align: left; }
.image {text-align: center;}
.source {text-align: left; }'''
articles = {}
key = None
ans = []
url_list = []
def filter_ans(self, ans) :
total_article_count = 0
idx = 0
idx_max = len(ans)-1
while idx <= idx_max:
if self.includeSections != []:
if ans[idx][0] not in self.includeSections:
print "SECTION NOT INCLUDED: ",ans[idx][0]
del ans[idx]
idx_max = idx_max-1
continue
if ans[idx][0] in self.excludeSections:
print "SECTION EXCLUDED: ",ans[idx][0]
del ans[idx]
idx_max = idx_max-1
continue
if self.verbose:
self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
for article in ans[idx][1]:
total_article_count += 1
if self.verbose:
self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
article['url'].encode('cp1252','replace')))
idx = idx+1
self.log( "Queued %d articles" % total_article_count )
return ans
def exclude_url(self,url):
if not url.startswith("http"):
return True
if not url.endswith(".html"):
return True
if 'nytimes.com' not in url:
return True
if 'podcast' in url:
return True
if '/video/' in url:
return True
if '/slideshow/' in url:
return True
if '/magazine/index' in url:
return True
if '/interactive/' in url:
return True
if '/reference/' in url:
return True
if '/premium/' in url:
return True
return False
def fixChars(self,string):
# Replace lsquo (\x91)
fixed = re.sub("\x91","‘",string)
# Replace rsquo (\x92)
fixed = re.sub("\x92","’",fixed)
# Replace ldquo (\x93)
fixed = re.sub("\x93","“",fixed)
# Replace rdquo (\x94)
fixed = re.sub("\x94","”",fixed)
# Replace ndash (\x96)
fixed = re.sub("\x96","–",fixed)
# Replace mdash (\x97)
fixed = re.sub("\x97","—",fixed)
return fixed
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://www.nytimes.com/auth/login')
br.form = br.forms().next()
br['userid'] = self.username
br['password'] = self.password
raw = br.submit().read()
if 'Please try again' in raw:
raise Exception('Your username and password are incorrect')
return br
def skip_ad_pages(self, soup):
# Skip ad pages served before actual article
skip_tag = soup.find(True, {'name':'skip'})
if skip_tag is not None:
self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
url += '?pagewanted=all'
self.log.warn("Skipping ad to article at '%s'" % url)
return self.index_to_soup(url, raw=True)
def get_cover_url(self):
cover = None
st = time.localtime()
year = str(st.tm_year)
month = "%.2d" % st.tm_mon
day = "%.2d" % st.tm_mday
cover = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'
br = BasicNewsRecipe.get_browser()
try:
br.open(cover)
except:
self.log("\nCover unavailable")
cover = None
return cover
def short_title(self):
return self.title
def index_to_soup(self, url_or_raw, raw=False):
'''
OVERRIDE of class method
deals with various page encodings between index and articles
'''
def get_the_soup(docEncoding, url_or_raw, raw=False) :
if re.match(r'\w+://', url_or_raw):
f = self.browser.open(url_or_raw)
_raw = f.read()
f.close()
if not _raw:
raise RuntimeError('Could not fetch index from %s'%url_or_raw)
else:
_raw = url_or_raw
if raw:
return _raw
if not isinstance(_raw, unicode) and self.encoding:
_raw = _raw.decode(docEncoding, 'replace')
massage = list(BeautifulSoup.MARKUP_MASSAGE)
massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
return BeautifulSoup(_raw, markupMassage=massage)
# Entry point
soup = get_the_soup( self.encoding, url_or_raw )
contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
if docEncoding == '' :
docEncoding = self.encoding
if self.verbose > 2:
self.log( " document encoding: '%s'" % docEncoding)
if docEncoding != self.encoding :
soup = get_the_soup(docEncoding, url_or_raw)
return soup
def massageNCXText(self, description):
# Kindle TOC descriptions won't render certain characters
if description:
massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
# Replace '&' with '&'
massaged = re.sub("&","&", massaged)
return self.fixChars(massaged)
else:
return description
def feed_title(self,div):
return ''.join(div.findAll(text=True, recursive=True)).strip()
def handle_article(self,div):
def find_section(url):
match = re.search(r'[0-9]+/[0-9]+/[0-9]+/(.+)/[^/]+$', url)
if not match:
return 'Uncategorized'
stred = str(match.group(1))
match = re.search(r'(.*)/(.*)', stred)
if not match:
#jednoslovne
stred = re.sub(r'/', '', stred)
if (stred == "us"):
return "USA"
return stred.capitalize()
else:
prvni = str(match.group(1))
if (prvni=="world"):
druhy = str(match.group(2))
if (druhy=="middleeast"):
return "Middle East"
return druhy.capitalize()
else:
return prvni.capitalize()
thumbnail = div.find('div','thumbnail')
if thumbnail:
thumbnail.extract()
a_s = div.findAll('a', href=True)
a = False
for aa in a_s:
if not a and aa.string:
a=aa
if not a:
return
match = re.search(r'community.nytimes.com/comments', a['href'])
if match:
return
url = re.sub(r'\?.*', '', a['href'])
if self.exclude_url(url):
return
url += '?pagewanted=all'
if self.filterDuplicates:
if url in self.url_list:
return
self.url_list.append(url)
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')
summary = div.find(True, attrs={'class':'summary'})
if summary:
description = self.tag_to_string(summary, use_alt=False)
author = ''
authorAttribution = div.find(True, attrs={'class':'byline'})
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
else:
authorAttribution = div.find(True, attrs={'class':'byline'})
if authorAttribution:
author = self.tag_to_string(authorAttribution, use_alt=False)
feed = find_section(url)
#feed = self.key if self.key is not None else 'Uncategorized'
if not self.articles.has_key(feed):
self.ans.append(feed)
self.articles[feed] = []
self.articles[feed].append(
dict(title=title, url=url, date=pubdate,
description=description, author=author,
content=''))
def parse_global_edition(self):
soup = self.index_to_soup('http://global.nytimes.com/')
for div in soup.findAll(True,
attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush', 'story flushBottom', 'columnGroup flushBottom']}):
if div['class'] in ['story', 'story headline'] :
self.handle_article(div)
elif div['class'] == 'headlinesOnly multiline flush':
for lidiv in div.findAll('li'):
self.handle_article(lidiv)
elif div['class'] == 'columnGroup flushBottom':
for lidh in div.findAll('h5'):
self.handle_article(lidh)
self.ans = self.sort_index_by(self.ans, {'Europe':-8, 'World':-7, 'USA':-6, 'Middle East':-5, 'Asia':-4, 'Americas':-3, 'Africa':-2, 'Technology':8, 'Science':9, 'Arts':10, 'Movies':10, 'Books':11, 'Business':12, 'Opinion':13, 'Sports':999})
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
return self.filter_ans(self.ans)
def parse_index(self):
return self.parse_global_edition()
def strip_anchors(self,soup):
paras = soup.findAll(True)
for para in paras:
aTags = para.findAll('a')
for a in aTags:
if a.img is None:
a.replaceWith(a.renderContents().decode('cp1252','replace'))
return soup
def preprocess_html(self, soup):
if self.webEdition & (self.oldest_article>0):
date_tag = soup.find(True,attrs={'class': ['dateline','date']})
if date_tag:
date_str = self.tag_to_string(date_tag,use_alt=False)
date_str = date_str.replace('Published:','')
date_items = date_str.split(',')
try:
datestring = date_items[0]+' '+date_items[1]
article_date = self.decode_us_date(datestring)
except:
article_date = date.today()
if article_date < self.earliest_date:
self.log("Skipping article dated %s" % date_str)
return None
kicker_tag = soup.find(attrs={'class':'kicker'})
if kicker_tag: # remove Op_Ed author head shots
tagline = self.tag_to_string(kicker_tag)
if tagline=='Op-Ed Columnist':
img_div = soup.find('div','inlineImage module')
if img_div:
img_div.extract()
return self.strip_anchors(soup)
def postprocess_html(self,soup, True):
if self.one_picture_per_article:
# Remove all images after first
largeImg = soup.find(True, {'class':'articleSpanImage'})
inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
if largeImg:
for inlineImg in inlineImgs:
inlineImg.extract()
else:
if inlineImgs:
firstImg = inlineImgs[0]
for inlineImg in inlineImgs[1:]:
inlineImg.extract()
# Move firstImg before article body
cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
if cgFirst:
# Strip all sibling NavigableStrings: noise
navstrings = cgFirst.findAll(text=True, recursive=False)
[ns.extract() for ns in navstrings]
headline_found = False
tag = cgFirst.find(True)
insertLoc = 0
while True:
insertLoc += 1
if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
headline_found = True
break
tag = tag.nextSibling
if not tag:
headline_found = False
break
if headline_found:
cgFirst.insert(insertLoc,firstImg)
else:
self.log(">>> No class:'columnGroup first' found <<<")
# Change captions to italic
for caption in soup.findAll(True, {'class':'caption'}) :
if caption and caption.contents[0]:
cTag = Tag(soup, "p", [("class", "caption")])
c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
mp_off = c.find("More Photos")
if mp_off >= 0:
c = c[:mp_off]
cTag.insert(0, c)
caption.replaceWith(cTag)
# Change <nyt_headline> to <h2>
h1 = soup.find('h1')
if h1:
headline = h1.find("nyt_headline")
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
h1.replaceWith(tag)
else:
# Blog entry - replace headline, remove <hr> tags
headline = soup.find('title')
if headline:
tag = Tag(soup, "h2")
tag['class'] = "headline"
tag.insert(0, self.fixChars(headline.contents[0]))
soup.insert(0, tag)
hrs = soup.findAll('hr')
for hr in hrs:
hr.extract()
# Change <h1> to <h3> - used in editorial blogs
masthead = soup.find("h1")
if masthead:
# Nuke the href
if masthead.a:
del(masthead.a['href'])
tag = Tag(soup, "h3")
tag.insert(0, self.fixChars(masthead.contents[0]))
masthead.replaceWith(tag)
# Change <span class="bold"> to <b>
for subhead in soup.findAll(True, {'class':'bold'}) :
if subhead.contents:
bTag = Tag(soup, "b")
bTag.insert(0, subhead.contents[0])
subhead.replaceWith(bTag)
divTag = soup.find('div',attrs={'id':'articleBody'})
if divTag:
divTag['class'] = divTag['id']
# Add class="authorId" to <div> so we can format with CSS
divTag = soup.find('div',attrs={'id':'authorId'})
if divTag and divTag.contents[0]:
tag = Tag(soup, "p")
tag['class'] = "authorId"
tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
use_alt=False)))
divTag.replaceWith(tag)
return soup
def populate_article_metadata(self, article, soup, first):
shortparagraph = ""
try:
if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
if articlebodies:
for articlebody in articlebodies:
if articlebody:
paras = articlebody.findAll('p')
for p in paras:
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
if len(refparagraph) > 70: #approximately one line of text
article.summary = article.text_summary = shortparagraph + refparagraph
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
except:
self.log("Error creating article descriptions")
return
Last edited by running; 01-17-2011 at 02:45 PM. |
|
|
|
|
|
#2 |
|
creator of calibre
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 45,610
Karma: 28549044
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
|
You might want to update the postprocess_html method in the recipe, see https://www.mobileread.com/forums/sho...d.php?t=117043
|
|
|
|
| Advert | |
|
|
|
|
#3 |
|
Junior Member
![]() Posts: 8
Karma: 10
Join Date: Mar 2011
Device: Kindle3, HTC HD2
|
Can't get it to show Sports or World. IHT Recipe Request
Even after modifying the post_process as suggested by Kovid, I can't get this recipe to show the World and Sports sections. I also modified the sort order - no effect.
So, would like to request a clean International Heral Tribune recipe from http://global.nytimes.com/iht? Thanks |
|
|
|
|
|
#4 |
|
Junior Member
![]() Posts: 8
Karma: 10
Join Date: Sep 2008
Device: Sony PRS-950
|
failed install of IHT recipe
I have tried to copy&paste or file import he IHT recipe but always get an error when running it:
anyone any ideas ? Code:
calibre, version 0.7.48
ERROR: Conversion Error: <b>Failed</b>: Fetch news from Inter Harold Tribune
Fetch news from Inter Harold Tribune
Resolved conversion options
calibre version: 0.7.48
{'asciiize': False,
'author_sort': None,
'authors': None,
'base_font_size': 0,
'book_producer': None,
'change_justification': 'original',
'chapter': None,
'chapter_mark': 'pagebreak',
'comments': None,
'cover': None,
'debug_pipeline': None,
'dehyphenate': True,
'delete_blank_paragraphs': True,
'disable_font_rescaling': False,
'dont_download_recipe': False,
'dont_split_on_page_breaks': True,
'enable_heuristics': False,
'epub_flatten': False,
'extra_css': None,
'extract_to': None,
'fix_indents': True,
'flow_size': 260,
'font_size_mapping': None,
'format_scene_breaks': True,
'html_unwrap_factor': 0.4,
'input_encoding': None,
'input_profile': <calibre.customize.profiles.InputProfile object at 0x04FF7BB0>,
'insert_blank_line': False,
'insert_metadata': False,
'isbn': None,
'italicize_common_cases': True,
'keep_ligatures': False,
'language': None,
'level1_toc': None,
'level2_toc': None,
'level3_toc': None,
'line_height': 0,
'linearize_tables': False,
'lrf': False,
'margin_bottom': 5.0,
'margin_left': 5.0,
'margin_right': 5.0,
'margin_top': 5.0,
'markup_chapter_headings': True,
'max_toc_links': 50,
'minimum_line_height': 120.0,
'no_chapters_in_toc': False,
'no_default_epub_cover': False,
'no_inline_navbars': False,
'no_svg_cover': False,
'output_profile': <calibre.customize.profiles.SonyReader900Output object at 0x05056050>,
'page_breaks_before': None,
'password': None,
'prefer_metadata_cover': False,
'preserve_cover_aspect_ratio': False,
'pretty_print': True,
'pubdate': None,
'publisher': None,
'rating': None,
'read_metadata_from_opf': None,
'remove_first_image': False,
'remove_paragraph_spacing': False,
'remove_paragraph_spacing_indent_size': 1.5,
'renumber_headings': True,
'replace_scene_breaks': '',
'series': None,
'series_index': None,
'smarten_punctuation': False,
'sr1_replace': '',
'sr1_search': '',
'sr2_replace': '',
'sr2_search': '',
'sr3_replace': '',
'sr3_search': '',
'tags': None,
'test': False,
'timestamp': None,
'title': None,
'title_sort': None,
'toc_filter': None,
'toc_threshold': 6,
'unwrap_lines': True,
'use_auto_toc': False,
'username': None,
'verbose': 2}
InputFormatPlugin: Recipe Input running
Python function terminated unexpectedly
(Error Code: 1)
Traceback (most recent call last):
File "site.py", line 103, in main
File "site.py", line 85, in run_entry_point
File "site-packages\calibre\utils\ipc\worker.py", line 110, in main
File "site-packages\calibre\gui2\convert\gui_conversion.py", line 25, in gui_convert
File "site-packages\calibre\ebooks\conversion\plumber.py", line 904, in run
File "site-packages\calibre\customize\conversion.py", line 204, in __call__
File "site-packages\calibre\web\feeds\input.py", line 105, in convert
File "site-packages\calibre\web\feeds\news.py", line 734, in download
File "site-packages\calibre\web\feeds\news.py", line 876, in build_index
File "site-packages\calibre\web\feeds\news.py", line 1303, in parse_feeds
File "site-packages\calibre\web\feeds\news.py", line 351, in get_feeds
NotImplementedError
|
|
|
|
|
|
#5 |
|
Junior Member
![]() Posts: 6
Karma: 10
Join Date: Jun 2011
Device: kindle3
|
I too tried to copy and paste the IHT recipe but no luck: here's the error message
calibre, version 0.8.7 ERROR: Invalid entry: <p>Cannot make recipe. Error:<br>invalid syntax (recipe17.py, line 573) Any solutions since this tread was opened? |
|
|
|
| Advert | |
|
|
|
|
#6 |
|
Wizard
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 4,004
Karma: 177841
Join Date: Dec 2009
Device: WinMo: IPAQ; Android: HTC HD2, Archos 7o; Java:Gravity T
|
I took a quick look. I got utf8 decode error messages related to the smart single quote 0x91. I don't have a user/pass for IHT, so couldn't go any further.
|
|
|
|
|
|
#7 |
|
creator of calibre
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 45,610
Karma: 28549044
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
|
I'm confused, there is no builtin IHT recipe. There used to be one, but its broken so it was removed. Here's the old recipe:
Code:
__license__ = 'GPL v3'
__copyright__ = '2008, Derry FitzGerald'
'''
iht.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
class InternationalHeraldTribune(BasicNewsRecipe):
title = u'The International Herald Tribune'
__author__ = 'Derry FitzGerald'
language = 'en'
oldest_article = 1
max_articles_per_feed = 30
no_stylesheets = True
remove_tags = [dict(name='div', attrs={'class':['footer','header']}),
dict(name=['form'])]
preprocess_regexps = [
(re.compile(r'<!-- webtrends.*', re.DOTALL),
lambda m:'</body></html>')
]
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
remove_empty_feeds = True
feeds = [
(u'Frontpage', u'http://www.iht.com/rss/frontpage.xml'),
(u'Business', u'http://www.iht.com/rss/business.xml'),
(u'Americas', u'http://www.iht.com/rss/america.xml'),
(u'Europe', u'http://www.iht.com/rss/europe.xml'),
(u'Asia', u'http://www.iht.com/rss/asia.xml'),
(u'Africa and Middle East', u'http://www.iht.com/rss/africa.xml'),
(u'Opinion', u'http://www.iht.com/rss/opinion.xml'),
(u'Technology', u'http://www.iht.com/rss/technology.xml'),
(u'Health and Science', u'http://www.iht.com/rss/healthscience.xml'),
(u'Sports', u'http://www.iht.com/rss/sports.xml'),
(u'Culture', u'http://www.iht.com/rss/arts.xml'),
(u'Style and Design', u'http://www.iht.com/rss/style.xml'),
(u'Travel', u'http://www.iht.com/rss/travel.xml'),
(u'At Home Abroad', u'http://www.iht.com/rss/athome.xml'),
(u'Your Money', u'http://www.iht.com/rss/yourmoney.xml'),
(u'Properties', u'http://www.iht.com/rss/properties.xml')
]
temp_files = []
articles_are_obfuscated = True
masthead_url = 'http://graphics8.nytimes.com/images/misc/iht-masthead-logo.gif'
def get_obfuscated_article(self, url):
br = self.get_browser()
br.open(url)
response1 = br.follow_link(url_regex=re.compile(r'.*pagewanted=print.*'))
html = response1.read()
self.temp_files.append(PersistentTemporaryFile('_iht.html'))
self.temp_files[-1].write(html)
self.temp_files[-1].close()
return self.temp_files[-1].name
|
|
|
|
|
|
#8 |
|
Wizard
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 4,004
Karma: 177841
Join Date: Dec 2009
Device: WinMo: IPAQ; Android: HTC HD2, Archos 7o; Java:Gravity T
|
|
|
|
|
|
|
#9 |
|
creator of calibre
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 45,610
Karma: 28549044
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
|
Ah, apologies.
|
|
|
|
|
|
#10 |
|
Junior Member
![]() Posts: 6
Karma: 10
Join Date: Jun 2011
Device: kindle3
|
I tried copy and past with the code Kovid supplied but again got this message
calibre, version 0.8.11 ERROR: Ongeldige invoer: <p>Kan recept niet aanmaken. Fout:<br>invalid syntax (recipe4.py, line 11) I'm just a user, not a specialist, so I don't know what this means and how I could solve this. I do have a pass and access code for IHT online. |
|
|
|
|
|
#11 | |
|
Wizard
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 4,004
Karma: 177841
Join Date: Dec 2009
Device: WinMo: IPAQ; Android: HTC HD2, Archos 7o; Java:Gravity T
|
Quote:
|
|
|
|
|
![]() |
|
Similar Threads
|
||||
| Thread | Thread Starter | Forum | Replies | Last Post |
| Save homepage with link depth 1 | defhir | Workshop | 1 | 09-28-2010 05:49 AM |
| International Herald Tribune: European Edition | Raoul O'Malley | Calibre | 1 | 05-02-2010 01:20 AM |
| Kindle DX no longer on Amazon Homepage | Daithi | Amazon Kindle | 9 | 05-30-2009 04:43 PM |
| Surprised they still have Kindle on Amazon homepage | markbot | Amazon Kindle | 11 | 07-04-2008 01:02 AM |
| Herald Tribune on how e-books spur sales | Alexander Turcic | News | 0 | 08-05-2005 06:09 PM |