onlinenewsreader.net
Slate formatting
Quote:
Originally Posted by sdow1
Not sure if this is the right thread, but since it's where most of the content for slate seems to be, thought I'd try here first.
For the past few days, every time I download the calibre feed for slate, I just get the cover and the Table of Contents (such as it is, since it just says "all articles"). No content whatsoever. I thought I'd wait through the weekend in case it was a matter of slate itself being relatively "dead" on the weekends, but today it was the same thing. I've also tried downloading at different times of the day, in case that was a problem, but I get the same thing. Cover, but no content.
Help!!
Slate has made a minor formatting change that causes postprocess_html to trap on every article--hence no content. The trapping code was actually redundant, so I removed it to solve the problem. Here is my version of the recipe--it has some modest improvements over the original. Note that you can make the recipe fetch all content (slate_complete=True) or just the previous 7 days' worth (slate_complete=False).
Code:
#!/usr/bin/env python
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
__license__ = 'GPL v3'
'''
calibre recipe for slate.com
'''
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag
class Slate(BasicNewsRecipe):
    # Method variables for customizing downloads
    description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
    __author__ = 'GRiker and Sujata Raman / enhanced by Nick Redding'
    max_articles_per_feed = 100
    oldest_article = 14
    recursions = 0
    delay = 0
    simultaneous_downloads = 5
    timeout = 120.0
    timefmt = ''
    feeds = None
    no_stylesheets = True
    encoding = None
    language = 'en'
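
    # slate_complete = True fetches all of Slate's named sections;
    # slate_complete = False fetches only the previous week's dated sections.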
    slate_complete = True
    if slate_complete:
        title = 'Slate (complete)'
    else:
        title = 'Slate (weekly)'

    # Method variables for customizing feed parsing
    summary_length = 250
    use_embedded_content = None

    # Method variables for pre/post processing of HTML
    preprocess_regexps = [ (re.compile(r'<p><em>Disclosure: <strong>Slate</strong> is owned by the Washington Post.*</p>',
                                       re.DOTALL|re.IGNORECASE),
                            lambda match: ''),
                           (re.compile(r'<p><strong><em>Join the discussion about this story on.*</p>',
                                       re.DOTALL|re.IGNORECASE),
                            lambda match: '') ]
    match_regexps = []

    # The second entry is for 'Big Money', which comes from a different site, uses different markup
    keep_only_tags = [dict(attrs={'id':['article_top', 'article_body']}),
                      dict(attrs={'id':['content']}) ]

    # The second entry is for 'Big Money', which comes from a different site, uses different markup
    remove_tags = [dict(attrs={'id':['toolbox','recommend_tab','insider_ad_wrapper',
                                     'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio',
                                     'bizbox_links_bottom','ris_links_wrapper','BOXXLE',
                                     'comments_button','add_comments_button','comments-to-fray','marriott_ad',
                                     'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}),
                   dict(attrs={'id':['content-top','service-links-bottom','hed']}) ]

    excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
    excludedTitleKeywords = ['Gabfest','Slate V','on Twitter']
    excludedAuthorKeywords = []
    excludedContentKeywords = ['http://twitter.com/Slate']

    extra_css = '''
                .h1_subhead{font-family:Arial; font-size:small; }
                h1{font-family:Verdana; font-size:large; }
                .byline {font-family:Georgia; margin-bottom: 0px; }
                .dateline {font-family:Arial; font-size: smaller; height: 0pt;}
                .imagewrapper {font-family:Verdana; font-size:x-small; }
                .source {font-family:Verdana; font-size:x-small;}
                .credit {font-family:Verdana; font-size: smaller;}
                #article_body {font-family:Verdana; }
                #content {font-family:Arial; }
                .caption{font-family:Verdana; font-style:italic; font-size:x-small;}
                h3{font-family:Arial; font-size:small}
                '''

    # Local variables to extend class
    baseURL = 'http://slate.com'
    section_dates = []

    # Class extension methods
    def tag_to_strings(self, tag):
        '''Convenience method: return a list of strings extracted from a tag's contents'''
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = self.tag_to_string(item, use_alt=False)
                if res:
                    strings.append(res)
        return strings

    def extract_named_sections(self):
        soup = self.index_to_soup(self.baseURL)
        soup_nav_bar = soup.find(True, attrs={'id':'nav'})
        # The first nav <li> is the Briefing section
        briefing_nav = soup_nav_bar.find('li')
        briefing_url = briefing_nav.a['href']
        for section_nav in soup_nav_bar.findAll('li'):
            section_name = self.tag_to_string(section_nav, use_alt=False)
            self.section_dates.append(section_name)
        soup = self.index_to_soup(briefing_url)
        self.log("Briefing url = %s" % briefing_url)
        section_lists = soup.findAll('ul', 'view_links_list')
        sections = []
        for section in section_lists:
            sections.append(section)
        return sections
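
    # The weekly build (slate_complete = False) walks the dated TOC on the home page instead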
    def extract_dated_sections(self):
        soup = self.index_to_soup(self.baseURL)
        soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'})
        if soup_top_stories:
            self.section_dates.append("Top Stories")
            self.log("SELECTION TOP STORIES %s" % "Top Stories")
        soup = soup.find(True, attrs={'id':'toc_links_container'})
        todays_section = soup.find(True, attrs={'class':'todaydateline'})
        self.section_dates.append(self.tag_to_string(todays_section, use_alt=False))
        self.log("SELECTION DATE %s" % self.tag_to_string(todays_section, use_alt=False))
        older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
        for older_section in older_section_dates:
            self.section_dates.append(self.tag_to_string(older_section, use_alt=False))
            self.log("SELECTION DATE %s" % self.tag_to_string(older_section, use_alt=False))
        if soup_top_stories:
            headline_stories = soup_top_stories
            self.log("HAVE top_stories")
        else:
            headline_stories = None
            self.log("NO top_stories")
        section_lists = soup.findAll('ul')
        # Prepend the headlines to the first section
        if headline_stories:
            section_lists.insert(0, headline_stories)
        sections = []
        for section in section_lists:
            sections.append(section)
        return sections
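
    # Pair each section container with its name/date (collected above in section_dates)
    # and build the per-section article lists.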
    def extract_section_articles(self, sections_html):
        # Find the containers with section content
        sections = sections_html
        articles = {}
        key = None
        ans = []
        for (i, section) in enumerate(sections):
            # Get the section name
            if section.has_key('id'):
                self.log("PROCESSING SECTION id = %s" % section['id'])
                key = self.section_dates[i]
                if key.startswith("Pod"):
                    continue
                if key.startswith("Blog"):
                    continue
                articles[key] = []
                ans.append(key)
            elif self.slate_complete:
                key = self.section_dates[i]
                if key.startswith("Pod"):
                    continue
                if key.startswith("Blog"):
                    continue
                self.log("PROCESSING SECTION name = %s" % key)
                articles[key] = []
                ans.append(key)
            else:
                self.log("SECTION %d HAS NO id" % i)
                continue

            # Get the section article_list
            article_list = section.findAll('li')

            # Extract the article attributes
            for article in article_list:
                bylines = self.tag_to_strings(article)
                url = article.a['href']
                title = bylines[0]
                full_title = self.tag_to_string(article, use_alt=False)
                #self.log("ARTICLE TITLE %s" % title)
                #self.log("ARTICLE FULL_TITLE %s" % full_title)
                #self.log("URL %s" % url)
                author = None
                description = None
                pubdate = None
                if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0:
                    description = "A summary of what's in the major U.S. newspapers."
                if len(bylines) == 3:
                    author = bylines[2].strip()
                    author = re.sub('[\r][\n][\t][\t\t]', '', author)
                    author = re.sub(',', '', author)
                    if bylines[1] is not None:
                        description = bylines[1]
                        full_byline = self.tag_to_string(article)
                        if full_byline.find('major U.S. newspapers') > 0:
                            description = "A summary of what's in the major U.S. newspapers."
                if len(bylines) > 3 and author is not None:
                    author += " | "
                    for (i, substring) in enumerate(bylines[3:]):
                        #print "substring: %s" % substring.encode('cp1252')
                        author += substring.strip()
                        # Separator between substrings only, not after the last one
                        if i < len(bylines[3:]) - 1:
                            author += " | "

                # Skip articles whose descriptions contain excluded keywords
                if description is not None and len(self.excludedDescriptionKeywords):
                    excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
                    found_excluded = excluded.search(description)
                    if found_excluded:
                        self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose titles contain excluded keywords
                if full_title is not None and len(self.excludedTitleKeywords):
                    excluded = re.compile('|'.join(self.excludedTitleKeywords))
                    #self.log("evaluating full_title: %s" % full_title)
                    found_excluded = excluded.search(full_title)
                    if found_excluded:
                        self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Skip articles whose authors contain excluded keywords
                if author is not None and len(self.excludedAuthorKeywords):
                    excluded = re.compile('|'.join(self.excludedAuthorKeywords))
                    found_excluded = excluded.search(author)
                    if found_excluded:
                        self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
                        continue

                # Check to make sure we're not adding a duplicate
                skip_this_article = False
                for dup in articles[key]:
                    if dup['url'] == url:
                        skip_this_article = True
                        self.log("SKIPPING DUP %s" % url)
                        break
                if skip_this_article:
                    continue

                # Build the dictionary entry for this article
                feed = key
                if not articles.has_key(feed):
                    articles[feed] = []
                articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
                                           author=author, content=''))
                #self.log("KEY %s" % feed)
                #self.log("APPENDED %s" % url)

            # Promote 'newspapers' to the top of the section
            for (i, article) in enumerate(articles[key]):
                if article['description'] is not None:
                    if article['description'].find('newspapers') > 0:
                        articles[key].insert(0, articles[key].pop(i))

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return ans
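
    # Slate serves a complete, single-page version of each article at <article URL> + 'pagenum/all/'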
    def print_version(self, url):
        return url + 'pagenum/all/'

    # Class methods
    def parse_index(self):
        if self.slate_complete:
            sections = self.extract_named_sections()
        else:
            sections = self.extract_dated_sections()
        section_list = self.extract_section_articles(sections)
        return section_list

    def get_browser(self):
        return BasicNewsRecipe.get_browser()

    def get_masthead_url(self):
        masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(masthead)
        except:
            self.log("\nMasthead unavailable")
            masthead = None
        return masthead
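
    # stripAnchors unwraps plain <a> tags (but keeps image links) so article
    # text isn't cluttered with inline hyperlinks.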
    def stripAnchors(self, soup):
        body = soup.find('div', attrs={'id':['article_body','content']})
        if body is not None:
            paras = body.findAll('p')
            if paras is not None:
                for para in paras:
                    aTags = para.findAll('a')
                    if aTags is not None:
                        for a in aTags:
                            if a.img is None:
                                #print repr(a.renderContents())
                                a.replaceWith(a.renderContents().decode('utf-8','replace'))
        return soup

    def preprocess_html(self, soup):
        # Remove 'grayPlus4.png' images
        imgs = soup.findAll('img')
        if imgs is not None:
            for img in imgs:
                if re.search("grayPlus4.png", str(img)):
                    img.extract()

        # Reject articles whose content contains excluded keywords
        if len(self.excludedContentKeywords):
            excluded = re.compile('|'.join(self.excludedContentKeywords))
            found_excluded = excluded.search(str(soup))
            if found_excluded:
                self.log("No allowed content found, removing article")
                raise Exception('Rejected article')

        # Articles from www.thebigmoney.com use different tagging for byline, dateline and body
        head = soup.find('head')
        if head.link is not None and re.search(r'www\.thebigmoney\.com', str(head)):
            byline = soup.find('div', attrs={'id':'byline'})
            if byline is not None:
                byline['class'] = byline['id']
            dateline = soup.find('div', attrs={'id':'dateline'})
            if dateline is not None:
                dateline['class'] = dateline['id']
            body = soup.find('div', attrs={'id':'content'})
            if body is not None:
                body['class'] = 'article_body'
            # Synthesize a department kicker
            h3Tag = Tag(soup, 'h3')
            emTag = Tag(soup, 'em')
            emTag.insert(0, NavigableString("the big money: Today's business press"))
            h3Tag.insert(0, emTag)
            soup.body.insert(0, h3Tag)

        # Strip anchors from the HTML
        return self.stripAnchors(soup)

    def postprocess_html(self, soup, first_fetch):
        # Fix up dept_kicker as <h3><em>
        dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
        if dept_kicker is not None:
            kicker_strings = self.tag_to_strings(dept_kicker)
            kicker = ''.join(kicker_strings[2:])
            kicker = re.sub(r'\.', '', kicker)
            h3Tag = Tag(soup, "h3")
            emTag = Tag(soup, "em")
            emTag.insert(0, NavigableString(kicker))
            h3Tag.insert(0, emTag)
            dept_kicker.replaceWith(h3Tag)
        else:
            self.log("No kicker--return null")
            return None

        # Fix up the concatenated byline and dateline
        byline = soup.find(True, attrs={'class':'byline'})
        if byline is not None:
            bylineTag = Tag(soup, 'div')
            bylineTag['class'] = 'byline'
            #bylineTag['height'] = '0em'
            bylineTag.insert(0, self.tag_to_string(byline))
            byline.replaceWith(bylineTag)

        dateline = soup.find(True, attrs={'class':'dateline'})
        if dateline is not None:
            datelineTag = Tag(soup, 'div')
            datelineTag['class'] = 'dateline'
            #datelineTag['margin-top'] = '0em'
            datelineTag.insert(0, self.tag_to_string(dateline))
            dateline.replaceWith(datelineTag)

        # Change captions to italic, add <hr>
        for caption in soup.findAll(True, {'class':'caption'}):
            if caption is not None:
                emTag = Tag(soup, "em")
                emTag.insert(0, '<br />' + self.tag_to_string(caption))
                hrTag = Tag(soup, 'hr')
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

        # Fix photos
        for photo in soup.findAll('span', attrs={'class':'imagewrapper'}):
            if photo.a is not None and photo.a.img is not None:
                divTag = Tag(soup, 'div')
                divTag['class'] = 'imagewrapper'
                divTag.insert(0, photo.a.img)
                photo.replaceWith(divTag)

        return soup
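
    # postprocess_book back-fills TOC metadata: for articles missing an author
    # or description, pull them out of the downloaded article HTML.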
    def postprocess_book(self, oeb, opts, log):
        def extract_byline(href):
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find(True, attrs={'class':'byline'})
            if byline is not None:
                return self.tag_to_string(byline, use_alt=False)
            else:
                return None

        def extract_description(href):
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            paragraphs = soup.findAll('p')
            for p in paragraphs:
                if self.tag_to_string(p, use_alt=False).startswith('By ') or \
                   self.tag_to_string(p, use_alt=False).startswith('Posted '):
                    continue
                comment = p.find(text=lambda text: isinstance(text, Comment))
                if comment is not None:
                    continue
                else:
                    return self.tag_to_string(p, use_alt=False)[:self.summary_length] + '...'
            return None

        # Method entry point here.
        # A single-section TOC has depth 2; a multi-section TOC has depth 3.
        if oeb.toc.depth() == 2:
            for article in oeb.toc:
                if article.author is None:
                    article.author = extract_byline(article.href)
                if article.description is None:
                    article.description = extract_description(article.href)
        elif oeb.toc.depth() == 3:
            for section in oeb.toc:
                for article in section:
                    if article.author is None:
                        article.author = extract_byline(article.href)
                    if article.description is None:
                        article.description = extract_description(article.href)
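
If you want to try this version, save it as (say) slate.recipe and paste it into calibre via Add a custom news source > Switch to Advanced mode. You can also run it straight from the command line--the --test switch makes calibre fetch just a couple of articles per feed so you can check the formatting quickly:
Code:
ebook-convert slate.recipe slate.epub --test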