Junk artifacts have been creeping through with the current recipe for the Los Angeles Times, so I updated the patterns to reflect the site's latest markup conventions. I also tweaked the recipe a bit to include the articles' bylines.
Los Angeles Times
Code:
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import defaultdict
from pprint import pformat
from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class name.

    :param classes: whitespace-separated string of class names to look for.
    :return: a dict of the form ``{'attrs': {'class': matcher}}``. ``matcher``
        takes an element's ``class`` attribute value (a string, or a falsy
        value when the attribute is absent) and returns a truthy frozenset of
        the wanted names it contains, or a falsy value when there are none.
    """
    # split() with no argument collapses runs of whitespace, so a stray double
    # space cannot put an empty-string token into the wanted set.
    wanted = frozenset(classes.split())

    def matcher(value):
        # `value and ...` short-circuits on a missing/empty class attribute,
        # handing back that falsy value instead of calling .split() on it.
        return value and frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': matcher})
def absurl(url):
    """Return *url* as an absolute URL, resolving it against www.latimes.com.

    :param url: a link as found in the page markup.
    :return: ``url`` unchanged when it does not start with ``/``; the same
        link with an ``http:`` scheme when it is protocol-relative
        (``//host/path``); otherwise the site root followed by ``url``.
    """
    if url.startswith('//'):
        # Protocol-relative link: it already names its host, so only the
        # scheme is missing. Prefixing the site root here would produce
        # 'http://www.latimes.com//host/path'.
        return 'http:' + url
    if url.startswith('/'):
        return 'http://www.latimes.com' + url
    return url
class LATimes(BasicNewsRecipe):
    """Calibre news recipe for the Los Angeles Times.

    The article list is scraped from the www.latimes.com front page by
    ``parse_index``; each downloaded article is then adjusted by
    ``preprocess_html`` before the tag filters below are applied.
    """

    # --- Recipe metadata and download settings (read by BasicNewsRecipe) ---
    title = 'Los Angeles Times'
    __author__ = 'Kovid Goyal'
    description = 'The Los Angeles Times is a leading source of news on Southern California, entertainment, movies, television, music, politics, business, health, technology, travel, sports, environment, economics, autos, jobs, real estate and other topics affecting California'  # noqa
    category = 'news, politics, USA, Los Angeles, world'
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    publication_type = 'newspaper'
    cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf'

    # --- Tag filters describing which parts of an article page to keep ---
    keep_only_tags = [
        dict(name='h1'),
        dict(attrs={'class': 'trb_ar_main'}),
    ]
    remove_tags_after = [
        dict(itemprop='articleBody'),
    ]
    remove_tags = [
        dict(attrs={'data-content-type': 'blurb'}),
        classes('trb_ar_cont trb_gptAd trb_filmstrip trb_ar_sponsoredmod'),
    ]

    def parse_index(self):
        """Scrape the front page and group story links by section.

        :return: a list of ``(section, articles)`` tuples sorted by section
            name, where ``articles`` is a list of ``{'title', 'url'}`` dicts
            in the order the stories appear on the page.
        """
        soup = self.index_to_soup('http://www.latimes.com')
        feeds = defaultdict(list)

        def not_section_heading(css_class):
            # Accept anchors with no class at all, or whose class does not
            # mention 'SectionHeading' (those link to the section, not a story).
            return not css_class or 'SectionHeading' not in css_class

        story_attrs = {
            'data-content-type': 'story',
            'data-content-section': True,
            'data-content-slug': True,
        }
        for story in soup.findAll(attrs=story_attrs):
            a = story.find('a', attrs={'class': not_section_heading})
            if a is None:
                continue
            href = a.get('href')
            if not href:
                # An anchor without a usable href cannot be downloaded; skip
                # it rather than aborting the whole index with a KeyError.
                continue
            section = story['data-content-section'].capitalize()
            feeds[section].append({
                'title': self.tag_to_string(a),
                'url': absurl(href),
            })

        self.log(pformat(dict(feeds)))
        return [(k, feeds[k]) for k in sorted(feeds)]

    def preprocess_html(self, soup):
        """Fix image sources and move article bodies to the end of ``<body>``.

        :param soup: the parsed article page.
        :return: the same ``soup`` object, modified in place.
        """
        # Images carry their real location in data-baseurl; '/750' is appended
        # to form the src (presumably a width selector - TODO confirm).
        for img in soup.findAll('img', attrs={'data-baseurl': True}):
            img['src'] = img['data-baseurl'] + '/750'

        body = soup.find('body')
        if body is not None:
            # Detach each articleBody element and re-attach it as the last
            # child of <body>.
            for div in soup.findAll(itemprop='articleBody'):
                div.extract()
                body.append(div)
        return soup