Register Guidelines E-Books Search Today's Posts Mark Forums Read

Go Back   MobileRead Forums > E-Book Software > Calibre > Recipes


Thread Tools Search this Thread
Old 06-17-2018, 12:11 PM   #1
Junior Member
lui1 began at the beginning.
Posts: 5
Karma: 10
Join Date: Dec 2017
Location: Los Angeles, CA
Device: Smart Phone
LA Times Complete Rewrite

Here is a rewrite of the LA Times Recipe. The old one stopped working several months ago, so this is what I came up with.

LA Times Recipe
#!/usr/bin/env python2

import re
from collections import defaultdict
from pprint import pformat

from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe

# The Unix epoch as a datetime; find_articles subtracts it from an article's
# publication date to express that date as seconds since 1970-01-01.
DT_EPOCH = strptime('1970-01-01', '%Y-%m-%d', assume_utc=True)

# Section fronts to crawl. Each inner list is [top_dir, sub_dir, ...]: the
# first element is a top-level directory on the site and the remaining
# elements are sub-directories of it whose index pages are also scanned.
# NOTE(review): the single-directory entries (nation, politics, sports, books,
# health, home, travel, fashion) were missing from the pasted source and are
# reconstructed from the URL prefixes handled by what_section -- confirm
# against the upstream recipe.
DIR_COLLECTIONS = [['world'],
                   ['nation'],
                   ['politics'],
                   ['opinion', 'op-ed', 'opinion-la', 'editorials', 'readersreact', 'topoftheticket', 'endorsements'],
                   ['local', 'lanow', 'california', 'crime', 'abcarian', 'education', 'weather'],
                   ['business', 'hollywood', 'technology'],
                   ['sports'],
                   ['entertainment', 'movies', 'music', 'tv', 'arts', 'gossip', 'envelope'],
                   ['books'],
                   ['food', 'jonathon-gold', 'dailydish'],
                   ['health'],
                   ['style', 'laaffairs', 'pets'],
                   ['science', 'sciencenow'],
                   ['home'],
                   ['travel'],
                   ['fashion']]

# Order in which sections appear in the generated periodical. parse_index
# iterates this list to order the feeds it built from what_section labels.
# NOTE(review): only 'THE NATION' survived in the pasted source; the other
# labels and their order are reconstructed from what_section -- confirm
# against the upstream recipe.
SECTIONS = ['THE WORLD',
            'THE NATION',
            'POLITICS',
            'OPINION',
            'CALIFORNIA',
            'OBITUARIES',
            'BUSINESS',
            'HOLLYWOOD',
            'SPORTS',
            'ENTERTAINMENT',
            'MOVIES',
            'TELEVISION',
            'BOOKS',
            'FOOD',
            'HEALTH',
            'SCIENCE AND TECHNOLOGY',
            'HOME',
            'TRAVEL',
            'FASHION',
            'NEWSLETTERS',
            'OTHER']

def absurl(url):
    """Return *url* as an absolute URL on the LA Times site.

    Site-relative links (starting with a single '/') get the site root
    prepended; protocol-relative links (starting with '//') only get a scheme.
    Anything else is returned unchanged.
    """
    if url.startswith('//'):
        # Protocol-relative link: it already names its host.
        return 'https:' + url
    if url.startswith('/'):
        return 'https://www.latimes.com' + url
    return url

def check_words(words):
    """Build a predicate that tests a space-separated value against *words*.

    The returned callable takes a whitespace-separated string (the class
    attribute filters in this file use it that way) and returns the frozenset
    of words it shares with *words*; an empty frozenset means no overlap.
    Falsy input (None, '') is handed back unchanged.
    """
    def shared_words(value):
        if not value:
            # Mirror `x and ...`: a falsy value is returned as-is.
            return value
        wanted = frozenset(words.split())
        return wanted.intersection(value.split())

    return shared_words

# Ordered (compiled URL prefix, section label) rules. Order matters: specific
# paths such as /local/obituaries must be tried before the broader /local.
# NOTE(review): the labels for /business/technology and /science were missing
# from the pasted source; 'SCIENCE AND TECHNOLOGY' is a reconstruction --
# confirm against the upstream recipe and keep it in sync with SECTIONS.
_SECTION_RULES = tuple(
    (re.compile(r'^https?://www[.]latimes[.]com/' + path), label)
    for path, label in (
        (r'local/obituaries', 'OBITUARIES'),
        (r'business/hollywood', 'HOLLYWOOD'),
        (r'entertainment/movies', 'MOVIES'),
        (r'entertainment/tv', 'TELEVISION'),
        (r'business/technology', 'SCIENCE AND TECHNOLOGY'),
        (r'world', 'THE WORLD'),
        (r'nation', 'THE NATION'),
        (r'politics', 'POLITICS'),
        (r'opinion', 'OPINION'),
        (r'(?:local|style)', 'CALIFORNIA'),
        (r'business', 'BUSINESS'),
        (r'sports', 'SPORTS'),
        (r'entertainment', 'ENTERTAINMENT'),
        (r'books', 'BOOKS'),
        (r'food', 'FOOD'),
        (r'health', 'HEALTH'),
        (r'science', 'SCIENCE AND TECHNOLOGY'),
        (r'home', 'HOME'),
        (r'travel', 'TRAVEL'),
        (r'fashion', 'FASHION'),
        (r'newsletter', 'NEWSLETTERS'),
    )
)


def what_section(url):
    """Return the section label for an absolute latimes.com article URL.

    The URL is tested against each rule in turn and the label of the first
    matching prefix is returned. URLs that match no rule (including URLs on
    other hosts) fall into 'OTHER'.
    """
    for rule, label in _SECTION_RULES:
        if rule.search(url):
            return label
    return 'OTHER'

class LATimes(BasicNewsRecipe):
    """calibre recipe that builds a daily edition of the Los Angeles Times.

    Articles are discovered by scraping the home page and every section front
    listed in DIR_COLLECTIONS for links that look like dated story URLs, then
    grouped into feeds with what_section and ordered according to SECTIONS.

    NOTE(review): the source this class was recovered from had URLs and several
    whole lines stripped out. Every reconstructed value is flagged below and
    should be confirmed against the upstream recipe.
    """

    title = 'Los Angeles Times'
    __author__ = 'Jose Ortiz'
    description = 'The Los Angeles Times is a leading source of news on Southern California, entertainment, movies, television, music, politics, business, health, technology, travel, sports, environment, economics, autos, jobs, real estate and other topics affecting California'  # noqa
    category = 'news, politics, USA, Los Angeles, world'
    oldest_article = 1
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
    publication_type = 'newspaper'
    # NOTE(review): the cover URL was blank in the recovered source; this value
    # is a reconstruction -- confirm it still resolves.
    cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf'

    # Root of the site; section-front index URLs are built from it.
    # NOTE(review): reconstructed -- the index URL was blank in the source.
    site_root = 'https://www.latimes.com/'

    # Link that wraps the "letter to the editor" promo image (see
    # preprocess_html).
    # NOTE(review): reconstructed -- the href was blank in the source.
    letters_promo_url = 'http://www.latimes.com/opinion/la-letter-to-the-editor-htmlstory.html'

    keep_only_tags = [
        dict(name='header', attrs={'id': 'top'}),
        # NOTE(review): the <article> entry is a reconstruction; the recovered
        # list only had the header and live-blog wrappers.
        dict(name='article'),
        dict(name='div', attrs={'id': 'liveblog-story-wrapper'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': check_words('hidden-tablet hidden-mobile hidden-desktop pb-f-ads-dfp')}),
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class': check_words('pb-f-article-body')}),
    ]

    def parse_index(self):
        """Collect articles from all index pages and group them into feeds.

        Returns a list of ``(section_title, [article, ...])`` tuples. Sections
        listed in SECTIONS come first, in that order; any other section label
        produced by what_section is appended afterwards so that no article is
        silently dropped.
        """
        host = r'^(?:https?://www[.]latimes[.]com)?/'
        story = r'[^#]+20[0-9]{6}-(?:html)?story[.]html'

        articles = self.find_articles(self.site_root, host + story)
        for collection in DIR_COLLECTIONS:
            if not collection:
                continue
            # Slice instead of pop(0): DIR_COLLECTIONS is module state and
            # must survive repeated calls unmodified.
            topdir, subdirs = collection[0], collection[1:]
            index = self.site_root + topdir + '/'
            # Sub-directory pages are matched with the top directory's
            # pattern, so only links under /<topdir>/ are collected from them.
            pat = host + topdir + '/' + story
            articles += self.find_articles(index, pat)
            for subdir in subdirs:
                articles += self.find_articles(index + subdir + '/', pat)

        feeds = defaultdict(list)
        for article in articles:
            feeds[what_section(article['url'])].append(article)

        ordered = [key for key in SECTIONS if key in feeds]
        ordered += sorted(key for key in feeds if key not in SECTIONS)
        return [(key, feeds[key]) for key in ordered]

    def preprocess_html(self, soup):
        """Resolve lazy-loaded images and drop the letters promo.

        Images carrying a ``data-src`` attribute get it copied to ``src``. The
        exception is the "letter to the editor" promo image, whose wrapping
        link is removed from the document.

        NOTE(review): the branch bodies were missing from the recovered
        source; remove-the-promo / copy-data-src is a reconstruction.
        """
        for img in soup.findAll('img', attrs={'data-src': True}):
            promo_link = img.findParent('a', href=self.letters_promo_url)
            is_promo = (
                promo_link is not None
                and promo_link is img.parent
                and img['data-src'].endswith('/la-letter-to-the-editor')
            )
            if is_promo:
                promo_link.extract()
            else:
                img['src'] = img['data-src']
        return soup

    def find_articles(self, index, pattern):
        """Return the article links found on one index page.

        *index* is the URL of the page to scan and *pattern* a regular
        expression that an ``href`` must match to count as a story link. Each
        result is a dict with 'title' and 'url', plus 'timestamp' (seconds
        since the epoch) when the URL carries a date. A page that cannot be
        downloaded is logged and yields an empty list.
        """
        self.log('Downloading and parsing index: ', index)
        self.log('Pattern: ', pattern)
        try:
            soup = self.index_to_soup(index)
        except Exception:
            # Best effort: one unreachable section front must not abort the
            # whole download.
            self.log('Failed to download ', index)
            return []

        # Prefer the <main> element when the page has one.
        root = soup.main if soup.main is not None else soup
        alinks = root.findAll('a', {'href': re.compile(pattern)})
        # Keep only anchors whose sole child is a text node (the headline).
        alinks = [a for a in alinks if len(a.contents) == 1 and a.find(text=True, recursive=False)]
        articles = [{'title': a.find(text=True), 'url': absurl(a['href'])} for a in alinks]

        date_rx = re.compile(r'^https?://www[.]latimes[.]com/[^#]+-(?P<date>20[0-9]{6})-(?:html)?story[.]html')
        for article in articles:
            mdate = date_rx.match(article['url'])
            if mdate is not None:
                published = strptime(mdate.group('date'), '%Y%m%d')
                article['timestamp'] = (published - DT_EPOCH).total_seconds()
                # Keep only the matched part, dropping any trailing fragment.
                article['url'] = mdate.group(0)
        self.log('Found: ', len(articles), ' articles.\n')
        return articles
lui1 is offline   Reply With Quote
Old 06-17-2018, 12:29 PM   #2
creator of calibre
kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.kovidgoyal ought to be getting tired of karma fortunes by now.
kovidgoyal's Avatar
Posts: 33,060
Karma: 10034422
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
kovidgoyal is offline   Reply With Quote

Thread Tools Search this Thread
Search this Thread:

Advanced Search

Forum Jump

Similar Threads
Thread Thread Starter Forum Replies Last Post
Rewrite title from each article cyttorak Recipes 1 11-28-2014 06:29 PM
KT -- complete rewrite? tlc Kindle Developer's Corner 2 11-30-2011 10:29 AM
Horror James, M. R.: The Complete Complete Ghost Stories. v1. 27 Jun 2011 Mousewaffle Kindle Books 5 07-03-2011 09:04 AM
calibre 0.6.20 won't complete NY Times news feed Zapped Calibre 9 11-13-2009 10:41 AM
Greetings and offering to help on the wiki rewrite orchidpop Introduce Yourself 10 05-11-2008 11:12 PM

All times are GMT -4. The time now is 12:48 PM. MobileRead.com is a privately owned, operated and funded community.