06-17-2018, 12:11 PM | #1 |
Enthusiast
Posts: 36
Karma: 10
Join Date: Dec 2017
Location: Los Angeles, CA
Device: Smart Phone
|
LA Times Complete Rewrite
Here is a rewrite of the LA Times Recipe. The old one stopped working several months ago, so this is what I came up with.
LA Times Recipe Code:
#!/usr/bin/env python2
"""calibre news recipe for the Los Angeles Times.

The recipe scrapes the front page and a fixed list of section index pages
for story links, groups the links into feeds by URL prefix, and returns the
feeds in the order given by SECTIONS.
"""

import re
from collections import defaultdict
from pprint import pformat

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.date import strptime

# Reference point used to turn a story date into seconds-since-epoch.
DT_EPOCH = strptime('1970-01-01', '%Y-%m-%d', assume_utc=True)

# Each entry is [top_directory, sub_directory, ...]. parse_index() scrapes
# http://www.latimes.com/<top>/ and http://www.latimes.com/<top>/<sub>/.
# This is a module-level constant: readers must not mutate it.
DIR_COLLECTIONS = [
    ['world'],
    ['nation'],
    ['politics'],
    ['opinion', 'op-ed', 'opinion-la', 'editorials', 'readersreact', 'topoftheticket', 'endorsements'],
    ['local', 'lanow', 'california', 'crime', 'abcarian', 'education', 'weather'],
    ['business', 'hollywood', 'technology'],
    ['sports'],
    ['entertainment', 'movies', 'music', 'tv', 'arts', 'gossip', 'envelope'],
    ['books'],
    ['food', 'jonathon-gold', 'dailydish'],
    ['health'],
    ['style', 'laaffairs', 'pets'],
    ['science', 'sciencenow'],
    ['home'],
    ['travel'],
    ['fashion'],
]

# Feed titles in the order they are returned by parse_index(). Every value
# what_section() can return must appear here, otherwise its articles are
# dropped. (The comma between 'NEWSLETTERS' and 'OTHER' is required: without
# it Python concatenates the two literals into a single bogus entry.)
SECTIONS = [
    'THE WORLD',
    'THE NATION',
    'POLITICS',
    'OPINION',
    'CALIFORNIA',
    'OBITUARIES',
    'BUSINESS',
    'HOLLYWOOD',
    'SPORTS',
    'ENTERTAINMENT',
    'MOVIES',
    'TELEVISION',
    'BOOKS',
    'FOOD',
    'HEALTH',
    'SCIENCE AND TECHNOLOGY',
    'HOME',
    'TRAVEL',
    'FASHION',
    'NEWSLETTERS',
    'OTHER',
]

_SITE_RX_PREFIX = r'^https?://www[.]latimes[.]com/'

# Ordered (compiled pattern, section title) rules; the first match wins, so
# the more specific paths (e.g. business/hollywood) come before the general
# ones (e.g. business).
_SECTION_RULES = [
    (re.compile(_SITE_RX_PREFIX + path), title)
    for path, title in (
        (r'local/obituaries', 'OBITUARIES'),
        (r'business/hollywood', 'HOLLYWOOD'),
        (r'entertainment/movies', 'MOVIES'),
        (r'entertainment/tv', 'TELEVISION'),
        (r'business/technology', 'SCIENCE AND TECHNOLOGY'),
        (r'world', 'THE WORLD'),
        (r'nation', 'THE NATION'),
        (r'politics', 'POLITICS'),
        (r'opinion', 'OPINION'),
        (r'(?:local|style)', 'CALIFORNIA'),
        (r'business', 'BUSINESS'),
        (r'sports', 'SPORTS'),
        (r'entertainment', 'ENTERTAINMENT'),
        (r'books', 'BOOKS'),
        (r'food', 'FOOD'),
        (r'health', 'HEALTH'),
        (r'science', 'SCIENCE AND TECHNOLOGY'),
        (r'home', 'HOME'),
        (r'travel', 'TRAVEL'),
        (r'fashion', 'FASHION'),
        (r'newsletter', 'NEWSLETTERS'),
    )
]


def absurl(url):
    """Return *url* with the site origin prepended when it starts with '/'."""
    if url.startswith('/'):
        url = 'http://www.latimes.com' + url
    return url


def check_words(words):
    """Build a predicate that tests for any of the given words.

    *words* is a whitespace-separated string. The returned callable takes a
    string ``x`` and yields a truthy value when ``x`` is non-empty and shares
    at least one whitespace-separated word with *words*; otherwise it yields
    a falsy value (``x`` itself, or an empty frozenset).
    """
    wanted = frozenset(words.split())
    return lambda x: x and wanted.intersection(x.split())


def what_section(url):
    """Return the feed title for an absolute article *url*.

    The URL is tested against _SECTION_RULES in order; 'OTHER' is returned
    when no rule matches.
    """
    for rule_rx, title in _SECTION_RULES:
        if rule_rx.search(url):
            return title
    return 'OTHER'


class LATimes(BasicNewsRecipe):
    """Recipe that builds Los Angeles Times feeds by scraping index pages."""

    title = 'Los Angeles Times'
    __author__ = 'Jose Ortiz'
    description = 'The Los Angeles Times is a leading source of news on Southern California, entertainment, movies, television, music, politics, business, health, technology, travel, sports, environment, economics, autos, jobs, real estate and other topics affecting California'  # noqa
    category = 'news, politics, USA, Los Angeles, world'
    oldest_article = 1
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    language = 'en'
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
    publication_type = 'newspaper'
    cover_url = 'http://www.latimes.com/includes/sectionfronts/A1.pdf'

    keep_only_tags = [
        dict(name='header', attrs={'id': 'top'}),
        dict(name='article'),
        dict(name='div', attrs={'id': 'liveblog-story-wrapper'}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': check_words('hidden-tablet hidden-mobile hidden-desktop pb-f-ads-dfp')}),
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class': check_words('pb-f-article-body')}),
    ]

    def parse_index(self):
        """Collect story links from all index pages and group them by section.

        Returns a list of ``(section_title, articles)`` tuples, ordered as in
        SECTIONS and limited to sections that received at least one article.
        """
        index = 'http://www.latimes.com/'
        pat = r'^(?:https?://www[.]latimes[.]com)?/[^#]+20[0-9]{6}-(?:html)?story[.]html'
        articles = self.find_articles(index, pat)

        for collection in DIR_COLLECTIONS:
            # Read, never pop: DIR_COLLECTIONS is shared module state and
            # must survive repeated parse_index() calls.
            topdir, subdirs = collection[0], collection[1:]
            index = 'http://www.latimes.com/' + topdir + '/'
            # Sub-directory pages are filtered with the top-directory pattern.
            pat = r'^(?:https?://www[.]latimes[.]com)?/' + topdir + '/[^#]+20[0-9]{6}-(?:html)?story[.]html'
            articles += self.find_articles(index, pat)
            for subdir in subdirs:
                sub_index = index + subdir + '/'
                articles += self.find_articles(sub_index, pat)

        feeds = defaultdict(list)
        for article in articles:
            feeds[what_section(article['url'])].append(article)

        self.log(pformat(dict(feeds)))
        return [(key, feeds[key]) for key in SECTIONS if key in feeds]

    def preprocess_html(self, soup):
        """Promote lazy-load ``data-src`` image URLs to ``src``.

        The one exception is the letter-to-the-editor image whose direct
        parent is the link to the letter-to-the-editor page: that link is
        removed from the document together with the image.
        """
        for img in soup.findAll('img', attrs={'data-src': True}):
            letter_link = img.findParent(
                'a', href='http://www.latimes.com/opinion/la-letter-to-the-editor-htmlstory.html')
            if letter_link is img.parent and img['data-src'].endswith('/la-letter-to-the-editor'):
                img.parent.extract()
            else:
                img['src'] = img['data-src']
        return soup

    def find_articles(self, index, pattern):
        """Return the articles linked from the page at *index*.

        Only ``<a>`` elements whose ``href`` matches the regex *pattern* and
        whose sole child is a text node are kept. Links are searched inside
        ``<main>`` when the page has one, otherwise in the whole document.

        Each article is a dict with 'title' and 'url'. When the URL carries a
        date stamp, the URL is cut back to the matched story address and a
        'timestamp' (seconds relative to DT_EPOCH) is added. An empty list is
        returned if the index page cannot be downloaded.
        """
        self.log('Downloading and parsing index: ', index)
        self.log('Pattern: ', pattern)
        try:
            soup = self.index_to_soup(index)
        except Exception:
            # Best effort: one unreachable index page must not abort the
            # recipe. KeyboardInterrupt/SystemExit are deliberately not caught.
            self.log('Failed to download ', index)
            return []

        link_rx = re.compile(pattern)
        container = soup.main if soup.main is not None else soup
        alinks = container.findAll('a', {'href': link_rx})
        alinks = [a for a in alinks
                  if len(a.contents) == 1 and a.find(text=True, recursive=False)]
        articles = [{'title': a.find(text=True), 'url': absurl(a['href'])} for a in alinks]

        date_rx = re.compile(
            r'^https?://www[.]latimes[.]com/[^#]+-(?P<date>20[0-9]{6})-(?:html)?story[.]html')
        for article in articles:
            mdate = date_rx.match(article['url'])
            if mdate is None:
                continue
            # Strip anything after "...story.html" (query string, fragment).
            article['url'] = mdate.group(0)
            try:
                published = strptime(mdate.group('date'), '%Y%m%d')
            except ValueError:
                # The regex only checks for "20" + six digits, which need not
                # be a real calendar date; keep the article without a
                # timestamp rather than failing the whole index.
                continue
            article['timestamp'] = (published - DT_EPOCH).total_seconds()

        self.log('Found: ', len(articles), ' articles.\n')
        return articles
06-17-2018, 12:29 PM | #2 |
creator of calibre
Posts: 43,843
Karma: 22666666
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
|
|
Advert | |
|
|
Similar Threads | ||||
Thread | Thread Starter | Forum | Replies | Last Post |
Rewrite title from each article | cyttorak | Recipes | 1 | 11-28-2014 06:29 PM |
KT -- complete rewrite? | tlc | Kindle Developer's Corner | 2 | 11-30-2011 10:29 AM |
Horror James, M. R.: The Complete Complete Ghost Stories. v1. 27 Jun 2011 | Mousewaffle | Kindle Books | 5 | 07-03-2011 09:04 AM |
calibre 0.6.20 won't complete NY Times news feed | Zapped | Calibre | 9 | 11-13-2009 10:41 AM |
Greetings and offering to help on the wiki rewrite | orchidpop | Introduce Yourself | 10 | 05-11-2008 11:12 PM |