Hello There,
I updated the recipe so that it finds the body of the article again. Apparently, they changed the CSS class name for the div containing the main text. Anyway, here's the recipe:
Popular Science
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
class AdvancedUserRecipe1282101454(BasicNewsRecipe):
    """Calibre news recipe for Popular Science, built from the site's RSS feeds."""

    title = 'Popular Science'
    language = 'en'
    __author__ = 'Kovid Goyal'
    description = 'Popular Science'
    publisher = 'Popular Science'

    # Oldest article to download, in days; lower this value if you only
    # want more current articles.
    oldest_article = 7
    max_articles_per_feed = 100
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    remove_empty_feeds = True
    # The same article can show up in several feeds; de-duplicate by URL.
    ignore_duplicate_articles = {'url'}

    # (section title, feed URL) pairs.
    feeds = [
        ('Gadgets', 'http://www.popsci.com/full-feed/gadgets'),
        ('Cars', 'http://www.popsci.com/full-feed/cars'),
        ('Science', 'http://www.popsci.com/full-feed/science'),
        ('Technology', 'http://www.popsci.com/full-feed/technology'),
        ('DIY', 'http://www.popsci.com/full-feed/diy'),
        ('Animals', 'https://www.popsci.com/rss-animals.xml'),
        ('Space', 'https://www.popsci.com/rss-space.xml'),
        ('Environment', 'https://www.popsci.com/rss-environment.xml'),
        ('Eastern Arsenal', 'https://www.popsci.com/rss-eastern-arsenal.xml'),
    ]

    # Pattern for the class of the element holding the article text:
    # 'pane-node-', then up to nine '<word>-' segments, then 'body'
    # (e.g. 'pane-node-body', 'pane-node-field-body').
    # Raw string so that '\w' reaches the regex engine without triggering
    # Python's invalid-escape-sequence warning.
    pane_node_body = re.compile(r'pane-node-(?:\w+-){0,9}body')

    keep_only_tags = [
        # Elements whose class attribute includes 'pane-node-header'.
        dict(attrs={
            'class': lambda x: x and frozenset(
                'pane-node-header'.split()).issubset(frozenset(x.split()))
        }),
        # Elements whose class attribute matches the body pattern above.
        dict(attrs={'class': pane_node_body}),
    ]

    remove_tags = [
        # Elements carrying BOTH the 'ads' and 'seperator' classes.
        # NOTE(review): 'seperator' is presumably the spelling used in the
        # site's markup -- confirm before "correcting" it.
        dict(attrs={
            'class': lambda x: x and frozenset(
                'ads seperator'.split()).issubset(frozenset(x.split()))
        }),
    ]

    def preprocess_html(self, soup):
        """Point each image's ``src`` at its ``data-medsrc`` URL.

        Every ``<img>`` tag that has a ``data-medsrc`` attribute gets that
        value copied into ``src``; other tags are left untouched.
        Presumably the site stores the real (medium-size) image URL in
        ``data-medsrc`` -- verify against the live markup.

        :param soup: parsed article HTML; modified in place.
        :return: the same ``soup`` object.
        """
        for img in soup.findAll('img', attrs={'data-medsrc': True}):
            img['src'] = img['data-medsrc']
        return soup