MobileRead Forums - View Single Post

jma1 · 07-07-2017, 09:12 PM

I have tried to modify the current built-in recipe so that it returns all pictures in articles, but it gets only some of them. Here is the recipe I have:
------------------------------------------
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from calibre.web.feeds.news import BasicNewsRecipe

# from telegraph.uk not changed
def classes(classes):
    """Build a tag-matching spec for elements carrying any of the given CSS classes.

    :param classes: Whitespace-separated class names, e.g. ``'byline author'``.
    :return: A dict of the form ``{'attrs': {'class': predicate}}``. The
        predicate takes a tag's ``class`` attribute value; it returns that
        value unchanged when it is falsy (``None`` or ``''``), otherwise the
        frozenset of names the value shares with ``classes`` (empty, and
        therefore falsy, when nothing matches).
    """
    # split() with no argument ignores repeated/leading/trailing whitespace,
    # so no empty-string "class name" can end up in the wanted set.
    q = frozenset(classes.split())
    return dict(
        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
    )
# end from telegraph uk

class FoxNews(BasicNewsRecipe):
    """calibre news recipe for FOX News, built from the RSS feeds listed in ``feeds``."""

    # --- Publication metadata -------------------------------------------
    title = 'FOX News'
    __author__ = 'Darko Miletic'
    description = 'Breaking News from FOX'
    publisher = 'FOXNews.com'
    category = 'news, breaking news, latest news, current news, world news, national news, USA'
    language = 'en'
    publication_type = 'newsportal'

    # --- Download behaviour ---------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'utf8'
    use_embedded_content = False
    remove_empty_feeds = True
    ignore_duplicate_articles = {'title', 'url'}

    compress_news_images = True
    # Optional tuning knob, left disabled:
    # compress_news_images_auto_size = 8

    # --- Presentation ---------------------------------------------------
    extra_css = """
        body{font-family: Arial,sans-serif }
        .caption{font-size: x-small}
        .author,.dateline{font-size: small}
    """

    # Reuse the metadata above so the generated e-book carries the same values.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # --- Article clean-up -----------------------------------------------
    remove_attributes = ['xmlns', 'lang']

    # Only elements matching one of these specs are kept from an article page;
    # everything else, including images that sit outside them, is discarded.
    # NOTE(review): if some article pictures are missing from the output,
    # check on the live page which container holds them and add a matching
    # spec here -- the class names below have not been verified against the
    # current site markup.
    keep_only_tags = [
        dict(itemprop=['headline', 'articleBody']),
        dict(name='h1'),
        # classes() helper is based on the one in the telegraph.uk recipe
        classes('byline author'),
        classes('date source'),
        dict(attrs={'class': 'article-info article-body'.split()}),
    ]

    # --- Feeds: (section title, RSS URL) --------------------------------
    feeds = [
        (u'Latest Headlines', u'http://feeds.foxnews.com/foxnews/latest'),
        (u'National', u'http://feeds.foxnews.com/foxnews/national'),
        (u'World', u'http://feeds.foxnews.com/foxnews/world'),
        (u'Politics', u'http://feeds.foxnews.com/foxnews/politics'),
        (u'Opinion', u'http://feeds.foxnews.com/foxnews/opinion'),
        (u'Science', u'http://feeds.foxnews.com/foxnews/science'),
        (u'Technology', u'http://feeds.foxnews.com/foxnews/tech'),
        (u'Health', u'http://feeds.foxnews.com/foxnews/health'),
        (u'Lifestyle', u'http://feeds.foxnews.com/foxnews/section/lifestyle'),
        (u'Travel', u'http://feeds.foxnews.com/foxnews/internal/travel/mixed'),
    ]
--------------------------

As an example, here is the link to one article picture (taken from the web view of the article in the web RSS section) that was not included in the MOBI output:

http://a57.foxnews.com/images.foxnew....jpg?ve=1&tl=1

Could you kindly look at this for all pictures?

NB: I was able to change the built-in recipe so that articles include the author, publish date, and publisher just after the heading. Thanks.

07-07-2017, 09:12 PM	#1
jma1 Connoisseur Posts: 85 Karma: 10 Join Date: Dec 2015 Device: Kindle	Fox News Feed Pictures I have tried to modify the current built-in recipe to return all pictures in articles, it gets some but not all. Here is the recipe I have - ------------------------------------------ #!/usr/bin/env python2 # vim:fileencoding=utf-8 from __future__ import unicode_literals, division, absolute_import, print_function from calibre.web.feeds.news import BasicNewsRecipe # from telegraph.uk not changed def classes(classes): q = frozenset(classes.split(' ')) return dict( attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)} ) # end from telegraph uk class FoxNews(BasicNewsRecipe): title = 'FOX News' __author__ = 'Darko Miletic' description = 'Breaking News from FOX' publisher = 'FOXNews.com' category = 'news, breaking news, latest news, current news, world news, national news, USA' oldest_article = 2 max_articles_per_feed = 200 no_stylesheets = True encoding = 'utf8' use_embedded_content = False compress_news_images = True # compress_news_images_auto_size = 8 language = 'en' publication_type = 'newsportal' remove_empty_feeds = True extra_css = """ body{font-family: Arial,sans-serif } .caption{font-size: x-small} .author,.dateline{font-size: small} """ conversion_options = { 'comment' : description , 'tags' : category , 'publisher' : publisher , 'language' : language } remove_attributes = ['xmlns','lang'] ignore_duplicate_articles = {'title', 'url'} keep_only_tags = [ #classes command below based on from telegraph uk dict(itemprop=['headline', 'articleBody']), dict(name='h1'), classes('byline author'), classes('date source'), dict(attrs={'class':'article-info article-body'.split()}), ] feeds = [ (u'Latest Headlines', u'http://feeds.foxnews.com/foxnews/latest' ) ,(u'National' , u'http://feeds.foxnews.com/foxnews/national' ) ,(u'World' , u'http://feeds.foxnews.com/foxnews/world') ,(u'Politics' , u'http://feeds.foxnews.com/foxnews/politics') ,(u'Opinion' , 
u'http://feeds.foxnews.com/foxnews/opinion') ,(u'Science' , u'http://feeds.foxnews.com/foxnews/science') ,(u'Technology' , u'http://feeds.foxnews.com/foxnews/tech') ,(u'Health' , u'http://feeds.foxnews.com/foxnews/health') ,(u'Lifestyle' , u'http://feeds.foxnews.com/foxnews/section/lifestyle') ,(u'Travel' , u'http://feeds.foxnews.com/foxnews/internal/travel/mixed') ] -------------------------- As example, here is the link location of one article picture (from web view of the article in the web rss section) which was not included in the mobi output - http://a57.foxnews.com/images.foxnew....jpg?ve=1&tl=1 Could you kindly look at this for all pictures? NB, I was able to change built in to get the articles to include the author, publish date, and publisher just after the heading. Thanks.