View Single Post
Old 02-15-2010, 11:50 AM   #1437
Starson17
Wizard
Starson17 can program the VCR without an owner's manual.
 
Posts: 4,004
Karma: 177841
Join Date: Dec 2009
Device: WinMo: IPAQ; Android: HTC HD2, Archos 7o; Java:Gravity T
Here is another Winter Olympics recipe. This one has nothing in it but photos and the headers/titles for the photos, so it can be quite large. Even with only 3 of the feeds turned on, this ebook is 25 MB in size.

Spoiler:
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Starson17'
'''
www.nbcolympics.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString
import re

class OlympicsP_2010(BasicNewsRecipe):
    """Photo-only calibre recipe for the NBC Olympics 2010 slideshow feeds.

    The downloaded article page does not contain the photos themselves.
    ``preprocess_html`` looks inside the page's ``contentRow `` div for the
    URL of a gallery ``.xml`` file, downloads it, and appends one
    image/title/comment group per ``<photopath>`` entry next to the
    gallery header.
    """

    title = u'Olympics Photos 2010'
    __author__ = 'Starson17'
    description = 'Olympics Photos 2010'
    cover_url = 'http://www.digitaljournal.com/img/1/1/2/1/i/4/7/6/o/WinterOlympics2010-logo.jpg'
    publisher = 'Olympics 2010'
    tags = 'Olympics news'
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    # NOTE(review): presumably left on so the gallery XML link that
    # preprocess_html searches for survives in the page markup -- confirm.
    remove_javascript = False
    # recursions = 3
    # oldest_article = 2
    max_articles_per_feed = 20

    keep_only_tags = [
        dict(name='div', attrs={'class': ['galleryHeaderTop']}),
        dict(name='div', attrs={'id': ['BodyContentMM']}),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['floatBoxLeft Box100', 'DateUtilities', 'ImageDiv', 'NewsPhotos NewsPhotos_ieFix ', 'NewsPhotoHeaderSection']}),
    ]

    # RSS feeds are at: http://www.nbcolympics.com/rss/index.html
    # Commented-out entries are optional feeds; every enabled photo feed
    # makes the resulting ebook considerably larger.
    feeds = [
        ('NBCOlympics.com - Most Popular - Photos', 'http://www.nbcolympics.com/rss/photos/mostpopular.xml'),
        ('NBCOlympics.com - Editorial Picks - Photos', 'http://www.nbcolympics.com/rss/photos/editorialpicks.xml'),
        ('NBCOlympics.com - Latest - Photos', 'http://www.nbcolympics.com/rss/photos/latestslideshows.xml'),
        #('NBCOlympics.com - Destination Vancouver - Photos', 'http://www.nbcolympics.com/rss/specialty/destination-vancouve/photos/index.xml'),
        #('NBCOlympics.com - Fast and Furious - Photos', 'http://www.nbcolympics.com/rss/specialty/fast-fearless/photos/index.xml'),
        #('NBCOlympics.com - Golden Moments - Photos', 'http://www.nbcolympics.com/rss/specialty/golden-moments/photos/index.xml'),
    ]

    def preprocess_html(self, soup):
        """Expand the gallery placeholder of an article into real photos.

        :param soup: parsed article page; it is modified in place.
        :return: the same ``soup`` object.

        If the page has no ``contentRow `` div it is returned untouched.
        Otherwise the div is always removed from the page, and when it
        references a gallery XML file (and the page has a
        ``galleryHeaderTop`` div to attach to) the gallery's photos are
        inserted first.
        """
        script_item = soup.find('div', attrs={'class': 'contentRow '})
        if script_item is None:
            return soup

        header = soup.find('div', attrs={'class': 'galleryHeaderTop'})
        m_flashlink = re.findall(r'http:.*\.xml', str(script_item.contents))
        # Without a header there is no place to attach the photos, so the
        # gallery download is skipped instead of failing on header.parent.
        if m_flashlink and header is not None:
            print('m_flashlink[0] is: ', m_flashlink[0])
            self._insert_gallery(soup, header.parent, m_flashlink[0])

        # The placeholder is dropped whether or not a gallery was found;
        # this happens last so the insert positions above are unaffected.
        script_item.extract()
        return soup

    def _insert_gallery(self, soup, container, gallery_url):
        """Download the gallery XML at gallery_url and add its photos to container."""
        rawc = self.index_to_soup(gallery_url, True)
        soup2 = BeautifulSoup(rawc, fromEncoding=self.encoding)

        gallery = soup2.gallery
        gallery_comment = gallery.comment if gallery is not None else None
        gallery_comment_tag = self._text_tag(soup, 'h2', gallery_comment)
        if gallery_comment_tag is not None:
            container.insert(1, gallery_comment_tag)

        for photopath in soup2.findAll('photopath'):
            if photopath.string is None:
                # No usable image URL for this entry.
                continue
            photo = photopath.parent
            imgtag = Tag(soup, 'img', [('src', str(photopath.string))])
            photo_alt = photo.alt
            if photo_alt is not None and photo_alt.string is not None:
                imgtag['alt'] = NavigableString(photo_alt.string)

            # Order inside the group: image, then title, then comment.
            # Missing titles/comments are simply left out.
            divtag = Tag(soup, 'div')
            position = 0
            for child in (
                imgtag,
                self._text_tag(soup, 'h3', photo.title),
                self._text_tag(soup, 'p', photo.comment),
            ):
                if child is not None:
                    divtag.insert(position, child)
                    position += 1

            # Inserting at len(container) appends after existing children.
            container.insert(len(container), divtag)

    @staticmethod
    def _text_tag(soup, name, source):
        """Return a new ``name`` tag holding source's text, or None if source has no text."""
        if source is None or source.string is None:
            return None
        new_tag = Tag(soup, name)
        new_tag.insert(0, NavigableString(source.string))
        return new_tag

For convenience, here is the text-based recipe for the Olympics.
Spoiler:
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2010, Starson17'
'''
www.nbcolympics.com
'''
from calibre.web.feeds.news import BasicNewsRecipe

class Olympics_2010(BasicNewsRecipe):
    """Text-based calibre recipe for the NBC Olympics 2010 news feeds.

    This class is configuration only: it selects the article containers to
    keep, the page furniture to strip, the RSS feeds to download and the
    CSS applied to the result.
    """

    title = u'NBC Olympics 2010'
    __author__ = 'Starson17'
    description = 'Olympics 2010'
    cover_url = 'http://www.digitaljournal.com/img/1/1/2/1/i/4/7/6/o/WinterOlympics2010-logo.jpg'
    publisher = 'Olympics 2010'
    tags = 'Olympics news'
    language = 'en'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    # recursions = 3
    oldest_article = 7
    max_articles_per_feed = 10

    keep_only_tags = [
        dict(name='div', attrs={'class': ['Article ', 'ArticleGallery']}),
    ]

    remove_tags = [
        dict(name='div', attrs={'id': ['RelatedTagsBox', 'ShareBox']}),
        dict(name='div', attrs={'class': ['DateUtilities', 'PhotoGallery BoxRight', 'Frame', 'ToolBox']}),
    ]

    # RSS feeds are at: http://www.nbcolympics.com/rss/index.html
    # Commented-out entries are optional feeds; uncomment to enable them.
    feeds = [
        ('NBCOlympics.com - News', 'http://www.nbcolympics.com/rss/newscenter/mostpopular.xml'),
        ('NBCOlympics.com - News - Top Stories', 'http://www.nbcolympics.com/rss/newscenter/topstories.xml'),
        ('NBCOlympics.com - News - Latest Headlines', 'http://www.nbcolympics.com/rss/newscenter/latestnews.xml'),
        # ('NBCOlympics.com - Photos', 'http://www.nbcolympics.com/rss/photos/mostpopular.xml'),
        # ('NBCOlympics.com - Photos - Editorial Picks', 'http://www.nbcolympics.com/rss/photos/editorialpicks.xml'),
        # ('NBCOlympics.com - Photos - Latest Slideshows', 'http://www.nbcolympics.com/rss/photos/latestslideshows.xml'),
        ('NBCOlympics.com - Team USA - Latest news', 'http://www.nbcolympics.com/rss/countries/team-usa/index.xml'),
        # ('NBCOlympics.com - Team USA - Latest Slideshows', 'http://www.nbcolympics.com/rss/countries/team-usa/photos/index.xml'),
        # ('NBCOlympics.com - Team USA - Video', 'http://www.nbcolympics.com/rss/countries/team-usa/video/index.xml'),
        # ('NBCOlympics.com - Alpine Skiing - Most Popular News', 'http://www.nbcolympics.com/rss/sport=AS/mostpopular.xml'),
        # ('NBCOlympics.com - Alpine Skiing - Top News', 'http://www.nbcolympics.com/rss/sport=AS/topnews.xml'),
        ('NBCOlympics.com - Alpine Skiing - Latest News', 'http://www.nbcolympics.com/rss/sport=AS/latestnews.xml'),
        # ('NBCOlympics.com - Biathlon - Most Popular News', 'http://www.nbcolympics.com/rss/sport=BT/mostpopular.xml'),
        # ('NBCOlympics.com - Biathlon - Top News', 'http://www.nbcolympics.com/rss/sport=BT/topnews.xml'),
        ('NBCOlympics.com - Biathlon - Latest News', 'http://www.nbcolympics.com/rss/sport=BT/latestnews.xml'),
        # ('NBCOlympics.com - Bobsled - Most Popular News', 'http://www.nbcolympics.com/rss/sport=BS/mostpopular.xml'),
        # ('NBCOlympics.com - Bobsled - Top News', 'http://www.nbcolympics.com/rss/sport=BS/topnews.xml'),
        ('NBCOlympics.com - Bobsled - Latest News', 'http://www.nbcolympics.com/rss/sport=BS/latestnews.xml'),
        # ('NBCOlympics.com - Cross-Country - Most Popular News', 'http://www.nbcolympics.com/rss/sport=CC/mostpopular.xml'),
        # ('NBCOlympics.com - Cross-Country - Top News', 'http://www.nbcolympics.com/rss/sport=CC/topnews.xml'),
        ('NBCOlympics.com - Cross-Country - Latest News', 'http://www.nbcolympics.com/rss/sport=CC/latestnews.xml'),
        # ('NBCOlympics.com - Curling - Most Popular News', 'http://www.nbcolympics.com/rss/sport=CU/mostpopular.xml'),
        # ('NBCOlympics.com - Curling - Top News', 'http://www.nbcolympics.com/rss/sport=CU/topnews.xml'),
        ('NBCOlympics.com - Curling - Latest News', 'http://www.nbcolympics.com/rss/sport=CU/latestnews.xml'),
        # ('NBCOlympics.com - Figure Skating - Most Popular News', 'http://www.nbcolympics.com/rss/sport=FS/mostpopular.xml'),
        # ('NBCOlympics.com - Figure Skating - Top News', 'http://www.nbcolympics.com/rss/sport=FS/topnews.xml'),
        ('NBCOlympics.com - Figure Skating - Latest News', 'http://www.nbcolympics.com/rss/sport=FS/latestnews.xml'),
        # ('NBCOlympics.com - Freestyle Skiing - Most Popular News', 'http://www.nbcolympics.com/rss/sport=FR/mostpopular.xml'),
        # ('NBCOlympics.com - Freestyle Skiing - Top News', 'http://www.nbcolympics.com/rss/sport=FR/topnews.xml'),
        ('NBCOlympics.com - Freestyle Skiing - Latest News', 'http://www.nbcolympics.com/rss/sport=FR/latestnews.xml'),
        # ('NBCOlympics.com - Hockey - Most Popular News', 'http://www.nbcolympics.com/rss/sport=IH/mostpopular.xml'),
        # ('NBCOlympics.com - Hockey - Top News', 'http://www.nbcolympics.com/rss/sport=IH/topnews.xml'),
        ('NBCOlympics.com - Hockey - Latest News', 'http://www.nbcolympics.com/rss/sport=IH/latestnews.xml'),
        # ('NBCOlympics.com - Luge - Most Popular News', 'http://www.nbcolympics.com/rss/sport=LG/mostpopular.xml'),
        # ('NBCOlympics.com - Luge - Top News', 'http://www.nbcolympics.com/rss/sport=LG/topnews.xml'),
        ('NBCOlympics.com - Luge - Latest News', 'http://www.nbcolympics.com/rss/sport=LG/latestnews.xml'),
        # ('NBCOlympics.com - Nordic Combined - Most Popular News', 'http://www.nbcolympics.com/rss/sport=NC/mostpopular.xml'),
        # ('NBCOlympics.com - Nordic Combined - Top News', 'http://www.nbcolympics.com/rss/sport=NC/topnews.xml'),
        ('NBCOlympics.com - Nordic Combined - Latest News', 'http://www.nbcolympics.com/rss/sport=NC/latestnews.xml'),
        # ('NBCOlympics.com - Short Track - Most Popular News', 'http://www.nbcolympics.com/rss/sport=ST/mostpopular.xml'),
        # ('NBCOlympics.com - Short Track - Top News', 'http://www.nbcolympics.com/rss/sport=ST/topnews.xml'),
        ('NBCOlympics.com - Short Track - Latest News', 'http://www.nbcolympics.com/rss/sport=ST/latestnews.xml'),
        # ('NBCOlympics.com - Skeleton - Most Popular News', 'http://www.nbcolympics.com/rss/sport=SN/mostpopular.xml'),
        # ('NBCOlympics.com - Skeleton - Top News', 'http://www.nbcolympics.com/rss/sport=SN/topnews.xml'),
        ('NBCOlympics.com - Skeleton - Latest News', 'http://www.nbcolympics.com/rss/sport=SN/latestnews.xml'),
        # ('NBCOlympics.com - Ski Jumping - Most Popular News', 'http://www.nbcolympics.com/rss/sport=SJ/mostpopular.xml'),
        # ('NBCOlympics.com - Ski Jumping - Top News', 'http://www.nbcolympics.com/rss/sport=SJ/topnews.xml'),
        ('NBCOlympics.com - Ski Jumping - Latest News', 'http://www.nbcolympics.com/rss/sport=SJ/latestnews.xml'),
        # ('NBCOlympics.com - Snowboarding - Most Popular News', 'http://www.nbcolympics.com/rss/sport=SB/mostpopular.xml'),
        # ('NBCOlympics.com - Snowboarding - Top News', 'http://www.nbcolympics.com/rss/sport=SB/topnews.xml'),
        ('NBCOlympics.com - Snowboarding - Latest News', 'http://www.nbcolympics.com/rss/sport=SB/latestnews.xml'),
        # NOTE(review): these entries previously pointed at sport=AS, which
        # is the Alpine Skiing feed listed above; SS is the speed-skating
        # discipline code -- confirm against the feed index if it is still
        # reachable.
        # ('NBCOlympics.com - Speed Skating - Most Popular News', 'http://www.nbcolympics.com/rss/sport=SS/mostpopular.xml'),
        # ('NBCOlympics.com - Speed Skating - Top News', 'http://www.nbcolympics.com/rss/sport=SS/topnews.xml'),
        ('NBCOlympics.com - Speed Skating - Latest News', 'http://www.nbcolympics.com/rss/sport=SS/latestnews.xml'),
    ]

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h2{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
        '''
Starson17 is offline