View Single Post
Old 02-25-2011, 08:54 PM   #4
Starson17
Wizard
Starson17 can program the VCR without an owner's manual.
 
Posts: 4,004
Karma: 177841
Join Date: Dec 2009
Device: WinMo: IPAQ; Android: HTC HD2, Archos 7o; Java:Gravity T
Quote:
Originally Posted by Finbar127 View Post
Do you know where I would define Feed in the recipe? Also Could you point me to a recipe where this particular chunk of code is used?
from calibre.web.feeds import Feed
Reader's Digest follows:
Spoiler:
Code:
#!/usr/bin/env  python
__license__   = 'GPL v3'
'''
'''
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Feed


class ReadersDigest(BasicNewsRecipe):
    """Calibre recipe for Reader's Digest (rd.com).

    parse_index() combines articles scraped from section pages of the site
    with articles taken from the RSS feeds listed in ``feeds``.  parse_rss()
    additionally pulls any article whose title mentions "recipe" out of its
    feed and gathers them into a virtual 'Recipes' feed.
    """

    title       = 'Readers Digest'
    __author__  = 'BrianG'
    language    = 'en'
    description = 'Readers Digest Feeds'

    no_stylesheets        = True
    use_embedded_content  = False
    oldest_article        = 60
    max_articles_per_feed = 200
    remove_javascript     = True

    extra_css      = ''' h1 {font-family:georgia,serif;color:#000000;}
                        .mainHd{font-family:georgia,serif;color:#000000;}
                         h2 {font-family:Arial,Sans-serif;}
                        .name{font-family:Arial,Sans-serif; font-size:x-small;font-weight:bold; }
                        .date{font-family:Arial,Sans-serif; font-size:x-small ;color:#999999;}
                        .byline{font-family:Arial,Sans-serif; font-size:x-small ;}
                        .photoBkt{ font-size:x-small ;}
                        .vertPhoto{font-size:x-small ;}
                        .credits{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
                        .credit{font-family:Arial,Sans-serif; font-size:x-small ;color:gray;}
                        .artTxt{font-family:georgia,serif;}
                        .caption{font-family:georgia,serif; font-size:x-small;color:#333333;}
                        .credit{font-family:georgia,serif; font-size:x-small;color:#999999;}
                        a:link{color:#CC0000;}
                        .breadcrumb{font-family:Arial,Sans-serif;font-size:x-small;}
                        '''

    # Ads, navigation chrome, and print-page artifacts stripped from articles.
    remove_tags = [
        dict(name='h4', attrs={'class':'close'}),
        dict(name='div', attrs={'class':'fromLine'}),
        dict(name='img', attrs={'class':'colorTag'}),
        dict(name='div', attrs={'id':'sponsorArticleHeader'}),
        dict(name='div', attrs={'class':'horizontalAd'}),
        dict(name='div', attrs={'id':'imageCounterLeft'}),
        dict(name='div', attrs={'id':'commentsPrint'})
        ]

    feeds = [
            ('New in RD', 'http://feeds.rd.com/ReadersDigest'),
            ('Jokes', 'http://feeds.rd.com/ReadersDigestJokes'),
            ('Cartoons', 'http://feeds.rd.com/ReadersDigestCartoons'),
            ('Blogs','http://feeds.rd.com/ReadersDigestBlogs')
        ]

    cover_url = 'http://www.rd.com/images/logo-main-rd.gif'

#-------------------------------------------------------------------------------------------------

    def print_version(self, url):
        """Return the printer-friendly URL for an article or blog-post URL.

        URLs that match neither pattern are returned unchanged.
        """
        if url.find('/article') > 0:
            # Article URLs embed an identity number between '/article' and
            # the trailing '.html?' (minus a 4-character suffix); splice it
            # into the print-content URL.
            ident = url[url.find('/article')+8:url.find('.html?')-4]
            url = 'http://www.rd.com/content/printContent.do?contentId=' + ident

        elif url.find('/post') > 0:
            # Blog posts carry no id in the URL, so fetch the page itself and
            # read the print link out of its 'printBlock' list.
            soup = self.index_to_soup(url)
            newsoup = soup.find('ul',attrs={'class':'printBlock'})
            url = 'http://www.rd.com' + newsoup('a')[0]['href']
            url = url[0:url.find('&Keep')]

        return url

#-------------------------------------------------------------------------------------------------

    def parse_index(self):
        """Build the section list from scraped site pages plus the RSS feeds.

        Returns a list of (section_title, [article_dict, ...]) tuples in the
        form BasicNewsRecipe expects from parse_index.
        """
        pages = [
                ('Your America','http://www.rd.com/your-america-inspiring-people-and-stories', 'channelLeftContainer',{'class':'moreLeft'}),
                # useless recipes ('Living Healthy','http://www.rd.com/living-healthy', 'channelLeftContainer',{'class':'moreLeft'}),
                ('Advice and Know-How','http://www.rd.com/advice-and-know-how', 'channelLeftContainer',{'class':'moreLeft'})
            ]

        feeds = []
        for section, url, divider, attrList in pages:
            feeds.append((section, self.page_parse(url, divider, attrList)))

        # parse_rss returns Feed objects; convert each feed's articles into
        # the plain-dict form this method must return.
        for feed in self.parse_rss():
            newArticles = [
                {
                    'title'       : article.title,
                    'url'         : article.url,
                    'date'        : article.date,
                    'description' : article.text_summary,
                }
                for article in feed.articles
            ]

            # 'New in RD' and 'Blogs' should lead the final feed list.
            if feed.title == 'New in RD':
                feeds.insert(0, (feed.title, newArticles))
            elif feed.title == 'Blogs':
                feeds.insert(1, (feed.title, newArticles))
            else:
                feeds.append((feed.title, newArticles))

        return feeds

#-------------------------------------------------------------------------------------------------

    def page_parse(self, mainurl, divider, attrList):
        """Scrape one section page and return article dicts for every element
        matching ``attrList``.  (``divider`` is accepted for signature
        compatibility but is not used in the lookup.)
        """
        articles = []
        mainsoup = self.index_to_soup(mainurl)
        for item in mainsoup.findAll(attrs=attrList):
            articles.append({
                'title'       : item('img')[0]['alt'],
                'url'         : 'http://www.rd.com' + item('a')[0]['href'],
                'date'        : '',
                'description' : '',
            })
        return articles

#-------------------------------------------------------------------------------------------------

    def parse_rss (self):
        """Run the standard RSS parse, then move every article whose title
        contains 'recipe' (case-insensitive) into a new virtual 'Recipes'
        feed appended to the list.
        """
        # Do the "official" parse_feeds first.
        feeds = BasicNewsRecipe.parse_feeds(self)

        # Partition each feed's articles in one pass: recipes are collected,
        # everything else is kept in place (replaces the old two-pass
        # delList/index-slice deletion).
        recipeArticles = []
        for curfeed in feeds:
            kept = []
            for curarticle in curfeed.articles:
                if 'RECIPE' in curarticle.title.upper():
                    recipeArticles.append(curarticle)
                else:
                    kept.append(curarticle)
            curfeed.articles[:] = kept

        # If any recipes were found, wrap them in a new Feed object and
        # append it to the "official" list of feeds.
        if recipeArticles:
            pfeed = Feed()
            pfeed.title = 'Recipes'
            # Bug fix: was 'pfeed.descrition', which silently set a
            # misspelled attribute and left the feed undescribed.
            pfeed.description = 'Recipe Feed (Virtual)'
            pfeed.image_url  = None
            pfeed.oldest_article = 30
            pfeed.id_counter = len(recipeArticles)
            pfeed.articles = recipeArticles[:]
            feeds.append(pfeed)

        return feeds

Last edited by Starson17; 02-25-2011 at 09:21 PM.
Starson17 is offline   Reply With Quote