View Single Post
Old 09-25-2010, 10:28 AM   #1
Starson17
Wizard
Starson17 can program the VCR without an owner's manual.
 
Posts: 4,004
Karma: 177841
Join Date: Dec 2009
Device: WinMo: IPAQ; Android: HTC HD2, Archos 7o; Java:Gravity T
Scientific American

The current Scientific American recipe is broken. The site changed. I've completely rewritten it:
Spoiler:
Code:
#!/usr/bin/env  python
__license__   = 'GPL v3'

import re
from calibre.web.feeds.news import BasicNewsRecipe

class ScientificAmerican(BasicNewsRecipe):
    """Calibre recipe that builds an issue of Scientific American.

    The article list is scraped from the magazine index page in
    ``parse_index``: links marked ``title="Feature"`` in the primary column
    become the "Features" feed, and the list items of the secondary column
    are grouped into one feed per department heading.
    """

    title                 = u'Scientific American'
    description           = u'Popular Science. Monthly magazine.'
    category              = 'science'
    __author__            = 'Starson17'
    no_stylesheets        = True
    use_embedded_content  = False
    language              = 'en'
    publisher             = 'Nature Publishing Group'
    remove_empty_feeds    = True
    remove_javascript     = True
    oldest_article        = 30
    max_articles_per_feed = 100

    conversion_options = {
        'linearize_tables': True,
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Only these parts of an article page are kept in the output.
    keep_only_tags = [
        dict(name='h2', attrs={'class': 'articleTitle'}),
        dict(name='p', attrs={'id': 'articleDek'}),
        dict(name='p', attrs={'class': 'articleInfo'}),
        dict(name='div', attrs={'id': ['articleContent']}),
        dict(name='img', attrs={'src': re.compile(r'/media/inline/blog/Image/', re.DOTALL|re.IGNORECASE)}),
    ]

    remove_tags = [dict(name='a', attrs={'class': 'tinyCommentCount'})]

    extra_css = '''
                p{font-weight: normal; font-size:small}
                li{font-weight: normal; font-size:small}
                .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
                h2{font-size:large; font-family:Arial,Helvetica,sans-serif;}
                h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;}
                '''

    @staticmethod
    def _href_of(tag):
        """Return the ``href`` of *tag*, or ``''`` if the tag or attribute is missing."""
        if tag is None:
            return ''
        return tag.get('href') or ''

    def _text_of(self, tag):
        """Return the text of *tag* via ``tag_to_string``, or ``''`` when *tag* is None."""
        if tag is None:
            return ''
        return self.tag_to_string(tag)

    def parse_index(self):
        """Scrape the magazine index page into a list of feeds.

        Also sets ``self.timefmt`` from the ``articleDek`` paragraph and
        ``self.cover_url`` from the cover image when those are present.

        Returns:
            A list of ``(feed_title, articles)`` tuples, where each article
            is a dict with ``url``, ``title``, ``date`` and ``description``
            keys.
        """
        soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/')

        issuetag = soup.find('p', attrs={'id': 'articleDek'})
        if issuetag is not None:
            self.timefmt = ' [%s]'%(self.tag_to_string(issuetag))

        img = soup.find('img', alt='Scientific American Magazine', src=True)
        if img is not None:
            self.cover_url = img['src']

        features, feeds = [], []
        primary = soup.find(attrs={'class': 'primaryCol'})
        # A missing column means the page layout changed; skip it rather
        # than crash so whatever can still be parsed is returned.
        feature_links = [] if primary is None else primary.findAll('a', attrs={'title': 'Feature'})
        for a in feature_links:
            url = self._href_of(a)
            if not url:
                continue
            # The summary ("dek") sits two levels above the link.
            container = a.parent.parent if a.parent is not None else None
            dek = None if container is None else container.find(attrs={'class': 'dek'})
            features.append({
                'url': url,
                'title': self.tag_to_string(a),
                'date': '',
                'description': self._text_of(dek),
            })
        feeds.append(('Features', features))

        department = []
        # Fallback title for articles listed before any department heading,
        # so a feed is never emitted with a None title.
        title = u'Departments'
        secondary = soup.find(attrs={'class': 'secondaryCol'})
        items = [] if secondary is None else secondary.findAll('li')
        for li in items:
            if 'department.cfm' in self._href_of(li.a):
                # A department link starts a new feed; flush the previous one.
                if department:
                    feeds.append((title, department))
                title = self.tag_to_string(li.a)
                department = []
            heading = li.h3
            link = None if heading is None else heading.a
            url = self._href_of(link)
            if 'article.cfm' in url:
                department.append({
                    'url': url,
                    'title': self.tag_to_string(link),
                    'date': '',
                    'description': self._text_of(li.p),
                })
        if department:
            feeds.append((title, department))
        return feeds

    def postprocess_html(self, soup, first_fetch):
        """Replace links to ``topic.cfm`` pages with their plain text.

        Anchors without an ``href`` are left untouched. Anchors that wrap
        nested markup (where ``.string`` is None) are replaced by their
        flattened text instead of failing.
        """
        for item in soup.findAll('a'):
            if 'topic.cfm' not in self._href_of(item):
                continue
            text = item.string
            if text is None:
                text = self.tag_to_string(item)
            item.replaceWith(text)
        return soup

Last edited by Starson17; 09-25-2010 at 11:13 AM.
Starson17 is offline   Reply With Quote