Old 04-29-2011, 11:56 AM   #12
limnoski
Hi guys,

I have to say thanks to Nudge for getting this working in the first place. I managed to get a couple of downloads before they changed the website!
So I have tried again to get this recipe working. I thought it might be a simple case of changing the tags, but it looks like it is not so simple... (there is a rough sketch after the first code block below for one way to dig out what the new tags actually are).

Here is how far I have got with Nudge's recipe:
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re

class Cracked(BasicNewsRecipe):
    title                 = u'Cracked.com'
    __author__            = u'Nudgenudge'
    language              = 'en'
    description           = "America's Only Humor and Video Site, since 1958"
    publisher             = 'Cracked'
    category              = 'comedy, lists'
    oldest_article        = 2
    delay                 = 10
    max_articles_per_feed = 2
    no_stylesheets        = True
    encoding              = 'cp1252'
    remove_javascript     = True
    use_embedded_content  = False
    INDEX                 = u'http://www.cracked.com'
    extra_css             = """
                                .pageheader_type{font-size: x-large; font-weight: bold; color: #828D74}
                                .pageheader_title{font-size: xx-large; color: #394128}
                                .pageheader_byline{font-size: small; font-weight: bold; color: #394128}
                                .score_bg {display: inline; width: 100%; margin-bottom: 2em}
                                .score_column_1{ padding-left: 10px; font-size: small; width: 50%}
                                .score_column_2{ padding-left: 10px; font-size: small; width: 50%}
                                .score_column_3{ padding-left: 10px; font-size: small; width: 50%}
                                .score_header{font-size: large; color: #50544A}
                                .bodytext{display: block}
                                body{font-family: Helvetica,Arial,sans-serif}
                            """

    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        , 'linearize_tables' : True
                        }

    keep_only_tags    =  [
                        dict(name='section', attrs={'class':['body']})
                        ]

    feeds = [(u'Articles', u'http://feeds.feedburner.com/CrackedRSS')]

    def get_article_url(self, article):
        return article.get('guid',  None)

    def cleanup_page(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        for div_to_remove in soup.findAll('div', attrs={'id':['persistent-share','inline-share-buttons']}):
            div_to_remove.extract()
        for div_to_remove in soup.findAll('div', attrs={'class':['FacebookLike','shareBar']}):
            div_to_remove.extract()
        for nav_to_remove in soup.findAll('nav', attrs={'class':re.compile("PaginationContent")}):
            nav_to_remove.extract()
        for image in soup.findAll('img', attrs={'alt': 'article image'}):
            image.extract()

    def append_page(self, soup, appendtag, position):
        pager = soup.find('a',attrs={'class':'next'})
        if pager:
            nexturl = self.INDEX + pager['href']
            soup2 = self.index_to_soup(nexturl)
            # BeautifulSoup lowercases tag names when parsing, so match 'article', not 'Article'
            texttag = soup2.find('article', attrs={'class':re.compile("Article Module")})
            if texttag is None:
                # if the class no longer matches the current markup, don't crash; just tidy what we have
                self.cleanup_page(appendtag)
                return
            newpos = len(texttag.contents)
            self.append_page(soup2,texttag,newpos)
            texttag.extract()
            self.cleanup_page(appendtag)
            appendtag.insert(position,texttag)
        else:
            self.cleanup_page(appendtag)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)
        return self.adeify_images(soup)
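
If anyone wants to work out what the new tags should be, a quick standalone dump along these lines should show what the current pages actually use. This is only a sketch: it borrows calibre's bundled BeautifulSoup (a plain BeautifulSoup install would do just as well), fetches with urllib2, and the URL is just an example page from the recipe; run it with calibre-debug or plain python.
Code:
# rough sketch: print every tag on a live page that carries a class attribute,
# so the keep_only_tags / append_page selectors can be updated by hand
import urllib2
from calibre.ebooks.BeautifulSoup import BeautifulSoup

url = 'http://www.cracked.com/funny-articles.html'  # example page only
html = urllib2.urlopen(url).read()
soup = BeautifulSoup(html)
for tag in soup.findAll(True, attrs={'class': True}):
    print tag.name, tag.get('class')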
I have also tried another variation of the recipe that scrapes the Cracked website directly instead of using the RSS feed, but it has the same problem. Here it is below if anyone is interested; the RSS-based version is still much neater:

Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre import strftime
import re

class Cracked(BasicNewsRecipe):
    title                 = u'Cracked.com'
    __author__            = u'Nudgenudge'
    language              = 'en'
    description           = "America's Only Humor and Video Site, since 1958"
    publisher             = 'Cracked'
    category              = 'comedy, lists'
    oldest_article        = 2
    delay                 = 10
    max_articles_per_feed = 2
    no_stylesheets        = True
    encoding              = 'cp1252'
    remove_javascript     = True
    use_embedded_content  = False
    INDEX                 = u'http://www.cracked.com'
    extra_css             = """
                                .pageheader_type{font-size: x-large; font-weight: bold; color: #828D74}
                                .pageheader_title{font-size: xx-large; color: #394128}
                                .pageheader_byline{font-size: small; font-weight: bold; color: #394128}
                                .score_bg {display: inline; width: 100%; margin-bottom: 2em}
                                .score_column_1{ padding-left: 10px; font-size: small; width: 50%}
                                .score_column_2{ padding-left: 10px; font-size: small; width: 50%}
                                .score_column_3{ padding-left: 10px; font-size: small; width: 50%}
                                .score_header{font-size: large; color: #50544A}
                                .bodytext{display: block}
                                body{font-family: Helvetica,Arial,sans-serif}
                            """

    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        , 'linearize_tables' : True
                        }

    keep_only_tags    =  [
                        dict(name='section', attrs={'class':['body']})
                        ]

    def parse_index(self):
        articles = []
        rawc = self.index_to_soup('http://www.cracked.com/funny-articles.html',True)
        soup = BeautifulSoup(rawc,fromEncoding=self.encoding)
        

        for item in soup.findAll(attrs={'class':'content'}):
            description = ''
            title_prefix = ''
            feed_link = item.find('a',href=True)
            descript = item.find('a')
            if descript:
               description = self.tag_to_string(descript)
            if feed_link:
                url   = feed_link['href']
                title = title_prefix + self.tag_to_string(feed_link)
                date  = strftime(self.timefmt)
                articles.append({
                                  'title'      :title
                                 ,'date'       :date
                                 ,'url'        :url
                                 ,'description':description
                                })
        return [(self.tag_to_string(soup.find('title')), articles)]

    def get_article_url(self, article):
        return article.get('guid',  None)

    def cleanup_page(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        for alink in soup.findAll('a'):
            if alink.string is not None:
                tstr = alink.string
                alink.replaceWith(tstr)
        for div_to_remove in soup.findAll('div', attrs={'id':['persistent-share','inline-share-buttons']}):
            div_to_remove.extract()
        for div_to_remove in soup.findAll('div', attrs={'class':['FacebookLike','shareBar']}):
            div_to_remove.extract()
        for nav_to_remove in soup.findAll('nav', attrs={'class':re.compile("PaginationContent")}):
            nav_to_remove.extract()
        for image in soup.findAll('img', attrs={'alt': 'article image'}):
            image.extract()

    def append_page(self, soup, appendtag, position):
        pager = soup.find('a',attrs={'class':'next'})
        if pager:
            nexturl = self.INDEX + pager['href']
            soup2 = self.index_to_soup(nexturl)
            # BeautifulSoup lowercases tag names when parsing, so match 'article', not 'Article'
            texttag = soup2.find('article', attrs={'class':re.compile("Article Module")})
            if texttag is None:
                # if the class no longer matches the current markup, don't crash; just tidy what we have
                self.cleanup_page(appendtag)
                return
            newpos = len(texttag.contents)
            self.append_page(soup2,texttag,newpos)
            texttag.extract()
            self.cleanup_page(appendtag)
            appendtag.insert(position,texttag)
        else:
            self.cleanup_page(appendtag)

    def preprocess_html(self, soup):
        self.append_page(soup, soup.body, 3)
        return self.adeify_images(soup)
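
One more thing that saves a lot of clicking while fiddling with this: the recipe can be run from the command line in calibre's test mode, which only grabs a couple of articles and prints plenty of debug output. Something along these lines should do it (cracked.recipe being whatever filename you saved the recipe under; the recipe tutorial in the calibre manual has the exact options):
Code:
ebook-convert cracked.recipe cracked.epub --test -vv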
