View Single Post
Old 06-07-2012, 08:59 AM   #4
atordo
Connoisseur
atordo is a splendid one to beholdatordo is a splendid one to beholdatordo is a splendid one to beholdatordo is a splendid one to beholdatordo is a splendid one to beholdatordo is a splendid one to beholdatordo is a splendid one to beholdatordo is a splendid one to beholdatordo is a splendid one to beholdatordo is a splendid one to beholdatordo is a splendid one to behold
 
Posts: 89
Karma: 19669
Join Date: Apr 2012
Device: Kindle Touch
Thanks, Kovid. Below is one of my failed attempts. And here's an actual link to a multipage article on the site if you want to have a look: http://www.somethingawful.com/d/dung...estor-gunt.php

Interestingly, when the article is opened in the Kindle web browser and "article view" is selected, it grabs all the pages in one. I know that has nothing to do with calibre, but perhaps it can offer a hint.

Spoiler:
Code:
import re
from calibre.web.feeds.news import BasicNewsRecipe

class SomethingAwfulRecipe(BasicNewsRecipe):
    """calibre news recipe for the Something Awful web site.

    The recipe is purely declarative: it lists the site's per-column RSS
    feeds, keeps only the ``content_area`` div of each page, strips the
    navigation/social boxes, and supplies a little extra CSS.  The
    ``recursions`` / ``match_regexps`` pair is an attempt to follow the
    ``?page=N`` links of multipage articles.
    """

    # --- Publication metadata -------------------------------------------
    title = "Something Awful"
    description = "The Internet Makes You Stupid"
    language = "en"
    publication_type = "magazine"
    # The same logo image serves as both cover and masthead.
    cover_url = masthead_url = (
        "http://i.somethingawful.com/core/head-logo-bluegren.png"
    )

    # --- Feed fetching limits -------------------------------------------
    oldest_article = 14
    max_articles_per_feed = 50
    remove_empty_feeds = True
    use_embedded_content = False

    # --- Clean-up switches ----------------------------------------------
    auto_cleanup = False
    no_stylesheets = True
    remove_javascript = True
    remove_attributes = ["align", "alt", "valign"]

    # --- Multipage handling ---------------------------------------------
    # Only links whose URL ends in "?page=<digits>" match this pattern.
    recursions = 10
    match_regexps = [r"\?page=\d+$"]

    # --- Raw HTML rewriting ---------------------------------------------
    # Drop everything between the "<!-- content end-->" marker and the
    # closing body tag, leaving just "</body>".  DOTALL lets ".*" span
    # line breaks.
    preprocess_regexps = [
        (
            re.compile(r"<!-- content end-->.*</body>", flags=re.DOTALL),
            lambda match: "</body>",
        ),
        # Disabled experiment: rewrite relative "?page=N" hrefs.
        # (
        #     re.compile(r'href="(?P<page>\?page=\d+)"'),
        #     lambda match: 'href="http://' + match.group('page'),
        # ),
    ]

    # --- Tag selection --------------------------------------------------
    keep_only_tags = [
        {"name": "div", "attrs": {"class": "content_area"}},
        # Disabled: {"name": "p", "attrs": {"class": "pagebar"}},
    ]
    remove_tags = [
        {
            "name": "div",
            "attrs": {"class": ["column_box", "featurenav", "social"]},
        },
        {"name": "div", "attrs": {"id": "sidebar"}},
        {"name": "a", "attrs": {"class": "curpage"}},
    ]

    # --- Styling --------------------------------------------------------
    extra_css = """
        .byline{font-size:small} .font_big{font-size:large}
        .compat5{font-weight:bold} .accentbox{background-color:#E3E3E3; border:solid black}
        img{margin-bottom:0.4em; display:block; margin-left: auto; margin-right:auto}
    """

    # --- Feeds ----------------------------------------------------------
    # Alternative single site-wide feed (disabled):
    # feeds = [(u'Something Awful', u'http://www.somethingawful.com/rss/index.rss.xml')]
    #
    # One (title, url) pair per site column; commented-out entries are
    # feeds the author left disabled.
    feeds = [
        ("Photoshop Phriday", "http://www.somethingawful.com/rss/photoshop-phriday.rss.xml"),
        ("Comedy Goldmine", "http://www.somethingawful.com/rss/comedy-goldmine.rss.xml"),
        # ("The Flash Tub", "http://www.somethingawful.com/rss/flash-tub.rss.xml"),
        ("Awful Link of the Day", "http://www.somethingawful.com/rss/awful-links.rss.xml"),
        ("Fake Something Awfuls", "http://www.somethingawful.com/rss/fake-something-awful.rss.xml"),
        ("The Barbarian's Dojo", "http://www.somethingawful.com/rss/steve-sumner.rss.xml"),
        ("The Great Goon Database", "http://www.somethingawful.com/rss/great-goon-database.rss.xml"),
        ("Livejournal Theater", "http://www.somethingawful.com/rss/livejournal-theater.rss.xml"),
        ("Joystick Token Healthpack", "http://www.somethingawful.com/rss/token-healthpack.rss.xml"),
        # ("Webcam Ward", "http://www.somethingawful.com/rss/webcam-ward.rss.xml"),
        ("Features / Articles", "http://www.somethingawful.com/rss/feature-articles.rss.xml"),
        ("Guides", "http://www.somethingawful.com/rss/guides.rss.xml"),
        ("Legal Threats", "http://www.somethingawful.com/rss/legal-threats.rss.xml"),
        ("Pranks [ICQ]", "http://www.somethingawful.com/rss/icq-pranks.rss.xml"),
        ("State Og", "http://www.somethingawful.com/rss/state-og.rss.xml"),
        ("Everquest", "http://www.somethingawful.com/rss/everquest.rss.xml"),
        ("Pranks [Email]", "http://www.somethingawful.com/rss/email-pranks.rss.xml"),
        ("The Weekend Web", "http://www.somethingawful.com/rss/weekend-web.rss.xml"),
        ("Daily Dirt", "http://www.somethingawful.com/rss/daily-dirt.rss.xml"),
        ("The Art of Warcraft", "http://www.somethingawful.com/rss/art-of-warcraft.rss.xml"),
        ("Video Game Article", "http://www.somethingawful.com/rss/video-game-article.rss.xml"),
        ("The Awful Movie Database", "http://www.somethingawful.com/rss/awful-movie-database.rss.xml"),
        # ("Downloads", "http://www.somethingawful.com/rss/downloads.rss.xml"),
        ("Pregame Wrapup", "http://www.somethingawful.com/rss/pregame-wrapup.rss.xml"),
        ("Second Life Safari", "http://www.somethingawful.com/rss/second-life-safari.rss.xml"),
        ("The Hogosphere", "http://www.somethingawful.com/rss/hogosphere.rss.xml"),
        ("Front Page News", "http://www.somethingawful.com/rss/news.rss.xml"),
        ("Forum Friday's Monday", "http://www.somethingawful.com/rss/forum-fridays.rss.xml"),
        ("Cliff Yablonski Hates You", "http://www.somethingawful.com/rss/cliff-yablonski.rss.xml"),
        ("Manifestos From the Internet", "http://www.somethingawful.com/rss/manifestos-from-internet.rss.xml"),
        ("Johnston Checks In", "http://www.somethingawful.com/rss/levi-johnston.rss.xml"),
        ("Twitter Tuesday", "http://www.somethingawful.com/rss/twitter-tuesday.rss.xml"),
        ("Music Article", "http://www.somethingawful.com/rss/music-article.rss.xml"),
        ("Reviews [Games]", "http://www.somethingawful.com/rss/game-reviews.rss.xml"),
        ("Reviews [Movies]", "http://www.somethingawful.com/rss/movie-reviews.rss.xml"),
        ("Rom Pit", "http://www.somethingawful.com/rss/rom-pit.rss.xml"),
        ("Truth Media [Reviews]", "http://www.somethingawful.com/rss/truth-media-reviews.rss.xml"),
        ("Truth Media [Flames]", "http://www.somethingawful.com/rss/truth-media-flames.rss.xml"),
        ("Awful Anime", "http://www.somethingawful.com/rss/hentai-game-reviews.rss.xml"),
        ("The Horrors of Pornography", "http://www.somethingawful.com/rss/horrors-of-porn.rss.xml"),
        ("Your Band Sucks", "http://www.somethingawful.com/rss/your-band-sucks.rss.xml"),
        ("Fashion SWAT", "http://www.somethingawful.com/rss/fashion-swat.rss.xml"),
        # ("AwfulVision", "http://www.somethingawful.com/rss/awfulvision.rss.xml"),
        ("MMO Roulette", "http://www.somethingawful.com/rss/mmo-roulette.rss.xml"),
        ("The Most Awful", "http://www.somethingawful.com/rss/most-awful.rss.xml"),
        ("Garbage Day", "http://www.somethingawful.com/rss/garbage-day.rss.xml"),
        ("WTF, D&D!?", "http://www.somethingawful.com/rss/dungeons-and-dragons.rss.xml"),
        ("Current Releases", "http://www.somethingawful.com/rss/current-movie-reviews.rss.xml"),
    ]

#    def is_link_wanted(self, url, tag):
#        self.log('Analizando: %s'%url)
#        ans = re.match(r'^http://www\.somethingawful\.com/.*\.php\?page=\d+$', url) is not None
#        if ans:
#            self.log('Following multipage link: %s'%url)
#        return ans

#    INDEX = ???
#    def preprocess_html(self, soup):
#        for a in soup('a'):
#            if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
#                a['href']=self.INDEX + a['href']
#        return soup
atordo is offline   Reply With Quote