I removed too much code in my previous sample, and that's what kept the articles from being displayed.
Here's the actual copy I'm working on; please help me get the table of contents working:
Code:
from calibre.web.feeds.news import BasicNewsRecipe


class TheGridTO(BasicNewsRecipe):
    #: The title to use for the ebook
    title = u'The Grid TO'
    #: A couple of lines that describe the content this recipe downloads.
    #: This will be used primarily in a GUI that presents a list of recipes.
    description = (u'The Grid is a weekly city magazine and daily website providing a fresh, '
                   'accessible voice for Toronto.')
    #: The author of this recipe
    __author__ = u'Yusuf W'
    #: The language that the news is in. Must be an ISO-639 code either
    #: two or three characters long
    language = 'en'
    #: Maximum number of articles to download from each feed. This is primarily
    #: useful for feeds that don't have article dates. For most feeds, you should
    #: use :attr:`BasicNewsRecipe.oldest_article`
    max_articles_per_feed = 300
    #: Oldest article to download from this news source. In days.
    oldest_article = 8.0
    #: Number of levels of links to follow on article webpages
    #recursions = 0
    #: Delay between consecutive downloads in seconds. The argument may be a
    #: floating point number to indicate a more precise time.
    #delay = 0
    #: Publication type
    #: Set to newspaper, magazine or blog
    publication_type = 'newspaper'
    #: Number of simultaneous downloads. Set to 1 if the server is picky.
    #: Automatically reduced to 1 if :attr:`BasicNewsRecipe.delay` > 0
    simultaneous_downloads = 5
    #: Timeout for fetching files from server in seconds
    timeout = 220.0
    #: The format string for the date shown on the first page.
    #: By default: Day_Name, Day_Number Month_Name Year
    timefmt = ' [%a, %d %b %Y]'
    #: List of feeds to download
    #: Can be either ``[url1, url2, ...]`` or ``[('title1', url1), ('title2', url2),...]``
    feeds = None
    #: Max number of characters in the short description
    #summary_length = 500
    #: Convenient flag to disable loading of stylesheets for websites
    #: that have overly complex stylesheets unsuitable for conversion
    #: to ebooks formats
    #: If True stylesheets are not downloaded and processed
    no_stylesheets = False
    #: Convenient flag to strip all javascript tags from the downloaded HTML
    remove_javascript = True
    #: If True the GUI will ask the user for a username and password
    #: to use while downloading
    #: If set to "optional" the use of a username and password becomes optional
    needs_subscription = False
    #: If True the navigation bar is center aligned, otherwise it is left aligned
    center_navbar = True
    #: Specify an override encoding for sites that have an incorrect
    #: charset specification. The most common being specifying ``latin1`` and
    #: using ``cp1252``. If None, try to detect the encoding. If it is a
    #: callable, the callable is called with two arguments: The recipe object
    #: and the source to be decoded. It must return the decoded source.
    #encoding = None
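    # (Illustration only, not from the calibre docs: per the contract above,
    # a callable override could look like
    #     encoding = lambda recipe, source: source.decode('cp1252')
    # for a site whose pages declare latin1 but actually use cp1252.)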
    #: Normally we try to guess if a feed has full articles embedded in it
    #: based on the length of the embedded content. If `None`, then the
    #: default guessing is used. If `True` then we always assume the feed has
    #: embedded content and if `False` we always assume the feed does not have
    #: embedded content.
    #use_embedded_content = None
    #: Set to True and implement :meth:`get_obfuscated_article` to handle
    #: websites that try to make it difficult to scrape content.
    #articles_are_obfuscated = False
    #: Reverse the order of articles in each feed
    #reverse_article_order = False
    #: Automatically extract all the text from downloaded article pages. Uses
    #: the algorithms from the readability project. Setting this to True means
    #: that you do not have to worry about cleaning up the downloaded HTML
    #: manually (though manual cleanup will always be superior).
    auto_cleanup = False
    #: Specify elements that the auto cleanup algorithm should never remove
    #: The syntax is an XPath expression. For example::
    #:
    #:   auto_cleanup_keep = '//div[@id="article-image"]' will keep all divs with
    #:                       id="article-image"
    #:   auto_cleanup_keep = '//*[@class="important"]' will keep all elements
    #:                       with class="important"
    #:   auto_cleanup_keep = '//div[@id="article-image"]|//span[@class="important"]'
    #:                       will keep all divs with id="article-image" and spans
    #:                       with class="important"
    #:
    auto_cleanup_keep = None
    #: Specify any extra :term:`CSS` that should be added to downloaded :term:`HTML` files
    #: It will be inserted into `<style>` tags, just before the closing
    #: `</head>` tag thereby overriding all :term:`CSS` except that which is
    #: declared using the style attribute on individual :term:`HTML` tags.
    #: For example::
    #:
    #:   extra_css = '.heading { font: serif x-large }'
    #:
    extra_css = None
    #: If True empty feeds are removed from the output.
    #: This option has no effect if parse_index is overridden in
    #: the sub class. It is meant only for recipes that return a list
    #: of feeds using `feeds` or :meth:`get_feeds`.
    #remove_empty_feeds = False
    #: List of regular expressions that determines which links to follow
    #: If empty, it is ignored. Used only if is_link_wanted is
    #: not implemented. For example::
    #:
    #:   match_regexps = [r'page=[0-9]+']
    #:
    #: will match all URLs that have `page=some number` in them.
    #:
    #: Only one of :attr:`BasicNewsRecipe.match_regexps` or
    #: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
    #match_regexps = []
    #: List of regular expressions that determines which links to ignore
    #: If empty it is ignored. Used only if is_link_wanted is not
    #: implemented. For example::
    #:
    #:   filter_regexps = [r'ads\.doubleclick\.net']
    #:
    #: will remove all URLs that have `ads.doubleclick.net` in them.
    #:
    #: Only one of :attr:`BasicNewsRecipe.match_regexps` or
    #: :attr:`BasicNewsRecipe.filter_regexps` should be defined.
    #filter_regexps = []
    #: Recipe specific options to control the conversion of the downloaded
    #: content into an e-book. These will override any user or plugin specified
    #: values, so only use if absolutely necessary. For example::
    #:
    #:   conversion_options = {
    #:       'base_font_size'   : 16,
    #:       'tags'             : 'mytag1,mytag2',
    #:       'title'            : 'My Title',
    #:       'linearize_tables' : True,
    #:   }
    #:
    conversion_options = {}
    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    #: A tag is specified as a dictionary of the form::
    #:
    #:   {
    #:       name  : 'tag name',    # e.g. 'div'
    #:       attrs : a dictionary,  # e.g. {'class': 'advertisement'}
    #:   }
    #:
    #: All keys are optional. For a full explanation of the search criteria, see
    #: `Beautiful Soup <http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)>`_
    #: A common example::
    #:
    #:   remove_tags = [dict(name='div', attrs={'class': 'advert'})]
    #:
    #: This will remove all `<div class="advert">` tags and all
    #: their children from the downloaded :term:`HTML`.
    remove_tags = [
        dict(name='div', id=['comments', 'page-header']),
        dict(attrs={'class': ['pull-right', 'right-content']}),
    ]
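    # (My reading of thegridto.com's markup: 'comments' and 'page-header' are
    # the comment thread and the banner, and 'pull-right'/'right-content'
    # look like right-hand sidebar containers; treat these class names as
    # site-specific assumptions on my part.)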
    #: Remove all tags that occur after the specified tag.
    #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
    #: For example::
    #:
    #:   remove_tags_after = [dict(id='content')]
    #:
    #: will remove all tags after the first element with `id="content"`.
    remove_tags_after = None
    #: Remove all tags that occur before the specified tag.
    #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
    #: For example::
    #:
    #:   remove_tags_before = dict(id='content')
    #:
    #: will remove all tags before the first element with `id="content"`.
    remove_tags_before = None
    #: List of attributes to remove from all tags
    #: For example::
    #:
    #:   remove_attributes = ['style', 'font']
    remove_attributes = []
    #: Keep only the specified tags and their children.
    #: For the format for specifying a tag see :attr:`BasicNewsRecipe.remove_tags`.
    #: If this list is not empty, then the `<body>` tag will be emptied and re-filled with
    #: the tags that match the entries in this list. For example::
    #:
    #:   keep_only_tags = [dict(id=['content', 'heading'])]
    #:
    #: will keep only tags that have an `id` attribute of `"content"` or `"heading"`.
    keep_only_tags = [dict(name='div', id=['content'])]
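    # (If I'm reading the docs right, keep_only_tags is applied before
    # remove_tags, so the two settings cooperate: keep the <div id="content">
    # container, then prune the unwanted blocks inside it.)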
    #: List of :term:`regexp` substitution rules to run on the downloaded :term:`HTML`.
    #: Each element of the list should be a two element tuple. The first element
    #: of the tuple should be a compiled regular expression and the second a
    #: callable that takes a single match object and returns a string to
    #: replace the match. For example::
    #:
    #:   preprocess_regexps = [
    #:       (re.compile(r'<!--Article ends here-->.*</body>', re.DOTALL|re.IGNORECASE),
    #:        lambda match: '</body>'),
    #:   ]
    #:
    #: will remove everything from `<!--Article ends here-->` to `</body>`.
    preprocess_regexps = []
    #: The CSS that is used to style the templates, i.e., the navigation bars and
    #: the Tables of Contents. Rather than overriding this variable, you should
    #: use `extra_css` in your recipe to customize look and feel.
    #template_css = u''
    #: By default, calibre will use a default image for the masthead (Kindle only).
    #: Override this in your recipe to provide a url to use as a masthead.
    masthead_url = None
    #: By default, the cover image returned by get_cover_url() will be used as
    #: the cover for the periodical. Overriding this in your recipe instructs
    #: calibre to render the downloaded cover into a frame whose width and height
    #: are expressed as a percentage of the downloaded cover.
    #: cover_margins = (10, 15, '#ffffff') pads the cover with a white margin
    #: 10px on the left and right, 15px on the top and bottom.
    #: Color names are defined at http://www.imagemagick.org/script/color.php
    #: Note that for some reason, white does not always work on Windows. Use
    #: #ffffff instead.
    cover_margins = (0, 0, '#ffffff')
    #: Set to a non-empty string to disable this recipe.
    #: The string will be used as the disabled message.
    recipe_disabled = None

    def get_cover_url(self):
        soup = self.index_to_soup('http://www.thegridto.com/issues/50/')
        div = soup.find(attrs={'class': 'article-block latest-issue'})
        img = div.find('img')
        cover_url = img.get('src')
        self.log('\t\tCover URL:', cover_url)
        return cover_url
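    # (Note that the issue number is hardcoded here ('/issues/50/') and again
    # in parse_index below ('/issues/51/'). If the site has a stable "latest
    # issue" URL, which is an assumption on my part, pointing both methods at
    # it would keep the cover and the article listing in sync.)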

    def parse_index(self):
        feeds = []
        soup = self.index_to_soup('http://www.thegridto.com/issues/51/')
        for section in ['city', 'life', 'culture']:
            self.log('\t\t Section:', section)
            section_class = 'left-content article-listing ' + section + ' pull-left'
            div = soup.find(attrs={'class': section_class})
            articles = []
            # findAll, not findAllNext: stay inside this section's div so one
            # section's listing does not swallow the sections after it.
            for tag in div.findAll(attrs={'class': 'search-block'}):
                a = tag.findAll('a', href=True)[1]
                title = self.tag_to_string(a)
                url = a.get('href', False)
                self.log('\t\t Found article:', title)
                self.log('\t\t', url)
                articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
            feeds.append((section, articles))
            self.log('\t\t Number of articles:', len(articles))
            self.log('\t\t End of section\n')
        return feeds
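
For context on the findAll line in parse_index: findAllNext walks everything that comes after the section div in document order, not just its children, so with it each section's article list also swallows every section that follows and the table of contents comes out garbled. Here's a minimal standalone sketch of the difference (it uses bs4 with made-up markup rather than calibre's bundled BeautifulSoup, and it assumes the 'search-block' entries really do sit inside each section's listing div):

Code:
from bs4 import BeautifulSoup

html = '''
<div class="city"><div class="search-block">city story</div></div>
<div class="life"><div class="search-block">life story</div></div>
'''
soup = BeautifulSoup(html, 'html.parser')
city = soup.find(attrs={'class': 'city'})

# findAllNext keeps going past the end of the city div...
print([t.get_text() for t in city.findAllNext(attrs={'class': 'search-block'})])
# -> ['city story', 'life story']

# ...while findAll searches only the div's own descendants.
print([t.get_text() for t in city.findAll(attrs={'class': 'search-block'})])
# -> ['city story']

With that, each (section, articles) tuple returned by parse_index should map to exactly one TOC section. To check the TOC without a full download, the recipe can be run in test mode with "ebook-convert thegridto.recipe .epub --test -vv" (assuming it's saved as thegridto.recipe; --test only fetches a couple of articles per feed).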