MobileRead Forums - View Single Post - The Grid TO

underwarez · 05-02-2012, 02:35 PM

Hi All,

This will be my first recipe. It scrapes the site thegridto.com. I have only just started, and I have run into two problems so far.

1. There are supposed to be 3 sections in my table of contents (city, life and culture), but somehow only city and life show up.

2. Under each section, only two article links are generated, even though there should be over 10 articles in each.

Can you please take a look at my code and help me correct the problem?

Code:

class TheGridTO(BasicNewsRecipe):
    """Calibre recipe that scrapes one issue page of thegridto.com.

    ``parse_index`` builds one feed per entry in ``SECTIONS`` from the issue
    page at ``ISSUE_INDEX_URL``; ``get_cover_url`` takes the first image in
    the ``latest-issue`` block of the page at ``COVER_ISSUE_URL``.
    """

    title = u'The Grid TO'
    description = (u'The Grid is a weekly city magazine and daily website providing a fresh, '
                   'accessible voice for Toronto.')
    __author__ = u'Yusuf W'
    language = 'en_US'

    # Each of these was previously assigned twice in the class body; only the
    # later (effective) values are kept.
    max_articles_per_feed = 300
    oldest_article = 8.0

    auto_cleanup = True
    publication_type = 'newspaper'
    simultaneous_downloads = 5

    no_stylesheets = False
    remove_tags = [
        dict(name='div', id=['comments', 'page-header']),
        dict(attrs={'class': ['pull-right', 'right-content']}),
    ]

    keep_only_tags = [dict(name='div', id=['content'])]

    # NOTE(review): the cover is read from issue 50 while the articles come
    # from issue 51 -- confirm whether both should point at the same issue.
    COVER_ISSUE_URL = 'http://www.thegridto.com/issues/50/'
    ISSUE_INDEX_URL = 'http://www.thegridto.com/issues/51/'

    # Section names; each is spliced into the CSS class string of its listing div.
    SECTIONS = ('city', 'life', 'culture')

    def get_cover_url(self):
        """Return the ``src`` of the cover image, or ``None`` if none is found."""
        soup = self.index_to_soup(self.COVER_ISSUE_URL)
        div = soup.find(attrs={'class': 'article-block latest-issue'})

        # Either lookup can come back empty if the page layout changes; return
        # None instead of raising AttributeError on a missing element.
        img = div.find('img') if div is not None else None
        cover_url = img.get('src') if img is not None else None
        if not cover_url:
            self.log.warn('\t\tNo cover image found on', self.COVER_ISSUE_URL)
            return None

        self.log('\t\tCover URL', cover_url)
        return cover_url

    def parse_index(self):
        """Return a list of ``(section, articles)`` tuples for the issue page.

        Each article is a dict with ``title``, ``url``, ``description`` and
        ``date`` keys. A section whose listing div is not present on the page
        is logged and skipped.
        """
        feeds = []

        soup = self.index_to_soup(self.ISSUE_INDEX_URL)
        for section in self.SECTIONS:
            self.log('\t\t Section', section)
            articles = self._parse_section(soup, section)
            if articles is None:
                self.log.warn('\t\t Section not found on page, skipping:', section)
                continue

            feeds.append((section, articles))
            self.log('\t\t Length of articles', len(articles))
            self.log('\t\t End section log\n')
        return feeds

    def _parse_section(self, soup, section):
        """Collect the article dicts of one section, or ``None`` if its div is absent."""
        section_class = 'left-content article-listing ' + section + ' pull-left'
        div = soup.find(attrs={'class': section_class})
        if div is None:
            return None

        articles = []
        # findAll only searches inside this section's div. findAllNext would
        # keep walking the rest of the document and pick up the blocks that
        # belong to the sections that follow.
        for tag in div.findAll(attrs={'class': 'search-block'}):
            links = tag.findAll('a', href=True)
            if len(links) < 2:
                # The article link is taken to be the second anchor of a
                # block; skip blocks that do not have one.
                self.log.warn('\t\t Skipping block without an article link')
                continue
            a = links[1]

            title = self.tag_to_string(a)
            url = a['href']

            self.log('\t\t Found Article', title)
            self.log('\t\t', url)

            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
        return articles