View Single Post
Old 05-02-2012, 01:35 PM   #1
underwarez
Junior Member
underwarez began at the beginning.
 
Posts: 7
Karma: 10
Join Date: Mar 2012
Location: Toronto
Device: Kobo Touch
The Grid TO - Need help with my table of contents

Hi All,

This will be my first recipe. It scrapes the site thegridto.com. I have two problems so far, since I have only just started.

1. There are supposed to be 3 sections in my table of contents: city, life and culture, but somehow only city and life show up.

2. Under each section, only two article links are generated, even though there should be over 10 articles in each.

Can you please take a look at my code and help me correct the problem?

Code:
class TheGridTO(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from one issue of The Grid (thegridto.com).

    The table of contents has one feed per entry in ``SECTIONS``; each feed
    lists the articles found in that section's listing on the issue page.

    NOTE(review): calibre's recipe test mode (``ebook-convert ... --test``)
    presumably limits a run to two feeds with two articles each -- confirm
    against the calibre documentation before treating a short table of
    contents as a parsing bug.
    """

    title                  = u'The Grid TO'
    description            = (u'The Grid is a weekly city magazine and daily website providing a fresh, '
                    'accessible voice for Toronto.')
    __author__             = u'Yusuf W'
    language               = 'en_US'
    publication_type       = 'newspaper'

    # These two were previously assigned twice in the class body; only the
    # later values (kept here) ever took effect.
    max_articles_per_feed  = 300
    oldest_article         = 8.0

    simultaneous_downloads = 5
    auto_cleanup           = True
    no_stylesheets         = False

    keep_only_tags        = [dict(name='div', id=['content'])]
    remove_tags           =  [
                                 dict(name='div', id=['comments','page-header', ])
                                ,dict(attrs={'class':['pull-right', 'right-content']})
                             ]

    # Site root, used to turn site-relative links into absolute URLs.
    BASE_URL = 'http://www.thegridto.com'
    # Page whose "latest-issue" block supplies the cover image.
    # NOTE(review): this is issue 50 while the index below is issue 51;
    # kept as in the original -- confirm whether both should track one issue.
    COVER_PAGE_URL = 'http://www.thegridto.com/issues/50/'
    # Issue page that is scraped for the article listing.
    ISSUE_INDEX_URL = 'http://www.thegridto.com/issues/51/'
    # Section names; each is part of the CSS class of its listing container.
    SECTIONS = ('city', 'life', 'culture')

    def _absolute_url(self, url):
        """Return *url* unchanged unless it is site-relative (starts with '/')."""
        if url and url.startswith('/') and not url.startswith('//'):
            return self.BASE_URL + url
        return url

    def get_cover_url(self):
        """Return the cover image URL, or None when the page has no cover image."""
        soup = self.index_to_soup(self.COVER_PAGE_URL)
        block = soup.find(attrs={'class':'article-block latest-issue'})

        # Guard each lookup: a layout change must not abort the whole download.
        img = block.find('img') if block is not None else None
        cover_url = img.get('src') if img is not None else None
        if not cover_url:
            self.log.warn('\t\tNo cover image found on', self.COVER_PAGE_URL)
            return None

        cover_url = self._absolute_url(cover_url)
        self.log ('\t\tCover URL', cover_url)
        return cover_url

    def _section_articles(self, container):
        """Return the article dicts for the listing blocks of one section."""
        # Search inside the section container first so that articles from
        # later sections are not attributed to this one.
        blocks = container.findAll(attrs={'class':'search-block'})
        if not blocks:
            # Fall back to the original document-order search in case the
            # listing blocks are not nested inside the container.
            blocks = container.findAllNext(attrs={'class':'search-block'})

        articles = []
        for block in blocks:
            links = block.findAll('a', href=True)
            if len(links) < 2:
                # The article link is the second anchor of a block; skip
                # blocks that do not have one instead of raising IndexError.
                self.log.warn('\t\t Skipping listing block without an article link')
                continue

            link = links[1]
            title = self.tag_to_string(link)
            url = self._absolute_url(link.get('href'))

            self.log('\t\t Found Article', title)
            self.log('\t\t', url)

            articles.append({'title': title, 'url': url, 'description':'', 'date':''})
        return articles

    def parse_index(self):
        """Return a list of ``(section, articles)`` tuples for the issue page.

        Sections whose container is missing, or that yield no articles, are
        logged and left out of the result.
        """
        feeds = []

        soup = self.index_to_soup(self.ISSUE_INDEX_URL)
        for section in self.SECTIONS:
            self.log('\t\t Section', section)
            section_class = 'left-content article-listing ' + section + ' pull-left'
            container = soup.find(attrs={'class': section_class})
            if container is None:
                self.log.warn('\t\t Section container not found:', section_class)
                continue

            articles = self._section_articles(container)
            if articles:
                feeds.append((section, articles))
            self.log('\t\t Length of articles', len(articles))
            self.log('\t\t End section log\n')
        return feeds
underwarez is offline   Reply With Quote