Hi All,
This will be my first recipe. It scrapes the site thegridto.com. I have run into two problems so far, since I have only just started.
1. There are supposed to be 3 sections in my table of contents (city, life and culture), but somehow only city and life show up.
2. Under each section, only two article links are generated, even though there should be over 10 articles in each.
Can you please take a look at my code and help me correct the problem?
Code:
class TheGridTO(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from one issue of The Grid (thegridto.com).

    The table of contents is built by ``parse_index`` from a single issue
    page; each name in ``SECTIONS`` becomes one feed.

    NOTE(review): if only the first two sections show up, with two articles
    each, check how the recipe is being run. calibre's ``--test`` option
    deliberately limits a download to a couple of feeds and a couple of
    articles per feed, independent of anything in this class. Run without
    ``--test`` to get the full issue.
    """

    title = u'The Grid TO'
    description = (u'The Grid is a weekly city magazine and daily website providing a fresh, '
                   'accessible voice for Toronto.')
    __author__ = u'Yusuf W'
    language = 'en_US'
    publication_type = 'newspaper'

    # These two were previously assigned twice in the class body (7 / 100 and
    # then 8.0 / 300). Only the later assignment ever took effect, so the
    # effective values are kept and the dead assignments dropped.
    oldest_article = 8.0
    max_articles_per_feed = 300

    simultaneous_downloads = 5
    auto_cleanup = True
    no_stylesheets = False

    remove_tags = [
        dict(name='div', id=['comments', 'page-header']),
        dict(attrs={'class': ['pull-right', 'right-content']}),
    ]
    keep_only_tags = [dict(name='div', id=['content'])]

    # Issue page the article index is read from.
    INDEX_URL = 'http://www.thegridto.com/issues/51/'
    # Page the cover image is read from.
    # NOTE(review): this is a different issue number than INDEX_URL; the value
    # is kept as originally written -- confirm whether that is intentional.
    COVER_PAGE_URL = 'http://www.thegridto.com/issues/50/'
    # Section names, in table-of-contents order. Each one is part of the
    # class attribute of its container element on the issue page.
    SECTIONS = ('city', 'life', 'culture')

    def get_cover_url(self):
        """Return the ``src`` of the cover image, or ``None`` if it is not found.

        Looks for the first ``<img>`` inside the element whose class is
        ``article-block latest-issue`` on ``COVER_PAGE_URL``.
        """
        soup = self.index_to_soup(self.COVER_PAGE_URL)
        div = soup.find(attrs={'class': 'article-block latest-issue'})
        img = div.find('img') if div is not None else None
        if img is None:
            # Missing markup used to surface as an AttributeError on None.
            self.log('\t\tCover image not found on', self.COVER_PAGE_URL)
            return None
        cover_url = img.get('src')
        self.log('\t\tCover URL', cover_url)
        return cover_url

    def parse_index(self):
        """Return the issue's articles as a list of ``(section, articles)`` tuples.

        Each article is a dict with ``title``, ``url``, ``description`` and
        ``date`` keys; the last two are always empty strings.
        """
        feeds = []
        soup = self.index_to_soup(self.INDEX_URL)
        for section in self.SECTIONS:
            self.log('\t\t Section', section)
            section_class = 'left-content article-listing ' + section + ' pull-left'
            div = soup.find(attrs={'class': section_class})
            if div is None:
                # Do not abort the whole download because one section's
                # container is missing or was renamed.
                self.log('\t\t Section container not found, skipping', section)
                continue

            articles = []
            # Search only inside this section's container. findAllNext()
            # would keep walking the rest of the document and pull in the
            # articles of every later section as well.
            # NOTE(review): assumes the 'search-block' elements are
            # descendants of the section container -- confirm against the
            # page markup.
            for tag in div.findAll(attrs={'class': 'search-block'}):
                links = tag.findAll('a', href=True)
                if not links:
                    continue
                # The second link is the one that was used originally. Fall
                # back to the only link when a block has just one, instead of
                # raising IndexError.
                a = links[1] if len(links) > 1 else links[0]
                title = self.tag_to_string(a)
                url = a.get('href')
                self.log('\t\t Found Article', title)
                self.log('\t\t', url)
                articles.append({'title': title, 'url': url,
                                 'description': '', 'date': ''})

            self.log('\t\t Length of articles', len(articles))
            self.log('\t\t End section log\n')
            if articles:
                feeds.append((section, articles))
        return feeds