After posting this, I broke it again almost immediately. Why isn't this recipe finding the main div? This is the whole recipe. I had the line
Code:
print toc_page.prettify()
in it before, and the soup only seemed to have a HEAD but no BODY.
Code:
from calibre.web.feeds.news import BasicNewsRecipe
import re
class IMDBAdvancedTitleSearch(BasicNewsRecipe):
    """Calibre recipe that lists the titles from an IMDB advanced title search.

    The search results page is used as the index; every link whose href
    matches ``/title/tt...`` becomes one article in a single 'Movies' feed.
    """

    title = u'IMDB Advanced Title Search'
    __author__ = 'ireadtheinternet'
    no_stylesheets = True
    no_javascript = True

    # Must be a class attribute: a module-level ``keep_only_tags`` is just an
    # unrelated global and is never seen by the recipe machinery.
    # NOTE(review): presumably the downloaded title pages also wrap their
    # content in <div id="main"> -- verify, otherwise articles come out empty.
    keep_only_tags = [
        dict(name='div', attrs={'id': ['main']})
    ]

    def parse_index(self):
        """Build the feed list from the IMDB search results page.

        Returns a list with one ``('Movies', articles)`` tuple, where each
        article is a dict with 'title', 'url', 'date' and 'description' keys.
        If the ``<div id="main">`` container cannot be found, a warning is
        logged and the whole document is searched for title links instead of
        failing with an AttributeError on ``None``.
        """
        toc_page = self.index_to_soup('http://www.imdb.com/search/title?sort=year,desc&production_status=released&title_type=feature')
        toc = toc_page.find(name='div', attrs={'id': 'main'})
        if toc is None:
            # The parsed soup may be missing the expected container (e.g. the
            # page came back without a usable BODY); fall back to the full
            # document so the loop below has something to iterate over.
            self.log.warn('***toc is None*** - falling back to the whole page')
            toc = toc_page

        articles = []
        for movie in toc.findAll('a', attrs={'href': re.compile(r'/title/tt.*')}):
            title = self.tag_to_string(movie)
            url = 'http://www.imdb.com' + movie['href']
            self.log('Found article:', title)
            self.log('\t', url)
            articles.append({'title': title, 'url': url, 'date': '',
                             'description': ''})
        return [('Movies', articles)]