EDIT 2 - Dec. 3, 2014 - Working again, but I am keeping my eye on it. Updated code below.
EDIT - No longer working; I will need to revisit it. This is possibly due to a change on the IMDB site.
This is a recipe based on IMDB Advanced Title Search (http://www.imdb.com/search/title). The basic idea is that you create your favorite advanced title search on the web site, look at the key=value parts of the resulting IMDB url, and plug them into the custom_imdb_searches list. In this context, sections are based on specific searches and "articles" are the basic movie info and poster. I consider this a template, because I assume everyone's favorite movie searches will be different. That said, there are several suggested searches in the recipe: two are active and the others are commented out.
Code:
custom_imdb_searches = [
    # One entry per movie section. Each entry holds the IMDB Advanced
    # Search key=value arguments for that section.
    {},  # defaults only
    {'sort': 'user_rating,desc'},  # order by user rating rather than newest first
    # {'languages': 'hi'},  # Hindi movies
    # {'languages': 'hi', 'has': 'asin-dvd-us'},  # Hindi movies at amazon.com
    # {'url': 'http://www.imdb.com/search/title?production_status=released&title_type=feature'},
]
You set up your own searches in the custom_imdb_searches list of the recipe. Check out the default criteria in the imdb_search method also, because you don't need to specify those criteria unless you are overriding them.
Code:
criteria = dict(  # Defaults applied to every search:
    title_type='feature',          # feature films only, TV shows excluded
    production_status='released',  # already released
    user_rating='6.5,10',          # user rating between 6.5 and 10
    num_votes='500,',              # 500 votes or more
    sort='year,desc',              # ordered by year, descending
)
The recipe also runs without any changes, which I recommend for the first run so you can see what it is about.
Please let me know of any suggestions or criticisms, thanks!
recipe updated 12/3
Code:
import re

from calibre.web.feeds.news import BasicNewsRecipe
class IMDBAdvancedTitleSearch2468(BasicNewsRecipe):
    """Recipe template built on IMDB Advanced Title Search.

    Each entry of ``custom_imdb_searches`` becomes one section, and every
    movie linked from that search's result page becomes an "article"
    consisting of the movie's basic info and poster.
    """

    title = 'IMDB Advanced Title Search'
    language = 'en'
    categories = 'IMDB,template,movies'
    __author__ = 'ireadtheinternet'
    max_articles_per_feed = 50
    no_stylesheets = True
    no_javascript = True
    preprocess_regexps = [
        (re.compile(r'»'), lambda match: '')
    ]
    extra_css = 'img:first-of-type { display : block; margin-left : auto; margin-right: auto }'
    keep_only_tags = [
        dict(name='td', attrs={'id': ['img_primary']}),  # poster
        dict(name='h1', attrs={'class': ['header']}),  # title
        dict(name='div', attrs={'class': ['infobar']}),  # length, genre, release
        dict(name='div', attrs={'itemtype': ['http://schema.org/Person']}),  # people
        dict(name='div', attrs={'class': ['inline canwrap']})  # storyline
    ]
    remove_tags = [
        dict(name='div', attrs={'class': ['pro-title-link text-center']}),
    ]
    IMDB_BASE = 'http://www.imdb.com'

    # Make quick customizations of the recipe by changing custom_imdb_searches
    # First go to IMDB Advanced Title Search: http://www.imdb.com/search/title
    # Do your favorite search and figure out which non-defaults args you need
    # from the url (Defaults are in criteria dict in the imdb_search method.)
    # Alternatively, just copy/paste the url into the url arg
    custom_imdb_searches = [
        # Each item here creates a new movie section based on
        # IMDB Advanced Search
        dict(),  # use defaults
        dict(sort='user_rating,desc'),  # sort by user rating instead of newest first
        # dict(languages='hi'),  # Hindi movies
        # dict(languages='hi',has='asin-dvd-us'),  # Hindi movies at amazon.com
        # dict(url='http://www.imdb.com/search/title?production_status=released&title_type=feature'),
    ]

    def build_section(self, url):
        """Fetch one search result page and list the movies it links to.

        :param url: URL of an IMDB Advanced Title Search result page.
        :return: ``(section_name, articles)``. ``section_name`` is the text
            of the page's first ``<h1>``; ``articles`` is a list of dicts
            with the keys ``title``, ``url``, ``date`` and ``description``.
            ``articles`` is empty when the page has no ``div#main``.
        """
        toc_page_raw = self.index_to_soup(url, raw=True)
        if isinstance(toc_page_raw, bytes):
            # The pattern below is a text pattern and cannot be applied to
            # bytes. NOTE(review): assumes the page is UTF-8 -- confirm.
            toc_page_raw = toc_page_raw.decode('utf-8', 'replace')
        # Drop <script> elements before the markup is parsed.
        toc_page_raw = re.sub(r'<script\b.+?</script>', '',
                              toc_page_raw, flags=re.DOTALL | re.IGNORECASE)
        toc_page = self.index_to_soup(toc_page_raw)
        name = self.tag_to_string(toc_page.find('h1'))

        toc = toc_page.find(name='div', attrs={'id': 'main'})
        if toc is None:
            # Page layout not as expected: report it instead of failing
            # with an AttributeError on None.
            self.log.warn('No div#main found on', url)
            return name, []

        articles = []
        movie_links = toc.findAll(
            'a', attrs={'href': re.compile(r'/title/tt.*'), 'title': True})
        for movie in movie_links:
            articles.append({
                'title': self.tag_to_string(movie),
                'url': self.IMDB_BASE + movie['href'],
                'date': '',
                'description': '',
            })
        return name, articles

    def imdb_search(self, url=None, **kwargs):
        """Return the IMDB Advanced Title Search URL for one section.

        :param url: Optional. A string starting with ``http://`` or
            ``https://`` is returned unchanged; any other string is appended
            as a query string to ``IMDB_BASE + '/search/title?'``.
        :param kwargs: Search arguments, used only when ``url`` is None.
            They are merged over the default criteria and override them on
            matching keys. Values are formatted with ``%s``, so non-string
            values such as integers are accepted.
        :return: The search URL.
        """
        search_prefix = self.IMDB_BASE + '/search/title?'
        if url is not None:
            if url.startswith(('http://', 'https://')):
                return url
            return search_prefix + url

        criteria = {  # Default criteria:
            'title_type': 'feature',  # movies only, no TV shows
            'production_status': 'released',  # that have been released
            'user_rating': '6.5,10',  # with user rating of 6.5-10
            'num_votes': '500,',  # with at least 500 votes
            'sort': 'year,desc'  # sort by year, descending
        }
        # merge args with criteria, possibly overriding original criteria
        criteria.update(kwargs)
        query = '&'.join(
            '%s=%s' % (key, value) for key, value in criteria.items())
        return search_prefix + query

    def parse_index(self):
        """Return one ``(section_name, articles)`` pair per configured search."""
        self.log('def parse_index(self)')
        return [
            self.build_section(self.imdb_search(**search))
            for search in self.custom_imdb_searches
        ]

    def preprocess_html(self, soup):
        """Flatten text-only links and table markup in an article page.

        Links that contain no ``<img>`` are replaced by their plain text;
        ``table``, ``td``, ``tr`` and ``tbody`` tags are turned into
        attribute-less ``div`` tags.

        :param soup: Parsed article page; modified in place.
        :return: The same ``soup`` object.
        """
        for alink in soup.findAll('a'):
            if alink.find('img') is None:
                alink.replaceWith(''.join(alink.findAll(text=True)))
        for t in soup.findAll(['table', 'td', 'tr', 'tbody']):
            t.name, t.attrs = 'div', {}
        return soup