09-08-2012, 04:57 PM | #1 |
Connoisseur
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
History Today Recipe
[/QUOTE]
Code:
import re
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class HistoryToday(BasicNewsRecipe):
    """Calibre recipe that fetches the current issue of History Today.

    The recipe logs in with the subscriber's credentials, reads the issue
    date from the home page, and builds one feed per section listed on the
    ``/contents`` page. Each article is downloaded through the link found
    in the ``ht-tools`` block of its page.
    """

    title = 'History Today'
    __author__ = 'Rick Shang'

    description = 'UK-based magazine, publishing articles and book reviews covering all types and periods of history.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'

    # Page furniture stripped from every downloaded article.
    remove_tags = [
        dict(name='div', attrs={'class': ['print-logo', 'print-site_name', 'print-breadcrumb']}),
        dict(name='div', attrs={'id': ['ht-tools', 'ht-tools2', 'ht-tags']}),
    ]
    no_javascript = True
    no_stylesheets = True

    needs_subscription = True

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        If the site answers the login with its "Session limit exceeded"
        page, one of the listed sessions is selected and the form is
        submitted again.
        """
        # The base implementation is an instance method; it must receive self.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.historytoday.com/user/login')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            raw = br.submit().read()
            # The response body is bytes on Python 3; decode so the
            # substring test works on both Python 2 and Python 3.
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8', 'replace')
            if 'Session limit exceeded' in raw:
                br.select_form(nr=1)
                # The original code picks the item at index 1; presumably
                # index 0 is the session being created -- TODO confirm.
                stale_session = br.find_control('sid').items[1]
                # List controls are assigned a list of item names.
                br['sid'] = [stale_session.name]
                br.submit()
        return br

    def parse_index(self):
        """Build the list of ``(section title, articles)`` for the issue.

        Returns:
            A list of ``(title, [article dict, ...])`` tuples in the order
            the sections appear on the contents page. Each article dict has
            the keys ``title``, ``url``, ``description`` and ``date``.
        """
        # Find date
        soup0 = self.index_to_soup('http://www.historytoday.com/')
        date_block = soup0.find('div', attrs={'id': 'block-block-226'})
        if date_block is not None and date_block.span is not None:
            self.timefmt = u' [%s]' % self.tag_to_string(date_block.span)

        # Go to issue
        soup = self.index_to_soup('http://www.historytoday.com/contents')
        content_area = soup.find('div', attrs={'id': 'content-area'})
        cover_img = content_area.find('img') if content_area is not None else None
        # A missing cover should not abort the whole download.
        if cover_img is not None and cover_img.get('src'):
            self.cover_url = cover_img['src']

        # Go to the main body
        div = soup.find('div', attrs={'class': 'region region-content-bottom'})
        if div is None:
            return []

        section_id = re.compile(r"block\-views\-contents.*")
        row_class = re.compile(r"views\-row.*")

        feeds = OrderedDict()
        for section in div.findAll('div', attrs={'id': section_id}):
            section_title = self.tag_to_string(section.find('h2', attrs={'class': 'title'}))
            sectionbody = section.find('div', attrs={'class': 'view-content'})
            if sectionbody is None:
                continue
            for article in sectionbody.findAll('div', attrs={'class': row_class}):
                subarticle = article.findAll('div')
                # A usable row has a title div followed by a description div.
                if len(subarticle) < 2:
                    continue
                title = self.tag_to_string(subarticle[0])
                originalurl = "http://www.historytoday.com" + subarticle[0].span.a['href'].strip()
                # The downloadable link is only available on the article page.
                originalpage = self.index_to_soup(originalurl)
                tools = originalpage.find('div', attrs={'id': 'ht-tools'})
                if tools is None or tools.a is None:
                    self.log.warn('No print link found, skipping: %s' % originalurl)
                    continue
                url = "http://www.historytoday.com" + tools.a['href'].strip()
                desc = self.tag_to_string(subarticle[1])
                feeds.setdefault(section_title, []).append(
                    {'title': title, 'url': url, 'description': desc, 'date': ''})

        # dict.items() works on both Python 2 and Python 3.
        return list(feeds.items())

    def cleanup(self):
        """Log out when the download finishes."""
        self.browser.open('http://www.historytoday.com/logout')

# Last edited by rainrdx; 09-08-2012 at 05:52 PM.
03-25-2013, 06:53 PM | #2 |
Connoisseur
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
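Update: fixed the cover image lookup. The recipe now selects the image whose URL contains "cover" instead of the first image on the contents page.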
Code:
import re
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class HistoryToday(BasicNewsRecipe):
    """Calibre recipe that fetches the current issue of History Today.

    The recipe logs in with the subscriber's credentials, reads the issue
    date from the home page, and builds one feed per section listed on the
    ``/contents`` page. Each article is downloaded through the link found
    in the ``ht-tools`` block of its page.
    """

    title = 'History Today'
    __author__ = 'Rick Shang'

    description = 'UK-based magazine, publishing articles and book reviews covering all types and periods of history.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'

    # Page furniture stripped from every downloaded article.
    remove_tags = [
        dict(name='div', attrs={'class': ['print-logo', 'print-site_name', 'print-breadcrumb']}),
        dict(name='div', attrs={'id': ['ht-tools', 'ht-tools2', 'ht-tags']}),
    ]
    no_javascript = True
    no_stylesheets = True

    needs_subscription = True

    def get_browser(self):
        """Return a browser, logged in when credentials are configured.

        If the site answers the login with its "Session limit exceeded"
        page, one of the listed sessions is selected and the form is
        submitted again.
        """
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://www.historytoday.com/user/login')
            br.select_form(nr=1)
            br['name'] = self.username
            br['pass'] = self.password
            raw = br.submit().read()
            # The response body is bytes on Python 3; decode so the
            # substring test works on both Python 2 and Python 3.
            if isinstance(raw, bytes):
                raw = raw.decode('utf-8', 'replace')
            if 'Session limit exceeded' in raw:
                br.select_form(nr=1)
                # The original code picks the item at index 1; presumably
                # index 0 is the session being created -- TODO confirm.
                stale_session = br.find_control('sid').items[1]
                # List controls are assigned a list of item names.
                br['sid'] = [stale_session.name]
                br.submit()
        return br

    def parse_index(self):
        """Build the list of ``(section title, articles)`` for the issue.

        Returns:
            A list of ``(title, [article dict, ...])`` tuples in the order
            the sections appear on the contents page. Each article dict has
            the keys ``title``, ``url``, ``description`` and ``date``.
        """
        # Find date
        soup0 = self.index_to_soup('http://www.historytoday.com/')
        date_block = soup0.find('div', attrs={'id': 'block-block-226'})
        if date_block is not None and date_block.span is not None:
            self.timefmt = u' [%s]' % self.tag_to_string(date_block.span)

        # Go to issue
        soup = self.index_to_soup('http://www.historytoday.com/contents')
        content_area = soup.find('div', attrs={'id': 'content-area'})
        cover_img = None
        if content_area is not None:
            # Only accept an image whose URL mentions "cover", so that other
            # images inside the content area are not mistaken for the cover.
            cover_img = content_area.find('img', attrs={'src': re.compile('.*cover.*')})
        # A missing cover should not abort the whole download.
        if cover_img is not None:
            self.cover_url = cover_img['src']
            self.log(self.cover_url)

        # Go to the main body
        div = soup.find('div', attrs={'class': 'region region-content-bottom'})
        if div is None:
            return []

        section_id = re.compile(r"block\-views\-contents.*")
        row_class = re.compile(r"views\-row.*")

        feeds = OrderedDict()
        for section in div.findAll('div', attrs={'id': section_id}):
            section_title = self.tag_to_string(section.find('h2', attrs={'class': 'title'}))
            sectionbody = section.find('div', attrs={'class': 'view-content'})
            if sectionbody is None:
                continue
            for article in sectionbody.findAll('div', attrs={'class': row_class}):
                subarticle = article.findAll('div')
                # A usable row has a title div followed by a description div.
                if len(subarticle) < 2:
                    continue
                title = self.tag_to_string(subarticle[0])
                originalurl = "http://www.historytoday.com" + subarticle[0].span.a['href'].strip()
                # The downloadable link is only available on the article page.
                originalpage = self.index_to_soup(originalurl)
                tools = originalpage.find('div', attrs={'id': 'ht-tools'})
                if tools is None or tools.a is None:
                    self.log.warn('No print link found, skipping: %s' % originalurl)
                    continue
                url = "http://www.historytoday.com" + tools.a['href'].strip()
                desc = self.tag_to_string(subarticle[1])
                feeds.setdefault(section_title, []).append(
                    {'title': title, 'url': url, 'description': desc, 'date': ''})

        # dict.items() works on both Python 2 and Python 3.
        return list(feeds.items())

    def cleanup(self):
        """Log out when the download finishes."""
        self.browser.open('http://www.historytoday.com/logout')
Advert | |
|
|
Similar Threads | ||||
Thread | Thread Starter | Forum | Replies | Last Post |
Request: Please update Psychology Today recipe | underwarez | Recipes | 0 | 07-04-2012 01:50 PM |
Linux Today Recipe | Pajoe | Recipes | 0 | 01-31-2012 04:38 AM |
Recipe for hindustan times and India Today | agbpatro | Recipes | 1 | 09-11-2011 05:02 PM |
West Hawaii Today Recipe may need fixing | sldavis01 | Recipes | 0 | 03-18-2011 10:09 PM |
Recipe for The World Today (Chatham House) | bleavett | Recipes | 0 | 02-09-2011 04:11 PM |