![]() |
#1 |
Connoisseur
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Business Week Magazine
Unlike the recipes already in the inventory, this one does not read from the RSS feed of Business Week news. It replicates the weekly print magazine.
Code:
import re  # unused in this version; kept to preserve the original imports
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class BusinessWeekMagazine(BasicNewsRecipe):
    """Replicate the weekly Business Week print magazine.

    Unlike the RSS-based recipes, this scrapes the magazine's issue
    index page so the download matches the weekly print edition.
    """

    title = 'Business Week Magazine'
    __author__ = 'Rick Shang'
    description = 'A renowned business publication. Business news, trends and profiles of successful businesspeople.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        dict(name='div', attrs={'id': 'article_body_container'}),
    ]
    remove_tags = [dict(name='ui'), dict(name='li')]
    no_javascript = True
    no_stylesheets = True
    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

    def parse_index(self):
        """Build the [(section_title, [article, ...]), ...] list for one issue."""
        # Go to the current issue's index page.
        soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm')

        # Find the issue date and append it to the ebook title.
        mag = soup.find('h2', text='Magazine')
        self.log(mag)
        dates = self.tag_to_string(mag.findNext('h3'))
        self.timefmt = u' [%s]' % dates

        # Left column of the issue index (plain article links).
        div0 = soup.find('div', attrs={'class': 'column left'})
        section_title = ''
        feeds = OrderedDict()
        for div in div0.findAll('a'):
            articles = []
            # Each link's section is the nearest preceding <h3> heading.
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div).strip()
            url = div['href']
            soup0 = self.index_to_soup(url)
            # Prefer the printer-friendly page; fall back to the article page
            # itself when no print link exists (previously this raised
            # AttributeError and aborted the whole download).
            printlink = soup0.find('li', attrs={'class': 'print'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # Center column of the issue index (these articles carry a teaser
        # paragraph used as the description).
        div1 = soup.find('div', attrs={'class': 'column center'})
        section_title = ''
        for div in div1.findAll('a'):
            articles = []
            desc = self.tag_to_string(div.findNext('p')).strip()
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div).strip()
            url = div['href']
            soup0 = self.index_to_soup(url)
            printlink = soup0.find('li', attrs={'class': 'print'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # items() works on both Python 2 and 3; iteritems() is py2-only and
        # raises AttributeError under the Python 3 builds of calibre.
        return list(feeds.items())
![]() |
![]() |
![]() |
#2 |
Connoisseur
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Update:
Bug Fix Code:
import re  # unused in this version; kept to preserve the original imports
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class BusinessWeekMagazine(BasicNewsRecipe):
    """Replicate the weekly Business Week print magazine.

    Bug-fix revision: article links are now located via their <h4>/<h5>
    headline wrappers instead of every <a> on the page.
    """

    title = 'Business Week Magazine'
    __author__ = 'Rick Shang'
    description = 'A renowned business publication. Business news, trends and profiles of successful businesspeople.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        dict(name='div', attrs={'id': 'article_body_container'}),
    ]
    remove_tags = [dict(name='ui'), dict(name='li')]
    no_javascript = True
    no_stylesheets = True
    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

    def parse_index(self):
        """Build the [(section_title, [article, ...]), ...] list for one issue."""
        # Go to the current issue's index page.
        soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm')

        # Find the issue date and append it to the ebook title.
        mag = soup.find('h2', text='Magazine')
        self.log(mag)
        dates = self.tag_to_string(mag.findNext('h3'))
        self.timefmt = u' [%s]' % dates

        # Left column: headlines are wrapped in <h4> tags.
        div0 = soup.find('div', attrs={'class': 'column left'})
        section_title = ''
        feeds = OrderedDict()
        for div in div0.findAll('h4'):
            articles = []
            # Each headline's section is the nearest preceding <h3>.
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            # Prefer the printer-friendly page; fall back to the article page
            # itself when no print link exists (previously this raised
            # AttributeError and aborted the whole download).
            printlink = soup0.find('li', attrs={'class': 'print'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # Center column: headlines are wrapped in <h5> tags and carry a
        # teaser paragraph used as the description.
        div1 = soup.find('div', attrs={'class': 'column center'})
        section_title = ''
        for div in div1.findAll('h5'):
            articles = []
            desc = self.tag_to_string(div.findNext('p')).strip()
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            printlink = soup0.find('li', attrs={'class': 'print'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # items() works on both Python 2 and 3; iteritems() is py2-only.
        return list(feeds.items())
![]() |
![]() |
Advert | |
|
![]() |
#3 |
Connoisseur
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Update:
Polished the article layout Code:
import re  # unused in this version; kept to preserve the original imports
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class BusinessWeekMagazine(BasicNewsRecipe):
    """Replicate the weekly Business Week print magazine.

    Layout-polish revision: also strips the share-by-email widget from
    the article body.
    """

    title = 'Business Week Magazine'
    __author__ = 'Rick Shang'
    description = 'A renowned business publication. Business news, trends and profiles of successful businesspeople.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        dict(name='div', attrs={'id': 'article_body_container'}),
    ]
    remove_tags = [
        dict(name='ui'),
        dict(name='li'),
        dict(name='div', attrs={'id': ['share-email']}),
    ]
    no_javascript = True
    no_stylesheets = True
    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

    def parse_index(self):
        """Build the [(section_title, [article, ...]), ...] list for one issue."""
        # Go to the current issue's index page.
        soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm')

        # Find the issue date and append it to the ebook title.
        mag = soup.find('h2', text='Magazine')
        self.log(mag)
        dates = self.tag_to_string(mag.findNext('h3'))
        self.timefmt = u' [%s]' % dates

        # Left column: headlines are wrapped in <h4> tags.
        div0 = soup.find('div', attrs={'class': 'column left'})
        section_title = ''
        feeds = OrderedDict()
        for div in div0.findAll('h4'):
            articles = []
            # Each headline's section is the nearest preceding <h3>.
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            # Prefer the printer-friendly page; fall back to the article page
            # itself when no print link exists (previously this raised
            # AttributeError and aborted the whole download).
            printlink = soup0.find('li', attrs={'class': 'print'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # Center column: headlines are wrapped in <h5> tags and carry a
        # teaser paragraph used as the description.
        div1 = soup.find('div', attrs={'class': 'column center'})
        section_title = ''
        for div in div1.findAll('h5'):
            articles = []
            desc = self.tag_to_string(div.findNext('p')).strip()
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            printlink = soup0.find('li', attrs={'class': 'print'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # items() works on both Python 2 and 3; iteritems() is py2-only.
        return list(feeds.items())
![]() |
![]() |
![]() |
#4 |
Connoisseur
![]() Posts: 83
Karma: 10
Join Date: Aug 2009
Device: iphone, Irex iliad, sony prs950, kindle Dx, Ipad
|
How do I use it? It's getting an error and not downloading anything.
thanks for efforts |
![]() |
![]() |
![]() |
#5 |
Connoisseur
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Thanks for notifying me of the issue. BW changed the page a little bit. This is the fix:
Code:
import re  # unused in this version; kept to preserve the original imports
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class BusinessWeekMagazine(BasicNewsRecipe):
    """Replicate the weekly Business Week print magazine.

    Site-change revision: the print link's class changed from 'print'
    to 'print tracked'.
    """

    title = 'Business Week Magazine'
    __author__ = 'Rick Shang'
    description = 'A renowned business publication. Business news, trends and profiles of successful businesspeople.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        dict(name='div', attrs={'id': 'article_body_container'}),
    ]
    remove_tags = [
        dict(name='ui'),
        dict(name='li'),
        dict(name='div', attrs={'id': ['share-email']}),
    ]
    no_javascript = True
    no_stylesheets = True
    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

    def parse_index(self):
        """Build the [(section_title, [article, ...]), ...] list for one issue."""
        # Go to the current issue's index page.
        soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm')

        # Find the issue date and append it to the ebook title.
        mag = soup.find('h2', text='Magazine')
        self.log(mag)
        dates = self.tag_to_string(mag.findNext('h3'))
        self.timefmt = u' [%s]' % dates

        # Left column: headlines are wrapped in <h4> tags.
        div0 = soup.find('div', attrs={'class': 'column left'})
        section_title = ''
        feeds = OrderedDict()
        for div in div0.findAll('h4'):
            articles = []
            # Each headline's section is the nearest preceding <h3>.
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            # Prefer the printer-friendly page; fall back to the article page
            # itself when no print link exists (previously this raised
            # AttributeError and aborted the whole download).
            printlink = soup0.find('li', attrs={'class': 'print tracked'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # Center column: headlines are wrapped in <h5> tags and carry a
        # teaser paragraph used as the description.
        div1 = soup.find('div', attrs={'class': 'column center'})
        section_title = ''
        for div in div1.findAll('h5'):
            articles = []
            desc = self.tag_to_string(div.findNext('p')).strip()
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            printlink = soup0.find('li', attrs={'class': 'print tracked'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # items() works on both Python 2 and 3; iteritems() is py2-only.
        return list(feeds.items())
![]() |
![]() |
Advert | |
|
![]() |
#6 |
Connoisseur
![]() Posts: 83
Karma: 10
Join Date: Aug 2009
Device: iphone, Irex iliad, sony prs950, kindle Dx, Ipad
|
Thanks, it's working now.
|
![]() |
![]() |
![]() |
#7 |
Zealot
![]() ![]() ![]() ![]() Posts: 143
Karma: 387
Join Date: Sep 2010
Device: Kindle 3
|
I am sorry, it stopped working for me as of this week, I am getting this error log
Spoiler:
Thanks for looking at it. Cheers, Mixx |
![]() |
![]() |
![]() |
#8 |
Connoisseur
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
|
![]() |
![]() |
![]() |
#9 |
Zealot
![]() ![]() ![]() ![]() Posts: 143
Karma: 387
Join Date: Sep 2010
Device: Kindle 3
|
Now I am and now it is working again. Thanks a million and I apologize for the bother. I should have tried the latest version first.
![]() Thanxx, Mixx |
![]() |
![]() |
![]() |
#10 |
Connoisseur
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
|
![]() |
![]() |
![]() |
#11 |
Zealot
![]() ![]() ![]() ![]() Posts: 143
Karma: 387
Join Date: Sep 2010
Device: Kindle 3
|
Thank you for that, Rainrdx, much appreciated!
I certainly enjoy this recipe very much! Thanxx for making it available! Cheers, Mixx |
![]() |
![]() |
![]() |
#12 |
Connoisseur
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Update: Fixes the missing article issue
Code:
import re  # unused in this version; kept to preserve the original imports
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class BusinessWeekMagazine(BasicNewsRecipe):
    """Replicate the weekly Business Week print magazine.

    Missing-article fix: both columns now scan <h4> AND <h5> headline
    wrappers, so no article style is skipped.
    """

    title = 'Business Week Magazine'
    __author__ = 'Rick Shang'
    description = 'A renowned business publication. Business news, trends and profiles of successful businesspeople.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        dict(name='div', attrs={'id': 'article_body_container'}),
    ]
    remove_tags = [
        dict(name='ui'),
        dict(name='li'),
        dict(name='div', attrs={'id': ['share-email']}),
    ]
    no_javascript = True
    no_stylesheets = True
    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

    def parse_index(self):
        """Build the [(section_title, [article, ...]), ...] list for one issue."""
        # Go to the current issue's index page.
        soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm')

        # Find the issue date and append it to the ebook title.
        mag = soup.find('h2', text='Magazine')
        self.log(mag)
        dates = self.tag_to_string(mag.findNext('h3'))
        self.timefmt = u' [%s]' % dates

        # Left column: headlines may be wrapped in either <h4> or <h5>.
        div0 = soup.find('div', attrs={'class': 'column left'})
        section_title = ''
        feeds = OrderedDict()
        for div in div0.findAll(['h4', 'h5']):
            articles = []
            # Each headline's section is the nearest preceding <h3>.
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            # Prefer the printer-friendly page; fall back to the article page
            # itself when no print link exists (previously this raised
            # AttributeError and aborted the whole download).
            printlink = soup0.find('li', attrs={'class': 'print tracked'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # Center column: same headline tags, plus a teaser paragraph used
        # as the description.
        div1 = soup.find('div', attrs={'class': 'column center'})
        section_title = ''
        for div in div1.findAll(['h4', 'h5']):
            articles = []
            desc = self.tag_to_string(div.findNext('p')).strip()
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            printlink = soup0.find('li', attrs={'class': 'print tracked'})
            if printlink is not None:
                url = printlink.a['href']
            articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # items() works on both Python 2 and 3; iteritems() is py2-only.
        return list(feeds.items())
![]() |
![]() |
![]() |
#13 |
Connoisseur
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Update: Fixes due to minor changes in the website.
Now I've remembered to use the most recent code. Code:
import re  # REQUIRED: this version uses re.compile below but the original
           # post dropped the import, causing a NameError at download time.
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class BusinessWeekMagazine(BasicNewsRecipe):
    """Replicate the weekly Business Week print magazine.

    Site-change revision: the printer-friendly link is now found by
    matching 'printer' anywhere in an anchor's href.
    """

    title = 'Business Week Magazine'
    __author__ = 'Rick Shang'
    description = 'A renowned business publication. Business news, trends and profiles of successful businesspeople.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        dict(name='div', attrs={'id': 'article_body_container'}),
    ]
    remove_tags = [
        dict(name='ui'),
        dict(name='li'),
        dict(name='div', attrs={'id': ['share-email']}),
    ]
    no_javascript = True
    no_stylesheets = True
    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

    def parse_index(self):
        """Build the [(section_title, [article, ...]), ...] list for one issue."""
        # Go to the current issue's index page.
        soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm')

        # Find the issue date and append it to the ebook title.
        mag = soup.find('h2', text='Magazine')
        self.log(mag)
        dates = self.tag_to_string(mag.findNext('h3'))
        self.timefmt = u' [%s]' % dates

        # Left column: headlines may be wrapped in either <h4> or <h5>.
        div0 = soup.find('div', attrs={'class': 'column left'})
        section_title = ''
        feeds = OrderedDict()
        for div in div0.findAll(['h4', 'h5']):
            articles = []
            # Each headline's section is the nearest preceding <h3>.
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            # Prefer the printer-friendly page; fall back to the article page
            # itself when no printer link exists (previously the unguarded
            # ['href'] subscript raised TypeError and aborted the download).
            printlink = soup0.find('a', attrs={'href': re.compile('.*printer.*')})
            if printlink is not None:
                url = printlink['href']
            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # Center column: same headline tags, plus a teaser paragraph used
        # as the description.
        div1 = soup.find('div', attrs={'class': 'column center'})
        section_title = ''
        for div in div1.findAll(['h4', 'h5']):
            articles = []
            desc = self.tag_to_string(div.findNext('p')).strip()
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div.a).strip()
            url = div.a['href']
            soup0 = self.index_to_soup(url)
            printlink = soup0.find('a', attrs={'href': re.compile('.*printer.*')})
            if printlink is not None:
                url = printlink['href']
            articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # items() works on both Python 2 and 3; iteritems() is py2-only.
        return list(feeds.items())
![]() |
![]() |
![]() |
#14 |
Connoisseur
![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() ![]() Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
|
Update: fixes for the stupid website changes..
Code:
import re
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe


class BusinessWeekMagazine(BasicNewsRecipe):
    """Replicate the weekly Business Week print magazine.

    Site-change revision: article bodies may live in 'story_body'; the
    left column is scanned for unstyled anchors, and the printer link is
    optional (falls back to the article page when absent).
    """

    title = 'Business Week Magazine'
    __author__ = 'Rick Shang'
    description = 'A renowned business publication. Business news, trends and profiles of successful businesspeople.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        dict(name='div', attrs={'id': ['article_body_container', 'story_body']}),
    ]
    remove_tags = [
        dict(name='ui'),
        dict(name='li'),
        dict(name='div', attrs={'id': ['share-email']}),
    ]
    no_javascript = True
    no_stylesheets = True
    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

    def parse_index(self):
        """Build the [(section_title, [article, ...]), ...] list for one issue."""
        # Go to the current issue's index page.
        soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm')

        # Find the issue date and append it to the ebook title.
        mag = soup.find('h2', text='Magazine')
        dates = self.tag_to_string(mag.findNext('h3'))
        self.timefmt = u' [%s]' % dates

        # Left column: article links are the anchors with no class attribute.
        div0 = soup.find('div', attrs={'class': 'column left'})
        section_title = ''
        feeds = OrderedDict()
        for div in div0.findAll('a', attrs={'class': None}):
            articles = []
            # Each link's section is the nearest preceding <h3>.
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div).strip()
            url = div['href']
            soup0 = self.index_to_soup(url)
            # Prefer the printer-friendly page when one exists; otherwise
            # keep the original article URL.
            urlprint = soup0.find('a', attrs={'href': re.compile('.*printer.*')})
            if urlprint is not None:
                url = urlprint['href']
            articles.append({'title': title, 'url': url, 'description': '', 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # Center column: every anchor, with a teaser paragraph used as the
        # description.
        div1 = soup.find('div', attrs={'class': 'column center'})
        section_title = ''
        for div in div1.findAll('a'):
            articles = []
            desc = self.tag_to_string(div.findNext('p')).strip()
            section_title = self.tag_to_string(div.findPrevious('h3')).strip()
            title = self.tag_to_string(div).strip()
            url = div['href']
            soup0 = self.index_to_soup(url)
            urlprint = soup0.find('a', attrs={'href': re.compile('.*printer.*')})
            if urlprint is not None:
                url = urlprint['href']
            articles.append({'title': title, 'url': url, 'description': desc, 'date': ''})
            if articles:
                if section_title not in feeds:
                    feeds[section_title] = []
                feeds[section_title] += articles

        # items() works on both Python 2 and 3; iteritems() is py2-only and
        # raises AttributeError under the Python 3 builds of calibre.
        return list(feeds.items())
![]() |
![]() |
![]() |
#15 |
Junior Member
![]() Posts: 6
Karma: 10
Join Date: Mar 2013
Device: Kindle Touch
|
Not working again. Any help is greatly appreciated. Thanks.
|
![]() |
![]() |
![]() |
|
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
The Week magazine | anleva | Recipes | 5 | 01-01-2012 03:47 PM |
(Business Week) Bookstores closed not because of poor sales | Ryvyan | General Discussions | 18 | 11-27-2011 04:21 PM |
Business Week is caotic after HTML5 article | dino_hsu_1019 | Recipes | 0 | 08-13-2011 11:59 AM |
Business Week - Cell Phones take on e-Readers | =X= | News | 15 | 01-06-2009 11:09 AM |
Business Week lukewarm on e-books | VillageReader | Lounge | 1 | 08-29-2007 05:58 AM |