View Single Post
Old 01-17-2013, 09:54 PM   #5
rainrdx
Connoisseur
rainrdx can differentiate black from dark navy blue
 
Posts: 55
Karma: 13316
Join Date: Jul 2012
Device: iPad
Thanks for notifying me of the issue. BW changed the page a little bit. Here is the fix:

Code:
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from collections import OrderedDict

class BusinessWeekMagazine(BasicNewsRecipe):
    """Calibre recipe that assembles the current Business Week Magazine issue.

    The issue index page is scraped for article headings in its left and
    center columns; each article page is then fetched to locate the link
    to its print version, which is what gets downloaded.
    """

    title = 'Business Week Magazine'
    __author__ = 'Rick Shang'

    description = 'A renowned business publication. Business news, trends and profiles of successful businesspeople.'
    language = 'en'
    category = 'news'
    encoding = 'UTF-8'
    keep_only_tags = [
        dict(name='div', attrs={'id': 'article_body_container'}),
    ]
    # NOTE(review): 'ui' is not an HTML element name; this may have been
    # meant to be 'ul'. Left unchanged because the intent cannot be confirmed.
    remove_tags = [
        dict(name='ui'),
        dict(name='li'),
        dict(name='div', attrs={'id': ['share-email']}),
    ]
    no_javascript = True
    no_stylesheets = True

    cover_url = 'http://images.businessweek.com/mz/covers/current_120x160.jpg'

    def parse_index(self):
        """Build the list of feeds for the current issue.

        Returns:
            A list of ``(section_title, articles)`` tuples in the order the
            sections were first encountered. Each article is a dict with the
            keys ``title``, ``url``, ``description`` and ``date``.
        """
        # Go to the issue
        soup = self.index_to_soup('http://www.businessweek.com/magazine/news/articles/business_news.htm')

        # Find date: the first <h3> following the "Magazine" heading.
        mag = soup.find('h2', text='Magazine')
        self.log(mag)
        date_tag = mag.findNext('h3') if mag is not None else None
        if date_tag is not None:
            dates = self.tag_to_string(date_tag)
            self.timefmt = u' [%s]' % dates

        # An ordered mapping keeps sections in page order while merging
        # articles that share a section title.
        feeds = OrderedDict()
        # Left column: <h4> headings, no description available.
        self._collect_column(soup, 'column left', 'h4', feeds, with_description=False)
        # Center column: <h5> headings followed by a <p> description.
        self._collect_column(soup, 'column center', 'h5', feeds, with_description=True)

        return list(feeds.items())

    def _collect_column(self, soup, column_class, heading_name, feeds, with_description):
        """Append the articles found in one column of the index to ``feeds``.

        Args:
            soup: Parsed index page.
            column_class: ``class`` attribute value of the column ``<div>``.
            heading_name: Tag name of the headings that wrap article links.
            feeds: Mapping of section title to article list, updated in place.
            with_description: When true, the text of the next ``<p>`` after
                the heading is used as the article description.
        """
        column = soup.find('div', attrs={'class': column_class})
        if column is None:
            # Page layout changed; skip this column instead of crashing.
            self.log('Could not find index column: %s' % column_class)
            return

        for heading in column.findAll(heading_name):
            link = heading.a
            if link is None or not link.get('href'):
                # Heading without a usable link is not an article entry.
                continue

            title = self.tag_to_string(link).strip()
            urlprint = self._find_print_url(link['href'])
            if urlprint is None:
                self.log('No print version found, skipping article: %s' % title)
                continue

            desc = ''
            if with_description:
                desc = self.tag_to_string(heading.findNext('p')).strip()

            # The section an article belongs to is the closest preceding <h3>.
            section_title = self.tag_to_string(heading.findPrevious('h3')).strip()
            feeds.setdefault(section_title, []).append(
                {'title': title, 'url': urlprint, 'description': desc, 'date': ''}
            )

    def _find_print_url(self, url):
        """Return the print-version URL linked from the article at ``url``.

        Returns ``None`` when the article page has no print link.
        """
        article_soup = self.index_to_soup(url)
        print_item = article_soup.find('li', attrs={'class': 'print tracked'})
        if print_item is None or print_item.a is None:
            return None
        return print_item.a.get('href')
rainrdx is offline   Reply With Quote