View Single Post
Old 07-04-2010, 02:48 AM   #2244
rty
Zealot
rty got an A in P-Chem.
 
Posts: 108
Karma: 6066
Join Date: Apr 2010
Location: Singapore
Device: iPad Air, Kindle DXG, Kindle Paperwhite
Quote:
Originally Posted by dwanthny View Post
He has the right file. When you try to load this recipe from your file you get:
Ok, I'll take a look at it tonight when I reach home.

The content of the recipe is below:

Spoiler:

Code:
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1278063072(BasicNewsRecipe):
    """Calibre recipe for the Toronto edition of Singtao Daily (news.singtao.ca).

    The article list is built by scraping a fixed set of section index pages
    (see ``parse_index``); only the title/body containers named in
    ``keep_only_tags`` are kept from each article page.
    """

    title = u'Singtao Daily - Canada'
    oldest_article = 7
    max_articles_per_feed = 100
    __author__ = 'rty'
    description = 'Toronto Canada Chinese Newspaper'
    publisher = 'news.singtao.ca'
    category = 'Chinese, News, Canada'
    remove_javascript = True
    use_embedded_content = False
    no_stylesheets = True
    # NOTE(review): 'cn-HK' does not look like a standard language tag
    # (Chinese is normally 'zh'); left unchanged -- confirm what calibre expects.
    language = 'cn-HK'
    conversion_options = {'linearize_tables': True}
    masthead_url = 'http://news.singtao.ca/i/site_2009/logo.jpg'
    extra_css = '''
    	@font-face {font-family: "DroidFont", serif, sans-serif; src: url(res:///system/fonts/DroidSansFallback.ttf); }\n
	body {text-align: justify; margin-right: 8pt; font-family: 'DroidFont', serif;}\n
                    h1 {font-family: 'DroidFont', serif;}\n
                    .articledescription {font-family: 'DroidFont', serif;}
            '''
    keep_only_tags = [
        dict(name='div', attrs={'id': ['title', 'storybody']}),
        dict(name='div', attrs={'class': 'content'}),
    ]

    def parse_index(self):
        """Build the feed list from the hard-coded section index pages.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is the list produced by ``parse_section``. Sections that
        yield no articles are left out.
        """
        sections = [
            ('Editorial', 'http://news.singtao.ca/toronto/editorial.html'),
            ('Toronto 城市/社區', 'http://news.singtao.ca/toronto/city.html'),
            ('Canada 加國', 'http://news.singtao.ca/toronto/canada.html'),
            ('Entertainment', 'http://news.singtao.ca/toronto/entertainment.html'),
            ('World', 'http://news.singtao.ca/toronto/world.html'),
            ('Finance 國際財經', 'http://news.singtao.ca/toronto/finance.html'),
            ('Sports', 'http://news.singtao.ca/toronto/sports.html'),
        ]
        feeds = []
        for title, url in sections:
            articles = self.parse_section(url)
            if articles:
                feeds.append((title, articles))
        return feeds

    def parse_section(self, url):
        """Return the articles linked from one section index page.

        Each article is a dict with ``title``, ``url`` and an empty
        ``description``. Returns an empty list when the page has no element
        with one of the expected news-list classes, so that one missing or
        restructured section does not abort the whole download.
        """
        soup = self.index_to_soup(url)
        container = soup.find(attrs={'class': ['newslist paddingL10T10', 'newslist3 paddingL10T10']})
        if container is None:
            # Page layout differs from what this recipe expects; skip the
            # section instead of failing on None.findAll.
            self.log('No article list found at', url)
            return []

        current_articles = []
        for li in container.findAll('li'):
            a = li.find('a', href=True)
            if a is None:
                continue
            title = self.tag_to_string(a)
            # Use a separate name so the section URL parameter is not shadowed.
            article_url = a.get('href', False)
            if not article_url or not title:
                continue
            if article_url.startswith('/'):
                # Site-relative link: prefix the site root.
                article_url = 'http://news.singtao.ca' + article_url
            current_articles.append({'title': title, 'url': article_url, 'description': ''})

        return current_articles

    def preprocess_html(self, soup):
        """Strip inline ``style`` and ``width`` attributes from every tag and return the soup."""
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(width=True):
            del item['width']
        return soup
rty is offline