


11-09-2010, 01:34 AM   #1
motorro
Junior Member
Posts: 5
Karma: 10
Join Date: Oct 2010
Location: Russia, Moscow
Device: Kindle v3

Ведомости recipe


Here is another Russian recipe, this time for the "Ведомости" (Vedomosti) newspaper.
The recipe uses the "current issue" feed and follows the same style as the "Lenta" recipe: each item's category becomes the article's section.
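If you are curious how the sectioning works: each RSS item carries one or more category tags, and parse_index files the article under a virtual feed named after each tag's term (items without a category go to a "_default" feed). A minimal sketch of the idea, using the same feedparser call the recipe makes:

from calibre.web.feeds.feedparser import parse

feed = parse('http://www.vedomosti.ru/newspaper/out/rss.xml')
for item in feed.entries:
    for tag in item.get('tags', []):
        #Each term (e.g. u'Первая полоса') becomes a section name
        print item.get('title', ''), '->', tag.get('term', '')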

Note: the recipe uses a custom logo for the newspaper. It is currently hosted on my private server, which is not very reliable, so you may want to move the picture to a location of your own and correct the masthead_url and cover_url values accordingly.
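For example, once you have hosted a copy of the logo somewhere (the URL below is just a placeholder), point both values at it:

masthead_url = u'http://example.com/img/ved_logo.gif'
cover_url = u'http://example.com/img/ved_logo.gif'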

Best regards,
motorro

#!/usr/bin/env python

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds.feedparser import parse
from calibre.ebooks.BeautifulSoup import Tag

class VedomostiRecipe(BasicNewsRecipe):
    title = u'Ведомости'
    __author__ = 'Nikolai Kotchetkov'
    publisher = ''
    category = 'press, Russia'
    description = u'Ежедневная деловая газета'
    oldest_article = 3
    max_articles_per_feed = 100
    #Logo currently lives on my private server - see the note above about moving it
    masthead_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'
    cover_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'

    #Add feed names if you want them to be sorted (feeds of this list appear first)
    sortOrder = [u'_default', u'Первая полоса', u'Власть и деньги']

    encoding = 'cp1251'
    language = 'ru'
    no_stylesheets = True
    remove_javascript = True
    recursions = 0
    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        }

    keep_only_tags = [dict(name='td', attrs={'class' : ['second_content']})]
    remove_tags_after = [dict(name='div', attrs={'class' : 'article_text'})]
    remove_tags = [dict(name='div', attrs={'class' : ['sep', 'choice', 'articleRightTbl']})]
    #"Current issue" feed
    feeds = [u'http://www.vedomosti.ru/newspaper/out/rss.xml']

    #base URL for relative links
    base_url = u'http://www.vedomosti.ru'
    extra_css = 'h1 {font-size: 1.5em; margin: 0em 0em 0em 0em; text-align: center;}'\
                'h2 {font-size: 1.0em; margin: 0em 0em 0em 0em;}'\
                'h3 {font-size: 0.8em; margin: 0em 0em 0em 0em;}'\
                '.article_date {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'\
                '.article_authors {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'\
                '.article_img {width:100%; text-align: center; padding: 3px 3px 3px 3px;}'\
                '.article_img_desc {width:100%; text-align: center; font-size: 0.5em; color: gray; font-family: monospace;}'\
                '.article_desc {font-size: 1em; font-style:italic;}'  

    def parse_index(self):
        try:
            feedData = parse(self.feeds[0])
            if not feedData:
                raise NotImplementedError
            self.log("parse_index: Feed loaded successfully.")
            if feedData.feed.has_key('title'):
                self.title = feedData.feed.title
                self.log("parse_index: Title updated to: ", self.title)
            if feedData.feed.has_key('description'):
                self.description = feedData.feed.description
                self.log("parse_index: Description updated to: ", self.description)

            feeds = {}

            #Returns the article list of a virtual feed, creating the feed if needed
            def get_virtual_feed_articles(feed):
                if feeds.has_key(feed):
                    return feeds[feed][1]
                self.log("Adding new feed: ", feed)
                articles = []
                feeds[feed] = (feed, articles)
                return articles

            #Iterate feed items and distribute articles using tags
            for item in feedData.entries:
                link = item.get('link', '')
                title = item.get('title', '')
                if '' == link or '' == title:
                    continue
                article = {'title' : title, 'url' : link, 'description' : item.get('description', ''), 'date' : item.get('date', ''), 'content' : ''}
                if not item.has_key('tags'):
                    #No categories - file the article under the default feed
                    get_virtual_feed_articles('_default').append(article)
                    continue
                addedToDefault = False
                for tag in item.tags:
                    term = tag.get('term', '')
                    if '' == term:
                        #Empty category - file under the default feed, but only once
                        if (not addedToDefault):
                            get_virtual_feed_articles('_default').append(article)
                            addedToDefault = True
                        continue
                    get_virtual_feed_articles(term).append(article)

            #Get feed list
            #Select sorted feeds first of all
            result = []
            for feedName in self.sortOrder:
                if (not feeds.has_key(feedName)): continue
                result.append(feeds[feedName])
                del feeds[feedName]

            result = result + feeds.values()

            return result

        except Exception, err:
            self.log(err)
            raise NotImplementedError
    def preprocess_html(self, soup):
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first_fetch):
        self.log('Original: ', soup.prettify())
        #Find article
        contents = soup.find('div', {'class':['article_text']})
        if not contents:
            self.log('postprocess_html: article div not found!')
            return soup

        #Find title
        title = soup.find('h1')
        if title:
            contents.insert(0, title)
        #Find article image
        newstop = soup.find('div', {'class':['newstop']})
        if newstop:
            img = newstop.find('img')
            if img:
                imgDiv = Tag(soup, 'div')
                imgDiv['class'] = 'article_img'
                #Remove hard-coded dimensions so the picture scales on the reader
                if img.has_key('width'):
                    del(img['width'])
                if img.has_key('height'):
                    del(img['height'])

                #find description
                element = img.parent.nextSibling

                imgDiv.insert(0, img)
                while element:
                    #Skip text nodes (e.g. whitespace) between tags
                    if not isinstance(element, Tag):
                        element = element.nextSibling
                        continue
                    nextElement = element.nextSibling
                    #Caption paragraphs following the image become the description
                    if 'p' == element.name:
                        element['class'] = 'article_img_desc'
                        imgDiv.insert(len(imgDiv.contents), element)
                    element = nextElement
                contents.insert(1, imgDiv)
        #find article abstract
        abstract = soup.find('p', {'class':['subhead']})
        if abstract:
            abstract['class'] = 'article_desc'
            contents.insert(2, abstract)

        #Find article authors (note: 'autors' is the spelling used in the site markup)
        authorsDiv = soup.find('div', {'class':['autors']})
        if authorsDiv:
            authorsP = authorsDiv.find('p')
            if authorsP:
                authorsP['class'] = 'article_authors'
                contents.insert(len(contents.contents), authorsP)
        #Fix urls that use relative path
        urls = contents.findAll('a')
        if urls:
            for url in urls:
                if not url.has_key('href'):
                    continue
                if '/' == url['href'][0]:
                    url['href'] = self.base_url + url['href']

        #Replace the page body with the cleaned-up article contents
        body = soup.find('td', {'class':['second_content']})
        if body:
            body.replaceWith(contents)

        self.log('Result: ', soup.prettify())
        return soup
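To try the recipe without touching the GUI, save it as vedomosti.recipe and let ebook-convert build the issue directly (MOBI shown here for the Kindle):

ebook-convert vedomosti.recipe vedomosti.mobi

Or add it through Fetch news > Add a custom news source in calibre.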
