MobileRead Forums - View Single Post

sup · 10-31-2013, 10:12 AM

Quote:

Originally Posted by BetterRed

my gui does - see attachment

BR

Hm, weird. That is the switch I was trying. This is the recipe I am using; it shows different results with --change-justification and with the GUI (normally you need a password for full articles, but it should be visible even without one, I hope).

Code:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
# Copyright:    tomashnyk@gmail.com

__license__     = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__   = 'tomashnyk@gmail.com'

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup,Tag

class respektRecipe(BasicNewsRecipe):
    """Calibre news recipe for the Czech weekly Respekt (respekt.ihned.cz).

    The recipe builds its table of contents from the "current issue" page,
    optionally logs in when credentials are supplied, and post-processes
    every article to show image captions, put the perex on its own line and
    indent consecutive paragraphs.
    """

    __author__ = 'Tomáš Hnyk'
    title = u'Respekt'
    publisher = u'Respekt Publishing a. s.'
    description = u'Articles from the printed edition without translations from The Economist that are not available online'
    publication_type = 'magazine'
    encoding = 'cp1250'
    language = 'cs'

    # NOTE(review): get_browser() logs in whenever credentials are present;
    # calibre also accepts needs_subscription = 'optional' for that use case.
    # The original value is kept - confirm which one is intended.
    needs_subscription = False

    remove_javascript = True
    # Rules are separated by whitespace only: a comma between two rule sets
    # makes the following selector invalid and the rule is then discarded.
    extra_css = (
        '.image_caption {font-size: 50%;} '
        '.author {text-align:left;} '
        'p.indent_first_line {text-indent:30px;}'
    )
    remove_tags_before = dict(name='div', attrs={'class': ['l']})
    remove_tags_after = dict(id='text')
    remove_tags = [
        # NOTE(review): this entry only matches a <ul> that carries both the
        # class and the id - confirm that is what was meant.
        dict(name='ul', attrs={'class': ['tabs-d'], 'id': ['comm']}),
        dict(name='div', attrs={'class': ['slot', 'reklama', 'date']}),
        dict(name='span', attrs={'class': ['detail-vykrik']}),
        dict(name='p', attrs={'class': ['detail-vykrik']}),
        dict(name='strong', attrs={'class': ['detail-vykrik']}),
    ]
    preprocess_regexps = [
        # This makes authors left-aligned by not using the author class.
        (re.compile(r'<div class="author">', re.DOTALL | re.IGNORECASE),
         lambda match: '<div class="">'),
        # Remove empty tags.
        (re.compile(r'<strong> </strong>', re.DOTALL | re.IGNORECASE),
         lambda match: ' '),
        (re.compile(r'<strong>&nbsp;</strong>', re.DOTALL | re.IGNORECASE),
         lambda match: '&nbsp;'),
        (re.compile(r'<p></p>', re.DOTALL | re.IGNORECASE),
         lambda match: ''),
    ]

    # Page titles of the two digest sections; they consist of single
    # paragraphs only, so paragraph indentation is skipped for them.
    _SINGLE_PARAGRAPH_TITLES = (
        u"Deset českých zpráv, které by vás neměly minout | Deset českých zpráv - RESPEKT.IHNED.CZ",
        u"Deset zahraničních zpráv, které by vás neměly minout | Deset světových zpráv - RESPEKT.IHNED.CZ",
    )
    # Classes of a preceding paragraph after which the next paragraph is
    # indented even though its attributes differ.
    _FIRST_PARAGRAPH_CLASSES = (
        '01prvniodstavecrepublicblok',
        'Zkladnodstavec',
        '01titulekhlavn',
    )

    def get_cover_url(self):
        """Return the cover image URL from the front page.

        Returns None when the page has no ``div.cover`` containing an
        ``<img>`` instead of raising.
        """
        soup = self.index_to_soup('http://respekt.ihned.cz/')
        cover_div = soup.find('div', attrs={'class': 'cover'})
        if cover_div is None:
            return None
        image = cover_div.find('img')
        if image is None:
            return None
        return image['src']

    def get_browser(self):
        """Return the browser, logged in when both credentials are set."""
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://muj-ucet.ihned.cz/')
            br.select_form(name='login')
            br['login[nick]'] = self.username
            br['login[pass]'] = self.password
            br.submit()
        return br

    def parse_index(self):
        """Build the list of sections from the current-issue page.

        Returns a list of ``(section_title, articles)`` tuples in order of
        first appearance; every article is a dict with the keys ``title``,
        ``url``, ``date`` and ``author``.
        """
        soup = self.index_to_soup('http://respekt.ihned.cz/aktualni-cislo/')
        listing = soup.findAll('div', attrs={'class': 'ow-enclose'})[0]
        sections = []
        links_by_section = {}
        for article in listing.findAll('div', attrs={'class': 'ow'}):
            # The section name is the string of the element that follows
            # the literal text "(rubrika: ".
            section_title = article.find(text='(rubrika: ').findNext().string
            byline = article.findAll('span', attrs={'class': 'date-author'})[0]
            headline = article.find('h2').find('a')
            link = {
                'title': headline['title'],
                'url': headline['href'],
                # The last three characters of the leading text are dropped.
                'date': byline.contents[0][:-3],
                'author': byline.find('a').string,
            }
            if section_title not in links_by_section:
                links_by_section[section_title] = []
                sections.append((section_title, links_by_section[section_title]))
            links_by_section[section_title].append(link)
        return sections

    def cleanup(self):
        """Log out, but only when get_browser() had credentials to log in with."""
        if self.username is not None and self.password is not None:
            self.browser.open('http://muj-ucet.ihned.cz/?login[logout]=1')

    def preprocess_html(self, soup):
        """Tidy up one downloaded article and return the modified soup.

        Three independent steps are applied: image titles become visible
        captions, a line break is placed after every ``<h1>`` so that the
        perex (subheading) starts on a new line, and paragraphs are indented
        where typographically suitable.
        """
        # FIXME When BeautifulSoup 4 is available in Calibre, this should be
        # rewritten using insert_after()
        body = soup.find('div', attrs={'id': 'text'})
        if body is not None:
            self._add_image_captions(body)
        # Disabled: adding the article length in words after the author is
        # not reliable with BeautifulSoup 3. Kept for reference:
        #   tag = Tag(BeautifulSoup(),"div",[("class","")])
        #   tag.insert(0,Tag(BeautifulSoup(),"ul"))
        #   tag.ul.insert(0,Tag(BeautifulSoup(),"li"))
        #   # this will be possible with Beautiful soup 4
        #   #article_length = str(len(soup.text.split(' '))) + ' slov'
        #   article_length = str(len(u''.join(body.findAll(text=True)).split(' '))) + ' slov'
        #   # BeautifulSoup 4
        #   #tag.ul.li.string = article_length
        #   tag.ul.li.insert(0,article_length)
        #   head[0].parent.insert(head[0].parent.contents.index(head[0]) - 1,tag)
        self._break_after_headings(soup)
        self._indent_paragraphs(soup)
        return soup

    def _add_image_captions(self, body):
        """Insert a ``p.image_caption`` after each titled <img> child of body."""
        captions = []
        for index, element in enumerate(body):
            # Text nodes have no tag name; only direct <img> children count.
            if getattr(element, 'name', None) != 'img':
                continue
            caption_text = element.get('title')
            if caption_text is not None:
                captions.append((index + 1, caption_text))
        # Insert from the end so that the recorded positions stay valid.
        for position, caption_text in reversed(captions):
            tag = Tag(BeautifulSoup(), "p", [("class", "image_caption")])
            tag.insert(0, caption_text)
            body.insert(position, tag)

    def _break_after_headings(self, soup):
        """Put a <br> right after every <h1> so the perex starts a new line."""
        for heading in soup.findAll('h1'):
            parent = heading.parent
            position = parent.contents.index(heading)
            parent.insert(position + 1, BeautifulSoup("<br>"))

    def _indent_paragraphs(self, soup):
        """Add the ``indent_first_line`` class to paragraphs that continue text."""
        title_tag = soup.find('title')
        if title_tag is not None and title_tag.string in self._SINGLE_PARAGRAPH_TITLES:
            return
        page_body = soup.find('body')
        if page_body is None:
            return
        # The very first paragraph is never indented. Walking backwards means
        # each paragraph is compared with a predecessor whose class has not
        # been modified yet.
        for par in reversed(page_body('p')[1:]):
            previous = par.findPreviousSibling()
            if previous is None:
                continue
            # <strong> in this or previous paragraph means no indent needed
            if par.find('strong') or previous.find('strong'):
                continue
            # Either indent if the paragraphs are the same, or else if the
            # first paragraph of the text was special.
            same_kind = previous.attrs == par.attrs
            after_special = previous.get('class') in self._FIRST_PARAGRAPH_CLASSES
            if not (same_kind or after_special):
                continue
            existing = par.get('class')
            if existing is not None:
                par['class'] = existing + " indent_first_line"
            else:
                par['class'] = "indent_first_line"