View Single Post
Old 05-29-2011, 04:01 AM   #1
song2
Junior Member
song2 began at the beginning.
 
Posts: 2
Karma: 10
Join Date: May 2011
Device: Kindle
Recipes for "Dilema Veche" and "Observatorul Cultural" - weekly Romanian magazines

After three months of using them without any major problems, I'm ready to make them public:

Dilema Veche.

Code:
class DilemaVeche(BasicNewsRecipe):
    """Calibre recipe for the weekly Romanian magazine "Dilema Veche".

    Articles are collected from the per-section RSS feeds listed in
    ``feeds``. The cover is looked up on the site's home page, with the
    site logo used as a last resort (see ``get_cover_url``).
    """

    # Comes out on Fridays, later in the afternoon; it probably depends on
    # Luiza (who is signed as the creator of every article in the RSS feed).
    title = u'Dilema Veche'
    __author__ = 'song2'  # inspired by the Le Monde recipe
    description = '"Sint vechi, domnule!" (I.L. Caragiale)'
    publisher = 'Adevarul Holding'
    oldest_article = 7
    max_articles_per_feed = 200
    encoding = 'utf8'
    language = 'ro'
    masthead_url = 'http://www.dilemaveche.ro/sites/all/themes/dilema/theme/dilema_two/layouter/dilema_two_homepage/logo.png'
    publication_type = 'magazine'
    cover_margins = (10, 15, '#ffffff')

    feeds = [
        ('Editoriale si opinii - Situatiunea', 'http://www.dilemaveche.ro/taxonomy/term/37/0/feed'),
        ('Editoriale si opinii - Pe ce lume traim', 'http://www.dilemaveche.ro/taxonomy/term/38/0/feed'),
        ('Editoriale si opinii - Bordeie si obiceie', 'http://www.dilemaveche.ro/taxonomy/term/44/0/feed'),
        # NOTE(review): same URL (term/44) as 'Bordeie si obiceie' above;
        # looks like a copy/paste slip - confirm the correct term id.
        ('Editoriale si opinii - Talc Show', 'http://www.dilemaveche.ro/taxonomy/term/44/0/feed'),
        ('Tema saptamanii', 'http://www.dilemaveche.ro/taxonomy/term/19/0/feed'),
        ('La zi in cultura - Dilema va recomanda', 'http://www.dilemaveche.ro/taxonomy/term/58/0/feed'),
        ('La zi in cultura - Carte', 'http://www.dilemaveche.ro/taxonomy/term/14/0/feed'),
        ('La zi in cultura - Film', 'http://www.dilemaveche.ro/taxonomy/term/13/0/feed'),
        ('La zi in cultura - Muzica', 'http://www.dilemaveche.ro/taxonomy/term/1341/0/feed'),
        ('La zi in cultura - Arte performative', 'http://www.dilemaveche.ro/taxonomy/term/1342/0/feed'),
        ('La zi in cultura - Arte vizuale', 'http://www.dilemaveche.ro/taxonomy/term/1512/0/feed'),
        ('Societate - Ieri cu vedere spre azi', 'http://www.dilemaveche.ro/taxonomy/term/15/0/feed'),
        ('Societate - Din polul opus', 'http://www.dilemaveche.ro/taxonomy/term/41/0/feed'),
        ('Societate - Mass comedia', 'http://www.dilemaveche.ro/taxonomy/term/43/0/feed'),
        ('Societate - La singular si la plural', 'http://www.dilemaveche.ro/taxonomy/term/42/0/feed'),
        ('Oameni si idei - Educatie', 'http://www.dilemaveche.ro/taxonomy/term/46/0/feed'),
        ('Oameni si idei - Polemici si dezbateri', 'http://www.dilemaveche.ro/taxonomy/term/48/0/feed'),
        # NOTE(review): same URL (term/46) as 'Educatie' above; confirm the
        # correct term id for this section.
        ('Oameni si idei - Stiinta si tehnologie', 'http://www.dilemaveche.ro/taxonomy/term/46/0/feed'),
        ('Dileme on-line', 'http://www.dilemaveche.ro/taxonomy/term/005/0/feed'),
    ]

    remove_tags_before = dict(name='div', attrs={'class': 'spacer_10'})
    remove_tags = [
        dict(name='div', attrs={'class': 'art_related_left'}),
        dict(name='div', attrs={'class': 'controale'}),
        dict(name='div', attrs={'class': 'simple_overlay'}),
    ]
    remove_tags_after = [dict(id='facebookLike')]
    remove_javascript = True
    no_stylesheets = True
    remove_empty_feeds = True
    extra_css = """
        body{font-family: Georgia,Times,serif }
        img{margin-bottom: 0.4em; display:block}
                            """

    def _can_open(self, url):
        """Return True if ``url`` is non-empty and the browser can open it."""
        if not url:
            return False
        try:
            self.browser.open(url)
        except Exception:
            # Any fetch failure simply means "try the next candidate";
            # unlike a bare ``except:`` this lets KeyboardInterrupt through.
            return False
        return True

    def get_cover_url(self):
        """Return the URL to use as the cover of the current issue.

        Candidates are tried in order: the link target of the home page's
        ``box_dr_pdf_picture`` box, then the image inside that box, and
        finally the site logo (``masthead_url``) if neither can be opened.
        """
        soup = self.index_to_soup('http://dilemaveche.ro')
        box = soup.find('div', attrs={'class': 'box_dr_pdf_picture'})

        # First choice: the link wrapped around the issue preview.
        linked_url = box.a.get('href') if box and box.a else None
        if self._can_open(linked_url):
            return linked_url
        self.log("\nPDF indisponibil")

        # Second choice: the small preview image itself.
        image_url = box.img.get('src') if box and box.img else None
        if self._can_open(image_url):
            return image_url
        self.log('Mama lor de nenorociti! nu este nici pdf nici imagine')

        # Last resort: the site logo, which is the same URL as the masthead.
        return self.masthead_url
Observatorul Cultural:
Code:
import re
from calibre.web.feeds.news import BasicNewsRecipe
class ObservatorulCultural(BasicNewsRecipe):
    """Calibre recipe for the weekly magazine "Observatorul cultural".

    The issue to download is the first link on the archive page that
    points to an issue ("Numarul") page. The sections and articles are
    scraped from that issue's table of contents, and the cover is taken
    from the editorial's page.
    """

    title = u'Observatorul cultural'
    __author__ = 'song2'  # adapted from a script by http://www.thenowhereman.com
    encoding = 'utf-8'
    language = 'ro'
    publication_type = 'magazine'
    description = 'Spiritul critic in acţiune\n'
    no_stylesheets = True
    remove_javascript = True
    masthead_url = 'http://www.observatorcultural.ro/userfiles/article/sigla%20Observator%20cultural_02231058.JPG'
    keep_only_tags = [
        dict(name='div', attrs={'class': 'detaliuArticol'}),
    ]
    remove_tags = [
        dict(name='div', attrs={'class': 'comentariiArticol'}),
        dict(name='div', attrs={'class': 'postComment'}),
        dict(name='div', attrs={'class': 'utileArticol'}),
        dict(name='p', attrs={'class': 'butonComenteaza'}),
        dict(name='h5'),
        dict(name='div', attrs={'style': 'margin-top: 0px; padding-top: 0px;'}),
    ]

    # URL of the editorial article, recorded by parse_index() and read by
    # get_cover_url(). Kept on the recipe instead of in a module-level
    # global so a missing editorial yields "no cover" rather than NameError.
    _cover_page_url = None

    def parse_index(self):
        """Build the list of ``(section title, [article dict, ...])`` pairs.

        Each article dict has the keys 'title', 'url', 'date' and 'author'.
        As a side effect, the URL of the last article found in a section
        whose name contains 'Editorial' is stored for get_cover_url().

        Raises:
            ValueError: if the archive page has no link to an issue.
        """
        soup = self.index_to_soup('http://www.observatorcultural.ro/Arhiva*-archive.html')
        issue_link = soup.find('a', href=re.compile(r'observatorcultural\.ro/Numarul'))
        if issue_link is None:
            raise ValueError('No issue link found on the Observatorul cultural archive page')
        issue_url = issue_link['href']
        self.log(issue_url)

        issue_soup = self.index_to_soup(issue_url)
        feeds = []
        for section in issue_soup.findAll('dl', attrs={'class': 'continutArhive'}):
            section_title = self.tag_to_string(section.find('dt'))
            stories = []
            for entry in section.findAll('dd'):
                links = entry.findAll('a')
                if not links:
                    # Nothing to download for an entry without any link.
                    continue
                if len(links) == 1:
                    # The article has no author link.
                    author, article = '', links[0]
                else:
                    # The first link is the author, the second the article.
                    author, article = self.tag_to_string(links[0]), links[1]
                stories.append({
                    'title': self.tag_to_string(article),
                    'url': article['href'],
                    'date': '',
                    'author': author,
                })
                if 'Editorial' in section_title:
                    # Remember the link to the editorial for the cover lookup.
                    self._cover_page_url = article['href']
            feeds.append((section_title, stories))
        return feeds

    def get_cover_url(self):
        """Return the cover image URL taken from the editorial page.

        The image inside the page's ``rel="lightbox"`` link is used, with
        the '_details_' marker removed from its address. Returns None when
        no editorial was recorded by parse_index() or the page has no such
        image.
        """
        if not self._cover_page_url:
            return None
        soup = self.index_to_soup(self._cover_page_url)
        # Look for the image that accompanies the article text.
        lightbox = soup.find('a', attrs={'rel': 'lightbox'})
        if lightbox is None or lightbox.img is None:
            return None
        image_src = lightbox.img.get('src')
        if not image_src:
            return None
        return image_src.replace('_details_', '')
song2 is offline   Reply With Quote