View Single Post
Old 06-25-2014, 04:04 PM   #10
entodoays
Zealot
entodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enough
 
entodoays's Avatar
 
Posts: 144
Karma: 706
Join Date: Oct 2011
Device: Sony Reader PRS-T1
My first "working" recipe

I managed to create a recipe which downloads something. I started from the built-in recipe for "The Atlantic" and tried modifying it accordingly.

I'm trying to get the recipe to download all the links in http://www.aelf.org/office-laudes which are found in the following div:
Code:
<div class="bloc" onMouseOver="mabulle.hide()">
    <ul>
        <li class=""> > <a href="/office-messe">Lecture de la messe</a></li>
        <li class="current"> > Liturgie des heures
            <ul>
                <li class=""> > <a href="/office-lectures">Lectures</a></li>
                <li class="current"> > <a href="/office-laudes">Laudes</a></li>
                <li class=""> > <a href="/office-tierce">Tierce</a></li>
                <li class=""> > <a href="/office-sexte">Sexte</a></li>
                <li class=""> > <a href="/office-none">None</a></li>
                <li class=""> > <a href="/office-vepres">Vêpres</a></li>
                <li class=""> > <a href="/office-complies">Complies</a></li>
            </ul>
        </li>
    </ul>
</div>
For some reason it only downloads the links to /office-lectures and /office-messe. I cannot understand why.

The following is my recipe:
Code:
#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
aelf.org Liturgie des heures
'''
import re, datetime

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString

# Work out, once at import time, the date that is sent to aelf.org through the
# ``date_my`` query parameter.
now = datetime.datetime.now()
# Day-of-week index with Sunday = 0, Monday = 1, ..., Saturday = 6.
idx = now.isoweekday() % 7
# Move forward to the next Sunday (a full week ahead when today is a Sunday).
Base_date = now + datetime.timedelta(days=7 - idx)
next_date = Base_date
# day/month/year without zero padding, e.g. "29/6/2014".
site_date = "/".join(
    str(part) for part in (next_date.day, next_date.month, next_date.year)
)

class AELF(BasicNewsRecipe):
    """Calibre recipe that downloads the Liturgy of the Hours from aelf.org.

    The index page for Lauds is requested for the date held in the
    module-level ``site_date``; the links found on that page become the
    articles of the generated book.
    """

    title      = 'AELF'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'Liturgie des Heures'
    # Page whose links are harvested by parse_index().
    INDEX = "http://www.aelf.org/office-laudes?desktop=1&date_my=%s" % (site_date)
    language = 'fr'

    # Disabled experiment, kept for reference:
    # keep_only_tags = [{'attrs':{'class':['bloc']}}]
    remove_tags    = [dict(attrs={'class':['clr', 'goTop', 'print_only', 'change_country', 'abonnement', 'current', 'bloc', 'degre', 'bas']})]
    no_stylesheets = True

    def parse_index(self):
        """Build the feed list from the index page.

        Side effects: sets ``self.timefmt`` when the page contains an
        ``<li>``, and ``self.cover_url`` when the site logo is found.

        Returns:
            A list of ``(section_title, articles)`` tuples. Each article is
            a dict with the keys ``title``, ``url``, ``description`` and
            ``date``.
        """
        soup = self.index_to_soup(self.INDEX)

        # The label shown next to the title is taken from the <h2> inside
        # the first <li>. Skip it when the page has no <li> at all instead
        # of failing with AttributeError.
        ts = soup.find('li')
        if ts is not None:
            ds = self.tag_to_string(ts.find('h2')).split(':')[-1]
            self.timefmt = ' [%s]'%ds

        cover = soup.find('img', src=True, attrs={'alt':'logo de l\'association épiscopale liturgique pour les pays francophones'})

        if cover is not None:
            self.cover_url = 'http://www.aelf.org' + cover['src']
            self.log(self.cover_url)

        feeds = []
        seen_titles = set()

        # First pass: every <li> inside <div id="contenu">.
        for section in soup.findAll('div', attrs={'id':'contenu'}):
            section_title = self.tag_to_string(section.find('li', attrs={'class':''}))
            self.log('Found section:', section_title)
            articles = []
            for post in section.findAll('li'):
                a = post.find('a', href=True)
                if a is None:
                    # An <li> without a link cannot become an article.
                    continue
                title = self.tag_to_string(a)
                # Duplicate titles are intentionally NOT skipped in this
                # pass (the check was disabled by the author); they are only
                # recorded so that the second pass can skip them.
                seen_titles.add(title)
                url = a['href']
                if url.startswith('/'):
                    # Site-relative link: make it absolute and request the
                    # desktop page for the chosen date.
                    url = 'http://www.aelf.org'+url+'?desktop=1&date_my=%s' % (site_date)
                p = post.parent.find('p', attrs={'class':'current'})
                desc = None
                self.log('\tFound article:', title, 'at', url)
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title':title, 'url':url, 'description':desc,
                    'date':''})
            if articles:
                feeds.append((section_title, articles))

        # Second pass: <li class=""> items of the first <div class="bloc">.
        rightContent=soup.find('div', attrs={'class':['bloc']})
        if rightContent is None:
            modules = []
        else:
            modules = rightContent.findAll('li', attrs={'class':['']})

        if modules:
            # The original code called ``INDEX.find('h1')``, which raises
            # NameError inside a method (and ``self.INDEX`` is a URL string,
            # not a parsed page). The page heading is read from the parsed
            # index instead.
            # NOTE(review): assumed intent -- confirm that the page <h1> is
            # the wanted section title.
            heading = soup.find('h1')
            if heading is not None:
                section_title = self.tag_to_string(heading)
            else:
                section_title = self.title

        for module in modules:
            articles = []
            for post in module.findAll('div'):
                a = post.find('a', href=True)
                if a is None:
                    # A <div> without a link cannot become an article.
                    continue
                title = self.tag_to_string(a)
                if title in seen_titles:
                    continue
                seen_titles.add(title)
                url = a['href']
                if url.startswith('/'):
                    url = 'http://www.aelf.org'+url
                p = post.parent.find('p')
                desc = None
                self.log('\tFound article:', title, 'at', url)
                if p is not None:
                    desc = self.tag_to_string(p)
                    self.log('\t\t', desc)
                articles.append({'title':title, 'url':url, 'description':desc, 'date':''})
            if articles:
                feeds.append((section_title, articles))

        return feeds

    def postprocess_html(self, soup, first):
        """Replace right-aligned image tables with a centred ``<div>``.

        For every ``<table align="right">`` that contains an ``<img>``, the
        table is swapped for a ``<div>`` holding the image, a ``<br>`` and,
        when the table has any text, that text as a caption.

        Args:
            soup: Parsed HTML of the downloaded page; modified in place.
            first: Not used by this method.

        Returns:
            The same ``soup`` object.
        """
        for table in soup.findAll('table', align='right'):
            img = table.find('img')
            if img is not None:
                # Detach the image first so the caption is text only.
                img.extract()
                caption = self.tag_to_string(table).strip()
                div = Tag(soup, 'div')
                div['style'] = 'text-align:center'
                div.insert(0, img)
                div.insert(1, Tag(soup, 'br'))
                if caption:
                    div.insert(2, NavigableString(caption))
                table.replaceWith(div)

        return soup
entodoays is offline   Reply With Quote