View Single Post
Old 06-19-2014, 08:38 AM   #2
entodoays
Zealot
entodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enoughentodoays will become famous soon enough
 
entodoays's Avatar
 
Posts: 144
Karma: 706
Join Date: Oct 2011
Device: Sony Reader PRS-T1
A start

Since I'm new to recipe creation I started off with an exampleon the calibre documentation and started modifying it.

My first objective is to be able to get a single page to parse correctly. Then when I manage I'll try to add further steps.

My recipe looks as follows:
Code:
import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class NYTimes(BasicNewsRecipe):

    title       = 'Liturgie des Heures'
    __author__  = 'Chris Vella'
    description = 'La liturgie des heures'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags_before = dict(name='h1')
    remove_tags_after = [dict(id='print_only')]
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                dict(id=['menuHorizontal', 'colonneDroite', 'niveau', 'don', 'font-resize', 'print_link']),
                dict(name=['script', 'noscript', 'style'])]
    encoding = 'utf8'
    no_stylesheets = True
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    def parse_index(self):
		soup = self.index_to_soup('www.aelf.org/office-messe\?desktop=1&date_my=%d/%b/%Y')

    def feed_title(div):
        return ''.join(div.findAll(text=True, recursive=False)).strip()

        articles = {}
        key = None
        ans = []
        for div in soup.findAll(True,
             attrs={'class':['current']}):

             if div['class'] == 'current':
                 key = string.capwords(feed_title(div))
                 articles[key] = []
                 ans.append(key)

             elif div['class'] in ['current']:
                 a = div.find('a', href=True)
                 if not a:
                     continue
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
                 title = self.tag_to_string(a, use_alt=True).strip()
                 description = ''
                 pubdate = strftime('%a, %d %b')
                 summary = div.find(True, attrs={'class':'summary'})
                 if summary:
                     description = self.tag_to_string(summary, use_alt=False)

                 feed = key if key is not None else 'Uncategorized'
                 if not articles.has_key(feed):
                     articles[feed] = []
                 if not 'podcasts' in url:
                     articles[feed].append(
                               dict(title=title, url=url, date=pubdate,
                                    description=description,
                                    content=''))
        ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return ans

    def preprocess_html(self, soup):
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('http://www.nytimes.com'+content).read()
        return BeautifulSoup(raw.decode('utf8', 'replace'))
Unfortunately I'm getting this output:
Quote:
Resolved conversion options
calibre version: 1.40.0
{'asciiize': False,
'author_sort': None,
'authors': None,
'base_font_size': 0,
'book_producer': None,
'change_justification': 'original',
'chapter': None,
'chapter_mark': 'pagebreak',
'comments': None,
'cover': None,
'debug_pipeline': u'debug',
'dehyphenate': True,
'delete_blank_paragraphs': True,
'disable_font_rescaling': False,
'dont_download_recipe': False,
'dont_split_on_page_breaks': True,
'duplicate_links_in_toc': False,
'embed_all_fonts': False,
'embed_font_family': None,
'enable_heuristics': False,
'epub_flatten': False,
'epub_inline_toc': False,
'epub_toc_at_end': False,
'expand_css': False,
'extra_css': None,
'extract_to': None,
'filter_css': None,
'fix_indents': True,
'flow_size': 260,
'font_size_mapping': None,
'format_scene_breaks': True,
'html_unwrap_factor': 0.4,
'input_encoding': None,
'input_profile': <calibre.customize.profiles.InputProfile object at 0x1ad6cd0>,
'insert_blank_line': False,
'insert_blank_line_size': 0.5,
'insert_metadata': False,
'isbn': None,
'italicize_common_cases': True,
'keep_ligatures': False,
'language': None,
'level1_toc': None,
'level2_toc': None,
'level3_toc': None,
'line_height': 0,
'linearize_tables': False,
'lrf': False,
'margin_bottom': 5.0,
'margin_left': 5.0,
'margin_right': 5.0,
'margin_top': 5.0,
'markup_chapter_headings': True,
'max_toc_links': 50,
'minimum_line_height': 120.0,
'no_chapters_in_toc': False,
'no_default_epub_cover': False,
'no_inline_navbars': False,
'no_svg_cover': False,
'output_profile': <calibre.customize.profiles.OutputProfile object at 0x1acf0d0>,
'page_breaks_before': None,
'prefer_metadata_cover': False,
'preserve_cover_aspect_ratio': False,
'pretty_print': True,
'pubdate': None,
'publisher': None,
'rating': None,
'read_metadata_from_opf': None,
'remove_fake_margins': True,
'remove_first_image': False,
'remove_paragraph_spacing': False,
'remove_paragraph_spacing_indent_size': 1.5,
'renumber_headings': True,
'replace_scene_breaks': '',
'search_replace': None,
'series': None,
'series_index': None,
'smarten_punctuation': False,
'sr1_replace': '',
'sr1_search': '',
'sr2_replace': '',
'sr2_search': '',
'sr3_replace': '',
'sr3_search': '',
'start_reading_at': None,
'subset_embedded_fonts': False,
'tags': None,
'test': (2, 2),
'timestamp': None,
'title': None,
'title_sort': None,
'toc_filter': None,
'toc_threshold': 6,
'toc_title': None,
'unsmarten_punctuation': False,
'unwrap_lines': True,
'use_auto_toc': False,
'verbose': 2}
1% Converting input to HTML...
InputFormatPlugin: Recipe Input running
Using custom recipe
1% Fetching feeds...
Traceback (most recent call last):
File "site.py", line 58, in main
File "site-packages/calibre/ebooks/conversion/cli.py", line 359, in main
File "site-packages/calibre/ebooks/conversion/plumber.py", line 1040, in run
File "site-packages/calibre/customize/conversion.py", line 241, in __call__
File "site-packages/calibre/ebooks/conversion/plugins/recipe_input.py", line 117, in convert
File "site-packages/calibre/web/feeds/news.py", line 992, in download
File "site-packages/calibre/web/feeds/news.py", line 1159, in build_index
File "site-packages/calibre/web/feeds/__init__.py", line 353, in feeds_from_index
TypeError: 'NoneType' object is not iterable
Can anybody help please? I'm not familiar with python (yet).

Thanks.
entodoays is offline   Reply With Quote