MobileRead Forums - View Single Post

entodoays · 06-19-2014, 09:38 AM

Since I'm new to recipe creation I started off with an exampleon the calibre documentation and started modifying it.

My first objective is to be able to get a single page to parse correctly. Then when I manage I'll try to add further steps.

My recipe looks as follows:

Code:

import string, re
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class NYTimes(BasicNewsRecipe):

    title       = 'Liturgie des Heures'
    __author__  = 'Chris Vella'
    description = 'La liturgie des heures'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags_before = dict(name='h1')
    remove_tags_after = [dict(id='print_only')]
    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
                dict(id=['menuHorizontal', 'colonneDroite', 'niveau', 'don', 'font-resize', 'print_link']),
                dict(name=['script', 'noscript', 'style'])]
    encoding = 'utf8'
    no_stylesheets = True
    extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

    def parse_index(self):
		soup = self.index_to_soup('www.aelf.org/office-messe\?desktop=1&date_my=%d/%b/%Y')

    def feed_title(div):
        return ''.join(div.findAll(text=True, recursive=False)).strip()

        articles = {}
        key = None
        ans = []
        for div in soup.findAll(True,
             attrs={'class':['current']}):

             if div['class'] == 'current':
                 key = string.capwords(feed_title(div))
                 articles[key] = []
                 ans.append(key)

             elif div['class'] in ['current']:
                 a = div.find('a', href=True)
                 if not a:
                     continue
                 url = re.sub(r'\?.*', '', a['href'])
                 url += '?pagewanted=all'
                 title = self.tag_to_string(a, use_alt=True).strip()
                 description = ''
                 pubdate = strftime('%a, %d %b')
                 summary = div.find(True, attrs={'class':'summary'})
                 if summary:
                     description = self.tag_to_string(summary, use_alt=False)

                 feed = key if key is not None else 'Uncategorized'
                 if not articles.has_key(feed):
                     articles[feed] = []
                 if not 'podcasts' in url:
                     articles[feed].append(
                               dict(title=title, url=url, date=pubdate,
                                    description=description,
                                    content=''))
        ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        return ans

    def preprocess_html(self, soup):
        refresh = soup.find('meta', {'http-equiv':'refresh'})
        if refresh is None:
            return soup
        content = refresh.get('content').partition('=')[2]
        raw = self.browser.open('http://www.nytimes.com'+content).read()
        return BeautifulSoup(raw.decode('utf8', 'replace'))

Unfortunately I'm getting this output:

Quote:

Resolved conversion options
calibre version: 1.40.0
{'asciiize': False,
'author_sort': None,
'authors': None,
'base_font_size': 0,
'book_producer': None,
'change_justification': 'original',
'chapter': None,
'chapter_mark': 'pagebreak',
'comments': None,
'cover': None,
'debug_pipeline': u'debug',
'dehyphenate': True,
'delete_blank_paragraphs': True,
'disable_font_rescaling': False,
'dont_download_recipe': False,
'dont_split_on_page_breaks': True,
'duplicate_links_in_toc': False,
'embed_all_fonts': False,
'embed_font_family': None,
'enable_heuristics': False,
'epub_flatten': False,
'epub_inline_toc': False,
'epub_toc_at_end': False,
'expand_css': False,
'extra_css': None,
'extract_to': None,
'filter_css': None,
'fix_indents': True,
'flow_size': 260,
'font_size_mapping': None,
'format_scene_breaks': True,
'html_unwrap_factor': 0.4,
'input_encoding': None,
'input_profile': <calibre.customize.profiles.InputProfile object at 0x1ad6cd0>,
'insert_blank_line': False,
'insert_blank_line_size': 0.5,
'insert_metadata': False,
'isbn': None,
'italicize_common_cases': True,
'keep_ligatures': False,
'language': None,
'level1_toc': None,
'level2_toc': None,
'level3_toc': None,
'line_height': 0,
'linearize_tables': False,
'lrf': False,
'margin_bottom': 5.0,
'margin_left': 5.0,
'margin_right': 5.0,
'margin_top': 5.0,
'markup_chapter_headings': True,
'max_toc_links': 50,
'minimum_line_height': 120.0,
'no_chapters_in_toc': False,
'no_default_epub_cover': False,
'no_inline_navbars': False,
'no_svg_cover': False,
'output_profile': <calibre.customize.profiles.OutputProfile object at 0x1acf0d0>,
'page_breaks_before': None,
'prefer_metadata_cover': False,
'preserve_cover_aspect_ratio': False,
'pretty_print': True,
'pubdate': None,
'publisher': None,
'rating': None,
'read_metadata_from_opf': None,
'remove_fake_margins': True,
'remove_first_image': False,
'remove_paragraph_spacing': False,
'remove_paragraph_spacing_indent_size': 1.5,
'renumber_headings': True,
'replace_scene_breaks': '',
'search_replace': None,
'series': None,
'series_index': None,
'smarten_punctuation': False,
'sr1_replace': '',
'sr1_search': '',
'sr2_replace': '',
'sr2_search': '',
'sr3_replace': '',
'sr3_search': '',
'start_reading_at': None,
'subset_embedded_fonts': False,
'tags': None,
'test': (2, 2),
'timestamp': None,
'title': None,
'title_sort': None,
'toc_filter': None,
'toc_threshold': 6,
'toc_title': None,
'unsmarten_punctuation': False,
'unwrap_lines': True,
'use_auto_toc': False,
'verbose': 2}
1% Converting input to HTML...
InputFormatPlugin: Recipe Input running
Using custom recipe
1% Fetching feeds...
Traceback (most recent call last):
File "site.py", line 58, in main
File "site-packages/calibre/ebooks/conversion/cli.py", line 359, in main
File "site-packages/calibre/ebooks/conversion/plumber.py", line 1040, in run
File "site-packages/calibre/customize/conversion.py", line 241, in __call__
File "site-packages/calibre/ebooks/conversion/plugins/recipe_input.py", line 117, in convert
File "site-packages/calibre/web/feeds/news.py", line 992, in download
File "site-packages/calibre/web/feeds/news.py", line 1159, in build_index
File "site-packages/calibre/web/feeds/__init__.py", line 353, in feeds_from_index
TypeError: 'NoneType' object is not iterable

Can anybody help please? I'm not familiar with python (yet).

Thanks.