MobileRead Forums - View Single Post

Bob Russell · 12-24-2009, 08:54 PM

Well, I took the existing WSJ recipe and basically converted its feeds into a parse_index. The result (the command-line test fails with an error parsing the recipe) wasn't quite what I hoped for.

Question 1: Can I test a recipe from cmd line when using a login? Do I have to add it to the recipe directory and/or import it with the Calibre GUI?

Question 2: Python doesn't seem to count blank lines in the source code when reporting errors... what's a nice open source editor that will show matching line numbers (or is there something else going on besides blank lines?)

Here's the new recipe...

Code:

#!/usr/bin/env  python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'

from calibre.web.feeds.news import BasicNewsRecipe

# http://online.wsj.com/page/us_in_todays_paper.html

class WallStreetJournal(BasicNewsRecipe):
    """Recipe that builds a single feed from the WSJ "today's paper" index page.

    Instead of RSS feeds, the article list is scraped from ``INDEX`` by
    ``parse_index``. A subscription login is performed in ``get_browser``
    when both a username and a password are supplied.
    """

    title = 'The Wall Street Journal'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'News and current affairs.'
    # Page listing the articles; read by parse_index().
    INDEX = 'http://online.wsj.com/page/us_in_todays_paper.html'
    needs_subscription = True
    language = 'en'

    max_articles_per_feed = 200
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True

    extra_css      = '''h1{color:#093D72 ; font-size:large ; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; }
                        h2{color:#474537; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
                        .subhead{color:gray; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
                        .insettipUnit {color:#666666; font-family:Arial,Sans-serif;font-size:xx-small }
                        .targetCaption{ font-size:x-small; color:#333333; font-family:Arial,Helvetica,sans-serif}
                        .article{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
                        .tagline {color:#333333; font-size:xx-small}
                        .dateStamp {color:#666666; font-family:Arial,Helvetica,sans-serif}
                         h3{color:blue ;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
                         .byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
                         h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
                        .paperLocation{color:#666666; font-size:xx-small}'''

    remove_tags_before = dict(name='h1')
    remove_tags = [
        dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow"]),
        {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
        dict(rel='shortcut icon'),
    ]
    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]

    def get_browser(self):
        """Return a browser, logged in when credentials are available.

        The login form is only submitted when both ``self.username`` and
        ``self.password`` are set; otherwise the plain browser is returned.
        """
        # The base-class method is called unbound, so ``self`` must be
        # passed explicitly; omitting it raises a TypeError.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://commerce.wsj.com/auth/login')
            br.select_form(nr=0)
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def postprocess_html(self, soup, first):
        """Flatten table markup into divs and drop article thumbnails.

        Returns the same ``soup`` object after modifying it in place.
        """
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'

        # Ids articleThumbnail_1 .. articleThumbnail_7.
        thumbnail_ids = ['articleThumbnail_%d' % n for n in range(1, 8)]
        for tag in soup.findAll('div', dict(id=thumbnail_ids)):
            tag.extract()

        return soup

    def get_article_url(self, article):
        """Return the article link with any query string removed.

        ``article.feedburner_origlink`` is preferred; ``article.link`` is
        used when that attribute is missing.
        """
        try:
            link = article.feedburner_origlink
        except AttributeError:
            link = article.link
        return link.split('?')[0]

    def cleanup(self):
        """Open the site's logout URL with the recipe's browser."""
        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')

    def parse_index(self):
        """Scrape ``INDEX`` and return the article list as a single feed.

        Returns a one-element list ``[(feed_title, articles)]`` where each
        article is a dict with the keys ``title``, ``date``, ``url``,
        ``description`` and ``content``.
        """
        articles = []
        soup = self.index_to_soup(self.INDEX)

        # Only <div> tags that carry no attributes at all are examined.
        for item in soup.findAll(lambda tag: tag.name == 'div' and not tag.attrs):
            a = item.find('a')
            if a is None:
                continue
            # Tag.get() is available in both old and new BeautifulSoup
            # releases, unlike has_key().
            url = a.get('href')
            if url is None:
                continue
            # Relative links are resolved against the site root. The prefix
            # must not contain stray whitespace or the URL is invalid.
            if not url.startswith(('http://', 'https://')):
                url = 'http://online.wsj.com' + url

            title = self.tag_to_string(a)
            if title in ('INTERACTIVE MAP', 'SIDEBAR'):
                continue
            title = title.replace('&AMP;', '&')

            articles.append({
                'title': title,
                'date': '',
                'url': url,
                'description': '',
                'content': '',
            })

        return [('Todays US WSJ Articles', articles)]

and here's the slightly redacted error message when I try to test it...

Code:

>ebook-convert WSJ3.recipe <theTargetDirectory> --test -vv
Resolved conversion options
{'asciiize': False,
 'author_sort': None,
 'authors': None,
 'base_font_size': 0,
 'book_producer': None,
 'chapter': None,
 'chapter_mark': 'pagebreak',
 'comments': None,
 'cover': None,
 'debug_pipeline': None,
 'disable_font_rescaling': False,
 'dont_justify': False,
 'extra_css': None,
 'font_size_mapping': None,
 'footer_regex': '(?i)(?<=<hr>)((\\s*<a name=\\d+></a>((<img.+?>)*<br>\\s*)?\\d+
<br>\\s*.*?\\s*)|(\\s*<a name=\\d+></a>((<img.+?>)*<br>\\s*)?.*?<br>\\s*\\d+))(?
=<br>)',
 'header_regex': '(?i)(?<=<hr>)((\\s*<a name=\\d+></a>((<img.+?>)*<br>\\s*)?\\d+
<br>\\s*.*?\\s*)|(\\s*<a name=\\d+></a>((<img.+?>)*<br>\\s*)?.*?<br>\\s*\\d+))(?
=<br>)',
 'input_encoding': None,
 'input_profile': <calibre.customize.profiles.InputProfile object at 0x03E8F110>
,
 'insert_blank_line': False,
 'insert_metadata': False,
 'isbn': None,
 'language': None,
 'level1_toc': None,
 'level2_toc': None,
 'level3_toc': None,
 'line_height': 0,
 'linearize_tables': False,
 'lrf': False,
 'margin_bottom': 5.0,
 'margin_left': 5.0,
 'margin_right': 5.0,
 'margin_top': 5.0,
 'max_toc_links': 50,
 'no_chapters_in_toc': False,
 'no_inline_navbars': False,
 'output_profile': <calibre.customize.profiles.OutputProfile object at 0x03E8F25
0>,
 'page_breaks_before': None,
 'password': None,
 'prefer_metadata_cover': False,
 'preprocess_html': False,
 'pretty_print': True,
 'publisher': None,
 'rating': None,
 'read_metadata_from_opf': None,
 'remove_first_image': False,
 'remove_footer': False,
 'remove_header': False,
 'remove_paragraph_spacing': False,
 'remove_paragraph_spacing_indent_size': 1.5,
 'series': None,
 'series_index': None,
 'tags': None,
 'test': True,
 'title': None,
 'title_sort': None,
 'toc_filter': None,
 'toc_threshold': 6,
 'use_auto_toc': False,
 'username': None,
 'verbose': 2}
1% Converting input to HTML...
InputFormatPlugin: Recipe Input running
Failed to compile downloaded recipe. Falling back to builtin one
Traceback (most recent call last):
  File "site-packages\calibre\web\feeds\input.py", line 58, in convert
  File "site-packages\calibre\web\feeds\recipes\__init__.py", line 31, in compil
e_recipe
TypeError: 'NoneType' object is unsubscriptable

Python function terminated unexpectedly
  'NoneType' object is unsubscriptable (Error Code: 1)
Traceback (most recent call last):
  File "site.py", line 103, in main
  File "site.py", line 85, in run_entry_point
  File "site-packages\calibre\ebooks\conversion\cli.py", line 249, in main
  File "site-packages\calibre\ebooks\conversion\plumber.py", line 736, in run
  File "site-packages\calibre\customize\conversion.py", line 208, in __call__
  File "site-packages\calibre\web\feeds\input.py", line 71, in convert
  File "site-packages\calibre\web\feeds\recipes\__init__.py", line 31, in compil
e_recipe
TypeError: 'NoneType' object is unsubscriptable

Without significant time & effort, I'm not sure I'll be able to get this working, but I've invested a bit of time already. Would sure appreciate it if anyone is willing to point me in the right direction.

Thanks!