Recovering Gadget Addict
Posts: 5,381
Karma: 676161
Join Date: May 2004
Location: Pittsburgh, PA
Device: iPad
|
Well, I took the existing WSJ recipe and basically converted its feeds into a parse_index. The result (an error parsing the recipe when I test it from the command line) wasn't quite what I hoped for.
Question 1: Can I test a recipe from cmd line when using a login? Do I have to add it to the recipe directory and/or import it with the Calibre GUI?
Question 2: Python doesn't seem to count blank lines in the source code when reporting errors... what's a nice open source editor that will show matching line numbers (or is there something else going on besides blank lines?)
Here's the new recipe...
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
# http://online.wsj.com/page/us_in_todays_paper.html
class WallStreetJournal(BasicNewsRecipe):
    """Recipe that builds a WSJ issue from the "today's paper" index page.

    Instead of RSS feeds, the article list is scraped from ``INDEX`` by
    ``parse_index``.  If a username and password are supplied, the browser
    logs in before downloading and ``cleanup`` requests the logout page.
    """

    title = 'The Wall Street Journal'
    __author__ = 'Kovid Goyal and Sujata Raman'
    description = 'News and current affairs.'
    # Page scraped by parse_index() for the list of article links.
    INDEX = 'http://online.wsj.com/page/us_in_todays_paper.html'
    needs_subscription = True
    language = 'en'

    max_articles_per_feed = 200
    timefmt = ' [%a, %b %d, %Y]'
    no_stylesheets = True

    # NOTE: the continuation lines of this literal are kept at column 0 so
    # the CSS text itself is unchanged.
    extra_css = '''h1{color:#093D72 ; font-size:large ; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; }
h2{color:#474537; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.subhead{color:gray; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.insettipUnit {color:#666666; font-family:Arial,Sans-serif;font-size:xx-small }
.targetCaption{ font-size:x-small; color:#333333; font-family:Arial,Helvetica,sans-serif}
.article{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
.tagline {color:#333333; font-size:xx-small}
.dateStamp {color:#666666; font-family:Arial,Helvetica,sans-serif}
h3{color:blue ;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
.byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
.paperLocation{color:#666666; font-size:xx-small}'''

    remove_tags_before = dict(name='h1')
    remove_tags = [
        dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow"]),
        {'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
        dict(rel='shortcut icon'),
    ]
    remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]

    def get_browser(self):
        """Return a browser, logged in when credentials are available.

        The login form is only submitted when both ``self.username`` and
        ``self.password`` are set; otherwise the plain browser is returned.
        """
        # The base-class method is called unbound, so the instance must be
        # passed explicitly.
        br = BasicNewsRecipe.get_browser(self)
        if self.username is not None and self.password is not None:
            br.open('http://commerce.wsj.com/auth/login')
            br.select_form(nr=0)
            br['user'] = self.username
            br['password'] = self.password
            br.submit()
        return br

    def postprocess_html(self, soup, first):
        """Turn table markup into divs and drop article thumbnail blocks.

        :param soup: parsed article page; modified in place.
        :param first: unused here.
        :return: the same ``soup`` object.
        """
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'

        thumbnail_ids = ['articleThumbnail_%d' % n for n in range(1, 8)]
        for tag in soup.findAll('div', dict(id=thumbnail_ids)):
            tag.extract()

        return soup

    def get_article_url(self, article):
        """Return the article link with any query string removed.

        ``article.feedburner_origlink`` is preferred; ``article.link`` is
        used when that attribute is missing.
        """
        try:
            link = article.feedburner_origlink
        except AttributeError:
            link = article.link
        return link.split('?')[0]

    def cleanup(self):
        """Request the site's logout page using the recipe's browser."""
        self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')

    def parse_index(self):
        """Scrape ``INDEX`` and return the articles as a single section.

        Every attribute-less ``<div>`` on the index page is inspected; the
        first link inside it becomes one article entry.  Links titled
        'INTERACTIVE MAP' or 'SIDEBAR' are skipped.

        :return: a one-element list holding ``(section_title, articles)``,
            where each article is a dict with the keys ``title``, ``date``,
            ``url``, ``description`` and ``content``.
        """
        articles = []
        soup = self.index_to_soup(self.INDEX)

        for item in soup.findAll(lambda tag: tag.name == 'div' and not tag.attrs):
            a = item.find('a')
            if not a:
                continue
            url = a.get('href')
            if not url:
                continue

            # Site-relative links need the host prepended.  The prefix must
            # not carry leading whitespace or the URL cannot be fetched.
            if not url.startswith(('http://', 'https://')):
                url = 'http://online.wsj.com' + url

            title = self.tag_to_string(a)
            if title in ('INTERACTIVE MAP', 'SIDEBAR'):
                continue
            # Undo the HTML escaping of ampersands in the link text.
            title = title.replace('&amp;', '&')

            articles.append({
                'title': title,
                'date': '',
                'url': url,
                'description': '',
                'content': '',
            })

        return [('Todays US WSJ Articles', articles)]
and here's the slightly redacted error message when I try to test it...
Code:
>ebook-convert WSJ3.recipe <theTargetDirectory> --test -vv
Resolved conversion options
{'asciiize': False,
'author_sort': None,
'authors': None,
'base_font_size': 0,
'book_producer': None,
'chapter': None,
'chapter_mark': 'pagebreak',
'comments': None,
'cover': None,
'debug_pipeline': None,
'disable_font_rescaling': False,
'dont_justify': False,
'extra_css': None,
'font_size_mapping': None,
'footer_regex': '(?i)(?<=<hr>)((\\s*<a name=\\d+></a>((<img.+?>)*<br>\\s*)?\\d+
<br>\\s*.*?\\s*)|(\\s*<a name=\\d+></a>((<img.+?>)*<br>\\s*)?.*?<br>\\s*\\d+))(?
=<br>)',
'header_regex': '(?i)(?<=<hr>)((\\s*<a name=\\d+></a>((<img.+?>)*<br>\\s*)?\\d+
<br>\\s*.*?\\s*)|(\\s*<a name=\\d+></a>((<img.+?>)*<br>\\s*)?.*?<br>\\s*\\d+))(?
=<br>)',
'input_encoding': None,
'input_profile': <calibre.customize.profiles.InputProfile object at 0x03E8F110>
,
'insert_blank_line': False,
'insert_metadata': False,
'isbn': None,
'language': None,
'level1_toc': None,
'level2_toc': None,
'level3_toc': None,
'line_height': 0,
'linearize_tables': False,
'lrf': False,
'margin_bottom': 5.0,
'margin_left': 5.0,
'margin_right': 5.0,
'margin_top': 5.0,
'max_toc_links': 50,
'no_chapters_in_toc': False,
'no_inline_navbars': False,
'output_profile': <calibre.customize.profiles.OutputProfile object at 0x03E8F25
0>,
'page_breaks_before': None,
'password': None,
'prefer_metadata_cover': False,
'preprocess_html': False,
'pretty_print': True,
'publisher': None,
'rating': None,
'read_metadata_from_opf': None,
'remove_first_image': False,
'remove_footer': False,
'remove_header': False,
'remove_paragraph_spacing': False,
'remove_paragraph_spacing_indent_size': 1.5,
'series': None,
'series_index': None,
'tags': None,
'test': True,
'title': None,
'title_sort': None,
'toc_filter': None,
'toc_threshold': 6,
'use_auto_toc': False,
'username': None,
'verbose': 2}
1% Converting input to HTML...
InputFormatPlugin: Recipe Input running
Failed to compile downloaded recipe. Falling back to builtin one
Traceback (most recent call last):
File "site-packages\calibre\web\feeds\input.py", line 58, in convert
File "site-packages\calibre\web\feeds\recipes\__init__.py", line 31, in compil
e_recipe
TypeError: 'NoneType' object is unsubscriptable
Python function terminated unexpectedly
'NoneType' object is unsubscriptable (Error Code: 1)
Traceback (most recent call last):
File "site.py", line 103, in main
File "site.py", line 85, in run_entry_point
File "site-packages\calibre\ebooks\conversion\cli.py", line 249, in main
File "site-packages\calibre\ebooks\conversion\plumber.py", line 736, in run
File "site-packages\calibre\customize\conversion.py", line 208, in __call__
File "site-packages\calibre\web\feeds\input.py", line 71, in convert
File "site-packages\calibre\web\feeds\recipes\__init__.py", line 31, in compil
e_recipe
TypeError: 'NoneType' object is unsubscriptable
Without significant time & effort, I'm not sure I'll be able to get this working, but I've invested a bit of time already. Would sure appreciate it if anyone is willing to point me in the right direction.
Thanks!
|