Quote:
Originally Posted by Starson17
You need to use multipage code. Here's an example from the adventuregamers.recipe builtin:
Code:
def append_page(self, soup, appendtag, position):
    """Recursively pull the text of each following page into ``appendtag``.

    Looks in ``soup`` for the next-page pager
    (``<div class="toolbar_fat_next">``). If one is present, the linked
    page is fetched, its ``<div class="bodytext">`` is stripped of inline
    styles, the pages after it are appended to that div first (recursion),
    and the div is then inserted into ``appendtag`` at ``position``.

    :param soup: parsed page to search for the next-page link.
    :param appendtag: tag that receives the following page's body text.
    :param position: index in ``appendtag`` at which the text is inserted.
    :return: ``None``; ``appendtag`` is modified in place.
    """
    pager = soup.find('div', attrs={'class': 'toolbar_fat_next'})
    if not pager:
        # Last page reached: nothing more to append.
        return

    # The pager link is relative, so it is prefixed with the site root.
    nexturl = self.INDEX + pager.a['href']
    soup2 = self.index_to_soup(nexturl)
    texttag = soup2.find('div', attrs={'class': 'bodytext'})
    if texttag is None:
        # The fetched page has no body-text container; stop here instead of
        # failing on ``None.findAll`` and losing the pages already merged.
        return

    for it in texttag.findAll(style=True):
        del it['style']

    # Later pages go at the end of this page's text, so recurse before
    # detaching the div from its own document.
    newpos = len(texttag.contents)
    self.append_page(soup2, texttag, newpos)
    texttag.extract()
    appendtag.insert(position, texttag)
def preprocess_html(self, soup):
    """Prepare a downloaded article and merge its following pages into it.

    Inserts language/charset ``<meta>`` markup at the top of ``<head>``,
    removes every inline ``style`` attribute, calls ``append_page`` to add
    the following pages to ``<body>`` at index 3, and finally removes the
    first ``<div class="toolbar_fat">`` if one is present.

    :param soup: parsed HTML of the article's first page.
    :return: the same ``soup`` object, modified in place.
    """
    meta_markup = (
        '<meta http-equiv="Content-Language" content="en-US"/>\n'
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
    )
    soup.head.insert(0, meta_markup)

    styled_tags = soup.findAll(style=True)
    for styled in styled_tags:
        del styled['style']

    body = soup.body
    self.append_page(soup, body, 3)

    toolbar = soup.find('div', attrs={'class': 'toolbar_fat'})
    if toolbar:
        toolbar.extract()
    return soup
append_page recursively looks for the next page tag ('div',attrs={'class':'toolbar_fat_next'}), gets the text and inserts it into the soup at the point where the tag was found until all pages have been inserted.
preprocess_html uses append_page to modify the html. You'll need to look for the next page tag on your site and adjust accordingly. This should get you started.
Do your testing with -vv and --test
as in:
ebook-convert pcper.recipe pcper --test -vv> pcper.txt
|
Hey Starson17,
I have 2 sites that I'm trying to get the multi-page code working on, pcper.com and tweaktown.com. Both of these sites have similar layouts, though tweaktown.com's source code seems a bit better to learn with, so I've been working with that one.
I'm kinda stuck: when I add the append_page code, the test html only contains the feed description and date. Without it I get the 1st page, so I'm screwing it up somewhere.
here's what I have for tweaktown.com:
Code:
class AdvancedUserRecipe1273795663(BasicNewsRecipe):
    """Calibre news recipe for the TweakTown articles/reviews/guides feed.

    Multi-page articles are merged into one document: ``preprocess_html``
    calls ``append_page``, which follows each page's ``<a class="next">``
    link and inserts the following page's ``<div id="article">`` into the
    first page.
    """

    title = u'TweakTown Latest Tech'
    description = 'TweakTown Latest Tech'
    __author__ = 'KidTwisted'
    publisher = 'TweakTown'
    category = 'PC Articles, Reviews and Guides'
    use_embedded_content = False
    max_articles_per_feed = 1
    oldest_article = 7
    timefmt = ' [%Y %b %d ]'
    no_stylesheets = True
    language = 'en'
    #recursion = 10
    remove_javascript = True
    conversion_options = { 'linearize_tables' : True}
    # reverse_article_order = True
    # INDEX is left unset because the next-page links on this site are
    # already complete URLs.
    #INDEX = u'http://www.tweaktown.com'

    html2lrf_options = [
        '--comment', description
        , '--category', category
        , '--publisher', publisher
    ]

    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    # Only the article container is kept from each downloaded page.
    keep_only_tags = [dict(name='div', attrs={'id':['article']})]

    feeds = [ (u'Articles Reviews', u'http://feeds.feedburner.com/TweaktownArticlesReviewsAndGuidesRss20?format=xml') ]

    def get_article_url(self, article):
        """Return the feed entry's ``guid`` as the article URL (or ``None``)."""
        return article.get('guid', None)

    def append_page(self, soup, appendtag, position):
        """Recursively pull each following page's article into ``appendtag``.

        :param soup: parsed page to search for the ``<a class="next">`` link.
        :param appendtag: tag that receives the following page's content.
        :param position: index in ``appendtag`` at which it is inserted.
        :return: ``None``; ``appendtag`` is modified in place.
        """
        pager = soup.find('a', attrs={'class': 'next'})
        if not pager:
            # Last page reached: nothing more to append.
            return

        # ``pager`` is itself the <a> element, so the URL is its own href.
        # (``pager.a`` would look for an <a> nested inside it and give None,
        # which made every multi-page article fail to download.)
        nexturl = pager.get('href')
        if not nexturl:
            return

        soup2 = self.index_to_soup(nexturl)
        texttag = soup2.find('div', attrs={'id': 'article'})
        if texttag is None:
            # No article container on the fetched page; keep what we have.
            return

        for it in texttag.findAll(style=True):
            del it['style']

        # Later pages go at the end of this page's content, so recurse
        # before detaching the div from its own document.
        newpos = len(texttag.contents)
        self.append_page(soup2, texttag, newpos)
        texttag.extract()
        appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        """Clean the first page and merge all following pages into it.

        :param soup: parsed HTML of the article's first page.
        :return: the same ``soup`` object, modified in place.
        """
        mtag = '<meta http-equiv="Content-Language" content="en-US"/>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
        soup.head.insert(0, mtag)
        for item in soup.findAll(style=True):
            del item['style']
        self.append_page(soup, soup.body, 3)
        # Each merged page can carry its own "next" link, so remove all of
        # them rather than only the first one found.
        for pager in soup.findAll('a', attrs={'class': 'next'}):
            pager.extract()
        return soup
Could you or someone in the know take a look at it to see what I'm doing wrong? I commented out "INDEX" because the link for the next page is a complete link. Any help on this would be great.