This recipe can be used to make an LRF from pages at The Sugar Quill, a website dedicated to relatively high-quality Harry Potter fan fiction (http://www.sugarquill.net/index.php?action=faq).
The recipe takes an author id as the username, searches for all stories by that author, and builds a book with each story appearing in the index. If a story has multiple chapters, it pulls down each chapter.
Code:
feeds2lrf TheSugarQuill.py --username 310
..
Output written to /Users/jb23/tmp/quill/code/The Stories of Arabella and Zsenya at The sugar quill [Tue, 23 Dec 2008].lrf
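For context, parse_index() in a calibre recipe returns a list of (section title, list of article dicts). The recipe below builds roughly this structure, one section per story (the values here are only illustrative):
Code:
# Sketch only: the shape of the data parse_index() returns to calibre.
# The storyid and text values are made up for illustration.
index = [
    ('A Story Title', [{
        'title':       'A Story Title',
        'date':        '',
        'url':         'http://www.sugarquill.net/read.php?storyid=123',
        'description': 'Short blurb taken from the profile page',
    }]),
    # ... one entry per story found on the author's profile page
]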
It does its best with the fan art section, but I think that still needs some work; I would be interested in trying to reuse some comic2lrf code here.
This is my first bit of Python programming, but it seems to work quite well.
Code:
#!/usr/bin/env python
__license__   = 'GPL v3'
__copyright__ = '2008, James Beal <james_@catbus.co.uk>'
'''
www.sugarquill.net
'''
import re

from calibre.web.feeds.news import BasicNewsRecipe
from BeautifulSoup import BeautifulSoup


class TheSugarQuill(BasicNewsRecipe):

    description    = 'Harry Potter Stories'
    # Only follow links to the chapter pages (read.php).
    match_regexps  = [r'read.php']
    no_stylesheets = True
    def parse_index(self):
        feeds = []
        url = 'http://www.sugarquill.net/index.php?action=profile&id=%s' % self.username
        soup = self.index_to_soup(url)
        # The author's name sits in the second highlighted cell of the profile
        # table; use it for the book title and the LRF metadata.
        for table in soup.findAll('table', attrs={'width': '80%'}):
            td  = table.find('td', attrs={'class': 'highlightcolor2'})
            td1 = td.findNext('td', attrs={'class': 'highlightcolor2'})
            self.__author__ = td1.next
            self.title = 'The Stories of %s at The sugar quill' % self.__author__
            self.html2lrf_options = [
                    '--author', td1.next,
                    '--publisher', 'The Sugar Quill'
                ]
        # Each 90%-wide table on the profile page is one story: a link to the
        # story plus a short blurb. Each story becomes its own section in the index.
        for table in soup.findAll('table', attrs={'width': '90%'}):
            a = table.find('a')
            if a and a.has_key('href'):
                url = 'http://www.sugarquill.net/%s' % a['href']
                title = self.tag_to_string(a)
                td = table.find('td', attrs={'class': 'highlightcolor2'})
                description = td.next
                date = ''
                feeds.append((title, [{
                    'title': title, 'date': date,
                    'url': url,
                    'description': description,
                }]))
        return feeds
    def process_chapter(self, page):
        # Strip everything that is not story text: the <head>, the site's
        # navigation panes, the chapter-selection form and the sign-off.
        head = page.head
        if head is not None:
            head.extract()
        # These cells sit two levels inside a wrapper table, so remove the grandparent.
        for klass in ('top_pane', 'bottom_pane'):
            td = page.find('td', attrs={'class': klass})
            if td is not None and td.parent is not None and td.parent.parent is not None:
                td.parent.parent.extract()
        # These only need their parent removed.
        for klass in ('info_pane', 'info2_pane'):
            td = page.find('td', attrs={'class': klass})
            if td is not None and td.parent is not None:
                td.parent.extract()
        form = page.find('form', attrs={'name': 'SQ3'})
        if form is not None and form.parent is not None and form.parent.parent is not None:
            form.parent.parent.extract()
        td = page.find('td', attrs={'class': 'highlightcolor1'})
        if td is not None and td.parent is not None:
            td.parent.extract()
        # The chapter-selection form and the Sugar Quill sign-off are not part of the story.
        form = page.find('form', attrs={'action': 'read.php'})
        if form is not None and form.parent is not None:
            form.parent.extract()
        sugar_signoff = page.find('div', {'style': 'font-family: Verdana; font-size: 10px;'})
        if sugar_signoff is not None:
            sugar_signoff.extract()
        return page
    def preprocess_html(self, soup):
        from urllib2 import urlopen
        form = soup.find('form', attrs={'action': 'read.php'})
        if form is None:
            # Single page (or fan art): just strip the site chrome.
            return self.process_chapter(soup)
        # Multi-chapter story: the chapter-selection form gives the story id and,
        # via its <option> elements, the number of chapters. Fetch and clean each
        # chapter, separating them with page breaks.
        storyid = form.input['value']
        story_url = 'http://www.sugarquill.net/read.php?storyid=%s' % storyid
        num_chapters = len(form('option'))
        process = ''
        for chapno in range(1, num_chapters + 1):
            url = '%s&chapno=%d' % (story_url, chapno)
            process += '<span style="page-break-after: always"></span>'
            process += str(self.process_chapter(BeautifulSoup(urlopen(url).read())))
        process += '<span style="page-break-after: always"></span>'
        return BeautifulSoup(process)
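The multi-chapter handling boils down to reading the story id and chapter count out of the read.php chapter-selection form, then fetching each chapter and joining them with page breaks. A standalone sketch of that loop, outside calibre (the storyid of 123 is made up):
Code:
# Rough sketch of the chapter-stitching idea from preprocess_html(); storyid is hypothetical.
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup

storyid = 123
story_url = 'http://www.sugarquill.net/read.php?storyid=%d' % storyid
form = BeautifulSoup(urlopen(story_url).read()).find('form', attrs={'action': 'read.php'})
num_chapters = len(form('option'))        # one <option> per chapter in the selector

html = ''
for chapno in range(1, num_chapters + 1):
    html += '<span style="page-break-after: always"></span>'
    html += urlopen('%s&chapno=%d' % (story_url, chapno)).read()
# The real recipe also runs each chapter through process_chapter() to strip navigation.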