View Single Post
Old 12-23-2008, 08:11 AM   #78
moggie
Enthusiast
moggie is on a distinguished road
 
moggie's Avatar
 
Posts: 34
Karma: 74
Join Date: Nov 2008
Location: Cambridge,UK
Device: sony 505
The sugarquill

This recipe can be used to make lrf from pages at the sugarquill. The sugarquill is a website dedicated to relatively high quality Harry Potter fan fiction ( http://www.sugarquill.net/index.php?action=faq ).

This recipe takes an author id as the username. It searches for all stories by that author and builds a book in which each story appears as an entry in the index. If a story has multiple chapters, it pulls down each chapter.
Code:
feeds2lrf TheSugarQuill.py --username 310
..
Output written to /Users/jb23/tmp/quill/code/The Stories of Arabella and Zsenya at The sugar quill [Tue, 23 Dec 2008].lrf
It does its best with the fan art section, but I think that still needs some work; I would be interested in trying to reuse some comic2lrf code here.

This is my first bit of Python programming, but it seems to work quite well.
Code:
mpb19815i:code jb23$ cat !$
#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2008, James Beal <james_@catbus.co.uk>'
'''
www.sugarquill.net
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from BeautifulSoup import BeautifulSoup 

class TheSugarQuill(BasicNewsRecipe):
    """Recipe that collects every story by one Sugar Quill author.

    The author id is read from ``self.username``.  The author's profile
    page is scraped for story links; each story becomes one entry in the
    index, and multi-chapter stories are stitched into a single document
    by :meth:`preprocess_html`.
    """

    description = 'Harry Potter Stories'
    # The dot is escaped so that only links containing a literal
    # "read.php" are matched.
    match_regexps = [r'read\.php']
    no_stylesheets = True

    # Inserted before every chapter and once after the last one.
    _PAGE_BREAK = '<span style="page-break-after: always"></span>'

    # Site furniture removed from each chapter page, applied in order.
    # Each rule is (tag name, attribute filter, levels): the first matching
    # tag is located and the ancestor ``levels`` steps above it is removed
    # (0 removes the tag itself).  Order matters because every lookup runs
    # on the tree left behind by the previous removals.
    _CHROME_RULES = (
        ('td', {'class': 'top_pane'}, 2),
        ('td', {'class': 'bottom_pane'}, 2),
        ('td', {'class': 'info_pane'}, 1),
        ('td', {'class': 'info2_pane'}, 1),
        ('form', {'name': 'SQ3'}, 2),
        ('td', {'class': 'highlightcolor1'}, 1),
        ('form', {'action': 'read.php'}, 1),
        ('div', {'style': 'font-family: Verdana; font-size: 10px;'}, 0),
    )

    def parse_index(self):
        """Build the index from the author's profile page.

        Side effects: sets ``self.__author__``, ``self.title`` and
        ``self.html2lrf_options`` from the author name found on the page.

        Returns a list of ``(title, [article])`` tuples, one per story,
        where ``article`` is a dict with the keys ``title``, ``date``,
        ``url`` and ``description``.
        """
        profile_url = (
            "http://www.sugarquill.net/index.php?action=profile&id=%s"
            % (self.username)
        )
        soup = self.index_to_soup(profile_url)

        # Author name: the element following the second "highlightcolor2"
        # cell of each 80%-wide table (the last such table wins).
        for table in soup.findAll('table', attrs={'width': '80%'}):
            label_cell = table.find('td', attrs={'class': 'highlightcolor2'})
            if label_cell is None:
                continue
            name_cell = label_cell.findNext(
                'td', attrs={'class': 'highlightcolor2'})
            if name_cell is None:
                continue
            author = name_cell.next
            self.__author__ = author
            self.title = (
                "The Stories of %s at The sugar quill" % (self.__author__)
            )
            self.html2lrf_options = [
                '--author', author,
                '--publisher', 'The Sugar Quill',
            ]

        # Stories: one 90%-wide table per story; its first link points at
        # the story and its "highlightcolor2" cell starts the description.
        stories = []
        for table in soup.findAll('table', attrs={'width': '90%'}):
            link = table.find('a')
            if link is None or link.get('href') is None:
                continue
            story_url = "http://www.sugarquill.net/%s" % (link['href'])
            title = self.tag_to_string(link)
            summary_cell = table.find('td', attrs={'class': 'highlightcolor2'})
            description = summary_cell.next if summary_cell is not None else ''
            stories.append((title, [{
                'title': title,
                'date': '',
                'url': story_url,
                'description': description,
            }]))
        return stories

    @staticmethod
    def _extract_ancestor(node, levels):
        """Detach the ancestor ``levels`` steps above ``node`` from its tree.

        Does nothing when ``node`` is ``None`` or the chain of parents is
        shorter than ``levels``.
        """
        target = node
        for _ in range(levels):
            if target is None:
                return
            target = target.parent
        if target is not None:
            target.extract()

    def process_chapter(self, page):
        """Strip the ``<head>`` and site navigation from a chapter page.

        ``page`` is modified in place and also returned.
        """
        head = page.head
        if head is not None:
            head.extract()
        for tag_name, attrs, levels in self._CHROME_RULES:
            self._extract_ancestor(page.find(tag_name, attrs=attrs), levels)
        return page

    def preprocess_html(self, soup):
        """Return the cleaned story, merging all chapters into one document.

        A page without the ``read.php`` chapter-selection form is cleaned
        and returned as is.  Otherwise the story id is taken from the
        form's first ``<input>``, the number of chapters from its
        ``<option>`` elements, and every chapter is downloaded, cleaned and
        concatenated with page breaks in between.

        If the form carries no story id or lists no chapters, the current
        page is cleaned and returned instead of an empty document.
        """
        from contextlib import closing
        from urllib2 import urlopen

        form = soup.find("form", attrs={'action': 'read.php'})
        if form is None:
            return self.process_chapter(soup)

        id_field = form.find('input')
        storyid = id_field.get('value') if id_field is not None else None
        num_chapters = len(form('option'))
        if storyid is None or num_chapters == 0:
            return self.process_chapter(soup)

        story_url = "http://www.sugarquill.net/read.php?storyid=%s" % (storyid)
        parts = []
        for chapter in range(1, num_chapters + 1):
            url = "%s&chapno=%d" % (story_url, chapter)
            # closing() makes sure the HTTP response is released even if
            # reading fails part-way through.
            with closing(urlopen(url)) as response:
                raw = response.read()
            parts.append(self._PAGE_BREAK)
            parts.append(str(self.process_chapter(BeautifulSoup(raw))))
        parts.append(self._PAGE_BREAK)
        return BeautifulSoup(''.join(parts))
Attached Files
File Type: zip TheSugarQuill.zip (1.4 KB, 623 views)
moggie is offline