Old 03-09-2011, 06:38 PM   #3
Quasii
Member
Posts: 12
Karma: 10
Join Date: Feb 2011
Device: Kindle3
Thanks for your quick reply.

And your link was helpful; my complete lack of knowledge of Python is the only thing holding me back now.

If there are any bored Python experts out there I'd be greatly appreciative, but I won't be holding my breath.

Original recipe for The Age -

Code:

#!/usr/bin/env python
__license__   = 'GPL v3'
__copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>'
__docformat__ = 'restructuredtext en'

'''
theage.com.au
'''

import re

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class TheAge(BasicNewsRecipe):

    title = 'The Age'
    description = 'Business News, World News and Breaking News in Melbourne, Australia'
    publication_type = 'newspaper'
    __author__ = 'Matthew Briggs'
    language = 'en_AU'

    max_articles_per_feed = 1000
    recursions = 0
    remove_tags = [dict(name=['table', 'script', 'noscript', 'style']),
                   dict(name='a', attrs={'href': '/'}),
                   dict(name='a', attrs={'href': '/text/'})]

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        br.set_handle_refresh(False)
        return br

    def parse_index(self):
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())

        section = None
        sections = {}
        # Walk the text-mode index page: each h3 starts a section, each
        # following link is an article in that section
        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                section = self.tag_to_string(tag)
                sections[section] = []
            # Make sure to skip: <a href="/">TheAge</a>
            elif section and tag.has_key('href') and len(tag['href'].strip()) > 1:
                url = tag['href'].strip()
                if url.startswith('/'):
                    url = 'http://www.theage.com.au' + url
                title = self.tag_to_string(tag)
                sections[section].append({
                    'title': title,
                    'url': url,
                    'date': strftime('%a, %d %b'),
                    'description': '',
                    'content': '',
                })

        feeds = []

        # Insert feeds in specified order, if available
        feedSort = ['National', 'World', 'Opinion', 'Columns', 'Business',
                    'Sport', 'Entertainment']
        for i in feedSort:
            if i in sections:
                feeds.append((i, sections[i]))

        # Done with the sorted feeds (guard the delete, in case a section
        # in feedSort never appeared on the page)
        for i in feedSort:
            if i in sections:
                del sections[i]

        # Append what is left over...
        for i in sections:
            feeds.append((i, sections[i]))

        return feeds

    def get_cover_url(self):
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read())
        for i in soup.findAll('a'):
            href = i['href']
            if href and re.match('http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf', href):
                return href
        return None

    def preprocess_html(self, soup):
        for p in soup.findAll('p'):
            # Collapse the paragraph by joining the non-tag contents
            contents = [i for i in p.contents if isinstance(i, unicode)]
            if len(contents):
                contents = ''.join(contents)
                # Filter out what's left of the text-mode navigation stuff
                if re.match('((\s)|(\&nbsp\;))*\[[\|\s*]*\]((\s)|(\&nbsp\;))*$', contents):
                    p.extract()
                    continue
                # Shrink the fine print font
                if contents == 'This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.':
                    p['style'] = 'font-size:small'
                    continue
        return soup
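
(For anyone following along: a recipe like this can be test-run from the command line before loading it into calibre. The filename below is just whatever the recipe file was saved as, and --test limits the fetch to a couple of articles per feed, so runs are quick.)

Code:

ebook-convert the_age.recipe output.epub --test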




Fix from the MobileRead forum -

Code:

from calibre.constants import config_dir, CONFIG_DIR_MODE
from calibre.web.feeds.news import BasicNewsRecipe
import os, os.path, urllib
from hashlib import md5

class OnlyLatestRecipe(BasicNewsRecipe):
    title = u'Unknown News Source'
    oldest_article = 10000
    max_articles_per_feed = 10000
    feeds = []

    def parse_feeds(self):
        # Per-recipe directory where hashes of already-seen articles are kept
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

            cur_items = set()
            for article in feed.articles[:]:
                # Hash the article content/summary and prefix the URL, if any
                item_hash = md5()
                if article.content:
                    item_hash.update(article.content.encode('utf-8'))
                if article.summary:
                    item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                # Drop anything already seen on a previous run
                if item_hash in past_items:
                    feed.articles.remove(article)

            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

        # Optionally drop feeds that are now empty
        remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)

        return feeds
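
In case it helps whoever picks this up: from what I can tell of the calibre source, parse_feeds() is only called when a recipe does not implement parse_index(), and The Age recipe builds its article list in parse_index(), so the fix above probably can't be pasted in unchanged. Below is a rough, untested sketch of an adaptation; drop_seen_articles is a made-up helper name, and it filters the (section, article-list) tuples that parse_index() returns, remembering article URLs between runs the same way the fix above remembers hashes.

Code:

from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib

# Hypothetical helper, not from the thread: drop articles whose URLs were
# recorded on a previous run, then record the current batch of URLs.
def drop_seen_articles(recipe_title, feeds):
    # feeds is the list of (section, article-dict-list) tuples returned by
    # parse_index(); each article dict carries a 'url' key
    feed_dir = os.path.join(config_dir, 'recipes', 'recipe_storage',
                            recipe_title.encode('utf-8').replace('/', ':'))
    if not os.path.isdir(feed_dir):
        os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)
    for section, articles in feeds:
        fn = os.path.join(feed_dir, urllib.quote(section.encode('utf-8'), safe=''))
        past = set()
        if os.path.exists(fn):
            with open(fn) as f:
                past = set(line.strip() for line in f)
        current = set(a['url'] for a in articles)
        # Keep only articles not seen on a previous run
        articles[:] = [a for a in articles if a['url'] not in past]
        with open(fn, 'w') as f:
            for url in current:
                f.write(url + '\n')
    return feeds

If that guess is right, the final "return feeds" in TheAge.parse_index() would become "return drop_seen_articles(self.title, feeds)".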



Anyone's help would be appreciated.