Thanks for your quick reply.
Your link was helpful; my complete lack of knowledge in regard to Python is the only thing holding me back now.
If there are any bored Python experts out there I'd be greatly appreciative, but I won't be holding my breath. (And sorry, I don't know how to open one of those funky code boxes like everyone else does.)
Original recipe for The Age:
Code:
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>'
__docformat__ = 'restructuredtext en'

'''
theage.com.au
'''

import re

from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class TheAge(BasicNewsRecipe):

    title = 'The Age'
    description = 'Business News, World News and Breaking News in Melbourne, Australia'
    publication_type = 'newspaper'
    __author__ = 'Matthew Briggs'
    language = 'en_AU'

    max_articles_per_feed = 1000
    recursions = 0

    remove_tags = [
        dict(name=['table', 'script', 'noscript', 'style']),
        dict(name='a', attrs={'href': '/'}),
        dict(name='a', attrs={'href': '/text/'}),
    ]

    def get_browser(self):
        # The unbound base-class call needs self passed explicitly
        br = BasicNewsRecipe.get_browser(self)
        br.set_handle_refresh(False)
        return br

    def parse_index(self):
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())

        section = None
        sections = {}
        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                section = self.tag_to_string(tag)
                sections[section] = []
            # Make sure to skip: <a href="/">TheAge</a>
            elif section and tag.has_key('href') and len(tag['href'].strip()) > 1:
                url = tag['href'].strip()
                if url.startswith('/'):
                    url = 'http://www.theage.com.au' + url
                title = self.tag_to_string(tag)
                sections[section].append({
                    'title': title,
                    'url': url,
                    'date': strftime('%a, %d %b'),
                    'description': '',
                    'content': '',
                })

        feeds = []

        # Insert feeds in the specified order, if available; deleting inside the
        # guard avoids a KeyError when a section is missing from the page
        feedSort = ['National', 'World', 'Opinion', 'Columns', 'Business', 'Sport', 'Entertainment']
        for i in feedSort:
            if i in sections:
                feeds.append((i, sections[i]))
                del sections[i]

        # Append what is left over...
        for i in sections:
            feeds.append((i, sections[i]))

        return feeds

    def get_cover_url(self):
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read())
        for i in soup.findAll('a'):
            href = i['href']
            if href and re.match('http://www.theage.com.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage.pdf', href):
                return href
        return None

    def preprocess_html(self, soup):
        for p in soup.findAll('p'):
            # Collapse the paragraph by joining the non-tag contents
            contents = [i for i in p.contents if isinstance(i, unicode)]
            if len(contents):
                contents = ''.join(contents)
                # Filter out what's left of the text-mode navigation stuff
                if re.match('((\s)|(&nbsp;))*\[[\|\s*]*\]((\s)|(&nbsp;))*$', contents):
                    p.extract()
                    continue
                # Shrink the fine print font
                if contents == 'This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.':
                    p['style'] = 'font-size:small'
                    continue
        return soup
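
(For what it's worth, if I'm reading the calibre docs right you can test a recipe from the command line without a full download by saving it to a file and running ebook-convert myrecipe.recipe .epub --test -vv, where the --test flag only fetches a couple of articles per feed.)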
And the fix from the MobileRead forum (it skips articles already downloaded on a previous run):
Code:
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.constants import config_dir, CONFIG_DIR_MODE
import os, os.path, urllib
from hashlib import md5

class OnlyLatestRecipe(BasicNewsRecipe):
    title = u'Unknown News Source'
    oldest_article = 10000
    max_articles_per_feed = 10000
    feeds = []

    def parse_feeds(self):
        # Per-recipe directory that stores the hashes of already-seen articles
        recipe_dir = os.path.join(config_dir, 'recipes')
        hash_dir = os.path.join(recipe_dir, 'recipe_storage')
        feed_dir = os.path.join(hash_dir, self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        feeds = BasicNewsRecipe.parse_feeds(self)

        for feed in feeds:
            feed_hash = urllib.quote(feed.title.encode('utf-8'), safe='')
            feed_fn = os.path.join(feed_dir, feed_hash)

            # Load the hashes recorded on the previous run
            past_items = set()
            if os.path.exists(feed_fn):
                with open(feed_fn) as f:
                    for h in f:
                        past_items.add(h.strip())

            # Hash each article and drop the ones seen before
            cur_items = set()
            for article in feed.articles[:]:
                item_hash = md5()
                if article.content:
                    item_hash.update(article.content.encode('utf-8'))
                if article.summary:
                    item_hash.update(article.summary.encode('utf-8'))
                item_hash = item_hash.hexdigest()
                if article.url:
                    item_hash = article.url + ':' + item_hash
                cur_items.add(item_hash)
                if item_hash in past_items:
                    feed.articles.remove(article)

            # Record this run's hashes for next time
            with open(feed_fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

        remove = [f for f in feeds if len(f) == 0 and self.remove_empty_feeds]
        for f in remove:
            feeds.remove(f)

        return feeds
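From what I can tell, these two don't plug straight into each other: the Age recipe builds its article list in parse_index (it scrapes the text-only index page), and I gather calibre only calls parse_feeds for recipes that use RSS feeds, so the fix's parse_feeds would never run here. My rough, untested stab at moving the same hash check into parse_index is below; it assumes both classes are pasted into the same recipe file, the class and variable names are my own invention, and since parse_index articles carry no content or summary it hashes the URL instead:

Code:
import os, urllib
from hashlib import md5

from calibre.constants import config_dir, CONFIG_DIR_MODE

# Assumes the TheAge class from the first recipe is defined above
# in the same file.

class TheAgeOnlyLatest(TheAge):

    def parse_index(self):
        # Build the section list as before, then drop articles whose
        # hashes were recorded on an earlier run (same idea as the
        # parse_feeds fix, applied to parse_index output)
        feeds = TheAge.parse_index(self)

        feed_dir = os.path.join(config_dir, 'recipes', 'recipe_storage',
                self.title.encode('utf-8').replace('/', ':'))
        if not os.path.isdir(feed_dir):
            os.makedirs(feed_dir, mode=CONFIG_DIR_MODE)

        filtered = []
        for section, articles in feeds:
            fn = os.path.join(feed_dir,
                    urllib.quote(section.encode('utf-8'), safe=''))

            past_items = set()
            if os.path.exists(fn):
                with open(fn) as f:
                    past_items = set(h.strip() for h in f)

            # parse_index articles have empty description/content,
            # so the URL is the only thing worth hashing
            cur_items = set()
            kept = []
            for article in articles:
                item_hash = md5(article['url']).hexdigest()
                cur_items.add(item_hash)
                if item_hash not in past_items:
                    kept.append(article)

            with open(fn, 'w') as f:
                for h in cur_items:
                    f.write(h + '\n')

            if kept:
                filtered.append((section, kept))

        return filtered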
Anyone's help would be appreciated.