Hi everybody (hi Kovid!),
I read a bit about web2lrf and came up with a profile for downloading the news from the Italian newspaper "La Repubblica".
It is not perfect, of course: until this morning I knew everything about Perl and nothing at all about Python, so any feedback is welcome.
I hope I did the right thing putting the code here.
Let me know if it works for you too...
Alessandro
Code:
import re
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
class LaRepubblica(DefaultProfile):
    """web2lrf profile for the Italian newspaper "La Repubblica".

    Declares the homepage RSS feed as the only source and a list of regex
    substitutions that delete a fixed set of ``<div>`` blocks (selected by
    ``id`` or ``class``) from HTML.
    """

    title = 'La Repubblica Feed'
    # NOTE(review): presumably the link-following depth honoured by
    # DefaultProfile -- confirm against the base class.
    max_recursions = 2

    # List of (compiled pattern, replacement callable) pairs; every
    # replacement returns the empty string, i.e. the matched text is deleted.
    # NOTE(review): presumably applied by DefaultProfile to each fetched
    # page -- confirm against the base class.
    #
    # The patterns are non-greedy, so each match ends at the FIRST "</div>"
    # after the opening tag: a targeted div that contains nested divs is only
    # removed up to the first inner closing tag.
    #
    # Each pattern is listed once; substituting the same pattern again after
    # all of its matches have been deleted would find nothing.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), lambda match: '')
        for pattern in [
            r'<div id="ge-network-top">.*?</div>',
            r'<div id="ge-network-middle">.*?</div>',
            r'<div id="ge-network-bottom">.*?</div>',
            r'<div id="cerca">.*?</div>',
            r'<div id="topmenu">.*?</div>',
            r'<div id="menu">.*?</div>',
            r'<div id="stripa">.*?</div>',
            r'<div id="stripb">.*?</div>',
            r'<div id="gee-contA">.*?</div>',
            r'<div id="addons">.*?</div>',
            r'<div id="newprefooter">.*?</div>',
            r'<div id="newfooter">.*?</div>',
            r'<div id="update">.*?</div>',
            r'<div class="wikipedia">.*?</div>',
            r'<div class="contselect">.*?</div>',
            r'<div class="generalbox gen">.*?</div>',
        ]
    ]

    def get_feeds(self):
        """Return the feeds to fetch as a list of (title, URL) tuples."""
        return [('Feed 1', 'http://www.repubblica.it/rss/homepage/rss2.0.xml')]