View Single Post
Old 03-24-2011, 06:53 AM   #2
ironcat
Junior Member
ironcat began at the beginning.
 
ironcat's Avatar
 
Posts: 5
Karma: 10
Join Date: Mar 2011
Location: Budapest, Hungary
Device: Kindle 3 Wi-Fi
Duplicated text

The new variant removes the duplicated text from the articles:
Spoiler:

Code:
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class hu168ora(BasicNewsRecipe):
    """Calibre news recipe for 168 óra (www.168ora.hu).

    Articles are gathered from the site's per-section RSS feeds listed in
    ``feeds`` and are fetched through the ``print=1`` variant of each
    article URL (see ``print_version``).
    """

    # --- Publication metadata ---
    title = u'168 óra'
    __author__ = u'István Papp'
    description = u'A 168 óra friss hírei'
    publisher = u'Telegráf Kiadó'
    category = u'news, hírek, 168'
    language = 'hu'
    timefmt = ' [%Y. %b. %d., %a.]'

    # --- Download limits ---
    oldest_article = 7
    max_articles_per_feed = 100
    remove_empty_feeds = True

    # --- Content handling ---
    encoding = 'utf8'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    extra_css = 'body{ font-family: Verdana,Helvetica,Arial,sans-serif }'

    # Strip HTML comments (including multi-line ones, hence DOTALL) from the
    # downloaded markup.
    preprocess_regexps = [
        (re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''),
    ]

    # Only the elements with these ids are kept from each page.
    keep_only_tags = [
        dict(id='cikk_fejlec'),
        dict(id='cikk_torzs'),
    ]
    # Disabled alternative to keep_only_tags, left for reference:
    # remove_tags_before = dict(id='cikk_fejlec')
    # remove_tags_after = dict(id='szoveg')

    # Elements dropped from what was kept above.
    # NOTE(review): id='text' is presumably the element that duplicated the
    # article text - confirm against a live article page.
    remove_tags = [
        dict(id='box_toolbar'),
        dict(id='text'),
    ]

    # (section title, RSS feed URL) pairs.
    feeds = [
        (u'Itthon', u'http://www.168ora.hu/static/rss/cikkek_itthon.xml'),
        (u'Glóbusz', u'http://www.168ora.hu/static/rss/cikkek_globusz.xml'),
        (u'Punch', u'http://www.168ora.hu/static/rss/cikkek_punch.xml'),
        (u'Arte', u'http://www.168ora.hu/static/rss/cikkek_arte.xml'),
        (u'Buxa', u'http://www.168ora.hu/static/rss/cikkek_buxa.xml'),
        (u'Sebesség', u'http://www.168ora.hu/static/rss/cikkek_sebesseg.xml'),
        (u'Tudás', u'http://www.168ora.hu/static/rss/cikkek_tudas.xml'),
        (u'Sport', u'http://www.168ora.hu/static/rss/cikkek_sport.xml'),
        (u'Vélemény', u'http://www.168ora.hu/static/rss/cikkek_velemeny.xml'),
        (u'Dolce Vita', u'http://www.168ora.hu/static/rss/cikkek_dolcevita.xml'),
        # Disabled feed, left for reference:
        # (u'Rádió', u'http://www.168ora.hu/static/rss/radio.xml'),
    ]

    def print_version(self, url):
        """Return ``url`` with the ``print=1`` query parameter added.

        A URL without a query string or fragment simply gets ``?print=1``
        appended. When ``url`` already carries a query string the parameter
        is joined with ``&`` instead of a second ``?``, and it is always
        placed before any ``#fragment`` so that it stays part of the query.

        :param url: article URL taken from a feed entry.
        :return: the URL of the printable variant of the article.
        """
        # Split off the fragment first: anything after '#' is not part of
        # the query, so the parameter must go in front of it.
        address, hash_mark, fragment = url.partition('#')
        joiner = '&' if '?' in address else '?'
        return address + joiner + 'print=1' + hash_mark + fragment
ironcat is offline   Reply With Quote