![]() |
#1 |
Junior Member
![]() Posts: 1
Karma: 10
Join Date: May 2011
Device: Amazon Kindle
|
[fixed recipe] Wprost - polish newsmagazine
Below is fixed Wprost recipe - original recipe did not remove <table> tags - it causes problems on Kindle reader.
Code:
#!/usr/bin/env python __license__ = 'GPL v3' __copyright__ = '2010, matek09, matek09@gmail.com' __copyright__ = 'Modified 2011, Mariusz Wolek <mariusz_dot_wolek @ gmail dot com>' from calibre.web.feeds.news import BasicNewsRecipe import re class Wprost(BasicNewsRecipe): EDITION = 0 FIND_LAST_FULL_ISSUE = True EXCLUDE_LOCKED = True ICO_BLOCKED = 'http://www.wprost.pl/G/icons/ico_blocked.gif' title = u'Wprost' __author__ = 'matek09' description = 'Weekly magazine' encoding = 'ISO-8859-2' no_stylesheets = True language = 'pl' remove_javascript = True remove_tags_before = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) remove_tags_after = dict(dict(name = 'div', attrs = {'id' : 'print-layer'})) '''keep_only_tags =[] keep_only_tags.append(dict(name = 'table', attrs = {'id' : 'title-table'})) keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-header'})) keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'div-content'})) keep_only_tags.append(dict(name = 'div', attrs = {'class' : 'def element-autor'}))''' preprocess_regexps = [(re.compile(r'style="display: none;"'), lambda match: ''), (re.compile(r'display: block;'), lambda match: ''), (re.compile(r'\<td\>\<tr\>\<\/table\>'), lambda match: ''), (re.compile(r'\<table .*?\>'), lambda match: ''), (re.compile(r'\<tr>'), lambda match: ''), (re.compile(r'\<td .*?\>'), lambda match: '')] remove_tags =[] remove_tags.append(dict(name = 'div', attrs = {'class' : 'def element-date'})) remove_tags.append(dict(name = 'div', attrs = {'class' : 'def silver'})) remove_tags.append(dict(name = 'div', attrs = {'id' : 'content-main-column-right'})) extra_css = ''' .div-header {font-size: x-small; font-weight: bold} ''' #h2 {font-size: x-large; font-weight: bold} def is_blocked(self, a): if a.findNextSibling('img') is None: return False else: return True def find_last_issue(self): soup = self.index_to_soup('http://www.wprost.pl/archiwum/') a = 0 if self.FIND_LAST_FULL_ISSUE: ico_blocked = soup.findAll('img', attrs={'src' : self.ICO_BLOCKED}) a = ico_blocked[-1].findNext('a', attrs={'title' : re.compile('Zobacz spis tre.ci')}) else: a = soup.find('a', attrs={'title' : re.compile('Zobacz spis tre.ci')}) self.EDITION = a['href'].replace('/tygodnik/?I=', '') self.cover_url = a.img['src'] def parse_index(self): self.find_last_issue() soup = self.index_to_soup('http://www.wprost.pl/tygodnik/?I=' + self.EDITION) feeds = [] for main_block in soup.findAll(attrs={'class':'main-block-s3 s3-head head-red3'}): articles = list(self.find_articles(main_block)) if len(articles) > 0: section = self.tag_to_string(main_block) feeds.append((section, articles)) return feeds def find_articles(self, main_block): for a in main_block.findAllNext( attrs={'style':['','padding-top: 15px;']}): if a.name in "td": break if self.EXCLUDE_LOCKED & self.is_blocked(a): continue yield { 'title' : self.tag_to_string(a), 'url' : 'http://www.wprost.pl' + a['href'], 'date' : '', 'description' : '' } |
![]() |
![]() |
![]() |
|
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
Fixed brand eins recipe | siebert | Recipes | 18 | 07-30-2013 06:56 AM |
Weird polish n in ADE? | gardefjord | ePub | 8 | 05-20-2011 04:39 AM |
Polish fonts | kanonka | Sony Reader Dev Corner | 33 | 01-15-2011 08:28 PM |
AJC Recipe not working correctly (will be fixed soon) | TonytheBookworm | Recipes | 0 | 12-24-2010 07:41 PM |
Wprost recipe | matek09 | Recipes | 1 | 11-13-2010 10:58 AM |