![]() |
#1 |
Zealot
![]() ![]() Posts: 119
Karma: 100
Join Date: Jan 2011
Location: Germany / NRW /Köln
Device: prs-650 / prs-350 /kindle 3
|
recipe for Capital.de - German |
Code:
from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1305470859(BasicNewsRecipe):
    """Fetch articles from the German business magazine Capital.de.

    Multi-page articles are stitched together by following the
    ``artikelsplit`` pager markers (see append_page).
    """
    title = u'Capital.de'
    language = 'de'
    __author__ = 'schuster'
    oldest_article = 7
    max_articles_per_feed = 35
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False
    masthead_url = 'http://www.wirtschaftsmedien-shop.de/media/stores/wirtschaftsmedien/capital/teaser_large_abo.jpg'
    cover_url = 'http://d1kb9jvg6ylufe.cloudfront.net/WebsiteCMS/de/unternehmen/linktipps/mainColumn/08/image/DE_Capital_bis20mm_SW.jpg'

    # NOTE(review): self.INDEX was never defined in the original, so every
    # multi-page article raised AttributeError inside append_page.  The site
    # base URL is inferred from the feed addresses below -- TODO confirm.
    INDEX = 'http://www.capital.de'

    extra_css = '''
        h1{font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:large;}
        h4{font-family:Arial,Helvetica,sans-serif; font-weight:normal;font-size:small;}
        img {min-width:300px; max-width:600px; min-height:300px; max-height:800px}
        p{font-family:Arial,Helvetica,sans-serif;font-size:small;}
        body{font-family:Helvetica,Arial,sans-serif;font-size:small;}
    '''

    def print_version(self, url):
        """Rewrite an RSS article URL into its printer-friendly variant."""
        return url.replace('nv=rss#utm_source=rss2&utm_medium=rss_feed&utm_campaign=/', 'mode=print')

    # Was misspelled ``remove_tags_bevor`` in the original, so calibre
    # silently ignored it.
    remove_tags_before = [dict(name='td', attrs={'class': 'textcell'})]
    remove_tags_after = [dict(name='div', attrs={'class': 'artikelsplit'})]

    feeds = [
        (u'Wirtschaftsmagazin', u'http://www.capital.de/rss/'),
        (u'Unternehmen', u'http://www.capital.de/rss/unternehmen'),
        (u'Finanz & Geldanlage', u'http://www.capital.de/rss/finanzen/geldanlage')]

    def append_page(self, soup, appendtag, position):
        """Recursively fetch the follow-on pages of a split article and
        insert their printable content into *appendtag* at *position*."""
        pager = soup.find('div', attrs={'class': 'artikelsplit'})
        if pager:
            nexturl = self.INDEX + pager.a['href']
            soup2 = self.index_to_soup(nexturl)
            texttag = soup2.find('div', attrs={'class': 'printable'})
            for it in texttag.findAll(style=True):
                del it['style']
            newpos = len(texttag.contents)
            self.append_page(soup2, texttag, newpos)
            texttag.extract()
            appendtag.insert(position, texttag)

    def preprocess_html(self, soup):
        # NOTE(review): the original defined preprocess_html twice; the first
        # definition (flattening <a> tags to plain text) was silently
        # shadowed by the second.  Both behaviours are merged here.
        for item in soup.findAll(style=True):
            del item['style']
        # Stitch follow-on pages in *before* removing the page-split markers;
        # the original removed them first, which made append_page a no-op
        # because it locates the next page via that marker.
        self.append_page(soup, soup.body, 3)
        for item in soup.findAll('div', attrs={'class': 'artikelsplit'}):
            item.extract()
        # From the formerly shadowed definition: replace links by their text.
        for alink in soup.findAll('a'):
            if alink.string is not None:
                alink.replaceWith(alink.string)
        return self.adeify_images(soup)

    # NOTE(review): dict(span=[...]) matches tags carrying a *span attribute*;
    # the author probably meant dict(name='span', attrs={'class': [...]}).
    # Kept as written to avoid guessing at intent.
    remove_tags = [
        dict(attrs={'class': ['navSeitenAlle', 'kommentieren', 'teaserheader', 'teasercontent', 'info', 'zwischenhead', 'artikelsplit']}),
        dict(id=['topNav', 'mainNav', 'subNav', 'socialmedia', 'footerRahmen', 'gatrixx_marktinformationen', 'pager', 'weitere']),
        dict(span=['ratingtext', 'Gesamtranking', 'h3', '']),
        dict(rel=['canonical'])]

# Last edited by schuster; 05-18-2011 at 03:48 PM.
![]() |
![]() |
![]() |
#2 |
Member
![]() Posts: 17
Karma: 10
Join Date: Apr 2016
Device: Tolino Vision 3HD
|
Hi,
capital.de changed its feed address and also the structure of its articles. I modified and cleaned up the original capital_de.recipe to make it work again. Code:
#!/usr/bin/env python2 # vim:fileencoding=utf-8 from __future__ import unicode_literals, division, absolute_import, print_function ''' capital.de ''' import re from calibre.web.feeds.news import BasicNewsRecipe class AdvancedUserRecipe1305470859(BasicNewsRecipe): title = 'Capital.de' __author__ = 'schuster' description = 'RSS-Feed von Capital.de' publisher = 'Gruner+Jahr GmbH & Co KG' language = 'de' oldest_article = 14 max_articles_per_feed = 35 no_stylesheets = True remove_javascript = True use_embedded_content = False conversion_options = {'smarten_punctuation' : True, 'publisher' : publisher} cover_source = 'http://shop.capital.de/abos/capital/' masthead_url = 'http://www.capital.de/files/capital/layout/logo.png' feeds = [ ('Capital.de', 'http://www.capital.de/partner-feeds/rss.xml') ] keep_only_tags = [ dict(name='div', attrs={'class':'grid_8 alpha omega layout_full block'}) ] remove_tags = [ dict(name='div', attrs={'class':'article_header'}), dict(name='br', attrs={'class':'clear'}) ] remove_attributes = ['height', 'width'] extra_css = 'h1 {font-size: 1.6em; text-align: left} \ h2 {font-size: 1em; text-align: left} \ .copyright {font-size: 0.6em} \ .caption {font-size: 0.6em}' def get_cover_url(self): soup = self.index_to_soup(self.cover_source) img_span = soup.find('span', {'class':re.compile('coverimage')}) self.cover_url = img_span.find('img', src=True)['src'] return self.cover_url def preprocess_html(self, soup): # remove all articles without relevant content tags = soup.findAll('li', {'class':'tag-chain-item'}) for li in tags: if 'BILDERSTRECKE' in self.tag_to_string(li).upper(): self.abort_article() # remove list of tags tags = soup.find('ul', {'class':'tag-chain'}) if tags: tags.extract() # remove all style attributes for item in soup.findAll(style=True): del item['style'] # remove all local hyperlinks for a in soup.findAll('a', {'href':True}): if a['href'] and not 'http' in a['href']: del a['href'] # remove picture(s) of author(s) for div in 
soup.findAll('div', {'class':'ce_text block'}): if div.find('hr'): for hr in div.findAll('hr'): hr.extract() for img in div.findAll('img'): img.extract() return soup |
![]() |
![]() |
Advert | |
|
![]() |
|
![]() |
||||
Thread | Thread Starter | Forum | Replies | Last Post |
recipe for Bild.de - German | schuster | Recipes | 2 | 05-22-2016 05:00 AM |
recipe for Express.de - german | schuster | Recipes | 1 | 06-05-2011 09:58 AM |
recipe for Golem.de - German | schuster | Recipes | 3 | 05-15-2011 11:33 AM |
recipe for Astronomie heute - german | schuster | Recipes | 0 | 05-14-2011 12:42 PM |
Hi from the ex capital of culture | snickp | Introduce Yourself | 14 | 01-30-2009 02:27 PM |