#1
Junior Member
Posts: 7
Karma: 10
Join Date: Sep 2016
Device: kindle p3
Different limits for each RSS feed in the same recipe?
Is it possible to set different limits (e.g. max_articles_per_feed, oldest_article, simultaneous_downloads, ...) for each RSS feed within a single recipe? If it is possible, how can it be done?
Thank you
#2
creator of calibre
Posts: 45,251
Karma: 27110894
Join Date: Oct 2006
Location: Mumbai, India
Device: Various
No, but you can always override the build_index() method from the base recipe class to do whatever arbitrary processing you want.
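
For example, since build_index() falls back to parse_feeds() when the recipe defines a feeds list, one lightweight variant of that idea is to override parse_feeds() and trim each parsed Feed object. Below is a minimal sketch, assuming a hypothetical per_feed_limits dict that maps each feed title to its own article cap and maximum age in days; feed.title, feed.articles and article.utctime are attribute names from calibre's feeds module. Note that simultaneous_downloads configures the downloader for the whole job, so it cannot vary per feed.

```python
import datetime

from calibre.web.feeds.news import BasicNewsRecipe


class PerFeedLimits(BasicNewsRecipe):
    title = 'Per-Feed Limits Demo'
    # Global defaults, used for any feed without an override below.
    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [
        (u'Feed A', u'http://example.com/rss/a'),
        (u'Feed B', u'http://example.com/rss/b'),
    ]

    # Hypothetical helper dict: feed title -> (max articles, max age in days).
    per_feed_limits = {
        u'Feed A': (10, 1),
        u'Feed B': (50, 7),
    }

    def parse_feeds(self):
        # build_index() calls parse_feeds() when a feeds list is set, so
        # trimming the parsed Feed objects here gives per-feed limits.
        feeds = BasicNewsRecipe.parse_feeds(self)
        now = datetime.datetime.utcnow()
        for feed in feeds:
            if feed.title not in self.per_feed_limits:
                continue
            max_articles, max_age_days = self.per_feed_limits[feed.title]
            kept = []
            for article in feed.articles:
                # article.utctime is the parsed publication date; it may be
                # timezone-aware, so drop tzinfo for a simple comparison.
                published = getattr(article, 'utctime', None)
                if published is not None:
                    age = now - published.replace(tzinfo=None)
                    if age.days > max_age_days:
                        continue
                kept.append(article)
            feed.articles = kept[:max_articles]
        return feeds
```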
#3
Junior Member
Posts: 7
Karma: 10
Join Date: Sep 2016
Device: kindle p3
Kovid, I can't see how to combine two recipes with the build_index() method. Can you write a script template for this? Please help!

First Recipe
-------------

```python
from calibre.web.feeds.news import BasicNewsRecipe


class Cumhuriyet_tr(BasicNewsRecipe):
    title = 'Cumhuriyet - Yazarlar'
    __author__ = 'Cumhuriyet Gazetesi Yazarları'
    description = 'Günlük Cumhuriyet Gazetesi Köşe Yazıları'
    publisher = 'Cumhuriyet'
    category = 'news, politics, Turkey'
    oldest_article = 1
    max_articles_per_feed = 150
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    masthead_url = 'http://www.cumhuriyet.com.tr/image/template/Cumhuriyet_logo_300x60px.png'
    cover_url = 'http://www.cumhuriyet.com.tr/image/template/Cumhuriyet_logo_300x60px.png'
    language = 'tr'
    extra_css = """
        .name {display: block;width:100%;font-size:120%;}
        #article-title {display: block;margin-top: 15px;width:100%;font-size:140%;}
        #publish-date {display: block;width:100%;font-size:80%;}
    """
    # extra_css = """ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
    #                 .article_description,body{font-family: Arial,Verdana,Helvetica,sans1,sans-serif}
    #             """
    conversion_options = {
        'comment': description, 'tags': category,
        'publisher': publisher, 'language': language
    }
    remove_tags = [dict(attrs={'class': 'links'}),
                   dict(attrs={'id': 'share-bar'}),
                   dict(attrs={'id': 'font-adjust'})]
    remove_tags_before = dict(attrs={'id': 'content'})
    remove_tags_after = dict(attrs={'id': 'content'})

    feeds = [
        (u'Yazarlar', u'http://www.cumhuriyet.com.tr/rss/2')
    ]

    # def print_version(self, url):
    #     articleid = url.rpartition('hn=')[2]
    #     return 'http://www.cumhuriyet.com.tr/?hn=' + articleid

    def get_masthead_title(self):
        return self.title + "(" + self.end_date + ")"

    def preprocess_html(self, soup):
        return self.adeify_images(soup)
```

Second Recipe
----------------

```python
from calibre.web.feeds.news import BasicNewsRecipe


class Hurriyet(BasicNewsRecipe):
    __author__ = 'Adrian Tennessee (adrian.tennessee at domainthatnobodytakes.com)'
    __license__ = 'GPLv3'
    __copyright__ = '2015, Adrian Tennessee <adrian.tennessee at domainthatnobodytakes.com)'

    title = u'Hürriyet'
    language = 'tr'
    description = u'Hürriyet web site ebook'
    publisher = 'Doğan Media Group'
    category = 'news'
    cover_url = 'https://upload.wikimedia.org/wikipedia/en/4/4f/H%C3%BCrriyet_ilk_sayi.jpg'

    oldest_article = 7
    max_articles_per_feed = 50
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    encoding = None
    compress_news_images = True

    # some mild formatting
    extra_css = """.news-image { clear: left; }
                   .news-detail-title { clear:left; }
                   .col-md-7 { font-size:small; }
                   .news-detail-spot { font-style:italic; font-size:smaller }"""

    keep_only_tags = [
        # title
        {'class': 'news-detail-title selectionShareable'},
        # date
        {'class': 'col-md-7 text-right'},
        # image
        {'class': 'news-image'},
        # detail
        {'class': 'news-detail-spot'},
        # text
        {'class': 'news-box'},
    ]

    feeds = [
        (u'Anasayfa', u'http://www.hurriyet.com.tr/rss/anasayfa'),
        (u'Gündem', u'http://www.hurriyet.com.tr/rss/gundem'),
        (u'Ekonomi', u'http://www.hurriyet.com.tr/rss/ekonomi'),
        (u'Magazin', u'http://www.hurriyet.com.tr/rss/magazin'),
        (u'Spor', u'http://www.hurriyet.com.tr/rss/spor'),
        (u'Planet', u'http://www.hurriyet.com.tr/rss/dunya'),
        (u'Teknoloji', u'http://www.hurriyet.com.tr/rss/teknoloji'),
        (u'Sağlık', u'http://www.hurriyet.com.tr/rss/saglik'),
        (u'Astroloji', u'http://www.hurriyet.com.tr/rss/astroloji'),
        (u'Ankara', u'http://www.hurriyet.com.tr/rss/ankara'),
        (u'Ege', u'http://www.hurriyet.com.tr/rss/ege')
    ]
```

Third Recipe
--------------

```python
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.hurriyetdailynews.com
'''
import os
import string
import inspect
import datetime
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class HurriyetDailyNews_en(BasicNewsRecipe):
    title = u'Hurriyet Daily News'
    __author__ = u'spswerling'
    description = 'a Turkey based daily in english'
    description = 'English version of Turkish Daily "Hurriyet"'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.hurriyetdailynews.com/images/design/logo-hurriyet-daily-news.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # on kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    oldest_article = 1
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False

    oldest_article = 1.5
    max_articles_per_section = 25
    max_articles_per_subsection = 7

    sections = [
        u'turkey',
        u'economy',
        u'world',
        u'sports',
        # u'life',
        u'opinion',
        # u'arts/culture'
    ]

    # util for creating remove_tags and keep_tags style regex matchers
    def tag_matcher(elt, attr, str):
        return dict(name=elt, attrs={attr: re.compile(str, re.IGNORECASE)})

    keep_only_tags = [tag_matcher('div', 'class', 'NewsDetail')]
    remove_tags = [
        tag_matcher('div', 'class', 'Carousel'),
        tag_matcher('div', 'class', 'ShareIt'),
        tag_matcher('div', 'class', 'tmz'),
        tag_matcher('span', 'id', 'comment'),
        tag_matcher('h2', 'class', 'NewSpot'),
        tag_matcher('h2', 'class', 'pv-gallery'),
    ]

    articles = {}
    subsection_links = {}
    urls_done = []
    links_per_section = {}

    def parse_index(self):
        section_links = self.section_links_from_home_page()
        for section_link in section_links:
            self.articles[self.section_name(section_link)] = []
            subsection_links = self.find_subsection_links(section_link)
            for subsection_link in subsection_links:
                sub_name = self.subsection_name(subsection_link)
                self.subsection_links[sub_name] = []
                self.parse_subsection(section_link, subsection_link)

        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

    def section_links_from_home_page(self):
        def include_link(link):
            return self.text(link).lower() in self.sections
        url = 'http://www.hurriyetdailynews.com/'
        try:
            self._p('hitting home page ' + url)
            soup = self.index_to_soup(url)
        except:
            self._p('Unable to spider home page')
            return []

        self._p('Got home page. hunt down section links.')
        regex = re.compile('rmRootLink', re.IGNORECASE)
        links = soup.findAll('a', {'class': regex})
        filtered_links = filter(include_link, links)
        self._p('  all sections: ' + ', '.join(map(self.text, links)))
        self._p('  filtered sections: ' +
                ', '.join(map(self.text, filtered_links)))
        return filtered_links

    def find_subsection_links(self, section_link):
        self._p('find subsection links for section ' + str(section_link))
        url = self.abs_url(section_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except:
            self._p('Unable to spider subsection')
            return []

        self._p('Got ' + url)
        div = soup.find('div', {'class': 'SeffafLink'})
        if not div:
            self._p('could not find any subsections')
            return [section_link]
        links = div.findAll('a')
        self._p('  subsection links: ' + ', '.join(map(self.text, links)))
        return links

    def parse_subsection(self, section_link, subsection_link):
        section = self.section_name(section_link)
        if len(self.articles[section]) > self.max_articles_per_section:
            return

        # tmp dbg
        # if not self.subsection_name(subsection_link) == 'arts':
        #     return

        self._p('hit section ' + section + ', subsect ' +
                self.subsection_name(subsection_link))
        url = self.abs_url(subsection_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except:
            self._p('Unable to spider section')
            return []

        self._p('Process links ')
        for link in soup.findAll('a'):
            if 'NewsDetail' in str(link.get('id')):
                self.process_link(section_link, subsection_link, link)

    def process_link(self, section_link, subsection_link, link):
        section = self.section_name(section_link)
        subsection = self.subsection_name(subsection_link)
        title = link['title'] or self.text(link)
        href = link.get('href')
        if not href:
            self._p("BAD HREF: " + str(link))
            return
        self.queue_article_link(section, subsection, href, title)

    def queue_article_link(self, section, subsection, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            # self._p('Skip (already Qd): ' + ' - '.join([section, subsection, title, url]))
            return
        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        if len(self.subsection_links[subsection]) >= \
                self.max_articles_per_subsection:
            return
        self._p('Q: ' + ' - '.join([section, subsection, title, url]))
        full_title = string.capwords(subsection + ' - ' + title)
        self.subsection_links[subsection].append(url)
        self.articles[section].append(
            dict(title=full_title, url=full_url, date='',
                 description='', author='', content=''))

    def text(self, n):
        return self.tag_to_string(n).strip()

    def abs_url(self, url):
        if 'www.hurriyetdailynews.com' in url:
            abs_url = url
        elif url[0] == '/':
            abs_url = 'http://www.hurriyetdailynews.com' + url
        else:
            abs_url = 'http://www.hurriyetdailynews.com/' + url
        if '#' in abs_url:
            abs_url = ''.join(abs_url.split('#')[0:-1])
        return abs_url

    def section_name(self, link):
        return self.text(link).lower()

    def subsection_name(self, link):
        from_fn = str(os.path.splitext(link['href'])[0]).split('/')[-1]
        return from_fn

    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore, see
            # https://www.mobileread.com/forums/sho....php?p=2931136
            return None
        else:
            return super(self.__class__, self).preprocess_raw_html(raw_html, url)

    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False
        age = (datetime.datetime.now() - date).days
        if (age > self.oldest_article):
            return "too old"
        return False

    def date_from_string(self, datestring):
        try:
            # eg: September/17/2014
            dt = datetime.datetime.strptime(datestring, "%B/%d/%Y")
        except:
            try:
                # eg: September 17/2014
                dt = datetime.datetime.strptime(datestring, "%B %d/%Y")
            except:
                dt = None
        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)
        return dt

    def scrape_article_date(self, soup):
        dnode = soup.find('p', {'class': 'dateagency'}) or \
            soup.find('p', {'class': 'Tarih'})
        if dnode:
            dstring = self.text(dnode)
            return self.date_from_string(dstring)
        else:
            return None

    def _dbg_soup_node(self, node):
        s = '  cls: ' + str(node.get('class')).strip() + \
            '  id: ' + str(node.get('id')).strip() + \
            '  txt: ' + self.text(node)
        return s

    def _p(self, msg):
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:120])
```
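
A possible starting template, sketched under some assumptions: it merges the feeds lists of the first two recipes above into one class (abbreviated here) and reuses the parse_feeds() trimming idea from post #2. The per_feed_caps dict is a hypothetical helper, and the site-specific cleanup is only stubbed out, because keep_only_tags and remove_tags apply to every article in a recipe, so rules that differ per site would have to be dispatched on the article URL (for example in preprocess_raw_html).

```python
from calibre.web.feeds.news import BasicNewsRecipe


class CumhuriyetHurriyet(BasicNewsRecipe):
    title = u'Cumhuriyet + Hürriyet'
    language = 'tr'
    no_stylesheets = True
    use_embedded_content = False
    # Global fallbacks; the per-feed caps below take precedence.
    oldest_article = 7
    max_articles_per_feed = 150

    # Feeds from both recipes merged into a single list (abbreviated).
    feeds = [
        (u'Yazarlar', u'http://www.cumhuriyet.com.tr/rss/2'),
        (u'Anasayfa', u'http://www.hurriyet.com.tr/rss/anasayfa'),
        (u'Ekonomi', u'http://www.hurriyet.com.tr/rss/ekonomi'),
    ]

    # Hypothetical helper dict: feed title -> maximum number of articles.
    per_feed_caps = {
        u'Yazarlar': 150,  # Cumhuriyet columnists
        u'Anasayfa': 50,   # Hürriyet front page
        u'Ekonomi': 50,
    }

    def parse_feeds(self):
        # Apply a different article cap to each feed after parsing.
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            cap = self.per_feed_caps.get(feed.title, self.max_articles_per_feed)
            feed.articles = feed.articles[:cap]
        return feeds

    def preprocess_raw_html(self, raw_html, url):
        # keep_only_tags/remove_tags apply to every article, so
        # site-specific cleanup has to be dispatched on the URL here.
        if 'cumhuriyet.com.tr' in url:
            pass  # Cumhuriyet-specific cleanup would go here
        elif 'hurriyet.com.tr' in url:
            pass  # Hürriyet-specific cleanup would go here
        return raw_html
```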