12-22-2016, 12:00 PM   #1
akmeltem
Different limits for each RSS feed in the same recipe?

Is it possible to set different limits (e.g. max_articles_per_feed, oldest_article, simultaneous_downloads, ...) for each RSS feed within the same recipe? If so, how can it be done?
Thank you
12-22-2016, 09:20 PM   #2
kovidgoyal
creator of calibre
No, but you can always override the build_index() method from the base recipe class to do whatever arbitrary processing you want.
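
For the specific case of varying per-feed limits, a lighter-weight variant of that idea is to override parse_feeds() (which build_index() calls) and trim each feed after it has been fetched. A minimal, untested sketch; the per_feed_max mapping and its values are invented for illustration:

from calibre.web.feeds.news import BasicNewsRecipe


class PerFeedLimits(BasicNewsRecipe):
    title = 'Per-feed limits example'
    # Global fallbacks; the per-feed caps below tighten these.
    oldest_article = 7
    max_articles_per_feed = 100

    feeds = [
        (u'Yazarlar', u'http://www.cumhuriyet.com.tr/rss/2'),
        (u'Anasayfa', u'http://www.hurriyet.com.tr/rss/anasayfa'),
    ]

    # Invented per-feed caps, keyed by the feed titles above.
    per_feed_max = {
        u'Yazarlar': 150,
        u'Anasayfa': 50,
    }

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            cap = self.per_feed_max.get(feed.title)
            if cap is not None:
                # keep only the first `cap` articles of this feed
                feed.articles = feed.articles[:cap]
        return feeds

A per-feed oldest_article could presumably be handled the same way, by filtering feed.articles on each article's utctime before slicing.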
01-10-2017, 04:15 PM   #3
akmeltem
Kovid, I can't work out how to combine two recipes using the build_index() method. Could you write a script template for this? Any help would be appreciated.
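
For reference, one possible shape for such a template: a rough, untested sketch that declares the union of the feeds of the first two recipes below in a single recipe class and applies per-feed limits in parse_feeds(); the feed selection and all limit values are only illustrative:

from datetime import datetime, timedelta

from calibre.web.feeds.news import BasicNewsRecipe


class CumhuriyetHurriyet(BasicNewsRecipe):
    title = 'Cumhuriyet + Hurriyet'
    language = 'tr'
    no_stylesheets = True
    use_embedded_content = False

    # Union of the feeds of the two recipes below (abbreviated).
    feeds = [
        (u'Yazarlar', u'http://www.cumhuriyet.com.tr/rss/2'),
        (u'Anasayfa', u'http://www.hurriyet.com.tr/rss/anasayfa'),
        # ... remaining Hurriyet feeds as in the second recipe below
    ]

    # Invented per-feed limits, keyed by the feed titles above.
    per_feed_limits = {
        u'Yazarlar': {'max_articles': 150, 'oldest_days': 1},
        u'Anasayfa': {'max_articles': 50, 'oldest_days': 7},
    }

    def parse_feeds(self):
        feeds = BasicNewsRecipe.parse_feeds(self)
        for feed in feeds:
            limits = self.per_feed_limits.get(feed.title)
            if not limits:
                continue
            cutoff = datetime.utcnow() - timedelta(days=limits['oldest_days'])
            # drop articles older than the cutoff, then cap the count
            fresh = [a for a in feed.articles
                     if getattr(a, 'utctime', None) is None or a.utctime >= cutoff]
            feed.articles = fresh[:limits['max_articles']]
        return feeds

One caveat: keep_only_tags and remove_tags apply to every site in a combined recipe, so per-site cleanup would still need URL-dependent handling, e.g. in preprocess_html.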

First Recipe
-------------
from calibre.web.feeds.news import BasicNewsRecipe


class Cumhuriyet_tr(BasicNewsRecipe):
    title = 'Cumhuriyet - Yazarlar'
    __author__ = 'Cumhuriyet Gazetesi Yazarları'
    description = 'Günlük Cumhuriyet Gazetesi Köşe Yazıları'
    publisher = 'Cumhuriyet'
    category = 'news, politics, Turkey'
    oldest_article = 1
    max_articles_per_feed = 150
    no_stylesheets = True
    encoding = 'utf-8'
    use_embedded_content = False
    masthead_url = 'http://www.cumhuriyet.com.tr/image/template/Cumhuriyet_logo_300x60px.png'
    cover_url = 'http://www.cumhuriyet.com.tr/image/template/Cumhuriyet_logo_300x60px.png'
    language = 'tr'
    extra_css = """ .name {display: block;width:100%;font-size:120%;}
    #article-title {display: block;margin-top: 15px;width:100%;font-size:140%;}
    #publish-date {display: block;width:100%;font-size:80%;}
    """
    # extra_css = """ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
    # .article_description,body{font-family: Arial,Verdana,Helvetica,sans1,sans-serif}
    # """

    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }

    remove_tags = [dict(attrs={'class': 'links'}), dict(
        attrs={'id': 'share-bar'}), dict(attrs={'id': 'font-adjust'})]
    remove_tags_before = dict(attrs={'id': 'content'})
    remove_tags_after = dict(attrs={'id': 'content'})

    feeds = [
        (u'Yazarlar', u'http://www.cumhuriyet.com.tr/rss/2')
    ]

    # def print_version(self, url):
    #     articleid = url.rpartition('hn=')[2]
    #     return 'http://www.cumhuriyet.com.tr/?hn=' + articleid

    def get_masthead_title(self):
        # NB: relies on a self.end_date attribute that this recipe never
        # defines, so calling this as-is raises AttributeError.
        return self.title + "(" + self.end_date + ")"

    def preprocess_html(self, soup):
        return self.adeify_images(soup)

Second Recipe
----------------
from calibre.web.feeds.news import BasicNewsRecipe


class Hurriyet(BasicNewsRecipe):
    __author__ = 'Adrian Tennessee (adrian.tennessee at domainthatnobodytakes.com)'
    __license__ = 'GPLv3'
    __copyright__ = '2015, Adrian Tennessee <adrian.tennessee at domainthatnobodytakes.com>'

    title = u'Hürriyet'
    language = 'tr'
    description = u'Hürriyet web site ebook'
    publisher = 'Doğan Media Group'
    category = 'news'
    cover_url = 'https://upload.wikimedia.org/wikipedia/en/4/4f/H%C3%BCrriyet_ilk_sayi.jpg'

    oldest_article = 7
    max_articles_per_feed = 50
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True
    encoding = None

    compress_news_images = True

    # some mild formatting
    extra_css = """.news-image { clear: left; }
    .news-detail-title { clear:left; }
    .col-md-7 { font-size:small; }
    .news-detail-spot { font-style:italic; font-size:smaller }"""

    keep_only_tags = [
        # title
        {'class': 'news-detail-title selectionShareable'},
        # date
        {'class': 'col-md-7 text-right'},
        # image
        {'class': 'news-image'},
        # detail
        {'class': 'news-detail-spot'},
        # text
        {'class': 'news-box'},
    ]

    feeds = [
        (u'Anasayfa', u'http://www.hurriyet.com.tr/rss/anasayfa'),
        (u'Gündem', u'http://www.hurriyet.com.tr/rss/gundem'),
        (u'Ekonomi', u'http://www.hurriyet.com.tr/rss/ekonomi'),
        (u'Magazin', u'http://www.hurriyet.com.tr/rss/magazin'),
        (u'Spor', u'http://www.hurriyet.com.tr/rss/spor'),
        (u'Planet', u'http://www.hurriyet.com.tr/rss/dunya'),
        (u'Teknoloji', u'http://www.hurriyet.com.tr/rss/teknoloji'),
        (u'Sağlık', u'http://www.hurriyet.com.tr/rss/saglik'),
        (u'Astroloji', u'http://www.hurriyet.com.tr/rss/astroloji'),
        (u'Ankara', u'http://www.hurriyet.com.tr/rss/ankara'),
        (u'Ege', u'http://www.hurriyet.com.tr/rss/ege'),
    ]

Third Recipe
--------------
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.hurriyetdailynews.com
'''
import os
import string
import inspect
import datetime
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class HurriyetDailyNews_en(BasicNewsRecipe):
    title = u'Hurriyet Daily News'
    __author__ = u'spswerling'
    description = 'English version of the Turkish daily "Hurriyet"'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.hurriyetdailynews.com/images/design/logo-hurriyet-daily-news.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # on kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    oldest_article = 1.5
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False
    max_articles_per_section = 25
    max_articles_per_subsection = 7

    sections = [
        u'turkey',
        u'economy',
        u'world',
        u'sports',
        # u'life',
        u'opinion',
        # u'arts/culture'
    ]

    # util for creating remove_tags and keep_only_tags style regex matchers
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [tag_matcher('div', 'class', 'NewsDetail')]

    remove_tags = [
        tag_matcher('div', 'class', 'Carousel'),
        tag_matcher('div', 'class', 'ShareIt'),
        tag_matcher('div', 'class', 'tmz'),
        tag_matcher('span', 'id', 'comment'),
        tag_matcher('h2', 'class', 'NewSpot'),
        tag_matcher('h2', 'class', 'pv-gallery'),
    ]

    articles = {}
    subsection_links = {}
    urls_done = []
    links_per_section = {}

    def parse_index(self):
        section_links = self.section_links_from_home_page()
        for section_link in section_links:
            self.articles[self.section_name(section_link)] = []
            subsection_links = self.find_subsection_links(section_link)
            for subsection_link in subsection_links:
                sub_name = self.subsection_name(subsection_link)
                self.subsection_links[sub_name] = []
                self.parse_subsection(section_link, subsection_link)
        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

    def section_links_from_home_page(self):

        def include_link(link):
            return self.text(link).lower() in self.sections

        url = 'http://www.hurriyetdailynews.com/'
        try:
            self._p('hitting home page ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider home page')
            return []

        self._p('Got home page. hunt down section links.')

        regex = re.compile('rmRootLink', re.IGNORECASE)
        links = soup.findAll('a', {'class': regex})

        # use a list (not a filter iterator) so it can be logged and returned
        filtered_links = [link for link in links if include_link(link)]
        self._p(' all sections: ' + ', '.join(map(self.text, links)))
        self._p(' filtered sections: ' +
                ', '.join(map(self.text, filtered_links)))

        return filtered_links

    def find_subsection_links(self, section_link):
        self._p('find subsection links for section ' + str(section_link))
        url = self.abs_url(section_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider subsection')
            return []
        self._p('Got ' + url)

        div = soup.find('div', {'class': 'SeffafLink'})
        if not div:
            self._p('could not find any subsections')
            return [section_link]
        links = div.findAll('a')
        self._p(' subsection links: ' + ', '.join(map(self.text, links)))
        return links

    def parse_subsection(self, section_link, subsection_link):
        section = self.section_name(section_link)
        if len(self.articles[section]) > self.max_articles_per_section:
            return

        # tmp dbg
        # if not self.subsection_name(subsection_link) == 'arts':
        #     return

        self._p('hit section ' + section +
                ', subsect ' + self.subsection_name(subsection_link))
        url = self.abs_url(subsection_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider section')
            return

        self._p('Process links ')
        for link in soup.findAll('a'):
            if 'NewsDetail' in str(link.get('id')):
                self.process_link(section_link, subsection_link, link)

    def process_link(self, section_link, subsection_link, link):
        section = self.section_name(section_link)
        subsection = self.subsection_name(subsection_link)
        title = link['title'] or self.text(link)
        href = link.get('href')
        if not href:
            self._p("BAD HREF: " + str(link))
            return
        self.queue_article_link(section, subsection, href, title)

    def queue_article_link(self, section, subsection, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            # self._p('Skip (already Qd): ' + ' - '.join([section, subsection, title, url]))
            return

        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        if len(self.subsection_links[subsection]) >= \
                self.max_articles_per_subsection:
            return
        self._p('Q: ' + ' - '.join([section, subsection, title, url]))
        full_title = string.capwords(subsection + ' - ' + title)
        self.subsection_links[subsection].append(url)
        self.articles[section].append(
            dict(title=full_title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

    def text(self, n):
        return self.tag_to_string(n).strip()

    def abs_url(self, url):
        if 'www.hurriyetdailynews.com' in url:
            abs_url = url
        elif url[0] == '/':
            abs_url = 'http://www.hurriyetdailynews.com' + url
        else:
            abs_url = 'http://www.hurriyetdailynews.com/' + url
        if '#' in abs_url:
            # strip any fragment from the url
            abs_url = ''.join(abs_url.split('#')[0:-1])
        return abs_url

    def section_name(self, link):
        return self.text(link).lower()

    def subsection_name(self, link):
        # last path component of the href, minus any file extension
        return str(os.path.splitext(link['href'])[0]).split('/')[-1]

    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore, see
            # https://www.mobileread.com/forums/sho....php?p=2931136
            return None
        else:
            return super(HurriyetDailyNews_en, self).preprocess_raw_html(raw_html, url)

    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False

        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return 'too old'
        return False

    def date_from_string(self, datestring):
        try:
            # eg: September/17/2014
            dt = datetime.datetime.strptime(datestring, '%B/%d/%Y')
        except ValueError:
            try:
                # eg: September 17/2014
                dt = datetime.datetime.strptime(datestring, '%B %d/%Y')
            except ValueError:
                dt = None
        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)
        return dt

    def scrape_article_date(self, soup):
        dnode = soup.find('p', {'class': 'dateagency'}) or \
            soup.find('p', {'class': 'Tarih'})
        if dnode:
            return self.date_from_string(self.text(dnode))
        return None

    def _dbg_soup_node(self, node):
        return ' cls: ' + str(node.get('class')).strip() + \
            ' id: ' + str(node.get('id')).strip() + \
            ' txt: ' + self.text(node)

    def _p(self, msg):
        # debug print, prefixed with the name of the calling method
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:120])