09-10-2021, 08:34 AM   #5
oneillpt
Updated recipes for Helsingin Sanomat and Аргументы и Факты

NOTE THAT THE UPDATED RECIPE FOR Аргументы и Факты REQUIRES TWO SMALL CHANGES TO CALIBRE SOURCE CODE, DISCUSSED BELOW

Helsingin Sanomat:
========================================
This recipe provides four sections of the paper (five on Sunday)
========================================

#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1631181034(BasicNewsRecipe):
    title = 'Helsingin Sanomat'
    language = 'fi'
    oldest_article = 7
    max_articles_per_feed = 200
    auto_cleanup = True

    feeds = [
        ('Helsingin Sanomat', 'https://www.hs.fi'),
    ]
    INDEX = 'https://www.hs.fi/'

    def do_Section(self, nxtINDEX, section_title, feeds):
        # Collect the article links from one section index page and append
        # them to feeds as a (section_title, articles) tuple.
        articles = []
        soup = self.index_to_soup(nxtINDEX)
        for section in soup.findAll('a', attrs={'class': 'block'}):
            try:
                title = section.findAll('h2')[0].get_text()
                link = section['href']
                if link.startswith('/'):
                    link = 'https://www.hs.fi' + link
                articles.append({u'title': title, u'url': link})
            except Exception:
                self.log("exception handled")
        if articles:
            feeds.append((section_title, articles))
        return feeds

    def parse_index(self):
        feeds = []
        self.do_Section('https://www.hs.fi/', u'Etusivi', feeds)
        self.do_Section('https://www.hs.fi/kotimaa/', u'Kotimaa', feeds)
        self.do_Section('https://www.hs.fi/kulttuuri/', u'Kulttuuri', feeds)
        self.do_Section('https://www.hs.fi/ulkomaat/', u'Ulkomaat', feeds)
        # The Sunnuntai section only appears on Sundays (weekday() == 6).
        if date.weekday(date.today()) == 6:
            self.do_Section('https://www.hs.fi/sunnuntai/', u'Sunnuntai', feeds)
        return feeds
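
For reference, parse_index() is expected to hand Calibre a list of (section title, article list) tuples, where each article is a dictionary with at least 'title' and 'url' keys ('date' and 'description' are optional and not filled in here). A minimal sketch of the shape this recipe builds, with illustrative values only:

feeds = [
    (u'Etusivi', [
        {u'title': u'Example headline', u'url': 'https://www.hs.fi/example-article/'},
    ]),
    (u'Kotimaa', [
        {u'title': u'Another headline', u'url': 'https://www.hs.fi/kotimaa/example-article/'},
    ]),
]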



========================================
Аргументы и Факты:
========================================
The distributed recipe runs, but provides no content. The recipe
below runs and provides content. However, some Unicode directory
and file names are encountered as type 'bytes' rather than as type 'str',
and two small modifications to news.py are needed to handle this. The
modified code handles both 'str' and 'bytes'. I will suggest
these changes to the development forum for inclusion in Calibre, but
if you have local development code and need the Аргументы и Факты
recipe, you only need to make the two changes below. I will also try to
tidy the recipe further now that it is working, and post a tidied version.
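
As background, the failure mode is easy to reproduce outside Calibre: in Python 3, the os.path functions return bytes when given bytes, and mixing the result with an ordinary str path component raises a TypeError. A standalone sketch with a made-up path (not Calibre code):

import os

# A directory/file name obtained as bytes, as happens for some Unicode names here
result0 = b'/tmp/recipe/\xd0\xb0\xd0\xb8\xd1\x84/article.html'

d = os.path.dirname(result0)      # bytes in, bytes out
# os.path.join(d, 'index.html')   # TypeError: Can't mix strings and bytes in path components

# Decoding first, as in change 2) below, avoids the problem:
index = os.path.join(os.path.dirname(result0.decode('utf-8')), 'index.html')
print(index)                      # /tmp/recipe/аиф/index.html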

1) in canonicalize_internal_url(self, url, is_link=True):
replace
    return frozenset([(parts.netloc, (parts.path or '').rstrip('/'))])
by
    zzp = parts.path
    zzn = parts.netloc
    if not isinstance(zzp, str):  # path and netloc may arrive as bytes
        zzp = parts.path.decode("utf-8")
        zzn = parts.netloc.decode("utf-8")
    return frozenset([(zzn, (zzp or '').rstrip('/'))])

2) in article_downloaded(self, request, result):
replace
    index = os.path.join(os.path.dirname(result[0]), 'index.html')
by
    zzr = result[0]
    if not isinstance(zzr, str):  # the downloaded path may arrive as bytes
        zzr = result[0].decode("utf-8")
    index = os.path.join(os.path.dirname(zzr), 'index.html')
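
Both changes follow the same pattern: decode a value to str if it arrived as bytes, and leave it alone otherwise. If similar cases turn up elsewhere in news.py, the pattern could be factored into a small helper along these lines (a sketch only, using a hypothetical name, not part of the current Calibre source):

def as_unicode_path(value, encoding='utf-8'):
    # Return value as str, decoding only when it is bytes; captures the
    # pattern used in changes 1) and 2) above.
    if isinstance(value, bytes):
        return value.decode(encoding)
    return value

# e.g. in canonicalize_internal_url:
#     zzn = as_unicode_path(parts.netloc)
#     zzp = as_unicode_path(parts.path)
# and in article_downloaded:
#     index = os.path.join(os.path.dirname(as_unicode_path(result[0])), 'index.html')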
========================================

#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import with_statement, unicode_literals
import os
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.fetch.simple import RecursiveFetcher


class AdvancedUserRecipe1592177429(BasicNewsRecipe):
    title = 'Аргументы и Факты'
    encoding = 'utf8'
    language = 'ru'
    oldest_article = 7
    max_articles_per_feed = 25
    auto_cleanup = True
    verbose = 3

    feeds = [
        ('AIF', 'https://www.aif.ru/rss/all.php'),
    ]
    INDEX = 'https://www.aif.ru/rss/all.php'

    def fetch_article(self, url, dir_, f, a, num_of_feeds):
        # Follows BasicNewsRecipe.fetch_article, except that the article URLs
        # built in parse_index below are bytes, so the URL is decoded before
        # fetching and the results are re-encoded afterwards.
        br = self.browser
        if hasattr(self.get_browser, 'is_base_class_implementation'):
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        fetcher = RecursiveFetcher(self.web2disk_options, self.log,
                                   self.image_map, self.css_map,
                                   (url, f, a, num_of_feeds))
        fetcher.browser = br
        fetcher.base_dir = dir_
        fetcher.current_dir = dir_
        fetcher.show_progress = False
        fetcher.image_url_processor = self.image_url_processor
        res, path, failures = (fetcher.start_fetch(url.decode()),
                               fetcher.downloaded_paths, fetcher.failed_links)
        res = res.encode("utf-8")
        path[0] = path[0].encode()
        if not res or not os.path.exists(res):
            msg = _('Could not fetch article.') + ' '
            if self.debug:
                msg += _('The debug traceback is available earlier in this log')
            else:
                msg += _('Run with -vv to see the reason')
            raise Exception(msg)

        return res, path, failures

    def parse_index(self):
        # Parse the RSS index directly: the text between each item's <link>
        # and <description> tags is used as the article URL (kept as bytes),
        # and the CDATA wrapper is stripped from the description to use it
        # as the title.
        feeds = []
        section_title = u'aif'
        articles = []
        soup = self.index_to_soup(self.INDEX)
        ii = 0
        for item in soup.findAll('item'):
            if ii < self.max_articles_per_feed:
                ii = ii + 1
                try:
                    A = str(item)
                    i = A.find(u'link')
                    j = A.find(u'description')
                    # Strip the surrounding <description><![CDATA[ ... ]]> markup
                    ZZ2 = str(item.find('description'))[24:-19]
                    AB1 = A[i:j].encode()
                    articles.append({'url': AB1[6:-2], 'title': ZZ2})
                except Exception:
                    self.log("Exception handled!")
        if articles:
            feeds.append((section_title, articles))
        return feeds