09-10-2021, 08:34 AM   #5
oneillpt
Updated recipes for Helsingin Sanomat and Аргументы и Факты

NOTE THAT THE UPDATED RECIPE FOR Аргументы и Факты REQUIRES TWO SMALL CHANGES TO CALIBRE SOURCE CODE, DISCUSSED BELOW

Helsingin Sanomat:
========================================
This recipe provides four sections of the paper (five on Sunday)
========================================

#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import unicode_literals, division, absolute_import, print_function
from datetime import date
from calibre.web.feeds.news import BasicNewsRecipe


class AdvancedUserRecipe1631181034(BasicNewsRecipe):
    title = 'Helsingin Sanomat'
    language = 'fi'
    oldest_article = 7
    max_articles_per_feed = 200
    auto_cleanup = True

    feeds = [
        ('Helsingin Sanomat', 'https://www.hs.fi'),
    ]
    INDEX = 'https://www.hs.fi/'

    def do_Section(self, nxtINDEX, section_title, feeds):
        # Collect the article links from one section index page and append
        # them to feeds as a (section_title, articles) tuple.
        articles = []
        soup = self.index_to_soup(nxtINDEX)
        for section in soup.findAll('a', attrs={'class': 'block'}):
            try:
                title = section.findAll('h2')[0].get_text()
                link = section['href']
                if link.startswith('/'):
                    link = 'https://www.hs.fi' + link
                articles.append({u'title': title, u'url': link})
            except Exception:
                self.log("exception handled")
        if articles:
            feeds.append((section_title, articles))
        return feeds

    def parse_index(self):
        feeds = []
        self.do_Section('https://www.hs.fi/', u'Etusivi', feeds)
        self.do_Section('https://www.hs.fi/kotimaa/', u'Kotimaa', feeds)
        self.do_Section('https://www.hs.fi/kulttuuri/', u'Kulttuuri', feeds)
        self.do_Section('https://www.hs.fi/ulkomaat/', u'Ulkomaat', feeds)
        # The Sunnuntai section only appears on Sundays (weekday() == 6).
        if date.weekday(date.today()) == 6:
            self.do_Section('https://www.hs.fi/sunnuntai/', u'Sunnuntai', feeds)
        return feeds
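
For reference, parse_index() is expected to hand Calibre a list of (section title, article list) tuples, where each article is a dictionary with at least 'title' and 'url' keys ('date' and 'description' are optional and not filled in here). A minimal sketch of the shape this recipe builds, with illustrative values only:

feeds = [
    (u'Etusivi', [
        {u'title': u'Example headline', u'url': 'https://www.hs.fi/example-article/'},
    ]),
    (u'Kotimaa', [
        {u'title': u'Another headline', u'url': 'https://www.hs.fi/kotimaa/example-article/'},
    ]),
]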



========================================
Аргументы и Факты:
========================================
The distributed recipe runs, but provides no content. The recipe
below runs and provides content. However, some Unicode directory
and file names are encountered as type 'bytes' rather than as type 'str',
and two small modifications to news.py are needed to handle this. The
modified code handles both 'str' and 'bytes'. I will suggest
these changes to the development forum for inclusion in Calibre, but
if you have local development code and need the Аргументы и Факты
recipe, you only need to make the two changes below. I will also try to
tidy the recipe further now that it is working, and post a tidied version.
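
As background, the failure mode is easy to reproduce outside Calibre: in Python 3, the os.path functions return bytes when given bytes, and mixing the result with an ordinary str path component raises a TypeError. A standalone sketch with a made-up path (not Calibre code):

import os

# A directory/file name obtained as bytes, as happens for some Unicode names here
result0 = b'/tmp/recipe/\xd0\xb0\xd0\xb8\xd1\x84/article.html'

d = os.path.dirname(result0)      # bytes in, bytes out
# os.path.join(d, 'index.html')   # TypeError: Can't mix strings and bytes in path components

# Decoding first, as in change 2) below, avoids the problem:
index = os.path.join(os.path.dirname(result0.decode('utf-8')), 'index.html')
print(index)                      # /tmp/recipe/аиф/index.html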

1) in canonicalize_internal_url(self, url, is_link=True):
replace
    return frozenset([(parts.netloc, (parts.path or '').rstrip('/'))])
by
    zzp = parts.path
    zzn = parts.netloc
    if not isinstance(zzp, str):  # path and netloc may arrive as bytes
        zzp = parts.path.decode("utf-8")
        zzn = parts.netloc.decode("utf-8")
    return frozenset([(zzn, (zzp or '').rstrip('/'))])

2) in article_downloaded(self, request, result):
replace
    index = os.path.join(os.path.dirname(result[0]), 'index.html')
by
    zzr = result[0]
    if not isinstance(zzr, str):  # the downloaded path may arrive as bytes
        zzr = result[0].decode("utf-8")
    index = os.path.join(os.path.dirname(zzr), 'index.html')
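
Both changes follow the same pattern: decode a value to str if it arrived as bytes, and leave it alone otherwise. If similar cases turn up elsewhere in news.py, the pattern could be factored into a small helper along these lines (a sketch only, using a hypothetical name, not part of the current Calibre source):

def as_unicode_path(value, encoding='utf-8'):
    # Return value as str, decoding only when it is bytes; captures the
    # pattern used in changes 1) and 2) above.
    if isinstance(value, bytes):
        return value.decode(encoding)
    return value

# e.g. in canonicalize_internal_url:
#     zzn = as_unicode_path(parts.netloc)
#     zzp = as_unicode_path(parts.path)
# and in article_downloaded:
#     index = os.path.join(os.path.dirname(as_unicode_path(result[0])), 'index.html')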
========================================

#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import with_statement, unicode_literals
import os
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.fetch.simple import RecursiveFetcher


class AdvancedUserRecipe1592177429(BasicNewsRecipe):
    title = 'Аргументы и Факты'
    encoding = 'utf8'
    language = 'ru'
    oldest_article = 7
    max_articles_per_feed = 25
    auto_cleanup = True
    verbose = 3

    feeds = [
        ('AIF', 'https://www.aif.ru/rss/all.php'),
    ]
    INDEX = 'https://www.aif.ru/rss/all.php'

    def fetch_article(self, url, dir_, f, a, num_of_feeds):
        # Follows BasicNewsRecipe.fetch_article, except that the article URLs
        # built in parse_index below are bytes, so the URL is decoded before
        # fetching and the results are re-encoded afterwards.
        br = self.browser
        if hasattr(self.get_browser, 'is_base_class_implementation'):
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        fetcher = RecursiveFetcher(self.web2disk_options, self.log,
                                   self.image_map, self.css_map,
                                   (url, f, a, num_of_feeds))
        fetcher.browser = br
        fetcher.base_dir = dir_
        fetcher.current_dir = dir_
        fetcher.show_progress = False
        fetcher.image_url_processor = self.image_url_processor
        res, path, failures = (fetcher.start_fetch(url.decode()),
                               fetcher.downloaded_paths, fetcher.failed_links)
        res = res.encode("utf-8")
        path[0] = path[0].encode()
        if not res or not os.path.exists(res):
            msg = _('Could not fetch article.') + ' '
            if self.debug:
                msg += _('The debug traceback is available earlier in this log')
            else:
                msg += _('Run with -vv to see the reason')
            raise Exception(msg)

        return res, path, failures

    def parse_index(self):
        # Parse the RSS index directly: the text between each item's <link>
        # and <description> tags is used as the article URL (kept as bytes),
        # and the CDATA wrapper is stripped from the description to use it
        # as the title.
        feeds = []
        section_title = u'aif'
        articles = []
        soup = self.index_to_soup(self.INDEX)
        ii = 0
        for item in soup.findAll('item'):
            if ii < self.max_articles_per_feed:
                ii = ii + 1
                try:
                    A = str(item)
                    i = A.find(u'link')
                    j = A.find(u'description')
                    # Strip the surrounding <description><![CDATA[ ... ]]> markup
                    ZZ2 = str(item.find('description'))[24:-19]
                    AB1 = A[i:j].encode()
                    articles.append({'url': AB1[6:-2], 'title': ZZ2})
                except Exception:
                    self.log("Exception handled!")
        if articles:
            feeds.append((section_title, articles))
        return feeds