View Single Post
Old 09-10-2021, 11:51 AM   #9
oneillpt
Connoisseur
oneillpt began at the beginning.
 
Posts: 63
Karma: 46
Join Date: Feb 2011
Device: Kindle 3 (cracked screen!); PW1; Oasis
Thanks. I've now removed all the encode and decode calls as well as my modified fetch_article, and tested without the suggested modifications to news.py - everything now runs successfully. The need to handle byte strings must have arisen during development of the recipe but was not necessary in the final recipe. The simplified recipe follows below. I'll try to tidy it up further and update it in the next day or two.

#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import with_statement, unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.fetch.simple import (
AbortArticle, RecursiveFetcher, option_parser as web2disk_option_parser
)
import string as st
import calibre.web.feeds.news
import os, sys
# NOTE(review): the value returned by dir() is discarded, so this line does
# nothing except evaluate the name ``BeautifulSoup``. That name is not
# imported anywhere in this file -- presumably the recipe loader supplies it
# in the execution namespace (TODO confirm); if it does not, this line raises
# NameError. Looks like a debugging leftover.
dir(BeautifulSoup)

class AdvancedUserRecipe1592177429(BasicNewsRecipe):
    """Recipe that builds a single 'aif' section from the aif.ru RSS feed.

    ``parse_index`` downloads the feed at ``INDEX`` itself and cuts each
    article's URL and title out of the serialized ``<item>`` markup.
    """

    title = 'Аргументы и Факты'
    encoding = 'utf8'
    language = 'ru'
    oldest_article = 7
    max_articles_per_feed = 25
    auto_cleanup = True
    verbose = 3

    feeds = [
        ('AIF', 'https://www.aif.ru/rss/all.php'),
    ]
    INDEX = 'https://www.aif.ru/rss/all.php'

    def preprocess_html(self, soup):
        """Return ``soup`` as processed by the base class (no extra work)."""
        return BasicNewsRecipe.preprocess_html(self, soup)

    def preprocess_raw_html(self, raw_html, url):
        """Return ``raw_html`` as processed by the base class (no extra work)."""
        return BasicNewsRecipe.preprocess_raw_html(self, raw_html, url)

    def _article_from_item(self, item):
        """Build the ``{'url', 'title'}`` dict for one feed ``<item>``.

        Returns ``None`` when the expected markers are missing from the
        item's markup, so the caller can skip the item instead of queuing
        an article with a garbage URL.
        """
        markup = str(item)
        link_start = markup.find(u'link')
        desc_start = markup.find(u'description')
        # str.find returns -1 when the marker is absent; slicing with -1
        # would silently yield a meaningless fragment.
        if link_start < 0 or desc_start <= link_start:
            return None

        description = item.find('description')
        if description is None:
            return None

        # The fixed offsets below are the original author's, tuned to how
        # this feed's items serialize. Presumably [6:-2] drops the link tag
        # text in front of the URL and the start of the following tag, and
        # [24:-19] drops the description tag plus a CDATA wrapper -- TODO
        # confirm against live feed markup before changing them.
        url = markup[link_start:desc_start][6:-2]
        if not url:
            return None
        title = str(description)[24:-19]
        return {'url': url, 'title': title}

    def parse_index(self):
        """Return ``[(section_title, articles)]`` built from ``INDEX``.

        Only the first ``max_articles_per_feed`` ``<item>`` elements of the
        feed are examined. Each article is a dict with ``'url'`` and
        ``'title'`` keys; the title text is taken from the item's
        ``description`` element. Items that cannot be parsed are logged and
        skipped. An empty list is returned when no article was extracted.
        """
        section_title = u'aif'
        articles = []
        soup = self.index_to_soup(self.INDEX)
        for seen, item in enumerate(soup.findAll('item')):
            # Stop as soon as the cap is reached rather than walking the
            # rest of the feed doing nothing.
            if seen >= self.max_articles_per_feed:
                break
            try:
                article = self._article_from_item(item)
            except Exception as err:
                # Best effort per item: one bad entry must not abort the
                # whole index, but record what actually went wrong.
                self.log('Exception handled! ' + repr(err))
                continue
            if article is None:
                self.log('Skipping feed item: link/description not found')
                continue
            articles.append(article)

        if articles:
            return [(section_title, articles)]
        return []
oneillpt is offline   Reply With Quote