Hello! This is my first recipe. It's for the site https://www.theflipside.io/, which doesn't have an RSS feed.
I hope it's useful to someone.
Caveats:
I'm probably not using "feeds" properly. I'm hardcoding the archive URL inside parse_index instead of reading it from "feeds"; this site only has a single feed, so I didn't bother to consume "feeds".
Dates aren't being parsed, so "oldest_article" doesn't take effect.
Otherwise, I'm pretty happy with the output!
Code:
#!/usr/bin/env python
# vim:fileencoding=utf-8
import re
from urllib.parse import urljoin

from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1659410049(BasicNewsRecipe):
    """Calibre news recipe for The Flip Side (https://www.theflipside.io/).

    The article list is scraped from the site's archive page in
    ``parse_index`` rather than read from an RSS feed.
    """

    title = 'The Flip Side'
    # Dates aren't being parsed, so this doesn't have any effect.
    oldest_article = 50
    max_articles_per_feed = 25
    auto_cleanup = False

    # Archive page listing the issues; also the base for resolving the
    # article links found on that page.
    ARCHIVE_URL = 'https://www.theflipside.io/archives'

    remove_tags_before = [
        dict(name='div', class_='menu-button-nav'),
    ]
    remove_tags_after = [
        dict(name='div', class_='on-the-bright-side'),
    ]
    remove_tags = [
        dict(name='div', class_='on-the-bright-side'),
        dict(name='img', class_='archives'),
    ]

    feeds = [
        ('The Flip Side', ARCHIVE_URL),
    ]

    def parse_index(self):
        """Build the article index by scraping the archive page.

        Every element with the ``collection-item`` class is expected to hold
        two ``<h4>`` tags (month, then day), an ``<h3>`` headline and an
        ``<a>`` link. Entries missing any of these are skipped with a warning
        so that one malformed entry does not abort the whole download.

        Returns:
            A one-element list ``[('The Flip Side', articles)]`` where each
            article is a dict with the keys ``month``, ``date``, ``title``,
            ``description`` and ``url``.
        """
        articles = []
        soup = self.index_to_soup(self.ARCHIVE_URL)
        for item in soup.find_all(True, attrs={'class': ['collection-item']}):
            headings = item.find_all('h4')
            headline = item.find('h3')
            link = item.find('a', href=True)
            if len(headings) < 2 or headline is None or link is None:
                self.log.warn('Skipping archive entry with unexpected markup')
                continue

            month = headings[0].get_text()
            day = headings[1].get_text()
            date = month + ' ' + day
            articles.append(dict(
                month=month,
                date=date,
                title=headline.get_text() + ' - ' + date,
                description='',
                # urljoin copes with both site-relative and absolute hrefs.
                url=urljoin(self.ARCHIVE_URL, link['href']),
            ))
        return [('The Flip Side', articles)]

    def get_article_url(self, article):
        """Return the target of the first ``<a href="...">`` in the summary.

        NOTE(review): calibre appears to call this hook only for entries it
        parses from real feeds, not for articles supplied by ``parse_index``
        -- confirm whether this override is still needed.

        Args:
            article: Feed entry object; its ``summary`` attribute is searched.

        Returns:
            The captured href string, or ``None`` when the entry has no
            summary or the summary contains no matching link.
        """
        summary = getattr(article, 'summary', None) or ''
        match = re.search(r'<a href="([^>]*)">', summary)
        return match.group(1) if match else None

    def preprocess_html(self, soup):
        """Convert all text-only links to plain text.

        Technique from
        https://www.mobileread.com/forums/showpost.php?p=1231995&postcount=9

        Anchors whose ``.string`` is ``None`` (for example, ones wrapping
        several child nodes) are left untouched.

        Args:
            soup: Parsed article HTML.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for anchor in soup.find_all('a'):
            text = anchor.string
            if text is not None:
                anchor.replace_with(text)
        return soup