Fixing .Net once again as they changed their feed system. They are using feedburner for linking to the articles and I'm having a hard time getting my modified recipe to work. What I am getting is the entries from the feedburner and not the actual articles at all. (the title, link, description).
Tried to use parse_feed and parse_index and despite reading about them and trying them I'm not getting it to do what I need it to do.
This is the recipe using parse_feeds which I would like to take the url of each feed and put it into a list that would actually be retrieved and packaged into my epub.
Spoiler:
Code:
import os, time, traceback, re, urlparse, sys, cStringIO
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.fetch.simple import option_parser as web2disk_option_parser
class dotnetMagazine (BasicNewsRecipe):
__author__ = u'Bonni Salles - post in forum if questions for me'
__version__ = '1.1'
__license__ = 'GPL v3'
__copyright__ = u'2013, Bonni Salles'
title = '.net magazine'
oldest_article = 7
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = True
recursion = 2
articles_are_obfuscated = False
language = 'en'
remove_empty_feeds = True
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '
#cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png'
auto_cleanup = False
#remove_tags_after = dict(name='footer', id=lambda x:not x)
#remove_tags_before = dict(name='header', id=lambda x:not x)
#remove_tags = [
# dict(name='div', attrs={'class': 'item-list'}),
# dict(name='h4', attrs={'class': 'std-hdr'}),
# dict(name='div', attrs={'class': 'item-list share-links'}), #removes share links
# dict(name=['script', 'noscript']),
# dict(name='div', attrs={'id': 'comments-form'}), #comment these out if you want the comments to show
# dict(name='div', attrs={'id': re.compile('advertorial_block_($|| )')}),
# dict(name='div', attrs={'id': 'right-col'}),
# dict(name='div', attrs={'id': 'comments'}), #comment these out if you want the comments to show
# dict(name='div', attrs={'class': 'item-list related-content'}),
# ]
feeds = [
(u'web design', u'http://feeds.feedburner.com/creativebloq/web-design?format=xml'),
(u'Tutorials', u'http://feeds.feedburner.com/creativebloq/tutorial?format=xml'),
]
# def skip_ad_pages(self, soup):
# text = soup.find(text=' click here to continue to article ')
# if text:
# a = text.parent
# url = a.get('href')
# if url:
# return self.index_to_soup(url, raw=True)
def parse_feeds (self):
feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds:
for article in feed.articles[:]:
print 'article.title is: ', article.title
print 'article.url is: ', article.url
print 'article.date is: ', article.date
print 'article.text_summary is: ', article.text_summary
# feeds.append((title, articles))
return feeds
def make_links(self, url):
title = 'Temp'
current_articles = []
soup = self.index_to_soup(url)
# print 'The soup is: ', soup
for item in soup.findAll('a', attrs={'href':'contentheading'}):
#I think that here the contentheading must be changed or deleted
for item in soup.findAll('a'):
print 'item is: ', item
link = item.find('a')
print 'the link is: ', link
if link:
url = self.INDEX + link['href']
title = self.tag_to_string(link)
print 'the title is: ', title
print 'the url is: ', url
print 'the title is: ', title
#current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this
return current_articles
def preprocess_html(self, soup):
for item in soup.findAll(attrs={'style':True}):
del item['style']
return soup
This is the link to the xml format of the listing of the articles:
<channel><title>Creative Bloq Web design</title>
<link>http://www.creativebloq.com/feed/web-design</link><description>Your daily dose of design inspiration Web design</description><language>en</language><ttl>30</ttl><image><url>http://media.creativebloq.futurecdn.net/sites/creativebloq.com/themes/creativebloq/logo.png</url><title>Creative Bloq Web design</title><link>http://www.creativebloq.com/feed/web-design</link></image><generator>Future Syndication Engine - v0.01</generator><copyright>Copyright Future Publishing Limited. Reg no. 2008885 England</copyright><lastBuildDate>Sat, 05 Apr 2014 10:00:07 +0000</lastBuildDate><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="self" type="application/rss+xml" href="http://feeds.feedburner.com/creativebloq/web-design" />
<feedburner:info uri="creativebloq/web-design" /><atom10:link xmlns:atom10="http://www.w3.org/2005/Atom" rel="hub" href="http://pubsubhubbub.appspot.com/" /><feedburner:browserFriendly></feedburner:browserFriendly>
<item><link>http://www.creativebloq.com/netmag/andrew-clarke-designing-left-41411226</link><guid>http://www.creativebloq.com/netmag/andrew-clarke-designing-left-41411226</guid><title>Andrew Clarke on designing for the Left</title><description>Read more about <a href="http://www.creativebloq.com/netmag/andrew-clarke-designing-left-41411226">Andrew Clarke on designing for the Left</a> at CreativeBloq.com <hr>Stuff & Nonsense is one of five shortlisted nominees for the Agency of the Year award in the 2014 net Awards. We spoke to founder Andrew Clarke about recent projects and how things have evolved over the company's 16 year history.</description><pubDate>Fri, 04 Apr 2014 17:42:46 +0000</pubDate></item>
They also offer an html format which can be used if I could figure it out instead:
<li class="regularitem" xmlns:dc="http://purl.org/dc/elements/1.1/">
<h4 class="itemtitle">
<a href="http://www.creativebloq.com/netmag/cole-henley-creating-mud-matt-powell-41411214">Cole Henley on creating Mud with Matt Powell</a>
</h4>
<h5 class="itemposttime">
<span>Posted:</span>Wed, 02 Apr 2014 15:00:24 +0000</h5>
<div class="itemcontent" name="decodeable">Read more about <a href="http://www.creativebloq.com/netmag/cole-henley-creating-mud-matt-powell-41411214">Cole Henley on creating Mud with Matt Powell</a> at CreativeBloq.com <hr>Bath-based web design agency Mud is a shortlisted nominee for the New Agency of the Year award in the 2014 net Awards. We spoke to Cole Henley about how he joined up with Matt Powell last year to create Mud.</div>
</li>
<li class="regularitem">
<h4 class="itemtitle">
<a href="http://www.creativebloq.com/netmag/syd-lawrence-why-coding-should-still-be-fun-41411219">Syd Lawrence on why coding should still be fun</a>
</h4>
<h5 class="itemposttime">
<span>Posted:</span>Wed, 02 Apr 2014 14:21:52 +0000</h5>
<div class="itemcontent" name="decodeable">Read more about <a href="http://www.creativebloq.com/netmag/syd-lawrence-why-coding-should-still-be-fun-41411219">Syd Lawrence on why coding should still be fun</a> at CreativeBloq.com <hr>Last year we held our first ever conference for web designers, with an outstanding speaker line-up that included Oliver Reichenstein, Stephanie Reiger and Mark Boulton. It was such a great day that this year we're holding two conferences, one in the UK and one in the US.</div>
</li>
Okay all, can someone please help me figure out the parse_feed coding or parse_index coding for one of these two feeds
questions/changes (updated 4/11/14):
it's using a blog format for the entries but also has the feed for access. if anyone can figure out what I'm doing wrong on this one, please let me know. I've been playing with it for over a week and am totally stuck, I'm getting the entries of the feeds.feedburner and not following the links from it like I'd like it to.
Here is the latest version but it's giving me even more trouble:
Spoiler:
Code:
import os, time, traceback, re, urlparse, sys, cStringIO
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
from calibre.web.feeds import feed_from_xml, templates, feeds_from_index, Feed
from calibre.ptempfile import PersistentTemporaryFile
class dotnetMagazine (BasicNewsRecipe):
__author__ = u'Bonni Salles - post in forum if questions for me'
__version__ = '1.1'
__license__ = 'GPL v3'
__copyright__ = u'2013, Bonni Salles'
title = '.net magazine'
oldest_article = 7
no_stylesheets = True
encoding = 'utf8'
use_embedded_content = True
recursion = 3
articles_are_obfuscated = True
language = 'en'
remove_empty_feeds = True
extra_css = ' body{font-family: Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} '
#cover_url = u'http://media.netmagazine.futurecdn.net/sites/all/themes/netmag/logo.png'
auto_cleanup = False
#remove_tags_after = dict(name='footer', id=lambda x:not x)
#remove_tags_before = dict(name='header', id=lambda x:not x)
#remove_tags = [
# dict(name='div', attrs={'class': 'item-list'}),
# dict(name='h4', attrs={'class': 'std-hdr'}),
# dict(name='div', attrs={'class': 'item-list share-links'}), #removes share links
# dict(name=['script', 'noscript']),
# dict(name='div', attrs={'id': 'comments-form'}), #comment these out if you want the comments to show
# dict(name='div', attrs={'id': re.compile('advertorial_block_($|| )')}),
# dict(name='div', attrs={'id': 'right-col'}),
# dict(name='div', attrs={'id': 'comments'}), #comment these out if you want the comments to show
# dict(name='div', attrs={'class': 'item-list related-content'}),
# ]
feeds = [
#(u'web design', u'http://feeds.feedburner.com/creativebloq/web-design?format=xml'),
#(u'Tutorials', u'http://feeds.feedburner.com/creativebloq/tutorial?format=xml'),
(u'web design', u'http://feeds.feedburner.com/creativebloq/web-design'),
(u'Tutorials', u'http://feeds.feedburner.com/creativebloq/tutorial'), ]
INDEX = 'http://creativeblog.com/netmag'
articles = {}
regularitem = ()
def parse_index(self):
feeds = []
for title, url in [
(u'web design', u'feeds.feedburner.com/creativebloq/web-design'),
(u'Tutorials', u'feeds.feedburner.com/creativebloq/tutorial'),
]:
articles = self.make_links(url)
if articles:
feeds.append((title, articles))
return feeds
def make_links(self, url):
#title = 'Temp'
date = ''
current_articles = []
soup = self.index_to_soup(url)
#print 'The soup is: ', soup
for item in soup.find('h4', attrs={'class':'itemtitle'}):
recipes = regularitem.findAll('a')
for recipe in recipes:
page_url = self.INDEX + recipe['href']
print 'page_url is: ', page_url
title = recipe.find('Read more about').string
print 'title is: ', title
current_articles.append({'title': title, 'url': url, 'description':'', 'date':date}) # append all this
return current_articles
# def make_links(self, url):
# #title = 'Temp'
# soup = self.index_to_soup(url)
# #print 'The soup is: ', soup
# current_articles = []
# for item in soup.findAll('li', attrs = {'class': 'regularitem'}):
# print 'item is: ', item
# link = item.find('a')
# print 'the link is: ', link
# if link:
# url = self.INDEX + link['href']
# title = self.tag_to_string(link)
# print 'the title is: ', title
# print 'the url is: ', url
# print 'the title is: ', title
# current_articles.append({'title': title, 'url': url, 'description':'', 'date':''}) # append all this
# return current_articles
def preprocess_html(self, soup):
for item in soup.findAll(attrs={'style':True}):
del item['style']
return soup
Found the issue and fixed the recipe, turned out to be much easier than I expected. So attached is the new version of the recipe so that the program can be updated. By the way it's now called Creative Blog which covers many of the magazines of which .net is just one of.