I'm trying to set the "date" for the article dictionary in parse_index(). It seems that at a certain point, Calibre stops accepting my manipulation of the source data.
This is the code I think should work:
Code:
# Read the date text from the <i> element and turn it into a display string.
i = div.find('i')
# Month abbreviation -> month number (Jan=1 ... Dec=12).
m = dict(zip(('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'), range(1, 13)))
# Only the text nodes directly inside <i> are used, joined and stripped.
rawText = ''.join(i.findAll(text=True, recursive=False)).strip()
dateGroup = re.match(r"(?P<month>\w+) (?P<date>\d+), (?P<year>\d+)", rawText)
year = int(dateGroup.group('year'))
month = m[dateGroup.group('month')]
day = int(dateGroup.group('date'))
artDate = date(year, month, day)
# Formatted as abbreviated weekday, day of month, abbreviated month.
pubdate = artDate.strftime('%a, %d %b')
When executed, the date is always today's date. However, when I assign "pubdate" to the "description" in the article dictionary, it is the correct value.
Curiously, the following three all work as expected:
Code:
# Alternative 1: call strftime with only a format string and no date argument
# (presumably calibre's strftime then formats the current time - confirm).
pubdate = strftime('%a, %d %b')
Code:
# Alternative 2: use the <i> element's own text nodes, joined and stripped,
# as the date string without parsing it.
pubdate = ''.join(i.findAll(text=True, recursive=False)).strip()
Code:
# Alternative 3: concatenate the captured year, the numeric month from the
# lookup table, and the captured day into one string (no separators).
pubdate = dateGroup.group('year') + '{0}'.format(m[dateGroup.group('month')]) + dateGroup.group('date')
This is my complete recipe:
Spoiler:
Code:
import string, re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from datetime import date
from datetime import timedelta
class AdvancedUserRecipe1328808344(BasicNewsRecipe):
    """Recipe that builds one feed from the C-Fam Friday Fax archive page.

    The archive page is scanned for ``<div>`` elements that contain an
    ``<a class="ffArchiveLink">`` link and an ``<i>`` element holding a date
    written like ``Feb 10, 2012``. Each such link becomes one article.
    """

    title = u'C-Fam Friday Fax'
    oldest_article = 10.66
    max_articles_per_feed = 100
    auto_cleanup = True

    # Month number keyed by the first three letters of the English month
    # name, so both "Feb" and "February" resolve to 2.
    _MONTHS = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
               'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

    # Matches "<month> <day>, <year>" at the start of the date text.
    _DATE_RE = re.compile(r"(?P<month>\w+) (?P<date>\d+), (?P<year>\d+)")

    def _parse_article_date(self, text):
        """Return a ``datetime.date`` parsed from *text*, or None.

        None is returned when *text* does not start with the expected
        "<month> <day>, <year>" pattern, when the month name is unknown,
        or when the numbers do not form a valid calendar date.
        """
        match = self._DATE_RE.match(text)
        if match is None:
            return None
        try:
            return date(int(match.group('year')),
                        self._MONTHS[match.group('month')[:3]],
                        int(match.group('date')))
        except (KeyError, ValueError):
            # Unknown month name, or an impossible date such as day 31 of a
            # 30-day month.
            return None

    def parse_index(self):
        """Build the feed list from the Friday Fax archive page.

        Returns:
            A list holding one ``(feed title, articles)`` tuple, where each
            article is a dict with the keys 'title', 'url', 'date' and
            'description'.

        Raises:
            ValueError: if no article passes the filters.
        """
        soup = self.index_to_soup('http://www.c-fam.org/fridayfax/')
        articles = []
        seen_links = set()
        # Articles dated on or before this day are dropped. The value does
        # not change during the scan, so it is computed once.
        cutoff = date.today() - timedelta(days=self.oldest_article)

        for div in soup.findAll('div'):
            a = div.find('a', href=True, attrs={'class': 'ffArchiveLink'})
            if not a:
                continue
            # div.find() searches descendants, so an enclosing <div> yields
            # the same link as the inner one; each href is handled once.
            if a['href'] in seen_links:
                continue
            seen_links.add(a['href'])

            i = div.find('i')
            if not i:
                continue
            date_text = ''.join(i.findAll(text=True, recursive=False)).strip()
            art_date = self._parse_article_date(date_text)
            if art_date is None:
                # An <i> element whose text is not a date: skip the entry
                # instead of aborting the whole download.
                continue
            if art_date <= cutoff:
                continue

            pubdate = art_date.strftime('%a, %d %b')
            articles.append({
                'title': ''.join(a.findAll(text=True, recursive=False)).strip(),
                'url': 'http://www.c-fam.org/' + a['href'],
                'date': pubdate,
                # The formatted date is also stored as the description, as
                # in the recipe as originally posted.
                'description': pubdate,
            })

        if not articles:
            raise ValueError('No articles found, aborting')
        return [(self.title, articles)]
What's going on here? I don't understand why it won't work.