the toc problem isn't solved (will do it later)
the new problem is that only the last month-archive is full content.
so i had to get the date from today and change it.
at the moment i don't know the right way because i've got error-message on run.
this is my try:
Code:
import string, re
from calibre import strftime
from dateutil import relativedelta
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class AdvancedUserRecipe(BasicNewsRecipe):
title = 'National Geo_test_username3'
description = 'Magazin des NG '
__author__ = 'schuster'
publisher = 'Aus dem Online-Archiv des NG'
language = 'de'
cover_url = 'http://www.nationalgeographic.de/images/national-geographic-logo.jpg'
masthead_url = 'http://www.nationalgeographic.de/images/national-geographic-logo.jpg'
# here i try to manage that the actually month change to the last month, because only last month is fully in archive
#in second is handling of the january because there will not be a month before, so change the year one back and set month to 12
def date_manage():
year_norm = strftime('%Y')
month_norm = strftime('%m')
year_min = unicode(int(strftime('%Y')) - 1)
month_min = unicode(int(strftime('%m')[1]) - 2)
if (strftime('%Y')) <= 1:
year = year_min
month = 12
print '------------->beginning/end of year date' +year + month
return year, month
else:
year = year_norm
month = month_min
print '------------> normaldate' +year +month
return year, month
#change the INDEX
INDEX = 'http://www.nationalgeographic.de/archive/'+ year + '-' + month
print INDEX
#grab the content
def parse_index(self):
articles = []
soup = self.index_to_soup(self.INDEX)
feeds = []
for section in soup.findAll('div', attrs={'class':'searchresult_text'}):
section_title = self.tag_to_string(section.find('headline-middle_no_margin black'))
articles = []
for post in section.findAll('a', href=True):
url = post['href']
split_url = url.split("/")
section_title = split_url[1]
if url.startswith('/'): url = 'http://www.nationalgeographic.de'+url
title = self.tag_to_string(post)
if str(post).find('class=') > 0:
klass = post['class']
if klass != "":
self.log()
self.log('--> post: ', post)
self.log('--> url: ', url)
self.log('--> title: ', title)
self.log('--> class: ', klass)
articles.append({'title':title, 'url':url})
if articles:
feeds.append((section_title, articles)) #manage of build the toc is incorrect (need change)
return feeds
keep_only_tags = [dict(attrs={'class':['contentbox_no_top_border']})]
remove_tags = [dict(name='div', attrs={'class':'gallery'})]
need help on this attempt to change the date

