



After a fresh cup of coffee, the building of the URL is O.K.
You are right, there are many ways to solve a problem.
This one IS solved right now.
Here is the way:
Code:
import string, re
from calibre import strftime
class AdvancedUserRecipe(BasicNewsRecipe):
    """Calibre recipe for the German National Geographic online archive.

    The index page that is fetched is the archive page of the *previous*
    calendar month (``INDEX + '<year>-<month>'``), because only a finished
    month has its whole content online.
    """

    title = '007_National Geo_run'
    description = 'Magazin des NG '
    __author__ = 'schuster'
    publisher = 'Aus dem Online-Archiv des NG'
    language = 'de'
    cover_url = 'http://www.nationalgeographic.de/images/national-geographic-logo.jpg'
    masthead_url = 'http://www.nationalgeographic.de/images/national-geographic-logo.jpg'
    INDEX = 'http://www.nationalgeographic.de/archive/'

    keep_only_tags = [dict(attrs={'class': ['contentbox_no_top_border']})]
    remove_tags = [dict(name='div', attrs={'class': 'gallery'})]

    @staticmethod
    def _previous_month(year, month):
        """Return ``(year, month)`` of the calendar month before the given one.

        Both arguments and both results are plain integers; January rolls
        over to December of the preceding year.
        """
        if month <= 1:
            return year - 1, 12
        return year, month - 1

    def parse_index(self):
        """Build the feed list from last month's archive page.

        Returns a list of ``(section_title, articles)`` tuples, where
        ``articles`` is a list of dicts with the keys ``'title'`` and
        ``'url'``. Sections without any link are left out.
        """
        # Convert the whole strftime result to int. Taking a single digit of
        # '%m' breaks for October-December, and comparing the string with an
        # integer never detects January.
        year, month = self._previous_month(int(strftime('%Y')),
                                           int(strftime('%m')))
        # The month is written without zero padding, as the original URL
        # building did for the months it handled correctly.
        # NOTE(review): confirm the archive accepts this form for every month.
        soup = self.index_to_soup('%s%d-%d' % (self.INDEX, year, month))

        feeds = []
        for section in soup.findAll('div', attrs={'class': 'searchresult_text'}):
            # Fallback title; normally replaced below by a part of the link.
            # NOTE(review): the argument looks like a CSS class, not a tag
            # name, so this lookup presumably finds nothing - verify.
            section_title = self.tag_to_string(
                section.find('headline-middle_no_margin black'))
            articles = []
            for post in section.findAll('a', href=True):
                url = post['href']
                # For a site-relative link ('/<section>/...') the element at
                # index 1 is the first path component; it names the section.
                parts = url.split('/')
                if len(parts) > 1:
                    section_title = parts[1]
                if url.startswith('/'):
                    url = 'http://www.nationalgeographic.de' + url
                title = self.tag_to_string(post)
                # Debug output for links that carry a class attribute.
                # .get() avoids a KeyError when 'class=' only appears in
                # nested markup and not on the <a> tag itself.
                klass = post.get('class')
                if klass:
                    self.log()
                    self.log('--> post: ', post)
                    self.log('--> url: ', url)
                    self.log('--> title: ', title)
                    self.log('--> class: ', klass)
                articles.append({'title': title, 'url': url})
            if articles:
                feeds.append((section_title, articles))
        return feeds
Great feeling!
- - - - - - - - - - - - - - - - - -
The next problem is getting the right output for the TOC; let me see...